diff options
Diffstat (limited to '')
25 files changed, 4483 insertions, 0 deletions
diff --git a/intl/lwbrk/LineBreaker.cpp b/intl/lwbrk/LineBreaker.cpp new file mode 100644 index 0000000000..d4c78c789e --- /dev/null +++ b/intl/lwbrk/LineBreaker.cpp @@ -0,0 +1,1169 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "mozilla/intl/LineBreaker.h" + +#include "jisx4051class.h" +#include "nsComplexBreaker.h" +#include "nsTArray.h" +#include "nsUnicodeProperties.h" +#include "mozilla/ArrayUtils.h" + +using namespace mozilla::unicode; +using namespace mozilla::intl; + +/*static*/ +already_AddRefed<LineBreaker> LineBreaker::Create() { + return RefPtr<LineBreaker>(new LineBreaker()).forget(); +} + +/* + + Simplification of Pair Table in JIS X 4051 + + 1. The Original Table - in 4.1.3 + + In JIS X 4051, the pair table is defined as below + + Class of + Leading Class of Trailing Char Class + Char + + 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20 + * # * # + 1 X X X X X X X X X X X X X X X X X X X X X E + 2 X X X X X X + 3 X X X X X X + 4 X X X X X X + 5 X X X X X X + 6 X X X X X X + 7 X X X X X X X + 8 X X X X X X E + 9 X X X X X X + 10 X X X X X X + 11 X X X X X X + 12 X X X X X X + 13 X X X X X X X + 14 X X X X X X X + 15 X X X X X X X X X + 16 X X X X X X X X + 17 X X X X X E + 18 X X X X X X X X X + 19 X E E E E E X X X X X X X X X X X X E X E E + 20 X X X X X E + + * Same Char + # Other Char + + X Cannot Break + + The classes mean: + 1: Open parenthesis + 2: Close parenthesis + 3: Prohibit a line break before + 4: Punctuation for sentence end (except Full stop, e.g., "!" 
and "?") + 5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT) + 6: Full stop + 7: Non-breakable between same characters + 8: Prefix (e.g., "$", "NO.") + 9: Postfix (e.g., "%") + 10: Ideographic space + 11: Hiragana + 12: Japanese characters (except class 11) + 13: Subscript + 14: Ruby + 15: Numeric + 16: Alphabet + 17: Space for Western language + 18: Western characters (except class 17) + 19: Split line note (Warichu) begin quote + 20: Split line note (Warichu) end quote + + 2. Simplified by removing the classes which we do not care about + + However, since we do not care about class 13(Subscript), 14(Ruby), + 16 (Alphabet), 19(split line note begin quote), and 20(split line note end + quote) we can simplify this pair table into the following + + Class of + Leading Class of Trailing Char Class + Char + + 1 2 3 4 5 6 7 8 9 10 11 12 15 17 18 + + 1 X X X X X X X X X X X X X X X + 2 X X X X X + 3 X X X X X + 4 X X X X X + 5 X X X X X + 6 X X X X X + 7 X X X X X X + 8 X X X X X X + 9 X X X X X + 10 X X X X X + 11 X X X X X + 12 X X X X X + 15 X X X X X X X X + 17 X X X X X + 18 X X X X X X X + + 3. Simplified by merging classes + + After the simplification in step 2, the pair table has some duplication + a. class 2, 3, 4, 5, 6, are the same - we can merge them + b. class 10, 11, 12, 17 are the same - we can merge them + + We introduce an extra non-breaking pair at [b]/7 to better match + the expectations of CSS line-breaking as tested by WPT tests. + This added entry is marked as * in the tables below. + + Class of + Leading Class of Trailing Char Class + Char + + 1 [a] 7 8 9 [b]15 18 + + 1 X X X X X X X X + [a] X + 7 X X + 8 X X + 9 X + [b] X * + 15 X X X X + 18 X X X + + + 4. 
We add COMPLEX characters and make them breakable with all other classes + except after class 1 and before class [a] + + Class of + Leading Class of Trailing Char Class + Char + + 1 [a] 7 8 9 [b]15 18 COMPLEX + + 1 X X X X X X X X X + [a] X + 7 X X + 8 X X + 9 X + [b] X * + 15 X X X X + 18 X X X + COMPLEX X T + + T : need special handling + + + 5. However, we need two special classes for some punctuation/parentheses + whose breaking rules are like those of the character class (18), see bug 389056. + And also we need a class for punctuation-like characters whose behavior is the same as 18, + but that are not letters of any language. (e.g., '_') + [c]. Based on open parenthesis class (1), but it is not breakable after + character class (18) or numeric class (15). + [d]. Based on close parenthesis (or punctuation) class (2), but it is not + breakable before character class (18) or numeric class (15). + + Class of + Leading Class of Trailing Char Class + Char + + 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] + + 1 X X X X X X X X X X X + [a] X X X + 7 X X + 8 X X + 9 X + [b] X * X + 15 X X X X X X + 18 X X X X X + COMPLEX X T + [c] X X X X X X X X X X X + [d] X X X X + + + 6. And Unicode has "NON-BREAK" characters. Lines should not be broken around + them. But JIS X 4051 has no such class; therefore, we create [e]. + + Class of + Leading Class of Trailing Char Class + Char + + 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e] + + 1 X X X X X X X X X X X X + [a] X X X + 7 X X X + 8 X X X + 9 X X + [b] X * X X + 15 X X X X X X X + 18 X X X X X X + COMPLEX X T X + [c] X X X X X X X X X X X X + [d] X X X X X + [e] X X X X X X X X X X X X + + + 7. 
Now we use one bit to encode whether it is breakable, and use 2 bytes + for one row, then the bit table will look like: + + 18 <- 1 + + 1 0000 1111 1111 1111 = 0x0FFF + [a] 0000 1100 0000 0010 = 0x0C02 + 7 0000 1000 0000 0110 = 0x0806 + 8 0000 1000 0100 0010 = 0x0842 + 9 0000 1000 0000 0010 = 0x0802 + [b] 0000 1100 0000 0110 = 0x0C06 + 15 0000 1110 1101 0010 = 0x0ED2 + 18 0000 1110 1100 0010 = 0x0EC2 + COMPLEX 0000 1001 0000 0010 = 0x0902 + [c] 0000 1111 1111 1111 = 0x0FFF + [d] 0000 1100 1100 0010 = 0x0CC2 + [e] 0000 1111 1111 1111 = 0x0FFF +*/ + +#define MAX_CLASSES 12 + +static const uint16_t gPair[MAX_CLASSES] = {0x0FFF, 0x0C02, 0x0806, 0x0842, + 0x0802, 0x0C06, 0x0ED2, 0x0EC2, + 0x0902, 0x0FFF, 0x0CC2, 0x0FFF}; + +/* + + 8. And if the character is not enough far from word start, word end and + another break point, we should not break in non-CJK languages. + I.e., Don't break around 15, 18, [c] and [d], but don't change + that if they are related to [b]. + + Class of + Leading Class of Trailing Char Class + Char + + 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e] + + 1 X X X X X X X X X X X X + [a] X X X X X X + 7 X X X X X X X + 8 X X X X X X + 9 X X X X X X + [b] X * X X + 15 X X X X X X X X X X X + 18 X X X X X X X X X X X + COMPLEX X X X T X X X + [c] X X X X X X X X X X X X + [d] X X X X X X X X X X X + [e] X X X X X X X X X X X X + + 18 <- 1 + + 1 0000 1111 1111 1111 = 0x0FFF + [a] 0000 1110 1100 0010 = 0x0EC2 + 7 0000 1110 1100 0110 = 0x0EC6 + 8 0000 1110 1100 0010 = 0x0EC2 + 9 0000 1110 1100 0010 = 0x0EC2 + [b] 0000 1100 0000 0110 = 0x0C06 + 15 0000 1111 1101 1111 = 0x0FDF + 18 0000 1111 1101 1111 = 0x0FDF + COMPLEX 0000 1111 1100 0010 = 0x0FC2 + [c] 0000 1111 1111 1111 = 0x0FFF + [d] 0000 1111 1101 1111 = 0x0FDF + [e] 0000 1111 1111 1111 = 0x0FFF +*/ + +static const uint16_t gPairConservative[MAX_CLASSES] = { + 0x0FFF, 0x0EC2, 0x0EC6, 0x0EC2, 0x0EC2, 0x0C06, + 0x0FDF, 0x0FDF, 0x0FC2, 0x0FFF, 0x0FDF, 0x0FFF}; + +/* + + 9. 
Now we map the class to number + + 0: 1 + 1: [a]- 2, 3, 4, 5, 6 + 2: 7 + 3: 8 + 4: 9 + 5: [b]- 10, 11, 12, 17 + 6: 15 + 7: 18 + 8: COMPLEX + 9: [c] + A: [d] + B: [e] + + and they mean: + 0: Open parenthesis + 1: Punctuation that prohibits break before + 2: Non-breakable between same classes + 3: Prefix + 4: Postfix + 5: Breakable character (Spaces and Most Japanese characters) + 6: Numeric + 7: Characters + 8: Need special handling characters (E.g., Thai) + 9: Open parentheses like Character (See bug 389056) + A: Close parenthese (or punctuations) like Character (See bug 389056) + B: Non breakable (See bug 390920) + +*/ + +#define CLASS_NONE INT8_MAX + +#define CLASS_OPEN 0x00 +#define CLASS_CLOSE 0x01 +#define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02 +#define CLASS_PREFIX 0x03 +#define CLASS_POSTFFIX 0x04 +#define CLASS_BREAKABLE 0x05 +#define CLASS_NUMERIC 0x06 +#define CLASS_CHARACTER 0x07 +#define CLASS_COMPLEX 0x08 +#define CLASS_OPEN_LIKE_CHARACTER 0x09 +#define CLASS_CLOSE_LIKE_CHARACTER 0x0A +#define CLASS_NON_BREAKABLE 0x0B + +#define U_NULL char16_t(0x0000) +#define U_SLASH char16_t('/') +#define U_SPACE char16_t(' ') +#define U_HYPHEN char16_t('-') +#define U_EQUAL char16_t('=') +#define U_PERCENT char16_t('%') +#define U_AMPERSAND char16_t('&') +#define U_SEMICOLON char16_t(';') +#define U_BACKSLASH char16_t('\\') +#define U_OPEN_SINGLE_QUOTE char16_t(0x2018) +#define U_OPEN_DOUBLE_QUOTE char16_t(0x201C) +#define U_OPEN_GUILLEMET char16_t(0x00AB) + +#define NEED_CONTEXTUAL_ANALYSIS(c) \ + (IS_HYPHEN(c) || (c) == U_SLASH || (c) == U_PERCENT || (c) == U_AMPERSAND || \ + (c) == U_SEMICOLON || (c) == U_BACKSLASH || (c) == U_OPEN_SINGLE_QUOTE || \ + (c) == U_OPEN_DOUBLE_QUOTE || (c) == U_OPEN_GUILLEMET) + +#define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039) + +static inline int GETCLASSFROMTABLE(const uint32_t* t, uint16_t l) { + return ((((t)[(l >> 3)]) >> ((l & 0x0007) << 2)) & 0x000f); +} + +static inline int 
IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u) { + return ((0xff66 <= (u)) && ((u) <= 0xff70)); +} + +static inline int IS_CJK_CHAR(char32_t u) { + return ( + (0x1100 <= (u) && (u) <= 0x11ff) || (0x2e80 <= (u) && (u) <= 0xd7ff) || + (0xf900 <= (u) && (u) <= 0xfaff) || (0xff00 <= (u) && (u) <= 0xffef) || + (0x20000 <= (u) && (u) <= 0x2fffd)); +} + +// Returns true if |u| is NO-BREAK SPACE or FIGURE SPACE. +// Takes a full code point (char32_t): ContextState::Init() passes UTF-32 +// values, and a char16_t parameter would silently truncate them (e.g. +// U+100A0 would be mistaken for U+00A0). +static inline bool IS_NONBREAKABLE_SPACE(char32_t u) { + return u == 0x00A0 || u == 0x2007; // NO-BREAK SPACE, FIGURE SPACE +} + +// Returns true if |u| is one of the hyphen-like characters listed below. +// Takes a full code point (char32_t): ContextualAnalysis() and +// NEED_CONTEXTUAL_ANALYSIS() pass UTF-32 values, and a char16_t parameter +// would silently truncate them (e.g. U+1002D would be mistaken for '-'). +static inline bool IS_HYPHEN(char32_t u) { + return (u == U_HYPHEN || u == 0x2010 || // HYPHEN + u == 0x2012 || // FIGURE DASH + u == 0x2013 || // EN DASH +#if ANDROID + /* Bug 1647377: On Android, we don't have a "platform" backend + * that supports Tibetan (nsRuleBreaker.cpp only knows about + * Thai), so instead we just treat the TSHEG like a hyphen to + * provide basic line-breaking possibilities. + */ + u == 0x0F0B || // TIBETAN MARK INTERSYLLABIC TSHEG +#endif + u == 0x058A); // ARMENIAN HYPHEN +} + +static int8_t GetClass(uint32_t u, LineBreaker::Strictness aLevel, + bool aIsChineseOrJapanese) { + // Mapping for Unicode LineBreak.txt classes to the (simplified) set of + // character classes used here. + // XXX The mappings here were derived by comparing the Unicode LineBreak + // values of BMP characters to the classes our existing GetClass returns + // for the same codepoints; in cases where characters with the same + // LineBreak class mapped to various classes here, I picked what seemed + // the most prevalent equivalence. + // Some of these are unclear to me, but currently they are ONLY used + // for characters not handled by the old code below, so all the JISx405 + // special cases should already be accounted for. 
+ static const int8_t sUnicodeLineBreakToClass[] = { + /* UNKNOWN = 0, [XX] */ CLASS_CHARACTER, + /* AMBIGUOUS = 1, [AI] */ CLASS_CHARACTER, + /* ALPHABETIC = 2, [AL] */ CLASS_CHARACTER, + /* BREAK_BOTH = 3, [B2] */ CLASS_CHARACTER, + /* BREAK_AFTER = 4, [BA] */ CLASS_CHARACTER, + /* BREAK_BEFORE = 5, [BB] */ CLASS_OPEN_LIKE_CHARACTER, + /* MANDATORY_BREAK = 6, [BK] */ CLASS_CHARACTER, + /* CONTINGENT_BREAK = 7, [CB] */ CLASS_CHARACTER, + /* CLOSE_PUNCTUATION = 8, [CL] */ CLASS_CHARACTER, + /* COMBINING_MARK = 9, [CM] */ CLASS_CHARACTER, + /* CARRIAGE_RETURN = 10, [CR] */ CLASS_BREAKABLE, + /* EXCLAMATION = 11, [EX] */ CLASS_CHARACTER, + /* GLUE = 12, [GL] */ CLASS_NON_BREAKABLE, + /* HYPHEN = 13, [HY] */ CLASS_CHARACTER, + /* IDEOGRAPHIC = 14, [ID] */ CLASS_BREAKABLE, + /* INSEPARABLE = 15, [IN] */ CLASS_CLOSE_LIKE_CHARACTER, + /* INFIX_NUMERIC = 16, [IS] */ CLASS_CHARACTER, + /* LINE_FEED = 17, [LF] */ CLASS_BREAKABLE, + /* NONSTARTER = 18, [NS] */ CLASS_CLOSE_LIKE_CHARACTER, + /* NUMERIC = 19, [NU] */ CLASS_NUMERIC, + /* OPEN_PUNCTUATION = 20, [OP] */ CLASS_CHARACTER, + /* POSTFIX_NUMERIC = 21, [PO] */ CLASS_CHARACTER, + /* PREFIX_NUMERIC = 22, [PR] */ CLASS_CHARACTER, + /* QUOTATION = 23, [QU] */ CLASS_CHARACTER, + /* COMPLEX_CONTEXT = 24, [SA] */ CLASS_CHARACTER, + /* SURROGATE = 25, [SG] */ CLASS_CHARACTER, + /* SPACE = 26, [SP] */ CLASS_BREAKABLE, + /* BREAK_SYMBOLS = 27, [SY] */ CLASS_CHARACTER, + /* ZWSPACE = 28, [ZW] */ CLASS_BREAKABLE, + /* NEXT_LINE = 29, [NL] */ CLASS_CHARACTER, + /* WORD_JOINER = 30, [WJ] */ CLASS_NON_BREAKABLE, + /* H2 = 31, [H2] */ CLASS_BREAKABLE, + /* H3 = 32, [H3] */ CLASS_BREAKABLE, + /* JL = 33, [JL] */ CLASS_CHARACTER, + /* JT = 34, [JT] */ CLASS_CHARACTER, + /* JV = 35, [JV] */ CLASS_CHARACTER, + /* CLOSE_PARENTHESIS = 36, [CP] */ CLASS_CLOSE_LIKE_CHARACTER, + /* CONDITIONAL_JAPANESE_STARTER = 37, [CJ] */ CLASS_CLOSE, + /* HEBREW_LETTER = 38, [HL] */ CLASS_CHARACTER, + /* REGIONAL_INDICATOR = 39, [RI] */ CLASS_CHARACTER, + /* 
E_BASE = 40, [EB] */ CLASS_BREAKABLE, + /* E_MODIFIER = 41, [EM] */ CLASS_CHARACTER, + /* ZWJ = 42, [ZWJ]*/ CLASS_CHARACTER}; + + static_assert(U_LB_COUNT == mozilla::ArrayLength(sUnicodeLineBreakToClass), + "Gecko vs ICU LineBreak class mismatch"); + + auto cls = GetLineBreakClass(u); + MOZ_ASSERT(cls < mozilla::ArrayLength(sUnicodeLineBreakToClass)); + + // Overrides based on rules for the different line-break values given in + // https://drafts.csswg.org/css-text-3/#line-break-property + switch (aLevel) { + case LineBreaker::Strictness::Auto: + // For now, just use legacy Gecko behavior. + // XXX Possible enhancement - vary strictness according to line width + // or other criteria. + break; + case LineBreaker::Strictness::Strict: + if (cls == U_LB_CONDITIONAL_JAPANESE_STARTER || + (u == 0x3095 || u == 0x3096 || u == 0x30f5 || u == 0x30f6)) { + return CLASS_CLOSE; + } + if (cls == U_LB_INSEPARABLE) { + return CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS; + } + if (u == 0x3005 || u == 0x303B || u == 0x309D || u == 0x309E || + u == 0x30FD || u == 0x30FE) { + return CLASS_CLOSE_LIKE_CHARACTER; + } + if (aIsChineseOrJapanese) { + if (cls == U_LB_POSTFIX_NUMERIC && IsEastAsianWidthAFW(u)) { + return CLASS_CLOSE_LIKE_CHARACTER; + } + if (cls == U_LB_PREFIX_NUMERIC && IsEastAsianWidthAFW(u)) { + return CLASS_OPEN_LIKE_CHARACTER; + } + if (u == 0x2010 || u == 0x2013 || u == 0x301C || u == 0x30A0) { + return CLASS_CLOSE_LIKE_CHARACTER; + } + } + break; + case LineBreaker::Strictness::Normal: + if (cls == U_LB_CONDITIONAL_JAPANESE_STARTER) { + return CLASS_BREAKABLE; + } + if (cls == U_LB_INSEPARABLE) { + return CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS; + } + if (u == 0x3005 || u == 0x303B || u == 0x309D || u == 0x309E || + u == 0x30FD || u == 0x30FE) { + return CLASS_CLOSE_LIKE_CHARACTER; + } + if (aIsChineseOrJapanese) { + if (cls == U_LB_POSTFIX_NUMERIC && IsEastAsianWidthAFW(u)) { + return CLASS_CLOSE_LIKE_CHARACTER; + } + if (cls == U_LB_PREFIX_NUMERIC && 
IsEastAsianWidthAFW(u)) { + return CLASS_OPEN_LIKE_CHARACTER; + } + if (u == 0x2010 || u == 0x2013 || u == 0x301C || u == 0x30A0) { + return CLASS_BREAKABLE; + } + } + break; + case LineBreaker::Strictness::Loose: + if (cls == U_LB_CONDITIONAL_JAPANESE_STARTER) { + return CLASS_BREAKABLE; + } + if (u == 0x3005 || u == 0x303B || u == 0x309D || u == 0x309E || + u == 0x30FD || u == 0x30FE) { + return CLASS_BREAKABLE; + } + if (cls == U_LB_INSEPARABLE) { + return CLASS_BREAKABLE; + } + if (aIsChineseOrJapanese) { + if (u == 0x30FB || u == 0xFF1A || u == 0xFF1B || u == 0xFF65 || + u == 0x203C || u == 0x2047 || u == 0x2048 || u == 0x2049 || + u == 0xFF01 || u == 0xFF1F) { + return CLASS_BREAKABLE; + } + if (cls == U_LB_POSTFIX_NUMERIC && IsEastAsianWidthAFW(u)) { + return CLASS_BREAKABLE; + } + if (cls == U_LB_PREFIX_NUMERIC && IsEastAsianWidthAFW(u)) { + return CLASS_BREAKABLE; + } + if (u == 0x2010 || u == 0x2013 || u == 0x301C || u == 0x30A0) { + return CLASS_BREAKABLE; + } + } + break; + case LineBreaker::Strictness::Anywhere: + MOZ_ASSERT_UNREACHABLE("should have been handled already"); + break; + } + + if (u < 0x10000) { + uint16_t h = u & 0xFF00; + uint16_t l = u & 0x00ff; + + // Handle 3 range table first + if (0x0000 == h) { + return GETCLASSFROMTABLE(gLBClass00, l); + } + if (0x1700 == h) { + return GETCLASSFROMTABLE(gLBClass17, l); + } + if (NS_NeedsPlatformNativeHandling(u)) { + return CLASS_COMPLEX; + } + if (0x0E00 == h) { + return GETCLASSFROMTABLE(gLBClass0E, l); + } + if (0x2000 == h) { + return GETCLASSFROMTABLE(gLBClass20, l); + } + if (0x2100 == h) { + return GETCLASSFROMTABLE(gLBClass21, l); + } + if (0x3000 == h) { + return GETCLASSFROMTABLE(gLBClass30, l); + } + if (0xff00 == h) { + if (l <= 0x0060) { // Fullwidth ASCII variant + // Fullwidth comma and period are exceptions to our map-to-ASCII + // behavior: https://bugzilla.mozilla.org/show_bug.cgi?id=1595428 + if (l + 0x20 == ',' || l + 0x20 == '.') { + return CLASS_CLOSE; + } + // Also 
special-case fullwidth left/right white parenthesis, + // which do not fit the pattern of mapping to the ASCII block + if (l == 0x005f) { + return CLASS_OPEN; + } + if (l == 0x0060) { + return CLASS_CLOSE; + } + return GETCLASSFROMTABLE(gLBClass00, (l + 0x20)); + } + if (l < 0x00a0) { // Halfwidth Katakana variants + switch (l) { + case 0x61: + return GetClass(0x3002, aLevel, aIsChineseOrJapanese); + case 0x62: + return GetClass(0x300c, aLevel, aIsChineseOrJapanese); + case 0x63: + return GetClass(0x300d, aLevel, aIsChineseOrJapanese); + case 0x64: + return GetClass(0x3001, aLevel, aIsChineseOrJapanese); + case 0x65: + return GetClass(0x30fb, aLevel, aIsChineseOrJapanese); + case 0x9e: + return GetClass(0x309b, aLevel, aIsChineseOrJapanese); + case 0x9f: + return GetClass(0x309c, aLevel, aIsChineseOrJapanese); + default: + if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u)) { + return CLASS_CLOSE; // jis x4051 class 3 + } + return CLASS_BREAKABLE; // jis x4051 class 11 + } + } + if (l < 0x00e0) { + return CLASS_CHARACTER; // Halfwidth Hangul variants + } + if (l < 0x00f0) { + static char16_t NarrowFFEx[16] = { + 0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000, + 0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000}; + return GetClass(NarrowFFEx[l - 0x00e0], aLevel, aIsChineseOrJapanese); + } + } else if (0x3100 == h) { + if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun + // XXX: This is per UAX #14, but UAX #14 may change + // the line breaking rules about Kanbun and Bopomofo. 
+ return CLASS_BREAKABLE; + } + if (l >= 0xf0) { // Katakana small letters for Ainu + return CLASS_CLOSE; + } + } else if (0x0300 == h) { + if (0x4F == l || (0x5C <= l && l <= 0x62)) { + return CLASS_NON_BREAKABLE; + } + } else if (0x0500 == h) { + // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14) + if (l == 0x8A) { + return GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN)); + } + } else if (0x0F00 == h) { + // Tibetan chars with class = BA + if (0x34 == l || 0x7f == l || 0x85 == l || 0xbe == l || 0xbf == l || + 0xd2 == l) { + return CLASS_BREAKABLE; + } + } else if (0x1800 == h) { + if (0x0E == l) { + return CLASS_NON_BREAKABLE; + } + } else if (0x1600 == h) { + if (0x80 == l) { // U+1680 OGHAM SPACE MARK + return CLASS_BREAKABLE; + } + } else if (u == 0xfeff) { + return CLASS_NON_BREAKABLE; + } + } + + return sUnicodeLineBreakToClass[cls]; +} + +static bool GetPair(int8_t c1, int8_t c2) { + NS_ASSERTION(c1 < MAX_CLASSES, "illegal classes 1"); + NS_ASSERTION(c2 < MAX_CLASSES, "illegal classes 2"); + + return (0 == ((gPair[c1] >> c2) & 0x0001)); +} + +static bool GetPairConservative(int8_t c1, int8_t c2) { + NS_ASSERTION(c1 < MAX_CLASSES, "illegal classes 1"); + NS_ASSERTION(c2 < MAX_CLASSES, "illegal classes 2"); + + return (0 == ((gPairConservative[c1] >> c2) & 0x0001)); +} + +class ContextState { + public: + ContextState(const char16_t* aText, uint32_t aLength) + : mUniText(aText), mText(nullptr), mLength(aLength) { + Init(); + } + + ContextState(const uint8_t* aText, uint32_t aLength) + : mUniText(nullptr), mText(aText), mLength(aLength) { + Init(); + } + + uint32_t Length() const { return mLength; } + uint32_t Index() const { return mIndex; } + + // This gets a single code unit of the text, without checking for surrogates + // (in the case of a 16-bit text buffer). That's OK if we're only checking for + // specific characters that are known to be BMP values. 
+ char16_t GetCodeUnitAt(uint32_t aIndex) const { + MOZ_ASSERT(aIndex < mLength, "Out of range!"); + return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]); + } + + // This gets a 32-bit Unicode character (codepoint), handling surrogate pairs + // as necessary. It must ONLY be called for 16-bit text, not 8-bit. + char32_t GetUnicodeCharAt(uint32_t aIndex) const { + MOZ_ASSERT(mUniText, "Only for 16-bit text!"); + MOZ_ASSERT(aIndex < mLength, "Out of range!"); + char32_t c = mUniText[aIndex]; + if (aIndex + 1 < mLength && NS_IS_SURROGATE_PAIR(c, mUniText[aIndex + 1])) { + c = SURROGATE_TO_UCS4(c, mUniText[aIndex + 1]); + } + return c; + } + + void AdvanceIndex() { ++mIndex; } + + void NotifyBreakBefore() { mLastBreakIndex = mIndex; } + + // A word of western language should not be broken. But even if the word has + // only ASCII characters, non-natural context words should be broken, e.g., + // URL and file path. For protecting the natural words, we should use + // conservative breaking rules at following conditions: + // 1. at near the start of word + // 2. at near the end of word + // 3. at near the latest broken point + // CONSERVATIVE_RANGE_{LETTER,OTHER} define the 'near' in characters, + // which varies depending whether we are looking at a letter or a non-letter + // character: for non-letters, we use an extended "conservative" range. + +#define CONSERVATIVE_RANGE_LETTER 2 +#define CONSERVATIVE_RANGE_OTHER 6 + + bool UseConservativeBreaking(uint32_t aOffset = 0) const { + if (mHasCJKChar) return false; + uint32_t index = mIndex + aOffset; + + // If the character at index is a letter (rather than various punctuation + // characters, etc) then we want a shorter "conservative" range + uint32_t conservativeRangeStart, conservativeRangeEnd; + if (index < mLength && + nsUGenCategory::kLetter == + (mText ? 
GetGenCategory(mText[index]) + : GetGenCategory(GetUnicodeCharAt(index)))) { + // Primarily for hyphenated word prefixes/suffixes; we add 1 to Start + // to get more balanced behavior (if we break off a 2-letter prefix, + // that means the break will actually be three letters from start of + // word, to include the hyphen; whereas a 2-letter suffix will be + // broken only two letters from end of word). + conservativeRangeEnd = CONSERVATIVE_RANGE_LETTER; + conservativeRangeStart = CONSERVATIVE_RANGE_LETTER + 1; + } else { + conservativeRangeEnd = conservativeRangeStart = CONSERVATIVE_RANGE_OTHER; + } + + bool result = (index < conservativeRangeStart || + mLength - index < conservativeRangeEnd || + index - mLastBreakIndex < conservativeRangeStart); + if (result || !mHasNonbreakableSpace) return result; + + // This text has no-breakable space, we need to check whether the index + // is near it. + + // Note that index is always larger than conservativeRange here. + for (uint32_t i = index; index - conservativeRangeStart < i; --i) { + if (IS_NONBREAKABLE_SPACE(GetCodeUnitAt(i - 1))) return true; + } + // Note that index is always less than mLength - conservativeRange. 
+ for (uint32_t i = index + 1; i < index + conservativeRangeEnd; ++i) { + if (IS_NONBREAKABLE_SPACE(GetCodeUnitAt(i))) return true; + } + return false; + } + + bool HasPreviousEqualsSign() const { return mHasPreviousEqualsSign; } + void NotifySeenEqualsSign() { mHasPreviousEqualsSign = true; } + + bool HasPreviousSlash() const { return mHasPreviousSlash; } + void NotifySeenSlash() { mHasPreviousSlash = true; } + + bool HasPreviousBackslash() const { return mHasPreviousBackslash; } + void NotifySeenBackslash() { mHasPreviousBackslash = true; } + + uint32_t GetPreviousNonHyphenCharacter() const { + return mPreviousNonHyphenCharacter; + } + void NotifyNonHyphenCharacter(uint32_t ch) { + mPreviousNonHyphenCharacter = ch; + } + + private: + void Init() { + mIndex = 0; + mLastBreakIndex = 0; + mPreviousNonHyphenCharacter = U_NULL; + mHasCJKChar = false; + mHasNonbreakableSpace = false; + mHasPreviousEqualsSign = false; + mHasPreviousSlash = false; + mHasPreviousBackslash = false; + + if (mText) { + // 8-bit text: we only need to check for + for (uint32_t i = 0; i < mLength; ++i) { + if (IS_NONBREAKABLE_SPACE(mText[i])) { + mHasNonbreakableSpace = true; + break; + } + } + } else { + // 16-bit text: handle surrogates and check for CJK as well as + for (uint32_t i = 0; i < mLength; ++i) { + char32_t u = GetUnicodeCharAt(i); + if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u)) { + mHasNonbreakableSpace = true; + if (mHasCJKChar) { + break; + } + } else if (!mHasCJKChar && IS_CJK_CHAR(u)) { + mHasCJKChar = true; + if (mHasNonbreakableSpace) { + break; + } + } + if (u > 0xFFFFu) { + ++i; // step over trailing low surrogate + } + } + } + } + + const char16_t* const mUniText; + const uint8_t* const mText; + + uint32_t mIndex; + const uint32_t mLength; // length of text + uint32_t mLastBreakIndex; + char32_t mPreviousNonHyphenCharacter; // The last character we have seen + // which is not U_HYPHEN + bool mHasCJKChar; // if the text has CJK character, this is true. 
+ bool mHasNonbreakableSpace; // if the text has no-breakable space, + // this is true. + bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL + bool mHasPreviousSlash; // True if we have seen a U_SLASH + bool mHasPreviousBackslash; // True if we have seen a U_BACKSLASH +}; + +static int8_t ContextualAnalysis(char32_t prev, char32_t cur, char32_t next, + ContextState& aState, + LineBreaker::Strictness aLevel, + bool aIsChineseOrJapanese) { + // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE. + + if (IS_HYPHEN(cur)) { + // If next character is hyphen, we don't need to break between them. + if (IS_HYPHEN(next)) return CLASS_CHARACTER; + // If prev and next characters are numeric, it may be in Math context. + // So, we should not break here. + bool prevIsNum = IS_ASCII_DIGIT(prev); + bool nextIsNum = IS_ASCII_DIGIT(next); + if (prevIsNum && nextIsNum) return CLASS_NUMERIC; + // If one side is numeric and the other is a character, or if both sides are + // characters, the hyphen should be breakable. 
+ if (!aState.UseConservativeBreaking(1)) { + char32_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter(); + if (prevOfHyphen && next) { + int8_t prevClass = GetClass(prevOfHyphen, aLevel, aIsChineseOrJapanese); + int8_t nextClass = GetClass(next, aLevel, aIsChineseOrJapanese); + bool prevIsNumOrCharOrClose = + prevIsNum || + (prevClass == CLASS_CHARACTER && + !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) || + prevClass == CLASS_CLOSE || prevClass == CLASS_CLOSE_LIKE_CHARACTER; + bool nextIsNumOrCharOrOpen = + nextIsNum || + (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) || + nextClass == CLASS_OPEN || nextClass == CLASS_OPEN_LIKE_CHARACTER || + next == U_OPEN_SINGLE_QUOTE || next == U_OPEN_DOUBLE_QUOTE || + next == U_OPEN_GUILLEMET; + if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) { + return CLASS_CLOSE; + } + } + } + } else { + aState.NotifyNonHyphenCharacter(cur); + if (cur == U_SLASH || cur == U_BACKSLASH) { + // If this is immediately after same char, we should not break here. + if (prev == cur) return CLASS_CHARACTER; + // If this text has two or more (BACK)SLASHs, this may be file path or + // URL. Make sure to compute shouldReturn before we notify on this slash. + bool shouldReturn = !aState.UseConservativeBreaking() && + (cur == U_SLASH ? aState.HasPreviousSlash() + : aState.HasPreviousBackslash()); + + if (cur == U_SLASH) { + aState.NotifySeenSlash(); + } else { + aState.NotifySeenBackslash(); + } + + if (shouldReturn) return CLASS_OPEN; + } else if (cur == U_PERCENT) { + // If this is a part of the param of URL, we should break before. 
+ if (!aState.UseConservativeBreaking()) { + if (aState.Index() >= 3 && + aState.GetCodeUnitAt(aState.Index() - 3) == U_PERCENT) + return CLASS_OPEN; + if (aState.Index() + 3 < aState.Length() && + aState.GetCodeUnitAt(aState.Index() + 3) == U_PERCENT) + return CLASS_OPEN; + } + } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) { + // If this may be a separator of params of URL, we should break after. + if (!aState.UseConservativeBreaking(1) && aState.HasPreviousEqualsSign()) + return CLASS_CLOSE; + } else if (cur == U_OPEN_SINGLE_QUOTE || cur == U_OPEN_DOUBLE_QUOTE || + cur == U_OPEN_GUILLEMET) { + // for CJK usage, we treat these as openers to allow a break before them, + // but otherwise treat them as normal characters because quote mark usage + // in various Western languages varies too much; see bug #450088 + // discussion. + if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next)) + return CLASS_OPEN; + } else { + NS_ERROR("Forgot to handle the current character!"); + } + } + return GetClass(cur, aLevel, aIsChineseOrJapanese); +} + +// Returns the nearest break position from aPos in the direction given by +// aDirection (negative: backward, otherwise forward), looking only at the +// whitespace-delimited run of text that surrounds aPos. +int32_t LineBreaker::WordMove(const char16_t* aText, uint32_t aLen, + uint32_t aPos, int8_t aDirection) { + bool textNeedsJISx4051 = false; + int32_t begin, end; + + // Prev() allows aPos == aLen, so aText[begin] must not be read while + // begin is still equal to aLen. + for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) { + if (uint32_t(begin) < aLen && + (IS_CJK_CHAR(aText[begin]) || + NS_NeedsPlatformNativeHandling(aText[begin]))) { + textNeedsJISx4051 = true; + } + } + // Clamp the start of the forward scan so that |end| never exceeds aLen; + // otherwise GetJISx4051Breaks() below would be given one code unit too many. + for (end = (aPos < aLen) ? aPos + 1 : aLen; + end < int32_t(aLen) && !NS_IsSpace(aText[end]); ++end) { + if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) { + textNeedsJISx4051 = true; + } + } + + int32_t ret; + AutoTArray<uint8_t, 2000> breakState; + if (!textNeedsJISx4051) { + // No complex text character, do not try to do complex line break. + // (This is required for serializers. See Bug #344816.) + if (aDirection < 0) { + ret = (begin == int32_t(aPos)) ? 
begin - 1 : begin; + } else { + ret = end; + } + } else { + // XXX(Bug 1631371) Check if this should use a fallible operation as it + // pretended earlier. + breakState.AppendElements(end - begin); + GetJISx4051Breaks(aText + begin, end - begin, WordBreak::Normal, + Strictness::Auto, false, breakState.Elements()); + + ret = aPos; + do { + ret += aDirection; + } while (begin < ret && ret < end && !breakState[ret - begin]); + } + + return ret; +} + +int32_t LineBreaker::Next(const char16_t* aText, uint32_t aLen, uint32_t aPos) { + NS_ASSERTION(aText, "aText shouldn't be null"); + NS_ASSERTION(aLen > aPos, + "Bad position passed to nsJISx4051LineBreaker::Next"); + + int32_t nextPos = WordMove(aText, aLen, aPos, 1); + return nextPos < int32_t(aLen) ? nextPos : NS_LINEBREAKER_NEED_MORE_TEXT; +} + +int32_t LineBreaker::Prev(const char16_t* aText, uint32_t aLen, uint32_t aPos) { + NS_ASSERTION(aText, "aText shouldn't be null"); + NS_ASSERTION(aLen >= aPos && aPos > 0, + "Bad position passed to nsJISx4051LineBreaker::Prev"); + + int32_t prevPos = WordMove(aText, aLen, aPos, -1); + return prevPos > 0 ? prevPos : NS_LINEBREAKER_NEED_MORE_TEXT; +} + +static bool SuppressBreakForKeepAll(uint32_t aPrev, uint32_t aCh) { + auto affectedByKeepAll = [](uint8_t aLBClass) { + switch (aLBClass) { + // Per https://drafts.csswg.org/css-text-3/#valdef-word-break-keep-all: + // "implicit soft wrap opportunities between typographic letter units + // (or other typographic character units belonging to the NU, AL, AI, + // or ID Unicode line breaking classes [UAX14]) are suppressed..." 
+ case U_LB_ALPHABETIC: + case U_LB_AMBIGUOUS: + case U_LB_NUMERIC: + case U_LB_IDEOGRAPHIC: + // Additional classes that should be treated similarly, but have been + // broken out as separate classes in newer Unicode versions: + case U_LB_H2: + case U_LB_H3: + case U_LB_JL: + case U_LB_JV: + case U_LB_JT: + case U_LB_CONDITIONAL_JAPANESE_STARTER: + return true; + default: + return false; + } + }; + return affectedByKeepAll(GetLineBreakClass(aPrev)) && + affectedByKeepAll(GetLineBreakClass(aCh)); +} + +void LineBreaker::GetJISx4051Breaks(const char16_t* aChars, uint32_t aLength, + WordBreak aWordBreak, Strictness aLevel, + bool aIsChineseOrJapanese, + uint8_t* aBreakBefore) { + uint32_t cur; + int8_t lastClass = CLASS_NONE; + ContextState state(aChars, aLength); + + for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) { + char32_t ch = state.GetUnicodeCharAt(cur); + uint32_t chLen = ch > 0xFFFFu ? 2 : 1; + int8_t cl; + + if (NEED_CONTEXTUAL_ANALYSIS(ch)) { + char32_t prev, next; + if (cur > 0) { + // not using state.GetUnicodeCharAt() here because we're looking back + // rather than forward for possible surrogates + prev = aChars[cur - 1]; + if (cur > 1 && NS_IS_SURROGATE_PAIR(aChars[cur - 2], prev)) { + prev = SURROGATE_TO_UCS4(aChars[cur - 2], prev); + } + } else { + prev = 0; + } + if (cur + chLen < aLength) { + next = state.GetUnicodeCharAt(cur + chLen); + } else { + next = 0; + } + cl = ContextualAnalysis(prev, ch, next, state, aLevel, + aIsChineseOrJapanese); + } else { + if (ch == U_EQUAL) state.NotifySeenEqualsSign(); + state.NotifyNonHyphenCharacter(ch); + cl = GetClass(ch, aLevel, aIsChineseOrJapanese); + } + + // To implement word-break:break-all, we overwrite the line-break class of + // alphanumeric characters so they are treated the same as ideographic. 
+ // The relevant characters will have been assigned CLASS_CHARACTER, _CLOSE, + // _CLOSE_LIKE_CHARACTER, or _NUMERIC by GetClass(), but those classes also + // include others that we don't want to touch here, so we re-check the + // Unicode line-break class to determine which ones to modify. + if (aWordBreak == WordBreak::BreakAll && + (cl == CLASS_CHARACTER || cl == CLASS_CLOSE || + cl == CLASS_CLOSE_LIKE_CHARACTER || cl == CLASS_NUMERIC)) { + auto cls = GetLineBreakClass(ch); + if (cls == U_LB_ALPHABETIC || cls == U_LB_NUMERIC || + cls == U_LB_AMBIGUOUS || cls == U_LB_COMPLEX_CONTEXT || + /* Additional Japanese and Korean LB classes; CSS Text spec doesn't + explicitly mention these, but this appears to give expected + behavior (spec issue?) */ + cls == U_LB_CONDITIONAL_JAPANESE_STARTER || + (cls >= U_LB_H2 && cls <= U_LB_JV)) { + cl = CLASS_BREAKABLE; + } + } + + bool allowBreak = false; + if (cur > 0) { + NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl, + "Loop should have prevented adjacent complex chars here"); + auto prev = [=]() { + char32_t c = aChars[cur - 1]; + if (cur > 1 && NS_IS_SURROGATE_PAIR(aChars[cur - 2], c)) { + c = SURROGATE_TO_UCS4(aChars[cur - 2], c); + } + return c; + }; + allowBreak = + (state.UseConservativeBreaking() ? GetPairConservative(lastClass, cl) + : GetPair(lastClass, cl)) && + (aWordBreak != WordBreak::KeepAll || + !SuppressBreakForKeepAll(prev(), ch)); + } + aBreakBefore[cur] = allowBreak; + if (allowBreak) state.NotifyBreakBefore(); + lastClass = cl; + if (CLASS_COMPLEX == cl) { + uint32_t end = cur + chLen; + + while (end < aLength) { + char32_t c = state.GetUnicodeCharAt(end); + if (CLASS_COMPLEX != GetClass(c, aLevel, false)) { + break; + } + ++end; + if (c > 0xFFFFU) { // it was a surrogate pair + ++end; + } + } + + if (aWordBreak == WordBreak::BreakAll) { + // For break-all, we don't need to run a dictionary-based breaking + // algorithm, we just allow breaks between all grapheme clusters. 
+ ClusterIterator ci(aChars + cur, end - cur); + while (!ci.AtEnd()) { + ci.Next(); + aBreakBefore[ci - aChars] = true; + } + } else { + NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur); + // restore breakability at chunk begin, which was always set to false + // by the complex line breaker + aBreakBefore[cur] = allowBreak; + } + + cur = end - 1; + } + + if (chLen == 2) { + // Supplementary-plane character: mark that we cannot break before the + // trailing low surrogate, and advance past it. + ++cur; + aBreakBefore[cur] = false; + state.AdvanceIndex(); + } + } +} + +void LineBreaker::GetJISx4051Breaks(const uint8_t* aChars, uint32_t aLength, + WordBreak aWordBreak, Strictness aLevel, + bool aIsChineseOrJapanese, + uint8_t* aBreakBefore) { + uint32_t cur; + int8_t lastClass = CLASS_NONE; + ContextState state(aChars, aLength); + + for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) { + char32_t ch = aChars[cur]; + int8_t cl; + + if (NEED_CONTEXTUAL_ANALYSIS(ch)) { + cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL, ch, + cur + 1 < aLength ? aChars[cur + 1] : U_NULL, + state, aLevel, aIsChineseOrJapanese); + } else { + if (ch == U_EQUAL) state.NotifySeenEqualsSign(); + state.NotifyNonHyphenCharacter(ch); + cl = GetClass(ch, aLevel, aIsChineseOrJapanese); + } + if (aWordBreak == WordBreak::BreakAll && + (cl == CLASS_CHARACTER || cl == CLASS_CLOSE || + cl == CLASS_CLOSE_LIKE_CHARACTER || cl == CLASS_NUMERIC)) { + auto cls = GetLineBreakClass(ch); + // Don't need to check additional Japanese/Korean classes in 8-bit + if (cls == U_LB_ALPHABETIC || cls == U_LB_NUMERIC || + cls == U_LB_COMPLEX_CONTEXT) { + cl = CLASS_BREAKABLE; + } + } + + bool allowBreak = false; + if (cur > 0) { + allowBreak = + (state.UseConservativeBreaking() ? 
GetPairConservative(lastClass, cl) + : GetPair(lastClass, cl)) && + (aWordBreak != WordBreak::KeepAll || + !SuppressBreakForKeepAll(aChars[cur - 1], ch)); + } + aBreakBefore[cur] = allowBreak; + if (allowBreak) state.NotifyBreakBefore(); + lastClass = cl; + } +} diff --git a/intl/lwbrk/LineBreaker.h b/intl/lwbrk/LineBreaker.h new file mode 100644 index 0000000000..eaea8e36cc --- /dev/null +++ b/intl/lwbrk/LineBreaker.h @@ -0,0 +1,88 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef mozilla_intl_LineBreaker_h__ +#define mozilla_intl_LineBreaker_h__ + +#include "nscore.h" +#include "nsISupports.h" + +#define NS_LINEBREAKER_NEED_MORE_TEXT -1 + +namespace mozilla { +namespace intl { + +class LineBreaker { + public: + NS_INLINE_DECL_REFCOUNTING(LineBreaker) + + enum class WordBreak : uint8_t { + Normal = 0, // default + BreakAll = 1, // break all + KeepAll = 2 // always keep + }; + + enum class Strictness : uint8_t { + Auto = 0, + Loose = 1, + Normal = 2, + Strict = 3, + Anywhere = 4 + }; + + static already_AddRefed<LineBreaker> Create(); + + int32_t Next(const char16_t* aText, uint32_t aLen, uint32_t aPos); + + int32_t Prev(const char16_t* aText, uint32_t aLen, uint32_t aPos); + + // Call this on a word with whitespace at either end. We will apply JISx4051 + // rules to find breaks inside the word. aBreakBefore is set to the break- + // before status of each character; aBreakBefore[0] will always be false + // because we never return a break before the first character. + // aLength is the length of the aText array and also the length of the + // aBreakBefore output array. 
+ void GetJISx4051Breaks(const char16_t* aText, uint32_t aLength, + WordBreak aWordBreak, Strictness aLevel, + bool aIsChineseOrJapanese, uint8_t* aBreakBefore); + void GetJISx4051Breaks(const uint8_t* aText, uint32_t aLength, + WordBreak aWordBreak, Strictness aLevel, + bool aIsChineseOrJapanese, uint8_t* aBreakBefore); + + private: + ~LineBreaker() = default; + + int32_t WordMove(const char16_t* aText, uint32_t aLen, uint32_t aPos, + int8_t aDirection); +}; + +static inline bool NS_IsSpace(char16_t u) { + return u == 0x0020 || // SPACE + u == 0x0009 || // CHARACTER TABULATION + u == 0x000D || // CARRIAGE RETURN + (0x2000 <= u && u <= 0x2006) || // EN QUAD, EM QUAD, EN SPACE, + // EM SPACE, THREE-PER-EM SPACE, + // FOUR-PER-EM SPACE, SIX-PER-EM SPACE, + (0x2008 <= u && u <= 0x200B) || // PUNCTUATION SPACE, THIN SPACE, + // HAIR SPACE, ZERO WIDTH SPACE + u == 0x1361 || // ETHIOPIC WORDSPACE + u == 0x1680 || // OGHAM SPACE MARK + u == 0x205F; // MEDIUM MATHEMATICAL SPACE +} + +static inline bool NS_NeedsPlatformNativeHandling(char16_t aChar) { + return +#if ANDROID // Bug 1647377: no "platform native" support for Tibetan; + // better to just use our class-based breaker. + (0x0e01 <= aChar && aChar <= 0x0eff) || // Thai, Lao +#else + (0x0e01 <= aChar && aChar <= 0x0fff) || // Thai, Lao, Tibetan +#endif + (0x1780 <= aChar && aChar <= 0x17ff); // Khmer +} + +} // namespace intl +} // namespace mozilla + +#endif /* mozilla_intl_LineBreaker_h__ */ diff --git a/intl/lwbrk/WordBreaker.cpp b/intl/lwbrk/WordBreaker.cpp new file mode 100644 index 0000000000..269d084d93 --- /dev/null +++ b/intl/lwbrk/WordBreaker.cpp @@ -0,0 +1,218 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/ + +#include "mozilla/intl/WordBreaker.h" +#include "mozilla/StaticPrefs_layout.h" +#include "nsComplexBreaker.h" +#include "nsUnicodeProperties.h" + +using mozilla::intl::WordBreakClass; +using mozilla::intl::WordBreaker; +using mozilla::intl::WordRange; +using mozilla::unicode::GetScriptCode; + +/*static*/ +already_AddRefed<WordBreaker> WordBreaker::Create() { + return RefPtr<WordBreaker>(new WordBreaker()).forget(); +} + +bool WordBreaker::BreakInBetween(const char16_t* aText1, uint32_t aTextLen1, + const char16_t* aText2, uint32_t aTextLen2) { + MOZ_ASSERT(nullptr != aText1, "null ptr"); + MOZ_ASSERT(nullptr != aText2, "null ptr"); + + if (!aText1 || !aText2 || (0 == aTextLen1) || (0 == aTextLen2)) return false; + + uint8_t c1 = GetClass(aText1[aTextLen1 - 1]); + uint8_t c2 = GetClass(aText2[0]); + + if (c1 == c2 && kWbClassScriptioContinua == c1) { + nsAutoString text(aText1, aTextLen1); + text.Append(aText2, aTextLen2); + AutoTArray<uint8_t, 256> breakBefore; + breakBefore.SetLength(aTextLen1 + aTextLen2); + NS_GetComplexLineBreaks(text.get(), text.Length(), breakBefore.Elements()); + bool ret = breakBefore[aTextLen1]; + return ret; + } + + return (c1 != c2); +} + +#define IS_ASCII(c) (0 == (0xFF80 & (c))) +#define ASCII_IS_ALPHA(c) \ + ((('a' <= (c)) && ((c) <= 'z')) || (('A' <= (c)) && ((c) <= 'Z'))) +#define ASCII_IS_DIGIT(c) (('0' <= (c)) && ((c) <= '9')) +#define ASCII_IS_SPACE(c) \ + ((' ' == (c)) || ('\t' == (c)) || ('\r' == (c)) || ('\n' == (c))) +#define IS_ALPHABETICAL_SCRIPT(c) ((c) < 0x2E80) + +// Han range starts at 0x3400 (Unicode 3.0); keep the expansion parenthesized. +#define IS_HAN(c) \ + (((0x3400 <= (c)) && ((c) <= 0x9fff)) || \ + ((0xf900 <= (c)) && ((c) <= 0xfaff))) +#define IS_KATAKANA(c) ((0x30A0 <= (c)) && ((c) <= 0x30FF)) +#define IS_HIRAGANA(c) ((0x3040 <= (c)) && ((c) <= 0x309F)) +#define IS_HALFWIDTHKATAKANA(c) ((0xFF60 <= (c)) && ((c) <= 0xFF9F)) + +// Return true if aChar belongs to a SEAsian script that is written without
+// word spaces, so we need to use the "complex breaker" to find possible word +// boundaries. (https://en.wikipedia.org/wiki/Scriptio_continua) +// (How well this works depends on the level of platform support for finding +// possible line breaks - or possible word boundaries - in the particular +// script. Thai, at least, works pretty well on the major desktop OSes. If +// the script is not supported by the platform, we just won't find any useful +// boundaries.) +static bool IsScriptioContinua(char16_t aChar) { + Script sc = GetScriptCode(aChar); + return sc == Script::THAI || sc == Script::MYANMAR || sc == Script::KHMER || + sc == Script::JAVANESE || sc == Script::BALINESE || + sc == Script::SUNDANESE || sc == Script::LAO; +} + +/* static */ +WordBreakClass WordBreaker::GetClass(char16_t c) { + // begin of the hack + + if (IS_ALPHABETICAL_SCRIPT(c)) { + if (IS_ASCII(c)) { + if (ASCII_IS_SPACE(c)) { + return kWbClassSpace; + } + if (ASCII_IS_ALPHA(c) || ASCII_IS_DIGIT(c) || + (c == '_' && !StaticPrefs::layout_word_select_stop_at_underscore())) { + return kWbClassAlphaLetter; + } + return kWbClassPunct; + } + if (c == 0x00A0 /*NBSP*/) { + return kWbClassSpace; + } + if (GetGenCategory(c) == nsUGenCategory::kPunctuation) { + return kWbClassPunct; + } + if (IsScriptioContinua(c)) { + return kWbClassScriptioContinua; + } + return kWbClassAlphaLetter; + } + if (IS_HAN(c)) { + return kWbClassHanLetter; + } + if (IS_KATAKANA(c)) { + return kWbClassKatakanaLetter; + } + if (IS_HIRAGANA(c)) { + return kWbClassHiraganaLetter; + } + if (IS_HALFWIDTHKATAKANA(c)) { + return kWbClassHWKatakanaLetter; + } + if (GetGenCategory(c) == nsUGenCategory::kPunctuation) { + return kWbClassPunct; + } + if (IsScriptioContinua(c)) { + return kWbClassScriptioContinua; + } + return kWbClassAlphaLetter; +} + +WordRange WordBreaker::FindWord(const char16_t* aText, uint32_t aTextLen, + uint32_t aOffset) { + WordRange range; + MOZ_ASSERT(nullptr != aText, "null ptr"); + MOZ_ASSERT(0 !=
aTextLen, "len = 0"); + MOZ_ASSERT(aOffset <= aTextLen, "aOffset > aTextLen"); + + range.mBegin = aTextLen + 1; + range.mEnd = aTextLen + 1; + + if (!aText || aOffset > aTextLen) return range; + + WordBreakClass c = GetClass(aText[aOffset]); + uint32_t i; + // Scan forward; mEnd defaults to aTextLen, so aText[aTextLen] is never read + range.mEnd--; + for (i = aOffset + 1; i < aTextLen; i++) { + if (c != GetClass(aText[i])) { + range.mEnd = i; + break; + } + } + + // Scan backward + range.mBegin = 0; + for (i = aOffset; i > 0; i--) { + if (c != GetClass(aText[i - 1])) { + range.mBegin = i; + break; + } + } + + if (kWbClassScriptioContinua == c) { + // we pass the whole text segment to the complex word breaker to find a + // shorter answer + AutoTArray<uint8_t, 256> breakBefore; + breakBefore.SetLength(range.mEnd - range.mBegin); + NS_GetComplexLineBreaks(aText + range.mBegin, range.mEnd - range.mBegin, + breakBefore.Elements()); + + // Scan forward + for (i = aOffset + 1; i < range.mEnd; i++) { + if (breakBefore[i - range.mBegin]) { + range.mEnd = i; + break; + } + } + + // Scan backward + for (i = aOffset; i > range.mBegin; i--) { + if (breakBefore[i - range.mBegin]) { + range.mBegin = i; + break; + } + } + } + return range; +} + +int32_t WordBreaker::NextWord(const char16_t* aText, uint32_t aLen, + uint32_t aPos) { + WordBreakClass c1, c2; + uint32_t cur = aPos; + if (cur == aLen) { + return NS_WORDBREAKER_NEED_MORE_TEXT; + } + c1 = GetClass(aText[cur]); + + for (cur++; cur < aLen; cur++) { + c2 = GetClass(aText[cur]); + if (c2 != c1) { + break; + } + } + + if (kWbClassScriptioContinua == c1) { + // we pass the whole text segment to the complex word breaker to find a + // shorter answer + AutoTArray<uint8_t, 256> breakBefore; + breakBefore.SetLength(aLen - aPos); + NS_GetComplexLineBreaks(aText + aPos, aLen - aPos, breakBefore.Elements()); + uint32_t i = 1; + while (i < cur - aPos && !breakBefore[i]) { + i++; + } + if (i < cur - aPos) { + return aPos + i; + } + } + + if (cur == aLen) { + return
NS_WORDBREAKER_NEED_MORE_TEXT; + } + + MOZ_ASSERT(cur != aPos); + return cur; +} diff --git a/intl/lwbrk/WordBreaker.h b/intl/lwbrk/WordBreaker.h new file mode 100644 index 0000000000..57cb4b18b7 --- /dev/null +++ b/intl/lwbrk/WordBreaker.h @@ -0,0 +1,53 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef mozilla_intl_WordBreaker_h__ +#define mozilla_intl_WordBreaker_h__ + +#include "nscore.h" +#include "nsISupports.h" + +#define NS_WORDBREAKER_NEED_MORE_TEXT -1 + +namespace mozilla { +namespace intl { + +typedef struct { + uint32_t mBegin; + uint32_t mEnd; +} WordRange; + +enum WordBreakClass : uint8_t { + kWbClassSpace = 0, + kWbClassAlphaLetter, + kWbClassPunct, + kWbClassHanLetter, + kWbClassKatakanaLetter, + kWbClassHiraganaLetter, + kWbClassHWKatakanaLetter, + kWbClassScriptioContinua +}; + +class WordBreaker { + public: + NS_INLINE_DECL_REFCOUNTING(WordBreaker) + + static already_AddRefed<WordBreaker> Create(); + + bool BreakInBetween(const char16_t* aText1, uint32_t aTextLen1, + const char16_t* aText2, uint32_t aTextLen2); + WordRange FindWord(const char16_t* aText1, uint32_t aTextLen1, + uint32_t aOffset); + int32_t NextWord(const char16_t* aText, uint32_t aLen, uint32_t aPos); + + static WordBreakClass GetClass(char16_t aChar); + + private: + ~WordBreaker() = default; +}; + +} // namespace intl +} // namespace mozilla + +#endif /* mozilla_intl_WordBreaker_h__ */ diff --git a/intl/lwbrk/crashtests/416721.html b/intl/lwbrk/crashtests/416721.html new file mode 100644 index 0000000000..0a6625ba8a --- /dev/null +++ b/intl/lwbrk/crashtests/416721.html @@ -0,0 +1,11 @@ +<!DOCTYPE html> +<html> + <head> + <title>Testcase for bug 416721</title> + <meta http-equiv="content-type" content="text/html; charset=utf-8"> + 
</head> + <body> + <p>กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛</p> + </body> +</html> + diff --git a/intl/lwbrk/crashtests/crashtests.list b/intl/lwbrk/crashtests/crashtests.list new file mode 100644 index 0000000000..a7cb7a173b --- /dev/null +++ b/intl/lwbrk/crashtests/crashtests.list @@ -0,0 +1 @@ +load 416721.html
diff --git a/intl/lwbrk/gtest/TestLineBreak.cpp b/intl/lwbrk/gtest/TestLineBreak.cpp new file mode 100644 index 0000000000..5c3215c228 --- /dev/null +++ b/intl/lwbrk/gtest/TestLineBreak.cpp @@ -0,0 +1,283 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include <stdio.h> +#include "nsXPCOM.h" +#include "nsISupports.h" +#include "nsServiceManagerUtils.h" +#include "nsString.h" +#include "gtest/gtest.h" + +#include "mozilla/intl/LineBreaker.h" +#include "mozilla/intl/WordBreaker.h" + +static char teng1[] = + // 1 2 3 4 5 6 7 + // 01234567890123456789012345678901234567890123456789012345678901234567890123456789 + "This is a test to test(reasonable) line break. This 0.01123 = 45 x 48."; + +static uint32_t lexp1[] = {4, 7, 9, 14, 17, 34, 39, 40, 41, + 42, 49, 54, 62, 64, 67, 69, 73}; + +static uint32_t wexp1[] = {4, 5, 7, 8, 9, 10, 14, 15, 17, 18, 22, + 23, 33, 34, 35, 39, 43, 48, 49, 50, 54, 55, + 56, 57, 62, 63, 64, 65, 67, 68, 69, 70, 72}; + +static char teng2[] = + // 1 2 3 4 5 6 7 + // 01234567890123456789012345678901234567890123456789012345678901234567890123456789 + "()((reasonab(l)e) line break. 
.01123=45x48."; + +static uint32_t lexp2[] = {17, 22, 23, 30, 44}; + +static uint32_t wexp2[] = {4, 12, 13, 14, 15, 16, 17, 18, 22, + 24, 29, 30, 31, 32, 37, 38, 43}; + +static char teng3[] = + // 1 2 3 4 5 6 7 + // 01234567890123456789012345678901234567890123456789012345678901234567890123456789 + "It's a test to test(ronae ) line break...."; + +static uint32_t lexp3[] = {4, 6, 11, 14, 25, 27, 32, 42}; + +static uint32_t wexp3[] = {2, 3, 4, 5, 6, 7, 11, 12, 14, 15, + 19, 20, 25, 26, 27, 28, 32, 33, 38}; + +static char ruler1[] = + " 1 2 3 4 5 6 7 "; +static char ruler2[] = + "0123456789012345678901234567890123456789012345678901234567890123456789012"; + +bool Check(const char* in, const uint32_t* out, uint32_t outlen, uint32_t i, + uint32_t res[256]) { + bool ok = true; + + if (i != outlen) { + ok = false; + printf("WARNING!!! return size wrong, expect %d but got %d \n", outlen, i); + } + + for (uint32_t j = 0; j < i; j++) { + if (j < outlen) { + if (res[j] != out[j]) { + ok = false; + printf("[%d] expect %d but got %d\n", j, out[j], res[j]); + } + } else { + ok = false; + printf("[%d] additional %d\n", j, res[j]); + } + } + + if (!ok) { + printf("string = \n%s\n", in); + printf("%s\n", ruler1); + printf("%s\n", ruler2); + + printf("Expect = \n"); + for (uint32_t j = 0; j < outlen; j++) { + printf("%d,", out[j]); + } + + printf("\nResult = \n"); + for (uint32_t j = 0; j < i; j++) { + printf("%d,", res[j]); + } + printf("\n"); + } + + return ok; +} + +bool TestASCIILB(mozilla::intl::LineBreaker* lb, const char* in, + const uint32_t* out, uint32_t outlen) { + NS_ConvertASCIItoUTF16 eng1(in); + uint32_t i; + uint32_t res[256]; + int32_t curr; + + for (i = 0, curr = 0; curr != NS_LINEBREAKER_NEED_MORE_TEXT && i < 256; i++) { + curr = lb->Next(eng1.get(), eng1.Length(), curr); + res[i] = curr != NS_LINEBREAKER_NEED_MORE_TEXT ? 
curr : eng1.Length(); + } + + return Check(in, out, outlen, i, res); +} + +bool TestASCIIWB(mozilla::intl::WordBreaker* lb, const char* in, + const uint32_t* out, uint32_t outlen) { + NS_ConvertASCIItoUTF16 eng1(in); + + uint32_t i; + uint32_t res[256]; + int32_t curr = 0; + + for (i = 0, curr = lb->NextWord(eng1.get(), eng1.Length(), curr); + curr != NS_WORDBREAKER_NEED_MORE_TEXT && i < 256; + curr = lb->NextWord(eng1.get(), eng1.Length(), curr), i++) { + res[i] = curr != NS_WORDBREAKER_NEED_MORE_TEXT ? curr : eng1.Length(); + } + + return Check(in, out, outlen, i, res); +} + +TEST(LineBreak, LineBreaker) +{ + RefPtr<mozilla::intl::LineBreaker> t = mozilla::intl::LineBreaker::Create(); + + ASSERT_TRUE(t); + + ASSERT_TRUE(TestASCIILB(t, teng1, lexp1, sizeof(lexp1) / sizeof(uint32_t))); + ASSERT_TRUE(TestASCIILB(t, teng2, lexp2, sizeof(lexp2) / sizeof(uint32_t))); + ASSERT_TRUE(TestASCIILB(t, teng3, lexp3, sizeof(lexp3) / sizeof(uint32_t))); +} + +TEST(LineBreak, WordBreaker) +{ + RefPtr<mozilla::intl::WordBreaker> t = mozilla::intl::WordBreaker::Create(); + ASSERT_TRUE(t); + + ASSERT_TRUE(TestASCIIWB(t, teng1, wexp1, sizeof(wexp1) / sizeof(uint32_t))); + ASSERT_TRUE(TestASCIIWB(t, teng2, wexp2, sizeof(wexp2) / sizeof(uint32_t))); + ASSERT_TRUE(TestASCIIWB(t, teng3, wexp3, sizeof(wexp3) / sizeof(uint32_t))); +} + +// 012345678901234 +static const char wb0[] = "T"; +static const char wb1[] = "h"; +static const char wb2[] = "is is a int"; +static const char wb3[] = "ernationali"; +static const char wb4[] = "zation work."; + +static const char* wb[] = {wb0, wb1, wb2, wb3, wb4}; + +void TestPrintWordWithBreak() { + uint32_t numOfFragment = sizeof(wb) / sizeof(char*); + RefPtr<mozilla::intl::WordBreaker> wbk = mozilla::intl::WordBreaker::Create(); + + nsAutoString result; + + for (uint32_t i = 0; i < numOfFragment; i++) { + NS_ConvertASCIItoUTF16 fragText(wb[i]); + + int32_t cur = 0; + cur = wbk->NextWord(fragText.get(), fragText.Length(), cur); + uint32_t start = 0; + 
for (uint32_t j = 0; cur != NS_WORDBREAKER_NEED_MORE_TEXT; j++) { + result.Append(Substring(fragText, start, cur - start)); + result.Append('^'); + start = (cur >= 0 ? cur : cur - start); + cur = wbk->NextWord(fragText.get(), fragText.Length(), cur); + } + + result.Append(Substring(fragText, fragText.Length() - start)); + + if (i != numOfFragment - 1) { + NS_ConvertASCIItoUTF16 nextFragText(wb[i + 1]); + + bool canBreak = true; + canBreak = wbk->BreakInBetween(fragText.get(), fragText.Length(), + nextFragText.get(), nextFragText.Length()); + if (canBreak) { + result.Append('^'); + } + fragText.Assign(nextFragText); + } + } + ASSERT_STREQ("is^ ^is^ ^a^ ^ is a intzation^ ^work^ation work.", + NS_ConvertUTF16toUTF8(result).get()); +} + +void TestFindWordBreakFromPosition(uint32_t fragN, uint32_t offset, + const char* expected) { + uint32_t numOfFragment = sizeof(wb) / sizeof(char*); + RefPtr<mozilla::intl::WordBreaker> wbk = mozilla::intl::WordBreaker::Create(); + + NS_ConvertASCIItoUTF16 fragText(wb[fragN]); + + mozilla::intl::WordRange res = + wbk->FindWord(fragText.get(), fragText.Length(), offset); + + bool canBreak; + nsAutoString result(Substring(fragText, res.mBegin, res.mEnd - res.mBegin)); + + if ((uint32_t)fragText.Length() == res.mEnd) { + // if we hit the end of the fragment + nsAutoString curFragText = fragText; + for (uint32_t p = fragN + 1; p < numOfFragment; p++) { + NS_ConvertASCIItoUTF16 nextFragText(wb[p]); + canBreak = wbk->BreakInBetween(curFragText.get(), curFragText.Length(), + nextFragText.get(), nextFragText.Length()); + if (canBreak) { + break; + } + mozilla::intl::WordRange r = + wbk->FindWord(nextFragText.get(), nextFragText.Length(), 0); + + result.Append(Substring(nextFragText, r.mBegin, r.mEnd - r.mBegin)); + + if ((uint32_t)nextFragText.Length() != r.mEnd) { + break; + } + nextFragText.Assign(curFragText); + } + } + + if (0 == res.mBegin) { + // if we hit the beginning of the fragment + nsAutoString curFragText = fragText; + for 
(uint32_t p = fragN; p > 0; p--) { + NS_ConvertASCIItoUTF16 prevFragText(wb[p - 1]); + canBreak = wbk->BreakInBetween(prevFragText.get(), prevFragText.Length(), + curFragText.get(), curFragText.Length()); + if (canBreak) { + break; + } + mozilla::intl::WordRange r = wbk->FindWord( + prevFragText.get(), prevFragText.Length(), prevFragText.Length()); + + result.Insert(Substring(prevFragText, r.mBegin, r.mEnd - r.mBegin), 0); + + if (0 != r.mBegin) { + break; + } + prevFragText.Assign(curFragText); + } + } + + ASSERT_STREQ(expected, NS_ConvertUTF16toUTF8(result).get()) + << "FindWordBreakFromPosition(" << fragN << ", " << offset << ")"; +} + +void TestNextWordBreakWithComplexLanguage() { + RefPtr<mozilla::intl::WordBreaker> wbk = mozilla::intl::WordBreaker::Create(); + nsString fragText(u"\u0e40\u0e1b\u0e47\u0e19\u0e19\u0e31\u0e01"); + + int32_t offset = 0; + while (offset != NS_WORDBREAKER_NEED_MORE_TEXT) { + int32_t newOffset = + wbk->NextWord(fragText.get(), fragText.Length(), offset); + ASSERT_NE(offset, newOffset); + offset = newOffset; + } + ASSERT_TRUE(true); +} + +TEST(LineBreak, WordBreakUsage) +{ + TestPrintWordWithBreak(); + TestFindWordBreakFromPosition(0, 0, "This"); + TestFindWordBreakFromPosition(1, 0, "his"); + TestFindWordBreakFromPosition(2, 0, "is"); + TestFindWordBreakFromPosition(2, 1, "is"); + TestFindWordBreakFromPosition(2, 9, " "); + TestFindWordBreakFromPosition(2, 10, "internationalization"); + TestFindWordBreakFromPosition(3, 4, "ernationalization"); + TestFindWordBreakFromPosition(3, 8, "ernationalization"); + TestFindWordBreakFromPosition(4, 6, " "); + TestFindWordBreakFromPosition(4, 7, "work"); + TestNextWordBreakWithComplexLanguage(); +} diff --git a/intl/lwbrk/gtest/moz.build b/intl/lwbrk/gtest/moz.build new file mode 100644 index 0000000000..c9fbab8e76 --- /dev/null +++ b/intl/lwbrk/gtest/moz.build @@ -0,0 +1,11 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is 
subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +UNIFIED_SOURCES += [ + "TestLineBreak.cpp", +] + +FINAL_LIBRARY = "xul-gtest" diff --git a/intl/lwbrk/jisx4051class.h b/intl/lwbrk/jisx4051class.h new file mode 100644 index 0000000000..3140cf63a7 --- /dev/null +++ b/intl/lwbrk/jisx4051class.h @@ -0,0 +1,217 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +/* + DO NOT EDIT THIS DOCUMENT !!! THIS DOCUMENT IS GENERATED BY + mozilla/intl/lwbrk/tools/anzx4051.pl + */ +static const uint32_t gLBClass00[32] = { + 0x55555555, // U+0000 - U+0007 + 0x55555555, // U+0008 - U+000F + 0x55555555, // U+0010 - U+0017 + 0x55555555, // U+0018 - U+001F + 0x7AABAAA5, // U+0020 - U+0027 + 0x7A7AAAA9, // U+0028 - U+002F + 0x66666666, // U+0030 - U+0037 + 0xAAA9AA66, // U+0038 - U+003F + 0x77777777, // U+0040 - U+0047 + 0x77777777, // U+0048 - U+004F + 0x77777777, // U+0050 - U+0057 + 0x77AA9777, // U+0058 - U+005F + 0x77777777, // U+0060 - U+0067 + 0x77777777, // U+0068 - U+006F + 0x77777777, // U+0070 - U+0077 + 0x7AAA9777, // U+0078 - U+007F + 0x77777777, // U+0080 - U+0087 + 0x77777777, // U+0088 - U+008F + 0x77777777, // U+0090 - U+0097 + 0x77777777, // U+0098 - U+009F + 0xAA9A9AAB, // U+00A0 - U+00A7 + 0x77A9777A, // U+00A8 - U+00AF + 0xAAAAAAAA, // U+00B0 - U+00B7 + 0xAAAAAAAA, // U+00B8 - U+00BF + 0x77777777, // U+00C0 - U+00C7 + 0x77777777, // U+00C8 - U+00CF + 0x77777777, // U+00D0 - U+00D7 + 0x77777777, // U+00D8 - U+00DF + 0x77777777, // U+00E0 - U+00E7 + 0x77777777, // U+00E8 - U+00EF + 0xA7777777, // U+00F0 - U+00F7 + 0x77777777, // U+00F8 - U+00FF +}; + +static const uint32_t gLBClass20[32] = { + 0xB5555555, // 
U+2000 - U+2007 + 0x77775555, // U+2008 - U+200F + 0x777277B7, // U+2010 - U+2017 + 0x77A777A7, // U+2018 - U+201F + 0xA1117777, // U+2020 - U+2027 + 0xB7777777, // U+2028 - U+202F + 0x77744444, // U+2030 - U+2037 + 0x7A115107, // U+2038 - U+203F + 0x11017777, // U+2040 - U+2047 + 0x77777711, // U+2048 - U+204F + 0x77777777, // U+2050 - U+2057 + 0x57777777, // U+2058 - U+205F + 0x7777777B, // U+2060 - U+2067 + 0x77777777, // U+2068 - U+206F + 0x77777777, // U+2070 - U+2077 + 0x77777777, // U+2078 - U+207F + 0x77777777, // U+2080 - U+2087 + 0x77777777, // U+2088 - U+208F + 0x77777777, // U+2090 - U+2097 + 0x77777777, // U+2098 - U+209F + 0x77777777, // U+20A0 - U+20A7 + 0x77777777, // U+20A8 - U+20AF + 0x77777777, // U+20B0 - U+20B7 + 0x77777777, // U+20B8 - U+20BF + 0x77777777, // U+20C0 - U+20C7 + 0x77777777, // U+20C8 - U+20CF + 0x77777777, // U+20D0 - U+20D7 + 0x77777777, // U+20D8 - U+20DF + 0x77777777, // U+20E0 - U+20E7 + 0x77777777, // U+20E8 - U+20EF + 0x77777777, // U+20F0 - U+20F7 + 0x77777777, // U+20F8 - U+20FF +}; + +static const uint32_t gLBClass21[32] = { + 0x77777777, // U+2100 - U+2107 + 0x77777777, // U+2108 - U+210F + 0x73777777, // U+2110 - U+2117 + 0x77777777, // U+2118 - U+211F + 0x77777777, // U+2120 - U+2127 + 0x77777777, // U+2128 - U+212F + 0x77777777, // U+2130 - U+2137 + 0x77777777, // U+2138 - U+213F + 0x77777777, // U+2140 - U+2147 + 0x77777777, // U+2148 - U+214F + 0x77777777, // U+2150 - U+2157 + 0x77777777, // U+2158 - U+215F + 0x55555555, // U+2160 - U+2167 + 0x55555555, // U+2168 - U+216F + 0x55555555, // U+2170 - U+2177 + 0x55555555, // U+2178 - U+217F + 0x77777777, // U+2180 - U+2187 + 0x77777777, // U+2188 - U+218F + 0x77777777, // U+2190 - U+2197 + 0x77777777, // U+2198 - U+219F + 0x77777777, // U+21A0 - U+21A7 + 0x77777777, // U+21A8 - U+21AF + 0x77777777, // U+21B0 - U+21B7 + 0x77777777, // U+21B8 - U+21BF + 0x77777777, // U+21C0 - U+21C7 + 0x77777777, // U+21C8 - U+21CF + 0x77777777, // U+21D0 - U+21D7 + 0x77777777, // 
U+21D8 - U+21DF + 0x77777777, // U+21E0 - U+21E7 + 0x77777777, // U+21E8 - U+21EF + 0x77777777, // U+21F0 - U+21F7 + 0x77777777, // U+21F8 - U+21FF +}; + +static const uint32_t gLBClass30[32] = { + 0x55155115, // U+3000 - U+3007 + 0x10101010, // U+3008 - U+300F + 0x10105510, // U+3010 - U+3017 + 0x11011010, // U+3018 - U+301F + 0x55555555, // U+3020 - U+3027 + 0x55555555, // U+3028 - U+302F + 0x55555555, // U+3030 - U+3037 + 0x55555555, // U+3038 - U+303F + 0x15151515, // U+3040 - U+3047 + 0x55555515, // U+3048 - U+304F + 0x55555555, // U+3050 - U+3057 + 0x55555555, // U+3058 - U+305F + 0x55551555, // U+3060 - U+3067 + 0x55555555, // U+3068 - U+306F + 0x55555555, // U+3070 - U+3077 + 0x55555555, // U+3078 - U+307F + 0x15151555, // U+3080 - U+3087 + 0x51555555, // U+3088 - U+308F + 0x55555555, // U+3090 - U+3097 + 0x51111115, // U+3098 - U+309F + 0x15151515, // U+30A0 - U+30A7 + 0x55555515, // U+30A8 - U+30AF + 0x55555555, // U+30B0 - U+30B7 + 0x55555555, // U+30B8 - U+30BF + 0x55551555, // U+30C0 - U+30C7 + 0x55555555, // U+30C8 - U+30CF + 0x55555555, // U+30D0 - U+30D7 + 0x55555555, // U+30D8 - U+30DF + 0x15151555, // U+30E0 - U+30E7 + 0x51555555, // U+30E8 - U+30EF + 0x51155555, // U+30F0 - U+30F7 + 0x51111555, // U+30F8 - U+30FF +}; + +static const uint32_t gLBClass0E[32] = { + 0x88888888, // U+0E00 - U+0E07 + 0x88888888, // U+0E08 - U+0E0F + 0x88888888, // U+0E10 - U+0E17 + 0x88888888, // U+0E18 - U+0E1F + 0x88888888, // U+0E20 - U+0E27 + 0x18888888, // U+0E28 - U+0E2F + 0x88888888, // U+0E30 - U+0E37 + 0x08888888, // U+0E38 - U+0E3F + 0x81888888, // U+0E40 - U+0E47 + 0x78888888, // U+0E48 - U+0E4F + 0x66666666, // U+0E50 - U+0E57 + 0x88881166, // U+0E58 - U+0E5F + 0x88888888, // U+0E60 - U+0E67 + 0x88888888, // U+0E68 - U+0E6F + 0x88888888, // U+0E70 - U+0E77 + 0x88888888, // U+0E78 - U+0E7F + 0x88888888, // U+0E80 - U+0E87 + 0x88888888, // U+0E88 - U+0E8F + 0x88888888, // U+0E90 - U+0E97 + 0x88888888, // U+0E98 - U+0E9F + 0x88888888, // U+0EA0 - U+0EA7 + 
0x18888888, // U+0EA8 - U+0EAF + 0x88888888, // U+0EB0 - U+0EB7 + 0x88888888, // U+0EB8 - U+0EBF + 0x81888888, // U+0EC0 - U+0EC7 + 0x88888888, // U+0EC8 - U+0ECF + 0x66666666, // U+0ED0 - U+0ED7 + 0x88888866, // U+0ED8 - U+0EDF + 0x88888888, // U+0EE0 - U+0EE7 + 0x88888888, // U+0EE8 - U+0EEF + 0x88888888, // U+0EF0 - U+0EF7 + 0x88888888, // U+0EF8 - U+0EFF +}; + +static const uint32_t gLBClass17[32] = { + 0x77777777, // U+1700 - U+1707 + 0x77777777, // U+1708 - U+170F + 0x77777777, // U+1710 - U+1717 + 0x77777777, // U+1718 - U+171F + 0x77777777, // U+1720 - U+1727 + 0x77777777, // U+1728 - U+172F + 0x70077777, // U+1730 - U+1737 + 0x77777777, // U+1738 - U+173F + 0x77777777, // U+1740 - U+1747 + 0x77777777, // U+1748 - U+174F + 0x77777777, // U+1750 - U+1757 + 0x77777777, // U+1758 - U+175F + 0x77777777, // U+1760 - U+1767 + 0x77777777, // U+1768 - U+176F + 0x77777777, // U+1770 - U+1777 + 0x77777777, // U+1778 - U+177F + 0x88888888, // U+1780 - U+1787 + 0x88888888, // U+1788 - U+178F + 0x88888888, // U+1790 - U+1797 + 0x88888888, // U+1798 - U+179F + 0x88888888, // U+17A0 - U+17A7 + 0x88888888, // U+17A8 - U+17AF + 0x88888888, // U+17B0 - U+17B7 + 0x88888888, // U+17B8 - U+17BF + 0x88888888, // U+17C0 - U+17C7 + 0x88888888, // U+17C8 - U+17CF + 0x88118888, // U+17D0 - U+17D7 + 0x77888181, // U+17D8 - U+17DF + 0x88888888, // U+17E0 - U+17E7 + 0x77777788, // U+17E8 - U+17EF + 0x88888888, // U+17F0 - U+17F7 + 0x77777788, // U+17F8 - U+17FF +}; diff --git a/intl/lwbrk/jisx4051pairtable.txt b/intl/lwbrk/jisx4051pairtable.txt new file mode 100644 index 0000000000..2bae1b18fe --- /dev/null +++ b/intl/lwbrk/jisx4051pairtable.txt @@ -0,0 +1,286 @@ + + + +/* + + Simplification of Pair Table in JIS X 4051 + + 1. The Origion Table - in 4.1.3 + + In JIS x 4051. 
The pair table is defined as below + + Class of + Leading Class of Trailing Char Class + Char + + 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20 + * # * # + 1 X X X X X X X X X X X X X X X X X X X X X E + 2 X X X X X X + 3 X X X X X X + 4 X X X X X X + 5 X X X X X X + 6 X X X X X X + 7 X X X X X X X + 8 X X X X X X E + 9 X X X X X X + 10 X X X X X X + 11 X X X X X X + 12 X X X X X X + 13 X X X X X X X + 14 X X X X X X X + 15 X X X X X X X X X + 16 X X X X X X X X + 17 X X X X X E + 18 X X X X X X X X X + 19 X E E E E E X X X X X X X X X X X X E X E E + 20 X X X X X E + + * Same Char + # Other Char + + 2. Simplified by remove the class which we do not care + + However, since we do not care about class 13(Subscript), 14(Ruby), + 19(split line note begin quote), and 20(split line note end quote) + we can simplify this par table into the following + + Class of + Leading Class of Trailing Char Class + Char + + 1 2 3 4 5 6 7 8 9 10 11 12 15 16 17 18 + + 1 X X X X X X X X X X X X X X X X + 2 X X X X X + 3 X X X X X + 4 X X X X X + 5 X X X X X + 6 X X X X X + 7 X X X X X X + 8 X X X X X X + 9 X X X X X + 10 X X X X X + 11 X X X X X + 12 X X X X X + 15 X X X X X X X X + 16 X X X X X X X + 17 X X X X X + 18 X X X X X X X X + + 3. Simplified by merged classes + + After the 2 simplification, the pair table have some duplication + a. class 2, 3, 4, 5, 6, are the same- we can merged them + b. class 10, 11, 12, 17 are the same- we can merged them + + + Class of + Leading Class of Trailing Char Class + Char + + 1 [a] 7 8 9 [b]15 16 18 + + 1 X X X X X X X X X + [a] X + 7 X X + 8 X X + 9 X + [b] X + 15 X X X X + 16 X X X + 18 X X X X + + + 4. 
// True when `u` is an ASCII decimal digit, i.e. U+0030 ('0') through
// U+0039 ('9') inclusive.
//
// The bounds used to be written with both comparisons reversed
// (0x0030 >= u && u >= 0x0039), a condition no value can satisfy, so the
// "digit , digit" / "digit . digit" special case in GetClass could never
// select class 15.
//
// NOTE: `u` is evaluated twice; do not pass an expression with side effects.
#define X4051_IS_DIGIT(u) ((0x0030 <= (u)) && ((u) <= 0x0039))
YYYY(kJISx4051Cls_1 , 0x201C), + YYYY(kJISx4051Cls_1 , 0x201F), + YYYY(kJISx4051Cls_1 , 0x3008), + YYYY(kJISx4051Cls_1 , 0x300A), + YYYY(kJISx4051Cls_1 , 0x300C), + YYYY(kJISx4051Cls_1 , 0x300E), + YYYY(kJISx4051Cls_1 , 0x3010), + YYYY(kJISx4051Cls_1 , 0x3014), + YYYY(kJISx4051Cls_1 , 0x3016), + YYYY(kJISx4051Cls_1 , 0x3018), + YYYY(kJISx4051Cls_1 , 0x301A), + YYYY(kJISx4051Cls_1 , 0x301D), + + // Table 3 + YYYY(kJISx4051Cls_2 , 0x0029), + YYYY(kJISx4051Cls_2 , 0x002C), + YYYY(kJISx4051Cls_2 , 0x005D), + YYYY(kJISx4051Cls_2 , 0x007D), + YYYY(kJISx4051Cls_2 , 0x2019), + YYYY(kJISx4051Cls_2 , 0x201A), + YYYY(kJISx4051Cls_2 , 0x201D), + YYYY(kJISx4051Cls_2 , 0x201E), + YYYY(kJISx4051Cls_2 , 0x3001), + YYYY(kJISx4051Cls_2 , 0x3009), + YYYY(kJISx4051Cls_2 , 0x300B), + YYYY(kJISx4051Cls_2 , 0x300D), + YYYY(kJISx4051Cls_2 , 0x300F), + YYYY(kJISx4051Cls_2 , 0x3011), + YYYY(kJISx4051Cls_2 , 0x3015), + YYYY(kJISx4051Cls_2 , 0x3017), + YYYY(kJISx4051Cls_2 , 0x3019), + YYYY(kJISx4051Cls_2 , 0x301B), + YYYY(kJISx4051Cls_2 , 0x301E), + YYYY(kJISx4051Cls_2 , 0x301F), + + // Table 4 + YYYY(kJISx4051Cls_3 , 0x203C), + YYYY(kJISx4051Cls_3 , 0x2044), + YYYY(kJISx4051Cls_3 , 0x301C), + YYYY(kJISx4051Cls_3 , 0x3041), + YYYY(kJISx4051Cls_3 , 0x3043), + YYYY(kJISx4051Cls_3 , 0x3045), + YYYY(kJISx4051Cls_3 , 0x3047), + YYYY(kJISx4051Cls_3 , 0x3049), + YYYY(kJISx4051Cls_3 , 0x3063), + YYYY(kJISx4051Cls_3 , 0x3083), + YYYY(kJISx4051Cls_3 , 0x3085), + YYYY(kJISx4051Cls_3 , 0x3087), + YYYY(kJISx4051Cls_3 , 0x308E), + YYYY(kJISx4051Cls_3 , 0x309D), + YYYY(kJISx4051Cls_3 , 0x309E), + YYYY(kJISx4051Cls_3 , 0x30A1), + YYYY(kJISx4051Cls_3 , 0x30A3), + YYYY(kJISx4051Cls_3 , 0x30A5), + YYYY(kJISx4051Cls_3 , 0x30A7), + YYYY(kJISx4051Cls_3 , 0x30A9), + YYYY(kJISx4051Cls_3 , 0x30C3), + YYYY(kJISx4051Cls_3 , 0x30E3), + YYYY(kJISx4051Cls_3 , 0x30E5), + YYYY(kJISx4051Cls_3 , 0x30E7), + YYYY(kJISx4051Cls_3 , 0x30EE), + YYYY(kJISx4051Cls_3 , 0x30F5), + YYYY(kJISx4051Cls_3 , 0x30F6), + YYYY(kJISx4051Cls_3 , 
0x30FC), + YYYY(kJISx4051Cls_3 , 0x30FD), + YYYY(kJISx4051Cls_3 , 0x30FE), + + // Table 5 + YYYY(kJISx4051Cls_4 , 0x0021), + YYYY(kJISx4051Cls_4 , 0x003F), + + // Table 6 + YYYY(kJISx4051Cls_5 , 0x003A), + YYYY(kJISx4051Cls_5 , 0x003B), + YYYY(kJISx4051Cls_5 , 0x30FB), + + // Table 7 + YYYY(kJISx4051Cls_6 , 0x002E), + YYYY(kJISx4051Cls_6 , 0x3002), + + // Table 8 + YYYY(kJISx4051Cls_7 , 0x2014), + YYYY(kJISx4051Cls_7 , 0x2024), + YYYY(kJISx4051Cls_7 , 0x2025), + YYYY(kJISx4051Cls_7 , 0x2026), + + // Table 9 + YYYY(kJISx4051Cls_8 , 0x0024), + YYYY(kJISx4051Cls_8 , 0x00A3), + YYYY(kJISx4051Cls_8 , 0x00A5), + YYYY(kJISx4051Cls_8 , 0x2116), + + // Table 10 + YYYY(kJISx4051Cls_9 , 0x0025), + YYYY(kJISx4051Cls_9 , 0x00A2), + YYYY(kJISx4051Cls_9 , 0x00B0), + YYYY(kJISx4051Cls_9 , 0x2030), + YYYY(kJISx4051Cls_9 , 0x2031), + YYYY(kJISx4051Cls_9 , 0x2032), + YYYY(kJISx4051Cls_9 , 0x2033), + + // Table 1 + YYYY(kJISx4051Cls_10, 0x3000), + + // Table 1 + ZZZZ(kJISx4051Cls_11, 0x3000), + + + + diff --git a/intl/lwbrk/moz.build b/intl/lwbrk/moz.build new file mode 100644 index 0000000000..b47a49e279 --- /dev/null +++ b/intl/lwbrk/moz.build @@ -0,0 +1,40 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +TEST_DIRS += ["gtest"] + +EXPORTS.mozilla.intl += [ + "LineBreaker.h", + "WordBreaker.h", +] + +UNIFIED_SOURCES += [ + "LineBreaker.cpp", + "WordBreaker.cpp", +] + +if CONFIG["MOZ_WIDGET_TOOLKIT"] == "gtk": + SOURCES += [ + "nsPangoBreaker.cpp", + ] + CXXFLAGS += CONFIG["MOZ_PANGO_CFLAGS"] +elif CONFIG["MOZ_WIDGET_TOOLKIT"] == "windows": + SOURCES += [ + "nsUniscribeBreaker.cpp", + ] +elif CONFIG["MOZ_WIDGET_TOOLKIT"] == "cocoa": + UNIFIED_SOURCES += [ + "nsCarbonBreaker.cpp", + ] +else: + SOURCES += [ + "nsRuleBreaker.cpp", + ] + SOURCES += [ + "rulebrk.c", + ] + +FINAL_LIBRARY = "xul" diff --git a/intl/lwbrk/nsCarbonBreaker.cpp b/intl/lwbrk/nsCarbonBreaker.cpp new file mode 100644 index 0000000000..d1d81b2578 --- /dev/null +++ b/intl/lwbrk/nsCarbonBreaker.cpp @@ -0,0 +1,43 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include <CoreFoundation/CoreFoundation.h> +#include <stdint.h> +#include "nsDebug.h" +#include "nscore.h" + +void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength, + uint8_t* aBreakBefore) { + NS_ASSERTION(aText, "aText shouldn't be null"); + + memset(aBreakBefore, 0, aLength * sizeof(uint8_t)); + + CFStringRef str = ::CFStringCreateWithCharactersNoCopy( + kCFAllocatorDefault, reinterpret_cast<const UniChar*>(aText), aLength, + kCFAllocatorNull); + if (!str) { + return; + } + + CFStringTokenizerRef st = ::CFStringTokenizerCreate( + kCFAllocatorDefault, str, ::CFRangeMake(0, aLength), + kCFStringTokenizerUnitLineBreak, nullptr); + if (!st) { + ::CFRelease(str); + return; + } + + CFStringTokenizerTokenType tt = ::CFStringTokenizerAdvanceToNextToken(st); + while (tt != kCFStringTokenizerTokenNone) { + CFRange r = ::CFStringTokenizerGetCurrentTokenRange(st); + if (r.location != 0) { // Ignore leading edge + aBreakBefore[r.location] = true; + } + tt = CFStringTokenizerAdvanceToNextToken(st); + } + + ::CFRelease(st); + ::CFRelease(str); +} diff --git a/intl/lwbrk/nsComplexBreaker.h b/intl/lwbrk/nsComplexBreaker.h new file mode 100644 index 0000000000..0b508a4645 --- /dev/null +++ b/intl/lwbrk/nsComplexBreaker.h @@ -0,0 +1,18 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef nsComplexBreaker_h__ +#define nsComplexBreaker_h__ + +#include "nsString.h" + +/** + * Find line break opportunities in aText[] of aLength characters, + * filling boolean values indicating line break opportunities for + * corresponding charactersin aBreakBefore[] on return. 
+ */ +void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength, + uint8_t* aBreakBefore); + +#endif /* nsComplexBreaker_h__ */ diff --git a/intl/lwbrk/nsLWBrkCIID.h b/intl/lwbrk/nsLWBrkCIID.h new file mode 100644 index 0000000000..b612155ef0 --- /dev/null +++ b/intl/lwbrk/nsLWBrkCIID.h @@ -0,0 +1,28 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef nsLWBrkCIID_h__ +#define nsLWBrkCIID_h__ + +// {2BF64764-997F-450D-AF96-3028D1A902B0} +#define NS_LBRK_CID \ + { \ + 0x2bf64764, 0x997f, 0x450d, { \ + 0xaf, 0x96, 0x30, 0x28, 0xd1, 0xa9, 0x2, 0xb0 \ + } \ + } + +#define NS_LBRK_CONTRACTID "@mozilla.org/intl/lbrk;1" + +// {2BF64765-997F-450D-AF96-3028D1A902B0} +#define NS_WBRK_CID \ + { \ + 0x2bf64765, 0x997f, 0x450d, { \ + 0xaf, 0x96, 0x30, 0x28, 0xd1, 0xa9, 0x2, 0xb0 \ + } \ + } + +#define NS_WBRK_CONTRACTID "@mozilla.org/intl/wbrk;1" + +#endif diff --git a/intl/lwbrk/nsPangoBreaker.cpp b/intl/lwbrk/nsPangoBreaker.cpp new file mode 100644 index 0000000000..ca3d3d54c9 --- /dev/null +++ b/intl/lwbrk/nsPangoBreaker.cpp @@ -0,0 +1,58 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "nsComplexBreaker.h" + +#include <pango/pango-break.h> +#include "nsUTF8Utils.h" +#include "nsString.h" +#include "nsTArray.h" + +void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength, + uint8_t* aBreakBefore) { + NS_ASSERTION(aText, "aText shouldn't be null"); + + memset(aBreakBefore, false, aLength * sizeof(uint8_t)); + + AutoTArray<PangoLogAttr, 2000> attrBuffer; + // XXX(Bug 1631371) Check if this should use a fallible operation as it + // pretended earlier. + attrBuffer.AppendElements(aLength + 1); + + NS_ConvertUTF16toUTF8 aUTF8(aText, aLength); + + const gchar* p = aUTF8.Data(); + const gchar* end = p + aUTF8.Length(); + uint32_t u16Offset = 0; + + static PangoLanguage* language = pango_language_from_string("en"); + + while (p < end) { + PangoLogAttr* attr = attrBuffer.Elements(); + pango_get_log_attrs(p, end - p, -1, language, attr, attrBuffer.Length()); + + while (p < end) { + aBreakBefore[u16Offset] = attr->is_line_break; + if (NS_IS_LOW_SURROGATE(aText[u16Offset])) + aBreakBefore[++u16Offset] = false; // Skip high surrogate + ++u16Offset; + + // We're iterating over text obtained from NS_ConvertUTF16toUTF8, + // so we know we have valid UTF-8 and don't need to check for + // errors. + uint32_t ch = UTF8CharEnumerator::NextChar(&p, end); + ++attr; + + if (!ch) { + // pango_break (pango 1.16.2) only analyses text before the + // first NUL (but sets one extra attr). Workaround loop to call + // pango_break again to analyse after the NUL is done somewhere else + // (gfx/thebes/gfxFontconfigFonts.cpp: SetupClusterBoundaries()). + // So, we do the same here for pango_get_log_attrs. 
+ break; + } + } + } +} diff --git a/intl/lwbrk/nsRuleBreaker.cpp b/intl/lwbrk/nsRuleBreaker.cpp new file mode 100644 index 0000000000..4c1c9aff90 --- /dev/null +++ b/intl/lwbrk/nsRuleBreaker.cpp @@ -0,0 +1,17 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsComplexBreaker.h" + +#define TH_UNICODE +#include "rulebrk.h" + +void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength, + uint8_t* aBreakBefore) { + NS_ASSERTION(aText, "aText shouldn't be null"); + + for (uint32_t i = 0; i < aLength; i++) + aBreakBefore[i] = (0 == TrbWordBreakPos(aText, i, aText + i, aLength - i)); +} diff --git a/intl/lwbrk/nsUniscribeBreaker.cpp b/intl/lwbrk/nsUniscribeBreaker.cpp new file mode 100644 index 0000000000..503b756b61 --- /dev/null +++ b/intl/lwbrk/nsUniscribeBreaker.cpp @@ -0,0 +1,60 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "nsComplexBreaker.h" + +#include <windows.h> + +#include <usp10.h> + +#include "nsUTF8Utils.h" +#include "nsString.h" +#include "nsTArray.h" + +void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength, + uint8_t* aBreakBefore) { + NS_ASSERTION(aText, "aText shouldn't be null"); + + int outItems = 0; + HRESULT result; + AutoTArray<SCRIPT_ITEM, 64> items; + char16ptr_t text = aText; + + memset(aBreakBefore, false, aLength); + + items.AppendElements(64); + + do { + result = ScriptItemize(text, aLength, items.Length(), nullptr, nullptr, + items.Elements(), &outItems); + + if (result == E_OUTOFMEMORY) { + // XXX(Bug 1631371) Check if this should use a fallible operation as it + // pretended earlier. + items.AppendElements(items.Length()); + } + } while (result == E_OUTOFMEMORY); + + for (int iItem = 0; iItem < outItems; ++iItem) { + uint32_t endOffset = + (iItem + 1 == outItems ? aLength : items[iItem + 1].iCharPos); + uint32_t startOffset = items[iItem].iCharPos; + AutoTArray<SCRIPT_LOGATTR, 64> sla; + + // XXX(Bug 1631371) Check if this should use a fallible operation as it + // pretended earlier. + sla.AppendElements(endOffset - startOffset); + + if (ScriptBreak(text + startOffset, endOffset - startOffset, + &items[iItem].a, sla.Elements()) < 0) + return; + + // We don't want to set a potential break position at the start of text; + // that's the responsibility of a higher level. + for (uint32_t j = startOffset ? 0 : 1; j + startOffset < endOffset; ++j) { + aBreakBefore[j + startOffset] = sla[j].fSoftBreak; + } + } +} diff --git a/intl/lwbrk/rulebrk.c b/intl/lwbrk/rulebrk.c new file mode 100644 index 0000000000..d7574b929f --- /dev/null +++ b/intl/lwbrk/rulebrk.c @@ -0,0 +1,388 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ +#define TH_UNICODE + +#include <stdlib.h> +#include <stdint.h> +#include <assert.h> +#include "th_char.h" +#define th_isalpha(c) (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z')) +#define th_isspace(c) ((c) == ' ' || (c) == '\t') + +/* +///////////////////////////////////////////////// +// Thai character type array +*/ + +typedef unsigned short twb_t; +extern const twb_t _TwbType[0x100 - 0xa0]; + +/* +// bit definition +*/ + +#define VRS 0x0001 +#define VRE 0x0002 +#define VRX 0x0004 + +#define VRA 0x0008 + +#define VLA 0x0010 +#define VLO 0x0020 +#define VLI 0x0040 + +#define VC 0x0080 + +#define CC 0x0100 +#define CS 0x0200 + +#define C2 0x0400 +#define CHB 0x0800 +#define CHE 0x1000 + +#define MT 0x2000 +/* +//_#define me 0x2000 +*/ +#define M 0x4000 + +#define T 0x8000 + +#define VL (VLA | VLO | VLI) +#define VR (VRS | VRE | VRX) +#define NE (VL | VRS) +#define NB (VR | M) +#define V (VL | VR) +#define CX (CC | CS) +#define C (CX | VC) +#define A (C | V | M) + +#define twbtype(c) (_TwbType[th_zcode(c)]) + +#ifndef TRUE +# define TRUE 1 +# define FALSE 0 +#endif +#define RETURN(b) return (b) + +/* +///////////////////////////////////////////////// +*/ + +int TrbWordBreakPos(const th_char* pstr, int left, const th_char* rstr, + int right) +/* const ThBreakIterator *it, const th_char **p)*/ +{ + /* + //int left, right; + //const th_char *s = *p; + */ + const th_char* lstr = pstr + left; + th_char _c[6]; + twb_t _t[6]; +#define c(i) (_c[(i) + 3]) +#define t(i) (_t[(i) + 3]) + int i, j; + + /* + //left = s - it->begin; + */ + if (left < 0) return -1; + /* + //right = (it->end == NULL) ? 
4 : it->begin - s; + */ + if (right < 1) return -1; + + /* + // get c(0), t(0) + */ + c(0) = rstr[0]; /* may be '\0' */ + if (!th_isthai(c(0))) return -1; + t(0) = twbtype(c(0)); + if (!(t(0) & A)) return -1; + + /* + // get c(-1), t(-1) + */ + if (left >= 1) { + c(-1) = lstr[-1]; + if (!th_isthai(c(-1))) return 0; + t(-1) = twbtype(c(-1)); + if (!(t(-1) & A)) return 0; /* handle punctuation marks here */ + } else { + c(-1) = 0; + t(-1) = 0; + } + + /* + // get c(1..2), t(1..2) + */ + for (i = 1; i <= 2; i++) { + if (i >= right) { + c(i) = 0; + t(i) = 0; + } else { + c(i) = rstr[i]; /* may be '\0'; */ + if (!th_isthai(c(i))) + right = i--; + else { + t(i) = twbtype(c(i)); + if (!(t(i) & A)) right = i--; + } + } + } + /* + // get c(-2..-3), t(-2..-3) + */ + for (i = -2, j = -2; i >= -3; j--) { + if (j < -left) { + c(i) = 0; + t(i) = 0; + i--; + } else { + c(i) = lstr[j]; + if (!th_isthai(c(i))) + left = 0; + else { + t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0); + if (!(t(i) & A)) + left = 0; + else { + if ((t(i + 1) & MT) && ((t(i) & VR) || (t(i + 2) & VR))) { + c(i + 1) = c(i); + t(i + 1) = t(i); + } else + i--; + } + } + } + } + + /* + // prohibit the unlikely + */ + if ((t(-1) & C) && (t(0) & C)) { + if ((t(-1) & CHE) || (t(0) & CHB)) return -1; + } + /* + // special case : vlao, C/ sara_a|aa, !sara_a + */ + if ((t(-3) & (VLA | VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) && + (c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA)) + return 0; + + /* + // prohibit break + */ + if (t(0) & NB) return -1; + if (t(-1) & NE) return -1; + + /* + // apply 100% rules + */ + if (t(-1) & VRE) { + if (c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0; + return -1; /* usually too short syllable, part of word */ + } + + if (t(-2) & VRE) return -1; + + if ((t(0) & C) && (t(1) & (VR | MT)) && + (c(2) != TH_THANTHAKHAT)) { /*?C, NB */ + if ((t(-1) & (VRS | VRX)) && c(1) == TH_SARA_I) return -1; /* exception */ + if (t(-1) & (V | M)) return 0; /* !C/ C, NB */ + if (t(-2) & VRS) 
return 0; /* VRS, C / C, NB */ + if (!(t(0) & C2) && c(1) == TH_SARA_I) { /* / !C2 or /c, sara_i */ + if (t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */ + if (t(-2) & VC) return 0; /* VC, C / C, NB ? 100% */ + } + } + if ((t(-1) & VRX) && (t(0) & CC)) return 0; /* VRX/ CC */ + if ((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V | M))) + return 0; /* VRS, C/ !C */ + + if ((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) { + if ((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */ + if ((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */ + } + /* + // apply 90% rules + */ + if (t(0) & VL) return 0; + if (t(1) & VL) return -1; + if (c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING) + return 0; + + /* + //return -1; + // apply 80% rules + */ + if (t(0) & CHE) { + if ((t(-2) & VRS) && (t(-1) & C)) return 0; /* VRS, C/ CHE */ + /*if(t(-1) & VRX) return 0; // VRX/ CHE */ + if (t(-1) & VC) return 0; /* VC/ CHE */ + } + if (t(-1) & CHB) { + if ((t(0) & C) && (t(1) & VR)) return 0; /* CHB/ CC, VR */ + if (t(0) & VC) return 0; /* CHB/ VC */ + } + + if ((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */ + if (t(-2) & VLI) + return 0; /* VLI,C/C,VR .*/ + else { /* vlao, C ? 
C , VR */ + if (c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */ + if (t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */ + if (!(t(1) & VRA)) return 0; /* VLA, C/ C, !vca */ + } + } + /* C,MT,C */ + if ((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1; + + return -1; +} + +int TrbFollowing(const th_char* begin, int length, int offset) +/* +//(ThBreakIterator *this, int offset) +*/ +{ + const th_char* w = begin + offset; + const th_char* end = begin + length; + while (w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++; + + if (w < end && *w && !th_isthai(*w)) { + int english = FALSE; + while (w < end && *w && !th_isthai(*w) && !th_isspace(*w)) { + if (th_isalpha(*w)) english = TRUE; + w++; + } + if (english || w == end || (!th_isthai(*w) && th_isspace(*w))) + return w - begin; + } + if (w == end || *w == 0 || !th_isthai(*w)) return w - begin; + w++; + if (w < end && *w && th_isthai(*w)) { + int brk = TrbWordBreakPos(begin, w - begin, w, end - w); + while (brk < 0) { + w++; + if (w == end || *w == 0 || !th_isthai(*w)) break; + brk = TrbWordBreakPos(begin, w - begin, w, end - w); + } + if (brk > 0) w += brk; + } + if (w < end && *w && !th_isthai(*w)) { + while (w < end && *w && !th_isthai(*w) && !th_isalpha(*w) && + !th_isspace(*w)) + w++; + } + return w - begin; +} + +/* +///////////////////////////////////////////////// +*/ +const twb_t _TwbType[0x100 - 0xa0] = { +#if 0 +/* 80 */ T, +/* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +/* 90 */ T, +/* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +#endif + /* a0 */ 0, + /* a1 ¡ */ CS, + /* a2 ¢ */ CS | CHE, + /* a3 £ */ CC | CHE, + /* a4 € */ CS | CHE, + /* a5 ¥ */ CC | CHE, + /* a6 Š */ CS, + /* a7 § */ CS | CHB, + /* a8 š */ CS, + /* a9 © */ CC | CHE, + /* aa ª */ CS, + /* ab « */ CC | CHE, + /* ac ¬ */ CC | CHB | CHE, + /* ad */ CS | CHB, + /* ae ® */ CS | CHB, + /* af ¯ */ CS | CHB, + /* b0 ° */ CS, + /* b1 ± */ CS | CHB | CHE, + /* b2 ² */ CS | CHB | CHE, + /* b3 ³ */ CS | CHB, + /* b4 Ž */ CS, + /* b5 µ */ CS, + 
/* b6 ¶ */ CS, + /* b7 · */ CS, + /* b8 ž */ CS, + /* b9 ¹ */ CS, + /* ba º */ CS, + /* bb » */ CS, + /* bc Œ */ CC | CHE, + /* bd œ */ CC | CHE, + /* be Ÿ */ CS, + /* bf ¿ */ CS, + /* c0 À */ CS | CHE, + /* c1 Á */ CS, + /* c2  */ CS, + /* c3 à */ CS | C2 | CHE, /* ? add CHE */ + /* c4 Ä */ VC | CHE, + /* c5 Å */ CS | C2, + /* c6 Æ */ VC | CHE, + /* c7 Ç */ VC | C2, + /* c8 È */ CS, + /* c9 É */ CS | CHB, + /* ca Ê */ CS | CHE, + /* cb Ë */ CC | CHE, + /* CC Ì */ CS | CHB | CHE, + /* cd Í */ VC, + /* ce Î */ CC | CHE, + /* cf Ï */ T, + /* d0 Ð */ VRE | VRA, + /* d1 Ñ */ VRS, + /* d2 Ò */ VRX | VRA, + /* d3 Ó */ VRE, + /* d4 Ô */ VRX | VRA, + /* d5 Õ */ VRX | VRA, + /* d6 Ö */ VRS, + /* d7 × */ VRS | VRA, + /* d8 Ø */ VRX, + /* d9 Ù */ VRX, + /* da Ú */ T, + /* db Û */ 0, + /* dc Ü */ 0, + /* dd Ý */ 0, + /* de Þ */ 0, + /* df ß */ T, + /* e0 à */ VLA, + /* e1 á */ VLO, + /* e2 â */ VLO, + /* e3 ã */ VLI, + /* e4 ä */ VLI, + /* e5 å */ VRE, + /* e6 æ */ M, + /* e7 ç */ M, + /* e8 è */ M | MT, + /* e9 é */ M | MT, + /* ea ê */ M | MT, + /* eb ë */ M | MT, + /* ec ì */ M, + /* ed í */ T, + /* ee î */ T, + /* ef ï */ T, + /* f0 ð */ T, + /* f1 ñ */ T, + /* f2 ò */ T, + /* f3 ó */ T, + /* f4 ô */ T, + /* f5 õ */ T, + /* f6 ö */ T, + /* f7 ÷ */ T, + /* f8 ø */ T, + /* f9 ù */ T, + /* fa ú */ T, + /* fb û */ T, + /* fc ü */ 0, + /* fd ý */ 0, + /* fe þ */ 0, + /* ff */ 0}; diff --git a/intl/lwbrk/rulebrk.h b/intl/lwbrk/rulebrk.h new file mode 100644 index 0000000000..c1f2e0957b --- /dev/null +++ b/intl/lwbrk/rulebrk.h @@ -0,0 +1,26 @@ +/* +Copyright (c) 1999 Samphan Raruenrom <samphan@thai.com> +Permission to use, copy, modify, distribute and sell this software +and its documentation for any purpose is hereby granted without fee, +provided that the above copyright notice appear in all copies and +that both that copyright notice and this permission notice appear +in supporting documentation. 
Samphan Raruenrom makes no +representations about the suitability of this software for any +purpose. It is provided "as is" without express or implied warranty. +*/ +#ifndef __RULEBRK_H__ +#define __RULEBRK_H__ +#include "th_char.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int TrbWordBreakPos(const th_char* pstr, int left, const th_char* rstr, + int right); +int TrbFollowing(const th_char* begin, int length, int offset); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/intl/lwbrk/th_char.h b/intl/lwbrk/th_char.h new file mode 100644 index 0000000000..a088228fff --- /dev/null +++ b/intl/lwbrk/th_char.h @@ -0,0 +1,133 @@ +/* +Copyright (c) 1999 Samphan Raruenrom <samphan@thai.com> +Permission to use, copy, modify, distribute and sell this software +and its documentation for any purpose is hereby granted without fee, +provided that the above copyright notice appear in all copies and +that both that copyright notice and this permission notice appear +in supporting documentation. Samphan Raruenrom makes no +representations about the suitability of this software for any +purpose. It is provided "as is" without express or implied warranty. +*/ +#ifndef __TH_CHAR_H__ +#define __TH_CHAR_H__ + +typedef unsigned char tis_char; + +#ifdef TH_UNICODE +/* + * The char16_t type is only usable in C++ code, so we need this ugly hack to + * select a binary compatible C type for the expat C code to use. 
+ */ +# ifdef __cplusplus +typedef char16_t th_char; +# else +typedef uint16_t th_char; +# endif +# define TH_THAIBEGIN_ 0x0e00 +# define th_isthai(c) (0x0e00 <= (c) && (c) <= 0x0e5f) +#else +typedef tis_char th_char; +# define TH_THAIBEGIN_ 0xa0 +# define th_isthai(c) ((c) >= 0xa0) +#endif +#define th_zcode(c) ((c)-TH_THAIBEGIN_) + +enum TH_CHARNAME { + TH_THAIBEGIN = TH_THAIBEGIN_, + TH_KOKAI, + TH_KHOKHAI, + TH_KHOKHUAT, + TH_KHOKHWAI, + TH_KHOKHON, + TH_KHORAKHANG, + TH_NGONGU, + TH_CHOCHAN, + TH_CHOCHING, + TH_CHOCHANG, + TH_SOSO, + TH_CHOCHOE, + TH_YOYING, + TH_DOCHADA, + TH_TOPATAK, + TH_THOTHAN, + TH_THONANGMONTHO, + TH_THOPHUTHAO, + TH_NONEN, + TH_DODEK, + TH_TOTAO, + TH_THOTHUNG, + TH_THOTHAHAN, + TH_THOTHONG, + TH_NONU, + TH_BOBAIMAI, + TH_POPLA, + TH_PHOPHUNG, + TH_FOFA, + TH_PHOPHAN, + TH_FOFAN, + TH_PHOSAMPHAO, + TH_MOMA, + TH_YOYAK, + TH_RORUA, + TH_RU, + TH_LOLING, + TH_LU, + TH_WOWAEN, + TH_SOSALA, + TH_SORUSI, + TH_SOSUA, + TH_HOHIP, + TH_LOCHULA, + TH_OANG, + TH_HONOKHUK, + TH_PAIYANNOI, + TH_SARA_A, + TH_MAIHANAKAT, + TH_SARA_AA, + TH_SARA_AM, + TH_SARA_I, + TH_SARA_II, + TH_SARA_UE, + TH_SARA_UEE, + TH_SARA_U, + TH_SARA_UU, + TH_PHINTHU, + TH_REM_CHERNG_, + TH_TAC_WBRK_, + TH_UNDEF_DD, + TH_UNDEF_DE, + TH_BAHT, + TH_SARA_E, + TH_SARA_AE, + TH_SARA_O, + TH_MAIMUAN, + TH_MAIMALAI, + TH_LAKKHANGYAO, + TH_MAIYAMOK, + TH_MAITAIKHU, + TH_MAIEK, + TH_MAITHO, + TH_MAITRI, + TH_MAICHATTAWA, + TH_THANTHAKHAT, + TH_NIKHAHIT, + TH_YAMAKKAN, + TH_FONGMAN, + TH_THAIZERO, + TH_THAIONE, + TH_THAITWO, + TH_THAITHREE, + TH_THAIFOUR, + TH_THAIFIVE, + TH_THAISIX, + TH_THAISEVEN, + TH_THAIEIGHT, + TH_THAININE, + TH_ANGKHANKHU, + TH_KHOMUT, + TH_UNDEF_FC, + TH_UNDEF_FD, + TH_UNDEF_FE, + TH_THAIEND +}; +#endif diff --git a/intl/lwbrk/tools/anzx4051.html b/intl/lwbrk/tools/anzx4051.html new file mode 100644 index 0000000000..295f8741e0 --- /dev/null +++ b/intl/lwbrk/tools/anzx4051.html @@ -0,0 +1,669 @@ +<!-- This Source Code Form is subject to the terms of the 
Mozilla Public + - License, v. 2.0. If a copy of the MPL was not distributed with this + - file, You can obtain one at http://mozilla.org/MPL/2.0/. --> + +<HTML> +<HEAD> +<TITLE> +Analysis of JIS X 4051 to Unicode General Category Mapping +</TITLE> +</HEAD> +<BODY> +<H1> +Analysis of JIS X 4051 to Unicode General Category Mapping +</H1> +<TABLE BORDER=3> +<TR BGCOLOR=blue><TH><TH> +<TD BGCOLOR=red>C</TD> +<TD BGCOLOR=red>L</TD> +<TD BGCOLOR=red>M</TD> +<TD BGCOLOR=red>N</TD> +<TD BGCOLOR=red>P</TD> +<TD BGCOLOR=red>S</TD> +<TD BGCOLOR=red>Z</TD> +<TD BGCOLOR=white>Total</TD> +<TD BGCOLOR=yellow>Cc</TD> +<TD BGCOLOR=yellow>Cf</TD> +<TD BGCOLOR=yellow>Co</TD> +<TD BGCOLOR=yellow>Cs</TD> +<TD BGCOLOR=yellow>Ll</TD> +<TD BGCOLOR=yellow>Lm</TD> +<TD BGCOLOR=yellow>Lo</TD> +<TD BGCOLOR=yellow>Lt</TD> +<TD BGCOLOR=yellow>Lu</TD> +<TD BGCOLOR=yellow>Mc</TD> +<TD BGCOLOR=yellow>Me</TD> +<TD BGCOLOR=yellow>Mn</TD> +<TD BGCOLOR=yellow>Nd</TD> +<TD BGCOLOR=yellow>Nl</TD> +<TD BGCOLOR=yellow>No</TD> +<TD BGCOLOR=yellow>Pc</TD> +<TD BGCOLOR=yellow>Pd</TD> +<TD BGCOLOR=yellow>Pe</TD> +<TD BGCOLOR=yellow>Pf</TD> +<TD BGCOLOR=yellow>Pi</TD> +<TD BGCOLOR=yellow>Po</TD> +<TD BGCOLOR=yellow>Ps</TD> +<TD BGCOLOR=yellow>Sc</TD> +<TD BGCOLOR=yellow>Sk</TD> +<TD BGCOLOR=yellow>Sm</TD> +<TD BGCOLOR=yellow>So</TD> +<TD BGCOLOR=yellow>Zl</TD> +<TD BGCOLOR=yellow>Zp</TD> +<TD BGCOLOR=yellow>Zs</TD> +</TR> +<TR><TH>00_1<TH> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>14</TD> +<TD>1</TD> +<TD></TD> +<TD BGCOLOR=white>15</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD>2</TD> +<TD>11</TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>01_[a]<TH> +<TD></TD> +<TD>32</TD> +<TD>2</TD> +<TD></TD> +<TD>31</TD> +<TD>3</TD> +<TD></TD> +<TD BGCOLOR=white>68</TD> +<TD></TD> 
+<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>8</TD> +<TD>24</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>2</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD>12</TD> +<TD>1</TD> +<TD></TD> +<TD>17</TD> +<TD></TD> +<TD></TD> +<TD>2</TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>02_7<TH> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD BGCOLOR=white>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>03_8<TH> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD BGCOLOR=white>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>04_9<TH> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>5</TD> +<TD></TD> +<TD></TD> +<TD BGCOLOR=white>5</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>5</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>05_[b]<TH> +<TD>33</TD> +<TD>153</TD> +<TD></TD> +<TD>33</TD> +<TD>2</TD> +<TD>5</TD> +<TD>13</TD> +<TD BGCOLOR=white>239</TD> +<TD>32</TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>153</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>33</TD> +<TD></TD> +<TD></TD> 
+<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>2</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>5</TD> +<TD></TD> +<TD></TD> +<TD>13</TD> +</TR> +<TR><TH>06_15<TH> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>30</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD BGCOLOR=white>30</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>30</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>07_18<TH> +<TD>18</TD> +<TD>157</TD> +<TD></TD> +<TD>33</TD> +<TD>56</TD> +<TD>125</TD> +<TD>2</TD> +<TD BGCOLOR=white>391</TD> +<TD></TD> +<TD>18</TD> +<TD></TD> +<TD></TD> +<TD>64</TD> +<TD>7</TD> +<TD>5</TD> +<TD></TD> +<TD>81</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>3</TD> +<TD>30</TD> +<TD>4</TD> +<TD>5</TD> +<TD>2</TD> +<TD></TD> +<TD>5</TD> +<TD>36</TD> +<TD>4</TD> +<TD></TD> +<TD>3</TD> +<TD>24</TD> +<TD>98</TD> +<TD>1</TD> +<TD>1</TD> +<TD></TD> +</TR> +<TR><TH>08_COMPLEX<TH> +<TD></TD> +<TD>54</TD> +<TD>33</TD> +<TD>20</TD> +<TD>2</TD> +<TD>1</TD> +<TD></TD> +<TD BGCOLOR=white>110</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD>53</TD> +<TD></TD> +<TD></TD> +<TD>11</TD> +<TD></TD> +<TD>22</TD> +<TD>10</TD> +<TD></TD> +<TD>10</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>2</TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>09_[c]<TH> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>3</TD> +<TD>4</TD> +<TD></TD> +<TD BGCOLOR=white>7</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>3</TD> +<TD>2</TD> +<TD></TD> +<TD>2</TD> +<TD></TD> +<TD></TD> 
+<TD></TD> +<TD></TD> +</TR> +<TR><TH>0A_[d]<TH> +<TD>1</TD> +<TD>2</TD> +<TD></TD> +<TD>6</TD> +<TD>25</TD> +<TD>14</TD> +<TD></TD> +<TD BGCOLOR=white>48</TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>6</TD> +<TD></TD> +<TD></TD> +<TD>3</TD> +<TD>3</TD> +<TD></TD> +<TD>19</TD> +<TD></TD> +<TD>2</TD> +<TD>3</TD> +<TD>7</TD> +<TD>2</TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>0B_[e]<TH> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD>1</TD> +<TD>3</TD> +<TD BGCOLOR=white>6</TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>3</TD> +</TR> +<TR><TH>X<TH> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD BGCOLOR=white>0</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +</TABLE> +<TABLE BORDER=3> +<TR BGCOLOR=blue><TH><TH> +<TD BGCOLOR=red>00_1</TD> +<TD BGCOLOR=red>01_[a]</TD> +<TD BGCOLOR=red>02_7</TD> +<TD BGCOLOR=red>03_8</TD> +<TD BGCOLOR=red>04_9</TD> +<TD BGCOLOR=red>05_[b]</TD> +<TD BGCOLOR=red>06_15</TD> +<TD BGCOLOR=red>07_18</TD> +<TD BGCOLOR=red>08_COMPLEX</TD> +<TD BGCOLOR=red>09_[c]</TD> +<TD BGCOLOR=red>0A_[d]</TD> +<TD BGCOLOR=red>0B_[e]</TD> +<TD BGCOLOR=red>X</TD> +</TR> +<TR><TH>00<TH> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>33</TD> +<TD>10</TD> +<TD>127</TD> +<TD></TD> +<TD>7</TD> +<TD>44</TD> +<TD>2</TD> +<TD></TD> 
+</TR> +<TR><TH>0E<TH> +<TD>1</TD> +<TD>6</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>20</TD> +<TD>1</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>17<TH> +<TD>2</TD> +<TD>4</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>110</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>20<TH> +<TD>2</TD> +<TD>11</TD> +<TD>1</TD> +<TD></TD> +<TD>5</TD> +<TD>13</TD> +<TD></TD> +<TD>100</TD> +<TD></TD> +<TD></TD> +<TD>4</TD> +<TD>4</TD> +<TD></TD> +</TR> +<TR><TH>21<TH> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>1</TD> +<TD></TD> +<TD>32</TD> +<TD></TD> +<TD>163</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +<TR><TH>30<TH> +<TD>10</TD> +<TD>47</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD>161</TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +<TD></TD> +</TR> +</TABLE> diff --git a/intl/lwbrk/tools/anzx4051.pl b/intl/lwbrk/tools/anzx4051.pl new file mode 100644 index 0000000000..e76eac6207 --- /dev/null +++ b/intl/lwbrk/tools/anzx4051.pl @@ -0,0 +1,356 @@ +#!/usr/bin/perl +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
######################################################################
#
# Script body.
#
# Inputs (paths as opened below):
#   UnicodeData-Latest.txt : field 0 = code point (hex), field 2 = General
#                            Category
#   jisx4051class.txt      : "first;last;class" code point ranges
#   jisx4051simp.txt       : "class;simplified-class" mapping
# Outputs:
#   anzx4051.html          : report of how the simplified classes map onto
#                            Unicode general categories and code ranges
#   ../jisx4051class.h     : generated gLBClassXX[32] lookup tables
#
######################################################################

######################################################################
#
# Open the input files
#
######################################################################
open(UNICODATA, "<", "../../unicharutil/tools/UnicodeData-Latest.txt")
    or die "cannot find UnicodeData-Latest.txt";

open(CLASS, "<", "jisx4051class.txt")
    or die "cannot find jisx4051class.txt";

open(SIMP, "<", "jisx4051simp.txt")
    or die "cannot find jisx4051simp.txt";

######################################################################
#
# Open the output files (HTML report and generated C header)
#
######################################################################
open(OUT, ">", "anzx4051.html")
    or die "cannot open output anzx4051.html file";

open(HEADER, ">", "../jisx4051class.h")
    or die "cannot open output ../jisx4051class.h file";

######################################################################
#
# Emit the license and preamble of the HTML report
#
######################################################################
$htmlheader = <<END_OF_HTML;
<!-- This Source Code Form is subject to the terms of the Mozilla Public
 - License, v. 2.0. If a copy of the MPL was not distributed with this
 - file, You can obtain one at http://mozilla.org/MPL/2.0/. -->

<HTML>
<HEAD>
<TITLE>
Analysis of JIS X 4051 to Unicode General Category Mapping
</TITLE>
</HEAD>
<BODY>
<H1>
Analysis of JIS X 4051 to Unicode General Category Mapping
</H1>
END_OF_HTML
print OUT $htmlheader;

######################################################################
#
# Emit the license and "generated file" banner of the C header
#
######################################################################
$npl = <<END_OF_NPL;
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/*
 DO NOT EDIT THIS DOCUMENT !!! THIS DOCUMENT IS GENERATED BY
 mozilla/intl/lwbrk/tools/anzx4051.pl
 */
END_OF_NPL
print HEADER $npl;

%occ = ();         # code point (decimal) -> "simplified | original" class
%gcat = ();        # code point (hex string) -> two-letter General Category
%dcat = ();        # code point (hex string) -> first letter of the category
%simp = ();        # JIS X 4051 class -> simplified class name
%gcount = ();      # General Category -> number of UnicodeData entries
%dcount = ();      # category first letter -> number of UnicodeData entries
%sccount = ();     # simplified class name -> number of classes mapped to it
%rangecount = ();  # " HH:simplified" -> code points in U+HH00..U+HHFF

######################################################################
#
# Load the General Category of every code point in the Unicode database
#
######################################################################
while(<UNICODATA>) {
   # chomp, not chop: a final line without a newline must keep its last char.
   chomp;
   next unless /\S/;   # a blank line would create an empty-category bucket
   @f = split(/;/ , $_);
   $c = $f[0];   # The unicode value
   $g = $f[2];   # General Category, e.g. "Lu"
   $d = substr($g, 0, 1);

   $gcat{$c} = $g;
   $dcat{$c} = $d;
   $gcount{$g}++;
   $dcount{$d}++;
}
# The handle opened above is UNICODATA (the original closed "UNIDATA",
# which was never opened).
close(UNICODATA);

######################################################################
#
# Load the class -> simplified class mapping
#
######################################################################
while(<SIMP>) {
   chomp;
   next unless /\S/;
   @f = split(/;/ , $_);

   $simp{$f[0]} = $f[1];
   $sccount{$f[1]}++;
}
close(SIMP);

# Return the General Category of code point $u (a number).  Code points
# missing from %gcat fall back on the hard-coded ranges below; anything
# else produces a warning on stdout and an empty category.
sub GetClass{
  my ($u) = @_;
  my $hex = DecToHex($u);
  my $g = $gcat{$hex};
  if($g ne "") {
    return $g;
  } elsif (( 0x3400 <= $u) && ( $u <= 0x9fa5 ) ) {
    return "Han";
  } elsif (( 0xac00 <= $u) && ( $u <= 0xd7a3 ) ) {
    return "Lo";
  } elsif (( 0xd800 <= $u) && ( $u <= 0xdb7f ) ) {
    return "Cs";
  } elsif (( 0xdb80 <= $u) && ( $u <= 0xdbff ) ) {
    return "Cs";
  } elsif (( 0xdc00 <= $u) && ( $u <= 0xdfff ) ) {
    return "Cs";
  } elsif (( 0xe000 <= $u) && ( $u <= 0xf8ff ) ) {
    return "Co";
  }
  printf "WARNING !!!! Cannot find General Category for U+%s \n" , $hex;
  # Explicit empty result; otherwise printf's return value (1) would be
  # used by the caller as if it were a category name.
  return "";
}

# Same as GetClass, but returns only the first letter of the category.
sub GetDClass{
  my ($u) = @_;
  my $hex = DecToHex($u);
  my $g = $dcat{$hex};
  if($g ne "") {
    return $g;
  } elsif (( 0x3400 <= $u) && ( $u <= 0x9fa5 ) ) {
    return "Han";
  } elsif (( 0xac00 <= $u) && ( $u <= 0xd7a3 ) ) {
    return "L";
  } elsif (( 0xd800 <= $u) && ( $u <= 0xdb7f ) ) {
    return "C";
  } elsif (( 0xdb80 <= $u) && ( $u <= 0xdbff ) ) {
    return "C";
  } elsif (( 0xdc00 <= $u) && ( $u <= 0xdfff ) ) {
    return "C";
  } elsif (( 0xe000 <= $u) && ( $u <= 0xf8ff ) ) {
    return "C";
  }
  printf "WARNING !!!! Cannot find Detailed General Category for U+%s \n" , $hex;
  return "";
}

# Format a number as at least four upper-case hex digits, matching the
# key format of %gcat / %dcat.
sub DecToHex{
  my ($d) = @_;
  return sprintf("%04X", $d);
}

%gtotal = ();   # simplified class . General Category -> code point count
%dtotal = ();   # simplified class . category letter  -> code point count

######################################################################
#
# Walk the JIS X 4051 class ranges.  The first range that mentions a code
# point wins; later ranges covering the same code point are ignored.
#
######################################################################
while(<CLASS>) {
   chomp;
   next unless /\S/;
   @f = split(/;/ , $_);

   # Classes whose name starts with "a" (e.g. "a12") are skipped.
   if( substr($f[2], 0, 1) ne "a")
   {
     $sc = $simp{$f[2]};
     $l = hex($f[0]);
     # An empty second field means a single code point.
     if($f[1] eq "")
     {
       $h = $l;
     } else {
       $h = hex($f[1]);
     }
     for($k = $l; $k <= $h ; $k++)
     {
       if( exists($occ{$k}))
       {
         # printf "WARNING !! Conflict definition!!! U+%s -> [%s] [%s | %s]\n",
         #        DecToHex($k), $occ{$k} , $f[2] , $sc;
       }
       else
       {
         $occ{$k} = $sc . " | " . $f[2];
         $gclass = GetClass($k);
         $dclass = GetDClass($k);
         $gtotal{$sc . $gclass}++;
         $dtotal{$sc . $dclass}++;
         $u = DecToHex($k);
         # Bucket by the first two hex digits of the code point.
         $rk = " " . substr($u,0,2) . ":" . $sc;
         $rangecount{$rk}++;
       }
     }
   }
}

#print %gtotal;
#print %dtotal;

# Write two tables to OUT:
#   1. simplified class (rows) x general category (columns), with the
#      one-letter categories, a total, then the two-letter categories;
#   2. high byte of the code point (rows) x simplified class (columns),
#      omitting rows whose counts are all zero.
# Cells with no count are printed empty.
sub printreport
{
   print OUT "<TABLE BORDER=3>\n";
   print OUT "<TR BGCOLOR=blue><TH><TH>\n";

   foreach $d (sort(keys %dcount)) {
      print OUT "<TD BGCOLOR=red>$d</TD>\n";
   }

   print OUT "<TD BGCOLOR=white>Total</TD>\n";
   foreach $g (sort(keys %gcount)) {
      print OUT "<TD BGCOLOR=yellow>$g</TD>\n";
   }
   print OUT "</TR>\n";
   foreach $sc (sort(keys %sccount)) {

      print OUT "<TR><TH>$sc<TH>\n";

      $total = 0;
      foreach $d (sort (keys %dcount)) {
         $count = $dtotal{$sc . $d};
         $total += $count;
         print OUT "<TD>$count</TD>\n";
      }

      print OUT "<TD BGCOLOR=white>$total</TD>\n";

      foreach $g (sort(keys %gcount)) {
         $count = $gtotal{$sc . $g};
         print OUT "<TD>$count</TD>\n";
      }

      print OUT "</TR>\n";
   }
   print OUT "</TABLE>\n";

   print OUT "<TABLE BORDER=3>\n";
   print OUT "<TR BGCOLOR=blue><TH><TH>\n";

   foreach $sc (sort(keys %sccount))
   {
      print OUT "<TD BGCOLOR=red>$sc</TD>\n";
   }

   print OUT "</TR>\n";

   # Only high bytes 0x00..0x4E are examined.
   for($rr = 0; $rr < 0x4f; $rr++)
   {
      $empty = 0;
      $r = sprintf("%02X" , $rr) ;
      $tmp = "<TR><TH>" . $r . "<TH>\n";

      foreach $sc (sort(keys %sccount)) {
        $count = $rangecount{ " " .$r . ":" .$sc};
        $tmp .= sprintf("<TD>%s</TD>\n", $count);
        $empty += $count;
      }

      $tmp .= "</TR>\n";

      # $empty is a numeric sum, so compare it numerically.
      if($empty != 0)
      {
         print OUT $tmp;
      }
   }
   print OUT "</TABLE>\n";

}
printreport();

# Write "static const uint32_t gLBClass<r>[32]" to HEADER for the 256 code
# points U+<r>00..U+<r>FF.
#   $r   : two hex digits naming the block (e.g. "30")
#   $def : hex digit used for code points that have no entry in %occ
# Each array element packs 8 consecutive code points, one hex digit each,
# with the highest code point in the most significant digit.  The digit is
# the second character of the simplified class name ("05_[b]" -> "5").
sub printarray
{
   my($r, $def) = @_;
   printf "[%s || %s]\n", $r, $def;
   my $base = hex($r) * 256;
   printf HEADER "static const uint32_t gLBClass%s[32] = {\n", $r;
   for(my $i = 0 ; $i < 256; $i+= 8)
   {
      for(my $j = 7 ; $j >= 0; $j-- )
      {
          my $v = $base + $i + $j;
          my $p = exists($occ{$v}) ? substr($occ{$v}, 1,1) : $def;

          if($j == 7 )
          {
              printf HEADER "0x%s" , $p;
          } else {
              printf HEADER "%s", $p ;
          }
      }
      printf HEADER ", // U+%04X - U+%04X\n", $base + $i ,( $base + $i + 7);
   }
   print HEADER "};\n\n";
}
printarray("00", "7");
printarray("20", "7");
printarray("21", "7");
printarray("30", "5");
printarray("0E", "8");
printarray("17", "7");

#print %rangecount;

######################################################################
#
# Close files
#
######################################################################
close(HEADER);
close(CLASS);
close(OUT);

diff --git a/intl/lwbrk/tools/jisx4051class.txt b/intl/lwbrk/tools/jisx4051class.txt new file mode 100644 index 0000000000..c435c1ae55 --- /dev/null +++ b/intl/lwbrk/tools/jisx4051class.txt @@ -0,0 +1,159 @@ +0000;001f;17 +0020;;17 +0024;;24 +0027;;18 +0028;;22 +002D;;18 +002F;;18 +0021;002F;23 +0030;0039;15 +003C;;22 +003A;003F;23 +0040;;18 +0041;005A;18 +005B;;22 +005E;;18 +005F;;18 +005B;005F;23 +0060;;18 +0061;007A;18 +007B;;22 +007B;007E;23 +00A0;;24 +00A3;;22 +00A5;;22 +00A9;;18 +00AA;;18 +00AB;;18 +00AC;;22 +00AE;;18 +00AF;;18 +00A1;00BF;23 +00B0;;18 +00F7;;23 +00C0;00FF;18 +0E3F;;1 +0E2F;;4 +0E46;;4 +0E5A;0E5B;4 +0E50;0E59;15 +0E4F;;18 +0EAF;;4 +0EC6;;4 +0ED0;0ED9;15 +1735;1736;1 +17D4;17D5;4 +17D8;;4 +17DA;;4 +1780;17DD;21 +17E0;17E9;21 +17F0;17F9;21 +2007;;24 +2000;200B;17 +200C;200F;18 +2010;;18 +2011;;24 +2012;2013;18 +2014;;7 +2015;;18 +2016;2017;18 
+2019;;23 +201D;;23 +2018;201F;18 +2020;2023;18 +2024;2026;2 +2027;;23 +2028;202E;18 +202F;;24 +2030;2034;9 +2035;2038;18 +2039;;1 +203A;;2 +203B;;12 +203C;203D;3 +203E;;23 +203F;2043;18 +2044;;3 +2045;;1 +2046;;2 +2047;2049;3 +204A;205E;18 +205F;;17 +2060;;24 +2061;2063;18 +206A;206F;18 +2070;2071;18 +2074;208E;18 +2090;2094;18 +2116;;8 +2160;217F;12 +2190;21EA;a12 +2126;;18 +2100;2138;18 +2153;2182;18 +2190;21EA;18 +3008;;1 +300A;;1 +300C;;1 +300E;;1 +3010;;1 +3014;;1 +3016;;1 +3018;;1 +301A;;1 +301D;;1 +3001;;2 +3009;;2 +300B;;2 +300D;;2 +300F;;2 +3011;;2 +3015;;2 +3017;;2 +3019;;2 +301B;;2 +301E;;2 +301F;;2 +3005;;3 +301C;;3 +3041;;3 +3043;;3 +3045;;3 +3047;;3 +3049;;3 +3063;;3 +3083;;3 +3085;;3 +3087;;3 +308E;;3 +309D;;3 +309E;;3 +30A1;;3 +30A3;;3 +30A5;;3 +30A7;;3 +30A9;;3 +30C3;;3 +30E3;;3 +30E5;;3 +30E7;;3 +30EE;;3 +30F5;;3 +30F6;;3 +30FC;;3 +30FD;;3 +30FE;;3 +30FB;;5 +3002;;6 +3000;;10 +3042;3094;11 +3099;309E;3 +3003;;12 +3004;;12 +3006;;12 +3007;;12 +3012;;12 +3013;;12 +3020;;12 +3036;;12 +30A2;30FA;12 diff --git a/intl/lwbrk/tools/jisx4051simp.txt b/intl/lwbrk/tools/jisx4051simp.txt new file mode 100644 index 0000000000..e12a7fd805 --- /dev/null +++ b/intl/lwbrk/tools/jisx4051simp.txt @@ -0,0 +1,24 @@ +1;00_1 +2;01_[a] +3;01_[a] +4;01_[a] +5;01_[a] +6;01_[a] +7;02_7 +8;03_8 +9;04_9 +10;05_[b] +11;05_[b] +12;05_[b] +13;X +14;X +15;06_15 +16;X +17;05_[b] +18;07_18 +19;X +20;X +21;08_COMPLEX +22;09_[c] +23;0A_[d] +24;0B_[e] diff --git a/intl/lwbrk/tools/spec_table.html b/intl/lwbrk/tools/spec_table.html new file mode 100644 index 0000000000..519f98c534 --- /dev/null +++ b/intl/lwbrk/tools/spec_table.html @@ -0,0 +1,127 @@ +<!-- This Source Code Form is subject to the terms of the Mozilla Public + - License, v. 2.0. If a copy of the MPL was not distributed with this + - file, You can obtain one at http://mozilla.org/MPL/2.0/. 
--> + +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> +<html> +<head> +<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> +<title></title> +<style type="text/css"> +table { + border: solid 1px; + border-collapse: collapse; +} +tbody, tfoot { + border-top: solid 2px; +} +td, th { + border: solid 1px; +} +td { + text-align: center; +} +</style> +</head> +<body> +<p>This is a specification table for line breaking.</p> +<p>The values of IE7 and Opera9: 'A' means that the line is breakable After the character, and 'B' means Before. 'BA' means Before and After.</p> +<p>The (C) suffix on the IE7 and Opera9 columns means Character, and (N) means Numeric. +It is the kind of character placed around the tested character. E.g., "a$a" is a testcase for (C), "0$0" is a testcase for (N).</p> +<p>Gecko does not break lines in most Western-language contexts. But in file paths, URLs and very long words joined by hyphens, +some characters might be breakable. They are marked 'breakable' in the table. 
However, they are not always breakable, +they <em>depend on the context</em> in the word.</p> +<table border="1"> +<thead> +<tr><th colspan="2">character</th><th>Gecko</th><th>IE7(C)</th><th>IE7(N)</th><th>Opera9.2(C)</th><th>Opera9.2(N)</th></tr> +</thead> +<tfoot> +<tr><th colspan="2">character</th><th>Gecko</th><th>IE7(C)</th><th>IE7(N)</th><th>Opera9.2(C)</th><th>Opera9.2(N)</th></tr> +</tfoot> +<tbody> +<tr><th>0x21</th><th>!</th><td></td><td>A</td><td>A</td><td></td><td></td></tr> +<tr><th>0x22</th><th>"</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x23</th><th>#</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x24</th><th>$</th><td></td><td></td><td>B</td><td></td><td></td></tr> +<tr><th>0x25</th><th>%</th><td>breakable</td><td>A</td><td>A</td><td></td><td></td></tr> +<tr><th>0x26</th><th>&amp;</th><td>breakable</td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x27</th><th>'</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x28</th><th>(</th><td></td><td>B</td><td>B</td><td></td><td></td></tr> +<tr><th>0x29</th><th>)</th><td></td><td>A</td><td>A</td><td></td><td></td></tr> +<tr><th>0x2A</th><th>*</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x2B</th><th>+</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x2C</th><th>,</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x2D</th><th>-</th><td>breakable</td><td>BA</td><td>BA</td><td>A</td><td>A</td></tr> +<tr><th>0x2E</th><th>.</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x2F</th><th>/</th><td>breakable</td><td></td><td></td><td>A</td><td>A</td></tr> +</tbody> +<tbody> +<tr><th>0x3A</th><th>:</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x3B</th><th>;</th><td>breakable</td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x3C</th><th>&lt;</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x3D</th><th>=</th><td></td><td></td><td></td><td></td><td></td></tr> 
+<tr><th>0x3E</th><th>&gt;</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x3F</th><th>?</th><td></td><td>A</td><td>A</td><td></td><td></td></tr> +</tbody> +<tbody> +<tr><th>0x40</th><th>@</th><td></td><td></td><td></td><td></td><td></td></tr> +</tbody> +<tbody> +<tr><th>0x5B</th><th>[</th><td></td><td>B</td><td>B</td><td></td><td></td></tr> +<tr><th>0x5C</th><th>\</th><td>breakable</td><td></td><td>B</td><td></td><td></td></tr> +<tr><th>0x5D</th><th>]</th><td></td><td>A</td><td>A</td><td></td><td></td></tr> +<tr><th>0x5E</th><th>^</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0x5F</th><th>_</th><td></td><td></td><td></td><td></td><td></td></tr> +</tbody> +<tbody> +<tr><th>0x60</th><th>`</th><td></td><td></td><td></td><td></td><td></td></tr> +</tbody> +<tbody> +<tr><th>0x7B</th><th>{</th><td></td><td>B</td><td>B</td><td></td><td></td></tr> +<tr><th>0x7C</th><th>|</th><td></td><td></td><td></td><td>A</td><td>A</td></tr> +<tr><th>0x7D</th><th>}</th><td></td><td>A</td><td>A</td><td></td><td></td></tr> +<tr><th>0x7E</th><th>~</th><td></td><td></td><td></td><td></td><td></td></tr> +</tbody> +<tbody> +<tr><th>0xA1</th><th>¡</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xA2</th><th>¢</th><td></td><td>A</td><td>A</td><td></td><td></td></tr> +<tr><th>0xA3</th><th>£</th><td></td><td></td><td>B</td><td></td><td></td></tr> +<tr><th>0xA4</th><th>¤</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xA5</th><th>¥</th><td></td><td></td><td>B</td><td></td><td></td></tr> +<tr><th>0xA6</th><th>¦</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xA7</th><th>§</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xA8</th><th>¨</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xA9</th><th>©</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xAA</th><th>ª</th><td></td><td></td><td></td><td></td><td></td></tr> 
+<tr><th>0xAB</th><th>«</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xAC</th><th>¬</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xAE</th><th>®</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xAF</th><th>¯</th><td></td><td></td><td></td><td></td><td></td></tr> +</tbody> +<tbody> +<tr><th>0xB0</th><th>°</th><td></td><td>A</td><td>A</td><td></td><td></td></tr> +<tr><th>0xB1</th><th>±</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xB2</th><th>²</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xB3</th><th>³</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xB4</th><th>´</th><td></td><td></td><td></td><td>B</td><td>B</td></tr> +<tr><th>0xB5</th><th>µ</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xB6</th><th>¶</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xB7</th><th>·</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xB8</th><th>¸</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xB9</th><th>¹</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xBA</th><th>º</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xBB</th><th>»</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xBC</th><th>¼</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xBD</th><th>½</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xBE</th><th>¾</th><td></td><td></td><td></td><td></td><td></td></tr> +<tr><th>0xBF</th><th>¿</th><td></td><td></td><td></td><td></td><td></td></tr> +</tbody> +<tbody> +<tr><th>0xD7</th><th>×</th><td></td><td></td><td></td><td></td><td></td></tr> +</tbody> +<tbody> +<tr><th>0xF7</th><th>÷</th><td></td><td></td><td></td><td></td><td></td></tr> +</tbody> +</table> +</body> +</html> |