summaryrefslogtreecommitdiffstats
path: root/intl/lwbrk
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--intl/lwbrk/LineBreaker.cpp1153
-rw-r--r--intl/lwbrk/LineBreaker.h82
-rw-r--r--intl/lwbrk/Segmenter.cpp260
-rw-r--r--intl/lwbrk/Segmenter.h177
-rw-r--r--intl/lwbrk/WordBreaker.cpp185
-rw-r--r--intl/lwbrk/WordBreaker.h65
-rw-r--r--intl/lwbrk/crashtests/416721.html11
-rw-r--r--intl/lwbrk/crashtests/Lo_test_page_no_uniscribe_breaks.html12
-rw-r--r--intl/lwbrk/crashtests/UDHR_Thai_test_page_long_sequences.html184
-rw-r--r--intl/lwbrk/crashtests/crashtests.list1
-rw-r--r--intl/lwbrk/crashtests/crashtests_manual.list6
-rw-r--r--intl/lwbrk/gtest/TestBreak.cpp327
-rw-r--r--intl/lwbrk/gtest/TestSegmenter.cpp105
-rw-r--r--intl/lwbrk/gtest/moz.build12
-rw-r--r--intl/lwbrk/jisx4051class.h217
-rw-r--r--intl/lwbrk/jisx4051pairtable.txt286
-rw-r--r--intl/lwbrk/moz.build45
-rw-r--r--intl/lwbrk/nsCarbonBreaker.cpp43
-rw-r--r--intl/lwbrk/nsComplexBreaker.cpp173
-rw-r--r--intl/lwbrk/nsComplexBreaker.h36
-rw-r--r--intl/lwbrk/nsLWBrkCIID.h28
-rw-r--r--intl/lwbrk/nsPangoBreaker.cpp61
-rw-r--r--intl/lwbrk/nsRuleBreaker.cpp18
-rw-r--r--intl/lwbrk/nsUniscribeBreaker.cpp146
-rw-r--r--intl/lwbrk/rulebrk.c388
-rw-r--r--intl/lwbrk/rulebrk.h26
-rw-r--r--intl/lwbrk/th_char.h133
-rw-r--r--intl/lwbrk/tools/anzx4051.html709
-rw-r--r--intl/lwbrk/tools/anzx4051.pl356
-rw-r--r--intl/lwbrk/tools/jisx4051class.txt159
-rw-r--r--intl/lwbrk/tools/jisx4051simp.txt24
-rw-r--r--intl/lwbrk/tools/spec_table.html664
32 files changed, 6092 insertions, 0 deletions
diff --git a/intl/lwbrk/LineBreaker.cpp b/intl/lwbrk/LineBreaker.cpp
new file mode 100644
index 0000000000..2784b0d302
--- /dev/null
+++ b/intl/lwbrk/LineBreaker.cpp
@@ -0,0 +1,1153 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "mozilla/intl/LineBreaker.h"
+
+#include "jisx4051class.h"
+#include "nsComplexBreaker.h"
+#include "nsTArray.h"
+#include "nsUnicodeProperties.h"
+#include "mozilla/ArrayUtils.h"
+#include "mozilla/intl/Segmenter.h"
+#include "mozilla/intl/UnicodeProperties.h"
+
+using namespace mozilla::unicode;
+using namespace mozilla::intl;
+
+/*
+
+ Simplification of Pair Table in JIS X 4051
+
+ 1. The Original Table - in 4.1.3
+
+ In JIS X 4051, the pair table is defined as below
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20
+ * # * #
+ 1 X X X X X X X X X X X X X X X X X X X X X E
+ 2 X X X X X X
+ 3 X X X X X X
+ 4 X X X X X X
+ 5 X X X X X X
+ 6 X X X X X X
+ 7 X X X X X X X
+ 8 X X X X X X E
+ 9 X X X X X X
+ 10 X X X X X X
+ 11 X X X X X X
+ 12 X X X X X X
+ 13 X X X X X X X
+ 14 X X X X X X X
+ 15 X X X X X X X X X
+ 16 X X X X X X X X
+ 17 X X X X X E
+ 18 X X X X X X X X X
+ 19 X E E E E E X X X X X X X X X X X X E X E E
+ 20 X X X X X E
+
+ * Same Char
+ # Other Char
+
+ X Cannot Break
+
+ The classes mean:
+ 1: Open parenthesis
+ 2: Close parenthesis
+ 3: Prohibit a line break before
+ 4: Punctuation for sentence end (except Full stop, e.g., "!" and "?")
+ 5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT)
+ 6: Full stop
+ 7: Non-breakable between same characters
+ 8: Prefix (e.g., "$", "NO.")
+ 9: Postfix (e.g., "%")
+ 10: Ideographic space
+ 11: Hiragana
+ 12: Japanese characters (except class 11)
+ 13: Subscript
+ 14: Ruby
+ 15: Numeric
+ 16: Alphabet
+ 17: Space for Western language
+ 18: Western characters (except class 17)
+ 19: Split line note (Warichu) begin quote
+ 20: Split line note (Warichu) end quote
+
+ 2. Simplified by removing the classes which we do not care about
+
+ However, since we do not care about class 13 (Subscript), 14 (Ruby),
+ 16 (Alphabet), 19 (split line note begin quote), and 20 (split line note
+ end quote), we can simplify this pair table into the following
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 2 3 4 5 6 7 8 9 10 11 12 15 17 18
+
+ 1 X X X X X X X X X X X X X X X
+ 2 X X X X X
+ 3 X X X X X
+ 4 X X X X X
+ 5 X X X X X
+ 6 X X X X X
+ 7 X X X X X X
+ 8 X X X X X X
+ 9 X X X X X
+ 10 X X X X X
+ 11 X X X X X
+ 12 X X X X X
+ 15 X X X X X X X X
+ 17 X X X X X
+ 18 X X X X X X X
+
+ 3. Simplified by merging classes
+
+ After simplification 2, the pair table has some duplication:
+ a. classes 2, 3, 4, 5, 6 are the same - we can merge them
+ b. classes 10, 11, 12, 17 are the same - we can merge them
+
+ We introduce an extra non-breaking pair at [b]/7 to better match
+ the expectations of CSS line-breaking as tested by WPT tests.
+ This added entry is marked as * in the tables below.
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 [a] 7 8 9 [b]15 18
+
+ 1 X X X X X X X X
+ [a] X
+ 7 X X
+ 8 X X
+ 9 X
+ [b] X *
+ 15 X X X X
+ 18 X X X
+
+
+ 4. We add COMPLEX characters and make them breakable with all other classes
+ except after class 1 and before class [a]
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 [a] 7 8 9 [b]15 18 COMPLEX
+
+ 1 X X X X X X X X X
+ [a] X
+ 7 X X
+ 8 X X
+ 9 X
+ [b] X *
+ 15 X X X X
+ 18 X X X
+ COMPLEX X T
+
+ T : need special handling
+
+
+ 5. However, we need two special classes for some punctuation/parentheses
+ whose breaking rules are like those of character class (18); see bug 389056.
+ We also need them for punctuation-like characters that behave the same as
+ class 18 but are not letters of any language (e.g., '_').
+ [c]. Based on open parenthesis class (1), but it is not breakable after
+ character class (18) or numeric class (15).
+ [d]. Based on close parenthesis (or punctuation) class (2), but it is not
+ breakable before character class (18) or numeric class (15).
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d]
+
+ 1 X X X X X X X X X X X
+ [a] X X X
+ 7 X X
+ 8 X X
+ 9 X
+ [b] X * X
+ 15 X X X X X X
+ 18 X X X X X
+ COMPLEX X T
+ [c] X X X X X X X X X X X
+ [d] X X X X
+
+
+ 6. And Unicode has "NON-BREAK" characters. Lines should not be broken around
+ them. JIS X 4051 has no such class, therefore we create [e].
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e]
+
+ 1 X X X X X X X X X X X X
+ [a] X X X
+ 7 X X X
+ 8 X X X
+ 9 X X
+ [b] X * X X
+ 15 X X X X X X X
+ 18 X X X X X X
+ COMPLEX X T X
+ [c] X X X X X X X X X X X X
+ [d] X X X X X
+ [e] X X X X X X X X X X X X
+
+
+ 7. Now we use one bit to encode whether it is breakable, and use 2 bytes
+ for one row, then the bit table will look like:
+
+ 18 <- 1
+
+ 1 0000 1111 1111 1111 = 0x0FFF
+ [a] 0000 1100 0000 0010 = 0x0C02
+ 7 0000 1000 0000 0110 = 0x0806
+ 8 0000 1000 0100 0010 = 0x0842
+ 9 0000 1000 0000 0010 = 0x0802
+ [b] 0000 1100 0000 0110 = 0x0C06
+ 15 0000 1110 1101 0010 = 0x0ED2
+ 18 0000 1110 1100 0010 = 0x0EC2
+ COMPLEX 0000 1001 0000 0010 = 0x0902
+ [c] 0000 1111 1111 1111 = 0x0FFF
+ [d] 0000 1100 1100 0010 = 0x0CC2
+ [e] 0000 1111 1111 1111 = 0x0FFF
+*/
+
+#define MAX_CLASSES 12  // number of simplified classes (0x00..0x0B); also the row count of the pair tables
+
+static const uint16_t gPair[MAX_CLASSES] = {0x0FFF, 0x0C02, 0x0806, 0x0842,  // row index = leading class
+ 0x0802, 0x0C06, 0x0ED2, 0x0EC2,  // bit index = trailing class
+ 0x0902, 0x0FFF, 0x0CC2, 0x0FFF};  // set bit = "X" (cannot break); GetPair() allows a break when it is clear
+
+/*
+
+ 8. And if the character is not far enough from the word start, the word end,
+ or another break point, we should not break in non-CJK languages.
+ I.e., don't break around 15, 18, [c] and [d], but don't change
+ that if they are related to [b].
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e]
+
+ 1 X X X X X X X X X X X X
+ [a] X X X X X X
+ 7 X X X X X X X
+ 8 X X X X X X
+ 9 X X X X X X
+ [b] X * X X
+ 15 X X X X X X X X X X X
+ 18 X X X X X X X X X X X
+ COMPLEX X X X T X X X
+ [c] X X X X X X X X X X X X
+ [d] X X X X X X X X X X X
+ [e] X X X X X X X X X X X X
+
+ 18 <- 1
+
+ 1 0000 1111 1111 1111 = 0x0FFF
+ [a] 0000 1110 1100 0010 = 0x0EC2
+ 7 0000 1110 1100 0110 = 0x0EC6
+ 8 0000 1110 1100 0010 = 0x0EC2
+ 9 0000 1110 1100 0010 = 0x0EC2
+ [b] 0000 1100 0000 0110 = 0x0C06
+ 15 0000 1111 1101 1111 = 0x0FDF
+ 18 0000 1111 1101 1111 = 0x0FDF
+ COMPLEX 0000 1111 1100 0010 = 0x0FC2
+ [c] 0000 1111 1111 1111 = 0x0FFF
+ [d] 0000 1111 1101 1111 = 0x0FDF
+ [e] 0000 1111 1111 1111 = 0x0FFF
+*/
+
+static const uint16_t gPairConservative[MAX_CLASSES] = {  // row = leading class, bit = trailing class, set bit = no break
+ 0x0FFF, 0x0EC2, 0x0EC6, 0x0EC2, 0x0EC2, 0x0C06,
+ 0x0FDF, 0x0FDF, 0x0FC2, 0x0FFF, 0x0FDF, 0x0FFF};  // read by GetPairConservative()
+
+/*
+
+ 9. Now we map the class to number
+
+ 0: 1
+ 1: [a]- 2, 3, 4, 5, 6
+ 2: 7
+ 3: 8
+ 4: 9
+ 5: [b]- 10, 11, 12, 17
+ 6: 15
+ 7: 18
+ 8: COMPLEX
+ 9: [c]
+ A: [d]
+ B: [e]
+
+ and they mean:
+ 0: Open parenthesis
+ 1: Punctuation that prohibits break before
+ 2: Non-breakable between same classes
+ 3: Prefix
+ 4: Postfix
+ 5: Breakable character (Spaces and Most Japanese characters)
+ 6: Numeric
+ 7: Characters
+ 8: Need special handling characters (E.g., Thai)
+ 9: Open parentheses like Character (See bug 389056)
+ A: Close parenthese (or punctuations) like Character (See bug 389056)
+ B: Non breakable (See bug 390920)
+
+*/
+
+#define CLASS_NONE INT8_MAX  // "no class" marker; lies outside the 0x00..0x0B table range
+
+#define CLASS_OPEN 0x00  // open parenthesis
+#define CLASS_CLOSE 0x01  // punctuation that prohibits a break before it
+#define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02  // non-breakable between same classes
+#define CLASS_PREFIX 0x03  // prefix
+#define CLASS_POSTFFIX 0x04  // postfix (identifier spelling kept as-is)
+#define CLASS_BREAKABLE 0x05  // breakable character (spaces and most Japanese characters)
+#define CLASS_NUMERIC 0x06  // numeric
+#define CLASS_CHARACTER 0x07  // characters
+#define CLASS_COMPLEX 0x08  // characters needing special handling (e.g., Thai)
+#define CLASS_OPEN_LIKE_CHARACTER 0x09  // open-parenthesis-like character (see bug 389056)
+#define CLASS_CLOSE_LIKE_CHARACTER 0x0A  // close-parenthesis/punctuation-like character (see bug 389056)
+#define CLASS_NON_BREAKABLE 0x0B  // non-breakable (see bug 390920)
+
+#define U_NULL char16_t(0x0000)  // also used as the "no previous character" value in ContextState
+#define U_SLASH char16_t('/')
+#define U_SPACE char16_t(' ')
+#define U_HYPHEN char16_t('-')
+#define U_EQUAL char16_t('=')
+#define U_PERCENT char16_t('%')
+#define U_AMPERSAND char16_t('&')
+#define U_SEMICOLON char16_t(';')
+#define U_BACKSLASH char16_t('\\')
+#define U_OPEN_SINGLE_QUOTE char16_t(0x2018)  // U+2018 LEFT SINGLE QUOTATION MARK
+#define U_OPEN_DOUBLE_QUOTE char16_t(0x201C)  // U+201C LEFT DOUBLE QUOTATION MARK
+#define U_OPEN_GUILLEMET char16_t(0x00AB)  // U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
+
+#define NEED_CONTEXTUAL_ANALYSIS(c) \
+ (IS_HYPHEN(c) || (c) == U_SLASH || (c) == U_PERCENT || (c) == U_AMPERSAND || \
+ (c) == U_SEMICOLON || (c) == U_BACKSLASH || (c) == U_OPEN_SINGLE_QUOTE || \
+ (c) == U_OPEN_DOUBLE_QUOTE || (c) == U_OPEN_GUILLEMET)  // true for the characters ContextualAnalysis() has a branch for
+
+#define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039)  // '0'..'9'
+
+// Fetches the 4-bit class code for offset |l| from the packed table |t|.
+// Each 32-bit word of |t| stores eight codes, lowest nibble first.
+static inline int GETCLASSFROMTABLE(const uint32_t* t, uint16_t l) {
+  const uint32_t packedWord = t[l / 8];
+  const unsigned nibbleShift = (l % 8) * 4;
+  return (packedWord >> nibbleShift) & 0xF;
+}
+
+// Non-zero when |u| lies in U+FF66..U+FF70 (inclusive), the halfwidth forms
+// that GetClass() reports as JIS X 4051 class 3.
+static inline int IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u) {
+  if (u < 0xff66) {
+    return 0;
+  }
+  return u <= 0xff70;
+}
+
+// Non-zero when |u| falls inside one of the code point ranges this file
+// treats as CJK: U+1100..11FF, U+2E80..D7FF, U+F900..FAFF, U+FF00..FFEF and
+// U+20000..2FFFD. The ranges are ascending, so they are tested as a cascade.
+static inline int IS_CJK_CHAR(char32_t u) {
+  if (u < 0x1100) return 0;
+  if (u <= 0x11ff) return 1;
+  if (u < 0x2e80) return 0;
+  if (u <= 0xd7ff) return 1;
+  if (u < 0xf900) return 0;
+  if (u <= 0xfaff) return 1;
+  if (u < 0xff00) return 0;
+  if (u <= 0xffef) return 1;
+  return 0x20000 <= u && u <= 0x2fffd;
+}
+
+// Returns true if |u| is U+00A0 NO-BREAK SPACE or U+2007 FIGURE SPACE.
+//
+// The parameter is a full 32-bit code point: ContextState::Init() passes a
+// char32_t that may be a decoded surrogate pair, and a 16-bit parameter would
+// silently truncate e.g. U+100A0 to 0x00A0 and report it as a no-break space.
+// Callers passing 8-bit or 16-bit code units are unaffected by the widening.
+static inline bool IS_NONBREAKABLE_SPACE(char32_t u) {
+  return u == 0x00A0 || u == 0x2007;  // NO-BREAK SPACE, FIGURE SPACE
+}
+
+// Returns true if |u| is one of the hyphen-like characters that get
+// contextual line-break analysis: HYPHEN-MINUS, HYPHEN, FIGURE DASH, EN DASH,
+// ARMENIAN HYPHEN and (on Android/Windows only) the Tibetan TSHEG.
+//
+// The parameter is a full 32-bit code point: ContextualAnalysis() and
+// NEED_CONTEXTUAL_ANALYSIS() pass char32_t values, and a 16-bit parameter
+// would truncate supplementary code points such as U+1002D or U+12010 and
+// misclassify them as hyphens. Narrower code units still convert losslessly.
+static inline bool IS_HYPHEN(char32_t u) {
+  return (u == U_HYPHEN || u == 0x2010 ||  // HYPHEN
+          u == 0x2012 ||                   // FIGURE DASH
+          u == 0x2013 ||                   // EN DASH
+#if ANDROID || XP_WIN
+          /* Bug 1647377: On Android and Windows, we don't have a "platform"
+           * backend that supports Tibetan (nsRuleBreaker.cpp only knows about
+           * Thai, and ScriptBreak doesn't handle Tibetan well either), so
+           * instead we just treat the TSHEG like a hyphen to provide basic
+           * line-breaking possibilities.
+           */
+          u == 0x0F0B ||  // TIBETAN MARK INTERSYLLABIC TSHEG
+#endif
+          u == 0x058A);  // ARMENIAN HYPHEN
+}
+
+static int8_t GetClass(uint32_t u, LineBreakRule aLevel,
+ bool aIsChineseOrJapanese) {
+ // Returns the simplified line-break class (CLASS_*) for codepoint |u| under
+ // the line-break strictness |aLevel|. The table below maps Unicode
+ // LineBreak.txt classes to the (simplified) set of classes used here.
+ // XXX The mappings here were derived by comparing the Unicode LineBreak
+ // values of BMP characters to the classes our existing GetClass returns
+ // for the same codepoints; in cases where characters with the same
+ // LineBreak class mapped to various classes here, I picked what seemed
+ // the most prevalent equivalence. Some of these are unclear to me, but
+ // currently they are ONLY used for characters not handled by the old code
+ // below, so all the JIS X 4051 special cases should already be accounted for.
+ static const int8_t sUnicodeLineBreakToClass[] = {
+ /* UNKNOWN = 0, [XX] */ CLASS_CHARACTER,
+ /* AMBIGUOUS = 1, [AI] */ CLASS_CHARACTER,
+ /* ALPHABETIC = 2, [AL] */ CLASS_CHARACTER,
+ /* BREAK_BOTH = 3, [B2] */ CLASS_CHARACTER,
+ /* BREAK_AFTER = 4, [BA] */ CLASS_BREAKABLE,
+ /* BREAK_BEFORE = 5, [BB] */ CLASS_OPEN_LIKE_CHARACTER,
+ /* MANDATORY_BREAK = 6, [BK] */ CLASS_CHARACTER,
+ /* CONTINGENT_BREAK = 7, [CB] */ CLASS_CHARACTER,
+ /* CLOSE_PUNCTUATION = 8, [CL] */ CLASS_CLOSE_LIKE_CHARACTER,
+ /* COMBINING_MARK = 9, [CM] */ CLASS_CHARACTER,
+ /* CARRIAGE_RETURN = 10, [CR] */ CLASS_BREAKABLE,
+ /* EXCLAMATION = 11, [EX] */ CLASS_CLOSE_LIKE_CHARACTER,
+ /* GLUE = 12, [GL] */ CLASS_NON_BREAKABLE,
+ /* HYPHEN = 13, [HY] */ CLASS_CHARACTER,
+ /* IDEOGRAPHIC = 14, [ID] */ CLASS_BREAKABLE,
+ /* INSEPARABLE = 15, [IN] */ CLASS_CLOSE_LIKE_CHARACTER,
+ /* INFIX_NUMERIC = 16, [IS] */ CLASS_CHARACTER,
+ /* LINE_FEED = 17, [LF] */ CLASS_BREAKABLE,
+ /* NONSTARTER = 18, [NS] */ CLASS_CLOSE_LIKE_CHARACTER,
+ /* NUMERIC = 19, [NU] */ CLASS_NUMERIC,
+ /* OPEN_PUNCTUATION = 20, [OP] */ CLASS_OPEN_LIKE_CHARACTER,
+ /* POSTFIX_NUMERIC = 21, [PO] */ CLASS_CLOSE_LIKE_CHARACTER,
+ /* PREFIX_NUMERIC = 22, [PR] */ CLASS_CHARACTER,
+ /* QUOTATION = 23, [QU] */ CLASS_CHARACTER,
+ /* COMPLEX_CONTEXT = 24, [SA] */ CLASS_CHARACTER,
+ /* SURROGATE = 25, [SG] */ CLASS_CHARACTER,
+ /* SPACE = 26, [SP] */ CLASS_BREAKABLE,
+ /* BREAK_SYMBOLS = 27, [SY] */ CLASS_CHARACTER,
+ /* ZWSPACE = 28, [ZW] */ CLASS_BREAKABLE,
+ /* NEXT_LINE = 29, [NL] */ CLASS_CHARACTER,
+ /* WORD_JOINER = 30, [WJ] */ CLASS_NON_BREAKABLE,
+ /* H2 = 31, [H2] */ CLASS_BREAKABLE,
+ /* H3 = 32, [H3] */ CLASS_BREAKABLE,
+ /* JL = 33, [JL] */ CLASS_CHARACTER,
+ /* JT = 34, [JT] */ CLASS_CHARACTER,
+ /* JV = 35, [JV] */ CLASS_CHARACTER,
+ /* CLOSE_PARENTHESIS = 36, [CP] */ CLASS_CLOSE_LIKE_CHARACTER,
+ /* CONDITIONAL_JAPANESE_STARTER = 37, [CJ] */ CLASS_CLOSE,
+ /* HEBREW_LETTER = 38, [HL] */ CLASS_CHARACTER,
+ /* REGIONAL_INDICATOR = 39, [RI] */ CLASS_CHARACTER,
+ /* E_BASE = 40, [EB] */ CLASS_BREAKABLE,
+ /* E_MODIFIER = 41, [EM] */ CLASS_CHARACTER,
+ /* ZWJ = 42, [ZWJ]*/ CLASS_CHARACTER};
+
+ static_assert(U_LB_COUNT == mozilla::ArrayLength(sUnicodeLineBreakToClass),
+ "Gecko vs ICU LineBreak class mismatch");
+
+ auto cls = GetLineBreakClass(u);  // used below to index sUnicodeLineBreakToClass
+ MOZ_ASSERT(cls < mozilla::ArrayLength(sUnicodeLineBreakToClass));
+
+ // Overrides based on rules for the different line-break values given in
+ // https://drafts.csswg.org/css-text-3/#line-break-property
+ switch (aLevel) {  // a case that does not return falls through to the legacy lookup below
+ case LineBreakRule::Auto:
+ // For now, just use legacy Gecko behavior.
+ // XXX Possible enhancement - vary strictness according to line width
+ // or other criteria.
+ break;
+ case LineBreakRule::Strict:
+ if (cls == U_LB_CONDITIONAL_JAPANESE_STARTER ||
+ (u == 0x3095 || u == 0x3096 || u == 0x30f5 || u == 0x30f6)) {
+ return CLASS_CLOSE;
+ }
+ if (cls == U_LB_INSEPARABLE) {
+ return CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS;
+ }
+ if (u == 0x3005 || u == 0x303B || u == 0x309D || u == 0x309E ||
+ u == 0x30FD || u == 0x30FE) {
+ return CLASS_CLOSE_LIKE_CHARACTER;
+ }
+ if (aIsChineseOrJapanese) {
+ if (cls == U_LB_POSTFIX_NUMERIC &&
+ UnicodeProperties::IsEastAsianWidthAFW(u)) {
+ return CLASS_CLOSE_LIKE_CHARACTER;
+ }
+ if (cls == U_LB_PREFIX_NUMERIC &&
+ UnicodeProperties::IsEastAsianWidthAFW(u)) {
+ return CLASS_OPEN_LIKE_CHARACTER;
+ }
+ if (u == 0x2010 || u == 0x2013 || u == 0x301C || u == 0x30A0) {
+ return CLASS_CLOSE_LIKE_CHARACTER;
+ }
+ }
+ break;
+ case LineBreakRule::Normal:
+ if (cls == U_LB_CONDITIONAL_JAPANESE_STARTER) {
+ return CLASS_BREAKABLE;
+ }
+ if (cls == U_LB_INSEPARABLE) {
+ return CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS;
+ }
+ if (u == 0x3005 || u == 0x303B || u == 0x309D || u == 0x309E ||
+ u == 0x30FD || u == 0x30FE) {
+ return CLASS_CLOSE_LIKE_CHARACTER;
+ }
+ if (aIsChineseOrJapanese) {
+ if (cls == U_LB_POSTFIX_NUMERIC &&
+ UnicodeProperties::IsEastAsianWidthAFW(u)) {
+ return CLASS_CLOSE_LIKE_CHARACTER;
+ }
+ if (cls == U_LB_PREFIX_NUMERIC &&
+ UnicodeProperties::IsEastAsianWidthAFW(u)) {
+ return CLASS_OPEN_LIKE_CHARACTER;
+ }
+ if (u == 0x2010 || u == 0x2013 || u == 0x301C || u == 0x30A0) {
+ return CLASS_BREAKABLE;
+ }
+ }
+ break;
+ case LineBreakRule::Loose:
+ if (cls == U_LB_CONDITIONAL_JAPANESE_STARTER) {
+ return CLASS_BREAKABLE;
+ }
+ if (u == 0x3005 || u == 0x303B || u == 0x309D || u == 0x309E ||
+ u == 0x30FD || u == 0x30FE) {
+ return CLASS_BREAKABLE;
+ }
+ if (cls == U_LB_INSEPARABLE) {
+ return CLASS_BREAKABLE;
+ }
+ if (aIsChineseOrJapanese) {
+ if (u == 0x30FB || u == 0xFF1A || u == 0xFF1B || u == 0xFF65 ||
+ u == 0x203C || u == 0x2047 || u == 0x2048 || u == 0x2049 ||
+ u == 0xFF01 || u == 0xFF1F) {
+ return CLASS_BREAKABLE;
+ }
+ if (cls == U_LB_POSTFIX_NUMERIC &&
+ UnicodeProperties::IsEastAsianWidthAFW(u)) {
+ return CLASS_BREAKABLE;
+ }
+ if (cls == U_LB_PREFIX_NUMERIC &&
+ UnicodeProperties::IsEastAsianWidthAFW(u)) {
+ return CLASS_BREAKABLE;
+ }
+ if (u == 0x2010 || u == 0x2013 || u == 0x301C || u == 0x30A0) {
+ return CLASS_BREAKABLE;
+ }
+ }
+ break;
+ case LineBreakRule::Anywhere:
+ MOZ_ASSERT_UNREACHABLE("should have been handled already");
+ break;
+ }
+
+ if (u < 0x10000) {  // BMP only: per-range legacy handling
+ uint16_t h = u & 0xFF00;  // high byte: selects the 256-codepoint range
+ uint16_t l = u & 0x00ff;  // low byte: offset within that range
+
+ // Handle the table-driven ranges; U+00xx and U+17xx come before the
+ if (0x0000 == h) {
+ return GETCLASSFROMTABLE(gLBClass00, l);
+ }
+ if (0x1700 == h) {
+ return GETCLASSFROMTABLE(gLBClass17, l);
+ }
+ if (NS_NeedsPlatformNativeHandling(u)) {  // ...platform-native (complex) check
+ return CLASS_COMPLEX;
+ }
+ if (0x0E00 == h) {
+ return GETCLASSFROMTABLE(gLBClass0E, l);
+ }
+ if (0x2000 == h) {
+ return GETCLASSFROMTABLE(gLBClass20, l);
+ }
+ if (0x2100 == h) {
+ return GETCLASSFROMTABLE(gLBClass21, l);
+ }
+ if (0x3000 == h) {
+ return GETCLASSFROMTABLE(gLBClass30, l);
+ }
+ if (0xff00 == h) {
+ if (l <= 0x0060) { // Fullwidth ASCII variant
+ // Previously, we treated Fullwidth chars the same as their ASCII
+ // counterparts, but UAX#14 (LineBreak.txt) disagrees with this and
+ // treats many of them as ideograph-like.
+ return sUnicodeLineBreakToClass[cls];
+ }
+ if (l < 0x00a0) { // Halfwidth Katakana variants
+ switch (l) {  // classify selected halfwidth forms via a U+30xx codepoint
+ case 0x61:
+ return GetClass(0x3002, aLevel, aIsChineseOrJapanese);
+ case 0x62:
+ return GetClass(0x300c, aLevel, aIsChineseOrJapanese);
+ case 0x63:
+ return GetClass(0x300d, aLevel, aIsChineseOrJapanese);
+ case 0x64:
+ return GetClass(0x3001, aLevel, aIsChineseOrJapanese);
+ case 0x65:
+ return GetClass(0x30fb, aLevel, aIsChineseOrJapanese);
+ case 0x9e:
+ return GetClass(0x309b, aLevel, aIsChineseOrJapanese);
+ case 0x9f:
+ return GetClass(0x309c, aLevel, aIsChineseOrJapanese);
+ default:
+ if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u)) {
+ return CLASS_CLOSE; // jis x4051 class 3
+ }
+ return CLASS_BREAKABLE; // jis x4051 class 11
+ }
+ }
+ if (l < 0x00e0) {
+ return CLASS_CHARACTER; // Halfwidth Hangul variants
+ }
+ if (l < 0x00f0) {
+ static char16_t NarrowFFEx[16] = {  // indexed by l - 0xe0 (U+FFE0..U+FFEF)
+ 0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000,
+ 0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000};
+ return GetClass(NarrowFFEx[l - 0x00e0], aLevel, aIsChineseOrJapanese);
+ }
+ } else if (0x3100 == h) {
+ if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun
+ // XXX: This is per UAX #14, but UAX #14 may change
+ // the line breaking rules about Kanbun and Bopomofo.
+ return CLASS_BREAKABLE;
+ }
+ if (l >= 0xf0) { // Katakana small letters for Ainu
+ return CLASS_CLOSE;
+ }
+ } else if (0x0300 == h) {
+ if (0x4F == l || (0x5C <= l && l <= 0x62)) {
+ return CLASS_NON_BREAKABLE;
+ }
+ } else if (0x0500 == h) {
+ // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14)
+ if (l == 0x8A) {
+ return GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN));
+ }
+ } else if (0x0F00 == h) {
+ // We treat Tibetan TSHEG as a hyphen (when not using platform breaker);
+ // other Tibetan chars with LineBreak class=BA will be handled by the
+ // default sUnicodeLineBreakToClass mapping below.
+ if (l == 0x0B) {
+ return GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN));
+ }
+ } else if (0x1800 == h) {
+ if (0x0E == l) {
+ return CLASS_NON_BREAKABLE;
+ }
+ } else if (0x1600 == h) {
+ if (0x80 == l) { // U+1680 OGHAM SPACE MARK
+ return CLASS_BREAKABLE;
+ }
+ } else if (u == 0xfeff) {
+ return CLASS_NON_BREAKABLE;
+ }
+ }
+
+ return sUnicodeLineBreakToClass[cls];  // default: map from the Unicode line-break class
+}
+
+// Returns true when a break is allowed between leading class |c1| and
+// trailing class |c2| according to gPair, i.e. when bit |c2| of row
+// gPair[c1] is clear.
+static bool GetPair(int8_t c1, int8_t c2) {
+  NS_ASSERTION(c1 < MAX_CLASSES, "illegal classes 1");
+  NS_ASSERTION(c2 < MAX_CLASSES, "illegal classes 2");
+
+  const uint16_t noBreakBits = gPair[c1];
+  const bool breakProhibited = ((noBreakBits >> c2) & 0x0001) != 0;
+  return !breakProhibited;
+}
+
+// Returns true when a break is allowed between leading class |c1| and
+// trailing class |c2| according to gPairConservative, i.e. when bit |c2| of
+// row gPairConservative[c1] is clear.
+static bool GetPairConservative(int8_t c1, int8_t c2) {
+  NS_ASSERTION(c1 < MAX_CLASSES, "illegal classes 1");
+  NS_ASSERTION(c2 < MAX_CLASSES, "illegal classes 2");
+
+  const uint16_t noBreakBits = gPairConservative[c1];
+  const bool breakProhibited = ((noBreakBits >> c2) & 0x0001) != 0;
+  return !breakProhibited;
+}
+
+class ContextState {  // per-text scanning state consulted by ContextualAnalysis()
+ public:
+ ContextState(const char16_t* aText, uint32_t aLength)  // 16-bit text
+ : mUniText(aText), mText(nullptr), mLength(aLength) {
+ Init();
+ }
+
+ ContextState(const uint8_t* aText, uint32_t aLength)  // 8-bit text
+ : mUniText(nullptr), mText(aText), mLength(aLength) {
+ Init();
+ }
+
+ uint32_t Length() const { return mLength; }
+ uint32_t Index() const { return mIndex; }
+
+ // This gets a single code unit of the text, without checking for surrogates
+ // (in the case of a 16-bit text buffer). That's OK if we're only checking for
+ // specific characters that are known to be BMP values.
+ char16_t GetCodeUnitAt(uint32_t aIndex) const {
+ MOZ_ASSERT(aIndex < mLength, "Out of range!");
+ return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]);
+ }
+
+ // This gets a 32-bit Unicode character (codepoint), handling surrogate pairs
+ // as necessary. It must ONLY be called for 16-bit text, not 8-bit.
+ char32_t GetUnicodeCharAt(uint32_t aIndex) const {
+ MOZ_ASSERT(mUniText, "Only for 16-bit text!");
+ MOZ_ASSERT(aIndex < mLength, "Out of range!");
+ char32_t c = mUniText[aIndex];
+ if (aIndex + 1 < mLength && NS_IS_SURROGATE_PAIR(c, mUniText[aIndex + 1])) {
+ c = SURROGATE_TO_UCS4(c, mUniText[aIndex + 1]);
+ }
+ return c;  // an unpaired code unit is returned unchanged
+ }
+
+ void AdvanceIndex() { ++mIndex; }  // step the current position by one code unit
+
+ void NotifyBreakBefore() { mLastBreakIndex = mIndex; }  // remember the current position as the latest break
+
+ // A word of western language should not be broken. But even if the word has
+ // only ASCII characters, non-natural context words should be broken, e.g.,
+ // URL and file path. For protecting the natural words, we should use
+ // conservative breaking rules at following conditions:
+ // 1. at near the start of word
+ // 2. at near the end of word
+ // 3. at near the latest broken point
+ // CONSERVATIVE_RANGE_{LETTER,OTHER} define the 'near' in characters,
+ // which varies depending whether we are looking at a letter or a non-letter
+ // character: for non-letters, we use an extended "conservative" range.
+
+#define CONSERVATIVE_RANGE_LETTER 2
+#define CONSERVATIVE_RANGE_OTHER 6
+
+ bool UseConservativeBreaking(uint32_t aOffset = 0) const {  // tests position mIndex + aOffset
+ if (mHasCJKChar) return false;  // never conservative when the text contains a CJK character
+ uint32_t index = mIndex + aOffset;
+
+ // If the character at index is a letter (rather than various punctuation
+ // characters, etc) then we want a shorter "conservative" range
+ uint32_t conservativeRangeStart, conservativeRangeEnd;
+ if (index < mLength &&
+ nsUGenCategory::kLetter ==
+ (mText ? GetGenCategory(mText[index])
+ : GetGenCategory(GetUnicodeCharAt(index)))) {
+ // Primarily for hyphenated word prefixes/suffixes; we add 1 to Start
+ // to get more balanced behavior (if we break off a 2-letter prefix,
+ // that means the break will actually be three letters from start of
+ // word, to include the hyphen; whereas a 2-letter suffix will be
+ // broken only two letters from end of word).
+ conservativeRangeEnd = CONSERVATIVE_RANGE_LETTER;
+ conservativeRangeStart = CONSERVATIVE_RANGE_LETTER + 1;
+ } else {
+ conservativeRangeEnd = conservativeRangeStart = CONSERVATIVE_RANGE_OTHER;
+ }
+
+ bool result = (index < conservativeRangeStart ||  // near the start of the text
+ mLength - index < conservativeRangeEnd ||  // near the end of the text
+ index - mLastBreakIndex < conservativeRangeStart);  // near the latest break
+ if (result || !mHasNonbreakableSpace) return result;
+
+ // This text has no-breakable space, we need to check whether the index
+ // is near it.
+
+ // Note that index is always larger than conservativeRange here.
+ for (uint32_t i = index; index - conservativeRangeStart < i; --i) {  // scan backwards from index - 1
+ if (IS_NONBREAKABLE_SPACE(GetCodeUnitAt(i - 1))) return true;
+ }
+ // Note that index is always less than mLength - conservativeRange.
+ for (uint32_t i = index + 1; i < index + conservativeRangeEnd; ++i) {  // scan forwards from index + 1
+ if (IS_NONBREAKABLE_SPACE(GetCodeUnitAt(i))) return true;
+ }
+ return false;
+ }
+
+ bool HasPreviousEqualsSign() const { return mHasPreviousEqualsSign; }
+ void NotifySeenEqualsSign() { mHasPreviousEqualsSign = true; }
+
+ bool HasPreviousSlash() const { return mHasPreviousSlash; }
+ void NotifySeenSlash() { mHasPreviousSlash = true; }
+
+ bool HasPreviousBackslash() const { return mHasPreviousBackslash; }
+ void NotifySeenBackslash() { mHasPreviousBackslash = true; }
+
+ uint32_t GetPreviousNonHyphenCharacter() const {
+ return mPreviousNonHyphenCharacter;  // U_NULL until NotifyNonHyphenCharacter() is called
+ }
+ void NotifyNonHyphenCharacter(uint32_t ch) {
+ mPreviousNonHyphenCharacter = ch;
+ }
+
+ private:
+ void Init() {  // resets all state, then pre-scans the text for CJK / no-break-space characters
+ mIndex = 0;
+ mLastBreakIndex = 0;
+ mPreviousNonHyphenCharacter = U_NULL;
+ mHasCJKChar = false;
+ mHasNonbreakableSpace = false;
+ mHasPreviousEqualsSign = false;
+ mHasPreviousSlash = false;
+ mHasPreviousBackslash = false;
+
+ if (mText) {
+ // 8-bit text: we only need to check for &nbsp;
+ for (uint32_t i = 0; i < mLength; ++i) {
+ if (IS_NONBREAKABLE_SPACE(mText[i])) {
+ mHasNonbreakableSpace = true;
+ break;
+ }
+ }
+ } else {
+ // 16-bit text: handle surrogates and check for CJK as well as &nbsp;
+ for (uint32_t i = 0; i < mLength; ++i) {
+ char32_t u = GetUnicodeCharAt(i);
+ if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u)) {
+ mHasNonbreakableSpace = true;
+ if (mHasCJKChar) {
+ break;  // both flags are set; nothing more to learn
+ }
+ } else if (!mHasCJKChar && IS_CJK_CHAR(u)) {
+ mHasCJKChar = true;
+ if (mHasNonbreakableSpace) {
+ break;  // both flags are set; nothing more to learn
+ }
+ }
+ if (u > 0xFFFFu) {
+ ++i; // step over trailing low surrogate
+ }
+ }
+ }
+ }
+
+ const char16_t* const mUniText;  // 16-bit text, or nullptr for 8-bit text
+ const uint8_t* const mText;  // 8-bit text, or nullptr for 16-bit text
+
+ uint32_t mIndex;  // current position; advanced by AdvanceIndex()
+ const uint32_t mLength; // length of text
+ uint32_t mLastBreakIndex;  // value of mIndex at the latest NotifyBreakBefore()
+ char32_t mPreviousNonHyphenCharacter; // The last character we have seen
+ // which is not U_HYPHEN
+ bool mHasCJKChar; // if the text has CJK character, this is true.
+ bool mHasNonbreakableSpace; // if the text has no-breakable space,
+ // this is true.
+ bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL
+ bool mHasPreviousSlash; // True if we have seen a U_SLASH
+ bool mHasPreviousBackslash; // True if we have seen a U_BACKSLASH
+};
+
+static int8_t ContextualAnalysis(char32_t prev, char32_t cur, char32_t next,
+ ContextState& aState, LineBreakRule aLevel,
+ bool aIsChineseOrJapanese) {
+ // Picks the class of |cur| from its neighbours |prev|/|next| and |aState|. The explicit CLASS_OPEN/CLASS_CLOSE returns below happen only when aState.UseConservativeBreaking() is false.
+
+ if (IS_HYPHEN(cur)) {
+ // If next character is hyphen, we don't need to break between them.
+ if (IS_HYPHEN(next)) return CLASS_CHARACTER;
+ // If prev and next characters are numeric, it may be in Math context.
+ // So, we should not break here.
+ bool prevIsNum = IS_ASCII_DIGIT(prev);
+ bool nextIsNum = IS_ASCII_DIGIT(next);
+ if (prevIsNum && nextIsNum) return CLASS_NUMERIC;
+ // If one side is numeric and the other is a character, or if both sides are
+ // characters, the hyphen should be breakable.
+ if (!aState.UseConservativeBreaking(1)) {  // tested at the position after the hyphen
+ char32_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter();
+ if (prevOfHyphen && next) {  // need a character on both sides of the hyphen run
+ int8_t prevClass = GetClass(prevOfHyphen, aLevel, aIsChineseOrJapanese);
+ int8_t nextClass = GetClass(next, aLevel, aIsChineseOrJapanese);
+ bool prevIsNumOrCharOrClose =
+ prevIsNum ||
+ (prevClass == CLASS_CHARACTER &&
+ !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) ||
+ prevClass == CLASS_CLOSE || prevClass == CLASS_CLOSE_LIKE_CHARACTER;
+ bool nextIsNumOrCharOrOpen =
+ nextIsNum ||
+ (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) ||
+ nextClass == CLASS_OPEN || nextClass == CLASS_OPEN_LIKE_CHARACTER ||
+ next == U_OPEN_SINGLE_QUOTE || next == U_OPEN_DOUBLE_QUOTE ||
+ next == U_OPEN_GUILLEMET;
+ if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) {
+ return CLASS_CLOSE;  // hyphen acts like closing punctuation here
+ }
+ }
+ }
+ } else {
+ aState.NotifyNonHyphenCharacter(cur);
+ if (cur == U_SLASH || cur == U_BACKSLASH) {
+ // If this is immediately after same char, we should not break here.
+ if (prev == cur) return CLASS_CHARACTER;
+ // If this text has two or more (BACK)SLASHs, this may be file path or
+ // URL. Make sure to compute shouldReturn before we notify on this slash.
+ bool shouldReturn = !aState.UseConservativeBreaking() &&
+ (cur == U_SLASH ? aState.HasPreviousSlash()
+ : aState.HasPreviousBackslash());
+
+ if (cur == U_SLASH) {
+ aState.NotifySeenSlash();
+ } else {
+ aState.NotifySeenBackslash();
+ }
+
+ if (shouldReturn) return CLASS_OPEN;
+ } else if (cur == U_PERCENT) {
+ // If this is a part of the param of URL, we should break before.
+ if (!aState.UseConservativeBreaking()) {
+ if (aState.Index() >= 3 &&
+ aState.GetCodeUnitAt(aState.Index() - 3) == U_PERCENT)  // '%' three code units earlier
+ return CLASS_OPEN;
+ if (aState.Index() + 3 < aState.Length() &&
+ aState.GetCodeUnitAt(aState.Index() + 3) == U_PERCENT)  // '%' three code units later
+ return CLASS_OPEN;
+ }
+ } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) {
+ // If this may be a separator of params of URL, we should break after.
+ if (!aState.UseConservativeBreaking(1) && aState.HasPreviousEqualsSign())
+ return CLASS_CLOSE;
+ } else if (cur == U_OPEN_SINGLE_QUOTE || cur == U_OPEN_DOUBLE_QUOTE ||
+ cur == U_OPEN_GUILLEMET) {
+ // for CJK usage, we treat these as openers to allow a break before them,
+ // but otherwise treat them as normal characters because quote mark usage
+ // in various Western languages varies too much; see bug #450088
+ // discussion.
+ if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next))
+ return CLASS_OPEN;
+ } else {
+ NS_ERROR("Forgot to handle the current character!");
+ }
+ }
+ return GetClass(cur, aLevel, aIsChineseOrJapanese);  // no contextual override applied
+}
+
+int32_t LineBreaker::Next(const char16_t* aText, uint32_t aLen, uint32_t aPos) {  // returns an offset after aPos, or NS_LINEBREAKER_NEED_MORE_TEXT
+ MOZ_ASSERT(aText);
+
+ if (aPos >= aLen) {
+ return NS_LINEBREAKER_NEED_MORE_TEXT;
+ }
+
+ bool textNeedsComplexLineBreak = false;
+ int32_t begin, end;  // [begin, end) is the NS_IsSpace-delimited run around aPos
+
+ for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) {
+ if (IS_CJK_CHAR(aText[begin]) ||
+ NS_NeedsPlatformNativeHandling(aText[begin])) {
+ textNeedsComplexLineBreak = true;
+ }
+ }  // NOTE(review): aText[begin] at the final value of begin is never tested by either loop - confirm intended
+ for (end = aPos + 1; end < int32_t(aLen) && !NS_IsSpace(aText[end]); ++end) {
+ if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) {
+ textNeedsComplexLineBreak = true;
+ }
+ }
+
+ int32_t ret;
+ if (!textNeedsComplexLineBreak) {
+ // No complex text character, do not try to do complex line break.
+ // (This is required for serializers. See Bug #344816.)
+ ret = end;
+ } else {
+ AutoTArray<uint8_t, 2000> breakState;  // one entry per code unit of the run
+ // XXX(Bug 1631371) Check if this should use a fallible operation as it
+ // pretended earlier.
+ breakState.AppendElements(end - begin);
+ ComputeBreakPositions(aText + begin, end - begin, WordBreakRule::Normal,
+ LineBreakRule::Auto, false, breakState.Elements());
+
+ ret = aPos;
+ do {  // advance to the first offset after aPos flagged in breakState, or to end
+ ++ret;
+ } while (begin < ret && ret < end && !breakState[ret - begin]);
+ }
+
+ return ret;
+}
+
+// Returns true when both |aPrev| and |aCh| have a Unicode line-break class
+// that word-break: keep-all applies to, in which case the break between them
+// is to be suppressed.
+static bool SuppressBreakForKeepAll(uint32_t aPrev, uint32_t aCh) {
+  const auto isKeepAllClass = [](uint8_t aLBClass) -> bool {
+    // Per https://drafts.csswg.org/css-text-3/#valdef-word-break-keep-all:
+    // "implicit soft wrap opportunities between typographic letter units
+    // (or other typographic character units belonging to the NU, AL, AI,
+    // or ID Unicode line breaking classes [UAX14]) are suppressed..."
+    const bool namedBySpec =
+        aLBClass == U_LB_ALPHABETIC || aLBClass == U_LB_AMBIGUOUS ||
+        aLBClass == U_LB_NUMERIC || aLBClass == U_LB_IDEOGRAPHIC;
+    // Additional classes that should be treated similarly, but have been
+    // broken out as separate classes in newer Unicode versions:
+    const bool brokenOutLater =
+        aLBClass == U_LB_H2 || aLBClass == U_LB_H3 || aLBClass == U_LB_JL ||
+        aLBClass == U_LB_JV || aLBClass == U_LB_JT ||
+        aLBClass == U_LB_CONDITIONAL_JAPANESE_STARTER;
+    return namedBySpec || brokenOutLater;
+  };
+
+  // The class of |aCh| is only looked up when |aPrev| already qualifies.
+  if (!isKeepAllClass(GetLineBreakClass(aPrev))) {
+    return false;
+  }
+  return isKeepAllClass(GetLineBreakClass(aCh));
+}
+
+// Computes, for each UTF-16 code unit of aChars[0..aLength), whether a line
+// break is allowed immediately before it, and stores the result in
+// aBreakBefore (which the header documents as also having length aLength).
+//
+// aWordBreak and aLevel select the word-break / line-break behaviour;
+// aIsChineseOrJapanese is forwarded to the character classifier. Runs of
+// CLASS_COMPLEX characters are either broken at every grapheme cluster
+// (break-all) or handed to ComplexBreaker::GetBreaks.
+void LineBreaker::ComputeBreakPositions(
+    const char16_t* aChars, uint32_t aLength, WordBreakRule aWordBreak,
+    LineBreakRule aLevel, bool aIsChineseOrJapanese, uint8_t* aBreakBefore) {
+  uint32_t cur;
+  int8_t lastClass = CLASS_NONE;
+  ContextState state(aChars, aLength);
+
+  for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
+    char32_t ch = state.GetUnicodeCharAt(cur);
+    uint32_t chLen = ch > 0xFFFFu ? 2 : 1;
+    int8_t cl;
+
+    // Lazily fetches the character before `cur`, combining a surrogate pair
+    // into a single code point; returns 0 at the start of the text.
+    auto prev = [=]() -> char32_t {
+      if (!cur) {
+        return 0;
+      }
+      char32_t c = aChars[cur - 1];
+      if (cur > 1 && NS_IS_SURROGATE_PAIR(aChars[cur - 2], c)) {
+        c = SURROGATE_TO_UCS4(aChars[cur - 2], c);
+      }
+      return c;
+    };
+
+    if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
+      char32_t next;
+      if (cur + chLen < aLength) {
+        next = state.GetUnicodeCharAt(cur + chLen);
+      } else {
+        next = 0;
+      }
+      cl = ContextualAnalysis(prev(), ch, next, state, aLevel,
+                              aIsChineseOrJapanese);
+    } else {
+      if (ch == U_EQUAL) state.NotifySeenEqualsSign();
+      state.NotifyNonHyphenCharacter(ch);
+      cl = GetClass(ch, aLevel, aIsChineseOrJapanese);
+    }
+
+    // To implement word-break:break-all, we overwrite the line-break class of
+    // alphanumeric characters so they are treated the same as ideographic.
+    // The relevant characters will have been assigned CLASS_CHARACTER, _CLOSE,
+    // _CLOSE_LIKE_CHARACTER, or _NUMERIC by GetClass(), but those classes also
+    // include others that we don't want to touch here, so we re-check the
+    // Unicode line-break class to determine which ones to modify.
+    if (aWordBreak == WordBreakRule::BreakAll &&
+        (cl == CLASS_CHARACTER || cl == CLASS_CLOSE ||
+         cl == CLASS_CLOSE_LIKE_CHARACTER || cl == CLASS_NUMERIC)) {
+      auto cls = GetLineBreakClass(ch);
+      if (cls == U_LB_ALPHABETIC || cls == U_LB_NUMERIC ||
+          cls == U_LB_AMBIGUOUS || cls == U_LB_COMPLEX_CONTEXT ||
+          /* Additional Japanese and Korean LB classes; CSS Text spec doesn't
+             explicitly mention these, but this appears to give expected
+             behavior (spec issue?) */
+          cls == U_LB_CONDITIONAL_JAPANESE_STARTER ||
+          (cls >= U_LB_H2 && cls <= U_LB_JV)) {
+        cl = CLASS_BREAKABLE;
+      }
+    }
+
+    bool allowBreak = false;
+    if (cur > 0) {
+      NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl,
+                   "Loop should have prevented adjacent complex chars here");
+      allowBreak =
+          (state.UseConservativeBreaking() ? GetPairConservative(lastClass, cl)
+                                           : GetPair(lastClass, cl));
+      // Special cases where a normally-allowed break is suppressed:
+      if (allowBreak) {
+        // word-break:keep-all suppresses breaks between certain line-break
+        // classes.
+        if (aWordBreak == WordBreakRule::KeepAll &&
+            SuppressBreakForKeepAll(prev(), ch)) {
+          allowBreak = false;
+        }
+        // We also don't allow a break within a run of U+3000 chars unless
+        // word-break:break-all is in effect.
+        if (ch == 0x3000 && prev() == 0x3000 &&
+            aWordBreak != WordBreakRule::BreakAll) {
+          allowBreak = false;
+        }
+      }
+    }
+    aBreakBefore[cur] = allowBreak;
+    if (allowBreak) state.NotifyBreakBefore();
+    lastClass = cl;
+    if (CLASS_COMPLEX == cl) {
+      // Find the end of the run of consecutive complex-class characters.
+      uint32_t end = cur + chLen;
+
+      while (end < aLength) {
+        char32_t c = state.GetUnicodeCharAt(end);
+        if (CLASS_COMPLEX != GetClass(c, aLevel, false)) {
+          break;
+        }
+        ++end;
+        if (c > 0xFFFFU) {  // it was a surrogate pair
+          ++end;
+        }
+      }
+
+      if (aWordBreak == WordBreakRule::BreakAll) {
+        // For break-all, we don't need to run a dictionary-based breaking
+        // algorithm, we just allow breaks between all grapheme clusters.
+        GraphemeClusterBreakIteratorUtf16 ci(
+            Span<const char16_t>(aChars + cur, end - cur));
+        while (Maybe<uint32_t> pos = ci.Next()) {
+          // The iterator's last result is the length of the run, i.e. the
+          // position `end`. That slot is either rewritten later in this
+          // function (when end < aLength) or lies one past the output array
+          // (when end == aLength), so it must not be written here.
+          if (cur + *pos < end) {
+            aBreakBefore[cur + *pos] = true;
+          }
+        }
+      } else {
+        ComplexBreaker::GetBreaks(aChars + cur, end - cur, aBreakBefore + cur);
+        // restore breakability at chunk begin, which was always set to false
+        // by the complex line breaker
+        aBreakBefore[cur] = allowBreak;
+      }
+
+      cur = end - 1;
+    }
+
+    // NOTE(review): when a complex run started with a supplementary-plane
+    // character, `cur` has already been moved to the end of the run above, so
+    // this block then acts on the position after the run -- confirm that this
+    // is intended.
+    if (chLen == 2) {
+      // Supplementary-plane character: mark that we cannot break before the
+      // trailing low surrogate, and advance past it.
+      ++cur;
+      aBreakBefore[cur] = false;
+      state.AdvanceIndex();
+    }
+  }
+}
+
+// 8-bit variant: computes, for each element of aChars[0..aLength), whether a
+// line break is allowed immediately before it and stores the flag in
+// aBreakBefore. Every element is treated as one character, so no surrogate
+// handling is done here.
+void LineBreaker::ComputeBreakPositions(const uint8_t* aChars, uint32_t aLength,
+                                        WordBreakRule aWordBreak,
+                                        LineBreakRule aLevel,
+                                        bool aIsChineseOrJapanese,
+                                        uint8_t* aBreakBefore) {
+  ContextState state(aChars, aLength);
+  int8_t prevClass = CLASS_NONE;
+
+  for (uint32_t idx = 0; idx < aLength; ++idx, state.AdvanceIndex()) {
+    const char32_t ch = aChars[idx];
+    int8_t curClass;
+
+    if (!NEED_CONTEXTUAL_ANALYSIS(ch)) {
+      if (ch == U_EQUAL) {
+        state.NotifySeenEqualsSign();
+      }
+      state.NotifyNonHyphenCharacter(ch);
+      curClass = GetClass(ch, aLevel, aIsChineseOrJapanese);
+    } else {
+      // Neighbours outside the text are reported as U_NULL.
+      curClass = ContextualAnalysis(
+          idx > 0 ? aChars[idx - 1] : U_NULL, ch,
+          idx + 1 < aLength ? aChars[idx + 1] : U_NULL, state, aLevel,
+          aIsChineseOrJapanese);
+    }
+
+    // For word-break:break-all, characters whose Unicode line-break class is
+    // alphabetic, numeric or complex-context are made breakable.
+    const bool mayBecomeBreakable =
+        aWordBreak == WordBreakRule::BreakAll &&
+        (curClass == CLASS_CHARACTER || curClass == CLASS_CLOSE ||
+         curClass == CLASS_CLOSE_LIKE_CHARACTER || curClass == CLASS_NUMERIC);
+    if (mayBecomeBreakable) {
+      const auto lbClass = GetLineBreakClass(ch);
+      // Don't need to check additional Japanese/Korean classes in 8-bit
+      if (lbClass == U_LB_ALPHABETIC || lbClass == U_LB_NUMERIC ||
+          lbClass == U_LB_COMPLEX_CONTEXT) {
+        curClass = CLASS_BREAKABLE;
+      }
+    }
+
+    bool allowBreak = false;
+    if (idx > 0) {
+      const bool pairAllowsBreak = state.UseConservativeBreaking()
+                                       ? GetPairConservative(prevClass, curClass)
+                                       : GetPair(prevClass, curClass);
+      // word-break:keep-all vetoes an otherwise permitted break.
+      allowBreak = pairAllowsBreak &&
+                   (aWordBreak != WordBreakRule::KeepAll ||
+                    !SuppressBreakForKeepAll(aChars[idx - 1], ch));
+    }
+
+    aBreakBefore[idx] = allowBreak;
+    if (allowBreak) {
+      state.NotifyBreakBefore();
+    }
+    prevClass = curClass;
+  }
+}
diff --git a/intl/lwbrk/LineBreaker.h b/intl/lwbrk/LineBreaker.h
new file mode 100644
index 0000000000..a2d7377474
--- /dev/null
+++ b/intl/lwbrk/LineBreaker.h
@@ -0,0 +1,82 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef mozilla_intl_LineBreaker_h__
+#define mozilla_intl_LineBreaker_h__
+
+#include <cstdint>
+
+#define NS_LINEBREAKER_NEED_MORE_TEXT -1
+
+namespace mozilla {
+namespace intl {
+enum class LineBreakRule : uint8_t;
+enum class WordBreakRule : uint8_t;
+
+// Static-only helper that finds line-break opportunities in text, either one
+// at a time (Next) or for a whole buffer at once (ComputeBreakPositions).
+class LineBreaker final {
+ public:
+ // LineBreaker is a utility class with only static methods. No need to
+ // instantiate it.
+ LineBreaker() = delete;
+ ~LineBreaker() = delete;
+
+ // Find the next line break opportunity starting from aPos + 1. It can return
+ // aLen if there's no break opportunity between [aPos + 1, aLen - 1].
+ //
+ // If aPos is already at the end of aText or beyond, i.e. aPos >= aLen, return
+ // NS_LINEBREAKER_NEED_MORE_TEXT.
+ //
+ // DEPRECATED: Use LineBreakIteratorUtf16 instead.
+ static int32_t Next(const char16_t* aText, uint32_t aLen, uint32_t aPos);
+
+ // Call this on a word with whitespace at either end. We will apply JISx4051
+ // rules to find breaks inside the word. aBreakBefore is set to the break-
+ // before status of each character; aBreakBefore[0] will always be false
+ // because we never return a break before the first character.
+ // aLength is the length of the aText array and also the length of the
+ // aBreakBefore output array.
+ // aWordBreak and aLevel select the word-break and line-break rules to apply;
+ // aIsChineseOrJapanese is passed on to the character classifier.
+ static void ComputeBreakPositions(const char16_t* aText, uint32_t aLength,
+ WordBreakRule aWordBreak,
+ LineBreakRule aLevel,
+ bool aIsChineseOrJapanese,
+ uint8_t* aBreakBefore);
+ // Same as above, for 8-bit text where each element is one character.
+ static void ComputeBreakPositions(const uint8_t* aText, uint32_t aLength,
+ WordBreakRule aWordBreak,
+ LineBreakRule aLevel,
+ bool aIsChineseOrJapanese,
+ uint8_t* aBreakBefore);
+};
+
+// Returns true if the UTF-16 code unit u is one of the space characters the
+// line breaker treats as delimiting a run of text.
+static inline bool NS_IsSpace(char16_t u) {
+  switch (u) {
+    case 0x0020:  // SPACE
+    case 0x0009:  // CHARACTER TABULATION
+    case 0x000D:  // CARRIAGE RETURN
+    case 0x1361:  // ETHIOPIC WORDSPACE
+    case 0x1680:  // OGHAM SPACE MARK
+    case 0x205F:  // MEDIUM MATHEMATICAL SPACE
+      return true;
+    default:
+      break;
+  }
+  // U+2000..U+2006: EN QUAD, EM QUAD, EN SPACE, EM SPACE, THREE-PER-EM SPACE,
+  // FOUR-PER-EM SPACE, SIX-PER-EM SPACE; U+2008..U+200B: PUNCTUATION SPACE,
+  // THIN SPACE, HAIR SPACE, ZERO WIDTH SPACE. U+2007 is deliberately excluded.
+  return 0x2000 <= u && u <= 0x200B && u != 0x2007;
+}
+
+// Returns true if aChar lies in one of the code-point ranges that are routed
+// to the platform-native (complex) breaker.
+static inline bool NS_NeedsPlatformNativeHandling(char16_t aChar) {
+#if ANDROID || XP_WIN
+  // Bug 1647377/1736393: no "platform native" support for Tibetan; better to
+  // just use our class-based breaker.
+  const char16_t kFirstRangeLast = 0x0eff;  // Thai, Lao
+#else
+  // Routing Tibetan to the platform-native breaker currently results in
+  // WPT failures in a few css3-text-line-break-opclns-* testcases that mix
+  // a Tibetan character with other-script context.
+  const char16_t kFirstRangeLast = 0x0fff;  // Thai, Lao, Tibetan
+#endif
+  if (0x0e01 <= aChar && aChar <= kFirstRangeLast) {
+    return true;
+  }
+  return 0x1780 <= aChar && aChar <= 0x17ff;  // Khmer
+}
+
+} // namespace intl
+} // namespace mozilla
+
+#endif /* mozilla_intl_LineBreaker_h__ */
diff --git a/intl/lwbrk/Segmenter.cpp b/intl/lwbrk/Segmenter.cpp
new file mode 100644
index 0000000000..53d87336a3
--- /dev/null
+++ b/intl/lwbrk/Segmenter.cpp
@@ -0,0 +1,260 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/* Classes to iterate over grapheme, word, sentence, or line. */
+
+#include "mozilla/intl/Segmenter.h"
+
+#include "mozilla/intl/LineBreaker.h"
+#include "mozilla/intl/WordBreaker.h"
+#include "mozilla/intl/UnicodeProperties.h"
+#include "nsUnicodeProperties.h"
+#include "nsCharTraits.h"
+
+using namespace mozilla::unicode;
+
+namespace mozilla::intl {
+
+// Stores the text to iterate over.
+SegmentIteratorUtf16::SegmentIteratorUtf16(Span<const char16_t> aText)
+    : mText(aText) {}
+
+// Moves the current position forward to aPos (never backward) and then
+// returns whatever Next() reports from there.
+Maybe<uint32_t> SegmentIteratorUtf16::Seek(uint32_t aPos) {
+  mPos = (mPos < aPos) ? aPos : mPos;
+  return Next();
+}
+
+// Stores the text and the line-break options.
+LineBreakIteratorUtf16::LineBreakIteratorUtf16(Span<const char16_t> aText,
+                                               const LineBreakOptions& aOptions)
+    : SegmentIteratorUtf16(aText), mOptions(aOptions) {}
+
+// Advances to the next line-break position reported by LineBreaker::Next and
+// returns it, or Nothing() once the end of the text has been reached.
+Maybe<uint32_t> LineBreakIteratorUtf16::Next() {
+  const int32_t found =
+      LineBreaker::Next(mText.Elements(), mText.Length(), mPos);
+  if (found != NS_LINEBREAKER_NEED_MORE_TEXT) {
+    mPos = found;
+    return Some(mPos);
+  }
+  return Nothing();
+}
+
+// Stores the text to iterate over.
+WordBreakIteratorUtf16::WordBreakIteratorUtf16(Span<const char16_t> aText)
+    : SegmentIteratorUtf16(aText) {}
+
+// Advances to the next word-break position reported by WordBreaker::Next and
+// returns it, or Nothing() once the end of the text has been reached.
+Maybe<uint32_t> WordBreakIteratorUtf16::Next() {
+  const int32_t found =
+      WordBreaker::Next(mText.Elements(), mText.Length(), mPos);
+  if (found != NS_WORDBREAKER_NEED_MORE_TEXT) {
+    mPos = found;
+    return Some(mPos);
+  }
+  return Nothing();
+}
+
+// Stores the text to iterate over; iteration starts at offset 0 (the default
+// value of mPos).
+GraphemeClusterBreakIteratorUtf16::GraphemeClusterBreakIteratorUtf16(
+ Span<const char16_t> aText)
+ : SegmentIteratorUtf16(aText) {}
+
+// Hangul syllable types, with values taken from the corresponding U_HST_*
+// constants so that a property value can be converted directly.
+enum HSType {
+ HST_NONE = U_HST_NOT_APPLICABLE,
+ HST_L = U_HST_LEADING_JAMO,
+ HST_V = U_HST_VOWEL_JAMO,
+ HST_T = U_HST_TRAILING_JAMO,
+ HST_LV = U_HST_LV_SYLLABLE,
+ HST_LVT = U_HST_LVT_SYLLABLE
+};
+
+// Looks up the HangulSyllableType property of aCh and returns it as an HSType.
+static HSType GetHangulSyllableType(uint32_t aCh) {
+  const auto rawValue = UnicodeProperties::GetIntPropertyValue(
+      aCh, UnicodeProperties::IntProperty::HangulSyllableType);
+  return HSType(rawValue);
+}
+
+// Advances mPos past one grapheme cluster and returns the new position, or
+// Nothing() if the iterator is already at the end of the text.
+//
+// The cluster consists of a base character (a surrogate pair, a sequence of
+// conjoining Hangul Jamo, or a single code unit) followed by any number of
+// cluster-extending characters, including emoji joined to an emoji base via
+// ZWJ.
+Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Next() {
+ const auto len = mText.Length();
+ if (mPos >= len) {
+ // The iterator has already reached the end.
+ return Nothing();
+ }
+
+ // Consume the base character.
+ uint32_t ch = mText[mPos++];
+
+ if (mPos < len && NS_IS_SURROGATE_PAIR(ch, mText[mPos])) {
+ ch = SURROGATE_TO_UCS4(ch, mText[mPos++]);
+ } else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) ||
+ (ch >= 0xac00 && ch <= 0xd7ff)) {
+ // Handle conjoining Jamo that make Hangul syllables
+ // hangulState tracks the type of the last unit accepted into the syllable;
+ // each following unit is accepted only if its type may follow that state.
+ HSType hangulState = GetHangulSyllableType(ch);
+ while (mPos < len) {
+ ch = mText[mPos];
+ HSType hangulType = GetHangulSyllableType(ch);
+ switch (hangulType) {
+ case HST_L:
+ case HST_LV:
+ case HST_LVT:
+ // L, LV and LVT may only follow a leading consonant (L).
+ if (hangulState == HST_L) {
+ hangulState = hangulType;
+ mPos++;
+ continue;
+ }
+ break;
+ case HST_V:
+ // A vowel may follow L, V or LV.
+ if ((hangulState != HST_NONE) && (hangulState != HST_T) &&
+ (hangulState != HST_LVT)) {
+ hangulState = hangulType;
+ mPos++;
+ continue;
+ }
+ break;
+ case HST_T:
+ // A trailing consonant may follow anything except L or a non-Hangul
+ // unit.
+ if (hangulState != HST_NONE && hangulState != HST_L) {
+ hangulState = hangulType;
+ mPos++;
+ continue;
+ }
+ break;
+ default:
+ break;
+ }
+ // Reached only when the unit was not accepted: the syllable ends here.
+ break;
+ }
+ }
+
+ const uint32_t kVS16 = 0xfe0f;
+ const uint32_t kZWJ = 0x200d;
+ // UTF-16 surrogate values for Fitzpatrick type modifiers
+ const uint32_t kFitzpatrickHigh = 0xD83C;
+ const uint32_t kFitzpatrickLowFirst = 0xDFFB;
+ const uint32_t kFitzpatrickLowLast = 0xDFFF;
+
+ // Checking the emoji-presentation property of the base character is a bit
+ // expensive, so we do it lazily.
+ enum class EmojiStatus : uint8_t {
+ No,
+ Yes,
+ Unknown,
+ } baseIsEmojiStatus = EmojiStatus::Unknown;
+
+ // Remember the base character and the position of the next, in case we need
+ // to evaluate its emoji status.
+ uint32_t baseCh = ch;
+ uint32_t afterBase = mPos;
+
+ // True if the two code units at aPos form a Fitzpatrick modifier.
+ auto isFitzpatrickModifierAt = [&](uint32_t aPos) -> bool {
+ return aPos + 1 < len && mText[aPos] == kFitzpatrickHigh &&
+ mText[aPos + 1] >= kFitzpatrickLowFirst &&
+ mText[aPos + 1] <= kFitzpatrickLowLast;
+ };
+
+ // True if the base is emoji-default, or text-default but immediately
+ // followed by VS16 or a Fitzpatrick modifier. The result is cached in
+ // baseIsEmojiStatus after the first call.
+ auto baseIsEmoji = [&]() -> bool {
+ if (baseIsEmojiStatus == EmojiStatus::Unknown) {
+ auto basePresentation = GetEmojiPresentation(baseCh);
+ baseIsEmojiStatus =
+ basePresentation == EmojiDefault ||
+ (basePresentation == TextDefault &&
+ ((afterBase < len && mText[afterBase] == kVS16) ||
+ isFitzpatrickModifierAt(afterBase)))
+ ? EmojiStatus::Yes
+ : EmojiStatus::No;
+ }
+ return baseIsEmojiStatus == EmojiStatus::Yes;
+ };
+
+ bool prevWasZwj = false;
+
+ // Absorb every following character that extends the cluster.
+ while (mPos < len) {
+ ch = mText[mPos];
+ size_t chLen = 1;
+
+ // Check for surrogate pairs; note that isolated surrogates will just
+ // be treated as generic (non-cluster-extending) characters here,
+ // which is fine for cluster-iterating purposes
+ if (mPos < len - 1 && NS_IS_SURROGATE_PAIR(ch, mText[mPos + 1])) {
+ ch = SURROGATE_TO_UCS4(ch, mText[mPos + 1]);
+ chLen = 2;
+ }
+
+ // A character extends the cluster if it is a cluster extender, or if it is
+ // an emoji (emoji-default, or text-default followed by VS16) that directly
+ // follows a ZWJ after an emoji base.
+ bool extendCluster =
+ IsClusterExtender(ch) ||
+ (prevWasZwj && baseIsEmoji() &&
+ ((GetEmojiPresentation(ch) == EmojiDefault) ||
+ (GetEmojiPresentation(ch) == TextDefault && mPos + chLen < len &&
+ mText[mPos + chLen] == kVS16)));
+ if (!extendCluster) {
+ break;
+ }
+
+ prevWasZwj = (ch == kZWJ);
+ mPos += chLen;
+ }
+
+ MOZ_ASSERT(mPos <= len, "Next() has overshot the string!");
+ return Some(mPos);
+}
+
+// Stores the text and positions the iterator at its end, since iteration
+// proceeds backward.
+GraphemeClusterBreakReverseIteratorUtf16::
+    GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText)
+    : SegmentIteratorUtf16(aText) {
+  mPos = mText.Length();
+}
+
+// Moves mPos backward over cluster-extending characters until a character
+// that does not extend a cluster has been consumed (or the start of the text
+// is reached) and returns the new position. Returns Nothing() when already at
+// the start.
+Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Next() {
+  if (mPos == 0) {
+    return Nothing();
+  }
+
+  for (;;) {
+    uint32_t ch = mText[--mPos];
+
+    // Combine a low surrogate with the high surrogate in front of it.
+    if (mPos > 0 && NS_IS_SURROGATE_PAIR(mText[mPos - 1], ch)) {
+      ch = SURROGATE_TO_UCS4(mText[--mPos], ch);
+    }
+
+    if (!IsClusterExtender(ch) || mPos == 0) {
+      break;
+    }
+  }
+
+  // XXX May need to handle conjoining Jamo
+
+  return Some(mPos);
+}
+
+// Moves the current position backward to aPos (never forward) and then
+// returns whatever Next() reports from there.
+Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Seek(uint32_t aPos) {
+  mPos = (mPos > aPos) ? aPos : mPos;
+  return Next();
+}
+
+// Creates a Segmenter for the requested granularity. Fails with
+// ICUError::InternalError for Sentence granularity, which has no iterator
+// implementation yet.
+Result<UniquePtr<Segmenter>, ICUError> Segmenter::TryCreate(
+    Span<const char> aLocale, const SegmenterOptions& aOptions) {
+  const bool isSupported =
+      aOptions.mGranularity != SegmenterGranularity::Sentence;
+  if (!isSupported) {
+    // The sentence iterator is not yet implemented.
+    return Err(ICUError::InternalError);
+  }
+  return MakeUnique<Segmenter>(aLocale, aOptions);
+}
+
+// Returns a new iterator over aText matching the granularity this Segmenter
+// was created with. Sentence granularity is not implemented and yields
+// nullptr (after asserting in debug builds).
+UniquePtr<SegmentIteratorUtf16> Segmenter::Segment(
+    Span<const char16_t> aText) const {
+  const SegmenterGranularity granularity = mOptions.mGranularity;
+  switch (granularity) {
+    case SegmenterGranularity::Line:
+      return MakeUnique<LineBreakIteratorUtf16>(aText);
+    case SegmenterGranularity::Word:
+      return MakeUnique<WordBreakIteratorUtf16>(aText);
+    case SegmenterGranularity::Grapheme:
+      return MakeUnique<GraphemeClusterBreakIteratorUtf16>(aText);
+    case SegmenterGranularity::Sentence:
+      MOZ_ASSERT_UNREACHABLE("Unimplemented yet!");
+      return nullptr;
+  }
+  MOZ_ASSERT_UNREACHABLE("All granularities must be handled!");
+  return nullptr;
+}
+
+} // namespace mozilla::intl
diff --git a/intl/lwbrk/Segmenter.h b/intl/lwbrk/Segmenter.h
new file mode 100644
index 0000000000..647adb6fab
--- /dev/null
+++ b/intl/lwbrk/Segmenter.h
@@ -0,0 +1,177 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/* Classes to iterate over grapheme, word, sentence, or line. */
+
+#ifndef intl_components_Segmenter_h_
+#define intl_components_Segmenter_h_
+
+#include "mozilla/intl/ICUError.h"
+#include "mozilla/Maybe.h"
+#include "mozilla/Result.h"
+#include "mozilla/Span.h"
+#include "mozilla/UniquePtr.h"
+
+namespace mozilla::intl {
+
+// The unit of text a Segmenter iterates over.
+enum class SegmenterGranularity : uint8_t {
+ Grapheme,
+ Word,
+ Sentence,
+ Line,
+};
+
+// Options used to create a Segmenter.
+struct SegmenterOptions final {
+ // Granularity of the iterators the Segmenter creates; Grapheme by default.
+ SegmenterGranularity mGranularity = SegmenterGranularity::Grapheme;
+};
+
+/**
+ * Interface of segment iterators. Subclass this class to implement an iterator
+ * for UTF-16 text.
+ */
+class SegmentIteratorUtf16 {
+ public:
+ virtual ~SegmentIteratorUtf16() = default;
+
+ // Disable copy or move semantics. Move semantic could be enabled in the
+ // future if needed.
+ SegmentIteratorUtf16(SegmentIteratorUtf16&&) = delete;
+ SegmentIteratorUtf16& operator=(SegmentIteratorUtf16&&) = delete;
+ SegmentIteratorUtf16(const SegmentIteratorUtf16&) = delete;
+ SegmentIteratorUtf16& operator=(const SegmentIteratorUtf16&) = delete;
+
+ /**
+ * Advance the iterator to the next break position.
+ *
+ * @return the break position. If there's no further break position, return
+ * Nothing().
+ */
+ virtual Maybe<uint32_t> Next() = 0;
+
+ /**
+ * Advance the iterator to the first break position following the specified
+ * position aPos.
+ *
+ * Note: if this iterator's current position is already >= aPos, this method
+ * behaves the same as Next().
+ *
+ * @return the break position, or Nothing() if there is none.
+ */
+ virtual Maybe<uint32_t> Seek(uint32_t aPos);
+
+ protected:
+ // Only subclasses may construct an iterator.
+ explicit SegmentIteratorUtf16(Span<const char16_t> aText);
+
+ // The text to iterate over.
+ Span<const char16_t> mText;
+
+ // The current break position within mText.
+ uint32_t mPos = 0;
+};
+
+// Each enum value has the same meaning as the corresponding `word-break`
+// property value in the CSS Text spec. See the details in
+// https://drafts.csswg.org/css-text-3/#word-break-property
+enum class WordBreakRule : uint8_t {
+ Normal = 0,
+ BreakAll,
+ KeepAll,
+};
+
+// Each enum value has the same meaning as the corresponding `line-break`
+// property value in the CSS Text spec. See the details in
+// https://drafts.csswg.org/css-text-3/#line-break-property.
+enum class LineBreakRule : uint8_t {
+ Auto = 0,
+ Loose,
+ Normal,
+ Strict,
+ Anywhere,
+};
+
+// Extra options for line break iterator.
+struct LineBreakOptions final {
+ // Defaults are word-break: normal, line-break: auto, and a script that is
+ // not Chinese or Japanese.
+ WordBreakRule mWordBreakRule = WordBreakRule::Normal;
+ LineBreakRule mLineBreakRule = LineBreakRule::Auto;
+ bool mScriptIsChineseOrJapanese = false;
+};
+
+/**
+ * Line break iterator for UTF-16 text.
+ */
+class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
+ public:
+ // aOptions defaults to a default-constructed LineBreakOptions.
+ explicit LineBreakIteratorUtf16(Span<const char16_t> aText,
+ const LineBreakOptions& aOptions = {});
+
+ Maybe<uint32_t> Next() override;
+
+ private:
+ // The options passed to the constructor.
+ LineBreakOptions mOptions;
+};
+
+/**
+ * Word break iterator for UTF-16 text.
+ */
+class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
+ public:
+ explicit WordBreakIteratorUtf16(Span<const char16_t> aText);
+
+ Maybe<uint32_t> Next() override;
+};
+
+/**
+ * Grapheme cluster break iterator for UTF-16 text.
+ */
+class GraphemeClusterBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
+ public:
+ explicit GraphemeClusterBreakIteratorUtf16(Span<const char16_t> aText);
+
+ Maybe<uint32_t> Next() override;
+};
+
+/**
+ * Grapheme cluster break reverse iterator for UTF-16 text.
+ *
+ * Note: The reverse iterator doesn't handle conjoining Jamo and emoji. Use it
+ * at your own risk.
+ */
+class GraphemeClusterBreakReverseIteratorUtf16 final
+ : public SegmentIteratorUtf16 {
+ public:
+ explicit GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText);
+
+ // Moves backward to the previous break position.
+ Maybe<uint32_t> Next() override;
+ // Moves the position back to aPos if it is currently beyond it, then behaves
+ // like Next().
+ Maybe<uint32_t> Seek(uint32_t aPos) override;
+};
+
+/**
+ * This component is a Mozilla-focused API for working with segmenters in
+ * internationalization code.
+ *
+ * This is a factory class. Call Segment() to create an iterator over a text
+ * of the granularity given in the options.
+ */
+class Segmenter final {
+ public:
+ // Creates a Segmenter, or returns an error if the requested granularity is
+ // not supported.
+ // NOTE: aLocale is a no-op currently.
+ static Result<UniquePtr<Segmenter>, ICUError> TryCreate(
+ Span<const char> aLocale, const SegmenterOptions& aOptions);
+
+ // aLocale is currently unused; only aOptions is stored.
+ explicit Segmenter(Span<const char> aLocale, const SegmenterOptions& aOptions)
+ : mOptions(aOptions) {}
+
+ // Creates an iterator over aText of a given granularity in mOptions.
+ UniquePtr<SegmentIteratorUtf16> Segment(Span<const char16_t> aText) const;
+
+ // TODO: Implement an iterator for Latin1 text.
+ // UniquePtr<SegmentIteratorLatin1> Segment(Span<const uint8_t> aText) const;
+
+ private:
+ SegmenterOptions mOptions;
+};
+
+} // namespace mozilla::intl
+
+#endif
diff --git a/intl/lwbrk/WordBreaker.cpp b/intl/lwbrk/WordBreaker.cpp
new file mode 100644
index 0000000000..f7fc10a7a1
--- /dev/null
+++ b/intl/lwbrk/WordBreaker.cpp
@@ -0,0 +1,185 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "mozilla/intl/UnicodeProperties.h"
+#include "mozilla/intl/WordBreaker.h"
+#include "mozilla/StaticPrefs_layout.h"
+#include "nsComplexBreaker.h"
+#include "nsTArray.h"
+#include "nsUnicodeProperties.h"
+
+using mozilla::intl::Script;
+using mozilla::intl::UnicodeProperties;
+using mozilla::intl::WordBreaker;
+using mozilla::intl::WordRange;
+using mozilla::unicode::GetGenCategory;
+
+// Character-range predicates used by WordBreaker::GetClass. Each macro
+// expands to a single fully parenthesized expression, so it can be negated or
+// combined with other operators safely.
+#define IS_ASCII(c) (0 == (0xFF80 & (c)))
+#define ASCII_IS_ALPHA(c) \
+  ((('a' <= (c)) && ((c) <= 'z')) || (('A' <= (c)) && ((c) <= 'Z')))
+#define ASCII_IS_DIGIT(c) (('0' <= (c)) && ((c) <= '9'))
+#define ASCII_IS_SPACE(c) \
+  ((' ' == (c)) || ('\t' == (c)) || ('\r' == (c)) || ('\n' == (c)))
+#define IS_ALPHABETICAL_SCRIPT(c) ((c) < 0x2E80)
+
+// we change the beginning of IS_HAN from 0x4e00 to 0x3400 to reflect
+// Unicode 3.0
+// The outer parentheses are required: without them `!IS_HAN(c)` would negate
+// only the first range test.
+#define IS_HAN(c)                           \
+  (((0x3400 <= (c)) && ((c) <= 0x9fff)) || \
+   ((0xf900 <= (c)) && ((c) <= 0xfaff)))
+#define IS_KATAKANA(c) ((0x30A0 <= (c)) && ((c) <= 0x30FF))
+#define IS_HIRAGANA(c) ((0x3040 <= (c)) && ((c) <= 0x309F))
+#define IS_HALFWIDTHKATAKANA(c) ((0xFF60 <= (c)) && ((c) <= 0xFF9F))
+
+// Returns true if aChar's script is one of the South-East Asian scripts that
+// are written without spaces between words, which means word boundaries have
+// to be found with the "complex breaker".
+// (https://en.wikipedia.org/wiki/Scriptio_continua)
+// (How well this works depends on the level of platform support for finding
+// possible line breaks - or possible word boundaries - in the particular
+// script. Thai, at least, works pretty well on the major desktop OSes. If
+// the script is not supported by the platform, we just won't find any useful
+// boundaries.)
+static bool IsScriptioContinua(char16_t aChar) {
+  static const Script kScriptioContinuaScripts[] = {
+      Script::THAI,     Script::MYANMAR,   Script::KHMER, Script::JAVANESE,
+      Script::BALINESE, Script::SUNDANESE, Script::LAO,
+  };
+
+  const Script charScript = UnicodeProperties::GetScriptCode(aChar);
+  for (const Script candidate : kScriptioContinuaScripts) {
+    if (charScript == candidate) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Classifies a UTF-16 code unit into the word-break class used to delimit
+// words: neighbouring code units of the same class belong to the same word.
+/* static */
+WordBreaker::WordBreakClass WordBreaker::GetClass(char16_t c) {
+  // begin of the hack
+
+  if (IS_ALPHABETICAL_SCRIPT(c)) {
+    if (IS_ASCII(c)) {
+      // ASCII is decided entirely here: space, word character or punctuation.
+      if (ASCII_IS_SPACE(c)) {
+        return kWbClassSpace;
+      }
+      const bool isWordChar =
+          ASCII_IS_ALPHA(c) || ASCII_IS_DIGIT(c) ||
+          (c == '_' && !StaticPrefs::layout_word_select_stop_at_underscore());
+      return isWordChar ? kWbClassAlphaLetter : kWbClassPunct;
+    }
+    if (c == 0x00A0 /*NBSP*/) {
+      return kWbClassSpace;
+    }
+  } else if (IS_HAN(c)) {
+    return kWbClassHanLetter;
+  } else if (IS_KATAKANA(c)) {
+    return kWbClassKatakanaLetter;
+  } else if (IS_HIRAGANA(c)) {
+    return kWbClassHiraganaLetter;
+  } else if (IS_HALFWIDTHKATAKANA(c)) {
+    return kWbClassHWKatakanaLetter;
+  }
+
+  // Everything not classified above: punctuation, then scripts written
+  // without word spaces, then ordinary letters.
+  if (GetGenCategory(c) == nsUGenCategory::kPunctuation) {
+    return kWbClassPunct;
+  }
+  return IsScriptioContinua(c) ? kWbClassScriptioContinua : kWbClassAlphaLetter;
+}
+
+// Finds the word containing aText[aPos] by scanning forward and backward for
+// the nearest code units whose word-break class differs from that of
+// aText[aPos].
+//
+// Returns the half-open range [mBegin, mEnd) of the word; mEnd is aLen when
+// the word extends to the end of the text. If aPos >= aLen, both mBegin and
+// mEnd are aLen. For scripts written without word spaces, the range is then
+// narrowed using the breaks reported by the complex breaker.
+WordRange WordBreaker::FindWord(const char16_t* aText, uint32_t aLen,
+                                uint32_t aPos) {
+  MOZ_ASSERT(aText);
+
+  if (aPos >= aLen) {
+    return {aLen, aLen};
+  }
+
+  WordBreakClass c = GetClass(aText[aPos]);
+  WordRange range{0, aLen};
+
+  // Scan forward. The bound must be strictly less than aLen: aText[aLen] is
+  // past the end of the buffer, and mEnd already defaults to aLen.
+  for (uint32_t i = aPos + 1; i < aLen; i++) {
+    if (c != GetClass(aText[i])) {
+      range.mEnd = i;
+      break;
+    }
+  }
+
+  // Scan backward
+  for (uint32_t i = aPos; i > 0; i--) {
+    if (c != GetClass(aText[i - 1])) {
+      range.mBegin = i;
+      break;
+    }
+  }
+
+  if (kWbClassScriptioContinua == c) {
+    // we pass the whole text segment to the complex word breaker to find a
+    // shorter answer
+    AutoTArray<uint8_t, 256> breakBefore;
+    breakBefore.SetLength(range.mEnd - range.mBegin);
+    ComplexBreaker::GetBreaks(aText + range.mBegin, range.mEnd - range.mBegin,
+                              breakBefore.Elements());
+
+    // Scan forward
+    for (uint32_t i = aPos + 1; i < range.mEnd; i++) {
+      if (breakBefore[i - range.mBegin]) {
+        range.mEnd = i;
+        break;
+      }
+    }
+
+    // Scan backward
+    for (uint32_t i = aPos; i > range.mBegin; i--) {
+      if (breakBefore[i - range.mBegin]) {
+        range.mBegin = i;
+        break;
+      }
+    }
+  }
+  return range;
+}
+
+// Returns the offset of the next word-break opportunity after aPos, which is
+// the first following position whose word-break class differs from that of
+// aText[aPos] (or aLen if there is none). Returns
+// NS_WORDBREAKER_NEED_MORE_TEXT when aPos >= aLen. For scripts written
+// without word spaces, an earlier break reported by the complex breaker is
+// preferred.
+int32_t WordBreaker::Next(const char16_t* aText, uint32_t aLen, uint32_t aPos) {
+  MOZ_ASSERT(aText);
+
+  if (aPos >= aLen) {
+    return NS_WORDBREAKER_NEED_MORE_TEXT;
+  }
+
+  const WordBreakClass posClass = GetClass(aText[aPos]);
+  uint32_t nextBreakPos;
+  for (nextBreakPos = aPos + 1; nextBreakPos < aLen; ++nextBreakPos) {
+    if (posClass != GetClass(aText[nextBreakPos])) {
+      break;
+    }
+  }
+
+  if (kWbClassScriptioContinua == posClass) {
+    // We pass the whole text segment to the complex word breaker to find a
+    // shorter answer.
+    const char16_t* segStart = aText + aPos;
+    // The segment includes the code unit at nextBreakPos when there is one.
+    // When the run reaches the end of the text there is none, so the length
+    // is clamped to what remains in the buffer.
+    const uint32_t remaining = aLen - aPos;
+    const uint32_t wanted = nextBreakPos - aPos + 1;
+    const uint32_t segLen = wanted < remaining ? wanted : remaining;
+    AutoTArray<uint8_t, 256> breakBefore;
+    breakBefore.SetLength(segLen);
+    ComplexBreaker::GetBreaks(segStart, segLen, breakBefore.Elements());
+
+    for (uint32_t i = aPos + 1; i < nextBreakPos; ++i) {
+      if (breakBefore[i - aPos]) {
+        nextBreakPos = i;
+        break;
+      }
+    }
+  }
+
+  MOZ_ASSERT(nextBreakPos != aPos);
+  return nextBreakPos;
+}
diff --git a/intl/lwbrk/WordBreaker.h b/intl/lwbrk/WordBreaker.h
new file mode 100644
index 0000000000..f508e41ba6
--- /dev/null
+++ b/intl/lwbrk/WordBreaker.h
@@ -0,0 +1,65 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef mozilla_intl_WordBreaker_h__
+#define mozilla_intl_WordBreaker_h__
+
+#include <cstdint>
+
+#define NS_WORDBREAKER_NEED_MORE_TEXT -1
+
+namespace mozilla {
+namespace intl {
+
+// A half-open range [mBegin, mEnd) of offsets into a text, as returned by
+// WordBreaker::FindWord.
+struct WordRange {
+ uint32_t mBegin;
+ uint32_t mEnd;
+};
+
+// Static-only helper that finds word boundaries in UTF-16 text by grouping
+// neighbouring code units of the same word-break class.
+class WordBreaker final {
+ public:
+ // WordBreaker is a utility class with only static methods. No need to
+ // instantiate it.
+ WordBreaker() = delete;
+ ~WordBreaker() = delete;
+
+ // Find the word boundary by scanning forward and backward from aPos.
+ //
+ // @return WordRange where mBegin equals the offset of the first character in
+ // the word and mEnd equals the offset of the last character plus 1. mEnd
+ // can be aLen if the desired word is at the end of aText.
+ //
+ // If aPos is already at the end of aText or beyond, both mBegin and mEnd
+ // equal aLen.
+ static WordRange FindWord(const char16_t* aText, uint32_t aLen,
+ uint32_t aPos);
+
+ // Find the next word break opportunity starting from aPos + 1. It can return
+ // aLen if there's no break opportunity between [aPos + 1, aLen - 1].
+ //
+ // If aPos is already at the end of aText or beyond, i.e. aPos >= aLen, return
+ // NS_WORDBREAKER_NEED_MORE_TEXT.
+ //
+ // DEPRECATED: Use WordBreakIteratorUtf16 instead.
+ static int32_t Next(const char16_t* aText, uint32_t aLen, uint32_t aPos);
+
+ private:
+ // Word-break classes assigned by GetClass(); a word boundary lies between
+ // two neighbouring code units of different classes.
+ enum WordBreakClass : uint8_t {
+ kWbClassSpace = 0,
+ kWbClassAlphaLetter,
+ kWbClassPunct,
+ kWbClassHanLetter,
+ kWbClassKatakanaLetter,
+ kWbClassHiraganaLetter,
+ kWbClassHWKatakanaLetter,
+ kWbClassScriptioContinua
+ };
+
+ // Returns the word-break class of a single UTF-16 code unit.
+ static WordBreakClass GetClass(char16_t aChar);
+};
+
+} // namespace intl
+} // namespace mozilla
+
+#endif /* mozilla_intl_WordBreaker_h__ */
diff --git a/intl/lwbrk/crashtests/416721.html b/intl/lwbrk/crashtests/416721.html
new file mode 100644
index 0000000000..0a6625ba8a
--- /dev/null
+++ b/intl/lwbrk/crashtests/416721.html
@@ -0,0 +1,11 @@
+<!DOCTYPE html>
+<html>
+ <head>
+ <title>Testcase for bug 416721</title>
+ <meta http-equiv="content-type" content="text/html; charset=utf-8">
+ </head>
+ <body>
+ <p>กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛</p>
+ </body>
+</html>
+
diff --git a/intl/lwbrk/crashtests/Lo_test_page_no_uniscribe_breaks.html b/intl/lwbrk/crashtests/Lo_test_page_no_uniscribe_breaks.html
new file mode 100644
index 0000000000..b8958e474e
--- /dev/null
+++ b/intl/lwbrk/crashtests/Lo_test_page_no_uniscribe_breaks.html
@@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<meta charset="utf-8">
+<title>Lao - test page no breaks</title>
+<style>
+div {
+ width: 6em;
+}
+</style>
+
+<p>The text below does not produce any breaks with the Uniscribe breaker and is longer than the test buffer length used for brokering.</p>
+
+<div lang="lo">ການຮັບຮູ້ກຽດຕິສັກອັນມີປະຈຳຢູ່ຕົວບຸກຄົນໃນວົງສະກຸນຂອງມະນຸດທຸກໆຄົນການຮັບຮູ້ກຽດຕິສັກອັນມີປະຈຳຢູ່ຕົວບຸກຄົນໃນວົງສະກຸນຂອງມະນຸດທຸກໆຄົນ</div>
diff --git a/intl/lwbrk/crashtests/UDHR_Thai_test_page_long_sequences.html b/intl/lwbrk/crashtests/UDHR_Thai_test_page_long_sequences.html
new file mode 100644
index 0000000000..bd483d93c5
--- /dev/null
+++ b/intl/lwbrk/crashtests/UDHR_Thai_test_page_long_sequences.html
@@ -0,0 +1,184 @@
+<!DOCTYPE html>
+<html>
+<head>
+<title>UDHR - Thai - test page for bug 1713973</title>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+</head>
+<body dir="ltr">
+<h3 style="text-align:center">Universal Declaration of Human Rights - Thai</h3>
+<p>© 1996 – 2009 The Office of the High Commissioner for Human Rights</p>
+<p>This HTML version prepared by the <i>UDHR in Unicode</i> project, <a href="https://www.unicode.org/udhr">http://www.unicode.org/udhr</a>.</p>
+<hr>
+<p style="color:red; background:yellow">NOTE: Spaces within Thai content were removed for testing purposes.</p>
+<div xml:lang="th" dir="ltr" lang="th">
+<h3>ปฏิญญาสากลว่าด้วยสิทธิมนุษยชน</h3>
+<p>ได้รับการรับรองและประกาศโดยข้อมติสมัชชาสหประชาชาติที่ 217 เอ (III) วันที่ 10 ธันวาคม พ.ศ. 2491</p>
+<h4>อารัมภบท</h4>
+<p>โดยที่การยอมรับศักดิ์ศรีแต่กำเนิดและสิทธิที่เท่าเทียมกันและที่ไม่อาจ
+เพิกถอนได้ของสมาชิกทั้งมวลแห่งครอบครัวมนุษยชาติเป็นพื้นฐานแห่งอิสรภาพ
+ความยุติธรรมและสันติภาพในโลก</p>
+<p>โดยที่การไม่นำพาและการหมิ่นในคุณค่าของสิทธิมนุษยชนยังผลให้มีการกระทำ
+อันป่าเถื่อนซึ่งเป็นการขัดอย่างร้ายแรงต่อมโนธรรมของมนุษยชาติและการมาถึง
+ของโลกที่ได้มีการประกาศให้ความมีอิสรภาพในการพูดและความเชื่อและอิสรภาพจาก
+ความหวาดกลัวและความต้องการของมนุษย์เป็นความปรารถนาสูงสุดของประชาชนทั่วไป</p>
+<p>โดยที่เป็นการจำเป็นที่สิทธิมนุษยชนควรได้รับความคุ้มครองโดยหลัก
+นิติธรรมถ้าจะไม่บังคับให้คนต้องหันเข้าหาการลุกขึ้นต่อต้านทรราชและการกด
+ขี่เป็นวิถีทางสุดท้าย</p>
+<p>โดยที่เป็นการจำเป็นที่จะส่งเสริมพัฒนาการแห่งความสัมพันธ์ฉันมิตรระหว่างชาติต่างๆ</p>
+<p>โดยที่ประชาชนแห่งสหประชาชาติได้ยืนยันอีกครั้งไว้ในกฎบัตรถึงศรัทธาใน
+สิทธิมนุษยชนขั้นพื้นฐานในศักดิ์ศรีและค่าของมนุษย์และในสิทธิที่เท่าเทียม
+กันของบรรดาชายและหญิงและได้มุ่งมั่นที่จะส่งเสริมความก้าวหน้าทางสังคมและ
+มาตรฐานแห่งชีวิตที่ดีขึ้นในอิสรภาพอันกว้างขวางยิ่งขึ้น</p>
+<p>โดยที่รัฐสมาชิกต่างปฏิญาณที่จะบรรลุถึงซึ่งการส่งเสริมการเคารพและการ
+ยึดถือสิทธิมนุษยชนและอิสรภาพขั้นพื้นฐานโดยสากลโดยความร่วมมือกับสหประชา
+ชาติ</p>
+<p>โดยที่ความเข้าใจร่วมกันในสิทธิและอิสรภาพเหล่านี้เป็นสิ่งสำคัญที่สุดเพื่อให้ปฏิญาณนี้สำเร็จผลเต็มบริบูรณ์</p>
+<p>ฉะนั้นบัดนี้สมัชชาจึงประกาศปฏิญญาสากลว่าด้วยสิทธิมนุษยชนนี้ให้เป็น
+มาตรฐานร่วมกันแห่งความสำเร็จสำหรับประชาชนทั้งมวลและประชาชาติทั้งหลาย
+เพื่อจุดมุ่งหมายที่ว่าปัจเจกบุคคลทุกคนและทุกส่วนของสังคมโดยการคำนึงถึง
+ปฏิญญานี้เป็นเนืองนิตย์จะมุ่งมั่นส่งเสริมการเคารพสิทธิและอิสรภาพเหล่านี้
+ด้วยการสอนและการศึกษาและให้มีการยอมรับและยึดถือโดยสากลอย่างมีประสิทธิผล
+ด้วยมาตรการแห่งชาติและระหว่างประเทศอันก้าวหน้าตามลำดับทั้งในบรรดาประชาชน
+ของรัฐสมาชิกด้วยกันเองและในบรรดาประชาชนของดินแดนที่อยู่ใต้เขตอำนาจแห่ง
+รัฐนั้น</p>
+<h4>ข้อ1</h4>
+<p>มนุษย์ทั้งปวงเกิดมามีอิสระและเสมอภาคกันในศักดิ์ศรีและสิทธิต่างในตนมีเหตุผลและมโนธรรมและควรปฏิบัติต่อกันด้วยจิตวิญญาณแห่งภราดรภาพ</p>
+<h4>ข้อ2</h4>
+<p>ทุกคนย่อมมีสิทธิและอิสรภาพทั้งปวงตามที่กำหนดไว้ในปฏิญญานี้โดยปราศจาก
+การแบ่งแยกไม่ว่าชนิดใดอาทิเชื้อชาติผิวเพศภาษาศาสนาความคิดเห็นทางการเมือง
+หรือทางอื่นพื้นเพทางชาติหรือสังคมทรัพย์สินการเกิดหรือสถานะอื่นนอกเหนือ
+จากนี้จะไม่มีการแบ่งแยกใดบนพื้นฐานของสถานะทางการเมืองทางกฎหมายหรือทางการ
+ระหว่างประเทศของประเทศหรือดินแดนที่บุคคลสังกัดไม่ว่าดินแดนนี้จะเป็น
+เอกราชอยู่ในความพิทักษ์มิได้ปกครองตนเองหรืออยู่ภายใต้การจำกัดอธิปไตยอื่น
+ใด</p>
+<h4>ข้อ3</h4>
+<p>ทุกคนมีสิทธิในการมีชีวิตเสรีภาพและความมั่นคงแห่งบุคคล</p>
+<h4>ข้อ4</h4>
+<p>บุคคลใดจะตกอยู่ในความเป็นทาสหรือสภาวะจำยอมไม่ได้ทั้งนี้ห้ามความเป็นทาสและการค้าทาสทุกรูปแบบ</p>
+<h4>ข้อ5</h4>
+<p>บุคคลใดจะถูกกระทำการทรมานหรือการปฏิบัติหรือการลงโทษที่โหดร้ายไร้มนุษยธรรมหรือย่ำยีศักดิ์ศรีไม่ได้</p>
+<h4>ข้อ6</h4>
+<p>ทุกคนมีสิทธิที่จะได้รับการยอมรับทุกแห่งหนว่าเป็นบุคคลตามกฎหมาย</p>
+<h4>ข้อ7</h4>
+<p>ทุกคนเสมอภาคกันตามกฎหมายและมีสิทธิที่จะได้รับความคุ้มครองของกฎหมาย
+เท่าเทียมกันโดยปราศจากการเลือกปฏิบัติใดทุกคนมีสิทธิที่จะได้รับความคุ้ม
+ครองเท่าเทียมกันจากการเลือกปฏิบัติใดอันเป็นการล่วงละเมิดปฏิญญานี้และจาก
+การยุยงให้มีการเลือกปฏิบัติดังกล่าว</p>
+<h4>ข้อ8</h4>
+<p>ทุกคนมีสิทธิที่จะได้รับการเยียวยาอันมีประสิทธิผลจากศาลที่มีอำนาจแห่ง
+รัฐต่อการกระทำอันล่วงละเมิดสิทธิขั้นพื้นฐานซึ่งตนได้รับตามรัฐธรรมนูญหรือ
+กฎหมาย</p>
+<h4>ข้อ9</h4>
+<p>บุคคลใดจะถูกจับกุมกักขังหรือเนรเทศตามอำเภอใจไม่ได้</p>
+<h4>ข้อ10</h4>
+<p>ทุกคนย่อมมีสิทธิในความเสมอภาคอย่างเต็มที่ในการได้รับการพิจารณาคดีที่
+เป็นธรรมและเปิดเผยจากศาลที่อิสระและไม่ลำเอียงในการพิจารณากำหนดสิทธิและ
+หน้าที่ของตนและข้อกล่าวหาอาญาใดต่อตน</p>
+<h4>ข้อ11</h4>
+<p>1.ทุกคนที่ถูกกล่าวหาว่ากระทำผิดทางอาญามีสิทธิที่จะได้รับการสันนิษฐาน
+ไว้ก่อนว่าบริสุทธิ์จนกว่าจะพิสูจน์ได้ว่ามีความผิดตามกฎหมายในการพิจารณา
+คดีที่เปิดเผยซึ่งตนได้รับหลักประกันที่จำเป็นทั้งปวงสำหรับการต่อสู้คดี</p>
+<p>2.บุคคลใดจะถูกตัดสินว่ามีความผิดทางอาญาใดอันเนื่องจากการกระทำหรือ
+ละเว้นใดอันมิได้ถือว่าเป็นความผิดทางอาญาตามกฎหมายแห่งชาติหรือกฎหมาย
+ระหว่างประเทศในขณะที่ได้กระทำการนั้นไม่ได้และจะกำหนดโทษที่หนักกว่าที่
+บังคับใช้ในขณะที่ได้กระทำความผิดทางอาญานั้นไม่ได้</p>
+<h4>ข้อ12</h4>
+<p>บุคคลใดจะถูกแทรกแซงตามอำเภอใจในความเป็นส่วนตัวครอบครัวที่อยู่อาศัย
+หรือการสื่อสารหรือจะถูกลบหลู่เกียรติยศและชื่อเสียงไม่ได้ทุกคนมีสิทธิที่
+จะได้รับความคุ้มครองของกฎหมายต่อการแทรกแซงสิทธิหรือการลบหลู่ดังกล่าวนั้น</p>
+<h4>ข13</h4>
+<p>1.ทุกคนมีสิทธิในอิสรภาพแห่งการเคลื่อนย้ายและการอยู่อาศัยภายในพรมแดนของแต่ละรัฐ</p>
+<p>2.ทุกคนมีสิทธิที่จะออกนอกประเทศใดรวมทั้งประเทศของตนเองและสิทธิที่จะกลับสู่ประเทศตน</p>
+<h4>ข้อ14</h4>
+<p>1.ทุกคนมีสิทธิที่จะแสวงหาและที่จะได้ที่ลี้ภัยในประเทศอื่นจากการประหัตประหาร</p>
+<p>2.สิทธินี้จะยกขึ้นกล่าวอ้างกับกรณีที่การดำเนินคดีที่เกิดขึ้นโดยแท้จาก
+ความผิดที่มิใช่ทางการเมืองหรือจากการกระทำอันขัดต่อวัตถุประสงค์และหลักการ
+ของสหประชาชาติไม่ได้</p>
+<h4>ข้อ15</h4>
+<p>1.ทุกคนมีสิทธิในสัญชาติหนึ่ง</p>
+<p>2.บุคคลใดจะถูกเพิกถอนสัญชาติของตนตามอำเภอใจหรือถูกปฏิเสธสิทธิที่จะเปลี่ยนสัญชาติของตนไม่ได้</p>
+<h4>ข้อ16</h4>
+<p>1.บรรดาชายและหญิงที่มีอายุครบบริบูรณ์แล้วมีสิทธิที่จะสมรสและก่อร่าง
+สร้างครอบครัวโดยปราศจากการจำกัดใดอันเนื่องจากเชื้อชาติสัญชาติหรือศาสนา
+ต่างย่อมมีสิทธิเท่าเทียมกันในการสมรสระหว่างการสมรสและในการขาดจากการสมรส</p>
+<p>2.การสมรสจะกระทำโดยความยินยอมอย่างอิสระและเต็มที่ของผู้ที่จะเป็นคู่สมรสเท่านั้น</p>
+<p>3.ครอบครัวเป็นหน่วยธรรมชาติและพื้นฐานของสังคมและย่อมมีสิทธิที่จะได้รับความคุ้มครองจากสังคมและรัฐ</p>
+<h4>ข้อ17</h4>
+<p>1.ทุกคนมีสิทธิที่จะเป็นเจ้าของทรัพย์สินโดยตนเองและโดยร่วมกับผู้อื่น</p>
+<p>2.บุคคลใดจะถูกเอาทรัพย์สินไปจากตนตามอำเภอใจไม่ได้</p>
+<h4>ข้อ18</h4>
+<p>ทุกคนมีสิทธิในอิสรภาพแห่งความคิดมโนธรรมและศาสนาทั้งนี้สิทธินี้รวมถึง
+อิสรภาพในการเปลี่ยนศาสนาหรือความเชื่อและอิสรภาพในการแสดงออกทางศาสนาหรือ
+ความเชื่อถือของตนในการสอนการปฏิบัติการสักการะบูชาและการประกอบพิธีกรรมไม่
+ว่าจะโดยลำพังหรือในชุมชนร่วมกับผู้อื่นและในที่สาธารณะหรือส่วนบุคคล</p>
+<h4>ข้อ19</h4>
+<p>ทุกคนมีสิทธิในอิสรภาพแห่งความเห็นและการแสดงออกทั้งนี้สิทธินี้รวมถึง
+อิสรภาพที่จะถือเอาความเห็นโดยปราศจากการแทรกแซงและที่จะแสวงหารับและส่ง
+ข้อมูลข่าวสารและข้อคิดผ่านสื่อใดและโดยไม่คำนึงถึงพรมแดน</p>
+<h4>ข้อ20</h4>
+<p>1.ทุกคนมีสิทธิในอิสรภาพแห่งการชุมนุมและการสมาคมโดยสันติ</p>
+<p>2.บุคคลใดไม่อาจถูกบังคับให้สังกัดสมาคมหนึ่งได้</p>
+<h4>ข้อ21</h4>
+<p>1.ทุกคนมีสิทธิที่จะมีส่วนร่วมในการปกครองประเทศตนโดยตรงหรือผ่านผู้แทนซึ่งได้รับเลือกตั้งโดยอิสระ</p>
+<p>2.ทุกคนมีสิทธิที่จะเข้าถึงบริการสาธารณะในประเทศตนโดยเสมอภาค</p>
+<p>3.เจตจำนงของประชาชนจะต้องเป็นพื้นฐานแห่งอำนาจการปกครองทั้งนี้เจตจำนง
+นี้จะต้องแสดงออกทางการเลือกตั้งตามกำหนดเวลาและอย่างแท้จริงซึ่งต้องเป็น
+การออกเสียงอย่างทั่วถึงและเสมอภาคและต้องเป็นการลงคะแนนลับหรือวิธีการลง
+คะแนนโดยอิสระในทำนองเดียวกัน</p>
+<h4>ข้อ22</h4>
+<p>ทุกคนในฐานะสมาชิกของสังคมมีสิทธิในหลักประกันทางสังคมและย่อมมีสิทธิใน
+การบรรลุสิทธิทางเศรษฐกิจสังคมและวัฒนธรรมอันจำเป็นยิ่งสำหรับศักดิ์ศรีของ
+ตนและการพัฒนาบุคลิกภาพของตนอย่างอิสระผ่านความพยายามของรัฐและความร่วมมือ
+ระหว่างประเทศและตามการจัดการและทรัพยากรของแต่ละรัฐ</p>
+<h4>ข้อ23</h4>
+<p>1.ทุกคนมีสิทธิในการทำงานในการเลือกงานโดยอิสระในเงื่อนไขที่ยุติธรรมและเอื้ออำนวยต่อการทำงานและในการคุ้มครองต่อการว่างงาน</p>
+<p>2.ทุกคนมีสิทธิที่จะได้รับค่าจ้างที่เท่าเทียมกันสำหรับงานที่เท่าเทียมกันโดยปราศจากการเลือกปฏิบัติใด</p>
+<p>3.ทุกคนที่ทำงานมีสิทธิที่จะได้รับค่าตอบแทนที่ยุติธรรมและเอื้ออำนวยต่อ
+การประกันความเป็นอยู่อันควรค่าแก่ศักดิ์ศรีของมนุษย์สำหรับตนเองและครอบ
+ครัวและหากจำเป็นก็จะได้รับการคุ้มครองทางสังคมในรูปแบบอื่นเพิ่มเติมด้วย</p>
+<p>4.ทุกคนมีสิทธิที่จะจัดตั้งและที่จะเข้าร่วมสหภาพแรงงานเพื่อความคุ้มครองผลประโยชน์ของตน</p>
+<h4>ข้อ24</h4>
+<p>ทุกคนมีสิทธิในการพักผ่อนและการผ่อนคลายยามว่างรวมทั้งจำกัดเวลาทำงานตามสมควรและวันหยุดเป็นครั้งคราวโดยได้รับค่าจ้าง</p>
+<h4>ข้อ25</h4>
+<p>1.ทุกคนมีสิทธิในมาตรฐานการครองชีพอันเพียงพอสำหรับสุขภาพและความอยู่ดี
+ของตนและของครอบครัวรวมทั้งอาหารเครื่องนุ่งห่มที่อยู่อาศัยและการดูแลรักษา
+ทางการแพทย์และบริการสังคมที่จำเป็นและมีสิทธิในหลักประกันยามว่างงานเจ็บ
+ป่วยพิการหม้ายวัยชราหรือปราศจากการดำรงชีพอื่นในสภาวะแวดล้อมนอกเหนือการ
+ควบคุมของตน</p>
+<p>2.มารดาและเด็กย่อมมีสิทธิที่จะรับการดูแลรักษาและการช่วยเหลือเป็นพิเศษ
+เด็กทั้งปวงไม่ว่าจะเกิดในหรือนอกสมรสจะต้องได้รับการคุ้มครองทางสังคมเช่น
+เดียวกัน</p>
+<h4>ข้อ26</h4>
+<p>1.ทุกคนมีสิทธิในการศึกษาการศึกษาจะต้องให้เปล่าอย่างน้อยในขั้นประถม
+ศึกษาและขั้นพื้นฐานการศึกษาระดับประถมจะต้องเป็นภาคบังคับการศึกษาด้าน
+วิชาการและวิชาชีพจะต้องเปิดเป็นการทั่วไปและการศึกษาระดับสูงขึ้นไปจะต้อง
+เข้าถึงได้อย่างเสมอภาคสำหรับทุกคนบนพื้นฐานของคุณสมบัติความเหมาะสม</p>
+<p>2.การศึกษาจะต้องมุ่งไปสู่การพัฒนาบุคลิกภาพของมนุษย์อย่างเต็มที่และการ
+เสริมสร้างความเคารพต่อสิทธิมนุษยชนและอิสรภาพขั้นพื้นฐานการศึกษาจะต้องส่ง
+เสริมความเข้าใจขันติธรรมและมิตรภาพระหว่างประชาชาติกลุ่มเชื้อชาติหรือ
+ศาสนาทั้งมวลและจะต้องส่งเสริมกิจกรรมของสหประชาชาติเพื่อการธำรงไว้ซึ่ง
+สันติภาพ</p>
+<p>3.ผู้ปกครองมีสิทธิเบื้องแรกที่จะเลือกประเภทการศึกษาที่จะให้แก่บุตรของตน</p>
+<h4>ข้อ27</h4>
+<p>1.ทุกคนมีสิทธิที่จะเข้าร่วมโดยอิสระในชีวิตทางวัฒนธรรมของชุมชนที่จะ
+เพลิดเพลินกับศิลปะและมีส่วนในความรุดหน้าและคุณประโยชน์ทางวิทยาศาสตร์</p>
+<p>2.ทุกคนมีสิทธิที่จะได้รับการคุ้มครองผลประโยชน์ทางจิตใจและทางวัตถุอัน
+เป็นผลจากประดิษฐกรรมใดทางวิทยาศาสตร์วรรณกรรมและศิลปกรรมซึ่งตนเป็นผู้
+สร้าง</p>
+<h4>ข้อ28</h4>
+<p>ทุกคนย่อมมีสิทธิในระเบียบทางสังคมและระหว่างประเทศซึ่งจะเป็นกรอบให้บรรลุสิทธิและอิสรภาพที่กำหนดไว้ในปฏิญญานี้อย่างเต็มที่</p>
+<h4>ข้อ29</h4>
+<p>1.ทุกคนมีหน้าที่ต่อชุมชนซึ่งการพัฒนาบุคลิกภาพของตนโดยอิสระและเต็มที่จะกระทำได้ก็แต่ในชุมชนเท่านั้น</p>
+<p>2.ในการใช้สิทธิและอิสรภาพของตนทุกคนจะต้องอยู่ภายใต้ข้อจำกัดเพียงเท่า
+ที่มีกำหนดไว้ตามกฎหมายเท่านั้นเพื่อวัตถุประสงค์ของการได้มาซึ่งการยอมรับ
+และการเคารพสิทธิและอิสรภาพอันควรของผู้อื่นและเพื่อให้สอดรับกับความต้อง
+การอันสมควรทางด้านศีลธรรมความสงบเรียบร้อยของประชาชนและสวัสดิการทั่วไปใน
+สังคมประชาธิปไตย</p>
+<p>3.สิทธิและอิสรภาพเหล่านี้ไม่อาจใช้ขัดต่อวัตถุประสงค์และหลักการของสหประชาชาติไม่ว่าในกรณีใด</p>
+<h4>ข้อ30</h4>
+<p>ไม่มีบทใดในปฏิญญานี้ที่อาจตีความได้ว่าเป็นการให้สิทธิใดแก่รัฐกลุ่มคน
+หรือบุคคลใดในการดำเนินกิจกรรมใดหรือกระทำการใดอันมุ่งต่อการทำลายสิทธิและ
+อิสรภาพใดที่กำหนดไว้ณที่นี้</p>
+</div><hr>
+</body>
+</html>
diff --git a/intl/lwbrk/crashtests/crashtests.list b/intl/lwbrk/crashtests/crashtests.list
new file mode 100644
index 0000000000..a7cb7a173b
--- /dev/null
+++ b/intl/lwbrk/crashtests/crashtests.list
@@ -0,0 +1 @@
+load 416721.html
diff --git a/intl/lwbrk/crashtests/crashtests_manual.list b/intl/lwbrk/crashtests/crashtests_manual.list
new file mode 100644
index 0000000000..c58041a076
--- /dev/null
+++ b/intl/lwbrk/crashtests/crashtests_manual.list
@@ -0,0 +1,6 @@
+# Tests need to be run with --setpref security.sandbox.content.win32k-disable=false
+# This is because the pref is not dynamic and is also the reason that these tests
+# can only be run manually. They are also DEBUG only.
+defaults pref(intl.compare_against_brokered_complex_line_breaks,true)
+load Lo_test_page_no_uniscribe_breaks.html
+load UDHR_Thai_test_page_long_sequences.html
diff --git a/intl/lwbrk/gtest/TestBreak.cpp b/intl/lwbrk/gtest/TestBreak.cpp
new file mode 100644
index 0000000000..cd0cd01065
--- /dev/null
+++ b/intl/lwbrk/gtest/TestBreak.cpp
@@ -0,0 +1,327 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include <stdio.h>
+
+#include "gtest/gtest.h"
+#include "mozilla/intl/LineBreaker.h"
+#include "mozilla/intl/WordBreaker.h"
+#include "mozilla/Span.h"
+#include "nsISupports.h"
+#include "nsServiceManagerUtils.h"
+#include "nsString.h"
+#include "nsTArray.h"
+#include "nsXPCOM.h"
+
+using mozilla::intl::LineBreaker;
+using mozilla::intl::WordBreaker;
+
+// Turn off clang-format to align the ruler comments to the test strings.
+
+// clang-format off
+static char teng0[] =
+ // 1 2 3 4 5 6 7
+ // 01234567890123456789012345678901234567890123456789012345678901234567890123456789
+ "hello world";
+// clang-format on
+
+static uint32_t lexp0[] = {5, 11};
+
+static uint32_t wexp0[] = {5, 6, 11};
+
+// clang-format off
+static char teng1[] =
+ // 1 2 3 4 5 6 7
+ // 01234567890123456789012345678901234567890123456789012345678901234567890123456789
+ "This is a test to test(reasonable) line break. This 0.01123 = 45 x 48.";
+// clang-format on
+
+static uint32_t lexp1[] = {4, 7, 9, 14, 17, 34, 39, 40, 41,
+ 42, 49, 54, 62, 64, 67, 69, 73};
+
+static uint32_t wexp1[] = {4, 5, 7, 8, 9, 10, 14, 15, 17, 18, 22, 23,
+ 33, 34, 35, 39, 43, 48, 49, 50, 54, 55, 56, 57,
+ 62, 63, 64, 65, 67, 68, 69, 70, 72, 73};
+
+// clang-format off
+static char teng2[] =
+ // 1 2 3 4 5 6 7
+ // 01234567890123456789012345678901234567890123456789012345678901234567890123456789
+ "()((reasonab(l)e) line break. .01123=45x48.";
+// clang-format on
+
+static uint32_t lexp2[] = {17, 22, 23, 30, 44};
+
+static uint32_t wexp2[] = {4, 12, 13, 14, 15, 16, 17, 18, 22,
+ 24, 29, 30, 31, 32, 37, 38, 43, 44};
+
+// clang-format off
+static char teng3[] =
+ // 1 2 3 4 5 6 7
+ // 01234567890123456789012345678901234567890123456789012345678901234567890123456789
+ "It's a test to test(ronae ) line break....";
+// clang-format on
+
+static uint32_t lexp3[] = {4, 6, 11, 14, 25, 27, 32, 42};
+
+static uint32_t wexp3[] = {2, 3, 4, 5, 6, 7, 11, 12, 14, 15,
+ 19, 20, 25, 26, 27, 28, 32, 33, 38, 42};
+
+static char ruler1[] =
+ " 1 2 3 4 5 6 7 ";
+static char ruler2[] =
+ "0123456789012345678901234567890123456789012345678901234567890123456789012";
+
+// Compares the break offsets produced by a breaker (|res|) with the expected
+// offsets (|out|) for the test string |in|.
+//
+// Returns true only if both spans have the same length and the same contents.
+// On any mismatch the differences are printed to stdout, followed by the
+// input string, the column rulers, and both offset lists.
+bool Check(const char* in, mozilla::Span<const uint32_t> out,
+           mozilla::Span<const uint32_t> res) {
+  const uint32_t outlen = out.Length();
+  const uint32_t reslen = res.Length();
+  bool ok = true;
+
+  // Every value printed in this function is a uint32_t, so the unsigned
+  // conversion %u is used instead of the signed %d.
+  if (reslen != outlen) {
+    ok = false;
+    printf("WARNING!!! return size wrong, expect %u but got %u \n", outlen,
+           reslen);
+  }
+
+  for (uint32_t j = 0; j < reslen; j++) {
+    if (j < outlen) {
+      if (res[j] != out[j]) {
+        ok = false;
+        printf("[%u] expect %u but got %u\n", j, out[j], res[j]);
+      }
+    } else {
+      // |res| is longer than |out|: report each surplus offset.
+      ok = false;
+      printf("[%u] additional %u\n", j, res[j]);
+    }
+  }
+
+  if (!ok) {
+    printf("string = \n%s\n", in);
+    printf("%s\n", ruler1);
+    printf("%s\n", ruler2);
+
+    printf("Expect = \n");
+    for (uint32_t j = 0; j < outlen; j++) {
+      printf("%u,", out[j]);
+    }
+
+    printf("\nResult = \n");
+    for (uint32_t j = 0; j < reslen; j++) {
+      printf("%u,", res[j]);
+    }
+    printf("\n");
+  }
+
+  return ok;
+}
+
+// Feeds the ASCII string |in| to LineBreaker::Next() repeatedly, starting at
+// position 0 and continuing from each returned offset, until
+// NS_LINEBREAKER_NEED_MORE_TEXT is returned. The collected offsets are then
+// compared with |out| by Check(), whose result is returned.
+bool TestASCIILB(const char* in, mozilla::Span<const uint32_t> out) {
+  NS_ConvertASCIItoUTF16 input(in);
+  EXPECT_GT(input.Length(), 0u) << "Expect a non-empty input!";
+
+  nsTArray<uint32_t> result;
+  for (int32_t pos = LineBreaker::Next(input.get(), input.Length(), 0);
+       pos != NS_LINEBREAKER_NEED_MORE_TEXT;
+       pos = LineBreaker::Next(input.get(), input.Length(), pos)) {
+    result.AppendElement(pos);
+  }
+
+  return Check(in, out, result);
+}
+
+// Feeds the ASCII string |in| to WordBreaker::Next() repeatedly, starting at
+// position 0 and continuing from each returned offset, until
+// NS_WORDBREAKER_NEED_MORE_TEXT is returned. The collected offsets are then
+// compared with |out| by Check(), whose result is returned.
+bool TestASCIIWB(const char* in, mozilla::Span<const uint32_t> out) {
+  NS_ConvertASCIItoUTF16 input(in);
+  EXPECT_GT(input.Length(), 0u) << "Expect a non-empty input!";
+
+  nsTArray<uint32_t> result;
+  for (int32_t pos = WordBreaker::Next(input.get(), input.Length(), 0);
+       pos != NS_WORDBREAKER_NEED_MORE_TEXT;
+       pos = WordBreaker::Next(input.get(), input.Length(), pos)) {
+    result.AppendElement(pos);
+  }
+
+  return Check(in, out, result);
+}
+
+// Runs each ASCII sample string through TestASCIILB() together with its
+// expected line-break offsets. ASSERT_TRUE stops at the first failing sample.
+TEST(LineBreak, LineBreaker)
+{
+ ASSERT_TRUE(TestASCIILB(teng0, lexp0));
+ ASSERT_TRUE(TestASCIILB(teng1, lexp1));
+ ASSERT_TRUE(TestASCIILB(teng2, lexp2));
+ ASSERT_TRUE(TestASCIILB(teng3, lexp3));
+}
+
+// Runs each ASCII sample string through TestASCIIWB() together with its
+// expected word-break offsets. ASSERT_TRUE stops at the first failing sample.
+TEST(WordBreak, WordBreaker)
+{
+ ASSERT_TRUE(TestASCIIWB(teng0, wexp0));
+ ASSERT_TRUE(TestASCIIWB(teng1, wexp1));
+ ASSERT_TRUE(TestASCIIWB(teng2, wexp2));
+ ASSERT_TRUE(TestASCIIWB(teng3, wexp3));
+}
+
+// 012345678901234
+static const char wb0[] = "T";
+static const char wb1[] = "h";
+static const char wb2[] = "";
+static const char wb3[] = "is is a int";
+static const char wb4[] = "";
+static const char wb5[] = "";
+static const char wb6[] = "ernationali";
+static const char wb7[] = "zation work.";
+
+static const char* wb[] = {wb0, wb1, wb2, wb3, wb4, wb5, wb6, wb7};
+
+// Walks the text fragments in wb[] one by one, rebuilding the full text in
+// |result| with a '^' inserted at every word break opportunity, and compares
+// the outcome with the expected marked-up string.
+TEST(WordBreak, TestPrintWordWithBreak)
+{
+ uint32_t numOfFragment = sizeof(wb) / sizeof(char*);
+
+ // This test generates the result string by appending '^' at every word break
+ // opportunity except the one at the end of the text.
+ nsAutoString result;
+
+ for (uint32_t i = 0; i < numOfFragment; i++) {
+ NS_ConvertASCIItoUTF16 fragText(wb[i]);
+
+ int32_t cur = 0;
+ cur = WordBreaker::Next(fragText.get(), fragText.Length(), cur);
+ uint32_t start = 0;
+ while (cur != NS_WORDBREAKER_NEED_MORE_TEXT) {
+ result.Append(Substring(fragText, start, cur - start));
+
+ // Append '^' only if cur is within the fragText. The word break
+ // opportunity between fragText and nextFragText is checked separately
+ // below by calling WordBreaker::Next() on the concatenated text.
+ if (cur < static_cast<int32_t>(fragText.Length())) {
+ result.Append('^');
+ }
+ // NOTE(review): the loop condition already excludes
+ // NS_WORDBREAKER_NEED_MORE_TEXT, so the negative branch looks unreachable
+ // unless Next() can return other negative values - confirm.
+ start = (cur >= 0 ? cur : cur - start);
+ cur = WordBreaker::Next(fragText.get(), fragText.Length(), cur);
+ }
+
+ if (i != numOfFragment - 1) {
+ NS_ConvertASCIItoUTF16 nextFragText(wb[i + 1]);
+ if (nextFragText.IsEmpty()) {
+ // If nextFragText is empty, there's no new possible word break
+ // opportunity.
+ continue;
+ }
+
+ const auto origFragLen = static_cast<int32_t>(fragText.Length());
+ fragText.Append(nextFragText);
+
+ // There is a break at the seam when searching from the last character of
+ // the original fragment lands exactly on the original fragment length.
+ bool canBreak =
+ origFragLen ==
+ WordBreaker::Next(fragText.get(), fragText.Length(), origFragLen - 1);
+ if (canBreak) {
+ result.Append('^');
+ }
+ }
+ }
+ ASSERT_STREQ("This^ ^is^ ^a^ ^internationalization^ ^work^.",
+ NS_ConvertUTF16toUTF8(result).get());
+}
+
+// This function searches a complete word starting from |offset| in wb[fragN].
+// If it reaches the end of wb[fragN], and there is no word break opportunity
+// between wb[fragN] and wb[fragN+1], it will continue the search in wb[fragN+1]
+// until a word break.
+//
+// The word that was found is compared with |expected|; the assertion message
+// names |fragN| and |offset| on failure.
+void TestFindWordBreakFromPosition(uint32_t fragN, uint32_t offset,
+ const char* expected) {
+ uint32_t numOfFragment = sizeof(wb) / sizeof(char*);
+
+ NS_ConvertASCIItoUTF16 fragText(wb[fragN]);
+
+ mozilla::intl::WordRange res =
+ WordBreaker::FindWord(fragText.get(), fragText.Length(), offset);
+
+ nsAutoString result(Substring(fragText, res.mBegin, res.mEnd - res.mBegin));
+
+ if ((uint32_t)fragText.Length() <= res.mEnd) {
+ // if we hit the end of the fragment
+ // curFragText accumulates the starting fragment plus every following
+ // non-empty fragment examined so far.
+ nsAutoString curFragText = fragText;
+ for (uint32_t p = fragN + 1; p < numOfFragment; p++) {
+ NS_ConvertASCIItoUTF16 nextFragText(wb[p]);
+ if (nextFragText.IsEmpty()) {
+ // If nextFragText is empty, there's no new possible word break
+ // opportunity between curFragText and nextFragText.
+ continue;
+ }
+
+ const auto origFragLen = static_cast<int32_t>(curFragText.Length());
+ curFragText.Append(nextFragText);
+ // A break exactly at the seam means the word ended with the previous
+ // fragment, so the search stops.
+ bool canBreak = origFragLen == WordBreaker::Next(curFragText.get(),
+ curFragText.Length(),
+ origFragLen - 1);
+ if (canBreak) {
+ break;
+ }
+ // No break at the seam: the word continues, so append the word found at
+ // the start of the next fragment.
+ mozilla::intl::WordRange r =
+ WordBreaker::FindWord(nextFragText.get(), nextFragText.Length(), 0);
+
+ result.Append(Substring(nextFragText, r.mBegin, r.mEnd - r.mBegin));
+
+ // Stop once that word ends before the end of the next fragment.
+ if ((uint32_t)nextFragText.Length() != r.mEnd) {
+ break;
+ }
+ }
+ }
+
+ ASSERT_STREQ(expected, NS_ConvertUTF16toUTF8(result).get())
+ << "FindWordBreakFromPosition(" << fragN << ", " << offset << ")";
+}
+
+// Steps through a string of U+0Exx code units with WordBreaker::Next() until
+// NS_WORDBREAKER_NEED_MORE_TEXT is returned, asserting on every step that the
+// returned offset differs from the offset that was passed in.
+TEST(WordBreak, TestNextWordBreakWithComplexLanguage)
+{
+  nsString fragText(u"\u0e40\u0e1b\u0e47\u0e19\u0e19\u0e31\u0e01");
+
+  for (int32_t offset = 0; offset != NS_WORDBREAKER_NEED_MORE_TEXT;) {
+    const int32_t nextOffset =
+        WordBreaker::Next(fragText.get(), fragText.Length(), offset);
+    ASSERT_NE(offset, nextOffset);
+    offset = nextOffset;
+  }
+  ASSERT_TRUE(true);
+}
+
+// FindWord() called with a text length of 0 is expected to return the range
+// {0, 0}.
+TEST(WordBreak, TestFindWordWithEmptyString)
+{
+  // An array of unknown bound cannot be initialized with an empty initializer
+  // list in standard C++ (`char16_t empty[] = {}` only compiles as a compiler
+  // extension), so use an empty string literal and keep passing length 0.
+  char16_t empty[] = u"";
+  mozilla::intl::WordRange expect{0, 0};
+  mozilla::intl::WordRange result = WordBreaker::FindWord(empty, 0, 0);
+  ASSERT_EQ(expect.mBegin, result.mBegin);
+  ASSERT_EQ(expect.mEnd, result.mEnd);
+}
+
+// Next() called with a text length of 0 is expected to return
+// NS_WORDBREAKER_NEED_MORE_TEXT for both position 0 and position 1.
+TEST(WordBreak, TestNextWordBreakWithEmptyString)
+{
+  // An array of unknown bound cannot be initialized with an empty initializer
+  // list in standard C++ (`char16_t empty[] = {}` only compiles as a compiler
+  // extension), so use an empty string literal and keep passing length 0.
+  char16_t empty[] = u"";
+  ASSERT_EQ(NS_WORDBREAKER_NEED_MORE_TEXT, WordBreaker::Next(empty, 0, 0));
+  ASSERT_EQ(NS_WORDBREAKER_NEED_MORE_TEXT, WordBreaker::Next(empty, 0, 1));
+}
+
+// Each call passes (index of the fragment in wb[], offset inside that
+// fragment, expected word) to the TestFindWordBreakFromPosition() helper.
+TEST(WordBreak, TestFindWordBreakFromPosition)
+{
+ TestFindWordBreakFromPosition(0, 0, "This");
+ TestFindWordBreakFromPosition(1, 0, "his");
+ TestFindWordBreakFromPosition(2, 0, "is");
+ TestFindWordBreakFromPosition(3, 0, "is");
+ TestFindWordBreakFromPosition(3, 1, "is");
+ TestFindWordBreakFromPosition(3, 9, " ");
+ TestFindWordBreakFromPosition(3, 10, "internationalization");
+ TestFindWordBreakFromPosition(4, 0, "ernationalization");
+ TestFindWordBreakFromPosition(5, 0, "ernationalization");
+ TestFindWordBreakFromPosition(6, 4, "ernationalization");
+ TestFindWordBreakFromPosition(6, 8, "ernationalization");
+ TestFindWordBreakFromPosition(7, 6, " ");
+ TestFindWordBreakFromPosition(7, 7, "work");
+}
diff --git a/intl/lwbrk/gtest/TestSegmenter.cpp b/intl/lwbrk/gtest/TestSegmenter.cpp
new file mode 100644
index 0000000000..21c44a078f
--- /dev/null
+++ b/intl/lwbrk/gtest/TestSegmenter.cpp
@@ -0,0 +1,105 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "gtest/gtest.h"
+
+#include "mozilla/intl/Segmenter.h"
+
+namespace mozilla::intl {
+
+// Line granularity on "hello world": Seek(5) is expected to yield 11, after
+// which both Next() and a further Seek() yield Nothing().
+TEST(IntlSegmenter, TestLineBreakIteratorUtf16)
+{
+ const SegmenterOptions options{SegmenterGranularity::Line};
+ auto result = Segmenter::TryCreate("en", options);
+ ASSERT_TRUE(result.isOk());
+ auto lineSegmenter = result.unwrap();
+
+ const char16_t text[] = u"hello world";
+ UniquePtr<SegmentIteratorUtf16> segIter =
+ lineSegmenter->Segment(MakeStringSpan(text));
+
+ // Seek to space between "hello" and "world".
+ ASSERT_EQ(segIter->Seek(5u), Some(11u));
+
+ ASSERT_EQ(segIter->Next(), Nothing());
+
+ // Same as calling Next().
+ ASSERT_EQ(segIter->Seek(0u), Nothing());
+}
+
+// Word granularity on "hello world": Seek(5) is expected to yield 6, the next
+// boundary is 11, and after that Next() and Seek() yield Nothing().
+TEST(IntlSegmenter, TestWordBreakIteratorUtf16)
+{
+ const SegmenterOptions options{SegmenterGranularity::Word};
+ auto result = Segmenter::TryCreate("en", options);
+ ASSERT_TRUE(result.isOk());
+ auto wordSegmenter = result.unwrap();
+
+ const char16_t text[] = u"hello world";
+ UniquePtr<SegmentIteratorUtf16> segIter =
+ wordSegmenter->Segment(MakeStringSpan(text));
+
+ // Seek to the space between "hello" and "world"
+ ASSERT_EQ(segIter->Seek(5u), Some(6u));
+
+ ASSERT_EQ(segIter->Next(), Some(11u));
+ ASSERT_EQ(segIter->Next(), Nothing());
+
+ // Same as calling Next().
+ ASSERT_EQ(segIter->Seek(0u), Nothing());
+}
+
+// Grapheme granularity on "hello world": Seek(5) is expected to yield 6, then
+// Next() yields every following offset up to 11 before returning Nothing().
+TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16)
+{
+ SegmenterOptions options{SegmenterGranularity::Grapheme};
+ auto result = Segmenter::TryCreate("en", options);
+ ASSERT_TRUE(result.isOk());
+ auto graphemeClusterSegmenter = result.unwrap();
+
+ const char16_t text[] = u"hello world";
+ UniquePtr<SegmentIteratorUtf16> segIter =
+ graphemeClusterSegmenter->Segment(MakeStringSpan(text));
+
+ // Seek to the space between "hello" and "world"
+ ASSERT_EQ(segIter->Seek(5u), Some(6u));
+
+ ASSERT_EQ(segIter->Next(), Some(7u));
+ ASSERT_EQ(segIter->Next(), Some(8u));
+ ASSERT_EQ(segIter->Next(), Some(9u));
+ ASSERT_EQ(segIter->Next(), Some(10u));
+ ASSERT_EQ(segIter->Next(), Some(11u));
+ ASSERT_EQ(segIter->Next(), Nothing());
+
+ // Same as calling Next().
+ ASSERT_EQ(segIter->Seek(0u), Nothing());
+}
+
+// Reverse grapheme iteration over "hello world": Seek(6) is expected to yield
+// 5, then Next() yields every preceding offset down to 0 before returning
+// Nothing(). The iterator is constructed directly rather than via Segmenter.
+TEST(IntlSegmenter, TestGraphemeClusterBreakReverseIteratorUtf16)
+{
+ const char16_t text[] = u"hello world";
+ GraphemeClusterBreakReverseIteratorUtf16 segIter(MakeStringSpan(text));
+
+ // Seek to the space between "hello" and "world"
+ ASSERT_EQ(segIter.Seek(6u), Some(5u));
+
+ ASSERT_EQ(segIter.Next(), Some(4u));
+ ASSERT_EQ(segIter.Next(), Some(3u));
+ ASSERT_EQ(segIter.Next(), Some(2u));
+ ASSERT_EQ(segIter.Next(), Some(1u));
+ ASSERT_EQ(segIter.Next(), Some(0u));
+ ASSERT_EQ(segIter.Next(), Nothing());
+
+ // Same as calling Next().
+ ASSERT_EQ(segIter.Seek(0u), Nothing());
+}
+
+// Creating a Segmenter with sentence granularity is expected to fail:
+// TryCreate() must return an error result.
+TEST(IntlSegmenter, TestSentenceBreakIteratorUtf16)
+{
+ SegmenterOptions options{SegmenterGranularity::Sentence};
+ auto result = Segmenter::TryCreate("en", options);
+ ASSERT_TRUE(result.isErr());
+}
+
+} // namespace mozilla::intl
diff --git a/intl/lwbrk/gtest/moz.build b/intl/lwbrk/gtest/moz.build
new file mode 100644
index 0000000000..e869bfa30b
--- /dev/null
+++ b/intl/lwbrk/gtest/moz.build
@@ -0,0 +1,12 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+UNIFIED_SOURCES += [
+ "TestBreak.cpp",
+ "TestSegmenter.cpp",
+]
+
+FINAL_LIBRARY = "xul-gtest"
diff --git a/intl/lwbrk/jisx4051class.h b/intl/lwbrk/jisx4051class.h
new file mode 100644
index 0000000000..3140cf63a7
--- /dev/null
+++ b/intl/lwbrk/jisx4051class.h
@@ -0,0 +1,217 @@
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+/*
+ DO NOT EDIT THIS DOCUMENT !!! THIS DOCUMENT IS GENERATED BY
+ mozilla/intl/lwbrk/tools/anzx4051.pl
+ */
+static const uint32_t gLBClass00[32] = {
+ 0x55555555, // U+0000 - U+0007
+ 0x55555555, // U+0008 - U+000F
+ 0x55555555, // U+0010 - U+0017
+ 0x55555555, // U+0018 - U+001F
+ 0x7AABAAA5, // U+0020 - U+0027
+ 0x7A7AAAA9, // U+0028 - U+002F
+ 0x66666666, // U+0030 - U+0037
+ 0xAAA9AA66, // U+0038 - U+003F
+ 0x77777777, // U+0040 - U+0047
+ 0x77777777, // U+0048 - U+004F
+ 0x77777777, // U+0050 - U+0057
+ 0x77AA9777, // U+0058 - U+005F
+ 0x77777777, // U+0060 - U+0067
+ 0x77777777, // U+0068 - U+006F
+ 0x77777777, // U+0070 - U+0077
+ 0x7AAA9777, // U+0078 - U+007F
+ 0x77777777, // U+0080 - U+0087
+ 0x77777777, // U+0088 - U+008F
+ 0x77777777, // U+0090 - U+0097
+ 0x77777777, // U+0098 - U+009F
+ 0xAA9A9AAB, // U+00A0 - U+00A7
+ 0x77A9777A, // U+00A8 - U+00AF
+ 0xAAAAAAAA, // U+00B0 - U+00B7
+ 0xAAAAAAAA, // U+00B8 - U+00BF
+ 0x77777777, // U+00C0 - U+00C7
+ 0x77777777, // U+00C8 - U+00CF
+ 0x77777777, // U+00D0 - U+00D7
+ 0x77777777, // U+00D8 - U+00DF
+ 0x77777777, // U+00E0 - U+00E7
+ 0x77777777, // U+00E8 - U+00EF
+ 0xA7777777, // U+00F0 - U+00F7
+ 0x77777777, // U+00F8 - U+00FF
+};
+
+static const uint32_t gLBClass20[32] = {
+ 0xB5555555, // U+2000 - U+2007
+ 0x77775555, // U+2008 - U+200F
+ 0x777277B7, // U+2010 - U+2017
+ 0x77A777A7, // U+2018 - U+201F
+ 0xA1117777, // U+2020 - U+2027
+ 0xB7777777, // U+2028 - U+202F
+ 0x77744444, // U+2030 - U+2037
+ 0x7A115107, // U+2038 - U+203F
+ 0x11017777, // U+2040 - U+2047
+ 0x77777711, // U+2048 - U+204F
+ 0x77777777, // U+2050 - U+2057
+ 0x57777777, // U+2058 - U+205F
+ 0x7777777B, // U+2060 - U+2067
+ 0x77777777, // U+2068 - U+206F
+ 0x77777777, // U+2070 - U+2077
+ 0x77777777, // U+2078 - U+207F
+ 0x77777777, // U+2080 - U+2087
+ 0x77777777, // U+2088 - U+208F
+ 0x77777777, // U+2090 - U+2097
+ 0x77777777, // U+2098 - U+209F
+ 0x77777777, // U+20A0 - U+20A7
+ 0x77777777, // U+20A8 - U+20AF
+ 0x77777777, // U+20B0 - U+20B7
+ 0x77777777, // U+20B8 - U+20BF
+ 0x77777777, // U+20C0 - U+20C7
+ 0x77777777, // U+20C8 - U+20CF
+ 0x77777777, // U+20D0 - U+20D7
+ 0x77777777, // U+20D8 - U+20DF
+ 0x77777777, // U+20E0 - U+20E7
+ 0x77777777, // U+20E8 - U+20EF
+ 0x77777777, // U+20F0 - U+20F7
+ 0x77777777, // U+20F8 - U+20FF
+};
+
+static const uint32_t gLBClass21[32] = {
+ 0x77777777, // U+2100 - U+2107
+ 0x77777777, // U+2108 - U+210F
+ 0x73777777, // U+2110 - U+2117
+ 0x77777777, // U+2118 - U+211F
+ 0x77777777, // U+2120 - U+2127
+ 0x77777777, // U+2128 - U+212F
+ 0x77777777, // U+2130 - U+2137
+ 0x77777777, // U+2138 - U+213F
+ 0x77777777, // U+2140 - U+2147
+ 0x77777777, // U+2148 - U+214F
+ 0x77777777, // U+2150 - U+2157
+ 0x77777777, // U+2158 - U+215F
+ 0x55555555, // U+2160 - U+2167
+ 0x55555555, // U+2168 - U+216F
+ 0x55555555, // U+2170 - U+2177
+ 0x55555555, // U+2178 - U+217F
+ 0x77777777, // U+2180 - U+2187
+ 0x77777777, // U+2188 - U+218F
+ 0x77777777, // U+2190 - U+2197
+ 0x77777777, // U+2198 - U+219F
+ 0x77777777, // U+21A0 - U+21A7
+ 0x77777777, // U+21A8 - U+21AF
+ 0x77777777, // U+21B0 - U+21B7
+ 0x77777777, // U+21B8 - U+21BF
+ 0x77777777, // U+21C0 - U+21C7
+ 0x77777777, // U+21C8 - U+21CF
+ 0x77777777, // U+21D0 - U+21D7
+ 0x77777777, // U+21D8 - U+21DF
+ 0x77777777, // U+21E0 - U+21E7
+ 0x77777777, // U+21E8 - U+21EF
+ 0x77777777, // U+21F0 - U+21F7
+ 0x77777777, // U+21F8 - U+21FF
+};
+
+static const uint32_t gLBClass30[32] = {
+ 0x55155115, // U+3000 - U+3007
+ 0x10101010, // U+3008 - U+300F
+ 0x10105510, // U+3010 - U+3017
+ 0x11011010, // U+3018 - U+301F
+ 0x55555555, // U+3020 - U+3027
+ 0x55555555, // U+3028 - U+302F
+ 0x55555555, // U+3030 - U+3037
+ 0x55555555, // U+3038 - U+303F
+ 0x15151515, // U+3040 - U+3047
+ 0x55555515, // U+3048 - U+304F
+ 0x55555555, // U+3050 - U+3057
+ 0x55555555, // U+3058 - U+305F
+ 0x55551555, // U+3060 - U+3067
+ 0x55555555, // U+3068 - U+306F
+ 0x55555555, // U+3070 - U+3077
+ 0x55555555, // U+3078 - U+307F
+ 0x15151555, // U+3080 - U+3087
+ 0x51555555, // U+3088 - U+308F
+ 0x55555555, // U+3090 - U+3097
+ 0x51111115, // U+3098 - U+309F
+ 0x15151515, // U+30A0 - U+30A7
+ 0x55555515, // U+30A8 - U+30AF
+ 0x55555555, // U+30B0 - U+30B7
+ 0x55555555, // U+30B8 - U+30BF
+ 0x55551555, // U+30C0 - U+30C7
+ 0x55555555, // U+30C8 - U+30CF
+ 0x55555555, // U+30D0 - U+30D7
+ 0x55555555, // U+30D8 - U+30DF
+ 0x15151555, // U+30E0 - U+30E7
+ 0x51555555, // U+30E8 - U+30EF
+ 0x51155555, // U+30F0 - U+30F7
+ 0x51111555, // U+30F8 - U+30FF
+};
+
+static const uint32_t gLBClass0E[32] = {
+ 0x88888888, // U+0E00 - U+0E07
+ 0x88888888, // U+0E08 - U+0E0F
+ 0x88888888, // U+0E10 - U+0E17
+ 0x88888888, // U+0E18 - U+0E1F
+ 0x88888888, // U+0E20 - U+0E27
+ 0x18888888, // U+0E28 - U+0E2F
+ 0x88888888, // U+0E30 - U+0E37
+ 0x08888888, // U+0E38 - U+0E3F
+ 0x81888888, // U+0E40 - U+0E47
+ 0x78888888, // U+0E48 - U+0E4F
+ 0x66666666, // U+0E50 - U+0E57
+ 0x88881166, // U+0E58 - U+0E5F
+ 0x88888888, // U+0E60 - U+0E67
+ 0x88888888, // U+0E68 - U+0E6F
+ 0x88888888, // U+0E70 - U+0E77
+ 0x88888888, // U+0E78 - U+0E7F
+ 0x88888888, // U+0E80 - U+0E87
+ 0x88888888, // U+0E88 - U+0E8F
+ 0x88888888, // U+0E90 - U+0E97
+ 0x88888888, // U+0E98 - U+0E9F
+ 0x88888888, // U+0EA0 - U+0EA7
+ 0x18888888, // U+0EA8 - U+0EAF
+ 0x88888888, // U+0EB0 - U+0EB7
+ 0x88888888, // U+0EB8 - U+0EBF
+ 0x81888888, // U+0EC0 - U+0EC7
+ 0x88888888, // U+0EC8 - U+0ECF
+ 0x66666666, // U+0ED0 - U+0ED7
+ 0x88888866, // U+0ED8 - U+0EDF
+ 0x88888888, // U+0EE0 - U+0EE7
+ 0x88888888, // U+0EE8 - U+0EEF
+ 0x88888888, // U+0EF0 - U+0EF7
+ 0x88888888, // U+0EF8 - U+0EFF
+};
+
+static const uint32_t gLBClass17[32] = {
+ 0x77777777, // U+1700 - U+1707
+ 0x77777777, // U+1708 - U+170F
+ 0x77777777, // U+1710 - U+1717
+ 0x77777777, // U+1718 - U+171F
+ 0x77777777, // U+1720 - U+1727
+ 0x77777777, // U+1728 - U+172F
+ 0x70077777, // U+1730 - U+1737
+ 0x77777777, // U+1738 - U+173F
+ 0x77777777, // U+1740 - U+1747
+ 0x77777777, // U+1748 - U+174F
+ 0x77777777, // U+1750 - U+1757
+ 0x77777777, // U+1758 - U+175F
+ 0x77777777, // U+1760 - U+1767
+ 0x77777777, // U+1768 - U+176F
+ 0x77777777, // U+1770 - U+1777
+ 0x77777777, // U+1778 - U+177F
+ 0x88888888, // U+1780 - U+1787
+ 0x88888888, // U+1788 - U+178F
+ 0x88888888, // U+1790 - U+1797
+ 0x88888888, // U+1798 - U+179F
+ 0x88888888, // U+17A0 - U+17A7
+ 0x88888888, // U+17A8 - U+17AF
+ 0x88888888, // U+17B0 - U+17B7
+ 0x88888888, // U+17B8 - U+17BF
+ 0x88888888, // U+17C0 - U+17C7
+ 0x88888888, // U+17C8 - U+17CF
+ 0x88118888, // U+17D0 - U+17D7
+ 0x77888181, // U+17D8 - U+17DF
+ 0x88888888, // U+17E0 - U+17E7
+ 0x77777788, // U+17E8 - U+17EF
+ 0x88888888, // U+17F0 - U+17F7
+ 0x77777788, // U+17F8 - U+17FF
+};
diff --git a/intl/lwbrk/jisx4051pairtable.txt b/intl/lwbrk/jisx4051pairtable.txt
new file mode 100644
index 0000000000..2bae1b18fe
--- /dev/null
+++ b/intl/lwbrk/jisx4051pairtable.txt
@@ -0,0 +1,286 @@
+
+
+
+/*
+
+ Simplification of Pair Table in JIS X 4051
+
+ 1. The Original Table - in 4.1.3
+
+ In JIS X 4051, the pair table is defined as below
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20
+ * # * #
+ 1 X X X X X X X X X X X X X X X X X X X X X E
+ 2 X X X X X X
+ 3 X X X X X X
+ 4 X X X X X X
+ 5 X X X X X X
+ 6 X X X X X X
+ 7 X X X X X X X
+ 8 X X X X X X E
+ 9 X X X X X X
+ 10 X X X X X X
+ 11 X X X X X X
+ 12 X X X X X X
+ 13 X X X X X X X
+ 14 X X X X X X X
+ 15 X X X X X X X X X
+ 16 X X X X X X X X
+ 17 X X X X X E
+ 18 X X X X X X X X X
+ 19 X E E E E E X X X X X X X X X X X X E X E E
+ 20 X X X X X E
+
+ * Same Char
+ # Other Char
+
+ 2. Simplified by removing the classes which we do not care about
+
+ However, since we do not care about class 13 (Subscript), 14 (Ruby),
+ 19 (split line note begin quote), and 20 (split line note end quote),
+ we can simplify this pair table into the following
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 2 3 4 5 6 7 8 9 10 11 12 15 16 17 18
+
+ 1 X X X X X X X X X X X X X X X X
+ 2 X X X X X
+ 3 X X X X X
+ 4 X X X X X
+ 5 X X X X X
+ 6 X X X X X
+ 7 X X X X X X
+ 8 X X X X X X
+ 9 X X X X X
+ 10 X X X X X
+ 11 X X X X X
+ 12 X X X X X
+ 15 X X X X X X X X
+ 16 X X X X X X X
+ 17 X X X X X
+ 18 X X X X X X X X
+
+ 3. Simplified by merging classes
+
+ After simplification 2, the pair table has some duplicate rows:
+ a. classes 2, 3, 4, 5, 6 are the same - we can merge them
+ b. classes 10, 11, 12, 17 are the same - we can merge them
+
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 [a] 7 8 9 [b]15 16 18
+
+ 1 X X X X X X X X X
+ [a] X
+ 7 X X
+ 8 X X
+ 9 X
+ [b] X
+ 15 X X X X
+ 16 X X X
+ 18 X X X X
+
+
+ 4. Now we use one bit to encode whether it is breakable, and use 2 bytes
+ for one row, then the bit table will look like:
+
+ 18 <- 1
+
+ 1 0000 0001 1111 1111 = 0x01FF
+ [a] 0000 0000 0000 0010 = 0x0002
+ 7 0000 0000 0000 0110 = 0x0006
+ 8 0000 0000 0100 0010 = 0x0042
+ 9 0000 0000 0000 0010 = 0x0002
+ [b] 0000 0000 0000 0010 = 0x0042
+ 15 0000 0001 0101 0010 = 0x0152
+ 16 0000 0001 1000 0010 = 0x0182
+ 18 0000 0001 1100 0010 = 0x01C2
+
+*/
+
+// Simplified pair table: one 16-bit row per leading-character class value.
+// ClassesToPair indexes a row with the leading class and tests bit
+// (1 << trailing class) within it.
+static uint16_t gJISx4051SimplifiedPair[9] = {
+ 0x01FF, 0x0002, 0x0006, 0x0042, 0x0002, 0x0042, 0x0152, 0x0182, 0x01C2
+};
+
+// Looks up the pair (aCls1, aCls2) in gJISx4051SimplifiedPair.
+//
+// aCls1 - class of the leading character; selects the table row.
+// aCls2 - class of the trailing character; selects the bit within the row.
+//
+// Returns true when that bit is set. Both classes must be below 9, the
+// number of rows (and used bits) in the table.
+PRBool XXXX::ClassesToPair(nsJISx4051Cls aCls1, nsJISx4051Cls aCls2)
+{
+  NS_ASSERTION(aCls1 < 9, "invalid class");
+  NS_ASSERTION(aCls2 < 9, "invalid class");
+  return 0 != (gJISx4051SimplifiedPair[aCls1] & (1L << aCls2));
+}
+
+
+// True when u is an ASCII digit, U+0030 ('0') through U+0039 ('9') inclusive.
+#define X4051_IS_DIGIT(u) ((0x0030 <= (u)) && ((u) <= 0x0039))
+
+// Returns the simplified JIS X 4051 class of aChar.
+//
+// aChar   - the character to classify.
+// aBefore - the character preceding aChar (0 when unknown).
+// aAfter  - the character following aChar (0 when unknown).
+//
+// A ',' (0x2C) or '.' (0x2E) sitting between two digits is class 15.
+// Otherwise the single-character table is consulted, then the range table;
+// characters found in neither fall back to class 15.
+nsJISx4051Cls XXXX::GetClass(
+    PRUnichar aChar, PRUnichar aBefore = 0, PRUnichar aAfter = 0)
+{
+  // take care the special case in cls 15
+  if (((0x2C == aChar) || (0x2E == aChar)) &&
+      X4051_IS_DIGIT(aBefore) && X4051_IS_DIGIT(aAfter))
+  {
+    return kJISx4051Cls_15;
+  }
+
+  nsJISx4051Cls cls;
+  if (gSingle->Lookup(aChar, &cls))
+    return cls;
+
+  if (gRange->Lookup(aChar, &cls))
+    return cls;
+
+  return kJISx4051Cls_15;
+}
+
+
+// Simplified class values. Several JIS X 4051 classes share one value:
+// classes 2-6 all map to 1, and classes 10, 11, 12 and 17 all map to 5.
+// Classes 13, 14, 19 and 20 are commented out and have no enumerator.
+// The resulting values run from 0 to 8.
+typedef enum {
+ kJISx4051Cls_1 = 0,
+ kJISx4051Cls_2 = 1,
+ kJISx4051Cls_3 = 1,
+ kJISx4051Cls_4 = 1,
+ kJISx4051Cls_5 = 1,
+ kJISx4051Cls_6 = 1,
+ kJISx4051Cls_7 = 2,
+ kJISx4051Cls_8 = 3,
+ kJISx4051Cls_9 = 4,
+ kJISx4051Cls_10 = 5,
+ kJISx4051Cls_11 = 5,
+ kJISx4051Cls_12 = 5,
+ // kJISx4051Cls_13 = 0,
+ // kJISx4051Cls_14 = 0,
+ kJISx4051Cls_15 = 6,
+ kJISx4051Cls_16 = 7,
+ kJISx4051Cls_17 = 5,
+ kJISx4051Cls_18 = 8,
+ // kJISx4051Cls_19 = 0,
+ // kJISx4051Cls_20 = 0
+} nsJISx4051Cls;
+
+
+ // Table 2
+ YYYY(kJISx4051Cls_1 , 0x0028),
+ YYYY(kJISx4051Cls_1 , 0x005B),
+ YYYY(kJISx4051Cls_1 , 0x007B),
+ YYYY(kJISx4051Cls_1 , 0x2018),
+ YYYY(kJISx4051Cls_1 , 0x201B),
+ YYYY(kJISx4051Cls_1 , 0x201C),
+ YYYY(kJISx4051Cls_1 , 0x201F),
+ YYYY(kJISx4051Cls_1 , 0x3008),
+ YYYY(kJISx4051Cls_1 , 0x300A),
+ YYYY(kJISx4051Cls_1 , 0x300C),
+ YYYY(kJISx4051Cls_1 , 0x300E),
+ YYYY(kJISx4051Cls_1 , 0x3010),
+ YYYY(kJISx4051Cls_1 , 0x3014),
+ YYYY(kJISx4051Cls_1 , 0x3016),
+ YYYY(kJISx4051Cls_1 , 0x3018),
+ YYYY(kJISx4051Cls_1 , 0x301A),
+ YYYY(kJISx4051Cls_1 , 0x301D),
+
+ // Table 3
+ YYYY(kJISx4051Cls_2 , 0x0029),
+ YYYY(kJISx4051Cls_2 , 0x002C),
+ YYYY(kJISx4051Cls_2 , 0x005D),
+ YYYY(kJISx4051Cls_2 , 0x007D),
+ YYYY(kJISx4051Cls_2 , 0x2019),
+ YYYY(kJISx4051Cls_2 , 0x201A),
+ YYYY(kJISx4051Cls_2 , 0x201D),
+ YYYY(kJISx4051Cls_2 , 0x201E),
+ YYYY(kJISx4051Cls_2 , 0x3001),
+ YYYY(kJISx4051Cls_2 , 0x3009),
+ YYYY(kJISx4051Cls_2 , 0x300B),
+ YYYY(kJISx4051Cls_2 , 0x300D),
+ YYYY(kJISx4051Cls_2 , 0x300F),
+ YYYY(kJISx4051Cls_2 , 0x3011),
+ YYYY(kJISx4051Cls_2 , 0x3015),
+ YYYY(kJISx4051Cls_2 , 0x3017),
+ YYYY(kJISx4051Cls_2 , 0x3019),
+ YYYY(kJISx4051Cls_2 , 0x301B),
+ YYYY(kJISx4051Cls_2 , 0x301E),
+ YYYY(kJISx4051Cls_2 , 0x301F),
+
+ // Table 4
+ YYYY(kJISx4051Cls_3 , 0x203C),
+ YYYY(kJISx4051Cls_3 , 0x2044),
+ YYYY(kJISx4051Cls_3 , 0x301C),
+ YYYY(kJISx4051Cls_3 , 0x3041),
+ YYYY(kJISx4051Cls_3 , 0x3043),
+ YYYY(kJISx4051Cls_3 , 0x3045),
+ YYYY(kJISx4051Cls_3 , 0x3047),
+ YYYY(kJISx4051Cls_3 , 0x3049),
+ YYYY(kJISx4051Cls_3 , 0x3063),
+ YYYY(kJISx4051Cls_3 , 0x3083),
+ YYYY(kJISx4051Cls_3 , 0x3085),
+ YYYY(kJISx4051Cls_3 , 0x3087),
+ YYYY(kJISx4051Cls_3 , 0x308E),
+ YYYY(kJISx4051Cls_3 , 0x309D),
+ YYYY(kJISx4051Cls_3 , 0x309E),
+ YYYY(kJISx4051Cls_3 , 0x30A1),
+ YYYY(kJISx4051Cls_3 , 0x30A3),
+ YYYY(kJISx4051Cls_3 , 0x30A5),
+ YYYY(kJISx4051Cls_3 , 0x30A7),
+ YYYY(kJISx4051Cls_3 , 0x30A9),
+ YYYY(kJISx4051Cls_3 , 0x30C3),
+ YYYY(kJISx4051Cls_3 , 0x30E3),
+ YYYY(kJISx4051Cls_3 , 0x30E5),
+ YYYY(kJISx4051Cls_3 , 0x30E7),
+ YYYY(kJISx4051Cls_3 , 0x30EE),
+ YYYY(kJISx4051Cls_3 , 0x30F5),
+ YYYY(kJISx4051Cls_3 , 0x30F6),
+ YYYY(kJISx4051Cls_3 , 0x30FC),
+ YYYY(kJISx4051Cls_3 , 0x30FD),
+ YYYY(kJISx4051Cls_3 , 0x30FE),
+
+ // Table 5
+ YYYY(kJISx4051Cls_4 , 0x0021),
+ YYYY(kJISx4051Cls_4 , 0x003F),
+
+ // Table 6
+ YYYY(kJISx4051Cls_5 , 0x003A),
+ YYYY(kJISx4051Cls_5 , 0x003B),
+ YYYY(kJISx4051Cls_5 , 0x30FB),
+
+ // Table 7
+ YYYY(kJISx4051Cls_6 , 0x002E),
+ YYYY(kJISx4051Cls_6 , 0x3002),
+
+ // Table 8
+ YYYY(kJISx4051Cls_7 , 0x2014),
+ YYYY(kJISx4051Cls_7 , 0x2024),
+ YYYY(kJISx4051Cls_7 , 0x2025),
+ YYYY(kJISx4051Cls_7 , 0x2026),
+
+ // Table 9
+ YYYY(kJISx4051Cls_8 , 0x0024),
+ YYYY(kJISx4051Cls_8 , 0x00A3),
+ YYYY(kJISx4051Cls_8 , 0x00A5),
+ YYYY(kJISx4051Cls_8 , 0x2116),
+
+ // Table 10
+ YYYY(kJISx4051Cls_9 , 0x0025),
+ YYYY(kJISx4051Cls_9 , 0x00A2),
+ YYYY(kJISx4051Cls_9 , 0x00B0),
+ YYYY(kJISx4051Cls_9 , 0x2030),
+ YYYY(kJISx4051Cls_9 , 0x2031),
+ YYYY(kJISx4051Cls_9 , 0x2032),
+ YYYY(kJISx4051Cls_9 , 0x2033),
+
+ // Table 1
+ YYYY(kJISx4051Cls_10, 0x3000),
+
+ // Table 1
+ ZZZZ(kJISx4051Cls_11, 0x3000),
+
+
+
+
diff --git a/intl/lwbrk/moz.build b/intl/lwbrk/moz.build
new file mode 100644
index 0000000000..0699ff63a5
--- /dev/null
+++ b/intl/lwbrk/moz.build
@@ -0,0 +1,45 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+TEST_DIRS += ["gtest"]
+
+EXPORTS.mozilla.intl += [
+ "LineBreaker.h",
+ "nsComplexBreaker.h",
+ "Segmenter.h",
+ "WordBreaker.h",
+]
+
+UNIFIED_SOURCES += [
+ "LineBreaker.cpp",
+ "Segmenter.cpp",
+ "WordBreaker.cpp",
+]
+
+# Platform-independent caching wrapper (ComplexBreaker::GetBreaks).
+SOURCES += [
+ "nsComplexBreaker.cpp",
+]
+
+# Exactly one branch below is taken; each adds the file that defines
+# NS_GetComplexLineBreaks for that widget toolkit.
+if CONFIG["MOZ_WIDGET_TOOLKIT"] == "gtk":
+ SOURCES += [
+ "nsPangoBreaker.cpp",
+ ]
+ CXXFLAGS += CONFIG["MOZ_PANGO_CFLAGS"]
+elif CONFIG["MOZ_WIDGET_TOOLKIT"] == "windows":
+ SOURCES += [
+ "nsUniscribeBreaker.cpp",
+ ]
+elif CONFIG["MOZ_WIDGET_TOOLKIT"] == "cocoa":
+ UNIFIED_SOURCES += [
+ "nsCarbonBreaker.cpp",
+ ]
+else:
+ # Fallback: nsRuleBreaker.cpp calls TrbWordBreakPos from rulebrk.c.
+ SOURCES += [
+ "nsRuleBreaker.cpp",
+ "rulebrk.c",
+ ]
+
+FINAL_LIBRARY = "xul"
diff --git a/intl/lwbrk/nsCarbonBreaker.cpp b/intl/lwbrk/nsCarbonBreaker.cpp
new file mode 100644
index 0000000000..d1d81b2578
--- /dev/null
+++ b/intl/lwbrk/nsCarbonBreaker.cpp
@@ -0,0 +1,43 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include <CoreFoundation/CoreFoundation.h>
+#include <stdint.h>
+#include "nsDebug.h"
+#include "nscore.h"
+
+// CoreFoundation implementation: walks aText with a line-break tokenizer and
+// sets aBreakBefore[i] for every token that starts at a non-zero index i.
+// All other entries of aBreakBefore[0..aLength) are cleared. If either
+// CoreFoundation object cannot be created, the buffer is left all-zero.
+void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
+                             uint8_t* aBreakBefore) {
+  NS_ASSERTION(aText, "aText shouldn't be null");
+
+  memset(aBreakBefore, 0, aLength * sizeof(uint8_t));
+
+  // Wrap the caller's buffer without copying; kCFAllocatorNull is passed as
+  // the contents deallocator so the characters are not freed on release.
+  CFStringRef string = ::CFStringCreateWithCharactersNoCopy(
+      kCFAllocatorDefault, reinterpret_cast<const UniChar*>(aText), aLength,
+      kCFAllocatorNull);
+  if (!string) {
+    return;
+  }
+
+  CFStringTokenizerRef tokenizer = ::CFStringTokenizerCreate(
+      kCFAllocatorDefault, string, ::CFRangeMake(0, aLength),
+      kCFStringTokenizerUnitLineBreak, nullptr);
+  if (tokenizer) {
+    for (CFStringTokenizerTokenType token =
+             ::CFStringTokenizerAdvanceToNextToken(tokenizer);
+         token != kCFStringTokenizerTokenNone;
+         token = ::CFStringTokenizerAdvanceToNextToken(tokenizer)) {
+      const CFRange range = ::CFStringTokenizerGetCurrentTokenRange(tokenizer);
+      // Ignore leading edge
+      if (range.location != 0) {
+        aBreakBefore[range.location] = true;
+      }
+    }
+    ::CFRelease(tokenizer);
+  }
+
+  ::CFRelease(string);
+}
diff --git a/intl/lwbrk/nsComplexBreaker.cpp b/intl/lwbrk/nsComplexBreaker.cpp
new file mode 100644
index 0000000000..0c0a5e45b6
--- /dev/null
+++ b/intl/lwbrk/nsComplexBreaker.cpp
@@ -0,0 +1,173 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsComplexBreaker.h"
+
+#include <algorithm>
+
+#include "MainThreadUtils.h"
+#include "mozilla/Assertions.h"
+#include "mozilla/Services.h"
+#include "mozilla/UniquePtr.h"
+#include "nsTHashMap.h"
+#include "nsIObserver.h"
+#include "nsIObserverService.h"
+#include "nsString.h"
+#include "nsTArray.h"
+#include "nsThreadUtils.h"
+
+using namespace mozilla;
+
+using CacheMap = nsTHashMap<nsString, nsTArray<uint8_t>>;
+
+static UniquePtr<CacheMap> sBreakCache;
+
+// The underlying hash table extends capacity, when it hits .75 full and uses
+// powers of 2 for sizing. This cache limit will hopefully mean most pages fit
+// within the cache, while keeping it to a reasonable size. Also by holding the
+// previous cache even if pages are bigger than the cache the most commonly used
+// should still remain fast.
+static const int kCacheLimit = 3072;
+
+static UniquePtr<CacheMap> sOldBreakCache;
+
+// Simple runnable to delete caches off the main thread.
+// Runnable that takes ownership of a cache and destroys it when run; Run()
+// asserts that it is not executing on the main thread.
+class CacheDeleter final : public Runnable {
+ public:
+  explicit CacheDeleter(UniquePtr<CacheMap> aCacheToDelete)
+      : Runnable("ComplexBreaker CacheDeleter"),
+        mCacheToDelete(std::move(aCacheToDelete)) {}
+
+  NS_IMETHOD Run() override {
+    MOZ_ASSERT(!NS_IsMainThread());
+    // Dropping the pointer here is the whole point of the task.
+    mCacheToDelete.reset();
+    return NS_OK;
+  }
+
+ private:
+  UniquePtr<CacheMap> mCacheToDelete;
+};
+
+// Observer registered by ComplexBreaker::Initialize for "memory-pressure".
+class ComplexBreakObserver final : public nsIObserver {
+ public:
+  NS_DECL_ISUPPORTS
+  NS_DECL_NSIOBSERVER
+
+ private:
+  // Private so that instances cannot be destroyed directly by outside code.
+  ~ComplexBreakObserver() = default;
+};
+
+NS_IMPL_ISUPPORTS(ComplexBreakObserver, nsIObserver)
+
+// Handles "memory-pressure": hands one cache to a background CacheDeleter per
+// notification -- the old cache if there is one, otherwise the current cache.
+// Any other topic is ignored. Always returns NS_OK.
+NS_IMETHODIMP ComplexBreakObserver::Observe(nsISupports* aSubject,
+                                            const char* aTopic,
+                                            const char16_t* aData) {
+  MOZ_ASSERT(NS_IsMainThread());
+
+  if (strcmp(aTopic, "memory-pressure") != 0) {
+    return NS_OK;
+  }
+
+  // We have an old cache? Then delete that one first.
+  UniquePtr<CacheMap>& victim = sOldBreakCache ? sOldBreakCache : sBreakCache;
+  if (victim) {
+    NS_DispatchBackgroundTask(MakeAndAddRef<CacheDeleter>(std::move(victim)));
+  }
+
+  return NS_OK;
+}
+
+// Registers a ComplexBreakObserver for "memory-pressure" notifications when
+// the observer service is available. Main thread only (asserted).
+void ComplexBreaker::Initialize() {
+  MOZ_ASSERT(NS_IsMainThread());
+
+  if (nsCOMPtr<nsIObserverService> observerService =
+          services::GetObserverService()) {
+    observerService->AddObserver(new ComplexBreakObserver(), "memory-pressure",
+                                 false);
+  }
+}
+
+// Destroys both break caches synchronously. Main thread only (asserted).
+void ComplexBreaker::Shutdown() {
+  MOZ_ASSERT(NS_IsMainThread());
+
+  sBreakCache.reset();
+  sOldBreakCache.reset();
+}
+
+// Stores aBreakBefore in the current cache under the key aText[0..aLength).
+// If the insertion fails, nothing else happens. Once the current cache holds
+// more than kCacheLimit entries it becomes the old cache (any previous old
+// cache is handed to a background CacheDeleter), leaving sBreakCache null.
+static void AddToCache(const char16_t* aText, uint32_t aLength,
+                       nsTArray<uint8_t> aBreakBefore) {
+  if (NS_WARN_IF(!sBreakCache->InsertOrUpdate(
+          nsString(aText, aLength), std::move(aBreakBefore), fallible))) {
+    return;
+  }
+
+  if (sBreakCache->Count() > kCacheLimit) {
+    if (sOldBreakCache) {
+      NS_DispatchBackgroundTask(
+          MakeAndAddRef<CacheDeleter>(std::move(sOldBreakCache)));
+    }
+
+    sOldBreakCache = std::move(sBreakCache);
+  }
+}
+
+// Copies the cached flags to the start of aBreakBefore and sets every
+// remaining element up to (not including) aEndBreakBefore to false.
+static void CopyAndFill(const nsTArray<uint8_t>& aCachedBreakBefore,
+                        uint8_t* aBreakBefore, uint8_t* aEndBreakBefore) {
+  uint8_t* const firstUnset = std::copy(
+      aCachedBreakBefore.begin(), aCachedBreakBefore.end(), aBreakBefore);
+  std::fill(firstUnset, aEndBreakBefore, false);
+}
+
+// Fills aBreakBefore[0..aLength) for aText. Lookup order: the current cache,
+// then the old cache (a hit there is moved into the current cache), and
+// finally NS_GetComplexLineBreaks, whose result is cached with its trailing
+// false entries trimmed.
+void ComplexBreaker::GetBreaks(const char16_t* aText, uint32_t aLength,
+ uint8_t* aBreakBefore) {
+ // It is believed that this is only called on the main thread, so we don't
+ // need to lock the caching structures. A diagnostic assert is used in case
+ // our tests don't exercise all code paths.
+ MOZ_DIAGNOSTIC_ASSERT(NS_IsMainThread());
+
+ MOZ_ASSERT(aText, "aText shouldn't be null");
+ MOZ_ASSERT(aLength, "aLength shouldn't be zero");
+ MOZ_ASSERT(aBreakBefore, "aBreakBefore shouldn't be null");
+
+ // If we have a cache then check it, if not then create it.
+ if (sBreakCache) {
+ if (auto entry =
+ sBreakCache->Lookup(nsDependentSubstring(aText, aLength))) {
+ auto& breakBefore = entry.Data();
+ // Cached arrays were trimmed before storing; CopyAndFill writes false
+ // into the part of the output the cached array does not cover.
+ CopyAndFill(breakBefore, aBreakBefore, aBreakBefore + aLength);
+ return;
+ }
+ } else {
+ sBreakCache = MakeUnique<CacheMap>();
+ }
+
+ // We keep the previous cache when we hit our limit, so that the most recently
+ // used fragments remain fast.
+ if (sOldBreakCache) {
+ auto breakBefore =
+ sOldBreakCache->Extract(nsDependentSubstring(aText, aLength));
+ if (breakBefore) {
+ CopyAndFill(*breakBefore, aBreakBefore, aBreakBefore + aLength);
+ // Move the entry to the latest cache.
+ AddToCache(aText, aLength, std::move(*breakBefore));
+ return;
+ }
+ }
+
+ NS_GetComplexLineBreaks(aText, aLength, aBreakBefore);
+
+ // As a very simple memory saving measure we trim off trailing elements that
+ // are false before caching.
+ // The first iteration reads aBreakBefore[aLength - 1], so this loop relies
+ // on aLength being non-zero (asserted above).
+ auto* afterLastTrue = aBreakBefore + aLength;
+ while (!*(afterLastTrue - 1)) {
+ if (--afterLastTrue == aBreakBefore) {
+ break;
+ }
+ }
+
+ AddToCache(aText, aLength,
+ nsTArray<uint8_t>(aBreakBefore, afterLastTrue - aBreakBefore));
+}
diff --git a/intl/lwbrk/nsComplexBreaker.h b/intl/lwbrk/nsComplexBreaker.h
new file mode 100644
index 0000000000..4120217a6e
--- /dev/null
+++ b/intl/lwbrk/nsComplexBreaker.h
@@ -0,0 +1,36 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef nsComplexBreaker_h__
+#define nsComplexBreaker_h__
+
+#include <stdint.h>
+/**
+ * Find line break opportunities in aText[] of aLength characters,
+ * filling boolean values indicating line break opportunities for
+ * corresponding characters in aBreakBefore[] on return.
+ */
+void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
+ uint8_t* aBreakBefore);
+
+/**
+ * Static entry points for complex line breaking with result caching.
+ */
+class ComplexBreaker {
+ public:
+ /**
+ * Registers the observer that releases cached results on
+ * "memory-pressure". Must be called on the main thread.
+ */
+ static void Initialize();
+
+ /**
+ * Drops all cached results. Must be called on the main thread.
+ */
+ static void Shutdown();
+
+ /**
+ * A wrapper around the platform specific NS_GetComplexLineBreaks, which adds
+ * caching of the results to mitigate sometimes expensive implementation.
+ * @param aText - pointer to the text to process for possible line breaks
+ * @param aLength - the length to process
+ * @param aBreakBefore - result array correlated to aText, where element is
+ * set to true if line can be broken before
+ * corresponding character in aText and false otherwise
+ */
+ static void GetBreaks(const char16_t* aText, uint32_t aLength,
+ uint8_t* aBreakBefore);
+};
+
+#endif /* nsComplexBreaker_h__ */
diff --git a/intl/lwbrk/nsLWBrkCIID.h b/intl/lwbrk/nsLWBrkCIID.h
new file mode 100644
index 0000000000..b612155ef0
--- /dev/null
+++ b/intl/lwbrk/nsLWBrkCIID.h
@@ -0,0 +1,28 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef nsLWBrkCIID_h__
+#define nsLWBrkCIID_h__
+
+// {2BF64764-997F-450D-AF96-3028D1A902B0}
+#define NS_LBRK_CID \
+ { \
+ 0x2bf64764, 0x997f, 0x450d, { \
+ 0xaf, 0x96, 0x30, 0x28, 0xd1, 0xa9, 0x2, 0xb0 \
+ } \
+ }
+
+#define NS_LBRK_CONTRACTID "@mozilla.org/intl/lbrk;1"
+
+// {2BF64765-997F-450D-AF96-3028D1A902B0}
+#define NS_WBRK_CID \
+ { \
+ 0x2bf64765, 0x997f, 0x450d, { \
+ 0xaf, 0x96, 0x30, 0x28, 0xd1, 0xa9, 0x2, 0xb0 \
+ } \
+ }
+
+#define NS_WBRK_CONTRACTID "@mozilla.org/intl/wbrk;1"
+
+#endif
diff --git a/intl/lwbrk/nsPangoBreaker.cpp b/intl/lwbrk/nsPangoBreaker.cpp
new file mode 100644
index 0000000000..e098a11e58
--- /dev/null
+++ b/intl/lwbrk/nsPangoBreaker.cpp
@@ -0,0 +1,61 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsComplexBreaker.h"
+
+#include <pango/pango-break.h>
+#include "nsUTF8Utils.h"
+#include "nsString.h"
+#include "nsTArray.h"
+
+// Pango implementation: converts aText to UTF-8, asks Pango for log
+// attributes, and copies each character's is_line_break flag into
+// aBreakBefore at the index of that character's first UTF-16 code unit. The
+// second code unit of a surrogate pair is always marked "no break".
+//
+// aText        - UTF-16 text to analyse; must not be null.
+// aLength      - number of UTF-16 code units in aText.
+// aBreakBefore - output array of aLength flags.
+void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
+                             uint8_t* aBreakBefore) {
+  NS_ASSERTION(aText, "aText shouldn't be null");
+
+  memset(aBreakBefore, uint8_t(false), aLength * sizeof(uint8_t));
+
+  AutoTArray<PangoLogAttr, 2000> attrBuffer;
+  // XXX(Bug 1631371) Check if this should use a fallible operation as it
+  // pretended earlier.
+  attrBuffer.AppendElements(aLength + 1);
+  // `PangoLogAttr` doesn't have a default constructor (it is a C struct), so
+  // we need to manually initialize the new elements. See bug 1808182.
+  memset(attrBuffer.Elements(), 0, attrBuffer.Length() * sizeof(PangoLogAttr));
+
+  NS_ConvertUTF16toUTF8 aUTF8(aText, aLength);
+
+  const gchar* p = aUTF8.Data();
+  const gchar* end = p + aUTF8.Length();
+  uint32_t u16Offset = 0;
+
+  static PangoLanguage* language = pango_language_from_string("en");
+
+  while (p < end) {
+    PangoLogAttr* attr = attrBuffer.Elements();
+    pango_get_log_attrs(p, end - p, -1, language, attr, attrBuffer.Length());
+
+    while (p < end) {
+      aBreakBefore[u16Offset] = attr->is_line_break;
+      // A character outside the BMP occupies two UTF-16 code units: a high
+      // surrogate followed by a low surrogate. u16Offset points at the first
+      // unit, so test for the HIGH surrogate there and only skip when a low
+      // surrogate really follows inside the buffer. Testing for a low
+      // surrogate at this position would leave every later flag one slot too
+      // early and could write past the end for a trailing unpaired surrogate.
+      if (NS_IS_HIGH_SURROGATE(aText[u16Offset]) && u16Offset + 1 < aLength &&
+          NS_IS_LOW_SURROGATE(aText[u16Offset + 1])) {
+        aBreakBefore[++u16Offset] = false;  // Skip low surrogate
+      }
+      ++u16Offset;
+
+      // We're iterating over text obtained from NS_ConvertUTF16toUTF8,
+      // so we know we have valid UTF-8 and don't need to check for
+      // errors.
+      uint32_t ch = UTF8CharEnumerator::NextChar(&p, end);
+      ++attr;
+
+      if (!ch) {
+        // pango_break (pango 1.16.2) only analyses text before the
+        // first NUL (but sets one extra attr). Workaround loop to call
+        // pango_break again to analyse after the NUL is done somewhere else
+        // (gfx/thebes/gfxFontconfigFonts.cpp: SetupClusterBoundaries()).
+        // So, we do the same here for pango_get_log_attrs.
+        break;
+      }
+    }
+  }
+}
diff --git a/intl/lwbrk/nsRuleBreaker.cpp b/intl/lwbrk/nsRuleBreaker.cpp
new file mode 100644
index 0000000000..641f094360
--- /dev/null
+++ b/intl/lwbrk/nsRuleBreaker.cpp
@@ -0,0 +1,18 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsComplexBreaker.h"
+#include "nsDebug.h"
+
+#define TH_UNICODE
+#include "rulebrk.h"
+
+// Rule-based implementation: position i is marked as a break exactly when
+// TrbWordBreakPos returns 0 for it; any other return value means no break.
+void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
+                             uint8_t* aBreakBefore) {
+  NS_ASSERTION(aText, "aText shouldn't be null");
+
+  uint32_t pos = 0;
+  while (pos < aLength) {
+    const int verdict = TrbWordBreakPos(aText, pos, aText + pos, aLength - pos);
+    aBreakBefore[pos] = (verdict == 0);
+    ++pos;
+  }
+}
diff --git a/intl/lwbrk/nsUniscribeBreaker.cpp b/intl/lwbrk/nsUniscribeBreaker.cpp
new file mode 100644
index 0000000000..fcd1e282e1
--- /dev/null
+++ b/intl/lwbrk/nsUniscribeBreaker.cpp
@@ -0,0 +1,146 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsComplexBreaker.h"
+
+#include <windows.h>
+
+#include <usp10.h>
+
+#include "nsUTF8Utils.h"
+#include "nsString.h"
+#include "nsTArray.h"
+
+#if defined(MOZ_SANDBOX)
+# include "mozilla/WindowsProcessMitigations.h"
+# include "mozilla/SandboxSettings.h"
+# include "mozilla/sandboxTarget.h"
+# include "nsXULAppAPI.h"
+
+# if defined(MOZ_DEBUG)
+# include "mozilla/StaticPrefs_intl.h"
+# endif
+#endif
+
+using namespace mozilla;
+
+#if defined(MOZ_SANDBOX)
+// Reports whether line breaking has to go through the brokered call. The
+// answer is computed on the first call and reused afterwards.
+static bool UseBrokeredLineBreaking() {
+  // If win32k lockdown is enabled we can't use Uniscribe in this process. Also
+  // if the sandbox is above a certain level we can't load the required DLLs
+  // without other intervention. Given that it looks like we are likely to have
+  // win32k lockdown enabled first, using the brokered call for people testing
+  // this case also makes most sense.
+  static bool sUseBrokeredLineBreaking = [] {
+    if (IsWin32kLockedDown()) {
+      return true;
+    }
+    return XRE_IsContentProcess() && GetEffectiveContentSandboxLevel() >= 20;
+  }();
+
+  return sUseBrokeredLineBreaking;
+}
+#endif
+
+// Uniscribe implementation: itemizes aText with ScriptItemize, runs
+// ScriptBreak on every item and copies each fSoftBreak flag into
+// aBreakBefore. Index 0 is never marked as a break.
+//
+// aText        - UTF-16 text to analyse; must not be null.
+// aLength      - number of UTF-16 code units in aText.
+// aBreakBefore - output array of aLength flags.
+//
+// With MOZ_SANDBOX, a process for which UseBrokeredLineBreaking() is true
+// delegates to the brokered call instead. If ScriptBreak fails for an item,
+// the function returns with the flags computed so far.
+void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
+                             uint8_t* aBreakBefore) {
+  NS_ASSERTION(aText, "aText shouldn't be null");
+
+#if defined(MOZ_SANDBOX)
+  if (UseBrokeredLineBreaking()) {
+    // We can't use Uniscribe, so use a brokered call. Use of Uniscribe will be
+    // replaced in bug 1684927.
+    char16ptr_t text = aText;
+    if (!SandboxTarget::Instance()->GetComplexLineBreaks(text, aLength,
+                                                         aBreakBefore)) {
+      NS_WARNING("Brokered line break failed, breaks might be incorrect.");
+    }
+
+    return;
+  }
+#endif
+
+  int outItems = 0;
+  HRESULT result;
+  AutoTArray<SCRIPT_ITEM, 64> items;
+  char16ptr_t text = aText;
+
+  memset(aBreakBefore, false, aLength);
+
+  items.AppendElements(64);
+
+  do {
+    // ScriptItemize appends a terminal item after the last real one, so the
+    // buffer has to hold cMaxItems + 1 entries. Advertise one item fewer than
+    // the array owns so the terminal item always fits.
+    result = ScriptItemize(text, aLength, items.Length() - 1, nullptr, nullptr,
+                           items.Elements(), &outItems);
+
+    if (result == E_OUTOFMEMORY) {
+      // XXX(Bug 1631371) Check if this should use a fallible operation as it
+      // pretended earlier.
+      items.AppendElements(items.Length());
+    }
+  } while (result == E_OUTOFMEMORY);
+
+  for (int iItem = 0; iItem < outItems; ++iItem) {
+    uint32_t endOffset =
+        (iItem + 1 == outItems ? aLength : items[iItem + 1].iCharPos);
+    uint32_t startOffset = items[iItem].iCharPos;
+    AutoTArray<SCRIPT_LOGATTR, 64> sla;
+
+    // XXX(Bug 1631371) Check if this should use a fallible operation as it
+    // pretended earlier.
+    sla.AppendElements(endOffset - startOffset);
+
+    if (ScriptBreak(text + startOffset, endOffset - startOffset,
+                    &items[iItem].a, sla.Elements()) < 0)
+      return;
+
+    // We don't want to set a potential break position at the start of text;
+    // that's the responsibility of a higher level.
+    for (uint32_t j = startOffset ? 0 : 1; j + startOffset < endOffset; ++j) {
+      aBreakBefore[j + startOffset] = sla[j].fSoftBreak;
+    }
+  }
+
+#if defined(MOZ_DEBUG) && defined(MOZ_SANDBOX)
+  // When tests are enabled and pref is set, we compare the line breaks returned
+  // from the Uniscribe breaker in the content process, with the ones returned
+  // from the brokered call to the parent. If they differ we crash so we can
+  // test using a crashtest.
+  if (!StaticPrefs::intl_compare_against_brokered_complex_line_breaks() ||
+      !XRE_IsContentProcess()) {
+    return;
+  }
+
+  nsTArray<uint8_t> brokeredBreaks(aLength);
+  brokeredBreaks.AppendElements(aLength);
+  if (!SandboxTarget::Instance()->GetComplexLineBreaks(
+          text, aLength, brokeredBreaks.Elements())) {
+    MOZ_CRASH("Brokered GetComplexLineBreaks failed.");
+  }
+
+  bool mismatch = false;
+  for (uint32_t i = 0; i < aLength; ++i) {
+    if (aBreakBefore[i] != brokeredBreaks[i]) {
+      mismatch = true;
+      break;
+    }
+  }
+
+  if (mismatch) {
+    // The logging here doesn't handle surrogates, but we only have tests using
+    // Thai currently, which is BMP-only.
+    printf_stderr("uniscribe: ");
+    for (uint32_t i = 0; i < aLength; ++i) {
+      if (aBreakBefore[i]) printf_stderr("#");
+      printf_stderr("%s", NS_ConvertUTF16toUTF8(aText + i, 1).get());
+    }
+    printf_stderr("\n");
+    printf_stderr("brokered : ");
+    for (uint32_t i = 0; i < aLength; ++i) {
+      if (brokeredBreaks[i]) printf_stderr("#");
+      printf_stderr("%s", NS_ConvertUTF16toUTF8(aText + i, 1).get());
+    }
+    printf_stderr("\n");
+    MOZ_CRASH("Brokered breaks did not match.");
+  }
+#endif
+}
diff --git a/intl/lwbrk/rulebrk.c b/intl/lwbrk/rulebrk.c
new file mode 100644
index 0000000000..d7574b929f
--- /dev/null
+++ b/intl/lwbrk/rulebrk.c
@@ -0,0 +1,388 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#define TH_UNICODE
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <assert.h>
+#include "th_char.h"
+#define th_isalpha(c) (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
+#define th_isspace(c) ((c) == ' ' || (c) == '\t')
+
+/*
+/////////////////////////////////////////////////
+// Thai character type array
+*/
+
+typedef unsigned short twb_t;
+extern const twb_t _TwbType[0x100 - 0xa0];
+
+/*
+// bit definition
+*/
+
+#define VRS 0x0001
+#define VRE 0x0002
+#define VRX 0x0004
+
+#define VRA 0x0008
+
+#define VLA 0x0010
+#define VLO 0x0020
+#define VLI 0x0040
+
+#define VC 0x0080
+
+#define CC 0x0100
+#define CS 0x0200
+
+#define C2 0x0400
+#define CHB 0x0800
+#define CHE 0x1000
+
+#define MT 0x2000
+/*
+//_#define me 0x2000
+*/
+#define M 0x4000
+
+#define T 0x8000
+
+#define VL (VLA | VLO | VLI)
+#define VR (VRS | VRE | VRX)
+#define NE (VL | VRS)
+#define NB (VR | M)
+#define V (VL | VR)
+#define CX (CC | CS)
+#define C (CX | VC)
+#define A (C | V | M)
+
+#define twbtype(c) (_TwbType[th_zcode(c)])
+
+#ifndef TRUE
+# define TRUE 1
+# define FALSE 0
+#endif
+#define RETURN(b) return (b)
+
+/*
+/////////////////////////////////////////////////
+*/
+
+/*
+ * Decide whether a word break may be placed before rstr[0].
+ *
+ * pstr  - start of the text; characters to the left are read relative to
+ *         pstr + left.
+ * left  - number of characters available to the left of the position.
+ * rstr  - the position under test; rstr[0..right-1] may be examined.
+ * right - number of characters available from rstr onwards.
+ *
+ * Up to three characters of left context and two of right context are
+ * loaded into c(-3..2) with their type bits in t(-3..2), then the rules
+ * below are tried in order; the first rule that matches decides.
+ *
+ * Returns a negative value when no break is allowed here (also for
+ * left < 0, right < 1 or a non-Thai rstr[0]), 0 when a break is allowed
+ * before rstr[0], and a positive value for the rules that place the break
+ * further to the right (TrbFollowing advances by that amount).
+ */
+int TrbWordBreakPos(const th_char* pstr, int left, const th_char* rstr,
+ int right)
+/* const ThBreakIterator *it, const th_char **p)*/
+{
+ /*
+ //int left, right;
+ //const th_char *s = *p;
+ */
+ const th_char* lstr = pstr + left;
+ th_char _c[6];
+ twb_t _t[6];
+ /* c(i)/t(i) map the relative offsets -3..2 onto array slots 0..5. */
+#define c(i) (_c[(i) + 3])
+#define t(i) (_t[(i) + 3])
+ int i, j;
+
+ /*
+ //left = s - it->begin;
+ */
+ if (left < 0) return -1;
+ /*
+ //right = (it->end == NULL) ? 4 : it->begin - s;
+ */
+ if (right < 1) return -1;
+
+ /*
+ // get c(0), t(0)
+ */
+ c(0) = rstr[0]; /* may be '\0' */
+ if (!th_isthai(c(0))) return -1;
+ t(0) = twbtype(c(0));
+ if (!(t(0) & A)) return -1;
+
+ /*
+ // get c(-1), t(-1)
+ */
+ if (left >= 1) {
+ c(-1) = lstr[-1];
+ if (!th_isthai(c(-1))) return 0;
+ t(-1) = twbtype(c(-1));
+ if (!(t(-1) & A)) return 0; /* handle punctuation marks here */
+ } else {
+ c(-1) = 0;
+ t(-1) = 0;
+ }
+
+ /*
+ // get c(1..2), t(1..2)
+ */
+ for (i = 1; i <= 2; i++) {
+ if (i >= right) {
+ c(i) = 0;
+ t(i) = 0;
+ } else {
+ c(i) = rstr[i]; /* may be '\0'; */
+ /* On a non-Thai or non-A character, shrink right to i and redo this
+ slot so it is zeroed by the branch above. */
+ if (!th_isthai(c(i)))
+ right = i--;
+ else {
+ t(i) = twbtype(c(i));
+ if (!(t(i) & A)) right = i--;
+ }
+ }
+ }
+ /*
+ // get c(-2..-3), t(-2..-3)
+ */
+ for (i = -2, j = -2; i >= -3; j--) {
+ if (j < -left) {
+ c(i) = 0;
+ t(i) = 0;
+ i--;
+ } else {
+ c(i) = lstr[j];
+ if (!th_isthai(c(i)))
+ left = 0;
+ else {
+ t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0);
+ if (!(t(i) & A))
+ left = 0;
+ else {
+ if ((t(i + 1) & MT) && ((t(i) & VR) || (t(i + 2) & VR))) {
+ c(i + 1) = c(i);
+ t(i + 1) = t(i);
+ } else
+ i--;
+ }
+ }
+ }
+ }
+
+ /*
+ // prohibit the unlikely
+ */
+ if ((t(-1) & C) && (t(0) & C)) {
+ if ((t(-1) & CHE) || (t(0) & CHB)) return -1;
+ }
+ /*
+ // special case : vlao, C/ sara_a|aa, !sara_a
+ */
+ /* NOTE(review): c(-0) below is the same slot as c(0); the rule text above
+ suggests c(-1) may have been intended -- confirm before changing. */
+ if ((t(-3) & (VLA | VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) &&
+ (c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA))
+ return 0;
+
+ /*
+ // prohibit break
+ */
+ if (t(0) & NB) return -1;
+ if (t(-1) & NE) return -1;
+
+ /*
+ // apply 100% rules
+ */
+ if (t(-1) & VRE) {
+ if (c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0;
+ return -1; /* usually too short syllable, part of word */
+ }
+
+ if (t(-2) & VRE) return -1;
+
+ if ((t(0) & C) && (t(1) & (VR | MT)) &&
+ (c(2) != TH_THANTHAKHAT)) { /*?C, NB */
+ if ((t(-1) & (VRS | VRX)) && c(1) == TH_SARA_I) return -1; /* exception */
+ if (t(-1) & (V | M)) return 0; /* !C/ C, NB */
+ if (t(-2) & VRS) return 0; /* VRS, C / C, NB */
+ if (!(t(0) & C2) && c(1) == TH_SARA_I) { /* / !C2 or /c, sara_i */
+ if (t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */
+ if (t(-2) & VC) return 0; /* VC, C / C, NB ? 100% */
+ }
+ }
+ if ((t(-1) & VRX) && (t(0) & CC)) return 0; /* VRX/ CC */
+ if ((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V | M)))
+ return 0; /* VRS, C/ !C */
+
+ if ((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) {
+ if ((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */
+ if ((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */
+ }
+ /*
+ // apply 90% rules
+ */
+ if (t(0) & VL) return 0;
+ if (t(1) & VL) return -1;
+ if (c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING)
+ return 0;
+
+ /*
+ //return -1;
+ // apply 80% rules
+ */
+ if (t(0) & CHE) {
+ if ((t(-2) & VRS) && (t(-1) & C)) return 0; /* VRS, C/ CHE */
+ /*if(t(-1) & VRX) return 0; // VRX/ CHE */
+ if (t(-1) & VC) return 0; /* VC/ CHE */
+ }
+ if (t(-1) & CHB) {
+ if ((t(0) & C) && (t(1) & VR)) return 0; /* CHB/ CC, VR */
+ if (t(0) & VC) return 0; /* CHB/ VC */
+ }
+
+ if ((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */
+ if (t(-2) & VLI)
+ return 0; /* VLI,C/C,VR .*/
+ else { /* vlao, C ? C , VR */
+ if (c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */
+ if (t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */
+ if (!(t(1) & VRA)) return 0; /* VLA, C/ C, !vca */
+ }
+ }
+ /* C,MT,C */
+ if ((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1;
+
+ return -1;
+}
+
+/*
+ * Scan forward from begin + offset and return, as an index relative to
+ * begin, the position where the scan stops.
+ *
+ * begin  - start of the text.
+ * length - number of characters in the text; begin + length bounds the scan.
+ * offset - index at which to start scanning.
+ *
+ * The scan skips leading spaces/tabs, then either consumes a run of non-Thai
+ * non-space characters or walks through Thai characters until
+ * TrbWordBreakPos reports a break (a positive result advances that many
+ * further characters). A 0 character also stops the scan.
+ */
+int TrbFollowing(const th_char* begin, int length, int offset)
+/*
+//(ThBreakIterator *this, int offset)
+*/
+{
+ const th_char* w = begin + offset;
+ const th_char* end = begin + length;
+ while (w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++;
+
+ if (w < end && *w && !th_isthai(*w)) {
+ /* Non-Thai run: stop after it if it contained a letter, or if it ends at
+ the end of the text or at a space. */
+ int english = FALSE;
+ while (w < end && *w && !th_isthai(*w) && !th_isspace(*w)) {
+ if (th_isalpha(*w)) english = TRUE;
+ w++;
+ }
+ if (english || w == end || (!th_isthai(*w) && th_isspace(*w)))
+ return w - begin;
+ }
+ if (w == end || *w == 0 || !th_isthai(*w)) return w - begin;
+ w++;
+ if (w < end && *w && th_isthai(*w)) {
+ int brk = TrbWordBreakPos(begin, w - begin, w, end - w);
+ while (brk < 0) {
+ w++;
+ if (w == end || *w == 0 || !th_isthai(*w)) break;
+ brk = TrbWordBreakPos(begin, w - begin, w, end - w);
+ }
+ if (brk > 0) w += brk;
+ }
+ if (w < end && *w && !th_isthai(*w)) {
+ /* Absorb trailing characters that are neither Thai, letters nor spaces. */
+ while (w < end && *w && !th_isthai(*w) && !th_isalpha(*w) &&
+ !th_isspace(*w))
+ w++;
+ }
+ return w - begin;
+}
+
+/*
+/////////////////////////////////////////////////
+*/
+const twb_t _TwbType[0x100 - 0xa0] = {
+#if 0
+/* 80 € */ T,
+/* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+/* 90  */ T,
+/* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+#endif
+ /* a0   */ 0,
+ /* a1 ¡ */ CS,
+ /* a2 ¢ */ CS | CHE,
+ /* a3 £ */ CC | CHE,
+ /* a4 € */ CS | CHE,
+ /* a5 ¥ */ CC | CHE,
+ /* a6 Š */ CS,
+ /* a7 § */ CS | CHB,
+ /* a8 š */ CS,
+ /* a9 © */ CC | CHE,
+ /* aa ª */ CS,
+ /* ab « */ CC | CHE,
+ /* ac ¬ */ CC | CHB | CHE,
+ /* ad ­ */ CS | CHB,
+ /* ae ® */ CS | CHB,
+ /* af ¯ */ CS | CHB,
+ /* b0 ° */ CS,
+ /* b1 ± */ CS | CHB | CHE,
+ /* b2 ² */ CS | CHB | CHE,
+ /* b3 ³ */ CS | CHB,
+ /* b4 Ž */ CS,
+ /* b5 µ */ CS,
+ /* b6 ¶ */ CS,
+ /* b7 · */ CS,
+ /* b8 ž */ CS,
+ /* b9 ¹ */ CS,
+ /* ba º */ CS,
+ /* bb » */ CS,
+ /* bc Œ */ CC | CHE,
+ /* bd œ */ CC | CHE,
+ /* be Ÿ */ CS,
+ /* bf ¿ */ CS,
+ /* c0 À */ CS | CHE,
+ /* c1 Á */ CS,
+ /* c2 Â */ CS,
+ /* c3 Ã */ CS | C2 | CHE, /* ? add CHE */
+ /* c4 Ä */ VC | CHE,
+ /* c5 Å */ CS | C2,
+ /* c6 Æ */ VC | CHE,
+ /* c7 Ç */ VC | C2,
+ /* c8 È */ CS,
+ /* c9 É */ CS | CHB,
+ /* ca Ê */ CS | CHE,
+ /* cb Ë */ CC | CHE,
+ /* CC Ì */ CS | CHB | CHE,
+ /* cd Í */ VC,
+ /* ce Î */ CC | CHE,
+ /* cf Ï */ T,
+ /* d0 Ð */ VRE | VRA,
+ /* d1 Ñ */ VRS,
+ /* d2 Ò */ VRX | VRA,
+ /* d3 Ó */ VRE,
+ /* d4 Ô */ VRX | VRA,
+ /* d5 Õ */ VRX | VRA,
+ /* d6 Ö */ VRS,
+ /* d7 × */ VRS | VRA,
+ /* d8 Ø */ VRX,
+ /* d9 Ù */ VRX,
+ /* da Ú */ T,
+ /* db Û */ 0,
+ /* dc Ü */ 0,
+ /* dd Ý */ 0,
+ /* de Þ */ 0,
+ /* df ß */ T,
+ /* e0 à */ VLA,
+ /* e1 á */ VLO,
+ /* e2 â */ VLO,
+ /* e3 ã */ VLI,
+ /* e4 ä */ VLI,
+ /* e5 å */ VRE,
+ /* e6 æ */ M,
+ /* e7 ç */ M,
+ /* e8 è */ M | MT,
+ /* e9 é */ M | MT,
+ /* ea ê */ M | MT,
+ /* eb ë */ M | MT,
+ /* ec ì */ M,
+ /* ed í */ T,
+ /* ee î */ T,
+ /* ef ï */ T,
+ /* f0 ð */ T,
+ /* f1 ñ */ T,
+ /* f2 ò */ T,
+ /* f3 ó */ T,
+ /* f4 ô */ T,
+ /* f5 õ */ T,
+ /* f6 ö */ T,
+ /* f7 ÷ */ T,
+ /* f8 ø */ T,
+ /* f9 ù */ T,
+ /* fa ú */ T,
+ /* fb û */ T,
+ /* fc ü */ 0,
+ /* fd ý */ 0,
+ /* fe þ */ 0,
+ /* ff ’ */ 0};
diff --git a/intl/lwbrk/rulebrk.h b/intl/lwbrk/rulebrk.h
new file mode 100644
index 0000000000..c1f2e0957b
--- /dev/null
+++ b/intl/lwbrk/rulebrk.h
@@ -0,0 +1,26 @@
+/*
+Copyright (c) 1999 Samphan Raruenrom <samphan@thai.com>
+Permission to use, copy, modify, distribute and sell this software
+and its documentation for any purpose is hereby granted without fee,
+provided that the above copyright notice appear in all copies and
+that both that copyright notice and this permission notice appear
+in supporting documentation. Samphan Raruenrom makes no
+representations about the suitability of this software for any
+purpose. It is provided "as is" without express or implied warranty.
+*/
+#ifndef __RULEBRK_H__
+#define __RULEBRK_H__
+#include "th_char.h"
+
+/* Give the functions below C linkage when this header is included from C++. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * NOTE(review): the definition is not visible from this header. Going by the
+ * parameter names, pstr/left and rstr/right presumably describe the text on
+ * either side of a candidate break position -- confirm against rulebrk.c.
+ */
+int TrbWordBreakPos(const th_char* pstr, int left, const th_char* rstr,
+ int right);
+/*
+ * NOTE(review): presumably returns the position of the break that follows
+ * `offset` within the `length` characters starting at `begin` -- confirm
+ * against rulebrk.c.
+ */
+int TrbFollowing(const th_char* begin, int length, int offset);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/intl/lwbrk/th_char.h b/intl/lwbrk/th_char.h
new file mode 100644
index 0000000000..a088228fff
--- /dev/null
+++ b/intl/lwbrk/th_char.h
@@ -0,0 +1,133 @@
+/*
+Copyright (c) 1999 Samphan Raruenrom <samphan@thai.com>
+Permission to use, copy, modify, distribute and sell this software
+and its documentation for any purpose is hereby granted without fee,
+provided that the above copyright notice appear in all copies and
+that both that copyright notice and this permission notice appear
+in supporting documentation. Samphan Raruenrom makes no
+representations about the suitability of this software for any
+purpose. It is provided "as is" without express or implied warranty.
+*/
+#ifndef __TH_CHAR_H__
+#define __TH_CHAR_H__
+
+/* An 8-bit character code; th_char is this type unless TH_UNICODE is set. */
+typedef unsigned char tis_char;
+
+#ifdef TH_UNICODE
+/*
+ * The char16_t type is only usable in C++ code, so we need this ugly hack to
+ * select a binary compatible C type for C code that includes this header to
+ * use.
+ * NOTE: uint16_t is not declared by this header; a C file has to include
+ * <stdint.h> before including it.
+ */
+# ifdef __cplusplus
+typedef char16_t th_char;
+# else
+typedef uint16_t th_char;
+# endif
+/* Code of TH_THAIBEGIN, the enumerator that precedes TH_KOKAI. */
+# define TH_THAIBEGIN_ 0x0e00
+/* True for codes 0x0e00..0x0e5f. Evaluates its argument twice. */
+# define th_isthai(c) (0x0e00 <= (c) && (c) <= 0x0e5f)
+#else
+typedef tis_char th_char;
+# define TH_THAIBEGIN_ 0xa0
+/* True for every code from 0xa0 upwards. */
+# define th_isthai(c) ((c) >= 0xa0)
+#endif
+/* Zero-based offset of c from TH_THAIBEGIN_. */
+#define th_zcode(c) ((c)-TH_THAIBEGIN_)
+
+/*
+ * Symbolic names for 96 consecutive character codes starting at
+ * TH_THAIBEGIN_. Only the first enumerator has an explicit value, so every
+ * other name is TH_THAIBEGIN_ plus its position in this list; TH_THAIEND is
+ * TH_THAIBEGIN_ + 0x5f (0xff in the 8-bit build, 0x0e5f with TH_UNICODE).
+ * Reordering, inserting or removing an entry shifts every later value.
+ * In the 8-bit build each TH_UNDEF_xx value equals the hex code in its name
+ * (e.g. TH_UNDEF_DD == 0xdd).
+ */
+enum TH_CHARNAME {
+ TH_THAIBEGIN = TH_THAIBEGIN_,
+ TH_KOKAI,
+ TH_KHOKHAI,
+ TH_KHOKHUAT,
+ TH_KHOKHWAI,
+ TH_KHOKHON,
+ TH_KHORAKHANG,
+ TH_NGONGU,
+ TH_CHOCHAN,
+ TH_CHOCHING,
+ TH_CHOCHANG,
+ TH_SOSO,
+ TH_CHOCHOE,
+ TH_YOYING,
+ TH_DOCHADA,
+ TH_TOPATAK,
+ TH_THOTHAN,
+ TH_THONANGMONTHO,
+ TH_THOPHUTHAO,
+ TH_NONEN,
+ TH_DODEK,
+ TH_TOTAO,
+ TH_THOTHUNG,
+ TH_THOTHAHAN,
+ TH_THOTHONG,
+ TH_NONU,
+ TH_BOBAIMAI,
+ TH_POPLA,
+ TH_PHOPHUNG,
+ TH_FOFA,
+ TH_PHOPHAN,
+ TH_FOFAN,
+ TH_PHOSAMPHAO,
+ TH_MOMA,
+ TH_YOYAK,
+ TH_RORUA,
+ TH_RU,
+ TH_LOLING,
+ TH_LU,
+ TH_WOWAEN,
+ TH_SOSALA,
+ TH_SORUSI,
+ TH_SOSUA,
+ TH_HOHIP,
+ TH_LOCHULA,
+ TH_OANG,
+ TH_HONOKHUK,
+ TH_PAIYANNOI,
+ TH_SARA_A,
+ TH_MAIHANAKAT,
+ TH_SARA_AA,
+ TH_SARA_AM,
+ TH_SARA_I,
+ TH_SARA_II,
+ TH_SARA_UE,
+ TH_SARA_UEE,
+ TH_SARA_U,
+ TH_SARA_UU,
+ TH_PHINTHU,
+ TH_REM_CHERNG_,
+ TH_TAC_WBRK_,
+ TH_UNDEF_DD,
+ TH_UNDEF_DE,
+ TH_BAHT,
+ TH_SARA_E,
+ TH_SARA_AE,
+ TH_SARA_O,
+ TH_MAIMUAN,
+ TH_MAIMALAI,
+ TH_LAKKHANGYAO,
+ TH_MAIYAMOK,
+ TH_MAITAIKHU,
+ TH_MAIEK,
+ TH_MAITHO,
+ TH_MAITRI,
+ TH_MAICHATTAWA,
+ TH_THANTHAKHAT,
+ TH_NIKHAHIT,
+ TH_YAMAKKAN,
+ TH_FONGMAN,
+ TH_THAIZERO,
+ TH_THAIONE,
+ TH_THAITWO,
+ TH_THAITHREE,
+ TH_THAIFOUR,
+ TH_THAIFIVE,
+ TH_THAISIX,
+ TH_THAISEVEN,
+ TH_THAIEIGHT,
+ TH_THAININE,
+ TH_ANGKHANKHU,
+ TH_KHOMUT,
+ TH_UNDEF_FC,
+ TH_UNDEF_FD,
+ TH_UNDEF_FE,
+ TH_THAIEND
+};
+#endif
diff --git a/intl/lwbrk/tools/anzx4051.html b/intl/lwbrk/tools/anzx4051.html
new file mode 100644
index 0000000000..9f3461a285
--- /dev/null
+++ b/intl/lwbrk/tools/anzx4051.html
@@ -0,0 +1,709 @@
+<!-- This Source Code Form is subject to the terms of the Mozilla Public
+ - License, v. 2.0. If a copy of the MPL was not distributed with this
+ - file, You can obtain one at http://mozilla.org/MPL/2.0/. -->
+
+<html>
+ <head>
+ <title>Analysis of JIS X 4051 to Unicode General Category Mapping</title>
+ </head>
+ <body>
+ <h1>Analysis of JIS X 4051 to Unicode General Category Mapping</h1>
+ <table border="3">
+ <tr bgcolor="blue">
+ <th></th>
+ <th></th>
+ <td bgcolor="red">C</td>
+ <td bgcolor="red">L</td>
+ <td bgcolor="red">M</td>
+ <td bgcolor="red">N</td>
+ <td bgcolor="red">P</td>
+ <td bgcolor="red">S</td>
+ <td bgcolor="red">Z</td>
+ <td bgcolor="white">Total</td>
+ <td bgcolor="yellow">Cc</td>
+ <td bgcolor="yellow">Cf</td>
+ <td bgcolor="yellow">Co</td>
+ <td bgcolor="yellow">Cs</td>
+ <td bgcolor="yellow">Ll</td>
+ <td bgcolor="yellow">Lm</td>
+ <td bgcolor="yellow">Lo</td>
+ <td bgcolor="yellow">Lt</td>
+ <td bgcolor="yellow">Lu</td>
+ <td bgcolor="yellow">Mc</td>
+ <td bgcolor="yellow">Me</td>
+ <td bgcolor="yellow">Mn</td>
+ <td bgcolor="yellow">Nd</td>
+ <td bgcolor="yellow">Nl</td>
+ <td bgcolor="yellow">No</td>
+ <td bgcolor="yellow">Pc</td>
+ <td bgcolor="yellow">Pd</td>
+ <td bgcolor="yellow">Pe</td>
+ <td bgcolor="yellow">Pf</td>
+ <td bgcolor="yellow">Pi</td>
+ <td bgcolor="yellow">Po</td>
+ <td bgcolor="yellow">Ps</td>
+ <td bgcolor="yellow">Sc</td>
+ <td bgcolor="yellow">Sk</td>
+ <td bgcolor="yellow">Sm</td>
+ <td bgcolor="yellow">So</td>
+ <td bgcolor="yellow">Zl</td>
+ <td bgcolor="yellow">Zp</td>
+ <td bgcolor="yellow">Zs</td>
+ </tr>
+ <tr>
+ <th>00_1</th>
+ <th></th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>14</td>
+ <td>1</td>
+ <td></td>
+ <td bgcolor="white">15</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>1</td>
+ <td>2</td>
+ <td>11</td>
+ <td>1</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>01_[a]</th>
+ <th></th>
+ <td></td>
+ <td>32</td>
+ <td>2</td>
+ <td></td>
+ <td>31</td>
+ <td>3</td>
+ <td></td>
+ <td bgcolor="white">68</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>8</td>
+ <td>24</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>2</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>1</td>
+ <td>12</td>
+ <td>1</td>
+ <td></td>
+ <td>17</td>
+ <td></td>
+ <td></td>
+ <td>2</td>
+ <td>1</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>02_7</th>
+ <th></th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>1</td>
+ <td></td>
+ <td></td>
+ <td bgcolor="white">1</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>1</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>03_8</th>
+ <th></th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>1</td>
+ <td></td>
+ <td bgcolor="white">1</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>1</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>04_9</th>
+ <th></th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>5</td>
+ <td></td>
+ <td></td>
+ <td bgcolor="white">5</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>5</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>05_[b]</th>
+ <th></th>
+ <td>33</td>
+ <td>153</td>
+ <td></td>
+ <td>33</td>
+ <td>2</td>
+ <td>5</td>
+ <td>13</td>
+ <td bgcolor="white">239</td>
+ <td>32</td>
+ <td>1</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>153</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>33</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>2</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>5</td>
+ <td></td>
+ <td></td>
+ <td>13</td>
+ </tr>
+ <tr>
+ <th>06_15</th>
+ <th></th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>30</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td bgcolor="white">30</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>30</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>07_18</th>
+ <th></th>
+ <td>18</td>
+ <td>157</td>
+ <td></td>
+ <td>33</td>
+ <td>56</td>
+ <td>125</td>
+ <td>2</td>
+ <td bgcolor="white">391</td>
+ <td></td>
+ <td>18</td>
+ <td></td>
+ <td></td>
+ <td>64</td>
+ <td>7</td>
+ <td>5</td>
+ <td></td>
+ <td>81</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>3</td>
+ <td>30</td>
+ <td>4</td>
+ <td>5</td>
+ <td>2</td>
+ <td></td>
+ <td>5</td>
+ <td>36</td>
+ <td>4</td>
+ <td></td>
+ <td>3</td>
+ <td>24</td>
+ <td>98</td>
+ <td>1</td>
+ <td>1</td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>08_COMPLEX</th>
+ <th></th>
+ <td></td>
+ <td>54</td>
+ <td>33</td>
+ <td>20</td>
+ <td>2</td>
+ <td>1</td>
+ <td></td>
+ <td bgcolor="white">110</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>1</td>
+ <td>53</td>
+ <td></td>
+ <td></td>
+ <td>11</td>
+ <td></td>
+ <td>22</td>
+ <td>10</td>
+ <td></td>
+ <td>10</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>2</td>
+ <td></td>
+ <td>1</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>09_[c]</th>
+ <th></th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>3</td>
+ <td>4</td>
+ <td></td>
+ <td bgcolor="white">7</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>3</td>
+ <td>2</td>
+ <td></td>
+ <td>2</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0A_[d]</th>
+ <th></th>
+ <td>1</td>
+ <td>2</td>
+ <td></td>
+ <td>6</td>
+ <td>25</td>
+ <td>14</td>
+ <td></td>
+ <td bgcolor="white">48</td>
+ <td></td>
+ <td>1</td>
+ <td></td>
+ <td></td>
+ <td>1</td>
+ <td></td>
+ <td>1</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>6</td>
+ <td></td>
+ <td></td>
+ <td>3</td>
+ <td>3</td>
+ <td></td>
+ <td>19</td>
+ <td></td>
+ <td>2</td>
+ <td>3</td>
+ <td>7</td>
+ <td>2</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0B_[e]</th>
+ <th></th>
+ <td>1</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>1</td>
+ <td>1</td>
+ <td>3</td>
+ <td bgcolor="white">6</td>
+ <td></td>
+ <td>1</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>1</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>1</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>3</td>
+ </tr>
+ <tr>
+ <th>X</th>
+ <th></th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td bgcolor="white">0</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ </table>
+ <table border="3">
+ <tr bgcolor="blue">
+ <th></th>
+ <th></th>
+ <td bgcolor="red">00_1</td>
+ <td bgcolor="red">01_[a]</td>
+ <td bgcolor="red">02_7</td>
+ <td bgcolor="red">03_8</td>
+ <td bgcolor="red">04_9</td>
+ <td bgcolor="red">05_[b]</td>
+ <td bgcolor="red">06_15</td>
+ <td bgcolor="red">07_18</td>
+ <td bgcolor="red">08_COMPLEX</td>
+ <td bgcolor="red">09_[c]</td>
+ <td bgcolor="red">0A_[d]</td>
+ <td bgcolor="red">0B_[e]</td>
+ <td bgcolor="red">X</td>
+ </tr>
+ <tr>
+ <th>00</th>
+ <th></th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>33</td>
+ <td>10</td>
+ <td>127</td>
+ <td></td>
+ <td>7</td>
+ <td>44</td>
+ <td>2</td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0E</th>
+ <th></th>
+ <td>1</td>
+ <td>6</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>20</td>
+ <td>1</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>17</th>
+ <th></th>
+ <td>2</td>
+ <td>4</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>110</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>20</th>
+ <th></th>
+ <td>2</td>
+ <td>11</td>
+ <td>1</td>
+ <td></td>
+ <td>5</td>
+ <td>13</td>
+ <td></td>
+ <td>100</td>
+ <td></td>
+ <td></td>
+ <td>4</td>
+ <td>4</td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>21</th>
+ <th></th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>1</td>
+ <td></td>
+ <td>32</td>
+ <td></td>
+ <td>163</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>30</th>
+ <th></th>
+ <td>10</td>
+ <td>47</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>161</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ </table>
+ </body>
+</html>
diff --git a/intl/lwbrk/tools/anzx4051.pl b/intl/lwbrk/tools/anzx4051.pl
new file mode 100644
index 0000000000..e76eac6207
--- /dev/null
+++ b/intl/lwbrk/tools/anzx4051.pl
@@ -0,0 +1,356 @@
+#!/usr/bin/perl
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+######################################################################
+#
+# Initial global variable
+#
+######################################################################
+%utot = ();
+$ui=0;
+$li=0;
+
+######################################################################
+#
+# Open the unicode database file
+#
+######################################################################
+open ( UNICODATA , "< ../../unicharutil/tools/UnicodeData-Latest.txt")
+ || die "cannot find UnicodeData-Latest.txt";
+
+######################################################################
+#
+# Open the JIS X 4051 Class file
+#
+######################################################################
+open ( CLASS , "< jisx4051class.txt")
+ || die "cannot find jisx4051class.txt";
+
+######################################################################
+#
+# Open the JIS X 4051 Class simplified mapping
+#
+######################################################################
+open ( SIMP , "< jisx4051simp.txt")
+ || die "cannot find jisx4051simp.txt";
+
+######################################################################
+#
+# Open the output file
+#
+######################################################################
+open ( OUT , "> anzx4051.html")
+ || die "cannot open output anzx4051.html file";
+
+######################################################################
+#
+# Open the generated C header file
+#
+######################################################################
+open ( HEADER , "> ../jisx4051class.h")
+ || die "cannot open output ../jisx4051class.h file";
+
+######################################################################
+#
+# Generate the license and opening markup of the HTML report
+#
+######################################################################
+$hthmlheader = <<END_OF_HTML;
+<!-- This Source Code Form is subject to the terms of the Mozilla Public
+ - License, v. 2.0. If a copy of the MPL was not distributed with this
+ - file, You can obtain one at http://mozilla.org/MPL/2.0/. -->
+
+<HTML>
+<HEAD>
+<TITLE>
+Analysis of JIS X 4051 to Unicode General Category Mapping
+</TITLE>
+</HEAD>
+<BODY>
+<H1>
+Analysis of JIS X 4051 to Unicode General Category Mapping
+</H1>
+END_OF_HTML
+print OUT $hthmlheader;
+
+######################################################################
+#
+# Generate the license banner of the C header
+#
+######################################################################
+$npl = <<END_OF_NPL;
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+/*
+ DO NOT EDIT THIS DOCUMENT !!! THIS DOCUMENT IS GENERATED BY
+ mozilla/intl/lwbrk/tools/anzx4051.pl
+ */
+END_OF_NPL
+print HEADER $npl;
+
+%occ = ();
+%gcat = ();
+%dcat = ();
+%simp = ();
+%gcount = ();
+%dcount = ();
+%sccount = ();
+%rangecount = ();
+
+######################################################################
+#
+# Read the Unicode database line by line.  For every code point listed,
+# record its general category (third field, e.g. "Lu") and the first
+# letter of that category (e.g. "L"), and count the code points seen in
+# each.
+#
+######################################################################
+while (<UNICODATA>) {
+  # chomp strips only a trailing newline; chop would remove a data
+  # character from a final line that has none.
+  chomp;
+  @f = split(/;/ , $_);
+  $c = $f[0];             # The unicode value, as the hex string in the file
+  $g = $f[2];             # General category
+  $d = substr($g, 0, 1);  # First letter of the general category
+
+  $gcat{$c} = $g;
+  $dcat{$c} = $d;
+  $gcount{$g}++;
+  $dcount{$d}++;
+}
+# Close the handle the loop read from (it is named UNICODATA, not UNIDATA).
+close(UNICODATA);
+
+# Load the mapping from class number to simplified class name
+# ("<number>;<name>" per line) and count the numbers mapped to each name.
+while (<SIMP>) {
+  # The name is the last field, so use chomp: chop would truncate the name
+  # on a final line that has no trailing newline.
+  chomp;
+  @f = split(/;/ , $_);
+
+  $simp{$f[0]} = $f[1];
+  $sccount{$f[1]}++;
+}
+close(SIMP);
+
+# Return the general category recorded in %gcat for code point $u (a
+# number).  Code points without an entry of their own fall back to the
+# range they belong to.  When nothing matches, a warning is printed and an
+# empty string is returned.
+sub GetClass {
+  my ($u) = @_;
+  my $hex = DecToHex($u);
+  # Lexical, so the lookup does not overwrite the global $g.
+  my $g = $gcat{$hex};
+  return $g    if defined($g) && $g ne "";
+  return "Han" if 0x3400 <= $u && $u <= 0x9fa5;
+  return "Lo"  if 0xac00 <= $u && $u <= 0xd7a3;
+  return "Cs"  if 0xd800 <= $u && $u <= 0xdfff;
+  return "Co"  if 0xe000 <= $u && $u <= 0xf8ff;
+  printf "WARNING !!!! Cannot find General Category for U+%s \n" , $hex;
+  # Explicit result instead of leaking printf's return value to the caller.
+  return "";
+}
+# Return the one-letter class recorded in %dcat for code point $u (a
+# number).  Code points without an entry of their own fall back to the
+# range they belong to.  When nothing matches, a warning is printed and an
+# empty string is returned.
+sub GetDClass {
+  my ($u) = @_;
+  my $hex = DecToHex($u);
+  # Lexical, so the lookup does not overwrite the global $g.
+  my $g = $dcat{$hex};
+  return $g    if defined($g) && $g ne "";
+  return "Han" if 0x3400 <= $u && $u <= 0x9fa5;
+  return "L"   if 0xac00 <= $u && $u <= 0xd7a3;
+  return "C"   if 0xd800 <= $u && $u <= 0xdfff;
+  return "C"   if 0xe000 <= $u && $u <= 0xf8ff;
+  printf "WARNING !!!! Cannot find Detailed General Category for U+%s \n" , $hex;
+  # Explicit result instead of leaking printf's return value to the caller.
+  return "";
+}
+# Format a number as upper-case hexadecimal, zero-padded to at least four
+# digits.
+sub DecToHex {
+  my $value = shift;
+  return sprintf("%04X", $value);
+}
+%gtotal = ();
+%dtotal = ();
+# Assign every code point covered by the class file to a simplified class.
+# Each line is "<first>;<last>;<class>", with <last> empty for a single code
+# point.  The first line that covers a code point wins; later lines only
+# fill in code points that are still unassigned.
+while (<CLASS>) {
+  # The class is the last field, so use chomp: chop would truncate it on a
+  # final line that has no trailing newline.
+  chomp;
+  @f = split(/;/ , $_);
+
+  # Lines whose class starts with "a" are ignored.
+  next if substr($f[2], 0, 1) eq "a";
+
+  $sc = $simp{$f[2]};
+  $l = hex($f[0]);
+  $h = ($f[1] eq "") ? $l : hex($f[1]);
+  for ($k = $l; $k <= $h; $k++) {
+    # Already claimed by an earlier line: keep the first definition.
+    next if exists($occ{$k});
+
+    $occ{$k} = $sc . " | " . $f[2];
+    $gclass = GetClass($k);
+    $dclass = GetDClass($k);
+    $gtotal{$sc . $gclass}++;
+    $dtotal{$sc . $dclass}++;
+    $u = DecToHex($k);
+    # Tally per (first two hex digits of the code point, simplified class).
+    $rk = " " . substr($u,0,2) . ":" . $sc;
+    $rangecount{$rk}++;
+  }
+}
+
+#print %gtotal;
+#print %dtotal;
+
+# Write two tables to OUT:
+#  1. one row per simplified class, giving the number of code points counted
+#     in %dtotal for each one-letter class, their sum, and the number counted
+#     in %gtotal for each general category;
+#  2. one row per two-hex-digit code point prefix (00 .. 4E), giving the
+#     number of code points counted in %rangecount for each simplified class.
+# Takes no arguments; reads the global hashes and assigns several globals.
+sub printreport
+{
+ print OUT "<TABLE BORDER=3>\n";
+ print OUT "<TR BGCOLOR=blue><TH><TH>\n";
+
+ # Header row: one-letter classes, "Total", then general categories.
+ foreach $d (sort(keys %dcount)) {
+ print OUT "<TD BGCOLOR=red>$d</TD>\n";
+ }
+
+ print OUT "<TD BGCOLOR=white>Total</TD>\n";
+ foreach $g (sort(keys %gcount)) {
+ print OUT "<TD BGCOLOR=yellow>$g</TD>\n";
+ }
+ print OUT "</TR>\n";
+ foreach $sc (sort(keys %sccount)) {
+
+ print OUT "<TR><TH>$sc<TH>\n";
+
+ # $total sums the one-letter class counts of this row.
+ $total = 0;
+ foreach $d (sort (keys %dcount)) {
+ $count = $dtotal{$sc . $d};
+ $total += $count;
+ print OUT "<TD>$count</TD>\n";
+ }
+
+ print OUT "<TD BGCOLOR=white>$total</TD>\n";
+
+ foreach $g (sort(keys %gcount)) {
+ $count = $gtotal{$sc . $g};
+ print OUT "<TD>$count</TD>\n";
+ }
+
+
+ print OUT "</TR>\n";
+ }
+ print OUT "</TABLE>\n";
+
+
+ # Second table: rows are code point prefixes, columns simplified classes.
+ print OUT "<TABLE BORDER=3>\n";
+ print OUT "<TR BGCOLOR=blue><TH><TH>\n";
+
+ foreach $sc (sort(keys %sccount))
+ {
+ print OUT "<TD BGCOLOR=red>$sc</TD>\n";
+ }
+
+ print OUT "</TR>\n";
+
+
+ for($rr = 0; $rr < 0x4f; $rr++)
+ {
+ # Despite its name, $empty accumulates the row's total count; the row is
+ # buffered in $tmp and only printed when that total is not 0.
+ $empty = 0;
+ $r = sprintf("%02X" , $rr) ;
+ $tmp = "<TR><TH>" . $r . "<TH>\n";
+
+ foreach $sc (sort(keys %sccount)) {
+ # Key format matches the one used when %rangecount was filled.
+ $count = $rangecount{ " " .$r . ":" .$sc};
+ $tmp .= sprintf("<TD>%s</TD>\n", $count);
+ $empty += $count;
+ }
+
+ $tmp .= "</TR>\n";
+
+ if($empty ne 0)
+ {
+ print OUT $tmp;
+ }
+ }
+ print OUT "</TABLE>\n";
+
+}
+printreport();
+
+# Write the table gLBClass<r> to HEADER.  $r is a two-digit hex string; the
+# table covers the 256 code points whose value is hex($r) * 256 + 0..255.
+# Each of the 32 uint32_t entries packs eight consecutive code points, one
+# hex digit per code point, with the highest of the eight written first
+# (most significant digit).  The digit is the second character of the value
+# stored in %occ for the code point (e.g. "5" for "05_[b] | ..."), or $def
+# when the code point has no entry in %occ.
+sub printarray
+{
+ my($r, $def) = @_;
+# Progress line on standard output, not part of the generated header.
+printf "[%s || %s]\n", $r, $def;
+ $k = hex($r) * 256;
+ printf HEADER "static const uint32_t gLBClass%s[32] = {\n", $r;
+ for($i = 0 ; $i < 256; $i+= 8)
+ {
+ # Walk the group from its highest code point down so the digits come out
+ # most-significant first.
+ for($j = 7 ; $j >= 0; $j-- )
+ {
+ $v = $k + $i + $j;
+ if( exists($occ{$v}))
+ {
+ $p = substr($occ{$v}, 1,1);
+ } else {
+ $p = $def;
+ }
+
+ # The first digit of each entry carries the "0x" prefix.
+ if($j eq 7 )
+ {
+ printf HEADER "0x%s" , $p;
+ } else {
+ printf HEADER "%s", $p ;
+ }
+ }
+ printf HEADER ", // U+%04X - U+%04X\n", $k + $i ,( $k + $i + 7);
+ }
+ print HEADER "};\n\n";
+}
+# One table per 256-code-point block; the second argument is the digit used
+# for code points that have no entry in %occ.
+printarray("00", "7");
+printarray("20", "7");
+printarray("21", "7");
+printarray("30", "5");
+printarray("0E", "8");
+printarray("17", "7");
+
+#print %rangecount;
+
+######################################################################
+#
+# Close files
+#
+######################################################################
+close(HEADER);
+close(CLASS);
+close(OUT);
+
diff --git a/intl/lwbrk/tools/jisx4051class.txt b/intl/lwbrk/tools/jisx4051class.txt
new file mode 100644
index 0000000000..c435c1ae55
--- /dev/null
+++ b/intl/lwbrk/tools/jisx4051class.txt
@@ -0,0 +1,159 @@
+0000;001f;17
+0020;;17
+0024;;24
+0027;;18
+0028;;22
+002D;;18
+002F;;18
+0021;002F;23
+0030;0039;15
+003C;;22
+003A;003F;23
+0040;;18
+0041;005A;18
+005B;;22
+005E;;18
+005F;;18
+005B;005F;23
+0060;;18
+0061;007A;18
+007B;;22
+007B;007E;23
+00A0;;24
+00A3;;22
+00A5;;22
+00A9;;18
+00AA;;18
+00AB;;18
+00AC;;22
+00AE;;18
+00AF;;18
+00A1;00BF;23
+00B0;;18
+00F7;;23
+00C0;00FF;18
+0E3F;;1
+0E2F;;4
+0E46;;4
+0E5A;0E5B;4
+0E50;0E59;15
+0E4F;;18
+0EAF;;4
+0EC6;;4
+0ED0;0ED9;15
+1735;1736;1
+17D4;17D5;4
+17D8;;4
+17DA;;4
+1780;17DD;21
+17E0;17E9;21
+17F0;17F9;21
+2007;;24
+2000;200B;17
+200C;200F;18
+2010;;18
+2011;;24
+2012;2013;18
+2014;;7
+2015;;18
+2016;2017;18
+2019;;23
+201D;;23
+2018;201F;18
+2020;2023;18
+2024;2026;2
+2027;;23
+2028;202E;18
+202F;;24
+2030;2034;9
+2035;2038;18
+2039;;1
+203A;;2
+203B;;12
+203C;203D;3
+203E;;23
+203F;2043;18
+2044;;3
+2045;;1
+2046;;2
+2047;2049;3
+204A;205E;18
+205F;;17
+2060;;24
+2061;2063;18
+206A;206F;18
+2070;2071;18
+2074;208E;18
+2090;2094;18
+2116;;8
+2160;217F;12
+2190;21EA;a12
+2126;;18
+2100;2138;18
+2153;2182;18
+2190;21EA;18
+3008;;1
+300A;;1
+300C;;1
+300E;;1
+3010;;1
+3014;;1
+3016;;1
+3018;;1
+301A;;1
+301D;;1
+3001;;2
+3009;;2
+300B;;2
+300D;;2
+300F;;2
+3011;;2
+3015;;2
+3017;;2
+3019;;2
+301B;;2
+301E;;2
+301F;;2
+3005;;3
+301C;;3
+3041;;3
+3043;;3
+3045;;3
+3047;;3
+3049;;3
+3063;;3
+3083;;3
+3085;;3
+3087;;3
+308E;;3
+309D;;3
+309E;;3
+30A1;;3
+30A3;;3
+30A5;;3
+30A7;;3
+30A9;;3
+30C3;;3
+30E3;;3
+30E5;;3
+30E7;;3
+30EE;;3
+30F5;;3
+30F6;;3
+30FC;;3
+30FD;;3
+30FE;;3
+30FB;;5
+3002;;6
+3000;;10
+3042;3094;11
+3099;309E;3
+3003;;12
+3004;;12
+3006;;12
+3007;;12
+3012;;12
+3013;;12
+3020;;12
+3036;;12
+30A2;30FA;12
diff --git a/intl/lwbrk/tools/jisx4051simp.txt b/intl/lwbrk/tools/jisx4051simp.txt
new file mode 100644
index 0000000000..e12a7fd805
--- /dev/null
+++ b/intl/lwbrk/tools/jisx4051simp.txt
@@ -0,0 +1,24 @@
+1;00_1
+2;01_[a]
+3;01_[a]
+4;01_[a]
+5;01_[a]
+6;01_[a]
+7;02_7
+8;03_8
+9;04_9
+10;05_[b]
+11;05_[b]
+12;05_[b]
+13;X
+14;X
+15;06_15
+16;X
+17;05_[b]
+18;07_18
+19;X
+20;X
+21;08_COMPLEX
+22;09_[c]
+23;0A_[d]
+24;0B_[e]
diff --git a/intl/lwbrk/tools/spec_table.html b/intl/lwbrk/tools/spec_table.html
new file mode 100644
index 0000000000..b7a642a332
--- /dev/null
+++ b/intl/lwbrk/tools/spec_table.html
@@ -0,0 +1,664 @@
+<!-- This Source Code Form is subject to the terms of the Mozilla Public
+ - License, v. 2.0. If a copy of the MPL was not distributed with this
+ - file, You can obtain one at http://mozilla.org/MPL/2.0/. -->
+
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+<html>
+ <head>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+ <title></title>
+ <style type="text/css">
+ table {
+ border: solid 1px;
+ border-collapse: collapse;
+ }
+ tbody,
+ tfoot {
+ border-top: solid 2px;
+ }
+ td,
+ th {
+ border: solid 1px;
+ }
+ td {
+ text-align: center;
+ }
+ </style>
+ </head>
+ <body>
+ <p>This is a specification table for line breaking.</p>
+ <p>
+ The values of IE7 and Opera9: 'A' means that the line is breakable After
+ the character, and 'B' means Before. 'BA' means Before and After.
+ </p>
+ <p>
+ The (C) suffix on the IE7 and Opera9 column names means Character and (N)
+ means Numeric; it tells which kind of character surrounded the tested
+ character. E.g.,
+ "a$a" is a testcase for (C), "0$0" is a testcase for (N).
+ </p>
+ <p>
+ Gecko does not break lines in most Western-language contexts. However, in
+ file paths, URLs and very long words joined by hyphens, some characters
+ may be breakable. They are marked 'breakable' in the table. They are not
+ always breakable, though; it <em>depends on the context</em> within the
+ word.
+ </p>
+ <table border="1">
+ <thead>
+ <tr>
+ <th colspan="2">character</th>
+ <th>Gecko</th>
+ <th>IE7(C)</th>
+ <th>IE7(N)</th>
+ <th>Opera9.2(C)</th>
+ <th>Opera9.2(N)</th>
+ </tr>
+ </thead>
+ <tfoot>
+ <tr>
+ <th colspan="2">character</th>
+ <th>Gecko</th>
+ <th>IE7(C)</th>
+ <th>IE7(N)</th>
+ <th>Opera9.2(C)</th>
+ <th>Opera9.2(N)</th>
+ </tr>
+ </tfoot>
+ <tbody>
+ <tr>
+ <th>0x21</th>
+ <th>&#x21;</th>
+ <td></td>
+ <td>A</td>
+ <td>A</td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x22</th>
+ <th>&#x22;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x23</th>
+ <th>&#x23;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x24</th>
+ <th>&#x24;</th>
+ <td></td>
+ <td></td>
+ <td>B</td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x25</th>
+ <th>&#x25;</th>
+ <td>breakable</td>
+ <td>A</td>
+ <td>A</td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x26</th>
+ <th>&#x26;</th>
+ <td>breakable</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x27</th>
+ <th>&#x27;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x28</th>
+ <th>&#x28;</th>
+ <td></td>
+ <td>B</td>
+ <td>B</td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x29</th>
+ <th>&#x29;</th>
+ <td></td>
+ <td>A</td>
+ <td>A</td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x2A</th>
+ <th>&#x2A;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x2B</th>
+ <th>&#x2B;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x2C</th>
+ <th>&#x2C;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x2D</th>
+ <th>&#x2D;</th>
+ <td>breakable</td>
+ <td>BA</td>
+ <td>BA</td>
+ <td>A</td>
+ <td>A</td>
+ </tr>
+ <tr>
+ <th>0x2E</th>
+ <th>&#x2E;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x2F</th>
+ <th>&#x2F;</th>
+ <td>breakable</td>
+ <td></td>
+ <td></td>
+ <td>A</td>
+ <td>A</td>
+ </tr>
+ </tbody>
+ <tbody>
+ <tr>
+ <th>0x3A</th>
+ <th>&#x3A;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x3B</th>
+ <th>&#x3B;</th>
+ <td>breakable</td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x3C</th>
+ <th>&#x3C;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x3D</th>
+ <th>&#x3D;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x3E</th>
+ <th>&#x3E;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x3F</th>
+ <th>&#x3F;</th>
+ <td></td>
+ <td>A</td>
+ <td>A</td>
+ <td></td>
+ <td></td>
+ </tr>
+ </tbody>
+ <tbody>
+ <tr>
+ <th>0x40</th>
+ <th>&#x40;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ </tbody>
+ <tbody>
+ <tr>
+ <th>0x5B</th>
+ <th>&#x5B;</th>
+ <td></td>
+ <td>B</td>
+ <td>B</td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x5C</th>
+ <th>&#x5C;</th>
+ <td>breakable</td>
+ <td></td>
+ <td>B</td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x5D</th>
+ <th>&#x5D;</th>
+ <td></td>
+ <td>A</td>
+ <td>A</td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x5E</th>
+ <th>&#x5E;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x5F</th>
+ <th>&#x5F;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ </tbody>
+ <tbody>
+ <tr>
+ <th>0x60</th>
+ <th>&#x60;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ </tbody>
+ <tbody>
+ <tr>
+ <th>0x7B</th>
+ <th>&#x7B;</th>
+ <td></td>
+ <td>B</td>
+ <td>B</td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x7C</th>
+ <th>&#x7C;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>A</td>
+ <td>A</td>
+ </tr>
+ <tr>
+ <th>0x7D</th>
+ <th>&#x7D;</th>
+ <td></td>
+ <td>A</td>
+ <td>A</td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0x7E</th>
+ <th>&#x7E;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ </tbody>
+ <tbody>
+ <tr>
+ <th>0xA1</th>
+ <th>&#xA1;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xA2</th>
+ <th>&#xA2;</th>
+ <td></td>
+ <td>A</td>
+ <td>A</td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xA3</th>
+ <th>&#xA3;</th>
+ <td></td>
+ <td></td>
+ <td>B</td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xA4</th>
+ <th>&#xA4;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xA5</th>
+ <th>&#xA5;</th>
+ <td></td>
+ <td></td>
+ <td>B</td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xA6</th>
+ <th>&#xA6;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xA7</th>
+ <th>&#xA7;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xA8</th>
+ <th>&#xA8;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xA9</th>
+ <th>&#xA9;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xAA</th>
+ <th>&#xAA;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xAB</th>
+ <th>&#xAB;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xAC</th>
+ <th>&#xAC;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xAE</th>
+ <th>&#xAE;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xAF</th>
+ <th>&#xAF;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ </tbody>
+ <tbody>
+ <tr>
+ <th>0xB0</th>
+ <th>&#xB0;</th>
+ <td></td>
+ <td>A</td>
+ <td>A</td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xB1</th>
+ <th>&#xB1;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xB2</th>
+ <th>&#xB2;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xB3</th>
+ <th>&#xB3;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xB4</th>
+ <th>&#xB4;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td>B</td>
+ <td>B</td>
+ </tr>
+ <tr>
+ <th>0xB5</th>
+ <th>&#xB5;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xB6</th>
+ <th>&#xB6;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xB7</th>
+ <th>&#xB7;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xB8</th>
+ <th>&#xB8;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xB9</th>
+ <th>&#xB9;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xBA</th>
+ <th>&#xBA;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xBB</th>
+ <th>&#xBB;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xBC</th>
+ <th>&#xBC;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xBD</th>
+ <th>&#xBD;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xBE</th>
+ <th>&#xBE;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ <tr>
+ <th>0xBF</th>
+ <th>&#xBF;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ </tbody>
+ <tbody>
+ <tr>
+ <th>0xD7</th>
+ <th>&#xD7;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ </tbody>
+ <tbody>
+ <tr>
+ <th>0xF7</th>
+ <th>&#xF7;</th>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ </tr>
+ </tbody>
+ </table>
+ </body>
+</html>