summaryrefslogtreecommitdiffstats
path: root/intl/lwbrk
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--intl/lwbrk/LineBreaker.cpp1169
-rw-r--r--intl/lwbrk/LineBreaker.h88
-rw-r--r--intl/lwbrk/WordBreaker.cpp218
-rw-r--r--intl/lwbrk/WordBreaker.h53
-rw-r--r--intl/lwbrk/crashtests/416721.html11
-rw-r--r--intl/lwbrk/crashtests/crashtests.list1
-rw-r--r--intl/lwbrk/gtest/TestLineBreak.cpp283
-rw-r--r--intl/lwbrk/gtest/moz.build11
-rw-r--r--intl/lwbrk/jisx4051class.h217
-rw-r--r--intl/lwbrk/jisx4051pairtable.txt286
-rw-r--r--intl/lwbrk/moz.build40
-rw-r--r--intl/lwbrk/nsCarbonBreaker.cpp43
-rw-r--r--intl/lwbrk/nsComplexBreaker.h18
-rw-r--r--intl/lwbrk/nsLWBrkCIID.h28
-rw-r--r--intl/lwbrk/nsPangoBreaker.cpp58
-rw-r--r--intl/lwbrk/nsRuleBreaker.cpp17
-rw-r--r--intl/lwbrk/nsUniscribeBreaker.cpp60
-rw-r--r--intl/lwbrk/rulebrk.c388
-rw-r--r--intl/lwbrk/rulebrk.h26
-rw-r--r--intl/lwbrk/th_char.h133
-rw-r--r--intl/lwbrk/tools/anzx4051.html669
-rw-r--r--intl/lwbrk/tools/anzx4051.pl356
-rw-r--r--intl/lwbrk/tools/jisx4051class.txt159
-rw-r--r--intl/lwbrk/tools/jisx4051simp.txt24
-rw-r--r--intl/lwbrk/tools/spec_table.html127
25 files changed, 4483 insertions, 0 deletions
diff --git a/intl/lwbrk/LineBreaker.cpp b/intl/lwbrk/LineBreaker.cpp
new file mode 100644
index 0000000000..d4c78c789e
--- /dev/null
+++ b/intl/lwbrk/LineBreaker.cpp
@@ -0,0 +1,1169 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "mozilla/intl/LineBreaker.h"
+
+#include "jisx4051class.h"
+#include "nsComplexBreaker.h"
+#include "nsTArray.h"
+#include "nsUnicodeProperties.h"
+#include "mozilla/ArrayUtils.h"
+
+using namespace mozilla::unicode;
+using namespace mozilla::intl;
+
+// Factory: constructs a new LineBreaker and returns it to the caller as an
+// already_AddRefed reference.
+/*static*/
+already_AddRefed<LineBreaker> LineBreaker::Create() {
+  RefPtr<LineBreaker> breaker = new LineBreaker();
+  return breaker.forget();
+}
+
+/*
+
+ Simplification of Pair Table in JIS X 4051
+
+ 1. The Original Table - in 4.1.3
+
+ In JIS X 4051, the pair table is defined as below
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20
+ * # * #
+ 1 X X X X X X X X X X X X X X X X X X X X X E
+ 2 X X X X X X
+ 3 X X X X X X
+ 4 X X X X X X
+ 5 X X X X X X
+ 6 X X X X X X
+ 7 X X X X X X X
+ 8 X X X X X X E
+ 9 X X X X X X
+ 10 X X X X X X
+ 11 X X X X X X
+ 12 X X X X X X
+ 13 X X X X X X X
+ 14 X X X X X X X
+ 15 X X X X X X X X X
+ 16 X X X X X X X X
+ 17 X X X X X E
+ 18 X X X X X X X X X
+ 19 X E E E E E X X X X X X X X X X X X E X E E
+ 20 X X X X X E
+
+ * Same Char
+ # Other Char
+
+ X Cannot Break
+
+ The classes mean:
+ 1: Open parenthesis
+ 2: Close parenthesis
+ 3: Prohibit a line break before
+ 4: Punctuation for sentence end (except Full stop, e.g., "!" and "?")
+ 5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT)
+ 6: Full stop
+ 7: Non-breakable between same characters
+ 8: Prefix (e.g., "$", "NO.")
+ 9: Postfix (e.g., "%")
+ 10: Ideographic space
+ 11: Hiragana
+ 12: Japanese characters (except class 11)
+ 13: Subscript
+ 14: Ruby
+ 15: Numeric
+ 16: Alphabet
+ 17: Space for Western language
+ 18: Western characters (except class 17)
+ 19: Split line note (Warichu) begin quote
+ 20: Split line note (Warichu) end quote
+
+ 2. Simplified by removing the classes which we do not care about
+
+ However, since we do not care about class 13 (Subscript), 14 (Ruby),
+ 16 (Alphabet), 19 (split line note begin quote), and 20 (split line note end
+ quote), we can simplify this pair table into the following
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 2 3 4 5 6 7 8 9 10 11 12 15 17 18
+
+ 1 X X X X X X X X X X X X X X X
+ 2 X X X X X
+ 3 X X X X X
+ 4 X X X X X
+ 5 X X X X X
+ 6 X X X X X
+ 7 X X X X X X
+ 8 X X X X X X
+ 9 X X X X X
+ 10 X X X X X
+ 11 X X X X X
+ 12 X X X X X
+ 15 X X X X X X X X
+ 17 X X X X X
+ 18 X X X X X X X
+
+ 3. Simplified by merged classes
+
+ After the second simplification, the pair table has some duplication:
+ a. classes 2, 3, 4, 5, 6 are the same - we can merge them
+ b. classes 10, 11, 12, 17 are the same - we can merge them
+
+ We introduce an extra non-breaking pair at [b]/7 to better match
+ the expectations of CSS line-breaking as tested by WPT tests.
+ This added entry is marked as * in the tables below.
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 [a] 7 8 9 [b]15 18
+
+ 1 X X X X X X X X
+ [a] X
+ 7 X X
+ 8 X X
+ 9 X
+ [b] X *
+ 15 X X X X
+ 18 X X X
+
+
+ 4. We add COMPLEX characters and make them breakable with all other classes
+ except after class 1 and before class [a]
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 [a] 7 8 9 [b]15 18 COMPLEX
+
+ 1 X X X X X X X X X
+ [a] X
+ 7 X X
+ 8 X X
+ 9 X
+ [b] X *
+ 15 X X X X
+ 18 X X X
+ COMPLEX X T
+
+ T : need special handling
+
+
+ 5. However, we need two special classes for some punctuation/parentheses
+ whose breaking rules are like those of the character class (18); see bug
+ 389056. We also need punctuation-like characters that behave the same as
+ class 18 but that are not letters (e.g., '_').
+ [c]. Based on the open parenthesis class (1), but it is not breakable after
+ character class (18) or numeric class (15).
+ [d]. Based on the close parenthesis (or punctuation) class (2), but it is not
+ breakable before character class (18) or numeric class (15).
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d]
+
+ 1 X X X X X X X X X X X
+ [a] X X X
+ 7 X X
+ 8 X X
+ 9 X
+ [b] X * X
+ 15 X X X X X X
+ 18 X X X X X
+ COMPLEX X T
+ [c] X X X X X X X X X X X
+ [d] X X X X
+
+
+ 6. Unicode also has "NON-BREAK" characters. Lines should not be broken around
+ them. JIS X 4051 has no such class, so we create [e].
+
+   Class of
+   Leading    Class of Trailing Char Class
+   Char
+
+              1    [a]  7    8    9    [b]  15   18   COMPLEX  [c]  [d]  [e]
+
+   1          X    X    X    X    X    X    X    X    X        X    X    X
+   [a]             X                                                X    X
+   7               X    X                                                X
+   8               X                        X                            X
+   9               X                                                     X
+   [b]             X    *                                           X    X
+   15              X              X         X    X             X    X    X
+   18              X                        X    X             X    X    X
+   COMPLEX         X                                  T                  X
+   [c]        X    X    X    X    X    X    X    X    X        X    X    X
+   [d]             X                        X    X                  X    X
+   [e]        X    X    X    X    X    X    X    X    X        X    X    X
+
+
+ 7. Now we use one bit to encode whether it is breakable, and use 2 bytes
+ for one row, then the bit table will look like:
+
+ 18 <- 1
+
+ 1 0000 1111 1111 1111 = 0x0FFF
+ [a] 0000 1100 0000 0010 = 0x0C02
+ 7 0000 1000 0000 0110 = 0x0806
+ 8 0000 1000 0100 0010 = 0x0842
+ 9 0000 1000 0000 0010 = 0x0802
+ [b] 0000 1100 0000 0110 = 0x0C06
+ 15 0000 1110 1101 0010 = 0x0ED2
+ 18 0000 1110 1100 0010 = 0x0EC2
+ COMPLEX 0000 1001 0000 0010 = 0x0902
+ [c] 0000 1111 1111 1111 = 0x0FFF
+ [d] 0000 1100 1100 0010 = 0x0CC2
+ [e] 0000 1111 1111 1111 = 0x0FFF
+*/
+
+#define MAX_CLASSES 12
+
+// Pair table from step 7 above, one 16-bit row per leading class. Bit N
+// (counting from the least significant bit) is set when a break is NOT
+// allowed before a trailing character of class N.
+static const uint16_t gPair[MAX_CLASSES] = {
+    0x0FFF,  // 1
+    0x0C02,  // [a]
+    0x0806,  // 7
+    0x0842,  // 8
+    0x0802,  // 9
+    0x0C06,  // [b]
+    0x0ED2,  // 15
+    0x0EC2,  // 18
+    0x0902,  // COMPLEX
+    0x0FFF,  // [c]
+    0x0CC2,  // [d]
+    0x0FFF,  // [e]
+};
+
+/*
+
+ 8. If the character is not far enough from the word start, the word end, or
+ another break point, we should not break in non-CJK languages.
+ I.e., don't break around 15, 18, [c] and [d], but leave the rules unchanged
+ where [b] is involved.
+
+   Class of
+   Leading    Class of Trailing Char Class
+   Char
+
+              1    [a]  7    8    9    [b]  15   18   COMPLEX  [c]  [d]  [e]
+
+   1          X    X    X    X    X    X    X    X    X        X    X    X
+   [a]             X                        X    X             X    X    X
+   7               X    X                   X    X             X    X    X
+   8               X                        X    X             X    X    X
+   9               X                        X    X             X    X    X
+   [b]             X    *                                           X    X
+   15         X    X    X    X    X         X    X    X        X    X    X
+   18         X    X    X    X    X         X    X    X        X    X    X
+   COMPLEX         X                        X    X    T        X    X    X
+   [c]        X    X    X    X    X    X    X    X    X        X    X    X
+   [d]        X    X    X    X    X         X    X    X        X    X    X
+   [e]        X    X    X    X    X    X    X    X    X        X    X    X
+
+ 18 <- 1
+
+ 1 0000 1111 1111 1111 = 0x0FFF
+ [a] 0000 1110 1100 0010 = 0x0EC2
+ 7 0000 1110 1100 0110 = 0x0EC6
+ 8 0000 1110 1100 0010 = 0x0EC2
+ 9 0000 1110 1100 0010 = 0x0EC2
+ [b] 0000 1100 0000 0110 = 0x0C06
+ 15 0000 1111 1101 1111 = 0x0FDF
+ 18 0000 1111 1101 1111 = 0x0FDF
+ COMPLEX 0000 1111 1100 0010 = 0x0FC2
+ [c] 0000 1111 1111 1111 = 0x0FFF
+ [d] 0000 1111 1101 1111 = 0x0FDF
+ [e] 0000 1111 1111 1111 = 0x0FFF
+*/
+
+// Conservative pair table from step 8 above, one 16-bit row per leading
+// class. Bit N (counting from the least significant bit) is set when a break
+// is NOT allowed before a trailing character of class N.
+static const uint16_t gPairConservative[MAX_CLASSES] = {
+    0x0FFF,  // 1
+    0x0EC2,  // [a]
+    0x0EC6,  // 7
+    0x0EC2,  // 8
+    0x0EC2,  // 9
+    0x0C06,  // [b]
+    0x0FDF,  // 15
+    0x0FDF,  // 18
+    0x0FC2,  // COMPLEX
+    0x0FFF,  // [c]
+    0x0FDF,  // [d]
+    0x0FFF,  // [e]
+};
+
+/*
+
+ 9. Now we map the class to number
+
+ 0: 1
+ 1: [a]- 2, 3, 4, 5, 6
+ 2: 7
+ 3: 8
+ 4: 9
+ 5: [b]- 10, 11, 12, 17
+ 6: 15
+ 7: 18
+ 8: COMPLEX
+ 9: [c]
+ A: [d]
+ B: [e]
+
+ and they mean:
+ 0: Open parenthesis
+ 1: Punctuation that prohibits break before
+ 2: Non-breakable between same classes
+ 3: Prefix
+ 4: Postfix
+ 5: Breakable character (Spaces and Most Japanese characters)
+ 6: Numeric
+ 7: Characters
+ 8: Need special handling characters (E.g., Thai)
+ 9: Open parentheses like Character (See bug 389056)
+ A: Close parenthesis (or punctuation) like Character (See bug 389056)
+ B: Non breakable (See bug 390920)
+
+*/
+
+// Sentinel meaning "no class yet"; it is larger than any valid class index.
+#define CLASS_NONE INT8_MAX
+
+// Class indices. These are the row/bit numbers used by gPair and
+// gPairConservative, following the numbering listed in step 9 above.
+#define CLASS_OPEN 0x00
+#define CLASS_CLOSE 0x01
+#define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02
+#define CLASS_PREFIX 0x03
+// NOTE(review): "POSTFFIX" looks like a misspelling of "POSTFIX"; the name is
+// left as-is because it may be referenced elsewhere.
+#define CLASS_POSTFFIX 0x04
+#define CLASS_BREAKABLE 0x05
+#define CLASS_NUMERIC 0x06
+#define CLASS_CHARACTER 0x07
+#define CLASS_COMPLEX 0x08
+#define CLASS_OPEN_LIKE_CHARACTER 0x09
+#define CLASS_CLOSE_LIKE_CHARACTER 0x0A
+#define CLASS_NON_BREAKABLE 0x0B
+
+// Named UTF-16 code units for the characters that get special treatment.
+#define U_NULL char16_t(0x0000)
+#define U_SLASH char16_t('/')
+#define U_SPACE char16_t(' ')
+#define U_HYPHEN char16_t('-')
+#define U_EQUAL char16_t('=')
+#define U_PERCENT char16_t('%')
+#define U_AMPERSAND char16_t('&')
+#define U_SEMICOLON char16_t(';')
+#define U_BACKSLASH char16_t('\\')
+#define U_OPEN_SINGLE_QUOTE char16_t(0x2018)
+#define U_OPEN_DOUBLE_QUOTE char16_t(0x201C)
+#define U_OPEN_GUILLEMET char16_t(0x00AB)
+
+// True for the characters whose class depends on their neighbours; these are
+// the characters handled explicitly by ContextualAnalysis().
+#define NEED_CONTEXTUAL_ANALYSIS(c) \
+ (IS_HYPHEN(c) || (c) == U_SLASH || (c) == U_PERCENT || (c) == U_AMPERSAND || \
+ (c) == U_SEMICOLON || (c) == U_BACKSLASH || (c) == U_OPEN_SINGLE_QUOTE || \
+ (c) == U_OPEN_DOUBLE_QUOTE || (c) == U_OPEN_GUILLEMET)
+
+// True for the ASCII digits '0'..'9'.
+#define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039)
+
+// Looks up a 4-bit class value in a packed table: each 32-bit word of |t|
+// holds eight consecutive entries, lowest nibble first. |l| is the entry
+// index.
+static inline int GETCLASSFROMTABLE(const uint32_t* t, uint16_t l) {
+  const uint32_t packedWord = t[l >> 3];
+  const int nibbleShift = (l & 0x0007) << 2;
+  return (packedWord >> nibbleShift) & 0x000f;
+}
+
+// Returns 1 when |u| lies in the inclusive range U+FF66..U+FF70 (the halfwidth
+// forms treated as JIS X 4051 class 3 by GetClass), otherwise 0.
+static inline int IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u) {
+  if (u < 0xff66) {
+    return 0;
+  }
+  return u <= 0xff70;
+}
+
+// Returns 1 when |u| falls inside one of the code point ranges this file
+// treats as CJK, otherwise 0. All ranges are inclusive.
+static inline int IS_CJK_CHAR(char32_t u) {
+  static const struct {
+    char32_t mFirst;
+    char32_t mLast;
+  } kCJKRanges[] = {
+      {0x1100, 0x11ff},   {0x2e80, 0xd7ff},   {0xf900, 0xfaff},
+      {0xff00, 0xffef},   {0x20000, 0x2fffd},
+  };
+  for (const auto& range : kCJKRanges) {
+    if (range.mFirst <= u && u <= range.mLast) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+// Returns true for U+00A0 NO-BREAK SPACE and U+2007 FIGURE SPACE.
+//
+// The parameter is a full code point rather than a UTF-16 code unit: callers
+// in this file pass char32_t values, and a 16-bit parameter would silently
+// truncate supplementary characters (e.g. U+200A0 would compare equal to
+// U+00A0). 8-bit and 16-bit arguments still convert without loss.
+static inline bool IS_NONBREAKABLE_SPACE(char32_t u) {
+  return u == 0x00A0 || u == 0x2007;  // NO-BREAK SPACE, FIGURE SPACE
+}
+
+// Returns true for the characters this file treats as breaking hyphens.
+//
+// The parameter is a full code point rather than a UTF-16 code unit: callers
+// in this file pass char32_t values, and a 16-bit parameter would silently
+// truncate supplementary characters (e.g. U+1002D would compare equal to
+// '-'). 16-bit arguments still convert without loss.
+static inline bool IS_HYPHEN(char32_t u) {
+  return (u == U_HYPHEN || u == 0x2010 ||  // HYPHEN
+          u == 0x2012 ||                   // FIGURE DASH
+          u == 0x2013 ||                   // EN DASH
+#if ANDROID
+          /* Bug 1647377: On Android, we don't have a "platform" backend
+           * that supports Tibetan (nsRuleBreaker.cpp only knows about
+           * Thai), so instead we just treat the TSHEG like a hyphen to
+           * provide basic line-breaking possibilities.
+           */
+          u == 0x0F0B ||  // TIBETAN MARK INTERSYLLABIC TSHEG
+#endif
+          u == 0x058A);  // ARMENIAN HYPHEN
+}
+
+// Maps the code point |u| to one of the CLASS_* values used to index the pair
+// tables.
+//
+// The result is chosen in this order:
+//   1. Overrides for the requested strictness |aLevel| (some of which only
+//      apply when |aIsChineseOrJapanese| is true).
+//   2. For BMP code points, the packed gLBClass* tables and a set of
+//      hand-written special cases.
+//   3. Otherwise, the Unicode line-break class of |u| translated through
+//      sUnicodeLineBreakToClass.
+//
+// Strictness::Anywhere must be handled by the caller; reaching it here trips
+// a debug assertion.
+static int8_t GetClass(uint32_t u, LineBreaker::Strictness aLevel,
+                       bool aIsChineseOrJapanese) {
+  // Mapping for Unicode LineBreak.txt classes to the (simplified) set of
+  // character classes used here.
+  // XXX The mappings here were derived by comparing the Unicode LineBreak
+  // values of BMP characters to the classes our existing GetClass returns
+  // for the same codepoints; in cases where characters with the same
+  // LineBreak class mapped to various classes here, I picked what seemed
+  // the most prevalent equivalence.
+  // Some of these are unclear to me, but currently they are ONLY used
+  // for characters not handled by the old code below, so all the JIS X 4051
+  // special cases should already be accounted for.
+  static const int8_t sUnicodeLineBreakToClass[] = {
+      /* UNKNOWN = 0,                       [XX] */ CLASS_CHARACTER,
+      /* AMBIGUOUS = 1,                     [AI] */ CLASS_CHARACTER,
+      /* ALPHABETIC = 2,                    [AL] */ CLASS_CHARACTER,
+      /* BREAK_BOTH = 3,                    [B2] */ CLASS_CHARACTER,
+      /* BREAK_AFTER = 4,                   [BA] */ CLASS_CHARACTER,
+      /* BREAK_BEFORE = 5,                  [BB] */ CLASS_OPEN_LIKE_CHARACTER,
+      /* MANDATORY_BREAK = 6,               [BK] */ CLASS_CHARACTER,
+      /* CONTINGENT_BREAK = 7,              [CB] */ CLASS_CHARACTER,
+      /* CLOSE_PUNCTUATION = 8,             [CL] */ CLASS_CHARACTER,
+      /* COMBINING_MARK = 9,                [CM] */ CLASS_CHARACTER,
+      /* CARRIAGE_RETURN = 10,              [CR] */ CLASS_BREAKABLE,
+      /* EXCLAMATION = 11,                  [EX] */ CLASS_CHARACTER,
+      /* GLUE = 12,                         [GL] */ CLASS_NON_BREAKABLE,
+      /* HYPHEN = 13,                       [HY] */ CLASS_CHARACTER,
+      /* IDEOGRAPHIC = 14,                  [ID] */ CLASS_BREAKABLE,
+      /* INSEPARABLE = 15,                  [IN] */ CLASS_CLOSE_LIKE_CHARACTER,
+      /* INFIX_NUMERIC = 16,                [IS] */ CLASS_CHARACTER,
+      /* LINE_FEED = 17,                    [LF] */ CLASS_BREAKABLE,
+      /* NONSTARTER = 18,                   [NS] */ CLASS_CLOSE_LIKE_CHARACTER,
+      /* NUMERIC = 19,                      [NU] */ CLASS_NUMERIC,
+      /* OPEN_PUNCTUATION = 20,             [OP] */ CLASS_CHARACTER,
+      /* POSTFIX_NUMERIC = 21,              [PO] */ CLASS_CHARACTER,
+      /* PREFIX_NUMERIC = 22,               [PR] */ CLASS_CHARACTER,
+      /* QUOTATION = 23,                    [QU] */ CLASS_CHARACTER,
+      /* COMPLEX_CONTEXT = 24,              [SA] */ CLASS_CHARACTER,
+      /* SURROGATE = 25,                    [SG] */ CLASS_CHARACTER,
+      /* SPACE = 26,                        [SP] */ CLASS_BREAKABLE,
+      /* BREAK_SYMBOLS = 27,                [SY] */ CLASS_CHARACTER,
+      /* ZWSPACE = 28,                      [ZW] */ CLASS_BREAKABLE,
+      /* NEXT_LINE = 29,                    [NL] */ CLASS_CHARACTER,
+      /* WORD_JOINER = 30,                  [WJ] */ CLASS_NON_BREAKABLE,
+      /* H2 = 31,                           [H2] */ CLASS_BREAKABLE,
+      /* H3 = 32,                           [H3] */ CLASS_BREAKABLE,
+      /* JL = 33,                           [JL] */ CLASS_CHARACTER,
+      /* JT = 34,                           [JT] */ CLASS_CHARACTER,
+      /* JV = 35,                           [JV] */ CLASS_CHARACTER,
+      /* CLOSE_PARENTHESIS = 36,            [CP] */ CLASS_CLOSE_LIKE_CHARACTER,
+      /* CONDITIONAL_JAPANESE_STARTER = 37, [CJ] */ CLASS_CLOSE,
+      /* HEBREW_LETTER = 38,                [HL] */ CLASS_CHARACTER,
+      /* REGIONAL_INDICATOR = 39,           [RI] */ CLASS_CHARACTER,
+      /* E_BASE = 40,                       [EB] */ CLASS_BREAKABLE,
+      /* E_MODIFIER = 41,                   [EM] */ CLASS_CHARACTER,
+      /* ZWJ = 42,                          [ZWJ]*/ CLASS_CHARACTER};
+
+  static_assert(U_LB_COUNT == mozilla::ArrayLength(sUnicodeLineBreakToClass),
+                "Gecko vs ICU LineBreak class mismatch");
+
+  auto cls = GetLineBreakClass(u);
+  MOZ_ASSERT(cls < mozilla::ArrayLength(sUnicodeLineBreakToClass));
+
+  // Overrides based on rules for the different line-break values given in
+  // https://drafts.csswg.org/css-text-3/#line-break-property
+  switch (aLevel) {
+    case LineBreaker::Strictness::Auto:
+      // For now, just use legacy Gecko behavior.
+      // XXX Possible enhancement - vary strictness according to line width
+      // or other criteria.
+      break;
+    case LineBreaker::Strictness::Strict:
+      if (cls == U_LB_CONDITIONAL_JAPANESE_STARTER ||
+          (u == 0x3095 || u == 0x3096 || u == 0x30f5 || u == 0x30f6)) {
+        return CLASS_CLOSE;
+      }
+      if (cls == U_LB_INSEPARABLE) {
+        return CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS;
+      }
+      if (u == 0x3005 || u == 0x303B || u == 0x309D || u == 0x309E ||
+          u == 0x30FD || u == 0x30FE) {
+        return CLASS_CLOSE_LIKE_CHARACTER;
+      }
+      if (aIsChineseOrJapanese) {
+        if (cls == U_LB_POSTFIX_NUMERIC && IsEastAsianWidthAFW(u)) {
+          return CLASS_CLOSE_LIKE_CHARACTER;
+        }
+        if (cls == U_LB_PREFIX_NUMERIC && IsEastAsianWidthAFW(u)) {
+          return CLASS_OPEN_LIKE_CHARACTER;
+        }
+        if (u == 0x2010 || u == 0x2013 || u == 0x301C || u == 0x30A0) {
+          return CLASS_CLOSE_LIKE_CHARACTER;
+        }
+      }
+      break;
+    case LineBreaker::Strictness::Normal:
+      if (cls == U_LB_CONDITIONAL_JAPANESE_STARTER) {
+        return CLASS_BREAKABLE;
+      }
+      if (cls == U_LB_INSEPARABLE) {
+        return CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS;
+      }
+      if (u == 0x3005 || u == 0x303B || u == 0x309D || u == 0x309E ||
+          u == 0x30FD || u == 0x30FE) {
+        return CLASS_CLOSE_LIKE_CHARACTER;
+      }
+      if (aIsChineseOrJapanese) {
+        if (cls == U_LB_POSTFIX_NUMERIC && IsEastAsianWidthAFW(u)) {
+          return CLASS_CLOSE_LIKE_CHARACTER;
+        }
+        if (cls == U_LB_PREFIX_NUMERIC && IsEastAsianWidthAFW(u)) {
+          return CLASS_OPEN_LIKE_CHARACTER;
+        }
+        if (u == 0x2010 || u == 0x2013 || u == 0x301C || u == 0x30A0) {
+          return CLASS_BREAKABLE;
+        }
+      }
+      break;
+    case LineBreaker::Strictness::Loose:
+      if (cls == U_LB_CONDITIONAL_JAPANESE_STARTER) {
+        return CLASS_BREAKABLE;
+      }
+      if (u == 0x3005 || u == 0x303B || u == 0x309D || u == 0x309E ||
+          u == 0x30FD || u == 0x30FE) {
+        return CLASS_BREAKABLE;
+      }
+      if (cls == U_LB_INSEPARABLE) {
+        return CLASS_BREAKABLE;
+      }
+      if (aIsChineseOrJapanese) {
+        if (u == 0x30FB || u == 0xFF1A || u == 0xFF1B || u == 0xFF65 ||
+            u == 0x203C || u == 0x2047 || u == 0x2048 || u == 0x2049 ||
+            u == 0xFF01 || u == 0xFF1F) {
+          return CLASS_BREAKABLE;
+        }
+        if (cls == U_LB_POSTFIX_NUMERIC && IsEastAsianWidthAFW(u)) {
+          return CLASS_BREAKABLE;
+        }
+        if (cls == U_LB_PREFIX_NUMERIC && IsEastAsianWidthAFW(u)) {
+          return CLASS_BREAKABLE;
+        }
+        if (u == 0x2010 || u == 0x2013 || u == 0x301C || u == 0x30A0) {
+          return CLASS_BREAKABLE;
+        }
+      }
+      break;
+    case LineBreaker::Strictness::Anywhere:
+      MOZ_ASSERT_UNREACHABLE("should have been handled already");
+      break;
+  }
+
+  if (u < 0x10000) {
+    // Split the BMP code point into its high byte (block selector) and low
+    // byte (index within the 256-entry block).
+    uint16_t h = u & 0xFF00;
+    uint16_t l = u & 0x00ff;
+
+    // Handle 3 range table first
+    if (0x0000 == h) {
+      return GETCLASSFROMTABLE(gLBClass00, l);
+    }
+    if (0x1700 == h) {
+      return GETCLASSFROMTABLE(gLBClass17, l);
+    }
+    if (NS_NeedsPlatformNativeHandling(u)) {
+      return CLASS_COMPLEX;
+    }
+    if (0x0E00 == h) {
+      return GETCLASSFROMTABLE(gLBClass0E, l);
+    }
+    if (0x2000 == h) {
+      return GETCLASSFROMTABLE(gLBClass20, l);
+    }
+    if (0x2100 == h) {
+      return GETCLASSFROMTABLE(gLBClass21, l);
+    }
+    if (0x3000 == h) {
+      return GETCLASSFROMTABLE(gLBClass30, l);
+    }
+    if (0xff00 == h) {
+      if (l <= 0x0060) {  // Fullwidth ASCII variant
+        // Fullwidth comma and period are exceptions to our map-to-ASCII
+        // behavior: https://bugzilla.mozilla.org/show_bug.cgi?id=1595428
+        if (l + 0x20 == ',' || l + 0x20 == '.') {
+          return CLASS_CLOSE;
+        }
+        // Also special-case fullwidth left/right white parenthesis,
+        // which do not fit the pattern of mapping to the ASCII block
+        if (l == 0x005f) {
+          return CLASS_OPEN;
+        }
+        if (l == 0x0060) {
+          return CLASS_CLOSE;
+        }
+        return GETCLASSFROMTABLE(gLBClass00, (l + 0x20));
+      }
+      if (l < 0x00a0) {  // Halfwidth Katakana variants
+        // The listed halfwidth forms take the class of another code point
+        // in the U+30xx block by re-entering GetClass().
+        switch (l) {
+          case 0x61:
+            return GetClass(0x3002, aLevel, aIsChineseOrJapanese);
+          case 0x62:
+            return GetClass(0x300c, aLevel, aIsChineseOrJapanese);
+          case 0x63:
+            return GetClass(0x300d, aLevel, aIsChineseOrJapanese);
+          case 0x64:
+            return GetClass(0x3001, aLevel, aIsChineseOrJapanese);
+          case 0x65:
+            return GetClass(0x30fb, aLevel, aIsChineseOrJapanese);
+          case 0x9e:
+            return GetClass(0x309b, aLevel, aIsChineseOrJapanese);
+          case 0x9f:
+            return GetClass(0x309c, aLevel, aIsChineseOrJapanese);
+          default:
+            if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u)) {
+              return CLASS_CLOSE;  // jis x4051 class 3
+            }
+            return CLASS_BREAKABLE;  // jis x4051 class 11
+        }
+      }
+      if (l < 0x00e0) {
+        return CLASS_CHARACTER;  // Halfwidth Hangul variants
+      }
+      if (l < 0x00f0) {
+        // Substitute code points for U+FFE0..U+FFEF, indexed by the low
+        // nibble. The table is only ever read, so it is const.
+        static const char16_t NarrowFFEx[16] = {
+            0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000,
+            0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000};
+        return GetClass(NarrowFFEx[l - 0x00e0], aLevel, aIsChineseOrJapanese);
+      }
+      // U+FFF0..U+FFFF fall through to the generic mapping below.
+    } else if (0x3100 == h) {
+      if (l <= 0xbf) {  // Hangul Compatibility Jamo, Bopomofo, Kanbun
+        // XXX: This is per UAX #14, but UAX #14 may change
+        // the line breaking rules about Kanbun and Bopomofo.
+        return CLASS_BREAKABLE;
+      }
+      if (l >= 0xf0) {  // Katakana small letters for Ainu
+        return CLASS_CLOSE;
+      }
+    } else if (0x0300 == h) {
+      if (0x4F == l || (0x5C <= l && l <= 0x62)) {
+        return CLASS_NON_BREAKABLE;
+      }
+    } else if (0x0500 == h) {
+      // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14)
+      if (l == 0x8A) {
+        return GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN));
+      }
+    } else if (0x0F00 == h) {
+      // Tibetan chars with class = BA
+      if (0x34 == l || 0x7f == l || 0x85 == l || 0xbe == l || 0xbf == l ||
+          0xd2 == l) {
+        return CLASS_BREAKABLE;
+      }
+    } else if (0x1800 == h) {
+      if (0x0E == l) {
+        return CLASS_NON_BREAKABLE;
+      }
+    } else if (0x1600 == h) {
+      if (0x80 == l) {  // U+1680 OGHAM SPACE MARK
+        return CLASS_BREAKABLE;
+      }
+    } else if (u == 0xfeff) {
+      return CLASS_NON_BREAKABLE;
+    }
+  }
+
+  // Nothing above claimed the character: translate its Unicode line-break
+  // class.
+  return sUnicodeLineBreakToClass[cls];
+}
+
+// Returns true when a break is allowed between a leading character of class
+// |c1| and a trailing character of class |c2| according to gPair, i.e. when
+// bit |c2| of row |c1| is clear.
+static bool GetPair(int8_t c1, int8_t c2) {
+  NS_ASSERTION(c1 < MAX_CLASSES, "illegal classes 1");
+  NS_ASSERTION(c2 < MAX_CLASSES, "illegal classes 2");
+
+  const auto prohibited = (gPair[c1] >> c2) & 0x0001;
+  return prohibited == 0;
+}
+
+// Returns true when a break is allowed between a leading character of class
+// |c1| and a trailing character of class |c2| according to
+// gPairConservative, i.e. when bit |c2| of row |c1| is clear.
+static bool GetPairConservative(int8_t c1, int8_t c2) {
+  NS_ASSERTION(c1 < MAX_CLASSES, "illegal classes 1");
+  NS_ASSERTION(c2 < MAX_CLASSES, "illegal classes 2");
+
+  const auto prohibited = (gPairConservative[c1] >> c2) & 0x0001;
+  return prohibited == 0;
+}
+
+// Holds the state needed while classifying one run of text: the text itself
+// (either 16-bit or 8-bit, never both), the current scan index, the index of
+// the most recent break, and a few "have we seen X yet" flags consulted by the
+// contextual analysis.
+class ContextState {
+ public:
+  // 16-bit (UTF-16) text.
+  ContextState(const char16_t* aText, uint32_t aLength)
+      : mUniText(aText), mText(nullptr), mLength(aLength) {
+    Init();
+  }
+
+  // 8-bit text.
+  ContextState(const uint8_t* aText, uint32_t aLength)
+      : mUniText(nullptr), mText(aText), mLength(aLength) {
+    Init();
+  }
+
+  uint32_t Length() const { return mLength; }
+  uint32_t Index() const { return mIndex; }
+
+  // This gets a single code unit of the text, without checking for surrogates
+  // (in the case of a 16-bit text buffer). That's OK if we're only checking for
+  // specific characters that are known to be BMP values.
+  char16_t GetCodeUnitAt(uint32_t aIndex) const {
+    MOZ_ASSERT(aIndex < mLength, "Out of range!");
+    return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]);
+  }
+
+  // This gets a 32-bit Unicode character (codepoint), handling surrogate pairs
+  // as necessary. It must ONLY be called for 16-bit text, not 8-bit.
+  char32_t GetUnicodeCharAt(uint32_t aIndex) const {
+    MOZ_ASSERT(mUniText, "Only for 16-bit text!");
+    MOZ_ASSERT(aIndex < mLength, "Out of range!");
+    char32_t c = mUniText[aIndex];
+    if (aIndex + 1 < mLength && NS_IS_SURROGATE_PAIR(c, mUniText[aIndex + 1])) {
+      c = SURROGATE_TO_UCS4(c, mUniText[aIndex + 1]);
+    }
+    return c;
+  }
+
+  void AdvanceIndex() { ++mIndex; }
+
+  // Records the current index as the position of the most recent break.
+  void NotifyBreakBefore() { mLastBreakIndex = mIndex; }
+
+  // A word of western language should not be broken. But even if the word has
+  // only ASCII characters, non-natural context words should be broken, e.g.,
+  // URL and file path. For protecting the natural words, we should use
+  // conservative breaking rules at following conditions:
+  //   1. at near the start of word
+  //   2. at near the end of word
+  //   3. at near the latest broken point
+  // CONSERVATIVE_RANGE_{LETTER,OTHER} define the 'near' in characters,
+  // which varies depending whether we are looking at a letter or a non-letter
+  // character: for non-letters, we use an extended "conservative" range.
+
+#define CONSERVATIVE_RANGE_LETTER 2
+#define CONSERVATIVE_RANGE_OTHER 6
+
+  // Returns true when the conservative pair table should be used at
+  // mIndex + aOffset. Always false once the text is known to contain a CJK
+  // character.
+  bool UseConservativeBreaking(uint32_t aOffset = 0) const {
+    if (mHasCJKChar) return false;
+    uint32_t index = mIndex + aOffset;
+
+    // If the character at index is a letter (rather than various punctuation
+    // characters, etc) then we want a shorter "conservative" range
+    uint32_t conservativeRangeStart, conservativeRangeEnd;
+    if (index < mLength &&
+        nsUGenCategory::kLetter ==
+            (mText ? GetGenCategory(mText[index])
+                   : GetGenCategory(GetUnicodeCharAt(index)))) {
+      // Primarily for hyphenated word prefixes/suffixes; we add 1 to Start
+      // to get more balanced behavior (if we break off a 2-letter prefix,
+      // that means the break will actually be three letters from start of
+      // word, to include the hyphen; whereas a 2-letter suffix will be
+      // broken only two letters from end of word).
+      conservativeRangeEnd = CONSERVATIVE_RANGE_LETTER;
+      conservativeRangeStart = CONSERVATIVE_RANGE_LETTER + 1;
+    } else {
+      conservativeRangeEnd = conservativeRangeStart = CONSERVATIVE_RANGE_OTHER;
+    }
+
+    bool result = (index < conservativeRangeStart ||
+                   mLength - index < conservativeRangeEnd ||
+                   index - mLastBreakIndex < conservativeRangeStart);
+    if (result || !mHasNonbreakableSpace) return result;
+
+    // This text has no-breakable space, we need to check whether the index
+    // is near it.
+
+    // Note that index is always at least conservativeRangeStart here (the
+    // check above returned otherwise), so the subtraction cannot wrap.
+    for (uint32_t i = index; index - conservativeRangeStart < i; --i) {
+      if (IS_NONBREAKABLE_SPACE(GetCodeUnitAt(i - 1))) return true;
+    }
+    // Note that mLength - index is at least conservativeRangeEnd here, so
+    // every i visited below is inside the text.
+    for (uint32_t i = index + 1; i < index + conservativeRangeEnd; ++i) {
+      if (IS_NONBREAKABLE_SPACE(GetCodeUnitAt(i))) return true;
+    }
+    return false;
+  }
+
+  bool HasPreviousEqualsSign() const { return mHasPreviousEqualsSign; }
+  void NotifySeenEqualsSign() { mHasPreviousEqualsSign = true; }
+
+  bool HasPreviousSlash() const { return mHasPreviousSlash; }
+  void NotifySeenSlash() { mHasPreviousSlash = true; }
+
+  bool HasPreviousBackslash() const { return mHasPreviousBackslash; }
+  void NotifySeenBackslash() { mHasPreviousBackslash = true; }
+
+  uint32_t GetPreviousNonHyphenCharacter() const {
+    return mPreviousNonHyphenCharacter;
+  }
+  void NotifyNonHyphenCharacter(uint32_t ch) {
+    mPreviousNonHyphenCharacter = ch;
+  }
+
+ private:
+  // Resets the scan state and pre-scans the text once to find out whether it
+  // contains a non-breakable space and (16-bit text only) a CJK character.
+  void Init() {
+    mIndex = 0;
+    mLastBreakIndex = 0;
+    mPreviousNonHyphenCharacter = U_NULL;
+    mHasCJKChar = false;
+    mHasNonbreakableSpace = false;
+    mHasPreviousEqualsSign = false;
+    mHasPreviousSlash = false;
+    mHasPreviousBackslash = false;
+
+    if (mText) {
+      // 8-bit text: we only need to check for &nbsp;
+      for (uint32_t i = 0; i < mLength; ++i) {
+        if (IS_NONBREAKABLE_SPACE(mText[i])) {
+          mHasNonbreakableSpace = true;
+          break;
+        }
+      }
+    } else {
+      // 16-bit text: handle surrogates and check for CJK as well as &nbsp;
+      for (uint32_t i = 0; i < mLength; ++i) {
+        char32_t u = GetUnicodeCharAt(i);
+        // Only a BMP value can be a non-breakable space. Without the range
+        // check, a supplementary code point whose low 16 bits happen to be
+        // 0x00A0 or 0x2007 (e.g. U+200A0) could be truncated into a false
+        // match, which would also skip the CJK test in the else-branch.
+        const bool isNonbreakableSpace =
+            u <= 0xFFFFu && IS_NONBREAKABLE_SPACE(char16_t(u));
+        if (!mHasNonbreakableSpace && isNonbreakableSpace) {
+          mHasNonbreakableSpace = true;
+          if (mHasCJKChar) {
+            break;
+          }
+        } else if (!mHasCJKChar && IS_CJK_CHAR(u)) {
+          mHasCJKChar = true;
+          if (mHasNonbreakableSpace) {
+            break;
+          }
+        }
+        if (u > 0xFFFFu) {
+          ++i;  // step over trailing low surrogate
+        }
+      }
+    }
+  }
+
+  // Exactly one of these is non-null, depending on the constructor used.
+  const char16_t* const mUniText;
+  const uint8_t* const mText;
+
+  uint32_t mIndex;
+  const uint32_t mLength;  // length of text
+  uint32_t mLastBreakIndex;
+  char32_t mPreviousNonHyphenCharacter;  // The last character we have seen
+                                         // which is not U_HYPHEN
+  bool mHasCJKChar;  // if the text has CJK character, this is true.
+  bool mHasNonbreakableSpace;  // if the text has no-breakable space,
+                               // this is true.
+  bool mHasPreviousEqualsSign;  // True if we have seen a U_EQUAL
+  bool mHasPreviousSlash;       // True if we have seen a U_SLASH
+  bool mHasPreviousBackslash;   // True if we have seen a U_BACKSLASH
+};
+
+// Chooses the class for |cur|, one of the characters matched by
+// NEED_CONTEXTUAL_ANALYSIS, by looking at its neighbours |prev| and |next|
+// (0 when there is none) and at what |aState| has recorded so far.
+// Each branch may return an override class; when none applies, the result is
+// simply GetClass(cur, aLevel, aIsChineseOrJapanese).
+// Side effects on |aState|: a non-hyphen |cur| is recorded as the previous
+// non-hyphen character, and slashes/backslashes are recorded as seen.
+static int8_t ContextualAnalysis(char32_t prev, char32_t cur, char32_t next,
+ ContextState& aState,
+ LineBreaker::Strictness aLevel,
+ bool aIsChineseOrJapanese) {
+ // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE.
+ // NOTE(review): ContextState has no UseJISX4051 member; the overrides below
+ // are gated on !aState.UseConservativeBreaking() instead. This comment looks
+ // stale - confirm.
+
+ if (IS_HYPHEN(cur)) {
+ // If next character is hyphen, we don't need to break between them.
+ if (IS_HYPHEN(next)) return CLASS_CHARACTER;
+ // If prev and next characters are numeric, it may be in Math context.
+ // So, we should not break here.
+ bool prevIsNum = IS_ASCII_DIGIT(prev);
+ bool nextIsNum = IS_ASCII_DIGIT(next);
+ if (prevIsNum && nextIsNum) return CLASS_NUMERIC;
+ // If one side is numeric and the other is a character, or if both sides are
+ // characters, the hyphen should be breakable.
+ if (!aState.UseConservativeBreaking(1)) {
+ // Use the last non-hyphen character rather than |prev|, so that a run of
+ // hyphens is judged by the character that precedes the whole run.
+ char32_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter();
+ if (prevOfHyphen && next) {
+ int8_t prevClass = GetClass(prevOfHyphen, aLevel, aIsChineseOrJapanese);
+ int8_t nextClass = GetClass(next, aLevel, aIsChineseOrJapanese);
+ bool prevIsNumOrCharOrClose =
+ prevIsNum ||
+ (prevClass == CLASS_CHARACTER &&
+ !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) ||
+ prevClass == CLASS_CLOSE || prevClass == CLASS_CLOSE_LIKE_CHARACTER;
+ bool nextIsNumOrCharOrOpen =
+ nextIsNum ||
+ (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) ||
+ nextClass == CLASS_OPEN || nextClass == CLASS_OPEN_LIKE_CHARACTER ||
+ next == U_OPEN_SINGLE_QUOTE || next == U_OPEN_DOUBLE_QUOTE ||
+ next == U_OPEN_GUILLEMET;
+ if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) {
+ return CLASS_CLOSE;
+ }
+ }
+ }
+ } else {
+ aState.NotifyNonHyphenCharacter(cur);
+ if (cur == U_SLASH || cur == U_BACKSLASH) {
+ // If this is immediately after same char, we should not break here.
+ if (prev == cur) return CLASS_CHARACTER;
+ // If this text has two or more (BACK)SLASHs, this may be file path or
+ // URL. Make sure to compute shouldReturn before we notify on this slash.
+ bool shouldReturn = !aState.UseConservativeBreaking() &&
+ (cur == U_SLASH ? aState.HasPreviousSlash()
+ : aState.HasPreviousBackslash());
+
+ if (cur == U_SLASH) {
+ aState.NotifySeenSlash();
+ } else {
+ aState.NotifySeenBackslash();
+ }
+
+ if (shouldReturn) return CLASS_OPEN;
+ } else if (cur == U_PERCENT) {
+ // If this is a part of the param of URL, we should break before.
+ // A '%' three code units before or after the current one is taken as the
+ // sign of such a parameter.
+ if (!aState.UseConservativeBreaking()) {
+ if (aState.Index() >= 3 &&
+ aState.GetCodeUnitAt(aState.Index() - 3) == U_PERCENT)
+ return CLASS_OPEN;
+ if (aState.Index() + 3 < aState.Length() &&
+ aState.GetCodeUnitAt(aState.Index() + 3) == U_PERCENT)
+ return CLASS_OPEN;
+ }
+ } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) {
+ // If this may be a separator of params of URL, we should break after.
+ if (!aState.UseConservativeBreaking(1) && aState.HasPreviousEqualsSign())
+ return CLASS_CLOSE;
+ } else if (cur == U_OPEN_SINGLE_QUOTE || cur == U_OPEN_DOUBLE_QUOTE ||
+ cur == U_OPEN_GUILLEMET) {
+ // for CJK usage, we treat these as openers to allow a break before them,
+ // but otherwise treat them as normal characters because quote mark usage
+ // in various Western languages varies too much; see bug #450088
+ // discussion.
+ if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next))
+ return CLASS_OPEN;
+ } else {
+ // Reached only if NEED_CONTEXTUAL_ANALYSIS matches a character that has
+ // no branch above.
+ NS_ERROR("Forgot to handle the current character!");
+ }
+ }
+ // No contextual override applied: fall back to the context-free class.
+ return GetClass(cur, aLevel, aIsChineseOrJapanese);
+}
+
+// Finds the nearest break position from |aPos| in direction |aDirection|
+// (+1 forwards, -1 backwards) inside the whitespace-delimited word that
+// contains |aPos|.
+//
+// If no character inspected in that word is CJK or needs platform-native
+// handling, the word is treated as unbreakable and one of its boundaries is
+// returned. Otherwise GetJISx4051Breaks() is run over the word and the
+// closest recorded break in the requested direction is returned.
+int32_t LineBreaker::WordMove(const char16_t* aText, uint32_t aLen,
+                              uint32_t aPos, int8_t aDirection) {
+  bool textNeedsJISx4051 = false;
+  int32_t begin, end;
+
+  // NOTE(review): neither loop below inspects the first character of the
+  // word (aText[begin] once the loop has stopped, or aText[aPos] when aPos is
+  // already the word start), so a word whose only CJK/complex character is
+  // its first one is not detected. Left unchanged because the serializers
+  // referenced below depend on the current results - confirm before altering.
+  for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) {
+    // Prev() allows aPos == aLen, in which case |begin| starts one past the
+    // last code unit; never read aText[aLen].
+    if (uint32_t(begin) < aLen &&
+        (IS_CJK_CHAR(aText[begin]) ||
+         NS_NeedsPlatformNativeHandling(aText[begin]))) {
+      textNeedsJISx4051 = true;
+    }
+  }
+  for (end = aPos + 1; end < int32_t(aLen) && !NS_IsSpace(aText[end]); ++end) {
+    if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) {
+      textNeedsJISx4051 = true;
+    }
+  }
+
+  int32_t ret;
+  AutoTArray<uint8_t, 2000> breakState;
+  if (!textNeedsJISx4051) {
+    // No complex text character, do not try to do complex line break.
+    // (This is required for serializers. See Bug #344816.)
+    if (aDirection < 0) {
+      ret = (begin == int32_t(aPos)) ? begin - 1 : begin;
+    } else {
+      ret = end;
+    }
+  } else {
+    // |end| can be aLen + 1 when aPos == aLen; the span handed to the breaker
+    // must not extend past the text.
+    const int32_t textEnd = end < int32_t(aLen) ? end : int32_t(aLen);
+
+    // XXX(Bug 1631371) Check if this should use a fallible operation as it
+    // pretended earlier.
+    breakState.AppendElements(textEnd - begin);
+    GetJISx4051Breaks(aText + begin, textEnd - begin, WordBreak::Normal,
+                      Strictness::Auto, false, breakState.Elements());
+
+    // Step from aPos until we leave the word or land on a recorded break.
+    ret = aPos;
+    do {
+      ret += aDirection;
+    } while (begin < ret && ret < textEnd && !breakState[ret - begin]);
+  }
+
+  return ret;
+}
+
+// Returns the next break position after |aPos|, or
+// NS_LINEBREAKER_NEED_MORE_TEXT when WordMove() finds none before the end of
+// the text.
+int32_t LineBreaker::Next(const char16_t* aText, uint32_t aLen, uint32_t aPos) {
+  NS_ASSERTION(aText, "aText shouldn't be null");
+  NS_ASSERTION(aLen > aPos,
+               "Bad position passed to nsJISx4051LineBreaker::Next");
+
+  const int32_t candidate = WordMove(aText, aLen, aPos, 1);
+  if (candidate < int32_t(aLen)) {
+    return candidate;
+  }
+  return NS_LINEBREAKER_NEED_MORE_TEXT;
+}
+
+// Returns the previous break position before |aPos|, or
+// NS_LINEBREAKER_NEED_MORE_TEXT when WordMove() finds none after the start of
+// the text.
+int32_t LineBreaker::Prev(const char16_t* aText, uint32_t aLen, uint32_t aPos) {
+  NS_ASSERTION(aText, "aText shouldn't be null");
+  NS_ASSERTION(aLen >= aPos && aPos > 0,
+               "Bad position passed to nsJISx4051LineBreaker::Prev");
+
+  const int32_t candidate = WordMove(aText, aLen, aPos, -1);
+  if (candidate > 0) {
+    return candidate;
+  }
+  return NS_LINEBREAKER_NEED_MORE_TEXT;
+}
+
+// Returns true when word-break:keep-all must suppress a break opportunity
+// between aPrev and aCh, i.e. when BOTH characters belong to one of the
+// Unicode line-break classes affected by keep-all.
+static bool SuppressBreakForKeepAll(uint32_t aPrev, uint32_t aCh) {
+  const auto isKeepAllClass = [](uint8_t aLBClass) -> bool {
+    // Per https://drafts.csswg.org/css-text-3/#valdef-word-break-keep-all:
+    // "implicit soft wrap opportunities between typographic letter units
+    // (or other typographic character units belonging to the NU, AL, AI,
+    // or ID Unicode line breaking classes [UAX14]) are suppressed..."
+    const bool namedBySpec =
+        aLBClass == U_LB_ALPHABETIC || aLBClass == U_LB_AMBIGUOUS ||
+        aLBClass == U_LB_NUMERIC || aLBClass == U_LB_IDEOGRAPHIC;
+    // Additional classes that should be treated similarly, but have been
+    // broken out as separate classes in newer Unicode versions.
+    const bool splitOutLater =
+        aLBClass == U_LB_H2 || aLBClass == U_LB_H3 || aLBClass == U_LB_JL ||
+        aLBClass == U_LB_JV || aLBClass == U_LB_JT ||
+        aLBClass == U_LB_CONDITIONAL_JAPANESE_STARTER;
+    return namedBySpec || splitOutLater;
+  };
+
+  // The second lookup is only performed when the first character qualifies.
+  if (!isKeepAllClass(GetLineBreakClass(aPrev))) {
+    return false;
+  }
+  return isKeepAllClass(GetLineBreakClass(aCh));
+}
+
+// UTF-16 overload: computes, for every code unit of aChars[0..aLength), whether
+// a line break is allowed immediately before it, and stores the answer in
+// aBreakBefore[0..aLength).
+//
+// For each character the function determines a JIS X 4051 class (via
+// ContextualAnalysis() for context-sensitive characters, GetClass() otherwise),
+// optionally rewrites it for word-break:break-all, and then consults the pair
+// table for (previous class, current class). aBreakBefore[0] is always false
+// because a break is only considered when cur > 0. Runs of CLASS_COMPLEX
+// characters are handed to NS_GetComplexLineBreaks(), or to ClusterIterator
+// when aWordBreak is BreakAll. The low surrogate of a supplementary-plane
+// character is always marked non-breakable.
+void LineBreaker::GetJISx4051Breaks(const char16_t* aChars, uint32_t aLength,
+ WordBreak aWordBreak, Strictness aLevel,
+ bool aIsChineseOrJapanese,
+ uint8_t* aBreakBefore) {
+ uint32_t cur;
+ int8_t lastClass = CLASS_NONE;
+ ContextState state(aChars, aLength);
+
+ for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
+ char32_t ch = state.GetUnicodeCharAt(cur);
+ // Number of UTF-16 code units occupied by ch.
+ uint32_t chLen = ch > 0xFFFFu ? 2 : 1;
+ int8_t cl;
+
+ if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
+ // Gather the neighbouring code points (0 when there is none) so the
+ // class can be chosen from context.
+ char32_t prev, next;
+ if (cur > 0) {
+ // not using state.GetUnicodeCharAt() here because we're looking back
+ // rather than forward for possible surrogates
+ prev = aChars[cur - 1];
+ if (cur > 1 && NS_IS_SURROGATE_PAIR(aChars[cur - 2], prev)) {
+ prev = SURROGATE_TO_UCS4(aChars[cur - 2], prev);
+ }
+ } else {
+ prev = 0;
+ }
+ if (cur + chLen < aLength) {
+ next = state.GetUnicodeCharAt(cur + chLen);
+ } else {
+ next = 0;
+ }
+ cl = ContextualAnalysis(prev, ch, next, state, aLevel,
+ aIsChineseOrJapanese);
+ } else {
+ if (ch == U_EQUAL) state.NotifySeenEqualsSign();
+ state.NotifyNonHyphenCharacter(ch);
+ cl = GetClass(ch, aLevel, aIsChineseOrJapanese);
+ }
+
+ // To implement word-break:break-all, we overwrite the line-break class of
+ // alphanumeric characters so they are treated the same as ideographic.
+ // The relevant characters will have been assigned CLASS_CHARACTER, _CLOSE,
+ // _CLOSE_LIKE_CHARACTER, or _NUMERIC by GetClass(), but those classes also
+ // include others that we don't want to touch here, so we re-check the
+ // Unicode line-break class to determine which ones to modify.
+ if (aWordBreak == WordBreak::BreakAll &&
+ (cl == CLASS_CHARACTER || cl == CLASS_CLOSE ||
+ cl == CLASS_CLOSE_LIKE_CHARACTER || cl == CLASS_NUMERIC)) {
+ auto cls = GetLineBreakClass(ch);
+ if (cls == U_LB_ALPHABETIC || cls == U_LB_NUMERIC ||
+ cls == U_LB_AMBIGUOUS || cls == U_LB_COMPLEX_CONTEXT ||
+ /* Additional Japanese and Korean LB classes; CSS Text spec doesn't
+ explicitly mention these, but this appears to give expected
+ behavior (spec issue?) */
+ cls == U_LB_CONDITIONAL_JAPANESE_STARTER ||
+ (cls >= U_LB_H2 && cls <= U_LB_JV)) {
+ cl = CLASS_BREAKABLE;
+ }
+ }
+
+ bool allowBreak = false;
+ if (cur > 0) {
+ NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl,
+ "Loop should have prevented adjacent complex chars here");
+ // Lazily decodes the code point that precedes cur; only evaluated when
+ // the pair table allows a break and keep-all is in effect.
+ auto prev = [=]() {
+ char32_t c = aChars[cur - 1];
+ if (cur > 1 && NS_IS_SURROGATE_PAIR(aChars[cur - 2], c)) {
+ c = SURROGATE_TO_UCS4(aChars[cur - 2], c);
+ }
+ return c;
+ };
+ allowBreak =
+ (state.UseConservativeBreaking() ? GetPairConservative(lastClass, cl)
+ : GetPair(lastClass, cl)) &&
+ (aWordBreak != WordBreak::KeepAll ||
+ !SuppressBreakForKeepAll(prev(), ch));
+ }
+ aBreakBefore[cur] = allowBreak;
+ if (allowBreak) state.NotifyBreakBefore();
+ lastClass = cl;
+ if (CLASS_COMPLEX == cl) {
+ // Extend [cur, end) over the whole run of consecutive complex characters.
+ uint32_t end = cur + chLen;
+
+ while (end < aLength) {
+ char32_t c = state.GetUnicodeCharAt(end);
+ if (CLASS_COMPLEX != GetClass(c, aLevel, false)) {
+ break;
+ }
+ ++end;
+ if (c > 0xFFFFU) { // it was a surrogate pair
+ ++end;
+ }
+ }
+
+ if (aWordBreak == WordBreak::BreakAll) {
+ // For break-all, we don't need to run a dictionary-based breaking
+ // algorithm, we just allow breaks between all grapheme clusters.
+ // NOTE(review): ClusterIterator is not visible here. If its position
+ // after the final Next() is aChars + end, the store below hits index
+ // `end`, which equals aLength when the run reaches the end of the
+ // text, i.e. one past the documented aBreakBefore length. Confirm.
+ ClusterIterator ci(aChars + cur, end - cur);
+ while (!ci.AtEnd()) {
+ ci.Next();
+ aBreakBefore[ci - aChars] = true;
+ }
+ } else {
+ NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur);
+ // restore breakability at chunk begin, which was always set to false
+ // by the complex line breaker
+ aBreakBefore[cur] = allowBreak;
+ }
+
+ // Resume the outer loop at the first character after the run.
+ cur = end - 1;
+ }
+
+ // NOTE(review): when a complex run was just consumed AND its first
+ // character was a surrogate pair, cur is already end - 1 here, so this
+ // block writes aBreakBefore[end] and steps past the character at `end`.
+ // Verify whether that combination can occur.
+ if (chLen == 2) {
+ // Supplementary-plane character: mark that we cannot break before the
+ // trailing low surrogate, and advance past it.
+ ++cur;
+ aBreakBefore[cur] = false;
+ state.AdvanceIndex();
+ }
+ }
+}
+
+// 8-bit overload: every element of aChars is a complete character, so no
+// surrogate or complex-run handling is needed. For each index i,
+// aBreakBefore[i] is set to whether a line break is allowed immediately
+// before aChars[i]; aBreakBefore[0] is always false.
+//
+// NOTE(review): unlike the char16_t overload, the break-all rewrite below does
+// not test U_LB_AMBIGUOUS; confirm whether that difference is intentional.
+void LineBreaker::GetJISx4051Breaks(const uint8_t* aChars, uint32_t aLength,
+                                    WordBreak aWordBreak, Strictness aLevel,
+                                    bool aIsChineseOrJapanese,
+                                    uint8_t* aBreakBefore) {
+  int8_t prevClass = CLASS_NONE;
+  ContextState state(aChars, aLength);
+
+  for (uint32_t i = 0; i < aLength; ++i, state.AdvanceIndex()) {
+    const char32_t ch = aChars[i];
+    int8_t curClass;
+
+    // Step 1: classify the character, using its neighbours when required.
+    if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
+      curClass = ContextualAnalysis(i > 0 ? aChars[i - 1] : U_NULL, ch,
+                                    i + 1 < aLength ? aChars[i + 1] : U_NULL,
+                                    state, aLevel, aIsChineseOrJapanese);
+    } else {
+      if (ch == U_EQUAL) {
+        state.NotifySeenEqualsSign();
+      }
+      state.NotifyNonHyphenCharacter(ch);
+      curClass = GetClass(ch, aLevel, aIsChineseOrJapanese);
+    }
+
+    // Step 2: for word-break:break-all, make alphanumerics breakable.
+    if (aWordBreak == WordBreak::BreakAll) {
+      const bool mayRewrite =
+          curClass == CLASS_CHARACTER || curClass == CLASS_CLOSE ||
+          curClass == CLASS_CLOSE_LIKE_CHARACTER || curClass == CLASS_NUMERIC;
+      if (mayRewrite) {
+        auto lbClass = GetLineBreakClass(ch);
+        // Don't need to check additional Japanese/Korean classes in 8-bit
+        if (lbClass == U_LB_ALPHABETIC || lbClass == U_LB_NUMERIC ||
+            lbClass == U_LB_COMPLEX_CONTEXT) {
+          curClass = CLASS_BREAKABLE;
+        }
+      }
+    }
+
+    // Step 3: consult the pair table, then apply keep-all suppression.
+    bool breakHere = false;
+    if (i > 0) {
+      breakHere =
+          (state.UseConservativeBreaking()
+               ? GetPairConservative(prevClass, curClass)
+               : GetPair(prevClass, curClass)) &&
+          (aWordBreak != WordBreak::KeepAll ||
+           !SuppressBreakForKeepAll(aChars[i - 1], ch));
+    }
+
+    aBreakBefore[i] = breakHere;
+    if (breakHere) {
+      state.NotifyBreakBefore();
+    }
+    prevClass = curClass;
+  }
+}
diff --git a/intl/lwbrk/LineBreaker.h b/intl/lwbrk/LineBreaker.h
new file mode 100644
index 0000000000..eaea8e36cc
--- /dev/null
+++ b/intl/lwbrk/LineBreaker.h
@@ -0,0 +1,88 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef mozilla_intl_LineBreaker_h__
+#define mozilla_intl_LineBreaker_h__
+
+#include "nscore.h"
+#include "nsISupports.h"
+
+#define NS_LINEBREAKER_NEED_MORE_TEXT -1
+
+namespace mozilla {
+namespace intl {
+
+// Reference-counted line breaker built around JIS X 4051 style pair-table
+// rules. Instances are obtained through Create().
+class LineBreaker {
+ public:
+ NS_INLINE_DECL_REFCOUNTING(LineBreaker)
+
+ // How break opportunities between letters are adjusted (see the CSS
+ // word-break property).
+ enum class WordBreak : uint8_t {
+ Normal = 0, // default
+ BreakAll = 1, // break all: alphanumerics are made breakable like ideographs
+ KeepAll = 2 // always keep: breaks between letter-like classes are suppressed
+ };
+
+ // Strictness level forwarded to the character classifier as aLevel.
+ enum class Strictness : uint8_t {
+ Auto = 0,
+ Loose = 1,
+ Normal = 2,
+ Strict = 3,
+ Anywhere = 4
+ };
+
+ static already_AddRefed<LineBreaker> Create();
+
+ // Returns the offset of the next break position after aPos, or
+ // NS_LINEBREAKER_NEED_MORE_TEXT if none is found before the end of aText.
+ // aPos must be less than aLen.
+ int32_t Next(const char16_t* aText, uint32_t aLen, uint32_t aPos);
+
+ // Returns the offset of the previous break position before aPos, or
+ // NS_LINEBREAKER_NEED_MORE_TEXT if none is found after the start of aText.
+ // aPos must be in (0, aLen].
+ int32_t Prev(const char16_t* aText, uint32_t aLen, uint32_t aPos);
+
+ // Call this on a word with whitespace at either end. We will apply JISx4051
+ // rules to find breaks inside the word. aBreakBefore is set to the break-
+ // before status of each character; aBreakBefore[0] will always be false
+ // because we never return a break before the first character.
+ // aLength is the length of the aText array and also the length of the
+ // aBreakBefore output array.
+ void GetJISx4051Breaks(const char16_t* aText, uint32_t aLength,
+ WordBreak aWordBreak, Strictness aLevel,
+ bool aIsChineseOrJapanese, uint8_t* aBreakBefore);
+ void GetJISx4051Breaks(const uint8_t* aText, uint32_t aLength,
+ WordBreak aWordBreak, Strictness aLevel,
+ bool aIsChineseOrJapanese, uint8_t* aBreakBefore);
+
+ private:
+ ~LineBreaker() = default;
+
+ // Shared implementation of Next() (aDirection = 1) and Prev()
+ // (aDirection = -1).
+ int32_t WordMove(const char16_t* aText, uint32_t aLen, uint32_t aPos,
+ int8_t aDirection);
+};
+
+// Returns true if u is one of the space characters that delimit words for the
+// line breaker.
+static inline bool NS_IsSpace(char16_t u) {
+  switch (u) {
+    case 0x0020:  // SPACE
+    case 0x0009:  // CHARACTER TABULATION
+    case 0x000D:  // CARRIAGE RETURN
+    case 0x1361:  // ETHIOPIC WORDSPACE
+    case 0x1680:  // OGHAM SPACE MARK
+    case 0x205F:  // MEDIUM MATHEMATICAL SPACE
+      return true;
+    default:
+      break;
+  }
+  // U+2000..U+200B: EN QUAD, EM QUAD, EN SPACE, EM SPACE, THREE-PER-EM SPACE,
+  // FOUR-PER-EM SPACE, SIX-PER-EM SPACE, PUNCTUATION SPACE, THIN SPACE,
+  // HAIR SPACE, ZERO WIDTH SPACE -- with U+2007 excluded from the range.
+  return 0x2000 <= u && u <= 0x200B && u != 0x2007;
+}
+
+// Returns true if aChar lies in a block whose line breaking is delegated to
+// the platform-native ("complex") breaker.
+static inline bool NS_NeedsPlatformNativeHandling(char16_t aChar) {
+#if ANDROID
+  // Bug 1647377: no "platform native" support for Tibetan;
+  // better to just use our class-based breaker.
+  const char16_t kLastInFirstRange = 0x0eff;  // Thai, Lao
+#else
+  const char16_t kLastInFirstRange = 0x0fff;  // Thai, Lao, Tibetan
+#endif
+  if (0x0e01 <= aChar && aChar <= kLastInFirstRange) {
+    return true;
+  }
+  return 0x1780 <= aChar && aChar <= 0x17ff;  // Khmer
+}
+
+} // namespace intl
+} // namespace mozilla
+
+#endif /* mozilla_intl_LineBreaker_h__ */
diff --git a/intl/lwbrk/WordBreaker.cpp b/intl/lwbrk/WordBreaker.cpp
new file mode 100644
index 0000000000..269d084d93
--- /dev/null
+++ b/intl/lwbrk/WordBreaker.cpp
@@ -0,0 +1,218 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "mozilla/intl/WordBreaker.h"
+#include "mozilla/StaticPrefs_layout.h"
+#include "nsComplexBreaker.h"
+#include "nsUnicodeProperties.h"
+
+using mozilla::intl::WordBreakClass;
+using mozilla::intl::WordBreaker;
+using mozilla::intl::WordRange;
+using mozilla::unicode::GetScriptCode;
+
+// Factory: allocates a new WordBreaker and hands ownership to the caller.
+/*static*/
+already_AddRefed<WordBreaker> WordBreaker::Create() {
+  RefPtr<WordBreaker> breaker(new WordBreaker());
+  return breaker.forget();
+}
+
+// Reports whether a word boundary falls between the end of aText1 and the
+// start of aText2. Returns false for null or empty inputs. Characters of
+// different classes always form a boundary; two scriptio-continua characters
+// are resolved by running the complex breaker over the concatenated text; any
+// other pair of equal classes is not a boundary.
+bool WordBreaker::BreakInBetween(const char16_t* aText1, uint32_t aTextLen1,
+                                 const char16_t* aText2, uint32_t aTextLen2) {
+  MOZ_ASSERT(nullptr != aText1, "null ptr");
+  MOZ_ASSERT(nullptr != aText2, "null ptr");
+
+  if (!aText1 || !aText2) {
+    return false;
+  }
+  if (0 == aTextLen1 || 0 == aTextLen2) {
+    return false;
+  }
+
+  const uint8_t tailClass = GetClass(aText1[aTextLen1 - 1]);
+  const uint8_t headClass = GetClass(aText2[0]);
+
+  if (tailClass != headClass) {
+    return true;
+  }
+  if (kWbClassScriptioContinua != tailClass) {
+    return false;
+  }
+
+  // Both sides are scriptio continua: ask the complex breaker whether a break
+  // is allowed exactly at the seam of the joined text.
+  nsAutoString joined(aText1, aTextLen1);
+  joined.Append(aText2, aTextLen2);
+  AutoTArray<uint8_t, 256> breakBefore;
+  breakBefore.SetLength(aTextLen1 + aTextLen2);
+  NS_GetComplexLineBreaks(joined.get(), joined.Length(),
+                          breakBefore.Elements());
+  return breakBefore[aTextLen1];
+}
+
+// Character-range helpers used by WordBreaker::GetClass(). Most of these
+// evaluate their argument more than once, so only pass expressions without
+// side effects.
+#define IS_ASCII(c) (0 == (0xFF80 & (c)))
+#define ASCII_IS_ALPHA(c) \
+  ((('a' <= (c)) && ((c) <= 'z')) || (('A' <= (c)) && ((c) <= 'Z')))
+#define ASCII_IS_DIGIT(c) (('0' <= (c)) && ((c) <= '9'))
+#define ASCII_IS_SPACE(c) \
+  ((' ' == (c)) || ('\t' == (c)) || ('\r' == (c)) || ('\n' == (c)))
+#define IS_ALPHABETICAL_SCRIPT(c) ((c) < 0x2E80)
+
+// we change the beginning of IS_HAN from 0x4e00 to 0x3400 to reflect
+// Unicode 3.0
+// The whole expansion is wrapped in parentheses so that the macro behaves as
+// a single operand, e.g. in `!IS_HAN(c)` or `IS_HAN(c) && other`.
+#define IS_HAN(c)                            \
+  (((0x3400 <= (c)) && ((c) <= 0x9fff)) || \
+   ((0xf900 <= (c)) && ((c) <= 0xfaff)))
+#define IS_KATAKANA(c) ((0x30A0 <= (c)) && ((c) <= 0x30FF))
+#define IS_HIRAGANA(c) ((0x3040 <= (c)) && ((c) <= 0x309F))
+#define IS_HALFWIDTHKATAKANA(c) ((0xFF60 <= (c)) && ((c) <= 0xFF9F))
+
+// Tells whether aChar belongs to a South-East Asian script that is written
+// without spaces between words (https://en.wikipedia.org/wiki/Scriptio_continua).
+// For such scripts possible word boundaries have to come from the "complex
+// breaker". How useful the result is depends on the platform's support for
+// finding line-break (or word-boundary) opportunities in the particular
+// script: Thai, at least, works pretty well on the major desktop OSes; for a
+// script the platform does not support we simply won't find any useful
+// boundaries.
+static bool IsScriptioContinua(char16_t aChar) {
+  switch (GetScriptCode(aChar)) {
+    case Script::THAI:
+    case Script::MYANMAR:
+    case Script::KHMER:
+    case Script::JAVANESE:
+    case Script::BALINESE:
+    case Script::SUNDANESE:
+    case Script::LAO:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Maps a UTF-16 code unit to its word-break class.
+//
+// Code units below U+2E80 get ASCII-specific handling (space / alphanumeric /
+// punctuation, with '_' counted as a letter unless the
+// layout.word_select.stop_at_underscore pref is set) and NBSP is treated as a
+// space. Code units from U+2E80 upward are first tested against the Han,
+// Katakana, Hiragana and half-width Katakana ranges. Everything not decided
+// by those rules is classified as punctuation, scriptio continua, or (by
+// default) an alphabetic letter.
+/* static */
+WordBreakClass WordBreaker::GetClass(char16_t c) {
+  // begin of the hack
+
+  if (IS_ALPHABETICAL_SCRIPT(c)) {
+    if (IS_ASCII(c)) {
+      if (ASCII_IS_SPACE(c)) {
+        return kWbClassSpace;
+      }
+      const bool isWordChar =
+          ASCII_IS_ALPHA(c) || ASCII_IS_DIGIT(c) ||
+          (c == '_' && !StaticPrefs::layout_word_select_stop_at_underscore());
+      return isWordChar ? kWbClassAlphaLetter : kWbClassPunct;
+    }
+    if (c == 0x00A0 /*NBSP*/) {
+      return kWbClassSpace;
+    }
+  } else {
+    if (IS_HAN(c)) {
+      return kWbClassHanLetter;
+    }
+    if (IS_KATAKANA(c)) {
+      return kWbClassKatakanaLetter;
+    }
+    if (IS_HIRAGANA(c)) {
+      return kWbClassHiraganaLetter;
+    }
+    if (IS_HALFWIDTHKATAKANA(c)) {
+      return kWbClassHWKatakanaLetter;
+    }
+  }
+
+  // Common tail shared by both halves of the code space.
+  if (GetGenCategory(c) == nsUGenCategory::kPunctuation) {
+    return kWbClassPunct;
+  }
+  return IsScriptioContinua(c) ? kWbClassScriptioContinua
+                               : kWbClassAlphaLetter;
+}
+
+// Finds the word that contains aOffset.
+//
+// Returns the half-open range [mBegin, mEnd) of the maximal run of characters
+// around aOffset that share the word-break class of aText[aOffset]. For a
+// scriptio-continua run the range is then narrowed to the nearest break
+// positions reported by the complex breaker on either side of aOffset.
+// If aText is null or aOffset > aTextLen, both fields are aTextLen + 1.
+//
+// NOTE(review): aOffset == aTextLen is accepted and classifies
+// aText[aTextLen], so callers that pass it (the gtest in this directory does)
+// presumably rely on the buffer being terminated; confirm.
+WordRange WordBreaker::FindWord(const char16_t* aText, uint32_t aTextLen,
+                                uint32_t aOffset) {
+  WordRange range;
+  MOZ_ASSERT(nullptr != aText, "null ptr");
+  MOZ_ASSERT(0 != aTextLen, "len = 0");
+  MOZ_ASSERT(aOffset <= aTextLen, "aOffset > aTextLen");
+
+  range.mBegin = aTextLen + 1;
+  range.mEnd = aTextLen + 1;
+
+  if (!aText || aOffset > aTextLen) return range;
+
+  WordBreakClass c = GetClass(aText[aOffset]);
+  uint32_t i;
+  // Scan forward. The word ends at aTextLen unless a class change is found
+  // first, so only indices strictly below aTextLen need to be examined;
+  // looking at aText[aTextLen] would read past the caller's buffer and could
+  // never produce a different answer.
+  range.mEnd = aTextLen;
+  for (i = aOffset + 1; i < aTextLen; i++) {
+    if (c != GetClass(aText[i])) {
+      range.mEnd = i;
+      break;
+    }
+  }
+
+  // Scan backward
+  range.mBegin = 0;
+  for (i = aOffset; i > 0; i--) {
+    if (c != GetClass(aText[i - 1])) {
+      range.mBegin = i;
+      break;
+    }
+  }
+
+  if (kWbClassScriptioContinua == c) {
+    // we pass the whole text segment to the complex word breaker to find a
+    // shorter answer
+    AutoTArray<uint8_t, 256> breakBefore;
+    breakBefore.SetLength(range.mEnd - range.mBegin);
+    NS_GetComplexLineBreaks(aText + range.mBegin, range.mEnd - range.mBegin,
+                            breakBefore.Elements());
+
+    // Scan forward
+    for (i = aOffset + 1; i < range.mEnd; i++) {
+      if (breakBefore[i - range.mBegin]) {
+        range.mEnd = i;
+        break;
+      }
+    }
+
+    // Scan backward
+    for (i = aOffset; i > range.mBegin; i--) {
+      if (breakBefore[i - range.mBegin]) {
+        range.mBegin = i;
+        break;
+      }
+    }
+  }
+  return range;
+}
+
+// Returns the offset of the first word boundary after aPos, or
+// NS_WORDBREAKER_NEED_MORE_TEXT when the word starting at aPos runs to the end
+// of aText (or aPos is already at the end).
+int32_t WordBreaker::NextWord(const char16_t* aText, uint32_t aLen,
+                              uint32_t aPos) {
+  if (aPos == aLen) {
+    return NS_WORDBREAKER_NEED_MORE_TEXT;
+  }
+
+  // Extend [aPos, runEnd) over all following characters of the same class.
+  const WordBreakClass startClass = GetClass(aText[aPos]);
+  uint32_t runEnd = aPos + 1;
+  while (runEnd < aLen && GetClass(aText[runEnd]) == startClass) {
+    ++runEnd;
+  }
+
+  if (kWbClassScriptioContinua == startClass) {
+    // we pass the whole text segment to the complex word breaker to find a
+    // shorter answer
+    AutoTArray<uint8_t, 256> breakBefore;
+    breakBefore.SetLength(aLen - aPos);
+    NS_GetComplexLineBreaks(aText + aPos, aLen - aPos, breakBefore.Elements());
+    const uint32_t runLen = runEnd - aPos;
+    for (uint32_t i = 1; i < runLen; ++i) {
+      if (breakBefore[i]) {
+        return aPos + i;
+      }
+    }
+  }
+
+  if (runEnd == aLen) {
+    return NS_WORDBREAKER_NEED_MORE_TEXT;
+  }
+
+  MOZ_ASSERT(runEnd != aPos);
+  return runEnd;
+}
diff --git a/intl/lwbrk/WordBreaker.h b/intl/lwbrk/WordBreaker.h
new file mode 100644
index 0000000000..57cb4b18b7
--- /dev/null
+++ b/intl/lwbrk/WordBreaker.h
@@ -0,0 +1,53 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef mozilla_intl_WordBreaker_h__
+#define mozilla_intl_WordBreaker_h__
+
+#include "nscore.h"
+#include "nsISupports.h"
+
+#define NS_WORDBREAKER_NEED_MORE_TEXT -1
+
+namespace mozilla {
+namespace intl {
+
+// Half-open range [mBegin, mEnd) of UTF-16 code-unit offsets describing one
+// word, as returned by WordBreaker::FindWord().
+struct WordRange {
+  uint32_t mBegin;
+  uint32_t mEnd;
+};
+
+// Classes assigned to characters by WordBreaker::GetClass(). A word boundary
+// is found wherever two adjacent characters have different classes.
+enum WordBreakClass : uint8_t {
+ kWbClassSpace = 0, // ASCII space, tab, CR, LF, and NBSP
+ kWbClassAlphaLetter, // letters/digits, and the default for everything else
+ kWbClassPunct, // ASCII punctuation and general-category punctuation
+ kWbClassHanLetter, // U+3400..U+9FFF and U+F900..U+FAFF
+ kWbClassKatakanaLetter, // U+30A0..U+30FF
+ kWbClassHiraganaLetter, // U+3040..U+309F
+ kWbClassHWKatakanaLetter, // U+FF60..U+FF9F
+ kWbClassScriptioContinua // scripts written without word spaces (Thai, ...)
+};
+
+// Reference-counted word breaker based on per-character classes (see
+// WordBreakClass). Instances are obtained through Create().
+class WordBreaker {
+ public:
+ NS_INLINE_DECL_REFCOUNTING(WordBreaker)
+
+ static already_AddRefed<WordBreaker> Create();
+
+ // Returns true if there is a word boundary between the last character of
+ // aText1 and the first character of aText2. Returns false if either text
+ // is null or empty.
+ bool BreakInBetween(const char16_t* aText1, uint32_t aTextLen1,
+ const char16_t* aText2, uint32_t aTextLen2);
+ // Returns the range of the word containing aOffset. Both fields are
+ // aTextLen1 + 1 if aText1 is null or aOffset is greater than aTextLen1.
+ WordRange FindWord(const char16_t* aText1, uint32_t aTextLen1,
+ uint32_t aOffset);
+ // Returns the offset of the next word boundary after aPos, or
+ // NS_WORDBREAKER_NEED_MORE_TEXT if the word runs to the end of aText.
+ int32_t NextWord(const char16_t* aText, uint32_t aLen, uint32_t aPos);
+
+ // Returns the word-break class of a single UTF-16 code unit.
+ static WordBreakClass GetClass(char16_t aChar);
+
+ private:
+ ~WordBreaker() = default;
+};
+
+} // namespace intl
+} // namespace mozilla
+
+#endif /* mozilla_intl_WordBreaker_h__ */
diff --git a/intl/lwbrk/crashtests/416721.html b/intl/lwbrk/crashtests/416721.html
new file mode 100644
index 0000000000..0a6625ba8a
--- /dev/null
+++ b/intl/lwbrk/crashtests/416721.html
@@ -0,0 +1,11 @@
+<!DOCTYPE html>
+<html>
+ <head>
+ <title>Testcase for bug 416721</title>
+ <meta http-equiv="content-type" content="text/html; charset=utf-8">
+ </head>
+ <body>
+ <p>กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛</p>
+ </body>
+</html>
+
diff --git a/intl/lwbrk/crashtests/crashtests.list b/intl/lwbrk/crashtests/crashtests.list
new file mode 100644
index 0000000000..a7cb7a173b
--- /dev/null
+++ b/intl/lwbrk/crashtests/crashtests.list
@@ -0,0 +1 @@
+load 416721.html
diff --git a/intl/lwbrk/gtest/TestLineBreak.cpp b/intl/lwbrk/gtest/TestLineBreak.cpp
new file mode 100644
index 0000000000..5c3215c228
--- /dev/null
+++ b/intl/lwbrk/gtest/TestLineBreak.cpp
@@ -0,0 +1,283 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include <stdio.h>
+#include "nsXPCOM.h"
+#include "nsISupports.h"
+#include "nsServiceManagerUtils.h"
+#include "nsString.h"
+#include "gtest/gtest.h"
+
+#include "mozilla/intl/LineBreaker.h"
+#include "mozilla/intl/WordBreaker.h"
+
+static char teng1[] =
+ // 1 2 3 4 5 6 7
+ // 01234567890123456789012345678901234567890123456789012345678901234567890123456789
+ "This is a test to test(reasonable) line break. This 0.01123 = 45 x 48.";
+
+static uint32_t lexp1[] = {4, 7, 9, 14, 17, 34, 39, 40, 41,
+ 42, 49, 54, 62, 64, 67, 69, 73};
+
+static uint32_t wexp1[] = {4, 5, 7, 8, 9, 10, 14, 15, 17, 18, 22,
+ 23, 33, 34, 35, 39, 43, 48, 49, 50, 54, 55,
+ 56, 57, 62, 63, 64, 65, 67, 68, 69, 70, 72};
+
+static char teng2[] =
+ // 1 2 3 4 5 6 7
+ // 01234567890123456789012345678901234567890123456789012345678901234567890123456789
+ "()((reasonab(l)e) line break. .01123=45x48.";
+
+static uint32_t lexp2[] = {17, 22, 23, 30, 44};
+
+static uint32_t wexp2[] = {4, 12, 13, 14, 15, 16, 17, 18, 22,
+ 24, 29, 30, 31, 32, 37, 38, 43};
+
+static char teng3[] =
+ // 1 2 3 4 5 6 7
+ // 01234567890123456789012345678901234567890123456789012345678901234567890123456789
+ "It's a test to test(ronae ) line break....";
+
+static uint32_t lexp3[] = {4, 6, 11, 14, 25, 27, 32, 42};
+
+static uint32_t wexp3[] = {2, 3, 4, 5, 6, 7, 11, 12, 14, 15,
+ 19, 20, 25, 26, 27, 28, 32, 33, 38};
+
+static char ruler1[] =
+ " 1 2 3 4 5 6 7 ";
+static char ruler2[] =
+ "0123456789012345678901234567890123456789012345678901234567890123456789012";
+
+// Compares the i break offsets collected in res[] with the outlen expected
+// offsets in out[]. Returns true only when the counts and every offset match;
+// otherwise prints each discrepancy followed by the input string, a column
+// ruler, and both offset lists.
+//
+// All values printed here are uint32_t, so the format strings use %u.
+bool Check(const char* in, const uint32_t* out, uint32_t outlen, uint32_t i,
+           uint32_t res[256]) {
+  bool ok = true;
+
+  if (i != outlen) {
+    ok = false;
+    printf("WARNING!!! return size wrong, expect %u but got %u \n", outlen, i);
+  }
+
+  for (uint32_t j = 0; j < i; j++) {
+    if (j < outlen) {
+      if (res[j] != out[j]) {
+        ok = false;
+        printf("[%u] expect %u but got %u\n", j, out[j], res[j]);
+      }
+    } else {
+      // More results than expected: report each surplus offset.
+      ok = false;
+      printf("[%u] additional %u\n", j, res[j]);
+    }
+  }
+
+  if (!ok) {
+    printf("string = \n%s\n", in);
+    printf("%s\n", ruler1);
+    printf("%s\n", ruler2);
+
+    printf("Expect = \n");
+    for (uint32_t j = 0; j < outlen; j++) {
+      printf("%u,", out[j]);
+    }
+
+    printf("\nResult = \n");
+    for (uint32_t j = 0; j < i; j++) {
+      printf("%u,", res[j]);
+    }
+    printf("\n");
+  }
+
+  return ok;
+}
+
+// Walks the ASCII string `in` with LineBreaker::Next() starting at offset 0,
+// records every returned offset (the string length stands in for the final
+// NS_LINEBREAKER_NEED_MORE_TEXT), and checks the list against out[0..outlen).
+bool TestASCIILB(mozilla::intl::LineBreaker* lb, const char* in,
+                 const uint32_t* out, uint32_t outlen) {
+  NS_ConvertASCIItoUTF16 text(in);
+  uint32_t found[256];
+  uint32_t count = 0;
+  int32_t pos = 0;
+
+  while (pos != NS_LINEBREAKER_NEED_MORE_TEXT && count < 256) {
+    pos = lb->Next(text.get(), text.Length(), pos);
+    found[count] = pos != NS_LINEBREAKER_NEED_MORE_TEXT ? pos : text.Length();
+    ++count;
+  }
+
+  return Check(in, out, outlen, count, found);
+}
+
+// Walks the ASCII string `in` with WordBreaker::NextWord() starting at offset
+// 0, records every boundary returned before NS_WORDBREAKER_NEED_MORE_TEXT, and
+// checks the list against out[0..outlen).
+bool TestASCIIWB(mozilla::intl::WordBreaker* lb, const char* in,
+                 const uint32_t* out, uint32_t outlen) {
+  NS_ConvertASCIItoUTF16 text(in);
+
+  uint32_t found[256];
+  uint32_t count = 0;
+  int32_t pos = lb->NextWord(text.get(), text.Length(), 0);
+
+  while (pos != NS_WORDBREAKER_NEED_MORE_TEXT && count < 256) {
+    found[count] = pos;
+    pos = lb->NextWord(text.get(), text.Length(), pos);
+    ++count;
+  }
+
+  return Check(in, out, outlen, count, found);
+}
+
+// Runs the three ASCII fixtures through LineBreaker::Next().
+TEST(LineBreak, LineBreaker)
+{
+  RefPtr<mozilla::intl::LineBreaker> breaker =
+      mozilla::intl::LineBreaker::Create();
+
+  ASSERT_TRUE(breaker);
+
+  ASSERT_TRUE(
+      TestASCIILB(breaker, teng1, lexp1, sizeof(lexp1) / sizeof(lexp1[0])));
+  ASSERT_TRUE(
+      TestASCIILB(breaker, teng2, lexp2, sizeof(lexp2) / sizeof(lexp2[0])));
+  ASSERT_TRUE(
+      TestASCIILB(breaker, teng3, lexp3, sizeof(lexp3) / sizeof(lexp3[0])));
+}
+
+// Runs the three ASCII fixtures through WordBreaker::NextWord().
+TEST(LineBreak, WordBreaker)
+{
+  RefPtr<mozilla::intl::WordBreaker> breaker =
+      mozilla::intl::WordBreaker::Create();
+  ASSERT_TRUE(breaker);
+
+  ASSERT_TRUE(
+      TestASCIIWB(breaker, teng1, wexp1, sizeof(wexp1) / sizeof(wexp1[0])));
+  ASSERT_TRUE(
+      TestASCIIWB(breaker, teng2, wexp2, sizeof(wexp2) / sizeof(wexp2[0])));
+  ASSERT_TRUE(
+      TestASCIIWB(breaker, teng3, wexp3, sizeof(wexp3) / sizeof(wexp3[0])));
+}
+
+// 012345678901234
+static const char wb0[] = "T";
+static const char wb1[] = "h";
+static const char wb2[] = "is is a int";
+static const char wb3[] = "ernationali";
+static const char wb4[] = "zation work.";
+
+static const char* wb[] = {wb0, wb1, wb2, wb3, wb4};
+
+// Feeds the fragments in wb[] to the word breaker one at a time, builds a
+// string in which '^' marks every reported boundary (inside a fragment via
+// NextWord(), between fragments via BreakInBetween()), and compares it with a
+// fixed expected string.
+//
+// NOTE(review): the expected string below encodes the current behaviour of the
+// substring calls in this function, including the tail append that starts at
+// Length() - start. Do not "fix" one without re-deriving the other.
+void TestPrintWordWithBreak() {
+ uint32_t numOfFragment = sizeof(wb) / sizeof(char*);
+ RefPtr<mozilla::intl::WordBreaker> wbk = mozilla::intl::WordBreaker::Create();
+
+ nsAutoString result;
+
+ for (uint32_t i = 0; i < numOfFragment; i++) {
+ NS_ConvertASCIItoUTF16 fragText(wb[i]);
+
+ int32_t cur = 0;
+ cur = wbk->NextWord(fragText.get(), fragText.Length(), cur);
+ uint32_t start = 0;
+ // Emit each word of this fragment followed by a '^' marker. (j is unused.)
+ for (uint32_t j = 0; cur != NS_WORDBREAKER_NEED_MORE_TEXT; j++) {
+ result.Append(Substring(fragText, start, cur - start));
+ result.Append('^');
+ start = (cur >= 0 ? cur : cur - start);
+ cur = wbk->NextWord(fragText.get(), fragText.Length(), cur);
+ }
+
+ result.Append(Substring(fragText, fragText.Length() - start));
+
+ if (i != numOfFragment - 1) {
+ NS_ConvertASCIItoUTF16 nextFragText(wb[i + 1]);
+
+ bool canBreak = true;
+ canBreak = wbk->BreakInBetween(fragText.get(), fragText.Length(),
+ nextFragText.get(), nextFragText.Length());
+ if (canBreak) {
+ result.Append('^');
+ }
+ // fragText is re-created at the top of the next iteration, so this
+ // assignment has no lasting effect.
+ fragText.Assign(nextFragText);
+ }
+ }
+ ASSERT_STREQ("is^ ^is^ ^a^ ^ is a intzation^ ^work^ation work.",
+ NS_ConvertUTF16toUTF8(result).get());
+}
+
+// Finds the word at `offset` inside fragment wb[fragN] and, when that word
+// touches the end and/or the beginning of the fragment, extends it into the
+// following and/or preceding fragments as long as BreakInBetween() reports no
+// boundary at the seam. The assembled word must equal `expected`.
+void TestFindWordBreakFromPosition(uint32_t fragN, uint32_t offset,
+                                   const char* expected) {
+  uint32_t numOfFragment = sizeof(wb) / sizeof(char*);
+  RefPtr<mozilla::intl::WordBreaker> wbk = mozilla::intl::WordBreaker::Create();
+
+  NS_ConvertASCIItoUTF16 fragText(wb[fragN]);
+
+  mozilla::intl::WordRange res =
+      wbk->FindWord(fragText.get(), fragText.Length(), offset);
+
+  bool canBreak;
+  nsAutoString result(Substring(fragText, res.mBegin, res.mEnd - res.mBegin));
+
+  if ((uint32_t)fragText.Length() == res.mEnd) {
+    // if we hit the end of the fragment
+    nsAutoString curFragText = fragText;
+    for (uint32_t p = fragN + 1; p < numOfFragment; p++) {
+      NS_ConvertASCIItoUTF16 nextFragText(wb[p]);
+      canBreak = wbk->BreakInBetween(curFragText.get(), curFragText.Length(),
+                                     nextFragText.get(), nextFragText.Length());
+      if (canBreak) {
+        break;
+      }
+      mozilla::intl::WordRange r =
+          wbk->FindWord(nextFragText.get(), nextFragText.Length(), 0);
+
+      result.Append(Substring(nextFragText, r.mBegin, r.mEnd - r.mBegin));
+
+      if ((uint32_t)nextFragText.Length() != r.mEnd) {
+        break;
+      }
+      // The word swallowed this whole fragment: it becomes the left-hand
+      // side of the next seam check.
+      curFragText.Assign(nextFragText);
+    }
+  }
+
+  if (0 == res.mBegin) {
+    // if we hit the beginning of the fragment
+    nsAutoString curFragText = fragText;
+    for (uint32_t p = fragN; p > 0; p--) {
+      NS_ConvertASCIItoUTF16 prevFragText(wb[p - 1]);
+      canBreak = wbk->BreakInBetween(prevFragText.get(), prevFragText.Length(),
+                                     curFragText.get(), curFragText.Length());
+      if (canBreak) {
+        break;
+      }
+      mozilla::intl::WordRange r = wbk->FindWord(
+          prevFragText.get(), prevFragText.Length(), prevFragText.Length());
+
+      result.Insert(Substring(prevFragText, r.mBegin, r.mEnd - r.mBegin), 0);
+
+      if (0 != r.mBegin) {
+        break;
+      }
+      // The word swallowed this whole fragment: it becomes the right-hand
+      // side of the next seam check.
+      curFragText.Assign(prevFragText);
+    }
+  }
+
+  ASSERT_STREQ(expected, NS_ConvertUTF16toUTF8(result).get())
+      << "FindWordBreakFromPosition(" << fragN << ", " << offset << ")";
+}
+
+// Repeatedly calls NextWord() on a short Thai string and asserts that every
+// call moves to a different offset, i.e. that the iteration cannot get stuck.
+void TestNextWordBreakWithComplexLanguage() {
+  RefPtr<mozilla::intl::WordBreaker> wbk = mozilla::intl::WordBreaker::Create();
+  nsString fragText(u"\u0e40\u0e1b\u0e47\u0e19\u0e19\u0e31\u0e01");
+
+  for (int32_t offset = 0; offset != NS_WORDBREAKER_NEED_MORE_TEXT;) {
+    const int32_t advanced =
+        wbk->NextWord(fragText.get(), fragText.Length(), offset);
+    ASSERT_NE(offset, advanced);
+    offset = advanced;
+  }
+  ASSERT_TRUE(true);
+}
+
+// Exercises the word breaker across fragment boundaries: the '^'-marked dump,
+// a table of FindWord() lookups, and the complex-script iteration check.
+TEST(LineBreak, WordBreakUsage)
+{
+  TestPrintWordWithBreak();
+
+  // {fragment index, offset inside the fragment, expected word}
+  static const struct {
+    uint32_t mFragment;
+    uint32_t mOffset;
+    const char* mExpected;
+  } kLookups[] = {
+      {0, 0, "This"},
+      {1, 0, "his"},
+      {2, 0, "is"},
+      {2, 1, "is"},
+      {2, 9, " "},
+      {2, 10, "internationalization"},
+      {3, 4, "ernationalization"},
+      {3, 8, "ernationalization"},
+      {4, 6, " "},
+      {4, 7, "work"},
+  };
+  for (const auto& lookup : kLookups) {
+    TestFindWordBreakFromPosition(lookup.mFragment, lookup.mOffset,
+                                  lookup.mExpected);
+  }
+
+  TestNextWordBreakWithComplexLanguage();
+}
diff --git a/intl/lwbrk/gtest/moz.build b/intl/lwbrk/gtest/moz.build
new file mode 100644
index 0000000000..c9fbab8e76
--- /dev/null
+++ b/intl/lwbrk/gtest/moz.build
@@ -0,0 +1,11 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+UNIFIED_SOURCES += [
+ "TestLineBreak.cpp",
+]
+
+FINAL_LIBRARY = "xul-gtest"
diff --git a/intl/lwbrk/jisx4051class.h b/intl/lwbrk/jisx4051class.h
new file mode 100644
index 0000000000..3140cf63a7
--- /dev/null
+++ b/intl/lwbrk/jisx4051class.h
@@ -0,0 +1,217 @@
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+/*
+ DO NOT EDIT THIS DOCUMENT !!! THIS DOCUMENT IS GENERATED BY
+ mozilla/intl/lwbrk/tools/anzx4051.pl
+ */
+static const uint32_t gLBClass00[32] = {
+ 0x55555555, // U+0000 - U+0007
+ 0x55555555, // U+0008 - U+000F
+ 0x55555555, // U+0010 - U+0017
+ 0x55555555, // U+0018 - U+001F
+ 0x7AABAAA5, // U+0020 - U+0027
+ 0x7A7AAAA9, // U+0028 - U+002F
+ 0x66666666, // U+0030 - U+0037
+ 0xAAA9AA66, // U+0038 - U+003F
+ 0x77777777, // U+0040 - U+0047
+ 0x77777777, // U+0048 - U+004F
+ 0x77777777, // U+0050 - U+0057
+ 0x77AA9777, // U+0058 - U+005F
+ 0x77777777, // U+0060 - U+0067
+ 0x77777777, // U+0068 - U+006F
+ 0x77777777, // U+0070 - U+0077
+ 0x7AAA9777, // U+0078 - U+007F
+ 0x77777777, // U+0080 - U+0087
+ 0x77777777, // U+0088 - U+008F
+ 0x77777777, // U+0090 - U+0097
+ 0x77777777, // U+0098 - U+009F
+ 0xAA9A9AAB, // U+00A0 - U+00A7
+ 0x77A9777A, // U+00A8 - U+00AF
+ 0xAAAAAAAA, // U+00B0 - U+00B7
+ 0xAAAAAAAA, // U+00B8 - U+00BF
+ 0x77777777, // U+00C0 - U+00C7
+ 0x77777777, // U+00C8 - U+00CF
+ 0x77777777, // U+00D0 - U+00D7
+ 0x77777777, // U+00D8 - U+00DF
+ 0x77777777, // U+00E0 - U+00E7
+ 0x77777777, // U+00E8 - U+00EF
+ 0xA7777777, // U+00F0 - U+00F7
+ 0x77777777, // U+00F8 - U+00FF
+};
+
+static const uint32_t gLBClass20[32] = {
+ 0xB5555555, // U+2000 - U+2007
+ 0x77775555, // U+2008 - U+200F
+ 0x777277B7, // U+2010 - U+2017
+ 0x77A777A7, // U+2018 - U+201F
+ 0xA1117777, // U+2020 - U+2027
+ 0xB7777777, // U+2028 - U+202F
+ 0x77744444, // U+2030 - U+2037
+ 0x7A115107, // U+2038 - U+203F
+ 0x11017777, // U+2040 - U+2047
+ 0x77777711, // U+2048 - U+204F
+ 0x77777777, // U+2050 - U+2057
+ 0x57777777, // U+2058 - U+205F
+ 0x7777777B, // U+2060 - U+2067
+ 0x77777777, // U+2068 - U+206F
+ 0x77777777, // U+2070 - U+2077
+ 0x77777777, // U+2078 - U+207F
+ 0x77777777, // U+2080 - U+2087
+ 0x77777777, // U+2088 - U+208F
+ 0x77777777, // U+2090 - U+2097
+ 0x77777777, // U+2098 - U+209F
+ 0x77777777, // U+20A0 - U+20A7
+ 0x77777777, // U+20A8 - U+20AF
+ 0x77777777, // U+20B0 - U+20B7
+ 0x77777777, // U+20B8 - U+20BF
+ 0x77777777, // U+20C0 - U+20C7
+ 0x77777777, // U+20C8 - U+20CF
+ 0x77777777, // U+20D0 - U+20D7
+ 0x77777777, // U+20D8 - U+20DF
+ 0x77777777, // U+20E0 - U+20E7
+ 0x77777777, // U+20E8 - U+20EF
+ 0x77777777, // U+20F0 - U+20F7
+ 0x77777777, // U+20F8 - U+20FF
+};
+
+static const uint32_t gLBClass21[32] = {
+ 0x77777777, // U+2100 - U+2107
+ 0x77777777, // U+2108 - U+210F
+ 0x73777777, // U+2110 - U+2117
+ 0x77777777, // U+2118 - U+211F
+ 0x77777777, // U+2120 - U+2127
+ 0x77777777, // U+2128 - U+212F
+ 0x77777777, // U+2130 - U+2137
+ 0x77777777, // U+2138 - U+213F
+ 0x77777777, // U+2140 - U+2147
+ 0x77777777, // U+2148 - U+214F
+ 0x77777777, // U+2150 - U+2157
+ 0x77777777, // U+2158 - U+215F
+ 0x55555555, // U+2160 - U+2167
+ 0x55555555, // U+2168 - U+216F
+ 0x55555555, // U+2170 - U+2177
+ 0x55555555, // U+2178 - U+217F
+ 0x77777777, // U+2180 - U+2187
+ 0x77777777, // U+2188 - U+218F
+ 0x77777777, // U+2190 - U+2197
+ 0x77777777, // U+2198 - U+219F
+ 0x77777777, // U+21A0 - U+21A7
+ 0x77777777, // U+21A8 - U+21AF
+ 0x77777777, // U+21B0 - U+21B7
+ 0x77777777, // U+21B8 - U+21BF
+ 0x77777777, // U+21C0 - U+21C7
+ 0x77777777, // U+21C8 - U+21CF
+ 0x77777777, // U+21D0 - U+21D7
+ 0x77777777, // U+21D8 - U+21DF
+ 0x77777777, // U+21E0 - U+21E7
+ 0x77777777, // U+21E8 - U+21EF
+ 0x77777777, // U+21F0 - U+21F7
+ 0x77777777, // U+21F8 - U+21FF
+};
+
+static const uint32_t gLBClass30[32] = {
+ 0x55155115, // U+3000 - U+3007
+ 0x10101010, // U+3008 - U+300F
+ 0x10105510, // U+3010 - U+3017
+ 0x11011010, // U+3018 - U+301F
+ 0x55555555, // U+3020 - U+3027
+ 0x55555555, // U+3028 - U+302F
+ 0x55555555, // U+3030 - U+3037
+ 0x55555555, // U+3038 - U+303F
+ 0x15151515, // U+3040 - U+3047
+ 0x55555515, // U+3048 - U+304F
+ 0x55555555, // U+3050 - U+3057
+ 0x55555555, // U+3058 - U+305F
+ 0x55551555, // U+3060 - U+3067
+ 0x55555555, // U+3068 - U+306F
+ 0x55555555, // U+3070 - U+3077
+ 0x55555555, // U+3078 - U+307F
+ 0x15151555, // U+3080 - U+3087
+ 0x51555555, // U+3088 - U+308F
+ 0x55555555, // U+3090 - U+3097
+ 0x51111115, // U+3098 - U+309F
+ 0x15151515, // U+30A0 - U+30A7
+ 0x55555515, // U+30A8 - U+30AF
+ 0x55555555, // U+30B0 - U+30B7
+ 0x55555555, // U+30B8 - U+30BF
+ 0x55551555, // U+30C0 - U+30C7
+ 0x55555555, // U+30C8 - U+30CF
+ 0x55555555, // U+30D0 - U+30D7
+ 0x55555555, // U+30D8 - U+30DF
+ 0x15151555, // U+30E0 - U+30E7
+ 0x51555555, // U+30E8 - U+30EF
+ 0x51155555, // U+30F0 - U+30F7
+ 0x51111555, // U+30F8 - U+30FF
+};
+
+static const uint32_t gLBClass0E[32] = {
+ 0x88888888, // U+0E00 - U+0E07
+ 0x88888888, // U+0E08 - U+0E0F
+ 0x88888888, // U+0E10 - U+0E17
+ 0x88888888, // U+0E18 - U+0E1F
+ 0x88888888, // U+0E20 - U+0E27
+ 0x18888888, // U+0E28 - U+0E2F
+ 0x88888888, // U+0E30 - U+0E37
+ 0x08888888, // U+0E38 - U+0E3F
+ 0x81888888, // U+0E40 - U+0E47
+ 0x78888888, // U+0E48 - U+0E4F
+ 0x66666666, // U+0E50 - U+0E57
+ 0x88881166, // U+0E58 - U+0E5F
+ 0x88888888, // U+0E60 - U+0E67
+ 0x88888888, // U+0E68 - U+0E6F
+ 0x88888888, // U+0E70 - U+0E77
+ 0x88888888, // U+0E78 - U+0E7F
+ 0x88888888, // U+0E80 - U+0E87
+ 0x88888888, // U+0E88 - U+0E8F
+ 0x88888888, // U+0E90 - U+0E97
+ 0x88888888, // U+0E98 - U+0E9F
+ 0x88888888, // U+0EA0 - U+0EA7
+ 0x18888888, // U+0EA8 - U+0EAF
+ 0x88888888, // U+0EB0 - U+0EB7
+ 0x88888888, // U+0EB8 - U+0EBF
+ 0x81888888, // U+0EC0 - U+0EC7
+ 0x88888888, // U+0EC8 - U+0ECF
+ 0x66666666, // U+0ED0 - U+0ED7
+ 0x88888866, // U+0ED8 - U+0EDF
+ 0x88888888, // U+0EE0 - U+0EE7
+ 0x88888888, // U+0EE8 - U+0EEF
+ 0x88888888, // U+0EF0 - U+0EF7
+ 0x88888888, // U+0EF8 - U+0EFF
+};
+
+static const uint32_t gLBClass17[32] = {
+ 0x77777777, // U+1700 - U+1707
+ 0x77777777, // U+1708 - U+170F
+ 0x77777777, // U+1710 - U+1717
+ 0x77777777, // U+1718 - U+171F
+ 0x77777777, // U+1720 - U+1727
+ 0x77777777, // U+1728 - U+172F
+ 0x70077777, // U+1730 - U+1737
+ 0x77777777, // U+1738 - U+173F
+ 0x77777777, // U+1740 - U+1747
+ 0x77777777, // U+1748 - U+174F
+ 0x77777777, // U+1750 - U+1757
+ 0x77777777, // U+1758 - U+175F
+ 0x77777777, // U+1760 - U+1767
+ 0x77777777, // U+1768 - U+176F
+ 0x77777777, // U+1770 - U+1777
+ 0x77777777, // U+1778 - U+177F
+ 0x88888888, // U+1780 - U+1787
+ 0x88888888, // U+1788 - U+178F
+ 0x88888888, // U+1790 - U+1797
+ 0x88888888, // U+1798 - U+179F
+ 0x88888888, // U+17A0 - U+17A7
+ 0x88888888, // U+17A8 - U+17AF
+ 0x88888888, // U+17B0 - U+17B7
+ 0x88888888, // U+17B8 - U+17BF
+ 0x88888888, // U+17C0 - U+17C7
+ 0x88888888, // U+17C8 - U+17CF
+ 0x88118888, // U+17D0 - U+17D7
+ 0x77888181, // U+17D8 - U+17DF
+ 0x88888888, // U+17E0 - U+17E7
+ 0x77777788, // U+17E8 - U+17EF
+ 0x88888888, // U+17F0 - U+17F7
+ 0x77777788, // U+17F8 - U+17FF
+};
diff --git a/intl/lwbrk/jisx4051pairtable.txt b/intl/lwbrk/jisx4051pairtable.txt
new file mode 100644
index 0000000000..2bae1b18fe
--- /dev/null
+++ b/intl/lwbrk/jisx4051pairtable.txt
@@ -0,0 +1,286 @@
+
+
+
+/*
+
+ Simplification of Pair Table in JIS X 4051
+
+ 1. The Original Table - in 4.1.3
+
+ In JIS X 4051, the pair table is defined as below
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20
+ * # * #
+ 1 X X X X X X X X X X X X X X X X X X X X X E
+ 2 X X X X X X
+ 3 X X X X X X
+ 4 X X X X X X
+ 5 X X X X X X
+ 6 X X X X X X
+ 7 X X X X X X X
+ 8 X X X X X X E
+ 9 X X X X X X
+ 10 X X X X X X
+ 11 X X X X X X
+ 12 X X X X X X
+ 13 X X X X X X X
+ 14 X X X X X X X
+ 15 X X X X X X X X X
+ 16 X X X X X X X X
+ 17 X X X X X E
+ 18 X X X X X X X X X
+ 19 X E E E E E X X X X X X X X X X X X E X E E
+ 20 X X X X X E
+
+ * Same Char
+ # Other Char
+
+ 2. Simplified by removing the classes which we do not care about
+
+ However, since we do not care about class 13(Subscript), 14(Ruby),
+ 19(split line note begin quote), and 20(split line note end quote)
+ we can simplify this pair table into the following
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 2 3 4 5 6 7 8 9 10 11 12 15 16 17 18
+
+ 1 X X X X X X X X X X X X X X X X
+ 2 X X X X X
+ 3 X X X X X
+ 4 X X X X X
+ 5 X X X X X
+ 6 X X X X X
+ 7 X X X X X X
+ 8 X X X X X X
+ 9 X X X X X
+ 10 X X X X X
+ 11 X X X X X
+ 12 X X X X X
+ 15 X X X X X X X X
+ 16 X X X X X X X
+ 17 X X X X X
+ 18 X X X X X X X X
+
+ 3. Simplified by merging classes
+
+ After the simplification in step 2, the pair table has some duplication
+ a. classes 2, 3, 4, 5, 6 are the same - we can merge them
+ b. classes 10, 11, 12, 17 are the same - we can merge them
+
+
+ Class of
+ Leading Class of Trailing Char Class
+ Char
+
+ 1 [a] 7 8 9 [b]15 16 18
+
+ 1 X X X X X X X X X
+ [a] X
+ 7 X X
+ 8 X X
+ 9 X
+ [b] X
+ 15 X X X X
+ 16 X X X
+ 18 X X X X
+
+
+ 4. Now we use one bit to encode whether it is breakable, and use 2 bytes
+ for one row, then the bit table will look like:
+
+ 18 <- 1
+
+ 1 0000 0001 1111 1111 = 0x01FF
+ [a] 0000 0000 0000 0010 = 0x0002
+ 7 0000 0000 0000 0110 = 0x0006
+ 8 0000 0000 0100 0010 = 0x0042
+ 9 0000 0000 0000 0010 = 0x0002
+ [b] 0000 0000 0000 0010 = 0x0042
+ 15 0000 0001 0101 0010 = 0x0152
+ 16 0000 0001 1000 0010 = 0x0182
+ 18 0000 0001 1100 0010 = 0x01C2
+
+*/
+
+// Simplified pair table: one 16-bit row per merged leading class (index
+// 0..8). ClassesToPair() selects a row by the leading class and tests the
+// bit numbered by the trailing class.
+// NOTE(review): the entry at index 5 is 0x0042, while the derivation comment
+// above spells that row's bit pattern as "0000 0000 0000 0010" (= 0x0002);
+// confirm which of the two is intended.
+static uint16_t gJISx4051SimplifiedPair[9] = {
+ 0x01FF, 0x0002, 0x0006, 0x0042, 0x0002, 0x0042, 0x0152, 0x0182, 0x01C2
+};
+
+// Looks up the simplified pair table for a (leading, trailing) class pair.
+//   aCls1 - merged class of the leading character; selects the table row.
+//   aCls2 - merged class of the trailing character; selects the bit.
+// Returns non-zero when that bit is set in gJISx4051SimplifiedPair.
+// Both classes must be below 9, the number of rows/bits in the table.
+PRBool XXXX::ClassesToPair(nsJISx4051Cls aCls1, nsJISx4051Cls aCls2)
+{
+  NS_ASSERTION((aCls1 < 9), "invalid class");
+  NS_ASSERTION((aCls2 < 9), "invalid class");
+  return (0 != (gJISx4051SimplifiedPair[aCls1] & (1L << aCls2)));
+}
+
+
+// True when u is an ASCII digit, i.e. within U+0030 ('0') .. U+0039 ('9').
+// The argument is evaluated twice, so it must not have side effects.
+#define X4051_IS_DIGIT(u) ((0x0030 <= (u)) && ((u) <= 0x0039))
+
+// Returns the merged JIS X 4051 class of aChar.
+//   aChar   - the character to classify.
+//   aBefore - the character preceding aChar (0 when there is none).
+//   aAfter  - the character following aChar (0 when there is none).
+// Lookup order: the digit-separator special case, then the single-character
+// table (gSingle), then the range table (gRange); characters found in
+// neither table are reported as class 15.
+nsJISx4051Cls XXXX::GetClass(
+    PRUnichar aChar, PRUnichar aBefore = 0, PRUnichar aAfter = 0)
+{
+  // Special case for class 15: ',' (0x2C) or '.' (0x2E) that has a digit on
+  // both sides.
+  if (((0x2C == aChar) || (0x2E == aChar)) &&
+      X4051_IS_DIGIT(aBefore) && X4051_IS_DIGIT(aAfter))
+  {
+    return kJISx4051Cls_15;
+  }
+
+  nsJISx4051Cls cls;
+  if (gSingle->Lookup(aChar, &cls))
+    return cls;
+
+  if (gRange->Lookup(aChar, &cls))
+    return cls;
+
+  // Not listed anywhere: fall back to class 15.
+  return kJISx4051Cls_15;
+}
+
+
+// JIS X 4051 character classes after merging. Classes that share a value
+// here share one row/bit of the simplified pair table. Classes 13, 14, 19
+// and 20 are deliberately absent.
+typedef enum {
+  // row 0
+  kJISx4051Cls_1 = 0,
+
+  // row 1: merged group [a]
+  kJISx4051Cls_2 = 1,
+  kJISx4051Cls_3 = 1,
+  kJISx4051Cls_4 = 1,
+  kJISx4051Cls_5 = 1,
+  kJISx4051Cls_6 = 1,
+
+  // rows 2..4
+  kJISx4051Cls_7 = 2,
+  kJISx4051Cls_8 = 3,
+  kJISx4051Cls_9 = 4,
+
+  // row 5: merged group [b]
+  kJISx4051Cls_10 = 5,
+  kJISx4051Cls_11 = 5,
+  kJISx4051Cls_12 = 5,
+  kJISx4051Cls_17 = 5,
+
+  // rows 6..8
+  kJISx4051Cls_15 = 6,
+  kJISx4051Cls_16 = 7,
+  kJISx4051Cls_18 = 8
+} nsJISx4051Cls;
+
+
+ // Table 2
+ YYYY(kJISx4051Cls_1 , 0x0028),
+ YYYY(kJISx4051Cls_1 , 0x005B),
+ YYYY(kJISx4051Cls_1 , 0x007B),
+ YYYY(kJISx4051Cls_1 , 0x2018),
+ YYYY(kJISx4051Cls_1 , 0x201B),
+ YYYY(kJISx4051Cls_1 , 0x201C),
+ YYYY(kJISx4051Cls_1 , 0x201F),
+ YYYY(kJISx4051Cls_1 , 0x3008),
+ YYYY(kJISx4051Cls_1 , 0x300A),
+ YYYY(kJISx4051Cls_1 , 0x300C),
+ YYYY(kJISx4051Cls_1 , 0x300E),
+ YYYY(kJISx4051Cls_1 , 0x3010),
+ YYYY(kJISx4051Cls_1 , 0x3014),
+ YYYY(kJISx4051Cls_1 , 0x3016),
+ YYYY(kJISx4051Cls_1 , 0x3018),
+ YYYY(kJISx4051Cls_1 , 0x301A),
+ YYYY(kJISx4051Cls_1 , 0x301D),
+
+ // Table 3
+ YYYY(kJISx4051Cls_2 , 0x0029),
+ YYYY(kJISx4051Cls_2 , 0x002C),
+ YYYY(kJISx4051Cls_2 , 0x005D),
+ YYYY(kJISx4051Cls_2 , 0x007D),
+ YYYY(kJISx4051Cls_2 , 0x2019),
+ YYYY(kJISx4051Cls_2 , 0x201A),
+ YYYY(kJISx4051Cls_2 , 0x201D),
+ YYYY(kJISx4051Cls_2 , 0x201E),
+ YYYY(kJISx4051Cls_2 , 0x3001),
+ YYYY(kJISx4051Cls_2 , 0x3009),
+ YYYY(kJISx4051Cls_2 , 0x300B),
+ YYYY(kJISx4051Cls_2 , 0x300D),
+ YYYY(kJISx4051Cls_2 , 0x300F),
+ YYYY(kJISx4051Cls_2 , 0x3011),
+ YYYY(kJISx4051Cls_2 , 0x3015),
+ YYYY(kJISx4051Cls_2 , 0x3017),
+ YYYY(kJISx4051Cls_2 , 0x3019),
+ YYYY(kJISx4051Cls_2 , 0x301B),
+ YYYY(kJISx4051Cls_2 , 0x301E),
+ YYYY(kJISx4051Cls_2 , 0x301F),
+
+ // Table 4
+ YYYY(kJISx4051Cls_3 , 0x203C),
+ YYYY(kJISx4051Cls_3 , 0x2044),
+ YYYY(kJISx4051Cls_3 , 0x301C),
+ YYYY(kJISx4051Cls_3 , 0x3041),
+ YYYY(kJISx4051Cls_3 , 0x3043),
+ YYYY(kJISx4051Cls_3 , 0x3045),
+ YYYY(kJISx4051Cls_3 , 0x3047),
+ YYYY(kJISx4051Cls_3 , 0x3049),
+ YYYY(kJISx4051Cls_3 , 0x3063),
+ YYYY(kJISx4051Cls_3 , 0x3083),
+ YYYY(kJISx4051Cls_3 , 0x3085),
+ YYYY(kJISx4051Cls_3 , 0x3087),
+ YYYY(kJISx4051Cls_3 , 0x308E),
+ YYYY(kJISx4051Cls_3 , 0x309D),
+ YYYY(kJISx4051Cls_3 , 0x309E),
+ YYYY(kJISx4051Cls_3 , 0x30A1),
+ YYYY(kJISx4051Cls_3 , 0x30A3),
+ YYYY(kJISx4051Cls_3 , 0x30A5),
+ YYYY(kJISx4051Cls_3 , 0x30A7),
+ YYYY(kJISx4051Cls_3 , 0x30A9),
+ YYYY(kJISx4051Cls_3 , 0x30C3),
+ YYYY(kJISx4051Cls_3 , 0x30E3),
+ YYYY(kJISx4051Cls_3 , 0x30E5),
+ YYYY(kJISx4051Cls_3 , 0x30E7),
+ YYYY(kJISx4051Cls_3 , 0x30EE),
+ YYYY(kJISx4051Cls_3 , 0x30F5),
+ YYYY(kJISx4051Cls_3 , 0x30F6),
+ YYYY(kJISx4051Cls_3 , 0x30FC),
+ YYYY(kJISx4051Cls_3 , 0x30FD),
+ YYYY(kJISx4051Cls_3 , 0x30FE),
+
+ // Table 5
+ YYYY(kJISx4051Cls_4 , 0x0021),
+ YYYY(kJISx4051Cls_4 , 0x003F),
+
+ // Table 6
+ YYYY(kJISx4051Cls_5 , 0x003A),
+ YYYY(kJISx4051Cls_5 , 0x003B),
+ YYYY(kJISx4051Cls_5 , 0x30FB),
+
+ // Table 7
+ YYYY(kJISx4051Cls_6 , 0x002E),
+ YYYY(kJISx4051Cls_6 , 0x3002),
+
+ // Table 8
+ YYYY(kJISx4051Cls_7 , 0x2014),
+ YYYY(kJISx4051Cls_7 , 0x2024),
+ YYYY(kJISx4051Cls_7 , 0x2025),
+ YYYY(kJISx4051Cls_7 , 0x2026),
+
+ // Table 9
+ YYYY(kJISx4051Cls_8 , 0x0024),
+ YYYY(kJISx4051Cls_8 , 0x00A3),
+ YYYY(kJISx4051Cls_8 , 0x00A5),
+ YYYY(kJISx4051Cls_8 , 0x2116),
+
+ // Table 10
+ YYYY(kJISx4051Cls_9 , 0x0025),
+ YYYY(kJISx4051Cls_9 , 0x00A2),
+ YYYY(kJISx4051Cls_9 , 0x00B0),
+ YYYY(kJISx4051Cls_9 , 0x2030),
+ YYYY(kJISx4051Cls_9 , 0x2031),
+ YYYY(kJISx4051Cls_9 , 0x2032),
+ YYYY(kJISx4051Cls_9 , 0x2033),
+
+ // Table 1
+ YYYY(kJISx4051Cls_10, 0x3000),
+
+ // Table 1
+ ZZZZ(kJISx4051Cls_11, 0x3000),
+
+
+
+
diff --git a/intl/lwbrk/moz.build b/intl/lwbrk/moz.build
new file mode 100644
index 0000000000..b47a49e279
--- /dev/null
+++ b/intl/lwbrk/moz.build
@@ -0,0 +1,40 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# Tests for this directory live in the gtest/ subdirectory.
+TEST_DIRS += ["gtest"]
+
+# Public headers, exported so they are included as "mozilla/intl/<name>.h".
+EXPORTS.mozilla.intl += [
+ "LineBreaker.h",
+ "WordBreaker.h",
+]
+
+# Platform-independent breakers.
+UNIFIED_SOURCES += [
+ "LineBreaker.cpp",
+ "WordBreaker.cpp",
+]
+
+# Pick the file that provides NS_GetComplexLineBreaks() for the current
+# widget toolkit: Pango on gtk, Uniscribe on windows, CoreFoundation on
+# cocoa, and the rule-based breaker otherwise.
+if CONFIG["MOZ_WIDGET_TOOLKIT"] == "gtk":
+ SOURCES += [
+ "nsPangoBreaker.cpp",
+ ]
+ CXXFLAGS += CONFIG["MOZ_PANGO_CFLAGS"]
+elif CONFIG["MOZ_WIDGET_TOOLKIT"] == "windows":
+ SOURCES += [
+ "nsUniscribeBreaker.cpp",
+ ]
+elif CONFIG["MOZ_WIDGET_TOOLKIT"] == "cocoa":
+ UNIFIED_SOURCES += [
+ "nsCarbonBreaker.cpp",
+ ]
+else:
+ SOURCES += [
+ "nsRuleBreaker.cpp",
+ ]
+ SOURCES += [
+ "rulebrk.c",
+ ]
+
+FINAL_LIBRARY = "xul"
diff --git a/intl/lwbrk/nsCarbonBreaker.cpp b/intl/lwbrk/nsCarbonBreaker.cpp
new file mode 100644
index 0000000000..d1d81b2578
--- /dev/null
+++ b/intl/lwbrk/nsCarbonBreaker.cpp
@@ -0,0 +1,43 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include <CoreFoundation/CoreFoundation.h>
+#include <stdint.h>
+#include "nsDebug.h"
+#include "nscore.h"
+
+/**
+ * CoreFoundation implementation of the complex line breaker.
+ *
+ * Clears aBreakBefore[0..aLength) and then sets the entry at the start
+ * offset of every line-break token reported by CFStringTokenizer, except
+ * for a token starting at offset 0. If the CFString or the tokenizer cannot
+ * be created, the array is left all-zero.
+ */
+void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
+                             uint8_t* aBreakBefore) {
+  NS_ASSERTION(aText, "aText shouldn't be null");
+
+  memset(aBreakBefore, 0, aLength * sizeof(uint8_t));
+
+  // Wrap the caller's buffer without copying it; kCFAllocatorNull is passed
+  // as the contents deallocator.
+  CFStringRef cfText = ::CFStringCreateWithCharactersNoCopy(
+      kCFAllocatorDefault, reinterpret_cast<const UniChar*>(aText), aLength,
+      kCFAllocatorNull);
+  if (!cfText) {
+    return;
+  }
+
+  CFStringTokenizerRef tokenizer = ::CFStringTokenizerCreate(
+      kCFAllocatorDefault, cfText, ::CFRangeMake(0, aLength),
+      kCFStringTokenizerUnitLineBreak, nullptr);
+  if (tokenizer) {
+    for (CFStringTokenizerTokenType tokenType =
+             ::CFStringTokenizerAdvanceToNextToken(tokenizer);
+         tokenType != kCFStringTokenizerTokenNone;
+         tokenType = ::CFStringTokenizerAdvanceToNextToken(tokenizer)) {
+      CFRange tokenRange = ::CFStringTokenizerGetCurrentTokenRange(tokenizer);
+      // The leading edge of the text is never reported as a break.
+      if (tokenRange.location != 0) {
+        aBreakBefore[tokenRange.location] = true;
+      }
+    }
+    ::CFRelease(tokenizer);
+  }
+
+  ::CFRelease(cfText);
+}
diff --git a/intl/lwbrk/nsComplexBreaker.h b/intl/lwbrk/nsComplexBreaker.h
new file mode 100644
index 0000000000..0b508a4645
--- /dev/null
+++ b/intl/lwbrk/nsComplexBreaker.h
@@ -0,0 +1,18 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef nsComplexBreaker_h__
+#define nsComplexBreaker_h__
+
+#include "nsString.h"
+
+/**
+ * Find line break opportunities in aText[] of aLength characters,
+ * filling boolean values indicating line break opportunities for
+ * corresponding characters in aBreakBefore[] on return.
+ */
+void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
+ uint8_t* aBreakBefore);
+
+#endif /* nsComplexBreaker_h__ */
diff --git a/intl/lwbrk/nsLWBrkCIID.h b/intl/lwbrk/nsLWBrkCIID.h
new file mode 100644
index 0000000000..b612155ef0
--- /dev/null
+++ b/intl/lwbrk/nsLWBrkCIID.h
@@ -0,0 +1,28 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef nsLWBrkCIID_h__
+#define nsLWBrkCIID_h__
+
+// Class ID {2BF64764-997F-450D-AF96-3028D1A902B0} and contract ID of the
+// "lbrk" component.
+#define NS_LBRK_CID                                   \
+  {                                                   \
+    0x2bf64764, 0x997f, 0x450d,                       \
+    { 0xaf, 0x96, 0x30, 0x28, 0xd1, 0xa9, 0x2, 0xb0 } \
+  }
+
+#define NS_LBRK_CONTRACTID "@mozilla.org/intl/lbrk;1"
+
+// Class ID {2BF64765-997F-450D-AF96-3028D1A902B0} and contract ID of the
+// "wbrk" component.
+#define NS_WBRK_CID                                   \
+  {                                                   \
+    0x2bf64765, 0x997f, 0x450d,                       \
+    { 0xaf, 0x96, 0x30, 0x28, 0xd1, 0xa9, 0x2, 0xb0 } \
+  }
+
+#define NS_WBRK_CONTRACTID "@mozilla.org/intl/wbrk;1"
+
+#endif
diff --git a/intl/lwbrk/nsPangoBreaker.cpp b/intl/lwbrk/nsPangoBreaker.cpp
new file mode 100644
index 0000000000..ca3d3d54c9
--- /dev/null
+++ b/intl/lwbrk/nsPangoBreaker.cpp
@@ -0,0 +1,58 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsComplexBreaker.h"
+
+#include <pango/pango-break.h>
+#include "nsUTF8Utils.h"
+#include "nsString.h"
+#include "nsTArray.h"
+
+/**
+ * Pango implementation of the complex line breaker.
+ *
+ * Converts aText to UTF-8, asks pango_get_log_attrs() for per-character
+ * attributes and copies each character's is_line_break flag into
+ * aBreakBefore[] at the UTF-16 offset where that character starts.
+ *
+ * @param aText         UTF-16 text, aLength code units long.
+ * @param aLength       number of code units in aText / entries in
+ *                      aBreakBefore.
+ * @param aBreakBefore  output; cleared first, then filled as described.
+ */
+void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
+                             uint8_t* aBreakBefore) {
+  NS_ASSERTION(aText, "aText shouldn't be null");
+
+  memset(aBreakBefore, false, aLength * sizeof(uint8_t));
+
+  AutoTArray<PangoLogAttr, 2000> attrBuffer;
+  // XXX(Bug 1631371) Check if this should use a fallible operation as it
+  // pretended earlier.
+  attrBuffer.AppendElements(aLength + 1);
+
+  NS_ConvertUTF16toUTF8 aUTF8(aText, aLength);
+
+  const gchar* p = aUTF8.Data();
+  const gchar* end = p + aUTF8.Length();
+  uint32_t u16Offset = 0;
+
+  static PangoLanguage* language = pango_language_from_string("en");
+
+  // The u16Offset < aLength guards keep every write inside aBreakBefore even
+  // if the UTF-8 and UTF-16 walks should ever disagree about the number of
+  // characters.
+  while (p < end && u16Offset < aLength) {
+    PangoLogAttr* attr = attrBuffer.Elements();
+    pango_get_log_attrs(p, end - p, -1, language, attr, attrBuffer.Length());
+
+    while (p < end && u16Offset < aLength) {
+      aBreakBefore[u16Offset] = attr->is_line_break;
+      // In UTF-16 the high surrogate comes first. A well-formed pair is a
+      // single character on the UTF-8 side, so its low half gets no break
+      // opportunity of its own and is skipped here.
+      // NOTE(review): assumes the converter emits one UTF-8 character per
+      // unpaired surrogate -- confirm against NS_ConvertUTF16toUTF8.
+      if (NS_IS_HIGH_SURROGATE(aText[u16Offset]) && u16Offset + 1 < aLength &&
+          NS_IS_LOW_SURROGATE(aText[u16Offset + 1])) {
+        aBreakBefore[++u16Offset] = false;  // Skip low surrogate
+      }
+      ++u16Offset;
+
+      // We're iterating over text obtained from NS_ConvertUTF16toUTF8,
+      // so we know we have valid UTF-8 and don't need to check for
+      // errors.
+      uint32_t ch = UTF8CharEnumerator::NextChar(&p, end);
+      ++attr;
+
+      if (!ch) {
+        // pango_break (pango 1.16.2) only analyses text before the
+        // first NUL (but sets one extra attr). Workaround loop to call
+        // pango_break again to analyse after the NUL is done somewhere else
+        // (gfx/thebes/gfxFontconfigFonts.cpp: SetupClusterBoundaries()).
+        // So, we do the same here for pango_get_log_attrs.
+        break;
+      }
+    }
+  }
+}
diff --git a/intl/lwbrk/nsRuleBreaker.cpp b/intl/lwbrk/nsRuleBreaker.cpp
new file mode 100644
index 0000000000..4c1c9aff90
--- /dev/null
+++ b/intl/lwbrk/nsRuleBreaker.cpp
@@ -0,0 +1,17 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsComplexBreaker.h"
+
+#define TH_UNICODE
+#include "rulebrk.h"
+
+/**
+ * Rule-based implementation of the complex line breaker: for every offset
+ * the entry in aBreakBefore[] is set to whether TrbWordBreakPos() returns
+ * exactly 0 for that position.
+ */
+void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
+                             uint8_t* aBreakBefore) {
+  NS_ASSERTION(aText, "aText shouldn't be null");
+
+  uint32_t pos = 0;
+  while (pos < aLength) {
+    // Characters before pos are the left context, those from pos on the
+    // right context.
+    const int brk = TrbWordBreakPos(aText, pos, aText + pos, aLength - pos);
+    aBreakBefore[pos] = (brk == 0);
+    ++pos;
+  }
+}
diff --git a/intl/lwbrk/nsUniscribeBreaker.cpp b/intl/lwbrk/nsUniscribeBreaker.cpp
new file mode 100644
index 0000000000..503b756b61
--- /dev/null
+++ b/intl/lwbrk/nsUniscribeBreaker.cpp
@@ -0,0 +1,60 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsComplexBreaker.h"
+
+#include <windows.h>
+
+#include <usp10.h>
+
+#include "nsUTF8Utils.h"
+#include "nsString.h"
+#include "nsTArray.h"
+
+/**
+ * Uniscribe implementation of the complex line breaker.
+ *
+ * Splits aText into script items with ScriptItemize(), runs ScriptBreak()
+ * on each item and copies the fSoftBreak flag of every character into
+ * aBreakBefore[]. Offset 0 is never marked. On a Uniscribe failure the
+ * function returns early, leaving the entries written so far.
+ *
+ * @param aText         UTF-16 text, aLength code units long.
+ * @param aLength       number of code units / entries in aBreakBefore.
+ * @param aBreakBefore  output; cleared first, then filled as described.
+ */
+void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
+                             uint8_t* aBreakBefore) {
+  NS_ASSERTION(aText, "aText shouldn't be null");
+
+  int outItems = 0;
+  HRESULT result;
+  AutoTArray<SCRIPT_ITEM, 64> items;
+  char16ptr_t text = aText;
+
+  memset(aBreakBefore, false, aLength);
+
+  items.AppendElements(64);
+
+  do {
+    // ScriptItemize always appends a terminal item, so the buffer must hold
+    // cMaxItems + 1 entries: advertise one item fewer than is allocated.
+    result = ScriptItemize(text, aLength, items.Length() - 1, nullptr, nullptr,
+                           items.Elements(), &outItems);
+
+    if (result == E_OUTOFMEMORY) {
+      // XXX(Bug 1631371) Check if this should use a fallible operation as it
+      // pretended earlier.
+      items.AppendElements(items.Length());
+    }
+  } while (result == E_OUTOFMEMORY);
+
+  // Any other failure leaves items/outItems in an unspecified state.
+  if (result < 0) {
+    return;
+  }
+
+  for (int iItem = 0; iItem < outItems; ++iItem) {
+    uint32_t endOffset =
+        (iItem + 1 == outItems ? aLength : items[iItem + 1].iCharPos);
+    uint32_t startOffset = items[iItem].iCharPos;
+    AutoTArray<SCRIPT_LOGATTR, 64> sla;
+
+    // XXX(Bug 1631371) Check if this should use a fallible operation as it
+    // pretended earlier.
+    sla.AppendElements(endOffset - startOffset);
+
+    if (ScriptBreak(text + startOffset, endOffset - startOffset,
+                    &items[iItem].a, sla.Elements()) < 0)
+      return;
+
+    // We don't want to set a potential break position at the start of text;
+    // that's the responsibility of a higher level.
+    for (uint32_t j = startOffset ? 0 : 1; j + startOffset < endOffset; ++j) {
+      aBreakBefore[j + startOffset] = sla[j].fSoftBreak;
+    }
+  }
+}
diff --git a/intl/lwbrk/rulebrk.c b/intl/lwbrk/rulebrk.c
new file mode 100644
index 0000000000..d7574b929f
--- /dev/null
+++ b/intl/lwbrk/rulebrk.c
@@ -0,0 +1,388 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#define TH_UNICODE
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <assert.h>
+#include "th_char.h"
+/* ASCII-only helpers; both evaluate their argument more than once. */
+/* th_isalpha: true for 'a'..'z' and 'A'..'Z'. */
+#define th_isalpha(c) (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
+/* th_isspace: true only for a blank or a tab (line breaks do not count). */
+#define th_isspace(c) ((c) == ' ' || (c) == '\t')
+
+/*
+/////////////////////////////////////////////////
+// Thai character type array
+*/
+
+typedef unsigned short twb_t;
+extern const twb_t _TwbType[0x100 - 0xa0];
+
+/*
+// bit definition
+//
+// Each flag is one bit of a twb_t entry in _TwbType. Going by that table and
+// the TH_CHARNAME enum, the flags are assigned as follows:
+//   VRS, VRE, VRX - the characters d0..d9 (sara a .. sara uu) and e5
+//                   (lakkhangyao); VRA is an extra flag on some of them
+//   VLA, VLO, VLI - the characters e0..e4 (sara e .. maimalai)
+//   CC, CS        - the consonant range a1..ce
+//   VC            - ru, lu, wowaen, oang
+//   C2            - rorua, loling, wowaen
+//   CHB, CHE      - extra flags on consonants; TrbWordBreakPos refuses a
+//                   break between two C characters when the left one has
+//                   CHE or the right one has CHB
+//   M             - e6..ec; MT additionally marks e8..eb (maiek..maichattawa)
+//   T             - paiyannoi, phinthu, baht and ed..fb (incl. the digits)
+// The composite masks below are plain unions of these flags. TrbWordBreakPos
+// never breaks before a character in NB nor after a character in NE, and it
+// only handles characters whose type intersects A.
+*/
+
+#define VRS 0x0001
+#define VRE 0x0002
+#define VRX 0x0004
+
+#define VRA 0x0008
+
+#define VLA 0x0010
+#define VLO 0x0020
+#define VLI 0x0040
+
+#define VC 0x0080
+
+#define CC 0x0100
+#define CS 0x0200
+
+#define C2 0x0400
+#define CHB 0x0800
+#define CHE 0x1000
+
+#define MT 0x2000
+/*
+//_#define me 0x2000
+*/
+#define M 0x4000
+
+#define T 0x8000
+
+#define VL (VLA | VLO | VLI)
+#define VR (VRS | VRE | VRX)
+#define NE (VL | VRS)
+#define NB (VR | M)
+#define V (VL | VR)
+#define CX (CC | CS)
+#define C (CX | VC)
+#define A (C | V | M)
+
+/* Type flags of character c; c must satisfy th_isthai(c) so that the index
+   th_zcode(c) stays inside _TwbType. */
+#define twbtype(c) (_TwbType[th_zcode(c)])
+
+#ifndef TRUE
+# define TRUE 1
+# define FALSE 0
+#endif
+#define RETURN(b) return (b)
+
+/*
+/////////////////////////////////////////////////
+*/
+
+/*
+// Decides whether a word break may be placed in front of rstr[0].
+//
+//   pstr  - start of the text
+//   left  - number of characters available before the examined position
+//           (the left context is read from pstr + left backwards)
+//   rstr  - the character in front of which a break is considered
+//   right - number of characters available starting at rstr
+//
+// Returns -1 when no break is reported, 0 when a break directly before
+// rstr[0] is reported, and 1 or 2 from the two rules near the end that
+// return a positive value (TrbFollowing adds such a value to its position).
+//
+// A window of six characters, c(-3)..c(2), and their type flags t(-3)..t(2)
+// is built first; slots outside the text or beyond a character that is not
+// Thai / not in class A are zero.
+*/
+int TrbWordBreakPos(const th_char* pstr, int left, const th_char* rstr,
+ int right)
+/* const ThBreakIterator *it, const th_char **p)*/
+{
+ /*
+ //int left, right;
+ //const th_char *s = *p;
+ */
+ const th_char* lstr = pstr + left;
+ th_char _c[6];
+ twb_t _t[6];
+ /* c(i)/t(i) address the window with i in -3..2; index 0 is rstr[0]. */
+#define c(i) (_c[(i) + 3])
+#define t(i) (_t[(i) + 3])
+ int i, j;
+
+ /*
+ //left = s - it->begin;
+ */
+ if (left < 0) return -1;
+ /*
+ //right = (it->end == NULL) ? 4 : it->begin - s;
+ */
+ if (right < 1) return -1;
+
+ /*
+ // get c(0), t(0)
+ */
+ c(0) = rstr[0]; /* may be '\0' */
+ if (!th_isthai(c(0))) return -1;
+ t(0) = twbtype(c(0));
+ if (!(t(0) & A)) return -1;
+
+ /*
+ // get c(-1), t(-1)
+ */
+ if (left >= 1) {
+ c(-1) = lstr[-1];
+ if (!th_isthai(c(-1))) return 0;
+ t(-1) = twbtype(c(-1));
+ if (!(t(-1) & A)) return 0; /* handle punctuation marks here */
+ } else {
+ c(-1) = 0;
+ t(-1) = 0;
+ }
+
+ /*
+ // get c(1..2), t(1..2)
+ */
+ /* "right = i--" truncates the right context at slot i and steps i back, so
+    the loop's i++ revisits the same slot and zeroes it via "i >= right". */
+ for (i = 1; i <= 2; i++) {
+ if (i >= right) {
+ c(i) = 0;
+ t(i) = 0;
+ } else {
+ c(i) = rstr[i]; /* may be '\0'; */
+ if (!th_isthai(c(i)))
+ right = i--;
+ else {
+ t(i) = twbtype(c(i));
+ if (!(t(i) & A)) right = i--;
+ }
+ }
+ }
+ /*
+ // get c(-2..-3), t(-2..-3)
+ */
+ /* j walks backwards through the text while i is the window slot being
+    filled. Setting left = 0 makes every later iteration take the zero-fill
+    branch. When slot i + 1 holds an MT character next to a VR character,
+    that slot is overwritten with the current character and i is not
+    advanced. */
+ for (i = -2, j = -2; i >= -3; j--) {
+ if (j < -left) {
+ c(i) = 0;
+ t(i) = 0;
+ i--;
+ } else {
+ c(i) = lstr[j];
+ if (!th_isthai(c(i)))
+ left = 0;
+ else {
+ t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0);
+ if (!(t(i) & A))
+ left = 0;
+ else {
+ if ((t(i + 1) & MT) && ((t(i) & VR) || (t(i + 2) & VR))) {
+ c(i + 1) = c(i);
+ t(i + 1) = t(i);
+ } else
+ i--;
+ }
+ }
+ }
+ }
+
+ /*
+ // prohibit the unlikely
+ */
+ if ((t(-1) & C) && (t(0) & C)) {
+ if ((t(-1) & CHE) || (t(0) & CHB)) return -1;
+ }
+ /*
+ // special case : vlao, C/ sara_a|aa, !sara_a
+ */
+ /* NOTE(review): c(-0) below is the same element as c(0), so that operand
+    tests the current character; given the rule description above a negative
+    index may have been intended -- confirm before changing. */
+ if ((t(-3) & (VLA | VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) &&
+ (c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA))
+ return 0;
+
+ /*
+ // prohibit break
+ */
+ if (t(0) & NB) return -1;
+ if (t(-1) & NE) return -1;
+
+ /*
+ // apply 100% rules
+ */
+ if (t(-1) & VRE) {
+ if (c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0;
+ return -1; /* usually too short syllable, part of word */
+ }
+
+ if (t(-2) & VRE) return -1;
+
+ if ((t(0) & C) && (t(1) & (VR | MT)) &&
+ (c(2) != TH_THANTHAKHAT)) { /*?C, NB */
+ if ((t(-1) & (VRS | VRX)) && c(1) == TH_SARA_I) return -1; /* exception */
+ if (t(-1) & (V | M)) return 0; /* !C/ C, NB */
+ if (t(-2) & VRS) return 0; /* VRS, C / C, NB */
+ if (!(t(0) & C2) && c(1) == TH_SARA_I) { /* / !C2 or /c, sara_i */
+ if (t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */
+ if (t(-2) & VC) return 0; /* VC, C / C, NB ? 100% */
+ }
+ }
+ if ((t(-1) & VRX) && (t(0) & CC)) return 0; /* VRX/ CC */
+ if ((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V | M)))
+ return 0; /* VRS, C/ !C */
+
+ if ((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) {
+ if ((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */
+ if ((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */
+ }
+ /*
+ // apply 90% rules
+ */
+ if (t(0) & VL) return 0;
+ if (t(1) & VL) return -1;
+ if (c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING)
+ return 0;
+
+ /*
+ //return -1;
+ // apply 80% rules
+ */
+ if (t(0) & CHE) {
+ if ((t(-2) & VRS) && (t(-1) & C)) return 0; /* VRS, C/ CHE */
+ /*if(t(-1) & VRX) return 0; // VRX/ CHE */
+ if (t(-1) & VC) return 0; /* VC/ CHE */
+ }
+ if (t(-1) & CHB) {
+ if ((t(0) & C) && (t(1) & VR)) return 0; /* CHB/ CC, VR */
+ if (t(0) & VC) return 0; /* CHB/ VC */
+ }
+
+ if ((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */
+ if (t(-2) & VLI)
+ return 0; /* VLI,C/C,VR .*/
+ else { /* vlao, C ? C , VR */
+ if (c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */
+ if (t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */
+ if (!(t(1) & VRA)) return 0; /* VLA, C/ C, !vca */
+ }
+ }
+ /* C,MT,C */
+ if ((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1;
+
+ return -1;
+}
+
+/*
+// Scans forward from begin + offset and returns the offset (relative to
+// begin) at which the scan stops.
+//
+//   begin  - start of the text
+//   length - number of characters in the text
+//   offset - position to start scanning from
+//
+// The scan never passes begin + length and also stops at a NUL character.
+*/
+int TrbFollowing(const th_char* begin, int length, int offset)
+/*
+//(ThBreakIterator *this, int offset)
+*/
+{
+ const th_char* w = begin + offset;
+ const th_char* end = begin + length;
+ /* Skip leading blanks/tabs. */
+ while (w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++;
+
+ /* A run of non-Thai, non-space characters: stop after it when it contained
+    an ASCII letter, or when it is followed by the end of the text or by a
+    space. */
+ if (w < end && *w && !th_isthai(*w)) {
+ int english = FALSE;
+ while (w < end && *w && !th_isthai(*w) && !th_isspace(*w)) {
+ if (th_isalpha(*w)) english = TRUE;
+ w++;
+ }
+ if (english || w == end || (!th_isthai(*w) && th_isspace(*w)))
+ return w - begin;
+ }
+ if (w == end || *w == 0 || !th_isthai(*w)) return w - begin;
+ /* w is on a Thai character: step over it, then advance one character at a
+    time until TrbWordBreakPos reports something other than -1 or the Thai
+    run ends. */
+ w++;
+ if (w < end && *w && th_isthai(*w)) {
+ int brk = TrbWordBreakPos(begin, w - begin, w, end - w);
+ while (brk < 0) {
+ w++;
+ if (w == end || *w == 0 || !th_isthai(*w)) break;
+ brk = TrbWordBreakPos(begin, w - begin, w, end - w);
+ }
+ /* A positive result is added to the current position. */
+ if (brk > 0) w += brk;
+ }
+ /* Swallow trailing characters that are neither Thai, ASCII letters nor
+    spaces. */
+ if (w < end && *w && !th_isthai(*w)) {
+ while (w < end && *w && !th_isthai(*w) && !th_isalpha(*w) &&
+ !th_isspace(*w))
+ w++;
+ }
+ return w - begin;
+}
+
+/*
+/////////////////////////////////////////////////
+*/
+/*
+// Type flags for each character of the Thai block, indexed by th_zcode(c)
+// (see the twbtype macro). The two-digit code in front of every entry is
+// 0xa0 plus the index; the name after it is the TH_CHARNAME enumerator at
+// the same position in th_char.h.
+*/
+const twb_t _TwbType[0x100 - 0xa0] = {
+#if 0
+/* 80 */ T,
+/* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+/* 90 */ T,
+/* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+#endif
+ /* a0 TH_THAIBEGIN */ 0,
+ /* a1 TH_KOKAI */ CS,
+ /* a2 TH_KHOKHAI */ CS | CHE,
+ /* a3 TH_KHOKHUAT */ CC | CHE,
+ /* a4 TH_KHOKHWAI */ CS | CHE,
+ /* a5 TH_KHOKHON */ CC | CHE,
+ /* a6 TH_KHORAKHANG */ CS,
+ /* a7 TH_NGONGU */ CS | CHB,
+ /* a8 TH_CHOCHAN */ CS,
+ /* a9 TH_CHOCHING */ CC | CHE,
+ /* aa TH_CHOCHANG */ CS,
+ /* ab TH_SOSO */ CC | CHE,
+ /* ac TH_CHOCHOE */ CC | CHB | CHE,
+ /* ad TH_YOYING */ CS | CHB,
+ /* ae TH_DOCHADA */ CS | CHB,
+ /* af TH_TOPATAK */ CS | CHB,
+ /* b0 TH_THOTHAN */ CS,
+ /* b1 TH_THONANGMONTHO */ CS | CHB | CHE,
+ /* b2 TH_THOPHUTHAO */ CS | CHB | CHE,
+ /* b3 TH_NONEN */ CS | CHB,
+ /* b4 TH_DODEK */ CS,
+ /* b5 TH_TOTAO */ CS,
+ /* b6 TH_THOTHUNG */ CS,
+ /* b7 TH_THOTHAHAN */ CS,
+ /* b8 TH_THOTHONG */ CS,
+ /* b9 TH_NONU */ CS,
+ /* ba TH_BOBAIMAI */ CS,
+ /* bb TH_POPLA */ CS,
+ /* bc TH_PHOPHUNG */ CC | CHE,
+ /* bd TH_FOFA */ CC | CHE,
+ /* be TH_PHOPHAN */ CS,
+ /* bf TH_FOFAN */ CS,
+ /* c0 TH_PHOSAMPHAO */ CS | CHE,
+ /* c1 TH_MOMA */ CS,
+ /* c2 TH_YOYAK */ CS,
+ /* c3 TH_RORUA */ CS | C2 | CHE, /* ? add CHE */
+ /* c4 TH_RU */ VC | CHE,
+ /* c5 TH_LOLING */ CS | C2,
+ /* c6 TH_LU */ VC | CHE,
+ /* c7 TH_WOWAEN */ VC | C2,
+ /* c8 TH_SOSALA */ CS,
+ /* c9 TH_SORUSI */ CS | CHB,
+ /* ca TH_SOSUA */ CS | CHE,
+ /* cb TH_HOHIP */ CC | CHE,
+ /* cc TH_LOCHULA */ CS | CHB | CHE,
+ /* cd TH_OANG */ VC,
+ /* ce TH_HONOKHUK */ CC | CHE,
+ /* cf TH_PAIYANNOI */ T,
+ /* d0 TH_SARA_A */ VRE | VRA,
+ /* d1 TH_MAIHANAKAT */ VRS,
+ /* d2 TH_SARA_AA */ VRX | VRA,
+ /* d3 TH_SARA_AM */ VRE,
+ /* d4 TH_SARA_I */ VRX | VRA,
+ /* d5 TH_SARA_II */ VRX | VRA,
+ /* d6 TH_SARA_UE */ VRS,
+ /* d7 TH_SARA_UEE */ VRS | VRA,
+ /* d8 TH_SARA_U */ VRX,
+ /* d9 TH_SARA_UU */ VRX,
+ /* da TH_PHINTHU */ T,
+ /* db TH_REM_CHERNG_ */ 0,
+ /* dc TH_TAC_WBRK_ */ 0,
+ /* dd TH_UNDEF_DD */ 0,
+ /* de TH_UNDEF_DE */ 0,
+ /* df TH_BAHT */ T,
+ /* e0 TH_SARA_E */ VLA,
+ /* e1 TH_SARA_AE */ VLO,
+ /* e2 TH_SARA_O */ VLO,
+ /* e3 TH_MAIMUAN */ VLI,
+ /* e4 TH_MAIMALAI */ VLI,
+ /* e5 TH_LAKKHANGYAO */ VRE,
+ /* e6 TH_MAIYAMOK */ M,
+ /* e7 TH_MAITAIKHU */ M,
+ /* e8 TH_MAIEK */ M | MT,
+ /* e9 TH_MAITHO */ M | MT,
+ /* ea TH_MAITRI */ M | MT,
+ /* eb TH_MAICHATTAWA */ M | MT,
+ /* ec TH_THANTHAKHAT */ M,
+ /* ed TH_NIKHAHIT */ T,
+ /* ee TH_YAMAKKAN */ T,
+ /* ef TH_FONGMAN */ T,
+ /* f0 TH_THAIZERO */ T,
+ /* f1 TH_THAIONE */ T,
+ /* f2 TH_THAITWO */ T,
+ /* f3 TH_THAITHREE */ T,
+ /* f4 TH_THAIFOUR */ T,
+ /* f5 TH_THAIFIVE */ T,
+ /* f6 TH_THAISIX */ T,
+ /* f7 TH_THAISEVEN */ T,
+ /* f8 TH_THAIEIGHT */ T,
+ /* f9 TH_THAININE */ T,
+ /* fa TH_ANGKHANKHU */ T,
+ /* fb TH_KHOMUT */ T,
+ /* fc TH_UNDEF_FC */ 0,
+ /* fd TH_UNDEF_FD */ 0,
+ /* fe TH_UNDEF_FE */ 0,
+ /* ff TH_THAIEND */ 0};
diff --git a/intl/lwbrk/rulebrk.h b/intl/lwbrk/rulebrk.h
new file mode 100644
index 0000000000..c1f2e0957b
--- /dev/null
+++ b/intl/lwbrk/rulebrk.h
@@ -0,0 +1,26 @@
+/*
+Copyright (c) 1999 Samphan Raruenrom <samphan@thai.com>
+Permission to use, copy, modify, distribute and sell this software
+and its documentation for any purpose is hereby granted without fee,
+provided that the above copyright notice appear in all copies and
+that both that copyright notice and this permission notice appear
+in supporting documentation. Samphan Raruenrom makes no
+representations about the suitability of this software for any
+purpose. It is provided "as is" without express or implied warranty.
+*/
+#ifndef __RULEBRK_H__
+#define __RULEBRK_H__
+#include "th_char.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * TrbWordBreakPos: examines the position in front of rstr[0], with `left`
+ * characters of context available before it (read from pstr + left
+ * backwards) and `right` characters available from rstr on. Returns -1, 0
+ * or a small positive value.
+ */
+int TrbWordBreakPos(const th_char* pstr, int left, const th_char* rstr,
+ int right);
+/*
+ * TrbFollowing: scans forward from begin + offset within a text of `length`
+ * characters and returns the offset, relative to begin, where it stops.
+ */
+int TrbFollowing(const th_char* begin, int length, int offset);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/intl/lwbrk/th_char.h b/intl/lwbrk/th_char.h
new file mode 100644
index 0000000000..a088228fff
--- /dev/null
+++ b/intl/lwbrk/th_char.h
@@ -0,0 +1,133 @@
+/*
+Copyright (c) 1999 Samphan Raruenrom <samphan@thai.com>
+Permission to use, copy, modify, distribute and sell this software
+and its documentation for any purpose is hereby granted without fee,
+provided that the above copyright notice appear in all copies and
+that both that copyright notice and this permission notice appear
+in supporting documentation. Samphan Raruenrom makes no
+representations about the suitability of this software for any
+purpose. It is provided "as is" without express or implied warranty.
+*/
+#ifndef __TH_CHAR_H__
+#define __TH_CHAR_H__
+
+/* Character type of the single-byte (non-TH_UNICODE) build. */
+typedef unsigned char tis_char;
+
+#ifdef TH_UNICODE
+/*
+ * The char16_t type is only usable in C++ code, so we need this ugly hack to
+ * select a binary compatible C type for C code (such as rulebrk.c) to use.
+ * The C branch relies on <stdint.h> having been included before this header.
+ */
+# ifdef __cplusplus
+typedef char16_t th_char;
+# else
+typedef uint16_t th_char;
+# endif
+/* First code of the Thai block and the range test for it (UTF-16 build). */
+# define TH_THAIBEGIN_ 0x0e00
+# define th_isthai(c) (0x0e00 <= (c) && (c) <= 0x0e5f)
+#else
+typedef tis_char th_char;
+/* First code of the Thai block and the range test for it (8-bit build). */
+# define TH_THAIBEGIN_ 0xa0
+# define th_isthai(c) ((c) >= 0xa0)
+#endif
+/* Zero-based position of c inside the Thai block. */
+#define th_zcode(c) ((c)-TH_THAIBEGIN_)
+
+/*
+ * Names for the characters of the Thai block. Only the first enumerator has
+ * an explicit value (TH_THAIBEGIN_); every following one is the previous
+ * value plus one, so the order of this list must not be changed.
+ */
+enum TH_CHARNAME {
+ TH_THAIBEGIN = TH_THAIBEGIN_,
+ TH_KOKAI,
+ TH_KHOKHAI,
+ TH_KHOKHUAT,
+ TH_KHOKHWAI,
+ TH_KHOKHON,
+ TH_KHORAKHANG,
+ TH_NGONGU,
+ TH_CHOCHAN,
+ TH_CHOCHING,
+ TH_CHOCHANG,
+ TH_SOSO,
+ TH_CHOCHOE,
+ TH_YOYING,
+ TH_DOCHADA,
+ TH_TOPATAK,
+ TH_THOTHAN,
+ TH_THONANGMONTHO,
+ TH_THOPHUTHAO,
+ TH_NONEN,
+ TH_DODEK,
+ TH_TOTAO,
+ TH_THOTHUNG,
+ TH_THOTHAHAN,
+ TH_THOTHONG,
+ TH_NONU,
+ TH_BOBAIMAI,
+ TH_POPLA,
+ TH_PHOPHUNG,
+ TH_FOFA,
+ TH_PHOPHAN,
+ TH_FOFAN,
+ TH_PHOSAMPHAO,
+ TH_MOMA,
+ TH_YOYAK,
+ TH_RORUA,
+ TH_RU,
+ TH_LOLING,
+ TH_LU,
+ TH_WOWAEN,
+ TH_SOSALA,
+ TH_SORUSI,
+ TH_SOSUA,
+ TH_HOHIP,
+ TH_LOCHULA,
+ TH_OANG,
+ TH_HONOKHUK,
+ TH_PAIYANNOI,
+ TH_SARA_A,
+ TH_MAIHANAKAT,
+ TH_SARA_AA,
+ TH_SARA_AM,
+ TH_SARA_I,
+ TH_SARA_II,
+ TH_SARA_UE,
+ TH_SARA_UEE,
+ TH_SARA_U,
+ TH_SARA_UU,
+ TH_PHINTHU,
+ TH_REM_CHERNG_,
+ TH_TAC_WBRK_,
+ TH_UNDEF_DD,
+ TH_UNDEF_DE,
+ TH_BAHT,
+ TH_SARA_E,
+ TH_SARA_AE,
+ TH_SARA_O,
+ TH_MAIMUAN,
+ TH_MAIMALAI,
+ TH_LAKKHANGYAO,
+ TH_MAIYAMOK,
+ TH_MAITAIKHU,
+ TH_MAIEK,
+ TH_MAITHO,
+ TH_MAITRI,
+ TH_MAICHATTAWA,
+ TH_THANTHAKHAT,
+ TH_NIKHAHIT,
+ TH_YAMAKKAN,
+ TH_FONGMAN,
+ TH_THAIZERO,
+ TH_THAIONE,
+ TH_THAITWO,
+ TH_THAITHREE,
+ TH_THAIFOUR,
+ TH_THAIFIVE,
+ TH_THAISIX,
+ TH_THAISEVEN,
+ TH_THAIEIGHT,
+ TH_THAININE,
+ TH_ANGKHANKHU,
+ TH_KHOMUT,
+ TH_UNDEF_FC,
+ TH_UNDEF_FD,
+ TH_UNDEF_FE,
+ TH_THAIEND
+};
+#endif
diff --git a/intl/lwbrk/tools/anzx4051.html b/intl/lwbrk/tools/anzx4051.html
new file mode 100644
index 0000000000..295f8741e0
--- /dev/null
+++ b/intl/lwbrk/tools/anzx4051.html
@@ -0,0 +1,669 @@
+<!-- This Source Code Form is subject to the terms of the Mozilla Public
+ - License, v. 2.0. If a copy of the MPL was not distributed with this
+ - file, You can obtain one at http://mozilla.org/MPL/2.0/. -->
+
+<HTML>
+<HEAD>
+<TITLE>
+Analysis of JIS X 4051 to Unicode General Category Mapping
+</TITLE>
+</HEAD>
+<BODY>
+<H1>
+Analysis of JIS X 4051 to Unicode General Category Mapping
+</H1>
+<TABLE BORDER=3>
+<TR BGCOLOR=blue><TH><TH>
+<TD BGCOLOR=red>C</TD>
+<TD BGCOLOR=red>L</TD>
+<TD BGCOLOR=red>M</TD>
+<TD BGCOLOR=red>N</TD>
+<TD BGCOLOR=red>P</TD>
+<TD BGCOLOR=red>S</TD>
+<TD BGCOLOR=red>Z</TD>
+<TD BGCOLOR=white>Total</TD>
+<TD BGCOLOR=yellow>Cc</TD>
+<TD BGCOLOR=yellow>Cf</TD>
+<TD BGCOLOR=yellow>Co</TD>
+<TD BGCOLOR=yellow>Cs</TD>
+<TD BGCOLOR=yellow>Ll</TD>
+<TD BGCOLOR=yellow>Lm</TD>
+<TD BGCOLOR=yellow>Lo</TD>
+<TD BGCOLOR=yellow>Lt</TD>
+<TD BGCOLOR=yellow>Lu</TD>
+<TD BGCOLOR=yellow>Mc</TD>
+<TD BGCOLOR=yellow>Me</TD>
+<TD BGCOLOR=yellow>Mn</TD>
+<TD BGCOLOR=yellow>Nd</TD>
+<TD BGCOLOR=yellow>Nl</TD>
+<TD BGCOLOR=yellow>No</TD>
+<TD BGCOLOR=yellow>Pc</TD>
+<TD BGCOLOR=yellow>Pd</TD>
+<TD BGCOLOR=yellow>Pe</TD>
+<TD BGCOLOR=yellow>Pf</TD>
+<TD BGCOLOR=yellow>Pi</TD>
+<TD BGCOLOR=yellow>Po</TD>
+<TD BGCOLOR=yellow>Ps</TD>
+<TD BGCOLOR=yellow>Sc</TD>
+<TD BGCOLOR=yellow>Sk</TD>
+<TD BGCOLOR=yellow>Sm</TD>
+<TD BGCOLOR=yellow>So</TD>
+<TD BGCOLOR=yellow>Zl</TD>
+<TD BGCOLOR=yellow>Zp</TD>
+<TD BGCOLOR=yellow>Zs</TD>
+</TR>
+<TR><TH>00_1<TH>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>14</TD>
+<TD>1</TD>
+<TD></TD>
+<TD BGCOLOR=white>15</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD>2</TD>
+<TD>11</TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>01_[a]<TH>
+<TD></TD>
+<TD>32</TD>
+<TD>2</TD>
+<TD></TD>
+<TD>31</TD>
+<TD>3</TD>
+<TD></TD>
+<TD BGCOLOR=white>68</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>8</TD>
+<TD>24</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>2</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD>12</TD>
+<TD>1</TD>
+<TD></TD>
+<TD>17</TD>
+<TD></TD>
+<TD></TD>
+<TD>2</TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>02_7<TH>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD BGCOLOR=white>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>03_8<TH>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD BGCOLOR=white>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>04_9<TH>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>5</TD>
+<TD></TD>
+<TD></TD>
+<TD BGCOLOR=white>5</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>5</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>05_[b]<TH>
+<TD>33</TD>
+<TD>153</TD>
+<TD></TD>
+<TD>33</TD>
+<TD>2</TD>
+<TD>5</TD>
+<TD>13</TD>
+<TD BGCOLOR=white>239</TD>
+<TD>32</TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>153</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>33</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>2</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>5</TD>
+<TD></TD>
+<TD></TD>
+<TD>13</TD>
+</TR>
+<TR><TH>06_15<TH>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>30</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD BGCOLOR=white>30</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>30</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>07_18<TH>
+<TD>18</TD>
+<TD>157</TD>
+<TD></TD>
+<TD>33</TD>
+<TD>56</TD>
+<TD>125</TD>
+<TD>2</TD>
+<TD BGCOLOR=white>391</TD>
+<TD></TD>
+<TD>18</TD>
+<TD></TD>
+<TD></TD>
+<TD>64</TD>
+<TD>7</TD>
+<TD>5</TD>
+<TD></TD>
+<TD>81</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>3</TD>
+<TD>30</TD>
+<TD>4</TD>
+<TD>5</TD>
+<TD>2</TD>
+<TD></TD>
+<TD>5</TD>
+<TD>36</TD>
+<TD>4</TD>
+<TD></TD>
+<TD>3</TD>
+<TD>24</TD>
+<TD>98</TD>
+<TD>1</TD>
+<TD>1</TD>
+<TD></TD>
+</TR>
+<TR><TH>08_COMPLEX<TH>
+<TD></TD>
+<TD>54</TD>
+<TD>33</TD>
+<TD>20</TD>
+<TD>2</TD>
+<TD>1</TD>
+<TD></TD>
+<TD BGCOLOR=white>110</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD>53</TD>
+<TD></TD>
+<TD></TD>
+<TD>11</TD>
+<TD></TD>
+<TD>22</TD>
+<TD>10</TD>
+<TD></TD>
+<TD>10</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>2</TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>09_[c]<TH>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>3</TD>
+<TD>4</TD>
+<TD></TD>
+<TD BGCOLOR=white>7</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>3</TD>
+<TD>2</TD>
+<TD></TD>
+<TD>2</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>0A_[d]<TH>
+<TD>1</TD>
+<TD>2</TD>
+<TD></TD>
+<TD>6</TD>
+<TD>25</TD>
+<TD>14</TD>
+<TD></TD>
+<TD BGCOLOR=white>48</TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>6</TD>
+<TD></TD>
+<TD></TD>
+<TD>3</TD>
+<TD>3</TD>
+<TD></TD>
+<TD>19</TD>
+<TD></TD>
+<TD>2</TD>
+<TD>3</TD>
+<TD>7</TD>
+<TD>2</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>0B_[e]<TH>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD>1</TD>
+<TD>3</TD>
+<TD BGCOLOR=white>6</TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>3</TD>
+</TR>
+<TR><TH>X<TH>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD BGCOLOR=white>0</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+</TABLE>
+<TABLE BORDER=3>
+<TR BGCOLOR=blue><TH><TH>
+<TD BGCOLOR=red>00_1</TD>
+<TD BGCOLOR=red>01_[a]</TD>
+<TD BGCOLOR=red>02_7</TD>
+<TD BGCOLOR=red>03_8</TD>
+<TD BGCOLOR=red>04_9</TD>
+<TD BGCOLOR=red>05_[b]</TD>
+<TD BGCOLOR=red>06_15</TD>
+<TD BGCOLOR=red>07_18</TD>
+<TD BGCOLOR=red>08_COMPLEX</TD>
+<TD BGCOLOR=red>09_[c]</TD>
+<TD BGCOLOR=red>0A_[d]</TD>
+<TD BGCOLOR=red>0B_[e]</TD>
+<TD BGCOLOR=red>X</TD>
+</TR>
+<TR><TH>00<TH>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>33</TD>
+<TD>10</TD>
+<TD>127</TD>
+<TD></TD>
+<TD>7</TD>
+<TD>44</TD>
+<TD>2</TD>
+<TD></TD>
+</TR>
+<TR><TH>0E<TH>
+<TD>1</TD>
+<TD>6</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>20</TD>
+<TD>1</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>17<TH>
+<TD>2</TD>
+<TD>4</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>110</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>20<TH>
+<TD>2</TD>
+<TD>11</TD>
+<TD>1</TD>
+<TD></TD>
+<TD>5</TD>
+<TD>13</TD>
+<TD></TD>
+<TD>100</TD>
+<TD></TD>
+<TD></TD>
+<TD>4</TD>
+<TD>4</TD>
+<TD></TD>
+</TR>
+<TR><TH>21<TH>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>1</TD>
+<TD></TD>
+<TD>32</TD>
+<TD></TD>
+<TD>163</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+<TR><TH>30<TH>
+<TD>10</TD>
+<TD>47</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD>161</TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+<TD></TD>
+</TR>
+</TABLE>
diff --git a/intl/lwbrk/tools/anzx4051.pl b/intl/lwbrk/tools/anzx4051.pl
new file mode 100644
index 0000000000..e76eac6207
--- /dev/null
+++ b/intl/lwbrk/tools/anzx4051.pl
@@ -0,0 +1,356 @@
+#!/usr/bin/perl
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+######################################################################
+#
+# Initialize global variables
+# (none of these three is referenced again anywhere in this script)
+######################################################################
+%utot = ();
+$ui=0;
+$li=0;
+
+######################################################################
+#
+# Open the unicode database file (all paths are relative to the cwd)
+#
+######################################################################
+open ( UNICODATA , "< ../../unicharutil/tools/UnicodeData-Latest.txt")
+  || die "cannot find UnicodeData-Latest.txt: $!";
+
+######################################################################
+#
+# Open the JIS X 4051 Class file
+#
+######################################################################
+open ( CLASS , "< jisx4051class.txt")
+  || die "cannot find jisx4051class.txt: $!";
+
+######################################################################
+#
+# Open the JIS X 4051 Class simplified mapping
+#
+######################################################################
+open ( SIMP , "< jisx4051simp.txt")
+  || die "cannot find jisx4051simp.txt: $!";
+
+######################################################################
+#
+# Open the HTML report output file
+#
+######################################################################
+open ( OUT , "> anzx4051.html")
+  || die "cannot open output anzx4051.html file: $!";
+
+######################################################################
+#
+# Open the generated C header output file
+#
+######################################################################
+open ( HEADER , "> ../jisx4051class.h")
+  || die "cannot open output ../jisx4051class.h file: $!";
+
+######################################################################
+#
+# Write the license comment and page heading of the HTML report
+#
+######################################################################
+$hthmlheader = <<END_OF_HTML;
+<!-- This Source Code Form is subject to the terms of the Mozilla Public
+ - License, v. 2.0. If a copy of the MPL was not distributed with this
+ - file, You can obtain one at http://mozilla.org/MPL/2.0/. -->
+
+<HTML>
+<HEAD>
+<TITLE>
+Analysis of JIS X 4051 to Unicode General Category Mapping
+</TITLE>
+</HEAD>
+<BODY>
+<H1>
+Analysis of JIS X 4051 to Unicode General Category Mapping
+</H1>
+END_OF_HTML
+print OUT $hthmlheader;
+
+######################################################################
+#
+# Write the license and "do not edit" banner of the generated C header
+#
+######################################################################
+$npl = <<END_OF_NPL;
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+/*
+ DO NOT EDIT THIS DOCUMENT !!! THIS DOCUMENT IS GENERATED BY
+ mozilla/intl/lwbrk/tools/anzx4051.pl
+ */
+END_OF_NPL
+print HEADER $npl;
+
+%occ = ();        # code point (number) => "<simplified class> | <JIS class>"
+%gcat = ();       # 4-digit hex code point => general category
+%dcat = ();       # 4-digit hex code point => first letter of the category
+%simp = ();       # JIS class id => simplified class name
+%gcount = ();     # general category => number of UnicodeData lines
+%dcount = ();     # category first letter => number of UnicodeData lines
+%sccount = ();    # simplified class name => number of jisx4051simp.txt lines
+%rangecount = (); # " HH:<simplified class>" => number of code points
+
+######################################################################
+#
+# Process the unicode database file line by line
+#
+######################################################################
+while(<UNICODATA>) {
+  chomp;  # chomp, not chop: only ever strip the line terminator
+  ######################################################################
+  #
+  # Get value from fields: field 0 is the code point, field 2 the category
+  #
+  ######################################################################
+  @f = split(/;/ , $_);
+  $c = $f[0]; # The unicode value
+  $g = $f[2];
+  $d = substr($g, 0, 1);
+
+  $gcat{$c} = $g;
+  $dcat{$c} = $d;
+  $gcount{$g}++;
+  $dcount{$d}++;
+}
+close(UNICODATA);  # was close(UNIDATA), a handle that is never opened
+
+while(<SIMP>) {
+  chomp;
+  ######################################################################
+  #
+  # Each line is "<JIS class id>;<simplified class name>".  chomp rather than
+  # chop, so a last line without a newline keeps the end of its class name.
+  ######################################################################
+  @f = split(/;/ , $_);
+
+  $simp{$f[0]} = $f[1];
+  $sccount{$f[1]}++;
+}
+close(SIMP);
+
+# Return the Unicode General Category (e.g. "Lu") recorded for code point $u.
+# Code points without their own %gcat entry are classified by the fixed ranges
+# below; anything outside them only produces a warning on STDOUT.
+sub GetClass{
+  my ($u) = @_;
+  my $hex = DecToHex($u);
+  # Lexical on purpose: the script-wide $g is used elsewhere, do not clobber it.
+  my $g = $gcat{$hex};
+  if($g ne "") {
+    return $g;
+  } elsif (( 0x3400 <= $u) && ( $u <= 0x9fa5 ) ) {
+    return "Han";
+  } elsif (( 0xac00 <= $u) && ( $u <= 0xd7a3 ) ) {
+    return "Lo";
+  } elsif (( 0xd800 <= $u) && ( $u <= 0xdfff ) ) {  # all three surrogate ranges
+    return "Cs";
+  } elsif (( 0xe000 <= $u) && ( $u <= 0xf8ff ) ) {
+    return "Co";
+  } else {
+    printf "WARNING !!!! Cannot find General Category for U+%s \n" , $hex;
+  }
+}
+# Return the first letter of the General Category (e.g. "L") recorded for code
+# point $u.  Code points without their own %dcat entry are classified by the
+# fixed ranges below; anything outside them only warns on STDOUT.
+sub GetDClass{
+  my ($u) = @_;
+  my $hex = DecToHex($u);
+  # Lexical on purpose: the script-wide $g is used elsewhere, do not clobber it.
+  my $g = $dcat{$hex};
+  if($g ne "") {
+    return $g;
+  } elsif (( 0x3400 <= $u) && ( $u <= 0x9fa5 ) ) {
+    return "Han";
+  } elsif (( 0xac00 <= $u) && ( $u <= 0xd7a3 ) ) {
+    return "L";
+  } elsif (( 0xd800 <= $u) && ( $u <= 0xdfff ) ) {  # all three surrogate ranges
+    return "C";
+  } elsif (( 0xe000 <= $u) && ( $u <= 0xf8ff ) ) {
+    return "C";
+  } else {
+    printf "WARNING !!!! Cannot find Detailed General Category for U+%s \n" , $hex;
+  }
+}
+# Format a number as upper-case hex, zero-padded to at least four digits.
+sub DecToHex{
+  return sprintf("%04X", $_[0]);
+}
+%gtotal = (); # "<simplified class><general category>" => number of code points
+%dtotal = (); # "<simplified class><category first letter>" => number of code points
+######################################################################
+#
+# Read jisx4051class.txt.  Each line is "first;last;class" with first/last in
+# hex, where an empty "last" means a single code point.  Lines whose class
+# starts with "a" are skipped, and the first line naming a code point wins.
+#
+######################################################################
+while(<CLASS>) {
+  # chomp rather than chop: a last line without a trailing newline must not
+  # lose the final character of its class id.
+  chomp;
+  @f = split(/;/ , $_);
+  next if( substr($f[2], 0, 1) eq "a");
+
+  $sc = $simp{$f[2]};
+  $l = hex($f[0]);
+  $h = ($f[1] eq "") ? $l : hex($f[1]);
+  for($k = $l; $k <= $h ; $k++)
+  {
+    if( exists($occ{$k}))
+    {
+      # An earlier line already classified this code point; keep that one.
+      # printf "WARNING !! Conflicting definition!!! U+%s -> [%s] [%s | %s]\n",
+      # DecToHex($k), $occ{$k} , $f[2] , $sc;
+      next;
+    }
+
+    $occ{$k} = $sc . " | " . $f[2];
+    $gclass = GetClass($k);
+    $dclass = GetDClass($k);
+    $gtotal{$sc . $gclass}++;
+    $dtotal{$sc . $dclass}++;
+
+    # Per-page tally keyed " HH:class", HH being the first two characters of
+    # the code point's hex form.
+    $u = DecToHex($k);
+    $rk = " " . substr($u,0,2) . ":" . $sc;
+    $rangecount{$rk}++;
+  }
+}
+
+#print %gtotal;
+#print %dtotal;
+
+sub printreport   # Write the two summary tables of the HTML report to OUT.
+{
+ print OUT "<TABLE BORDER=3>\n";
+ print OUT "<TR BGCOLOR=blue><TH><TH>\n";
+ # Table 1 heading: category first letters, "Total", then full categories.
+ foreach $d (sort(keys %dcount)) {
+ print OUT "<TD BGCOLOR=red>$d</TD>\n";
+ }
+
+ print OUT "<TD BGCOLOR=white>Total</TD>\n";
+ foreach $g (sort(keys %gcount)) {
+ print OUT "<TD BGCOLOR=yellow>$g</TD>\n";
+ }
+ print OUT "</TR>\n";
+ foreach $sc (sort(keys %sccount)) {
+ # One row per simplified class, in sorted order.
+ print OUT "<TR><TH>$sc<TH>\n";
+
+ $total = 0;
+ foreach $d (sort (keys %dcount)) {
+ $count = $dtotal{$sc . $d};
+ $total += $count;
+ print OUT "<TD>$count</TD>\n";
+ }
+ # $total is the sum over the first-letter columns of this row.
+ print OUT "<TD BGCOLOR=white>$total</TD>\n";
+
+ foreach $g (sort(keys %gcount)) {
+ $count = $gtotal{$sc . $g};
+ print OUT "<TD>$count</TD>\n";
+ }
+
+
+ print OUT "</TR>\n";
+ }
+ print OUT "</TABLE>\n";
+
+ # Table 2: one column per simplified class, one row per 256-code-point page.
+ print OUT "<TABLE BORDER=3>\n";
+ print OUT "<TR BGCOLOR=blue><TH><TH>\n";
+
+ foreach $sc (sort(keys %sccount))
+ {
+ print OUT "<TD BGCOLOR=red>$sc</TD>\n";
+ }
+
+ print OUT "</TR>\n";
+
+ # Only pages 0x00 through 0x4E are considered.
+ for($rr = 0; $rr < 0x4f; $rr++)
+ {
+ $empty = 0;
+ $r = sprintf("%02X" , $rr) ;
+ $tmp = "<TR><TH>" . $r . "<TH>\n";
+ # Keys here must match the " HH:class" form used to fill %rangecount.
+ foreach $sc (sort(keys %sccount)) {
+ $count = $rangecount{ " " .$r . ":" .$sc};
+ $tmp .= sprintf("<TD>%s</TD>\n", $count);
+ $empty += $count;
+ }
+
+ $tmp .= "</TR>\n";
+ # Despite its name, $empty is the row total; all-zero rows are dropped.
+ if($empty ne 0)
+ {
+ print OUT $tmp;
+ }
+ }
+ print OUT "</TABLE>\n";
+
+}
+printreport();
+
+# Emit one 256-code-point page of the class table into the generated header.
+#   first argument  - two hex digits naming the page (code points HH00..HHFF)
+#   second argument - hex digit used for code points that have no %occ entry
+# Each uint32_t literal packs eight code points, one hex digit apiece, with the
+# lowest code point in the least significant digit.
+sub printarray
+{
+  my($page, $fallback) = @_;
+  printf "[%s || %s]\n", $page, $fallback;   # progress trace on STDOUT
+  my $base = hex($page) * 256;
+  printf HEADER "static const uint32_t gLBClass%s[32] = {\n", $page;
+  foreach my $word (0 .. 31)
+  {
+    my $first = $base + $word * 8;
+    # Walk the eight code points from highest to lowest so that the lowest
+    # one lands in the last hex digit of the literal.
+    my $digits = "";
+    foreach my $cp (reverse($first .. $first + 7))
+    {
+      # The digit is the second character of the simplified class name that
+      # starts the %occ entry (e.g. "0A_[d] | 23" yields "A").
+      $digits .= exists($occ{$cp}) ? substr($occ{$cp}, 1, 1) : $fallback;
+    }
+    # Same bytes as the original per-digit printf calls, written in one go.
+    printf HEADER "0x%s", $digits;
+    printf HEADER ", // U+%04X - U+%04X\n", $first, $first + 7;
+  }
+  print HEADER "};\n\n";
+}
+printarray("00", "7"); # page U+00xx; code points without a class get digit 7
+printarray("20", "7"); # page U+20xx
+printarray("21", "7"); # page U+21xx
+printarray("30", "5"); # page U+30xx; default digit 5
+printarray("0E", "8"); # page U+0Exx; default digit 8
+printarray("17", "7"); # page U+17xx
+
+#print %rangecount;
+
+######################################################################
+#
+# Close the remaining files (SIMP was closed right after it was read)
+#
+######################################################################
+close(HEADER);
+close(CLASS);
+close(OUT);
+
diff --git a/intl/lwbrk/tools/jisx4051class.txt b/intl/lwbrk/tools/jisx4051class.txt
new file mode 100644
index 0000000000..c435c1ae55
--- /dev/null
+++ b/intl/lwbrk/tools/jisx4051class.txt
@@ -0,0 +1,159 @@
+0000;001f;17
+0020;;17
+0024;;24
+0027;;18
+0028;;22
+002D;;18
+002F;;18
+0021;002F;23
+0030;0039;15
+003C;;22
+003A;003F;23
+0040;;18
+0041;005A;18
+005B;;22
+005E;;18
+005F;;18
+005B;005F;23
+0060;;18
+0061;007A;18
+007B;;22
+007B;007E;23
+00A0;;24
+00A3;;22
+00A5;;22
+00A9;;18
+00AA;;18
+00AB;;18
+00AC;;22
+00AE;;18
+00AF;;18
+00A1;00BF;23
+00B0;;18
+00F7;;23
+00C0;00FF;18
+0E3F;;1
+0E2F;;4
+0E46;;4
+0E5A;0E5B;4
+0E50;0E59;15
+0E4F;;18
+0EAF;;4
+0EC6;;4
+0ED0;0ED9;15
+1735;1736;1
+17D4;17D5;4
+17D8;;4
+17DA;;4
+1780;17DD;21
+17E0;17E9;21
+17F0;17F9;21
+2007;;24
+2000;200B;17
+200C;200F;18
+2010;;18
+2011;;24
+2012;2013;18
+2014;;7
+2015;;18
+2016;2017;18
+2019;;23
+201D;;23
+2018;201F;18
+2020;2023;18
+2024;2026;2
+2027;;23
+2028;202E;18
+202F;;24
+2030;2034;9
+2035;2038;18
+2039;;1
+203A;;2
+203B;;12
+203C;203D;3
+203E;;23
+203F;2043;18
+2044;;3
+2045;;1
+2046;;2
+2047;2049;3
+204A;205E;18
+205F;;17
+2060;;24
+2061;2063;18
+206A;206F;18
+2070;2071;18
+2074;208E;18
+2090;2094;18
+2116;;8
+2160;217F;12
+2190;21EA;a12
+2126;;18
+2100;2138;18
+2153;2182;18
+2190;21EA;18
+3008;;1
+300A;;1
+300C;;1
+300E;;1
+3010;;1
+3014;;1
+3016;;1
+3018;;1
+301A;;1
+301D;;1
+3001;;2
+3009;;2
+300B;;2
+300D;;2
+300F;;2
+3011;;2
+3015;;2
+3017;;2
+3019;;2
+301B;;2
+301E;;2
+301F;;2
+3005;;3
+301C;;3
+3041;;3
+3043;;3
+3045;;3
+3047;;3
+3049;;3
+3063;;3
+3083;;3
+3085;;3
+3087;;3
+308E;;3
+309D;;3
+309E;;3
+30A1;;3
+30A3;;3
+30A5;;3
+30A7;;3
+30A9;;3
+30C3;;3
+30E3;;3
+30E5;;3
+30E7;;3
+30EE;;3
+30F5;;3
+30F6;;3
+30FC;;3
+30FD;;3
+30FE;;3
+30FB;;5
+3002;;6
+3000;;10
+3042;3094;11
+3099;309E;3
+3003;;12
+3004;;12
+3006;;12
+3007;;12
+3012;;12
+3013;;12
+3020;;12
+3036;;12
+30A2;30FA;12
diff --git a/intl/lwbrk/tools/jisx4051simp.txt b/intl/lwbrk/tools/jisx4051simp.txt
new file mode 100644
index 0000000000..e12a7fd805
--- /dev/null
+++ b/intl/lwbrk/tools/jisx4051simp.txt
@@ -0,0 +1,24 @@
+1;00_1
+2;01_[a]
+3;01_[a]
+4;01_[a]
+5;01_[a]
+6;01_[a]
+7;02_7
+8;03_8
+9;04_9
+10;05_[b]
+11;05_[b]
+12;05_[b]
+13;X
+14;X
+15;06_15
+16;X
+17;05_[b]
+18;07_18
+19;X
+20;X
+21;08_COMPLEX
+22;09_[c]
+23;0A_[d]
+24;0B_[e]
diff --git a/intl/lwbrk/tools/spec_table.html b/intl/lwbrk/tools/spec_table.html
new file mode 100644
index 0000000000..519f98c534
--- /dev/null
+++ b/intl/lwbrk/tools/spec_table.html
@@ -0,0 +1,127 @@
+<!-- This Source Code Form is subject to the terms of the Mozilla Public
+ - License, v. 2.0. If a copy of the MPL was not distributed with this
+ - file, You can obtain one at http://mozilla.org/MPL/2.0/. -->
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+<title></title>
+<style type="text/css">
+table {
+ border: solid 1px;
+ border-collapse: collapse;
+}
+tbody, tfoot {
+ border-top: solid 2px;
+}
+td, th {
+ border: solid 1px;
+}
+td {
+ text-align: center;
+}
+</style>
+</head>
+<body>
+<p>This is a specification table for line breaking.</p>
+<p>The values of IE7 and Opera9: 'A' means that the line is breakable After the character, and 'B' means Before. 'BA' means Before and After.</p>
+<p>The (C) suffix on the IE7 and Opera9 column headings means Character, and (N) means Numeric.
+It tells what surrounded the tested character: e.g., "a$a" is a testcase for (C) and "0$0" is a testcase for (N).</p>
+<p>Gecko does not break lines in most Western-language contexts. But in file paths, URLs and very long words joined by hyphens,
+some characters might be breakable. They are marked 'breakable' in the table. However, they are not always breakable;
+it <em>depends on the context</em> in the word.</p>
+<table border="1">
+<thead>
+<tr><th colspan="2">character</th><th>Gecko</th><th>IE7(C)</th><th>IE7(N)</th><th>Opera9.2(C)</th><th>Opera9.2(N)</th></tr>
+</thead>
+<tfoot>
+<tr><th colspan="2">character</th><th>Gecko</th><th>IE7(C)</th><th>IE7(N)</th><th>Opera9.2(C)</th><th>Opera9.2(N)</th></tr>
+</tfoot>
+<tbody>
+<tr><th>0x21</th><th>&#x21;</th><td></td><td>A</td><td>A</td><td></td><td></td></tr>
+<tr><th>0x22</th><th>&#x22;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x23</th><th>&#x23;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x24</th><th>&#x24;</th><td></td><td></td><td>B</td><td></td><td></td></tr>
+<tr><th>0x25</th><th>&#x25;</th><td>breakable</td><td>A</td><td>A</td><td></td><td></td></tr>
+<tr><th>0x26</th><th>&#x26;</th><td>breakable</td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x27</th><th>&#x27;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x28</th><th>&#x28;</th><td></td><td>B</td><td>B</td><td></td><td></td></tr>
+<tr><th>0x29</th><th>&#x29;</th><td></td><td>A</td><td>A</td><td></td><td></td></tr>
+<tr><th>0x2A</th><th>&#x2A;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x2B</th><th>&#x2B;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x2C</th><th>&#x2C;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x2D</th><th>&#x2D;</th><td>breakable</td><td>BA</td><td>BA</td><td>A</td><td>A</td></tr>
+<tr><th>0x2E</th><th>&#x2E;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x2F</th><th>&#x2F;</th><td>breakable</td><td></td><td></td><td>A</td><td>A</td></tr>
+</tbody>
+<tbody>
+<tr><th>0x3A</th><th>&#x3A;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x3B</th><th>&#x3B;</th><td>breakable</td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x3C</th><th>&#x3C;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x3D</th><th>&#x3D;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x3E</th><th>&#x3E;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x3F</th><th>&#x3F;</th><td></td><td>A</td><td>A</td><td></td><td></td></tr>
+</tbody>
+<tbody>
+<tr><th>0x40</th><th>&#x40;</th><td></td><td></td><td></td><td></td><td></td></tr>
+</tbody>
+<tbody>
+<tr><th>0x5B</th><th>&#x5B;</th><td></td><td>B</td><td>B</td><td></td><td></td></tr>
+<tr><th>0x5C</th><th>&#x5C;</th><td>breakable</td><td></td><td>B</td><td></td><td></td></tr>
+<tr><th>0x5D</th><th>&#x5D;</th><td></td><td>A</td><td>A</td><td></td><td></td></tr>
+<tr><th>0x5E</th><th>&#x5E;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0x5F</th><th>&#x5F;</th><td></td><td></td><td></td><td></td><td></td></tr>
+</tbody>
+<tbody>
+<tr><th>0x60</th><th>&#x60;</th><td></td><td></td><td></td><td></td><td></td></tr>
+</tbody>
+<tbody>
+<tr><th>0x7B</th><th>&#x7B;</th><td></td><td>B</td><td>B</td><td></td><td></td></tr>
+<tr><th>0x7C</th><th>&#x7C;</th><td></td><td></td><td></td><td>A</td><td>A</td></tr>
+<tr><th>0x7D</th><th>&#x7D;</th><td></td><td>A</td><td>A</td><td></td><td></td></tr>
+<tr><th>0x7E</th><th>&#x7E;</th><td></td><td></td><td></td><td></td><td></td></tr>
+</tbody>
+<tbody>
+<tr><th>0xA1</th><th>&#xA1;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xA2</th><th>&#xA2;</th><td></td><td>A</td><td>A</td><td></td><td></td></tr>
+<tr><th>0xA3</th><th>&#xA3;</th><td></td><td></td><td>B</td><td></td><td></td></tr>
+<tr><th>0xA4</th><th>&#xA4;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xA5</th><th>&#xA5;</th><td></td><td></td><td>B</td><td></td><td></td></tr>
+<tr><th>0xA6</th><th>&#xA6;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xA7</th><th>&#xA7;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xA8</th><th>&#xA8;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xA9</th><th>&#xA9;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xAA</th><th>&#xAA;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xAB</th><th>&#xAB;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xAC</th><th>&#xAC;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xAE</th><th>&#xAE;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xAF</th><th>&#xAF;</th><td></td><td></td><td></td><td></td><td></td></tr>
+</tbody>
+<tbody>
+<tr><th>0xB0</th><th>&#xB0;</th><td></td><td>A</td><td>A</td><td></td><td></td></tr>
+<tr><th>0xB1</th><th>&#xB1;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xB2</th><th>&#xB2;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xB3</th><th>&#xB3;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xB4</th><th>&#xB4;</th><td></td><td></td><td></td><td>B</td><td>B</td></tr>
+<tr><th>0xB5</th><th>&#xB5;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xB6</th><th>&#xB6;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xB7</th><th>&#xB7;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xB8</th><th>&#xB8;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xB9</th><th>&#xB9;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xBA</th><th>&#xBA;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xBB</th><th>&#xBB;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xBC</th><th>&#xBC;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xBD</th><th>&#xBD;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xBE</th><th>&#xBE;</th><td></td><td></td><td></td><td></td><td></td></tr>
+<tr><th>0xBF</th><th>&#xBF;</th><td></td><td></td><td></td><td></td><td></td></tr>
+</tbody>
+<tbody>
+<tr><th>0xD7</th><th>&#xD7;</th><td></td><td></td><td></td><td></td><td></td></tr>
+</tbody>
+<tbody>
+<tr><th>0xF7</th><th>&#xF7;</th><td></td><td></td><td></td><td></td><td></td></tr>
+</tbody>
+</table>
+</body>
+</html>