12 files changed, 5142 insertions, 51 deletions
diff --git a/debian/changelog b/debian/changelog
index 4b9ea86628..8a60279882 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,26 @@
+libreoffice (4:24.2.3-2) unstable; urgency=medium
+
+  * debian/patches/use-PyConfig.diff: backport from master;
+    fix build with python 3.13
+  * debian/patches/reviewed-breakIterator-customizations.diff: backport
+    from master; adds tests for the expected behaviour (also needed for the
+    following patch to apply)
+  * debian/patches/breakiterator-updates.diff: backport from master; re-bases
+    the BreakIterator rule customizations on top of a clean copy of the ICU
+    74.2 rules. Fixes build with ICU 74 and 75
+  * debian/patches/icu-74.1.diff: new unicode stuff for ICU 74+, for
+    completeness
+
+  * debian/rules:
+    - don't lose -qt5 if PLASMA_VERSION=6 as we might need it for -kf5
+      still; don't make that depend on the plasma version
+    - fix install if PLASMA_VERSION=X but there's no kdeXbe (yet)
+    - recommend kio >> 5.115.0-5 in -kf5
+  * debian/rules, debian/control.plasma.in:
+    - prepare -plasma to say Kf6 (add @PLASMA_KF_VERSION@)
+
+ -- Rene Engelhard <rene@debian.org>  Wed, 22 May 2024 16:18:58 +0000
+
 libreoffice (4:24.2.3-1) unstable; urgency=medium
 
   * LibreOffice 24.2.3 final release (identical to rc2)
@@ -32,6 +55,8 @@ libreoffice (4:24.2.3~rc1-2) unstable; urgency=medium
 libreoffice (4:24.2.3~rc1-1) experimental; urgency=medium
 
   * New upstream release candidate
+    - fixes CVE-2024-3044 ("Graphic on-click binding allows unchecked script
+      execution")
 
   * debian/rules:
     - more target fixes to make Rules-Requires-Root: no work
diff --git a/debian/control b/debian/control
index 353f3f00c3..1724e33f68 100644
--- a/debian/control
+++ b/debian/control
@@ -167,14 +167,14 @@ Build-Depends-Arch: at-spi2-core [!alpha !armel !hppa !i386 !ia64 !kfreebsd-amd6
                     dbus-x11 [!alpha !armel !hppa !i386 !ia64 !kfreebsd-amd64 !kfreebsd-i386 !loong64 !m68k !mips !mipsel !mips64 !mips64el !powerpc !powerpcspe !ppc64 !ppc64el !riscv64 !s390x !sparc !sparc64] <!nocheck>,
                     firebird-dev [!m68k],
                     firebird3.0-server-core [!m68k] <!nocheck>,
-                    fontconfig [amd64 arm64 armhf i386 ppc64el s390x] <!nocheck>,
-                    fonts-crosextra-caladea [amd64 arm64 armhf i386 ppc64el s390x] <!nocheck>,
-                    fonts-crosextra-carlito (>= 20230309) [amd64 arm64 armhf i386 ppc64el s390x] <!nocheck>,
-                    fonts-dejavu [amd64 arm64 armhf i386 ppc64el s390x] <!nocheck>,
-                    fonts-hosny-amiri [amd64 arm64 armhf i386 ppc64el s390x] <!nocheck>,
-                    fonts-liberation (>= 1:2) [amd64 arm64 armhf i386 ppc64el s390x] <!nocheck>,
-                    fonts-linuxlibertine [amd64 arm64 armhf i386 ppc64el s390x] <!nocheck>,
-                    fonts-noto-core [amd64 arm64 armhf i386 ppc64el s390x] <!nocheck>,
+                    fontconfig [amd64 arm64 armhf armel i386 ppc64el s390x riscv64] <!nocheck>,
+                    fonts-crosextra-caladea [amd64 arm64 armhf armel i386 ppc64el s390x riscv64] <!nocheck>,
+                    fonts-crosextra-carlito (>= 20230309) [amd64 arm64 armhf armel i386 ppc64el s390x riscv64] <!nocheck>,
+                    fonts-dejavu [amd64 arm64 armhf armel i386 ppc64el s390x riscv64] <!nocheck>,
+                    fonts-hosny-amiri [amd64 arm64 armhf armel i386 ppc64el s390x riscv64] <!nocheck>,
+                    fonts-liberation (>= 1:2) [amd64 arm64 armhf armel i386 ppc64el s390x riscv64] <!nocheck>,
+                    fonts-linuxlibertine [amd64 arm64 armhf armel i386 ppc64el s390x riscv64] <!nocheck>,
+                    fonts-noto-core [amd64 arm64 armhf armel i386 ppc64el s390x riscv64] <!nocheck>,
                     fonts-opensymbol <!pkg.libreoffice.opensymbolbuild>,
                     gdb [!alpha !armel !hppa !i386 !ia64 !kfreebsd-amd64 !kfreebsd-i386 !loong64 !m68k !mips !mipsel !mips64 !mips64el !powerpc !powerpcspe !ppc64 !ppc64el !riscv64 !s390x !sparc !sparc64] <!nocheck>,
                     ghostscript [!alpha !armel !hppa !i386 !ia64 !kfreebsd-amd64 !kfreebsd-i386 !loong64 !m68k !mips !mipsel !mips64 !mips64el !powerpc !powerpcspe !ppc64 !ppc64el !riscv64 !s390x !sparc !sparc64] <!nocheck>,
@@ -5262,8 +5262,8 @@ Description: office productivity suite -- GTK+ 3 integration
  LibreOffice is a full-featured office productivity suite that provides
  a near drop-in replacement for Microsoft(R) Office.
  .
- This package contains the Gtk plugin for drawing LibreOffices widgets
- with Gtk+ 3 and Gtk/GNOMEish print dialog when running under GNOME.
+ This package contains the GTK+ plugin for drawing LibreOffices widgets
+ with GTK+ 3 and GTK+/GNOMEish print dialog when running under GNOME.
 
 Package: gir1.2-lokdocview-0.1
 Architecture: alpha amd64 arm64 armel armhf hppa i386 ia64 kfreebsd-amd64 kfreebsd-i386 loong64 m68k mips mipsel mips64 mips64el powerpc powerpcspe ppc64 ppc64el riscv64 s390x sparc sparc64
@@ -5272,8 +5272,8 @@ Build-Profiles: <!nogir>
 Depends: liblibreofficekitgtk (= ${binary:Version}),
          ${gir:Depends},
          ${misc:Depends}
-Description: GTK3 widget wrapping LibreOffice functionality - introspection
- LOKDocView is the GTK3 widget that wraps the libreoffice functionality and
+Description: GTK+ 3 widget wrapping LibreOffice functionality - introspection
+ LOKDocView is the GTK+ 3 widget that wraps the libreoffice functionality and
  exposes a simple API for applications to use this widget.
  .
  This package contains the Introspection data.
@@ -5285,7 +5285,7 @@ Depends: libreofficekit-data, ${misc:Depends}, ${shlibs:Depends}
 Replaces: libreoffice-gtk3 (<< 1:5.2.0~)
 Breaks: libreoffice-gtk3 (<< 1:5.2.0~)
 Recommends: libreoffice-gtk3
-Description: GTK3 widget wrapping LibreOffice functionality
+Description: GTK+ 3 widget wrapping LibreOffice functionality
  This package contains a (basic) GTK+ document viewer widget (used
  by e.g. LOKDocView)
 
@@ -5295,7 +5295,7 @@ Section: web
 Depends: ${misc:Depends}
 Recommends: gir1.2-lokdocview-0.1
 Description: common data for LOKDocView
- LOKDocView is the GTK3 widget that wraps the libreoffice functionality and
+ LOKDocView is the GTK+ 3 widget that wraps the libreoffice functionality and
  exposes a simple API for applications to use this widget.
  .
  This package contains architecture-independent data (e.g. the selection
@@ -5310,12 +5310,12 @@ Suggests: libreofficekit-data
 Replaces: libreoffice-core (<< 4:7.6.0~rc2)
 Section: gnome
 Enhances: libreoffice
-Description: office productivity suite -- GTK+ 4 integration
+Description: office productivity suite -- GTK 4 integration
  LibreOffice is a full-featured office productivity suite that provides
  a near drop-in replacement for Microsoft(R) Office.
  .
- This package contains the Gtk plugin for drawing LibreOffices widgets
- with Gtk+ 4 and Gtk/GNOMEish print dialog.
+ This package contains the GTK plugin for drawing LibreOffices widgets
+ with GTK 4 and GTK/GNOMEish print dialog.
  .
  You need to enable it manually by export SAL_USE_VCLPLUGIN=gtk4.
 
@@ -5325,7 +5325,7 @@ Depends: libreoffice-core (= ${binary:Version}),
          ${kf5-qt5-depends},
          ${misc:Depends},
          ${shlibs:Depends}
-Recommends: ${plasma-iconset-dep}
+Recommends: kio (>> 5.115.0-5), ${plasma-iconset-dep}
 Replaces: libreoffice-kde (<< 1:6.1.0~alpha1-1)
 Section: kde
 Enhances: libreoffice
diff --git a/debian/control.gtk3.in b/debian/control.gtk3.in
index 847adbe88b..ad842c783a 100644
--- a/debian/control.gtk3.in
+++ b/debian/control.gtk3.in
@@ -11,16 +11,16 @@ Description: office productivity suite -- GTK+ 3 integration
  LibreOffice is a full-featured office productivity suite that provides
  a near drop-in replacement for Microsoft(R) Office.
  .
- This package contains the Gtk plugin for drawing LibreOffices widgets
- with Gtk+ 3 and Gtk/GNOMEish print dialog when running under GNOME.
+ This package contains the GTK+ plugin for drawing LibreOffices widgets
+ with GTK+ 3 and GTK+/GNOMEish print dialog when running under GNOME.
 
 Package: gir1.2-lokdocview-0.1
 Architecture: %OOO_ARCHS%
 Section: introspection
 Build-Profiles: <!nogir>
 Depends: ${gir:Depends}, ${misc:Depends}, liblibreofficekitgtk (= ${binary:Version})
-Description: GTK3 widget wrapping LibreOffice functionality - introspection
- LOKDocView is the GTK3 widget that wraps the libreoffice functionality and
+Description: GTK+ 3 widget wrapping LibreOffice functionality - introspection
+ LOKDocView is the GTK+ 3 widget that wraps the libreoffice functionality and
  exposes a simple API for applications to use this widget.
  .
  This package contains the Introspection data.
@@ -32,7 +32,7 @@ Depends: ${shlibs:Depends}, ${misc:Depends}, libreofficekit-data
 Replaces: libreoffice-gtk3 (<< 1:5.2.0~)
 Breaks: libreoffice-gtk3 (<< 1:5.2.0~)
 Recommends: libreoffice-gtk3
-Description: GTK3 widget wrapping LibreOffice functionality
+Description: GTK+ 3 widget wrapping LibreOffice functionality
  This package contains a (basic) GTK+ document viewer widget (used
  by e.g. LOKDocView)
 
@@ -42,7 +42,7 @@ Section: web
 Depends: ${misc:Depends}
 Recommends: gir1.2-lokdocview-0.1
 Description: common data for LOKDocView
- LOKDocView is the GTK3 widget that wraps the libreoffice functionality and
+ LOKDocView is the GTK+ 3 widget that wraps the libreoffice functionality and
  exposes a simple API for applications to use this widget.
  .
  This package contains architecture-independent data (e.g. the selection
diff --git a/debian/control.gtk4.in b/debian/control.gtk4.in
index 56e3fcb4e8..2838b83826 100644
--- a/debian/control.gtk4.in
+++ b/debian/control.gtk4.in
@@ -7,12 +7,12 @@ Suggests: libreofficekit-data
 Replaces: libreoffice-core (<< 4:7.6.0~rc2)
 Section: gnome
 Enhances: libreoffice
-Description: office productivity suite -- GTK+ 4 integration
+Description: office productivity suite -- GTK 4 integration
  LibreOffice is a full-featured office productivity suite that provides
  a near drop-in replacement for Microsoft(R) Office.
  .
- This package contains the Gtk plugin for drawing LibreOffices widgets
- with Gtk+ 4 and Gtk/GNOMEish print dialog.
+ This package contains the GTK plugin for drawing LibreOffices widgets
+ with GTK 4 and GTK/GNOMEish print dialog.
  .
  You need to enable it manually by export SAL_USE_VCLPLUGIN=gtk4.
 
diff --git a/debian/control.plasma.in b/debian/control.plasma.in
index ced72a9c35..bcd2c8dd73 100644
--- a/debian/control.plasma.in
+++ b/debian/control.plasma.in
@@ -10,5 +10,5 @@ Description: office productivity suite -- some Plasma integration
  a near drop-in replacement for Microsoft(R) Office.
  .
  This package contains some minor Plasma integration (like AppData
- and "Create New..." integration) and a KDE/KF5 configuration backend.
+ and "Create New..." integration) and a KDE/KF@PLASMA_KF_VERSION@ configuration backend.
 
diff --git a/debian/patches/breakiterator-updates.diff b/debian/patches/breakiterator-updates.diff
new file mode 100644
index 0000000000..8ac9cdbb5d
--- /dev/null
+++ b/debian/patches/breakiterator-updates.diff
@@ -0,0 +1,3620 @@
+From 5b688b03a916a0f6127c7aba891bf613cff0de0b Mon Sep 17 00:00:00 2001
+From: Jonathan Clark <jonathan@libreoffice.org>
+Date: Wed, 17 Apr 2024 09:09:50 -0600
+Subject: [PATCH] tdf#49885 BreakIterator rule upgrades
+
+This change re-bases the BreakIterator rule customizations on top of a
+clean copy of the ICU 74.2 rules.
+
+Change-Id: Iadcf16cab138cc6c869fac61ad64e996e65b5ae4
+---
+ i18npool/CustomTarget_breakiterator.mk        |   6 +-
+ i18npool/qa/cppunit/test_breakiterator.cxx    | 356 +++++----
+ .../source/breakiterator/data/dict_word.txt   | 267 ++++---
+ .../breakiterator/data/dict_word_he.txt       | 139 ----
+ .../breakiterator/data/dict_word_hu.txt       | 324 +++++----
+ .../breakiterator/data/dict_word_nodash.txt   | 147 ----
+ .../data/dict_word_prepostdash.txt            | 288 +++++---
+ .../source/breakiterator/data/edit_word.txt   | 261 ++++---
+ .../breakiterator/data/edit_word_he.txt       | 142 ----
+ .../breakiterator/data/edit_word_hu.txt       | 294 +++++---
+ i18npool/source/breakiterator/data/line.txt   | 680 ++++++------------
+ i18npool/source/breakiterator/data/sent.txt   | 128 ----
+ 12 files changed, 1307 insertions(+), 1725 deletions(-)
+ delete mode 100644 i18npool/source/breakiterator/data/dict_word_he.txt
+ delete mode 100644 i18npool/source/breakiterator/data/dict_word_nodash.txt
+ delete mode 100644 i18npool/source/breakiterator/data/edit_word_he.txt
+ delete mode 100644 i18npool/source/breakiterator/data/sent.txt
+
+diff --git a/i18npool/CustomTarget_breakiterator.mk b/i18npool/CustomTarget_breakiterator.mk
+index 8229a5e8f314..ef951142837a 100644
+--- a/i18npool/CustomTarget_breakiterator.mk
++++ b/i18npool/CustomTarget_breakiterator.mk
+@@ -45,16 +45,12 @@ endif
+ 
+ i18npool_BRKTXTS := \
+     count_word.brk \
+-    $(call gb_Helper_optional_locale,he,dict_word_he.brk) \
+     $(call gb_Helper_optional_locale,hu,dict_word_hu.brk) \
+-    dict_word_nodash.brk \
+     dict_word_prepostdash.brk \
+     dict_word.brk \
+-    $(call gb_Helper_optional_locale,he,edit_word_he.brk) \
+     $(call gb_Helper_optional_locale,hu,edit_word_hu.brk) \
+     edit_word.brk \
+-    line.brk \
+-    sent.brk
++    line.brk
+ 
+ # 'gencmn', 'genbrk' and 'genccode' are tools generated and delivered by icu project to process icu breakiterator rules.
+ # The output of gencmn generates warnings under Windows. We want to minimize the patches to external tools,
+diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx
+index b33466bee46d..2a35b2eee58f 100644
+--- a/i18npool/qa/cppunit/test_breakiterator.cxx
++++ b/i18npool/qa/cppunit/test_breakiterator.cxx
+@@ -184,11 +184,10 @@ void TestBreakIterator::testLineBreaking()
+ 
+         {
+             // Per the bug, the line break should leave -bar clumped together on the next line.
+-            // However, this change was reverted at some point. This test asserts the new behavior.
+             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
+                 "foo -bar", strlen("foo -ba"), aLocale, 0, aHyphOptions, aUserOptions);
+             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first dash",
+-                                         static_cast<sal_Int32>(5), aResult.breakIndex);
++                                         static_cast<sal_Int32>(4), aResult.breakIndex);
+         }
+     }
+ 
+@@ -198,11 +197,29 @@ void TestBreakIterator::testLineBreaking()
+         aLocale.Country = "US";
+ 
+         {
+-            // Here we want the line break to leave C:\Program Files\ on the first line
++            // Note that the current behavior deviates from the original fix for this bug.
++            //
++            // The original report was filed due to wrapping all of "\Program Files\aaaa" to the
++            // next line, even though only "aaaa" overflowed. The original fix was to simply make
++            // U+005C reverse solidus (backslash) a breaking character.
++            //
++            // However, the root cause for this bug was not the behavior of '\', but rather some
++            // other bug making all of "\Program Files\" behave like a single token, despite it
++            // even containing whitespace.
++            //
++            // Reverting to the ICU line rules fixes this root issue. Now, in the following,
++            // "C:\Program" and "Files\LibreOffice" are treated as separate tokens. This is also
++            // consistent with the behavior of other office programs.
+             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
+                 "C:\\Program Files\\LibreOffice", strlen("C:\\Program Files\\Libre"), aLocale, 0,
+                 aHyphOptions, aUserOptions);
+-            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
++            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(11), aResult.breakIndex);
++
++            // An identical result should be generated for solidus.
++            aResult = m_xBreak->getLineBreak(
++                "C:/Program Files/LibreOffice", strlen("C:/Program Files/Libre"), aLocale, 0,
++                aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(11), aResult.breakIndex);
+         }
+     }
+ 
+@@ -251,23 +268,125 @@ void TestBreakIterator::testLineBreaking()
+         aLocale.Country = "US";
+ 
+         {
++            // The root cause for this bug was the Unicode standard introducing special treatment
++            // for '-' in a number range context. This change makes number ranges (e.g. "100-199")
++            // behave as if they are single tokens for the purposes of line breaking. Unfortunately,
++            // this caused a significant appearance change to existing documents.
++            //
++            // Despite being a user-visible layout change, this isn't exactly a bug. Wrapping
++            // number ranges as a single token is consistent with other applications, including web
++            // browsers, and other office suites as mentioned in the bug discussion. Removing this
++            // customization seems like it would be a major change, however.
++            //
+             // Here we want the line break to leave 100- clumped on the first line.
++
+             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
+                 "word 100-199 word", strlen("word 100-1"), aLocale, 0, aHyphOptions, aUserOptions);
+-            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(9), aResult.breakIndex);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32{9}, aResult.breakIndex);
++        }
++
++        {
++            // From the same bug: "the leading minus must stay with numbers and strings"
++
++            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
++                    "range of -100.000 to 100.000", strlen("range of -1"), aLocale, 0,
++                    aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32{9}, aResult.breakIndex);
++
++            constexpr OUString str = u"range of \u2212100.000 to 100.000"_ustr;
++            aResult = m_xBreak->getLineBreak(
++                    str, strlen("range of -"), aLocale, 0, aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32{9}, aResult.breakIndex);
+         }
+-    }
+ 
+-    // i#83649: Line break should be between typographical quote and left bracket
+-    {
+         aLocale.Language = "de";
+         aLocale.Country = "DE";
+ 
+         {
+-            // Here we want the line break to leave »angetan werden« on the first line
++            // From the same bug: "the leading minus must stay with numbers and strings"
++
++            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
++                    "EURO is -10,50", strlen("EURO is -1"), aLocale, 0, aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32{8}, aResult.breakIndex);
++
++            // Also the mathematical minus sign:
++
++            constexpr OUString str = u"EURO is \u221210,50"_ustr;
++            aResult = m_xBreak->getLineBreak(
++                    str, strlen("EURO is -"), aLocale, 0, aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32{8}, aResult.breakIndex);
++        }
++
++        {
++            // From the same bug: "the leading minus must stay with numbers and strings"
++
++            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
++                    "und -kosten", strlen("und -ko"), aLocale, 0,
++                    aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32{4}, aResult.breakIndex);
++
++            // But not the non-breaking hyphen:
++
++            constexpr OUString str = u"und \u2011"_ustr;
++            aResult = m_xBreak->getLineBreak(
++                    str, strlen("und -ko"), aLocale, 0, aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32{5}, aResult.breakIndex);
++        }
++    }
++
++    // i#83649: "Line break should be between typographical quote and left bracket"
++    // - Actually: Spaces between quotation mark and opening punctuation not treated as a break.
++    // - Note that per the Unicode standard, prohibiting breaks in this context is intentional
++    // because it may cause issues in certain languages due to the various ways quotation
++    // characters are used.
++    // - We do it anyway by customizing the ICU line breaking rules.
++    {
++        {
++            // This uses the sample text provided in the bug report. Based on usage, it is assumed
++            // they were in the de_DE locale.
++
++            aLocale.Language = "de";
++            aLocale.Country = "DE";
++
++            // Per the bug report, it is expected that »angetan werden« remains on the first line.
+             const OUString str = u"»angetan werden« [Passiv]"_ustr;
+             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
+-                str, strlen("Xangetan werdenX ["), aLocale, 0, aHyphOptions, aUserOptions);
++                str, str.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
++
++            // The same result should be returned for this and the first case.
++            const OUString str2 = u"»angetan werden« Passiv"_ustr;
++            aResult = m_xBreak->getLineBreak(
++                str2, str2.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
++
++            // Under ICU rules, no amount of spaces would cause this to wrap.
++            const OUString str3 = u"»angetan werden«    [Passiv]"_ustr;
++            aResult = m_xBreak->getLineBreak(
++                str3, str3.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(20), aResult.breakIndex);
++
++            // However, tabs will
++            const OUString str4 = u"»angetan werden«\t[Passiv]"_ustr;
++            aResult = m_xBreak->getLineBreak(
++                str4, str4.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
++        }
++
++        {
++            // The same behavior is seen in English
++
++            aLocale.Language = "en";
++            aLocale.Country = "US";
++
++            const OUString str = u"\"angetan werden\" [Passiv]"_ustr;
++            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
++                str, str.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
++
++            const OUString str2 = u"\"angetan werden\" Passiv"_ustr;
++            aResult = m_xBreak->getLineBreak(
++                str2, str2.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
+             CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
+         }
+     }
+@@ -355,7 +474,7 @@ void TestBreakIterator::testLineBreaking()
+             auto res = m_xBreak->getLineBreak("Wort -prinzessinnen, wort",
+                                               strlen("Wort -prinzessinnen,"), aLocale, 0,
+                                               aHyphOptions, aUserOptions);
+-            CPPUNIT_ASSERT_EQUAL(sal_Int32{ 6 }, res.breakIndex);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32{ 5 }, res.breakIndex);
+         }
+     }
+ }
+@@ -638,7 +757,8 @@ void TestBreakIterator::testWordBoundaries()
+         CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
+     }
+ 
+-    //See https://bz.apache.org/ooo/show_bug.cgi?id=85411
++    // i#85411: ZWSP should be a word separator for spellchecking
++    // - This fix was applied to both dict and edit customizations
+     for (int j = 0; j < 3; ++j)
+     {
+         switch (j)
+@@ -660,21 +780,23 @@ void TestBreakIterator::testWordBoundaries()
+                 break;
+         }
+ 
+-        static constexpr OUString aTest =
+-            u"I\u200Bwant\u200Bto\u200Bgo"_ustr;
++        static constexpr OUString aTest = u"I\u200Bwant\u200Bto\u200Bgo"_ustr;
+ 
+         sal_Int32 nPos = 0;
+-        sal_Int32 aExpected[] = {1, 6, 9, 12};
++        sal_Int32 aExpected[] = { 1, 6, 9, 12 };
+         size_t i = 0;
+         do
+         {
+             CPPUNIT_ASSERT(i < std::size(aExpected));
+-            nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
+-                i18n::WordType::DICTIONARY_WORD, true).endPos;
+-            CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
++            auto dwPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
++                                                   i18n::WordType::DICTIONARY_WORD, true);
++            CPPUNIT_ASSERT_EQUAL(aExpected[i], dwPos.endPos);
++            auto ewPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
++                                                   i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
++            CPPUNIT_ASSERT_EQUAL(aExpected[i], ewPos.endPos);
++            nPos = dwPos.endPos;
+             ++i;
+-        }
+-        while (nPos++ < aTest.getLength());
++        } while (nPos++ < aTest.getLength());
+         CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
+     }
+ 
+@@ -814,121 +936,45 @@ void TestBreakIterator::testWordBoundaries()
+     }
+ 
+     // i#56347: "BreakIterator patch for Hungarian"
+-    // Rules for Hungarian affixes after numbers and certain symbols
+-    {
+-        auto mode = i18n::WordType::DICTIONARY_WORD;
+-        aLocale.Language = "hu";
+-        aLocale.Country = "HU";
+-
+-        OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr;
+-
+-        aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
+-
+-        aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
+-
+-        aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
+-
+-        aBounds = m_xBreak->getWordBoundary(aTest, 18, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos);
+-
+-        aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
+-
+-        aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
+-
+-        aBounds = m_xBreak->getWordBoundary(aTest, 34, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos);
+-    }
+-
+     // i#56348: Special chars in first pos not handled by spell checking in Writer (Hungarian)
+-    // Rules for Hungarian affixes after numbers and certain symbols in edit mode.
+-    // The patch was merged, but the original bug was never closed and the current behavior seems
+-    // identical to the ICU default behavior. Added this test to ensure that doesn't change.
++    // Rules for Hungarian affixes after numbers and certain symbols
+     {
+-        auto mode = i18n::WordType::ANY_WORD;
+         aLocale.Language = "hu";
+         aLocale.Country = "HU";
+ 
+         OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr;
+ 
+-        aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
+-
+-        aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
+-
+-        aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos);
+-
+-        aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos);
+-
+-        aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.endPos);
+-
+-        aBounds = m_xBreak->getWordBoundary(aTest, 13, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
+-
+-        aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.endPos);
+-
+-        aBounds = m_xBreak->getWordBoundary(aTest, 17, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos);
+-
+-        aBounds = m_xBreak->getWordBoundary(aTest, 19, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.endPos);
+-
+-        aBounds = m_xBreak->getWordBoundary(aTest, 20, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(21), aBounds.endPos);
+-
+-        aBounds = m_xBreak->getWordBoundary(aTest, 21, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(21), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos);
++        for (auto mode :
++             { i18n::WordType::DICTIONARY_WORD, i18n::WordType::ANYWORD_IGNOREWHITESPACES })
++        {
++            aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
+ 
+-        aBounds = m_xBreak->getWordBoundary(aTest, 24, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.endPos);
++            aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
+ 
+-        aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(26), aBounds.endPos);
++            aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
+ 
+-        aBounds = m_xBreak->getWordBoundary(aTest, 26, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(26), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(27), aBounds.endPos);
++            aBounds = m_xBreak->getWordBoundary(aTest, 18, aLocale, mode, true);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos);
+ 
+-        aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(27), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
++            aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
+ 
+-        aBounds = m_xBreak->getWordBoundary(aTest, 30, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.endPos);
++            aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
+ 
+-        aBounds = m_xBreak->getWordBoundary(aTest, 31, aLocale, mode, true);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos);
+-        CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos);
++            aBounds = m_xBreak->getWordBoundary(aTest, 34, aLocale, mode, true);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos);
++        }
+     }
+ }
+ 
+@@ -967,6 +1013,56 @@ void TestBreakIterator::testSentenceBoundaries()
+         CPPUNIT_ASSERT_EQUAL(sal_Int32(24), m_xBreak->beginOfSentence(aTest, 26, aLocale));
+         CPPUNIT_ASSERT_EQUAL(sal_Int32(53), m_xBreak->endOfSentence(aTest, 26, aLocale));
+     }
++
++    // i#55063: Sentence selection in Thai should select a space-delimited phrase.
++    // - This customization broke at some point. It works in an English locale in a synthetic test
++    // like this one, but does not work in the Thai locale, nor on Thai text in practice.
++    {
++        static constexpr OUString aTest = u"ว้อย โหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอก โปรโมเตอร์"_ustr;
++
++        aLocale.Language = "en";
++        aLocale.Country = "US";
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(46), m_xBreak->endOfSentence(aTest, 23, aLocale));
++
++        aLocale.Language = "th";
++        aLocale.Country = "TH";
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(46), m_xBreak->endOfSentence(aTest, 23, aLocale));
++    }
++
++    // i#55063: Thai phrases should delimit English sentence selection.
++    // - This customization broke at some point. It works in an English locale in a synthetic test
++    // like this one, but does not work in the Thai locale, nor on Thai text in practice.
++    {
++        static constexpr OUString aTest = u"ว้อย English usually ends with a period โปรโมเตอร์."_ustr;
++
++        aLocale.Language = "en";
++        aLocale.Country = "US";
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(51), m_xBreak->endOfSentence(aTest, 23, aLocale));
++
++        aLocale.Language = "th";
++        aLocale.Country = "TH";
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(51), m_xBreak->endOfSentence(aTest, 23, aLocale));
++    }
++
++    // i#55063: Characteristic test for English text delimiting Thai phrases (sentences)
++    // - English text should not delimit Thai phrases.
++    {
++        static constexpr OUString aTest = u"Englishโหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอกEnglish"_ustr;
++
++        aLocale.Language = "en";
++        aLocale.Country = "US";
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(44), m_xBreak->endOfSentence(aTest, 23, aLocale));
++
++        aLocale.Language = "th";
++        aLocale.Country = "TH";
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(44), m_xBreak->endOfSentence(aTest, 23, aLocale));
++    }
+ }
+ 
+ //See https://bugs.libreoffice.org/show_bug.cgi?id=40292
+@@ -1501,6 +1597,7 @@ void TestBreakIterator::testLegacyHebrewQuoteInsideWord()
+     aLocale.Language = "he";
+     aLocale.Country = "IL";
+ 
++    // i#51661: Add quotation mark as middle letter for Hebrew
+     {
+         auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr;
+ 
+@@ -1514,6 +1611,21 @@ void TestBreakIterator::testLegacyHebrewQuoteInsideWord()
+         CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
+         CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos);
+     }
++
++    // i#51661: Add quotation mark as middle letter for Hebrew
++    {
++        auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr;
++
++        i18n::Boundary aBounds = m_xBreak->getWordBoundary(
++            aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 13, aLocale,
++                                            i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos);
++    }
+ }
+ 
+ void TestBreakIterator::testLegacySurrogatePairs()
+diff --git a/i18npool/source/breakiterator/data/dict_word.txt b/i18npool/source/breakiterator/data/dict_word.txt
+index b1666f44daab..f804b0eec214 100644
+--- a/i18npool/source/breakiterator/data/dict_word.txt
++++ b/i18npool/source/breakiterator/data/dict_word.txt
+@@ -1,148 +1,199 @@
+ #
+-#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
+-#       All Rights Reserved.
++# Copyright (C) 2016 and later: Unicode, Inc. and others.
++# License & terms of use: http://www.unicode.org/copyright.html
++# Copyright (C) 2002-2016, International Business Machines Corporation
++# and others. All Rights Reserved.
+ #
+-#   file:  dict_word.txt   
++# file:  word.txt
+ #
+-#   ICU Word Break Rules
++# ICU Word Break Rules
+ #      See Unicode Standard Annex #29.
+-#      These rules are based on Version 4.0.0, dated 2003-04-17
++#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
+ #
++# Note:  Updates to word.txt will usually need to be merged into
++#        word_POSIX.txt also.
+ 
+-
+-
+-####################################################################################
++##############################################################################
+ #
+ #  Character class definitions from TR 29
+ #
+-####################################################################################
+-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] 
+-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
+-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
+-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
+-
+-$Ideographic = [:Ideographic:];
+-$Hangul = [:Script = HANGUL:];
+-
+-$ALetter   = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:]
+-                           - $Ideographic
+-                           - $Katakana
+-                           - $Hangul
+-                           - [:Script = Thai:]
+-                           - [:Script = Lao:]
+-                           - [:Script = Hiragana:]];
+-                           
+-$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] 
+-              [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
+-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] 
+-              [:name = HYPHEN-MINUS:] ];
+-
+-$SufixLetter = [:name= FULL STOP:];
+-              
+-
+-$MidNum    = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
+-             [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
+-             [:name = PRIME:]];
+-$Numeric   = [:LineBreak = Numeric:];
+-
+-
+-$TheZWSP = \u200b;
++##############################################################################
++
++### BEGIN CUSTOMIZATION
++### This file contains LibreOffice-specific rule customizations.
++###
++### To aid future maintainability:
++### - The change location should be bracketed by comments of this form.
++### - The original rule should be commented out, and the modified rule placed alongside.
++### - By doing this, maintainers can more easily compare to an upstream baseline.
++###
++### END CUSTOMIZATION
++
++!!chain;
++!!quoted_literals_only;
++
+ 
+ #
+ #  Character Class Definitions.
+-#    The names are those from TR29.
+ #
+-$CR         = \u000d;
+-$LF         = \u000a;
+-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
+-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
+ 
++$Han                = [:Han:];
+ 
++$CR                 = [\p{Word_Break = CR}];
++$LF                 = [\p{Word_Break = LF}];
++$Newline            = [\p{Word_Break = Newline}];
++$Extend             = [\p{Word_Break = Extend}-$Han];
++$ZWJ                = [\p{Word_Break = ZWJ}];
++$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
++$Format             = [\p{Word_Break = Format}];
++$Katakana           = [\p{Word_Break = Katakana}];
++$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
++$ALetter            = [\p{Word_Break = ALetter}];
++$Single_Quote       = [\p{Word_Break = Single_Quote}];
++$Double_Quote       = [\p{Word_Break = Double_Quote}];
++$MidNumLet          = [\p{Word_Break = MidNumLet}];
++$MidNum             = [\p{Word_Break = MidNum}];
++$Numeric            = [\p{Word_Break = Numeric}];
++$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
++$WSegSpace          = [\p{Word_Break = WSegSpace}];
++$Extended_Pict      = [\p{Extended_Pictographic}];
+ 
++### BEGIN CUSTOMIZATION
++### Unknown issue number: Dictionary words can contain hyphens
++### tdf#49885: Sync custom BreakIterator rules with ICU originals
++### - ICU is now more permissive about punctuation inside words.
++### - For compatibility, exclude certain characters that were previously excluded.
+ 
+-####################################################################################
+-#
+-#  Word Break Rules.    Definitions and Rules specific to word break begin Here. 
+-#
+-####################################################################################
++$IncludedML         = [:name = HYPHEN-MINUS:];
++$ExcludedML         = [[:name = COLON:]
++                       [:name = GREEK ANO TELEIA:]
++                       [:name = PRESENTATION FORM FOR VERTICAL COLON:]
++                       [:name = SMALL COLON:]
++                       [:name = FULLWIDTH COLON:]];
+ 
+-$Format    = [[:Cf:] - $TheZWSP];
++# $MidLetter          = [\p{Word_Break = MidLetter}];
++$MidLetter          = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML];
+ 
++### END CUSTOMIZATION
+ 
++$Hiragana           = [:Hiragana:];
++$Ideographic        = [\p{Ideographic}];
+ 
+-# Rule 3:  Treat a grapheme cluster as if it were a single character.
+-#          Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
+-#          because we don't need to find the boundaries between adjacent syllables -
+-#          they won't be word boundaries.
+-#
+ 
++#   Dictionary character set, for triggering language-based break engines. Currently
++#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
++#   5.0 or later as the definition of Complex_Context was corrected to include all
++#   characters requiring dictionary break.
+ 
+-#
+-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the base char.
+-#
+-$ALetterEx    = $ALetter   $Extend*; 
+-$NumericEx    = $Numeric   $Extend*;
+-$MidNumEx     = $MidNum    $Extend*;
+-$MidLetterEx  = $MidLetter $Extend*;
+-$SufixLetterEx= $SufixLetter $Extend*;
+-$KatakanaEx   = $Katakana  $Extend*;
+-$IdeographicEx= $Ideographic  $Extend*;
+-$HangulEx = $Hangul  $Extend*;
+-$FormatEx     = $Format    $Extend*;
++$Control        = [\p{Grapheme_Cluster_Break = Control}];
++$HangulSyllable = [\uac00-\ud7a3];
++$ComplexContext = [:LineBreak = Complex_Context:];
++$KanaKanji      = [$Han $Hiragana $Katakana];
++$dictionaryCJK  = [$KanaKanji $HangulSyllable];
++$dictionary     = [$ComplexContext $dictionaryCJK];
+ 
++# TODO: check if handling of katakana in dictionary makes rules incorrect/void
+ 
+-#
+-#  Numbers.  Rules 8, 11, 12 form the TR.
+-#
+-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
+-$NumberSequence {100};
++# leave CJK scripts out of ALetterPlus
++$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
+ 
+-#
+-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
+-#     - must include at least one letter. 
+-#     - may include both letters and numbers.
+-#     - may include  MideLetter, MidNumber punctuation.
+-#
+-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*;     # rules #6, #7
+-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200};
+ 
+-[[:P:][:S:]]*;
++## -------------------------------------------------
+ 
++# Rule 3 - CR x LF
+ #
+-#  Do not break between Katakana.   Rule #13.
+-#
+-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
+-[:Hiragana:] $Extend* {300};
++$CR $LF;
+ 
++# Rule 3c   Do not break within emoji zwj sequences.
++#             ZWJ ×  \p{Extended_Pictographic}.  Precedes WB4, so no intervening Extend chars allowed.
+ #
+-#  Ideographic Characters.  Stand by themselves as words.
+-#                           Separated from the "Everything Else" rule, below, only so that they
+-#                           can be tagged with a return value.   TODO:  is this what we want?
+-#
+-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
+-$HangulEx ($FormatEx* $HangulEx)* {400};
++$ZWJ $Extended_Pict;
+ 
++# Rule 3d - Keep horizontal whitespace together.
+ #
+-#  Everything Else, with no tag.
+-#                   Non-Control chars combine with $Extend (combining) chars.
+-#                   Controls are do not.
+-#
+-[^$Control [:Ideographic:]] $Extend*;
+-$CR $LF;
++$WSegSpace $WSegSpace;
++
++# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
++#          of a region of Text.
++
++$ExFm  = [$Extend $Format $ZWJ];
++
++^$ExFm+;            # This rule fires only when there are format or extend characters at the
++                    # start of text, or immediately following another boundary. It groups them, in
++                    # the event there are more than one.
++
++[^$CR $LF $Newline $ExFm] $ExFm*;   # This rule attaches trailing format/extends to words,
++                                    # with no special rule status value.
++
++$Numeric $ExFm* {100};              # This group of rules also attach trailing format/extends, but
++$ALetterPlus $ExFm* {200};          # with rule status set based on the word's final base character.
++$HangulSyllable {200};
++$Hebrew_Letter $ExFm* {200};
++$Katakana $ExFm* {400};             # note:  these status values override those from rule 5
++$Hiragana $ExFm* {400};             #        by virtue of being numerically larger.
++$Ideographic $ExFm* {400};          #
+ 
+ #
+-#  Reverse Rules.   Back up over any of the chars that can group together.
+-#                   (Reverse rules do not need to be exact; they can back up  too far,
+-#                   but must back up at least enough, and must stop on a boundary.)
++# rule 5
++#    Do not break between most letters.
+ #
++($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
++
++# rule 6 and 7
++($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
++
++# rule 7a
++$Hebrew_Letter $ExFm* $Single_Quote {200};
++
++# rule 7b and 7c
++$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
++
++# rule 8
++
++$Numeric $ExFm* $Numeric;
++
++# rule 9
++
++($ALetterPlus | $Hebrew_Letter)  $ExFm* $Numeric;
++
++# rule 10
++
++$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
++
++# rule 11 and 12
++
++$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
++
++# rule 13
++# to be consistent with $KanaKanji $KanaKanji, changed
++# from 300 to 400.
++# See also TestRuleStatus in intltest/rbbiapts.cpp
++$Katakana $ExFm*  $Katakana {400};
++
++# rule 13a/b
++
++$ALetterPlus   $ExFm* $ExtendNumLet {200};    #  (13a)
++$Hebrew_Letter $ExFm* $ExtendNumLet {200};    #  (13a)
++$Numeric       $ExFm* $ExtendNumLet {100};    #  (13a)
++$Katakana      $ExFm* $ExtendNumLet {400};    #  (13a)
++$ExtendNumLet  $ExFm* $ExtendNumLet {200};    #  (13a)
++
++$ExtendNumLet  $ExFm* $ALetterPlus  {200};    #  (13b)
++$ExtendNumLet  $ExFm* $Hebrew_Letter {200};    #  (13b)
++$ExtendNumLet  $ExFm* $Numeric      {100};    #  (13b)
++$ExtendNumLet  $ExFm* $Katakana     {400};    #  (13b)
+ 
+-# NonStarters are the set of all characters that can appear at the 2nd - nth position of
+-#    a word.   (They may also be the first.)   The reverse rule skips over these, until it
+-#    reaches something that can only be the start (and probably only) char in a "word".
+-#    A space or punctuation meets the test.
++# rules 15 - 17
++#    Pairs of Regional Indicators stay together.
++#    With incoming rule chaining disabled by ^, this rule will match exactly two of them.
++#    No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
+ #
+-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format];
++^$Regional_Indicator $ExFm* $Regional_Indicator;
+ 
+-#!.*;
+-! ($NonStarters* | \n \r) .;
++# special handling for CJK characters: chain for later dictionary segmentation
++$HangulSyllable $HangulSyllable {200};
++$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
+ 
++# Rule 999
++#     Match a single code point if no other rule applies.
++.;
+diff --git a/i18npool/source/breakiterator/data/dict_word_he.txt b/i18npool/source/breakiterator/data/dict_word_he.txt
+deleted file mode 100644
+index 40197d92a431..000000000000
+--- a/i18npool/source/breakiterator/data/dict_word_he.txt
++++ /dev/null
+@@ -1,139 +0,0 @@
+-#
+-#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
+-#       All Rights Reserved.
+-#
+-#   file:  dict_word.txt   
+-#
+-#   ICU Word Break Rules
+-#      See Unicode Standard Annex #29.
+-#      These rules are based on Version 4.0.0, dated 2003-04-17
+-#
+-
+-
+-
+-####################################################################################
+-#
+-#  Character class definitions from TR 29
+-#
+-####################################################################################
+-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] 
+-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
+-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
+-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
+-
+-
+-$ALetter   = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:]
+-                           - $Katakana
+-                           - [:Script = Thai:]
+-                           - [:Script = Lao:]
+-                           - [:Script = Hiragana:]];
+-                           
+-$MidLetter = [[:name = QUOTATION MARK:] [:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:]
+-              [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
+-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] [:name = HYPHEN-MINUS:]];  
+-              
+-$SufixLetter = [:name= FULL STOP:];
+-
+-$MidNum    = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
+-             [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
+-             [:name = PRIME:]];
+-$Numeric   = [:LineBreak = Numeric:];
+-
+-
+-$TheZWSP = \u200b;
+-
+-#
+-#  Character Class Definitions.
+-#    The names are those from TR29.
+-#
+-$CR         = \u000d;
+-$LF         = \u000a;
+-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
+-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
+-
+-
+-
+-
+-####################################################################################
+-#
+-#  Word Break Rules.    Definitions and Rules specific to word break begin Here. 
+-#
+-####################################################################################
+-
+-$Format    = [[:Cf:] - $TheZWSP];
+-
+-
+-
+-# Rule 3:  Treat a grapheme cluster as if it were a single character.
+-#          Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
+-#          because we don't need to find the boundaries between adjacent syllables -
+-#          they won't be word boundaries.
+-#
+-
+-
+-#
+-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the base char.
+-#
+-$ALetterEx    = $ALetter   $Extend*; 
+-$NumericEx    = $Numeric   $Extend*;
+-$MidNumEx     = $MidNum    $Extend*;
+-$MidLetterEx  = $MidLetter $Extend*;
+-$SufixLetterEx= $SufixLetter $Extend*;
+-$KatakanaEx   = $Katakana  $Extend*;
+-$FormatEx     = $Format    $Extend*;
+-
+-
+-#
+-#  Numbers.  Rules 8, 11, 12 form the TR.
+-#
+-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
+-$NumberSequence {100};
+-
+-#
+-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
+-#     - must include at least one letter. 
+-#     - may include both letters and numbers.
+-#     - may include  MideLetter, MidNumber punctuation.
+-#
+-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*;     # rules #6, #7
+-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200};
+-
+-[[:P:][:S:]]*;
+-
+-#
+-#  Do not break between Katakana.   Rule #13.
+-#
+-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
+-[:Hiragana:] $Extend* {300};
+-
+-#
+-#  Ideographic Characters.  Stand by themselves as words.
+-#                           Separated from the "Everything Else" rule, below, only so that they
+-#                           can be tagged with a return value.   TODO:  is this what we want?
+-#
+-# [:IDEOGRAPHIC:] $Extend* {400};
+-
+-#
+-#  Everything Else, with no tag.
+-#                   Non-Control chars combine with $Extend (combining) chars.
+-#                   Controls are do not.
+-#
+-[^$Control [:Ideographic:]] $Extend*;
+-$CR $LF;
+-
+-#
+-#  Reverse Rules.   Back up over any of the chars that can group together.
+-#                   (Reverse rules do not need to be exact; they can back up  too far,
+-#                   but must back up at least enough, and must stop on a boundary.)
+-#
+-
+-# NonStarters are the set of all characters that can appear at the 2nd - nth position of
+-#    a word.   (They may also be the first.)   The reverse rule skips over these, until it
+-#    reaches something that can only be the start (and probably only) char in a "word".
+-#    A space or punctuation meets the test.
+-#
+-$NonStarters = [$Numeric $ALetter $Katakana [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format];
+-
+-#!.*;
+-! ($NonStarters* | \n \r) .;
+-
+diff --git a/i18npool/source/breakiterator/data/dict_word_hu.txt b/i18npool/source/breakiterator/data/dict_word_hu.txt
+index b0a0276b36a8..88648e6e5716 100644
+--- a/i18npool/source/breakiterator/data/dict_word_hu.txt
++++ b/i18npool/source/breakiterator/data/dict_word_hu.txt
+@@ -1,176 +1,222 @@
+ #
+-#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
+-#       All Rights Reserved.
++# Copyright (C) 2016 and later: Unicode, Inc. and others.
++# License & terms of use: http://www.unicode.org/copyright.html
++# Copyright (C) 2002-2016, International Business Machines Corporation
++# and others. All Rights Reserved.
+ #
+-#   file:  dict_word.txt   
++# file:  word.txt
+ #
+-#   ICU Word Break Rules
++# ICU Word Break Rules
+ #      See Unicode Standard Annex #29.
+-#      These rules are based on Version 4.0.0, dated 2003-04-17
++#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
+ #
++# Note:  Updates to word.txt will usually need to be merged into
++#        word_POSIX.txt also.
+ 
+-
+-
+-####################################################################################
++##############################################################################
+ #
+ #  Character class definitions from TR 29
+ #
+-####################################################################################
+-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] 
+-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
+-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
+-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
+-
+-$Ideographic = [:Ideographic:];
+-$Hangul = [:Script = HANGUL:];
+-
+-
+-# Fix spelling of a)-ban, b)-ben, when the letter is a reference
+-# resulting bad word breaking "ban" and "ben"
+-# (reference fields are not expanded in spell checking, yet, only
+-# for grammar checking).
+-
+-$PrefixLetter = [[:name = RIGHT PARENTHESIS:]];
+-
+-$ALetter   = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:]
+-                [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:]
+-                [:name = SECTION SIGN:] [:name = DEGREE SIGN:] [:name = EURO SIGN:]
+-                [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:]
+-                [:name = DIGIT ZERO:]
+-                [:name = DIGIT ONE:]
+-                [:name = DIGIT TWO:]
+-                [:name = DIGIT THREE:]
+-                [:name = DIGIT FOUR:]
+-                [:name = DIGIT FIVE:]
+-                [:name = DIGIT SIX:]
+-                [:name = DIGIT SEVEN:]
+-                [:name = DIGIT EIGHT:]
+-                [:name = DIGIT NINE:]
+-                           - $Ideographic
+-                           - $Katakana
+-                           - $Hangul
+-                           - [:Script = Thai:]
+-                           - [:Script = Lao:]
+-                           - [:Script = Hiragana:]];
+-                           
+-$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:]
+-              [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
+-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] [:name = HYPHEN-MINUS:]
+-              [:name = EURO SIGN:] [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:]
+-              [:name = EN DASH:] [:name = EM DASH:]
+-              [:name = RIGHT DOUBLE QUOTATION MARK:]
+-              [:name = LEFT PARENTHESIS:]
+-              [:name = RIGHT PARENTHESIS:]
+-              [:name = RIGHT SQUARE BRACKET:]
+-              [:name = EXCLAMATION MARK:]
+-              [:name = QUESTION MARK:]
+-              [:name = FULL STOP:] [:name = PERCENT SIGN:] [:name = SECTION SIGN:] [:name = DEGREE SIGN:]];  
+-              
+-$SufixLetter = [:name= FULL STOP:];
+-
+-$MidNum    = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
+-             [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
+-             [:name = PRIME:]];
+-$Numeric   = [:LineBreak = Numeric:];
+-
+-
+-$TheZWSP = \u200b;
++##############################################################################
++
++### BEGIN CUSTOMIZATION
++### This file contains LibreOffice-specific rule customizations.
++###
++### To aid future maintainability:
++### - The change location should be bracketed by comments of this form.
++### - The original rule should be commented out, and the modified rule placed alongside.
++### - By doing this, maintainers can more easily compare to an upstream baseline.
++###
++### END CUSTOMIZATION
++
++!!chain;
++!!quoted_literals_only;
++
+ 
+ #
+ #  Character Class Definitions.
+-#    The names are those from TR29.
+ #
+-$CR         = \u000d;
+-$LF         = \u000a;
+-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
+-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
+-
+-
+ 
++$Han                = [:Han:];
++
++$CR                 = [\p{Word_Break = CR}];
++$LF                 = [\p{Word_Break = LF}];
++$Newline            = [\p{Word_Break = Newline}];
++$Extend             = [\p{Word_Break = Extend}-$Han];
++$ZWJ                = [\p{Word_Break = ZWJ}];
++$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
++$Format             = [\p{Word_Break = Format}];
++$Katakana           = [\p{Word_Break = Katakana}];
++$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
++$Single_Quote       = [\p{Word_Break = Single_Quote}];
++$Double_Quote       = [\p{Word_Break = Double_Quote}];
++$MidNumLet          = [\p{Word_Break = MidNumLet}];
++$MidNum             = [\p{Word_Break = MidNum}];
++$Numeric            = [\p{Word_Break = Numeric}];
++$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
++$WSegSpace          = [\p{Word_Break = WSegSpace}];
++$Extended_Pict      = [\p{Extended_Pictographic}];
++
++### BEGIN CUSTOMIZATION
++### Unknown issue number: Dictionary words can contain hyphens
++### tdf#49885: Sync custom BreakIterator rules with ICU originals
++### - ICU is now more permissive about punctuation inside words.
++### - For compatibility, exclude certain characters that were previously excluded.
++### tdf#116072: Extend MidLetter in Hungarian word breaking
++### i#56347: BreakIterator patch for Hungarian
++### i#56348: Special chars in first pos not handled by spell checking for Hungarian
++
++$Symbols_hu         = [[:name = PERCENT SIGN:]
++                       [:name = PER MILLE SIGN:]
++                       [:name = PER TEN THOUSAND SIGN:]
++                       [:name = SECTION SIGN:]
++                       [:name = DEGREE SIGN:]
++                       [:name = EURO SIGN:]
++                       [:name = HYPHEN-MINUS:]
++                       [:name = EN DASH:]
++                       [:name = EM DASH:]];
++
++#$ALetter            = [\p{Word_Break = ALetter}];
++$ALetter            = [\p{Word_Break = ALetter} $Symbols_hu];
++
++$IncludedML         = [:name = HYPHEN-MINUS:];
++$ExcludedML         = [[:name = COLON:]
++                       [:name = GREEK ANO TELEIA:]
++                       [:name = PRESENTATION FORM FOR VERTICAL COLON:]
++                       [:name = SMALL COLON:]
++                       [:name = FULLWIDTH COLON:]];
++
++$IncludedML_hu      = [[:name = RIGHT DOUBLE QUOTATION MARK:]
++                       [:name = LEFT PARENTHESIS:]
++                       [:name = RIGHT PARENTHESIS:]
++                       [:name = RIGHT SQUARE BRACKET:]
++                       [:name = EXCLAMATION MARK:]
++                       [:name = QUESTION MARK:]
++                       $Symbols_hu];
++
++# $MidLetter          = [\p{Word_Break = MidLetter}];
++$MidLetter          = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML $IncludedML_hu];
++
++### END CUSTOMIZATION
++
++$Hiragana           = [:Hiragana:];
++$Ideographic        = [\p{Ideographic}];
++
++
++#   Dictionary character set, for triggering language-based break engines. Currently
++#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
++#   5.0 or later as the definition of Complex_Context was corrected to include all
++#   characters requiring dictionary break.
++
++$Control        = [\p{Grapheme_Cluster_Break = Control}];
++$HangulSyllable = [\uac00-\ud7a3];
++$ComplexContext = [:LineBreak = Complex_Context:];
++$KanaKanji      = [$Han $Hiragana $Katakana];
++$dictionaryCJK  = [$KanaKanji $HangulSyllable];
++$dictionary     = [$ComplexContext $dictionaryCJK];
++
++# TODO: check if handling of katakana in dictionary makes rules incorrect/void
++
++# leave CJK scripts out of ALetterPlus
++$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
++
++
++## -------------------------------------------------
++
++# Rule 3 - CR x LF
++#
++$CR $LF;
+ 
+-####################################################################################
++# Rule 3c   Do not break within emoji zwj sequences.
++#             ZWJ ×  \p{Extended_Pictographic}.  Precedes WB4, so no intervening Extend chars allowed.
+ #
+-#  Word Break Rules.    Definitions and Rules specific to word break begin Here. 
++$ZWJ $Extended_Pict;
++
++# Rule 3d - Keep horizontal whitespace together.
+ #
+-####################################################################################
++$WSegSpace $WSegSpace;
+ 
+-$Format    = [[:Cf:] - $TheZWSP];
++# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
++#          of a region of Text.
+ 
++$ExFm  = [$Extend $Format $ZWJ];
+ 
++^$ExFm+;            # This rule fires only when there are format or extend characters at the
++                    # start of text, or immediately following another boundary. It groups them, in
++                    # the event there are more than one.
+ 
+-# Rule 3:  Treat a grapheme cluster as if it were a single character.
+-#          Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
+-#          because we don't need to find the boundaries between adjacent syllables -
+-#          they won't be word boundaries.
+-#
++[^$CR $LF $Newline $ExFm] $ExFm*;   # This rule attaches trailing format/extends to words,
++                                    # with no special rule status value.
+ 
++$Numeric $ExFm* {100};              # This group of rules also attach trailing format/extends, but
++$ALetterPlus $ExFm* {200};          # with rule status set based on the word's final base character.
++$HangulSyllable {200};
++$Hebrew_Letter $ExFm* {200};
++$Katakana $ExFm* {400};             # note:  these status values override those from rule 5
++$Hiragana $ExFm* {400};             #        by virtue of being numerically larger.
++$Ideographic $ExFm* {400};          #
+ 
+ #
+-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the base char.
++# rule 5
++#    Do not break between most letters.
+ #
+-$ALetterEx    = $ALetter   $Extend*; 
+-$NumericEx    = $Numeric   $Extend*;
+-$MidNumEx     = $MidNum    $Extend*;
+-$MidLetterEx  = $MidLetter $Extend*;
+-$SufixLetterEx= $SufixLetter $Extend*;
+-$KatakanaEx   = $Katakana  $Extend*;
+-$IdeographicEx= $Ideographic  $Extend*;
+-$HangulEx = $Hangul  $Extend*;
+-$FormatEx     = $Format    $Extend*;
++($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
+ 
++# rule 6 and 7
++($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
+ 
+-#
+-#  Numbers.  Rules 8, 11, 12 form the TR.
+-#
+-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
+-$NumberSequence {100};
++# rule 7a
++$Hebrew_Letter $ExFm* $Single_Quote {200};
+ 
+-#
+-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
+-#     - must include at least one letter. 
+-#     - may include both letters and numbers.
+-#     - may include  MideLetter, MidNumber punctuation.
+-#
+-$LetterSequence = $PrefixLetter? $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*;     # rules #6, #7
+-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200};
++# rule 7b and 7c
++$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
+ 
+-[[:P:][:S:]]*;
++# rule 8
+ 
+-#
+-#  Do not break between Katakana.   Rule #13.
+-#
+-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
+-[:Hiragana:] $Extend* {300};
++$Numeric $ExFm* $Numeric;
+ 
+-#
+-#  Ideographic Characters.  Stand by themselves as words.
+-#                           Separated from the "Everything Else" rule, below, only so that they
+-#                           can be tagged with a return value.   TODO:  is this what we want?
+-#
+-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
+-$HangulEx ($FormatEx* $HangulEx)* {400};
++# rule 9
+ 
+-#
+-#  Everything Else, with no tag.
+-#                   Non-Control chars combine with $Extend (combining) chars.
+-#                   Controls are do not.
+-#
+-[^$Control [:Ideographic:]] $Extend*;
+-$CR $LF;
++($ALetterPlus | $Hebrew_Letter)  $ExFm* $Numeric;
+ 
+-#
+-#  Reverse Rules.   Back up over any of the chars that can group together.
+-#                   (Reverse rules do not need to be exact; they can back up  too far,
+-#                   but must back up at least enough, and must stop on a boundary.)
+-#
++# rule 10
++
++$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
++
++# rule 11 and 12
++
++$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
++
++# rule 13
++# To be consistent with the "$KanaKanji $KanaKanji" rule below, the rule status
++# was changed from 300 to 400.
++# See also TestRuleStatus in intltest/rbbiapts.cpp
++$Katakana $ExFm*  $Katakana {400};
++
++# rule 13a/b
++
++$ALetterPlus   $ExFm* $ExtendNumLet {200};    #  (13a)
++$Hebrew_Letter $ExFm* $ExtendNumLet {200};    #  (13a)
++$Numeric       $ExFm* $ExtendNumLet {100};    #  (13a)
++$Katakana      $ExFm* $ExtendNumLet {400};    #  (13a)
++$ExtendNumLet  $ExFm* $ExtendNumLet {200};    #  (13a)
++
++$ExtendNumLet  $ExFm* $ALetterPlus  {200};    #  (13b)
++$ExtendNumLet  $ExFm* $Hebrew_Letter {200};    #  (13b)
++$ExtendNumLet  $ExFm* $Numeric      {100};    #  (13b)
++$ExtendNumLet  $ExFm* $Katakana     {400};    #  (13b)
+ 
+-# NonStarters are the set of all characters that can appear at the 2nd - nth position of
+-#    a word.   (They may also be the first.)   The reverse rule skips over these, until it
+-#    reaches something that can only be the start (and probably only) char in a "word".
+-#    A space or punctuation meets the test.
++# rules 15 - 17
++#    Pairs of Regional Indicators stay together.
++#    With incoming rule chaining disabled by ^, this rule will match exactly two of them.
++#    No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
+ #
+-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format];
++^$Regional_Indicator $ExFm* $Regional_Indicator;
+ 
+-#!.*;
+-! ($NonStarters* | \n \r) .;
++# special handling for CJK characters: chain for later dictionary segmentation
++$HangulSyllable $HangulSyllable {200};
++$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
+ 
++# Rule 999
++#     Match a single code point if no other rule applies.
++.;
+diff --git a/i18npool/source/breakiterator/data/dict_word_nodash.txt b/i18npool/source/breakiterator/data/dict_word_nodash.txt
+deleted file mode 100644
+index 279cc50e5b66..000000000000
+--- a/i18npool/source/breakiterator/data/dict_word_nodash.txt
++++ /dev/null
+@@ -1,147 +0,0 @@
+-#
+-#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
+-#       All Rights Reserved.
+-#
+-#   file:  dict_word.txt   
+-#
+-#   ICU Word Break Rules
+-#      See Unicode Standard Annex #29.
+-#      These rules are based on Version 4.0.0, dated 2003-04-17
+-#
+-
+-
+-
+-####################################################################################
+-#
+-#  Character class definitions from TR 29
+-#
+-####################################################################################
+-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] 
+-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
+-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
+-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
+-
+-$Ideographic = [:Ideographic:];
+-$Hangul = [:Script = HANGUL:];
+-
+-$ALetter   = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:]
+-                           - $Ideographic
+-                           - $Katakana
+-                           - $Hangul
+-                           - [:Script = Thai:]
+-                           - [:Script = Lao:]
+-                           - [:Script = Hiragana:]];
+-                           
+-$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] 
+-              [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
+-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] ];  
+-
+-$SufixLetter = [:name= FULL STOP:];
+-              
+-
+-$MidNum    = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
+-             [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
+-             [:name = PRIME:]];
+-$Numeric   = [:LineBreak = Numeric:];
+-
+-
+-$TheZWSP = \u200b;
+-
+-#
+-#  Character Class Definitions.
+-#    The names are those from TR29.
+-#
+-$CR         = \u000d;
+-$LF         = \u000a;
+-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
+-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
+-
+-
+-
+-
+-####################################################################################
+-#
+-#  Word Break Rules.    Definitions and Rules specific to word break begin Here. 
+-#
+-####################################################################################
+-
+-$Format    = [[:Cf:] - $TheZWSP];
+-
+-
+-
+-# Rule 3:  Treat a grapheme cluster as if it were a single character.
+-#          Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
+-#          because we don't need to find the boundaries between adjacent syllables -
+-#          they won't be word boundaries.
+-#
+-
+-
+-#
+-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the base char.
+-#
+-$ALetterEx    = $ALetter   $Extend*; 
+-$NumericEx    = $Numeric   $Extend*;
+-$MidNumEx     = $MidNum    $Extend*;
+-$MidLetterEx  = $MidLetter $Extend*;
+-$SufixLetterEx= $SufixLetter $Extend*;
+-$KatakanaEx   = $Katakana  $Extend*;
+-$IdeographicEx= $Ideographic  $Extend*;
+-$HangulEx = $Hangul  $Extend*;
+-$FormatEx     = $Format    $Extend*;
+-
+-
+-#
+-#  Numbers.  Rules 8, 11, 12 form the TR.
+-#
+-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
+-$NumberSequence {100};
+-
+-#
+-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
+-#     - must include at least one letter. 
+-#     - may include both letters and numbers.
+-#     - may include  MideLetter, MidNumber punctuation.
+-#
+-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*;     # rules #6, #7
+-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200};
+-
+-[[:P:][:S:]]*;
+-
+-#
+-#  Do not break between Katakana.   Rule #13.
+-#
+-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
+-[:Hiragana:] $Extend* {300};
+-
+-#
+-#  Ideographic Characters.  Stand by themselves as words.
+-#                           Separated from the "Everything Else" rule, below, only so that they
+-#                           can be tagged with a return value.   TODO:  is this what we want?
+-#
+-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
+-$HangulEx ($FormatEx* $HangulEx)* {400};
+-
+-#
+-#  Everything Else, with no tag.
+-#                   Non-Control chars combine with $Extend (combining) chars.
+-#                   Controls are do not.
+-#
+-[^$Control [:Ideographic:]] $Extend*;
+-$CR $LF;
+-
+-#
+-#  Reverse Rules.   Back up over any of the chars that can group together.
+-#                   (Reverse rules do not need to be exact; they can back up  too far,
+-#                   but must back up at least enough, and must stop on a boundary.)
+-#
+-
+-# NonStarters are the set of all characters that can appear at the 2nd - nth position of
+-#    a word.   (They may also be the first.)   The reverse rule skips over these, until it
+-#    reaches something that can only be the start (and probably only) char in a "word".
+-#    A space or punctuation meets the test.
+-#
+-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format];
+-
+-#!.*;
+-! ($NonStarters* | \n \r) .;
+-
+diff --git a/i18npool/source/breakiterator/data/dict_word_prepostdash.txt b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt
+index fb29b478af21..b39503d1b405 100644
+--- a/i18npool/source/breakiterator/data/dict_word_prepostdash.txt
++++ b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt
+@@ -1,157 +1,221 @@
+ #
+-#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
+-#       All Rights Reserved.
++# Copyright (C) 2016 and later: Unicode, Inc. and others.
++# License & terms of use: http://www.unicode.org/copyright.html
++# Copyright (C) 2002-2016, International Business Machines Corporation
++# and others. All Rights Reserved.
+ #
+-#   file:  dict_word.txt   
++# file:  word.txt
+ #
+-#   ICU Word Break Rules
++# ICU Word Break Rules
+ #      See Unicode Standard Annex #29.
+-#      These rules are based on Version 4.0.0, dated 2003-04-17
++#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
+ #
++# Note:  Updates to word.txt will usually need to be merged into
++#        word_POSIX.txt also.
+ 
+-
+-
+-####################################################################################
++##############################################################################
+ #
+ #  Character class definitions from TR 29
+ #
+-####################################################################################
+-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] 
+-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
+-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
+-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
++##############################################################################
+ 
+-$Ideographic = [:Ideographic:];
+-$Hangul = [:Script = HANGUL:];
++### BEGIN CUSTOMIZATION
++### This file contains LibreOffice-specific rule customizations.
++###
++### To aid future maintainability:
++### - The change location should be bracketed by comments of this form.
++### - The original rule should be commented out, and the modified rule placed alongside.
++### - By doing this, maintainers can more easily compare to an upstream baseline.
++###
++### END CUSTOMIZATION
+ 
+-# list of dashes or hyphens that should be accepted as part of the word if a single one of these
+-# pre- or postfixes a word. E.g. in German: "Arbeits-" or "-nehmer" where that hyphen needs to
+-# be part of the word in order to have it properly spell checked etc.
+-$PrePostDashHyphen = [ [:name = HYPHEN-MINUS:] ];
++!!chain;
++!!quoted_literals_only;
+ 
+ 
+-$ALetter   = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:]
+-                           - $Ideographic
+-                           - $Katakana
+-                           - $Hangul
+-                           - [:Script = Thai:]
+-                           - [:Script = Lao:]
+-                           - [:Script = Hiragana:]];
+-                           
+-$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] 
+-              [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
+-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] 
+-              [:name = HYPHEN-MINUS:] ];
++#
++#  Character Class Definitions.
++#
+ 
+-$SufixLetter = [:name= FULL STOP:];
+-              
++$Han                = [:Han:];
+ 
+-$MidNum    = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
+-             [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
+-             [:name = PRIME:]];
+-$Numeric   = [:LineBreak = Numeric:];
++$CR                 = [\p{Word_Break = CR}];
++$LF                 = [\p{Word_Break = LF}];
++$Newline            = [\p{Word_Break = Newline}];
++$Extend             = [\p{Word_Break = Extend}-$Han];
++$ZWJ                = [\p{Word_Break = ZWJ}];
++$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
++$Format             = [\p{Word_Break = Format}];
++$Katakana           = [\p{Word_Break = Katakana}];
++$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
++$ALetter            = [\p{Word_Break = ALetter}];
++$Single_Quote       = [\p{Word_Break = Single_Quote}];
++$Double_Quote       = [\p{Word_Break = Double_Quote}];
++$MidNumLet          = [\p{Word_Break = MidNumLet}];
++$MidNum             = [\p{Word_Break = MidNum}];
++$Numeric            = [\p{Word_Break = Numeric}];
++$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
++$WSegSpace          = [\p{Word_Break = WSegSpace}];
++$Extended_Pict      = [\p{Extended_Pictographic}];
+ 
++### BEGIN CUSTOMIZATION
++### Unknown issue number: Dictionary words can contain hyphens
++### tdf#49885: Sync custom BreakIterator rules with ICU originals
++### - ICU is now more permissive about punctuation inside words.
++### - For compatibility, exclude certain characters that were previously excluded.
+ 
+-$TheZWSP = \u200b;
++$IncludedML         = [:name = HYPHEN-MINUS:];
++$ExcludedML         = [[:name = COLON:]
++                       [:name = GREEK ANO TELEIA:]
++                       [:name = PRESENTATION FORM FOR VERTICAL COLON:]
++                       [:name = SMALL COLON:]
++                       [:name = FULLWIDTH COLON:]];
+ 
+-#
+-#  Character Class Definitions.
+-#    The names are those from TR29.
+-#
+-$CR         = \u000d;
+-$LF         = \u000a;
+-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
+-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
++# $MidLetter          = [\p{Word_Break = MidLetter}];
++$MidLetter          = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML];
+ 
++### END CUSTOMIZATION
+ 
++### BEGIN CUSTOMIZATION
++### Unknown issue number: Allow leading and trailing hyphens in certain languages
++### This part of the customization does not replace any rules.
+ 
++$PrePostHyphen      = [:name = HYPHEN-MINUS:];
+ 
+-####################################################################################
+-#
+-#  Word Break Rules.    Definitions and Rules specific to word break begin Here. 
+-#
+-####################################################################################
++### END CUSTOMIZATION
+ 
+-$Format    = [[:Cf:] - $TheZWSP];
++$Hiragana           = [:Hiragana:];
++$Ideographic        = [\p{Ideographic}];
+ 
+ 
++#   Dictionary character set, for triggering language-based break engines. Currently
++#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
++#   5.0 or later as the definition of Complex_Context was corrected to include all
++#   characters requiring dictionary break.
+ 
+-# Rule 3:  Treat a grapheme cluster as if it were a single character.
+-#          Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
+-#          because we don't need to find the boundaries between adjacent syllables -
+-#          they won't be word boundaries.
+-#
++$Control        = [\p{Grapheme_Cluster_Break = Control}];
++$HangulSyllable = [\uac00-\ud7a3];
++$ComplexContext = [:LineBreak = Complex_Context:];
++$KanaKanji      = [$Han $Hiragana $Katakana];
++$dictionaryCJK  = [$KanaKanji $HangulSyllable];
++$dictionary     = [$ComplexContext $dictionaryCJK];
+ 
++# TODO: check if handling of katakana in dictionary makes rules incorrect/void
+ 
+-#
+-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the base char.
+-#
+-$ALetterEx    = $ALetter   $Extend*; 
+-$NumericEx    = $Numeric   $Extend*;
+-$MidNumEx     = $MidNum    $Extend*;
+-$MidLetterEx  = $MidLetter $Extend*;
+-$SufixLetterEx= $SufixLetter $Extend*;
+-$KatakanaEx   = $Katakana  $Extend*;
+-$IdeographicEx= $Ideographic  $Extend*;
+-$HangulEx = $Hangul  $Extend*;
+-$FormatEx     = $Format    $Extend*;
++# leave CJK scripts out of ALetterPlus
++$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
+ 
+ 
++## -------------------------------------------------
++
++# Rule 3 - CR x LF
+ #
+-#  Numbers.  Rules 8, 11, 12 form the TR.
+-#
+-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
+-$NumberSequence {100};
++$CR $LF;
+ 
++# Rule 3c   Do not break within emoji zwj sequences.
++#             ZWJ ×  \p{Extended_Pictographic}.  Precedes WB4, so no intervening Extend chars allowed.
+ #
+-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
+-#     - must include at least one letter. 
+-#     - may include both letters and numbers.
+-#     - may include  MideLetter, MidNumber punctuation.
++$ZWJ $Extended_Pict;
++
++# Rule 3d - Keep horizontal whitespace together.
+ #
+-# At most one leading or trailing dash/hyphen should be accepted as well.
+-# E.g. in German: "Arbeits-" or "-nehmer" where that hyphen needs to
+-# be part of the word in order to have it properly spell checked etc.
+-$LetterSequence = $PrePostDashHyphen? $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)* $PrePostDashHyphen?;     # rules #6, #7
+-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200};
++$WSegSpace $WSegSpace;
+ 
+-[[:P:][:S:]]*;
++# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
++#          of a region of Text.
+ 
+-#
+-#  Do not break between Katakana.   Rule #13.
+-#
+-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
+-[:Hiragana:] $Extend* {300};
++$ExFm  = [$Extend $Format $ZWJ];
+ 
+-#
+-#  Ideographic Characters.  Stand by themselves as words.
+-#                           Separated from the "Everything Else" rule, below, only so that they
+-#                           can be tagged with a return value.   TODO:  is this what we want?
+-#
+-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
+-$HangulEx ($FormatEx* $HangulEx)* {400};
++^$ExFm+;            # This rule fires only when there are format or extend characters at the
++                    # start of text, or immediately following another boundary. It groups them, in
++                    # the event there are more than one.
+ 
+-#
+-#  Everything Else, with no tag.
+-#                   Non-Control chars combine with $Extend (combining) chars.
+-#                   Controls are do not.
+-#
+-[^$Control [:Ideographic:]] $Extend*;
+-$CR $LF;
++[^$CR $LF $Newline $ExFm] $ExFm*;   # This rule attaches trailing format/extends to words,
++                                    # with no special rule status value.
++
++$Numeric $ExFm* {100};              # This group of rules also attach trailing format/extends, but
++$ALetterPlus $ExFm* {200};          # with rule status set based on the word's final base character.
++$HangulSyllable {200};
++$Hebrew_Letter $ExFm* {200};
++$Katakana $ExFm* {400};             # note:  these status values override those from rule 5
++$Hiragana $ExFm* {400};             #        by virtue of being numerically larger.
++$Ideographic $ExFm* {400};          #
+ 
+ #
+-#  Reverse Rules.   Back up over any of the chars that can group together.
+-#                   (Reverse rules do not need to be exact; they can back up  too far,
+-#                   but must back up at least enough, and must stop on a boundary.)
++# rule 5
++#    Do not break between most letters.
+ #
+ 
+-# NonStarters are the set of all characters that can appear at the 2nd - nth position of
+-#    a word.   (They may also be the first.)   The reverse rule skips over these, until it
+-#    reaches something that can only be the start (and probably only) char in a "word".
+-#    A space or punctuation meets the test.
++### BEGIN CUSTOMIZATION
++### Unknown issue number: Allow leading and trailing hyphens in certain languages
++
++# ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
++($PrePostHyphen) ? ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PrePostHyphen)?;
++
++### END CUSTOMIZATION
++
++# rule 6 and 7
++
++### BEGIN CUSTOMIZATION
++### Unknown issue number: Allow leading and trailing hyphens in certain languages
++
++# ($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
++($PrePostHyphen)? ($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PrePostHyphen)? {200};
++
++### END CUSTOMIZATION
++
++# rule 7a
++$Hebrew_Letter $ExFm* $Single_Quote {200};
++
++# rule 7b and 7c
++$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
++
++# rule 8
++
++$Numeric $ExFm* $Numeric;
++
++# rule 9
++
++($ALetterPlus | $Hebrew_Letter)  $ExFm* $Numeric;
++
++# rule 10
++
++$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
++
++# rule 11 and 12
++
++$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
++
++# rule 13
++# To be consistent with the "$KanaKanji $KanaKanji" rule below, the rule status
++# was changed from 300 to 400.
++# See also TestRuleStatus in intltest/rbbiapts.cpp
++$Katakana $ExFm*  $Katakana {400};
++
++# rule 13a/b
++
++$ALetterPlus   $ExFm* $ExtendNumLet {200};    #  (13a)
++$Hebrew_Letter $ExFm* $ExtendNumLet {200};    #  (13a)
++$Numeric       $ExFm* $ExtendNumLet {100};    #  (13a)
++$Katakana      $ExFm* $ExtendNumLet {400};    #  (13a)
++$ExtendNumLet  $ExFm* $ExtendNumLet {200};    #  (13a)
++
++$ExtendNumLet  $ExFm* $ALetterPlus  {200};    #  (13b)
++$ExtendNumLet  $ExFm* $Hebrew_Letter {200};    #  (13b)
++$ExtendNumLet  $ExFm* $Numeric      {100};    #  (13b)
++$ExtendNumLet  $ExFm* $Katakana     {400};    #  (13b)
++
++# rules 15 - 17
++#    Pairs of Regional Indicators stay together.
++#    With incoming rule chaining disabled by ^, this rule will match exactly two of them.
++#    No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
+ #
+-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format];
++^$Regional_Indicator $ExFm* $Regional_Indicator;
+ 
+-#!.*;
+-! ($NonStarters* | \n \r) .;
++# special handling for CJK characters: chain for later dictionary segmentation
++$HangulSyllable $HangulSyllable {200};
++$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
+ 
++# Rule 999
++#     Match a single code point if no other rule applies.
++.;
+diff --git a/i18npool/source/breakiterator/data/edit_word.txt b/i18npool/source/breakiterator/data/edit_word.txt
+index 92b344c19d41..14fc221aa96e 100644
+--- a/i18npool/source/breakiterator/data/edit_word.txt
++++ b/i18npool/source/breakiterator/data/edit_word.txt
+@@ -1,142 +1,199 @@
+ #
+-#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
+-#       All Rights Reserved.
++# Copyright (C) 2016 and later: Unicode, Inc. and others.
++# License & terms of use: http://www.unicode.org/copyright.html
++# Copyright (C) 2002-2016, International Business Machines Corporation
++# and others. All Rights Reserved.
+ #
+-#   file:  edit_word.txt   
++# file:  word.txt
+ #
+-#   ICU Word Break Rules
++# ICU Word Break Rules
+ #      See Unicode Standard Annex #29.
+-#      These rules are based on Version 4.0.0, dated 2003-04-17
++#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
+ #
++# Note:  Updates to word.txt will usually need to be merged into
++#        word_POSIX.txt also.
+ 
+-
+-
+-####################################################################################
++##############################################################################
+ #
+ #  Character class definitions from TR 29
+ #
+-####################################################################################
+-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] 
+-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
+-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
+-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
+-
+-$Ideographic = [:Ideographic:];
+-$Hangul = [:Script = HANGUL:];
+-
+-$ALetter   = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW PUNCTUATION GERESH:] 
+-                           - $Ideographic
+-                           - $Katakana
+-                           - $Hangul
+-                           - [:Script = Thai:]
+-                           - [:Script = Lao:]
+-                           - [:Script = Hiragana:]];
+-                           
+-$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]  [:name = HEBREW PUNCTUATION GERSHAYIM:]
+-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]];  
+-              
+-$MidNum    = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]];
+-$Numeric   = [:LineBreak = Numeric:];
+-
+-
+-$TheZWSP = \u200b;
++##############################################################################
++
++### BEGIN CUSTOMIZATION
++### This file contains LibreOffice-specific rule customizations.
++###
++### To aid future maintainability:
++### - The change location should be bracketed by comments of this form.
++### - The original rule should be commented out, and the modified rule placed alongside.
++### - By doing this, maintainers can more easily compare to an upstream baseline.
++###
++### END CUSTOMIZATION
++
++!!chain;
++!!quoted_literals_only;
++
+ 
+ #
+ #  Character Class Definitions.
+-#    The names are those from TR29.
+ #
+-$CR         = \u000d;
+-$LF         = \u000a;
+-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
+-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
+ 
++$Han                = [:Han:];
+ 
++$CR                 = [\p{Word_Break = CR}];
++$LF                 = [\p{Word_Break = LF}];
++$Newline            = [\p{Word_Break = Newline}];
++$Extend             = [\p{Word_Break = Extend}-$Han];
++$ZWJ                = [\p{Word_Break = ZWJ}];
++$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
++$Format             = [\p{Word_Break = Format}];
++$Katakana           = [\p{Word_Break = Katakana}];
++$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
++$ALetter            = [\p{Word_Break = ALetter}];
++$Single_Quote       = [\p{Word_Break = Single_Quote}];
++$Double_Quote       = [\p{Word_Break = Double_Quote}];
++$MidLetter          = [\p{Word_Break = MidLetter}];
++$MidNum             = [\p{Word_Break = MidNum}];
++$Numeric            = [\p{Word_Break = Numeric}];
++$WSegSpace          = [\p{Word_Break = WSegSpace}];
++$Extended_Pict      = [\p{Extended_Pictographic}];
+ 
++### BEGIN CUSTOMIZATION
++### i#13494: For the purposes of editing, standalone punctuation should be treated as a word.
++### This change subtracts undesired characters from the above families
+ 
+-####################################################################################
+-#
+-#  Word Break Rules.    Definitions and Rules specific to word break begin Here. 
+-#
+-####################################################################################
++# $MidNumLet          = [\p{Word_Break = MidNumLet}];
++$MidNumLet          = [\p{Word_Break = MidNumLet}-[:name= FULL STOP:]];
+ 
+-$Format    = [[:Cf:] - $TheZWSP];
++# $ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
++$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]];
+ 
++### END CUSTOMIZATION
+ 
++$Hiragana           = [:Hiragana:];
++$Ideographic        = [\p{Ideographic}];
+ 
+-# Rule 3:  Treat a grapheme cluster as if it were a single character.
+-#          Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
+-#          because we don't need to find the boundaries between adjacent syllables -
+-#          they won't be word boundaries.
+-#
+ 
++#   Dictionary character set, for triggering language-based break engines. Currently
++#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
++#   5.0 or later as the definition of Complex_Context was corrected to include all
++#   characters requiring dictionary break.
+ 
+-#
+-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the base char.
+-#
+-$ALetterEx    = $ALetter   $Extend*; 
+-$NumericEx    = $Numeric   $Extend*;
+-$MidNumEx     = $MidNum    $Extend*;
+-$MidLetterEx  = $MidLetter $Extend*;
+-$KatakanaEx   = $Katakana  $Extend*;
+-$IdeographicEx= $Ideographic  $Extend*;
+-$HangulEx = $Hangul  $Extend*;
+-$FormatEx     = $Format    $Extend*;
++$Control        = [\p{Grapheme_Cluster_Break = Control}];
++$HangulSyllable = [\uac00-\ud7a3];
++$ComplexContext = [:LineBreak = Complex_Context:];
++$KanaKanji      = [$Han $Hiragana $Katakana];
++$dictionaryCJK  = [$KanaKanji $HangulSyllable];
++$dictionary     = [$ComplexContext $dictionaryCJK];
+ 
++# TODO: check if handling of katakana in dictionary makes rules incorrect/void
+ 
+-#
+-#  Numbers.  Rules 8, 11, 12 form the TR.
+-#
+-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
+-$NumberSequence {100};
++# leave CJK scripts out of ALetterPlus
++$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
+ 
+-#
+-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
+-#     - must include at least one letter. 
+-#     - may include both letters and numbers.
+-#     - may include  MideLetter, MidNumber punctuation.
+-#
+-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*;     # rules #6, #7
+-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* {200};
+ 
+-# Punctuations by themselves
+-[[:P:][:S:]-[:name = FULL STOP:]]*;
+-[[:name = FULL STOP:]]*;
++## -------------------------------------------------
+ 
++# Rule 3 - CR x LF
+ #
+-#  Do not break between Katakana.   Rule #13.
+-#
+-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
+-[:Hiragana:] $Extend* {300};
++$CR $LF;
+ 
++# Rule 3c   Do not break within emoji zwj sequences.
++#             ZWJ ×  \p{Extended_Pictographic}.  Precedes WB4, so no intervening Extend chars allowed.
+ #
+-#  Ideographic Characters.  Stand by themselves as words.
+-#                           Separated from the "Everything Else" rule, below, only so that they
+-#                           can be tagged with a return value.   TODO:  is this what we want?
+-#
+-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
+-$HangulEx ($FormatEx* $HangulEx)* {400};
++$ZWJ $Extended_Pict;
+ 
++# Rule 3d - Keep horizontal whitespace together.
+ #
+-#  Everything Else, with no tag.
+-#                   Non-Control chars combine with $Extend (combining) chars.
+-#                   Controls are do not.
+-#
+-[^$Control [:Ideographic:]] $Extend*;
+-$CR $LF;
++$WSegSpace $WSegSpace;
++
++# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
++#          of a region of Text.
++
++$ExFm  = [$Extend $Format $ZWJ];
++
++^$ExFm+;            # This rule fires only when there are format or extend characters at the
++                    # start of text, or immediately following another boundary. It groups them, in
++                    # the event there are more than one.
++
++[^$CR $LF $Newline $ExFm] $ExFm*;   # This rule attaches trailing format/extends to words,
++                                    # with no special rule status value.
++
++$Numeric $ExFm* {100};              # This group of rules also attach trailing format/extends, but
++$ALetterPlus $ExFm* {200};          # with rule status set based on the word's final base character.
++$HangulSyllable {200};
++$Hebrew_Letter $ExFm* {200};
++$Katakana $ExFm* {400};             # note:  these status values override those from rule 5
++$Hiragana $ExFm* {400};             #        by virtue of being numerically larger.
++$Ideographic $ExFm* {400};          #
+ 
+ #
+-#  Reverse Rules.   Back up over any of the chars that can group together.
+-#                   (Reverse rules do not need to be exact; they can back up  too far,
+-#                   but must back up at least enough, and must stop on a boundary.)
++# rule 5
++#    Do not break between most letters.
+ #
++($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
++
++# rule 6 and 7
++($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
++
++# rule 7a
++$Hebrew_Letter $ExFm* $Single_Quote {200};
++
++# rule 7b and 7c
++$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
++
++# rule 8
++
++$Numeric $ExFm* $Numeric;
++
++# rule 9
++
++($ALetterPlus | $Hebrew_Letter)  $ExFm* $Numeric;
+ 
+-# NonStarters are the set of all characters that can appear at the 2nd - nth position of
+-#    a word.   (They may also be the first.)   The reverse rule skips over these, until it
+-#    reaches something that can only be the start (and probably only) char in a "word".
+-#    A space or punctuation meets the test.
++# rule 10
++
++$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
++
++# rule 11 and 12
++
++$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
++
++# rule 13
++# to be consistent with $KanaKanji $KanaKanji, changed
++# from 300 to 400.
++# See also TestRuleStatus in intltest/rbbiapts.cpp
++$Katakana $ExFm*  $Katakana {400};
++
++# rule 13a/b
++
++$ALetterPlus   $ExFm* $ExtendNumLet {200};    #  (13a)
++$Hebrew_Letter $ExFm* $ExtendNumLet {200};    #  (13a)
++$Numeric       $ExFm* $ExtendNumLet {100};    #  (13a)
++$Katakana      $ExFm* $ExtendNumLet {400};    #  (13a)
++$ExtendNumLet  $ExFm* $ExtendNumLet {200};    #  (13a)
++
++$ExtendNumLet  $ExFm* $ALetterPlus  {200};    #  (13b)
++$ExtendNumLet  $ExFm* $Hebrew_Letter {200};    #  (13b)
++$ExtendNumLet  $ExFm* $Numeric      {100};    #  (13b)
++$ExtendNumLet  $ExFm* $Katakana     {400};    #  (13b)
++
++# rules 15 - 17
++#    Pairs of Regional Indicators stay together.
++#    With incoming rule chaining disabled by ^, this rule will match exactly two of them.
++#    No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
+ #
+-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $Extend $Format];
++^$Regional_Indicator $ExFm* $Regional_Indicator;
+ 
+-#!.*;
+-! ($NonStarters* | \n \r) .;
++# special handling for CJK characters: chain for later dictionary segmentation
++$HangulSyllable $HangulSyllable {200};
++$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
++
++### BEGIN CUSTOMIZATION
++### i#13494: For the purposes of editing, standalone punctuation should be treated as a word.
++### This customization does not replace any rules.
++[[:P:][:S:]-[:name = FULL STOP:]]*
++[[:name = FULL STOP:]]*;
++### END CUSTOMIZATION
+ 
++# Rule 999
++#     Match a single code point if no other rule applies.
++.;
+diff --git a/i18npool/source/breakiterator/data/edit_word_he.txt b/i18npool/source/breakiterator/data/edit_word_he.txt
+deleted file mode 100644
+index 0b5908814e08..000000000000
+--- a/i18npool/source/breakiterator/data/edit_word_he.txt
++++ /dev/null
+@@ -1,142 +0,0 @@
+-#
+-#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
+-#       All Rights Reserved.
+-#
+-#   file:  edit_word.txt   
+-#
+-#   ICU Word Break Rules
+-#      See Unicode Standard Annex #29.
+-#      These rules are based on Version 4.0.0, dated 2003-04-17
+-#
+-
+-
+-
+-####################################################################################
+-#
+-#  Character class definitions from TR 29
+-#
+-####################################################################################
+-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] 
+-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
+-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
+-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
+-
+-$Ideographic = [:Ideographic:];
+-$Hangul = [:Script = HANGUL:];
+-
+-$ALetter   = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW PUNCTUATION GERESH:] 
+-                           - $Ideographic
+-                           - $Katakana
+-                           - $Hangul
+-                           - [:Script = Thai:]
+-                           - [:Script = Lao:]
+-                           - [:Script = Hiragana:]];
+-                           
+-$MidLetter = [[:name = QUOTATION MARK:] [:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:]
+-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]];  
+-              
+-$MidNum    = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]];
+-$Numeric   = [:LineBreak = Numeric:];
+-
+-
+-$TheZWSP = \u200b;
+-
+-#
+-#  Character Class Definitions.
+-#    The names are those from TR29.
+-#
+-$CR         = \u000d;
+-$LF         = \u000a;
+-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
+-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
+-
+-
+-
+-
+-####################################################################################
+-#
+-#  Word Break Rules.    Definitions and Rules specific to word break begin Here. 
+-#
+-####################################################################################
+-
+-$Format    = [[:Cf:] - $TheZWSP];
+-
+-
+-
+-# Rule 3:  Treat a grapheme cluster as if it were a single character.
+-#          Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
+-#          because we don't need to find the boundaries between adjacent syllables -
+-#          they won't be word boundaries.
+-#
+-
+-
+-#
+-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the base char.
+-#
+-$ALetterEx    = $ALetter   $Extend*; 
+-$NumericEx    = $Numeric   $Extend*;
+-$MidNumEx     = $MidNum    $Extend*;
+-$MidLetterEx  = $MidLetter $Extend*;
+-$KatakanaEx   = $Katakana  $Extend*;
+-$IdeographicEx= $Ideographic  $Extend*;
+-$HangulEx = $Hangul  $Extend*;
+-$FormatEx     = $Format    $Extend*;
+-
+-
+-#
+-#  Numbers.  Rules 8, 11, 12 form the TR.
+-#
+-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
+-$NumberSequence {100};
+-
+-#
+-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
+-#     - must include at least one letter. 
+-#     - may include both letters and numbers.
+-#     - may include  MideLetter, MidNumber punctuation.
+-#
+-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*;     # rules #6, #7
+-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* {200};
+-
+-# Punctuations by themselves
+-[[:P:][:S:]-[:name = FULL STOP:]]*;
+-[[:name = FULL STOP:]]*;
+-
+-#
+-#  Do not break between Katakana.   Rule #13.
+-#
+-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
+-[:Hiragana:] $Extend* {300};
+-
+-#
+-#  Ideographic Characters.  Stand by themselves as words.
+-#                           Separated from the "Everything Else" rule, below, only so that they
+-#                           can be tagged with a return value.   TODO:  is this what we want?
+-#
+-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
+-$HangulEx ($FormatEx* $HangulEx)* {400};
+-
+-#
+-#  Everything Else, with no tag.
+-#                   Non-Control chars combine with $Extend (combining) chars.
+-#                   Controls are do not.
+-#
+-[^$Control [:Ideographic:]] $Extend*;
+-$CR $LF;
+-
+-#
+-#  Reverse Rules.   Back up over any of the chars that can group together.
+-#                   (Reverse rules do not need to be exact; they can back up  too far,
+-#                   but must back up at least enough, and must stop on a boundary.)
+-#
+-
+-# NonStarters are the set of all characters that can appear at the 2nd - nth position of
+-#    a word.   (They may also be the first.)   The reverse rule skips over these, until it
+-#    reaches something that can only be the start (and probably only) char in a "word".
+-#    A space or punctuation meets the test.
+-#
+-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $Extend $Format];
+-
+-#!.*;
+-! ($NonStarters* | \n \r) .;
+-
+diff --git a/i18npool/source/breakiterator/data/edit_word_hu.txt b/i18npool/source/breakiterator/data/edit_word_hu.txt
+index 4a08acab0029..389ad2bacc13 100644
+--- a/i18npool/source/breakiterator/data/edit_word_hu.txt
++++ b/i18npool/source/breakiterator/data/edit_word_hu.txt
+@@ -1,159 +1,215 @@
+ #
+-#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
+-#       All Rights Reserved.
++# Copyright (C) 2016 and later: Unicode, Inc. and others.
++# License & terms of use: http://www.unicode.org/copyright.html
++# Copyright (C) 2002-2016, International Business Machines Corporation
++# and others. All Rights Reserved.
+ #
+-#   file:  edit_word.txt   
++# file:  word.txt
+ #
+-#   ICU Word Break Rules
++# ICU Word Break Rules
+ #      See Unicode Standard Annex #29.
+-#      These rules are based on Version 4.0.0, dated 2003-04-17
++#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
+ #
++# Note:  Updates to word.txt will usually need to be merged into
++#        word_POSIX.txt also.
+ 
+-
+-
+-####################################################################################
++##############################################################################
+ #
+ #  Character class definitions from TR 29
+ #
+-####################################################################################
+-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] 
+-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
+-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
+-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
+-
+-$Ideographic = [:Ideographic:];
+-$Hangul = [:Script = HANGUL:];
+-
+-$ALetter   = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW PUNCTUATION GERESH:] 
+-                [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:]
+-                [:name = SECTION SIGN:] [:name = DEGREE SIGN:] [:name = EURO SIGN:]
+-                [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:]
+-                [:name = DIGIT ZERO:]
+-                [:name = DIGIT ONE:]
+-                [:name = DIGIT TWO:]
+-                [:name = DIGIT THREE:]
+-                [:name = DIGIT FOUR:]
+-                [:name = DIGIT FIVE:]
+-                [:name = DIGIT SIX:]
+-                [:name = DIGIT SEVEN:]
+-                [:name = DIGIT EIGHT:]
+-                [:name = DIGIT NINE:]
+-                           - $Ideographic
+-                           - $Katakana
+-                           - $Hangul
+-                           - [:Script = Thai:]
+-                           - [:Script = Lao:]
+-                           - [:Script = Hiragana:]];
+-                           
+-$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]  [:name = HEBREW PUNCTUATION GERSHAYIM:]
+-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]  
+-              [:name = HYPHEN-MINUS:] [:name = EURO SIGN:] [:name = PERCENT SIGN:] 
+-              [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:]
+-              [:name = EN DASH:] [:name = EM DASH:]
+-              [:name = PERCENT SIGN:] [:name = SECTION SIGN:] [:name = DEGREE SIGN:]];
+-              
+-$MidNum    = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]];
+-$Numeric   = [:LineBreak = Numeric:];
+-
+-
+-$TheZWSP = \u200b;
++##############################################################################
++
++### BEGIN CUSTOMIZATION
++### This file contains LibreOffice-specific rule customizations.
++###
++### To aid future maintainability:
++### - The change location should be bracketed by comments of this form.
++### - The original rule should be commented out, and the modified rule placed alongside.
++### - By doing this, maintainers can more easily compare to an upstream baseline.
++###
++### END CUSTOMIZATION
++
++!!chain;
++!!quoted_literals_only;
++
+ 
+ #
+ #  Character Class Definitions.
+-#    The names are those from TR29.
+ #
+-$CR         = \u000d;
+-$LF         = \u000a;
+-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
+-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
+ 
++$Han                = [:Han:];
+ 
++$CR                 = [\p{Word_Break = CR}];
++$LF                 = [\p{Word_Break = LF}];
++$Newline            = [\p{Word_Break = Newline}];
++$Extend             = [\p{Word_Break = Extend}-$Han];
++$ZWJ                = [\p{Word_Break = ZWJ}];
++$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
++$Format             = [\p{Word_Break = Format}];
++$Katakana           = [\p{Word_Break = Katakana}];
++$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
++$Single_Quote       = [\p{Word_Break = Single_Quote}];
++$Double_Quote       = [\p{Word_Break = Double_Quote}];
++$MidNum             = [\p{Word_Break = MidNum}];
++$Numeric            = [\p{Word_Break = Numeric}];
++$WSegSpace          = [\p{Word_Break = WSegSpace}];
++$Extended_Pict      = [\p{Extended_Pictographic}];
+ 
++### BEGIN CUSTOMIZATION
++### i#13494: For the purposes of editing, standalone punctuation should be treated as a word.
++### This change subtracts undesired characters from the above families
++### i#56347: BreakIterator patch for Hungarian
++### i#56348: Special chars in first pos not handled by spell checking for Hungarian
+ 
+-####################################################################################
+-#
+-#  Word Break Rules.    Definitions and Rules specific to word break begin Here. 
+-#
+-####################################################################################
++$Symbols_hu         = [[:name = PERCENT SIGN:]
++                       [:name = PER MILLE SIGN:]
++                       [:name = PER TEN THOUSAND SIGN:]
++                       [:name = SECTION SIGN:]
++                       [:name = DEGREE SIGN:]
++                       [:name = EURO SIGN:]
++                       [:name = HYPHEN-MINUS:]
++                       [:name = EN DASH:]
++                       [:name = EM DASH:]];
+ 
+-$Format    = [[:Cf:] - $TheZWSP];
++# $ALetter            = [\p{Word_Break = ALetter}];
++$ALetter            = [\p{Word_Break = ALetter} $Symbols_hu];
+ 
++# $MidLetter          = [\p{Word_Break = MidLetter}];
++$MidLetter          = [\p{Word_Break = MidLetter} $Symbols_hu];
+ 
++# $MidNumLet          = [\p{Word_Break = MidNumLet}];
++$MidNumLet          = [\p{Word_Break = MidNumLet}-[:name= FULL STOP:]];
+ 
+-# Rule 3:  Treat a grapheme cluster as if it were a single character.
+-#          Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
+-#          because we don't need to find the boundaries between adjacent syllables -
+-#          they won't be word boundaries.
+-#
++# $ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
++$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]];
+ 
++### END CUSTOMIZATION
+ 
+-#
+-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the base char.
+-#
+-$ALetterEx    = $ALetter   $Extend*; 
+-$NumericEx    = $Numeric   $Extend*;
+-$MidNumEx     = $MidNum    $Extend*;
+-$MidLetterEx  = $MidLetter $Extend*;
+-$KatakanaEx   = $Katakana  $Extend*;
+-$IdeographicEx= $Ideographic  $Extend*;
+-$HangulEx = $Hangul  $Extend*;
+-$FormatEx     = $Format    $Extend*;
++$Hiragana           = [:Hiragana:];
++$Ideographic        = [\p{Ideographic}];
+ 
+ 
+-#
+-#  Numbers.  Rules 8, 11, 12 form the TR.
+-#
+-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
+-$NumberSequence {100};
++#   Dictionary character set, for triggering language-based break engines. Currently
++#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
++#   5.0 or later as the definition of Complex_Context was corrected to include all
++#   characters requiring dictionary break.
+ 
+-#
+-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
+-#     - must include at least one letter. 
+-#     - may include both letters and numbers.
+-#     - may include  MideLetter, MidNumber punctuation.
+-#
+-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*;     # rules #6, #7
+-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* {200};
++$Control        = [\p{Grapheme_Cluster_Break = Control}];
++$HangulSyllable = [\uac00-\ud7a3];
++$ComplexContext = [:LineBreak = Complex_Context:];
++$KanaKanji      = [$Han $Hiragana $Katakana];
++$dictionaryCJK  = [$KanaKanji $HangulSyllable];
++$dictionary     = [$ComplexContext $dictionaryCJK];
+ 
+-# Punctuations by themselves
+-[[:P:][:S:]-[:name = FULL STOP:]]*;
+-[[:name = FULL STOP:]]*;
++# TODO: check if handling of katakana in dictionary makes rules incorrect/void
+ 
+-#
+-#  Do not break between Katakana.   Rule #13.
+-#
+-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
+-[:Hiragana:] $Extend* {300};
++# leave CJK scripts out of ALetterPlus
++$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
+ 
++
++## -------------------------------------------------
++
++# Rule 3 - CR x LF
+ #
+-#  Ideographic Characters.  Stand by themselves as words.
+-#                           Separated from the "Everything Else" rule, below, only so that they
+-#                           can be tagged with a return value.   TODO:  is this what we want?
+-#
+-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
+-$HangulEx ($FormatEx* $HangulEx)* {400};
++$CR $LF;
+ 
++# Rule 3c   Do not break within emoji zwj sequences.
++#             ZWJ ×  \p{Extended_Pictographic}.  Precedes WB4, so no intervening Extend chars allowed.
+ #
+-#  Everything Else, with no tag.
+-#                   Non-Control chars combine with $Extend (combining) chars.
+-#                   Controls are do not.
++$ZWJ $Extended_Pict;
++
++# Rule 3d - Keep horizontal whitespace together.
+ #
+-[^$Control [:Ideographic:]] $Extend*;
+-$CR $LF;
++$WSegSpace $WSegSpace;
++
++# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
++#          of a region of Text.
++
++$ExFm  = [$Extend $Format $ZWJ];
++
++^$ExFm+;            # This rule fires only when there are format or extend characters at the
++                    # start of text, or immediately following another boundary. It groups them, in
++                    # the event there are more than one.
++
++[^$CR $LF $Newline $ExFm] $ExFm*;   # This rule attaches trailing format/extends to words,
++                                    # with no special rule status value.
++
++$Numeric $ExFm* {100};              # This group of rules also attach trailing format/extends, but
++$ALetterPlus $ExFm* {200};          # with rule status set based on the word's final base character.
++$HangulSyllable {200};
++$Hebrew_Letter $ExFm* {200};
++$Katakana $ExFm* {400};             # note:  these status values override those from rule 5
++$Hiragana $ExFm* {400};             #        by virtue of being numerically larger.
++$Ideographic $ExFm* {400};          #
+ 
+ #
+-#  Reverse Rules.   Back up over any of the chars that can group together.
+-#                   (Reverse rules do not need to be exact; they can back up  too far,
+-#                   but must back up at least enough, and must stop on a boundary.)
++# rule 5
++#    Do not break between most letters.
+ #
++($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
++
++# rule 6 and 7
++($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
++
++# rule 7a
++$Hebrew_Letter $ExFm* $Single_Quote {200};
++
++# rule 7b and 7c
++$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
++
++# rule 8
++
++$Numeric $ExFm* $Numeric;
++
++# rule 9
+ 
+-# NonStarters are the set of all characters that can appear at the 2nd - nth position of
+-#    a word.   (They may also be the first.)   The reverse rule skips over these, until it
+-#    reaches something that can only be the start (and probably only) char in a "word".
+-#    A space or punctuation meets the test.
++($ALetterPlus | $Hebrew_Letter)  $ExFm* $Numeric;
++
++# rule 10
++
++$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
++
++# rule 11 and 12
++
++$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
++
++# rule 13
++# to be consistent with $KanaKanji $KanaKanji, changed
++# from 300 to 400.
++# See also TestRuleStatus in intltest/rbbiapts.cpp
++$Katakana $ExFm*  $Katakana {400};
++
++# rule 13a/b
++
++$ALetterPlus   $ExFm* $ExtendNumLet {200};    #  (13a)
++$Hebrew_Letter $ExFm* $ExtendNumLet {200};    #  (13a)
++$Numeric       $ExFm* $ExtendNumLet {100};    #  (13a)
++$Katakana      $ExFm* $ExtendNumLet {400};    #  (13a)
++$ExtendNumLet  $ExFm* $ExtendNumLet {200};    #  (13a)
++
++$ExtendNumLet  $ExFm* $ALetterPlus  {200};    #  (13b)
++$ExtendNumLet  $ExFm* $Hebrew_Letter {200};    #  (13b)
++$ExtendNumLet  $ExFm* $Numeric      {100};    #  (13b)
++$ExtendNumLet  $ExFm* $Katakana     {400};    #  (13b)
++
++# rules 15 - 17
++#    Pairs of Regional Indicators stay together.
++#    With incoming rule chaining disabled by ^, this rule will match exactly two of them.
++#    No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
+ #
+-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $Extend $Format];
++^$Regional_Indicator $ExFm* $Regional_Indicator;
+ 
+-#!.*;
+-! ($NonStarters* | \n \r) .;
++# special handling for CJK characters: chain for later dictionary segmentation
++$HangulSyllable $HangulSyllable {200};
++$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
++
++### BEGIN CUSTOMIZATION
++### i#13494: For the purposes of editing, standalone punctuation should be treated as a word.
++### This customization does not replace any rules.
++[[:P:][:S:]-[:name = FULL STOP:]]*
++[[:name = FULL STOP:]]*;
++### END CUSTOMIZATION
+ 
++# Rule 999
++#     Match a single code point if no other rule applies.
++.;
+diff --git a/i18npool/source/breakiterator/data/line.txt b/i18npool/source/breakiterator/data/line.txt
+index ff3f3eafc42e..46a618c63cae 100644
+--- a/i18npool/source/breakiterator/data/line.txt
++++ b/i18npool/source/breakiterator/data/line.txt
+@@ -1,176 +1,116 @@
+-# Copyright (c) 2002-2006  International Business Machines Corporation and
++# Copyright (C) 2016 and later: Unicode, Inc. and others.
++# License & terms of use: http://www.unicode.org/copyright.html
++# Copyright (c) 2002-2016  International Business Machines Corporation and
+ # others. All Rights Reserved.
+ #
+ #  file:  line.txt
+ #
+ #         Line Breaking Rules
+-#         Implement default line breaking as defined by Unicode Standard Annex #14 version 5.0.0
+-#         http://www.unicode.org/reports/tr14/
+-
+-
++#         Implement default line breaking as defined by
++#         Unicode Standard Annex #14 (https://www.unicode.org/reports/tr14/)
++#         for Unicode 14.0, with the following modification:
++#
++#         Boundaries between hyphens and following letters are suppressed when
++#         there is a boundary preceding the hyphen. See rule 20.9
++#
++#         This corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
++#         It sets characters of class CJ to behave like NS.
+ 
+ #
+ #  Character Classes defined by TR 14.
+ #
+ 
+-!!chain;
+-!!LBCMNoChain;
++### BEGIN CUSTOMIZATION
++### This file contains LibreOffice-specific rule customizations.
++###
++### To aid future maintainability:
++### - The change location should be bracketed by comments of this form.
++### - The original rule should be commented out, and the modified rule placed alongside.
++### - By doing this, maintainers can more easily compare to an upstream baseline.
++###
++### END CUSTOMIZATION
+ 
+-
+-!!lookAheadHardBreak;
+-#
+-#  !!lookAheadHardBreak    Described here because it is (as yet) undocumented elsewhere
+-#                          and only used for the line break rules.
+-#
+-#           It is used in the implementation of the incredibly annoying rule LB 10
+-#           which says to treat any combining mark that is not attached to a base
+-#           character as if it were of class AL  (alphabetic).
+-#
+-#           The problem occurs in the reverse rules.
+-#
+-#           Consider a sequence like, with correct breaks as shown
+-#               LF  ID  CM  AL  AL
+-#                  ^       ^       ^
+-#           Then consider the sequence without the initial ID (ideographic)
+-#                 LF  CM  AL  AL
+-#                    ^           ^
+-#           Our CM, which in the first example was attached to the ideograph,
+-#           is now unattached, becomes an alpha, and joins in with the other
+-#           alphas.
+-#
+-#           When iterating forwards, these sequences do not present any problems
+-#           When iterating backwards, we need to look ahead when encountering
+-#           a CM to see whether it attaches to something further on or not.
+-#           (Look-ahead in a reverse rule is looking towards the start)
+-#
+-#           If the CM is unattached, we need to force a break.
+-#
+-#           !!lookAheadHardBreak forces the run time state machine to
+-#           stop immediately when a look ahead rule ( '/' operator) matches,
+-#           and set the match position to that of the look-ahead operator,
+-#           no matter what other rules may be in play at the time.
+-#
+-#           See rule LB 19 for an example.
+-#
++!!chain;
++!!quoted_literals_only;
+ 
+ $AI = [:LineBreak =  Ambiguous:];
+-$DG = \u00B0;
+-$AL = [[:LineBreak =  Alphabetic:] $DG];
++$AL = [:LineBreak =  Alphabetic:];
+ $BA = [:LineBreak =  Break_After:];
++$HH = [\u2010];     # \u2010 is HYPHEN, default line break is BA.
+ $BB = [:LineBreak =  Break_Before:];
+ $BK = [:LineBreak =  Mandatory_Break:];
+ $B2 = [:LineBreak =  Break_Both:];
+ $CB = [:LineBreak =  Contingent_Break:];
+ $CJ = [:LineBreak =  Conditional_Japanese_Starter:];
+-$CL = [[:LineBreak =  Close_Punctuation:] [:LineBreak = Close_Parenthesis:]]; # tdf#31271
+-$CM = [:LineBreak =  Combining_Mark:];
++$CL = [:LineBreak =  Close_Punctuation:];
++# $CM = [:LineBreak =  Combining_Mark:];
++$CP = [:LineBreak =  Close_Parenthesis:];
+ $CR = [:LineBreak =  Carriage_Return:];
++$EB = [:LineBreak =  EB:];
++$EM = [:LineBreak =  EM:];
+ $EX = [:LineBreak =  Exclamation:];
+ $GL = [:LineBreak =  Glue:];
+ $HL = [:LineBreak =  Hebrew_Letter:];
+ $HY = [:LineBreak =  Hyphen:];
+ $H2 = [:LineBreak =  H2:];
+ $H3 = [:LineBreak =  H3:];
+-$ID = [[:LineBreak =  Ideographic:] - [\ufe30]];
+-$IN = [:LineBreak =  Inseparable:];
+-$IS = [[:LineBreak =  Infix_Numeric:] [\ufe30]];
++$ID = [:LineBreak =  Ideographic:];
++$IN = [:LineBreak =  Inseperable:];
++$IS = [:LineBreak =  Infix_Numeric:];
+ $JL = [:LineBreak =  JL:];
+ $JV = [:LineBreak =  JV:];
+ $JT = [:LineBreak =  JT:];
+ $LF = [:LineBreak =  Line_Feed:];
+ $NL = [:LineBreak =  Next_Line:];
++# NS includes CJ for CSS strict line breaking.
+ $NS = [[:LineBreak =  Nonstarter:] $CJ];
+ $NU = [:LineBreak =  Numeric:];
+-$OP = [[:LineBreak =  Open_Punctuation:] - $DG];
++$OP = [:LineBreak =  Open_Punctuation:];
+ $PO = [:LineBreak =  Postfix_Numeric:];
+-$BS = \u005C;
+-$PR = [[:LineBreak =  Prefix_Numeric:] - $BS];
++$PR = [:LineBreak =  Prefix_Numeric:];
+ $QU = [:LineBreak =  Quotation:];
++$RI = [:LineBreak =  Regional_Indicator:];
+ $SA = [:LineBreak =  Complex_Context:];
+ $SG = [:LineBreak =  Surrogate:];
+ $SP = [:LineBreak =  Space:];
+-$SY = [[:LineBreak =  Break_Symbols:] $BS];
++$SY = [:LineBreak =  Break_Symbols:];
+ $WJ = [:LineBreak =  Word_Joiner:];
+ $XX = [:LineBreak =  Unknown:];
+ $ZW = [:LineBreak =  ZWSpace:];
++$ZWJ = [:LineBreak = ZWJ:];
++
++# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
++# without a formal name. Because ICU rules require multiple uses of the expressions,
++# give them a single definition with a name
++
++$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
++$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
++
++$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
++
++# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
++#         list it in the numerous rules that use CM.
++# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
++
++$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]];
++$CMX = [[$CM] - [$ZWJ]];
+ 
+ #   Dictionary character set, for triggering language-based break engines. Currently
+-#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+-#   5.0 or later as the definition of Complex_Context was corrected to include all
+-#   characters requiring dictionary break.
++#   limited to LineBreak=Complex_Context (SA).
+ 
+-$dictionary = [:LineBreak = Complex_Context:];
++$dictionary = [$SA];
+ 
+ #
+ #  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
+-#                               SA  (South East Asian: Thai, Lao, Khmer)
++#                               SA  (Dictionary chars, excluding Mn and Mc)
+ #                               SG  (Unpaired Surrogates)
+ #                               XX  (Unknown, unassigned)
+ #                         as $AL  (Alphabetic)
+ #
+-$ALPlus = [$AL $AI $SA $SG $XX];
+-
+-#
+-#  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB6.
+-#
+-$ALcm = $ALPlus $CM*;
+-$BAcm = $BA $CM*;
+-$BBcm = $BB $CM*;
+-$B2cm = $B2 $CM*;
+-$CLcm = $CL $CM*;
+-$EXcm = $EX $CM*;
+-$GLcm = $GL $CM*;
+-$HLcm = $HL $CM*;
+-$HYcm = $HY $CM*;
+-$H2cm = $H2 $CM*;
+-$H3cm = $H3 $CM*;
+-$IDcm = $ID $CM*;
+-$INcm = $IN $CM*;
+-$IScm = $IS $CM*;
+-$JLcm = $JL $CM*;
+-$JVcm = $JV $CM*;
+-$JTcm = $JT $CM*;
+-$NScm = $NS $CM*;
+-$NUcm = $NU $CM*;
+-$OPcm = $OP $CM*;
+-$POcm = $PO $CM*;
+-$PRcm = $PR $CM*;
+-$QUcm = $QU $CM*;
+-$SYcm = $SY $CM*;
+-$WJcm = $WJ $CM*;
++$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
+ 
+-## -------------------------------------------------
+ 
+-!!forward;
+-
+-#
+-#  Each class of character can stand by itself as an unbroken token, with trailing combining stuff
+-#
+-$ALPlus $CM+;
+-$BA $CM+;
+-$BB $CM+;
+-$B2 $CM+;
+-$CL $CM+;
+-$EX $CM+;
+-$GL $CM+;
+-$HL $CM+;
+-$HY $CM+;
+-$H2 $CM+;
+-$H3 $CM+;
+-$ID $CM+;
+-$IN $CM+;
+-$IS $CM+;
+-$JL $CM+;
+-$JV $CM+;
+-$JT $CM+;
+-$NS $CM+;
+-$NU $CM+;
+-$OP $CM+;
+-$PO $CM+;
+-$PR $CM+;
+-$QU $CM+;
+-$SY $CM+;
+-$WJ $CM+;
++## -------------------------------------------------
+ 
+ #
+ # CAN_CM  is the set of characters that may combine with CM combining chars.
+@@ -186,19 +126,15 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM];       # Bases that can't take CMs
+ #
+ # AL_FOLLOW  set of chars that can unconditionally follow an AL
+ #            Needed in rules where stand-alone $CM s are treated as AL.
+-#            Chaining is disabled with CM because it causes other failures,
+-#            so for this one case we need to manually list out longer sequences.
+ #
+-$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
+-$AL_FOLLOW_CM   = [$CL $EX $HL $IS $SY $WJ $GL $QU $BA $HY $NS $IN $NU $ALPlus $OP];
+-$AL_FOLLOW      = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
++$AL_FOLLOW      = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
+ 
+ 
+ #
+ #  Rule LB 4, 5    Mandatory (Hard) breaks.
+ #
+ $LB4Breaks    = [$BK $CR $LF $NL];
+-$LB4NonBreaks = [^$BK $CR $LF $NL];
++$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
+ $CR $LF {100};
+ 
+ #
+@@ -206,91 +142,124 @@ $CR $LF {100};
+ #
+ $LB4NonBreaks?  $LB4Breaks {100};    # LB 5  do not break before hard breaks.
+ $CAN_CM $CM*    $LB4Breaks {100};
+-$CM+            $LB4Breaks {100};
++^$CM+           $LB4Breaks {100};
+ 
+ # LB 7         x SP
+ #              x ZW
+ $LB4NonBreaks [$SP $ZW];
+ $CAN_CM $CM*  [$SP $ZW];
+-$CM+          [$SP $ZW];
++^$CM+         [$SP $ZW];
+ 
+ #
+ # LB 8         Break after zero width space
++#              ZW SP* ÷
+ #
+ $LB8Breaks    = [$LB4Breaks $ZW];
+ $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
++$ZW $SP* / [^$SP $ZW $LB4Breaks];
+ 
++# LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
++#
++$ZWJ [^$CM];
+ 
+-# LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL 
+-#                                $CM not covered by the above needs to behave like $AL   
++# LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
++#                                $CM not covered by the above needs to behave like $AL
+ #                                See definition of $CAN_CM.
+ 
+ $CAN_CM $CM+;                   #  Stick together any combining sequences that don't match other rules.
+-$CM+;
++^$CM+;
+ 
+ #
+ # LB 11  Do not break before or after WORD JOINER & related characters.
+ #
+-$CAN_CM $CM*  $WJcm;
+-$LB8NonBreaks $WJcm;
+-$CM+          $WJcm;
++$CAN_CM $CM*  $WJ;
++$LB8NonBreaks $WJ;
++^$CM+         $WJ;
+ 
+-$WJcm [^$CAN_CM];
+-$WJcm $CAN_CM $CM*;
++$WJ $CM* .;
+ 
+ #
+-# LB 12  Do not break before or after NBSP and related characters.
++# LB 12  Do not break after NBSP and related characters.
++#         GL  x
+ #
+-#         (!SP) x GL
+-[$LB8NonBreaks-$SP] $CM* $GLcm;
+-$CM+               $GLcm;
++$GL $CM* .;
+ 
+-#         GL  x
+-$GLcm ($LB8Breaks | $SP);
+-$GLcm [$LB8NonBreaks-$SP] $CM*;     # Don't let a combining mark go onto $CR, $BK, etc.
+-                              #  TODO:  I don't think we need this rule.
+-                              #         All but $CM will chain off of preceding rule.
+-                              #         $GLcm will pick up the CM case by itself.
++#
++# LB 12a  Do not break before NBSP and related characters ...
++#            [^SP BA HY] x GL
++#
++[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
++^$CM+ $GL;
+ 
+ 
+ 
+ 
+-#
+-# LB 13   Don't break before ']' or '!' or ';' or '/', even after spaces.
++# LB 13   Don't break before ']' or '!' or '/', even after spaces.
+ #
+ $LB8NonBreaks $CL;
+ $CAN_CM $CM*  $CL;
+-$CM+          $CL;              # by rule 10, stand-alone CM behaves as AL
++^$CM+         $CL;              # by rule 10, stand-alone CM behaves as AL
++
++$LB8NonBreaks $CP;
++$CAN_CM $CM*  $CP;
++^$CM+         $CP;              # by rule 10, stand-alone CM behaves as AL
+ 
+ $LB8NonBreaks $EX;
+ $CAN_CM $CM*  $EX;
+-$CM+          $EX;              # by rule 10, stand-alone CM behaves as AL
+-
+-$LB8NonBreaks $IS;
+-$CAN_CM $CM*  $IS;
+-$CM+          $IS;              # by rule 10, stand-alone CM behaves as AL
++^$CM+         $EX;              # by rule 10, stand-alone CM behaves as AL
+ 
+ $LB8NonBreaks $SY;
+ $CAN_CM $CM*  $SY;
+-$CM+          $SY;              # by rule 10, stand-alone CM behaves as AL
++^$CM+         $SY;              # by rule 10, stand-alone CM behaves as AL
+ 
+ 
+ #
+-# LB 14  Do not break after OP, even after spaced
++# LB 14  Do not break after OP, even after spaces
++#        Note subtle interaction with "SP IS /" rules in LB14a.
++#        This rule consumes the SP, chaining happens on the IS, effectively overriding the  SP IS rules,
++#        which is the desired behavior.
++#
++$OP $CM* $SP* .;
++
++$OP $CM* $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL
++                                   # by rule 8, CM following a SP is stand-alone.
++
++
++# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23"
++#        Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations.
++#        See issue ICU-20303
++
++
++$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
++$SP $IS           / [^ $CanFollowIS $NU $CM];
++$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
++
+ #
+-$OPcm $SP* $CAN_CM $CM*;
+-$OPcm $SP* $CANT_CM;
++# LB 14b Do not break before numeric separators (IS), even after spaces.
++
++[$LB8NonBreaks - $SP] $IS;
++$SP $IS $CM* [$CanFollowIS {eof}];
++$SP $IS $CM* $ZWJ [^$CM $NU];
++
++$CAN_CM $CM*  $IS;
++^$CM+         $IS;              # by rule 10, stand-alone CM behaves as AL
+ 
+-$OPcm $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL
+ 
+ # LB 15
+-# $QUcm $SP* $OPcm;
++
++### BEGIN CUSTOMIZATION
++### i#83649: Allow line break between quote and opening punctuation.
++### This customization simply disables rule LB 15.
++###
++# $QU $CM* $SP* $OP;
++###
++### END CUSTOMIZATION
+ 
+ # LB 16
+-$CLcm $SP* $NScm;
++($CL | $CP) $CM* $SP* $NS;
+ 
+ # LB 17
+-$B2cm $SP* $B2cm;
++$B2 $CM* $SP* $B2;
+ 
+ #
+ # LB 18  Break after spaces.
+@@ -301,347 +270,134 @@ $LB18Breaks    = [$LB8Breaks $SP];
+ 
+ # LB 19
+ #         x QU
+-$LB18NonBreaks $CM* $QUcm;
+-$CM+                $QUcm;
++$LB18NonBreaks $CM* $QU;
++^$CM+               $QU;
+ 
+ #         QU  x
+-$QUcm .?;
+-$QUcm $LB18NonBreaks $CM*;    # Don't let a combining mark go onto $CR, $BK, etc.
+-                              #  TODO:  I don't think this rule is needed.
+-
++$QU $CM* .;
+ 
+ # LB 20
+ #        <break>  $CB
+ #        $CB   <break>
+-
++#
+ $LB20NonBreaks = [$LB18NonBreaks - $CB];
+ 
++# LB 20.09    Don't break between Hyphens and Letters when there is a break preceding the hyphen.
++#             Originally added as a Finnish tailoring, now promoted to default ICU behavior.
++#             Note: this is not default UAX-14 behaviour. See issue ICU-8151.
++#
++^($HY | $HH) $CM* $ALPlus;
++
+ # LB 21        x   (BA | HY | NS)
+ #           BB x
+ #
+-$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); 
++$LB20NonBreaks $CM* ($BA | $HY | $NS);
+ 
+-$BBcm [^$CB];                                  #  $BB  x
+-$BBcm $LB20NonBreaks $CM*;
+ 
+-# LB 21a Don't break after Hebrew + Hyphen
+-#   HL (HY | BA) x
+-#  
+-$HLcm ($HYcm | $BAcm) [^$CB]?;
++^$CM+ ($BA | $HY | $NS);
+ 
+-# LB 22
+-($ALcm | $HLcm) $INcm;
+-$CM+     $INcm;     #  by rule 10, any otherwise unattached CM behaves as AL
+-$IDcm    $INcm;
+-$INcm    $INcm;
+-$NUcm    $INcm;
++$BB $CM* [^$CB];                                  #  $BB  x
++$BB $CM* $LB20NonBreaks;
+ 
+-
+-# $LB 23
+-$IDcm  $POcm;
+-$ALcm  $NUcm;       # includes $LB19
+-$HLcm  $NUcm;
+-$CM+   $NUcm;       # Rule 10, any otherwise unattached CM behaves as AL
+-$NUcm  $ALcm;
+-$NUcm  $HLcm;
+-
+-#
+-# LB 24
+-#
+-$PRcm $IDcm;
+-$ALcm $PRcm;
+-$PRcm ($ALcm | $HLcm);
+-$POcm ($ALcm | $HLcm);
+-
+-#
+-# LB 25   Numbers.
+-#
+-($PRcm | $POcm)? ($OPcm)? $NUcm ($NUcm | $SYcm | $IScm)* $CLcm? ($PRcm | $POcm)?;
+-
+-# LB 26  Do not break a Korean syllable
++# LB 21a Don't break after Hebrew + Hyphen
++#   HL (HY | BA) x
+ #
+-$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
+-($JVcm | $H2cm) ($JVcm | $JTcm);
+-($JTcm | $H3cm) $JTcm;
+-
+-# LB 27  Treat korean Syllable Block the same as ID  (don't break it)
+-($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
+-($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
+-$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
++$HL $CM* ($HY | $BA) $CM* [^$CB]?;
+ 
++# LB 21b (forward) Don't break between SY and HL
++# (break between HL and SY already disallowed by LB 13 above)
++$SY $CM* $HL;
+ 
+-# LB 28   Do not break between alphabetics
++# LB 22  Do not break before ellipses
+ #
+-($ALcm | $HLcm) ($ALcm | $HLcm);
+-$CM+ ($ALcm | $HLcm);      # The $CM+ is from rule 10, an unattached CM is treated as AL
++$LB20NonBreaks $CM*    $IN;
++^$CM+ $IN;
+ 
+-# LB 29
+-$IScm ($ALcm | $NUcm);
+ 
++# LB 23
+ #
+-# Rule 30   Do not break between letters, numbers or ordinary symbols
+-#           and opening or closing punctuation
+-#
+-($ALcm | $HLcm | $NUcm) $OPcm;
+-$CM+ $OPcm;
+-$CLcm ($ALcm | $HLcm | $NUcm);
++($ALPlus | $HL) $CM* $NU;
++^$CM+  $NU;       # Rule 10, any otherwise unattached CM behaves as AL
++$NU $CM* ($ALPlus | $HL);
+ 
++# LB 23a
+ #
+-#  Reverse Rules.
+-#
+-## -------------------------------------------------
++$PR $CM* ($ID | $EB | $EM);
++($ID | $EB | $EM) $CM*  $PO;
+ 
+-!!reverse;
+-
+-$CM+ $ALPlus;
+-$CM+ $BA;
+-$CM+ $BB;
+-$CM+ $B2;
+-$CM+ $CL;
+-$CM+ $EX;
+-$CM+ $GL;
+-$CM+ $HL;
+-$CM+ $HY;
+-$CM+ $H2;
+-$CM+ $H3;
+-$CM+ $ID;
+-$CM+ $IN;
+-$CM+ $IS;
+-$CM+ $JL;
+-$CM+ $JV;
+-$CM+ $JT;
+-$CM+ $NS;
+-$CM+ $NU;
+-$CM+ $OP;
+-$CM+ $PO;
+-$CM+ $PR;
+-$CM+ $QU;
+-$CM+ $SY;
+-$CM+ $WJ;
+-$CM+;
+-
+-
+-#
+-#  Sequences of the form  (shown forwards)
+-#      [CANT_CM]  <break>  [CM]  [whatever]
+-#  The CM needs to behave as an AL
+-#
+-$AL_FOLLOW $CM+ / (
+-          [$BK $CR $LF $NL $ZW {eof}] |
+-          $SP+ $CM+ $SP |
+-          $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}]));   # if LB 14 will match, need to suppress this break.
+-                                               #  LB14 says    OP SP* x .        
+-                                               #    becomes    OP SP* x AL
+-                                               #    becomes    OP SP* x CM+ AL_FOLLOW
+-                                               #
+-                                               # Further note:  the $AL in [$AL {eof}] is only to work around
+-                                               #                a rule compiler bug which complains about
+-                                               #                empty sets otherwise.
+-          
+-#
+-#  Sequences of the form  (shown forwards)
+-#      [CANT_CM]  <break> [CM]  <break>  [PR]
+-#  The CM needs to behave as an AL
+-#  This rule is concerned about getting the second of the two <breaks> in place.
+-#
+-
+-[$PR   ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
+-
+-
+-
+-# LB 4, 5, 5
+-
+-$LB4Breaks [$LB4NonBreaks-$CM];
+-$LB4Breaks $CM+ $CAN_CM;
+-$LF $CR;
+-
+-
+-# LB 7         x SP
+-#              x ZW
+-[$SP $ZW] [$LB4NonBreaks-$CM];
+-[$SP $ZW] $CM+ $CAN_CM;
+ 
+-# LB 8 Break after zero width space
+-
+-
+-# LB 9,10  Combining marks.
+-#    X   $CM needs to behave like X, where X is not $SP or controls.
+-#    $CM not covered by the above needs to behave like $AL
+-# Stick together any combining sequences that don't match other rules.
+-$CM+ $CAN_CM;
+-
+-
+-# LB 11
+-$CM* $WJ $CM* $CAN_CM;
+-$CM* $WJ      [$LB8NonBreaks-$CM];
+-
+-     $CANT_CM $CM* $WJ;
+-$CM* $CAN_CM  $CM* $WJ;
+-
+-# LB 12
+-#         x GL
+ #
+-$CM* $GL $CM* [$LB8NonBreaks-$CM-$SP];
++# LB 24
++#
++($PR | $PO) $CM* ($ALPlus | $HL);
++($ALPlus | $HL) $CM* ($PR | $PO);
++^$CM+ ($PR | $PO);       # Rule 10, any otherwise unattached CM behaves as AL
+ 
+ #
+-#     GL  x
++# LB 25   Numbers.
+ #
+-$CANT_CM $CM* $GL;
+-$CM* $CAN_CM $CM* $GL;
++(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? ($IS $CM*)? $NU ($CM* ($NU | $SY | $IS))*
++    ($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
+ 
++### BEGIN CUSTOMIZATION
++### i#83229: Allow line break after hyphen in number range context.
++### The default ICU rules treat number ranges (e.g. 100-199) as a single token. This change forces
++### a break opportunity after the embedded '-', but only if followed by another numeral.
++###
++### This customization does not replace any existing rule.
++### Maintainers: note that this rule should consist of two instances of the LB 25 numbers rule,
++### separated by a hyphen and an explicit break.
+ 
+-# LB 13
+-$CL $CM+ $CAN_CM;
+-$EX $CM+ $CAN_CM;
+-$IS $CM+ $CAN_CM;
+-$SY $CM+ $CAN_CM;
++((($PR | $PO) $CM*)? (($OP | $HY) $CM*)? ($IS $CM*)? $NU ($CM* ($NU | $SY | $IS))*
++    ($CM* ($CL | $CP))? ($CM* ($PR | $PO))?)
++    ($HY $CM*) /
++((($PR | $PO) $CM*)? (($OP | $HY) $CM*)? ($IS $CM*)? $NU ($CM* ($NU | $SY | $IS))*
++    ($CM* ($CL | $CP))? ($CM* ($PR | $PO))?);
+ 
+-$CL [$LB8NonBreaks-$CM];
+-$EX [$LB8NonBreaks-$CM];
+-$IS [$LB8NonBreaks-$CM];
+-$SY [$LB8NonBreaks-$CM];
++### END CUSTOMIZATION
+ 
+-# Rule 13 & 14 taken together for an edge case.
+-#   Match this, shown forward
+-#     OP SP+  ($CM+ behaving as $AL) (CL | EX | IS | IY)
+-#   This really wants to chain at the $CM+ (which is acting as an $AL)
+-#   except for $CM chaining being disabled.
+-[$CL $EX $IS $SY] $CM+ $SP+ $CM* $OP;  
++### TODO
++### ((PrefixNumeric | PostfixNumeric) CombMark*) ? ((OpenPunc | Hyphen) CombMark*)?
++###    (InfixNumeric CombMark*)? Numeric (CombMark* (Numeric | BreakSym | InfixNumeric))*
++###    (CombMark* (ClosePunc | CloseParen))? (CombMark* (PrefixNumeric | PostfixNumeric))?
+ 
+-# LB 14    OP SP* x
++# LB 26  Do not break a Korean syllable
+ #
+-$CM* $CAN_CM    $SP* $CM* $OP;
+-     $CANT_CM   $SP* $CM* $OP;
+-$AL_FOLLOW? $CM+  $SP $SP* $CM* $OP;     #  by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
+-     
+-     $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
+-$CM* $AL_FOLLOW_CM   $CM+ $SP+ $CM* $OP;
+-$SY $CM $SP+ $OP;   # TODO:  Experiment.  Remove.
+-
+-
+-
+-# LB 15
+-# $CM* $OP $SP* $CM* $QU;
+-
+-# LB 16
+-$CM* $NS $SP* $CM* $CL;
++$JL $CM* ($JL | $JV | $H2 | $H3);
++($JV | $H2) $CM* ($JV | $JT);
++($JT | $H3) $CM* $JT;
+ 
+-# LB 17
+-$CM* $B2 $SP* $CM* $B2;
+-
+-# LB 18  break after spaces
+-#        Nothing explicit needed here.
+-
+-
+-#
+-# LB 19
+-#
+-$CM* $QU $CM* $CAN_CM;                                #   . x QU
+-$CM* $QU      $LB18NonBreaks;
++# LB 27  Treat Korean Syllable Block the same as ID  (don't break it)
++($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
++$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
+ 
+ 
+-$CM* $CAN_CM  $CM* $QU;                               #   QU x .
+-     $CANT_CM $CM* $QU;
+-     
+-#
+-#  LB 20  Break before and after CB.
+-#         nothing needed here.
++# LB 28   Do not break between alphabetics
+ #
+-
+-# LB 21
+-$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM];     #  . x (BA | HY | NS)
+-
+-$CM* [$LB20NonBreaks-$CM] $CM* $BB;                   #  BB x .
+-[^$CB] $CM* $BB;                                      # 
+-
+-# LB21a
+-[^$CB] $CM* ($HY | $BA) $CM* $HL;
+-
+-# LB 22
+-$CM* $IN $CM* ($ALPlus | $HL);
+-$CM* $IN $CM* $ID;
+-$CM* $IN $CM* $IN;
+-$CM* $IN $CM* $NU;
+-
+-# LB 23
+-$CM* $PO $CM* $ID;
+-$CM* $NU $CM* ($ALPlus | $HL);
+-$CM* ($ALPlus | $HL) $CM* $NU;
+-
+-# LB 24
+-$CM* $ID $CM* $PR;
+-$CM* $PR $CM* $ALPlus;
+-$CM* ($ALPlus | $HL) $CM* $PR;
+-$CM* ($ALPlus | $HL) $CM* $PO;
+-
+-$CM* $ALPlus $CM* ($IS | $SY | $HY)+ / $SP;
+-$CM* $NU+ $CM* $HY+ / $SP;
+-
+-# LB 25
+-($CM* ($PR | $PO))? ($CM* $CL)? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP))? ($CM* ($PR | $PO))?;
+-
+-# LB 26
+-$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
+-$CM* ($JT | $JV) $CM* ($H2 | $JV);
+-$CM* $JT $CM* ($H3 | $JT);
+-
+-# LB 27
+-$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
+-$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
+-$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
+-
+-# LB 28
+-$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
++($ALPlus | $HL) $CM* ($ALPlus | $HL);
++^$CM+ ($ALPlus | $HL);      # The $CM+ is from rule 10, an unattached CM is treated as AL
+ 
+ # LB 29
+-$CM* ($NU | $ALPlus) $CM* $IS+ [^$SP];
++$IS $CM* ($ALPlus | $HL);
+ 
+ # LB 30
+-$CM* $OP $CM* ($ALPlus | $HL | $NU);
+-$CM* ($ALPlus | $HL | $NU) $CM* ($CL | $SY)+ [^$SP];
+-
+-
+-## -------------------------------------------------
+-
+-!!safe_reverse;
+-
+-# LB 7
+-$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
+-$CM+ $SP / .;
+-
+-# LB 9
+-$SP+ $CM* $OP;
+-
+-# LB 10
+-$SP+ $CM* $QU;
+-
+-# LB 11
+-$SP+ $CM* $CL;
+-$SP+ $CM* $B2;
+-
+-# LB 21
+-$CM* ($HY | $BA) $CM* $HL;
+-
+-# LB 18
+-($CM* ($IS | $SY))+ $CM* $NU;
+-$CL $CM* ($NU | $IS | $SY);
+-
+-# For dictionary-based break
+-$dictionary $dictionary;
+-
+-## -------------------------------------------------
+-
+-!!safe_forward;
+-
+-# Skip forward over all character classes that are involved in
+-#   rules containing patterns with possibly more than one char
+-#   of context.
+-#
+-#  It might be slightly more efficient to have specific rules
+-#  instead of one generic one, but only if we could
+-#  turn off rule chaining.  We don't want to move more
+-#  than necessary.
+-#
+-[$CM $OP $QU $CL $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $B2 $PR $HY $BA $dictionary];
+-$dictionary $dictionary;
+-
++($ALPlus | $HL | $NU) $CM* $OP30;
++^$CM+ $OP30;         # The $CM+ is from rule 10, an unattached CM is treated as AL.
++$CP30 $CM* ($ALPlus | $HL | $NU);
++
++# LB 30a  Do not break between regional indicators. Break after pairs of them.
++#         Tricky interaction with LB8a: ZWJ x .   together with ZWJ acting like a CM.
++$RI $CM* $RI                 / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
++$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
++$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}];
++# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
++#       because of the chain-out behavior difference. The rule must chain out only from the [set characters],
++#       not from the preceding $RI or $CM, which it would be able to do if the set were optional.
++
++# LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
++$EB $CM* $EM;
++$ExtPictUnassigned $CM* $EM;
++
++# LB 31 Break everywhere else.
++#       Match a single code point if no other rule applies.
++.;
+diff --git a/i18npool/source/breakiterator/data/sent.txt b/i18npool/source/breakiterator/data/sent.txt
+deleted file mode 100644
+index 7fada89e6278..000000000000
+--- a/i18npool/source/breakiterator/data/sent.txt
++++ /dev/null
+@@ -1,128 +0,0 @@
+-#
+-#   Copyright (C) 2002-2006, International Business Machines Corporation and others.
+-#       All Rights Reserved.
+-#
+-#   file:  sent.txt
+-#
+-#   ICU Sentence Break Rules
+-#      See Unicode Standard Annex #29.
+-#      These rules are based on SA 29 version 5.0.0
+-#      Includes post 5.0 changes to treat Japanese half width voicing marks
+-#        as Grapheme Extend.
+-#
+-
+-
+-$VoiceMarks   = [\uff9e\uff9f];
+-$Thai         = [:Script = Thai:];
+-
+-#
+-# Character categories as defined in TR 29
+-#
+-$Sep       = [\p{Sentence_Break = Sep}];
+-$Format    = [\p{Sentence_Break = Format}];
+-$Sp        = [\p{Sentence_Break = Sp}];
+-$Lower     = [\p{Sentence_Break = Lower}];
+-$Upper     = [\p{Sentence_Break = Upper}];
+-$OLetter   = [\p{Sentence_Break = OLetter}-$VoiceMarks];
+-$Numeric   = [\p{Sentence_Break = Numeric}];
+-$ATerm     = [\p{Sentence_Break = ATerm}];
+-$STerm     = [\p{Sentence_Break = STerm}];
+-$Close     = [\p{Sentence_Break = Close}];
+-
+-#
+-# Define extended forms of the character classes,
+-#   incorporate grapheme cluster + format chars.
+-#   Rules 4 and 5.  
+-
+-
+-$CR         = \u000d;
+-$LF         = \u000a;
+-$Extend     = [[:Grapheme_Extend = TRUE:]$VoiceMarks];
+-
+-$SpEx       = $Sp      ($Extend | $Format)*;
+-$LowerEx    = $Lower   ($Extend | $Format)*;
+-$UpperEx    = $Upper   ($Extend | $Format)*;
+-$OLetterEx  = $OLetter ($Extend | $Format)*;
+-$NumericEx  = $Numeric ($Extend | $Format)*;
+-$ATermEx    = $ATerm   ($Extend | $Format)*;
+-$STermEx    = $STerm   ($Extend | $Format)*;
+-$CloseEx    = $Close   ($Extend | $Format)*;
+-
+-
+-## -------------------------------------------------
+-
+-!!chain;
+-!!forward;
+-
+-# Rule 3 - break after separators.  Keep CR/LF together.
+-#
+-$CR $LF;
+-
+-$LettersEx = [$OLetter $Upper $Lower $Numeric $Close $STerm] ($Extend | $Format)*;
+-$LettersEx* $Thai $LettersEx* ($ATermEx | $SpEx)*;
+-
+-# Rule 4 - Break after $Sep.
+-# Rule 5 - Ignore $Format and $Extend
+-#
+-[^$Sep]? ($Extend | $Format)*;
+-
+-
+-# Rule 6
+-$ATermEx $NumericEx;
+-
+-# Rule 7
+-$UpperEx $ATermEx $UpperEx;
+-
+-#Rule 8
+-#  Note:  follows errata for Unicode 5.0 boundary rules.
+-$NotLettersEx = [^$OLetter $Upper $Lower $Sep $ATerm $STerm] ($Extend | $Format)*;
+-$ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
+-
+-# Rule 8a
+-($STermEx | $ATermEx) $CloseEx* $SpEx* ($STermEx | $ATermEx);
+-
+-#Rule 9, 10, 11
+-($STermEx | $ATermEx) $CloseEx* $SpEx* $Sep?;
+-
+-#Rule 12
+-[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend $Thai]{bof}] ($Extend | $Format | $Close | $Sp)* [^$Thai];
+-[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep{eof}] | $CR $LF){100};
+-
+-## -------------------------------------------------
+-
+-!!reverse;
+-
+-$SpEx_R       = ($Extend | $Format)* $Sp;
+-$ATermEx_R    = ($Extend | $Format)* $ATerm;
+-$STermEx_R    = ($Extend | $Format)* $STerm;
+-$CloseEx_R    = ($Extend | $Format)* $Close;
+-
+-#
+-#  Reverse rules.
+-#     For now, use the old style inexact reverse rules, which are easier
+-#     to write, but less efficient.
+-#     TODO:  exact reverse rules.  It appears that exact reverse rules
+-#            may require improving support for look-ahead breaks in the
+-#            builder.  Needs more investigation.
+-#
+-
+-[{bof}] (.? | $LF $CR) [^$Sep]* [$Sep {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*;
+-#.*;
+-
+-# Explanation for this rule:
+-#
+-#    It needs to back over
+-#        The $Sep at which we probably begin
+-#        All of the non $Sep chars leading to the preceding $Sep
+-#        The preceding $Sep, which will be the second one that the rule matches.
+-#        Any immediately preceding STerm or ATerm sequences.  We need to see these
+-#              to get the correct rule status when moving forwards again.
+-#        
+-# [{bof}]           inhibit rule chaining.  Without this, rule would loop on itself and match
+-#                   the entire string.
+-#
+-# (.? | $LF $CR)    Match one $Sep instance.  Use .? rather than $Sep because position might be
+-#                   at the beginning of the string at this point, and we don't want to fail.
+-#                   Can only use {eof} once, and it is used later.
+-#
+-
+-- 
+2.39.2
+
diff --git a/debian/patches/icu-74.1.diff b/debian/patches/icu-74.1.diff
new file mode 100644
index 0000000000..4cbd69c668
--- /dev/null
+++ b/debian/patches/icu-74.1.diff
@@ -0,0 +1,71 @@
+From ae182240328f20508c7a8936daf74a088627540b Mon Sep 17 00:00:00 2001
+From: Taichi Haradaguchi <20001722@ymail.ne.jp>
+Date: Tue, 31 Oct 2023 19:46:23 +0900
+Subject: Update to ICU 74.1
+
+https://icu.unicode.org/download/74
+
+Unicode 15.1
+https://blog.unicode.org/2023/09/announcing-unicode-standard-version-151.html
+
+CLDR 44
+https://cldr.unicode.org/index/downloads/cldr-44
+
+New Unicode blocks:
+UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I
+
+Change-Id: Ic9196e10138663d07235f5ebd9cc4bf3a9750824
+Reviewed-on: https://gerrit.libreoffice.org/c/core/+/158749
+Tested-by: Eike Rathke <erack@redhat.com>
+Reviewed-by: Eike Rathke <erack@redhat.com>
+---
+ configure.ac                                | 4 ++--
+ download.lst                                | 8 ++++----
+ external/icu/icu4c-khmerbreakengine.patch.1 | 1 -
+ include/svx/strings.hrc                     | 1 +
+ svx/source/dialog/charmap.cxx               | 5 +++++
+ 5 files changed, 12 insertions(+), 7 deletions(-)
+
+diff --git a/external/icu/icu4c-khmerbreakengine.patch.1 b/external/icu/icu4c-khmerbreakengine.patch.1
+index 605914014e96..db8ac50e6f75 100644
+--- a/external/icu/icu4c-khmerbreakengine.patch.1
++++ b/external/icu/icu4c-khmerbreakengine.patch.1
+@@ -796,7 +796,6 @@ diff -ur icu.org/source/common/dictionarydata.cpp icu/source/common/dictionaryda
+              if (wordCount < limit) {
+                  if (values != nullptr) {
+                      values[wordCount] = bt.getValue();
+-
+ diff -ur icu.org/source/common/dictionarydata.h icu/source/common/dictionarydata.h
+ --- icu.org/source/common/dictionarydata.h	2023-06-14 06:23:55.000000000 +0900
+ +++ icu/source/common/dictionarydata.h	2023-06-26 17:43:53.097724900 +0900
+diff --git a/include/svx/strings.hrc b/include/svx/strings.hrc
+index 13f896f04eeb..b8e69dc3dbe1 100644
+--- a/include/svx/strings.hrc
++++ b/include/svx/strings.hrc
+@@ -1790,6 +1790,7 @@
+ #define RID_SUBSETSTR_KAKTOVIK_NUMERALS                     NC_("RID_SUBSETMAP", "Kaktovik Numerals")
+ #define RID_SUBSETSTR_KAWI                                  NC_("RID_SUBSETMAP", "Kawi")
+ #define RID_SUBSETSTR_NAG_MUNDARI                           NC_("RID_SUBSETMAP", "Nag Mundari")
++#define RID_SUBSETSTR_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I    NC_("RID_SUBSETMAP", "CJK Unified Ideographs Extension I")
+ 
+ #define RID_SVXSTR_FRAMEDIR_LTR                             NC_("RID_SVXSTR_FRAMEDIR_LTR", "Left-to-right (LTR)")
+ #define RID_SVXSTR_FRAMEDIR_RTL                             NC_("RID_SVXSTR_FRAMEDIR_RTL", "Right-to-left (RTL)")
+diff --git a/svx/source/dialog/charmap.cxx b/svx/source/dialog/charmap.cxx
+index ed0c626b59c6..a73b0e263d60 100644
+--- a/svx/source/dialog/charmap.cxx
++++ b/svx/source/dialog/charmap.cxx
+@@ -1923,6 +1923,11 @@ void SubsetMap::InitList()
+                 case UBLOCK_NAG_MUNDARI:
+                     aAllSubsets.emplace_back( 0x1E4D0, 0x1E4FF, SvxResId(RID_SUBSETSTR_NAG_MUNDARI) );
+                     break;
++#endif
++#if (U_ICU_VERSION_MAJOR_NUM >= 74)
++                case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I:
++                    aAllSubsets.emplace_back( 0x2EBF0, 0x2EE5F, SvxResId(RID_SUBSETSTR_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I) );
++                    break;
+ #endif
+             }
+ 
+-- 
+cgit v1.2.3
+
diff --git a/debian/patches/reviewed-breakIterator-customizations.diff b/debian/patches/reviewed-breakIterator-customizations.diff
new file mode 100644
index 0000000000..80f9bd814e
--- /dev/null
+++ b/debian/patches/reviewed-breakIterator-customizations.diff
@@ -0,0 +1,1269 @@
+From fb94cc0d1348140d03c2826771c57255ff74a94a Mon Sep 17 00:00:00 2001
+From: Jonathan Clark <jonathan@libreoffice.org>
+Date: Thu, 11 Apr 2024 16:42:39 -0600
+Subject: [PATCH] tdf#49885 Reviewed BreakIterator customizations
+
+This change completes the review of BreakIterator rule customizations,
+and adds unit tests for relevant customizations.
+
+Change-Id: I06678fcccfc48d020aac64dd9f58ff36a763af30
+Reviewed-on: https://gerrit.libreoffice.org/c/core/+/166017
+Tested-by: Jenkins
+Reviewed-by: Eike Rathke <erack@redhat.com>
+---
+ i18npool/qa/cppunit/test_breakiterator.cxx | 559 +++++++++++++++++++
+ i18npool/source/breakiterator/data/README  | 612 ++++-----------------
+ 2 files changed, 668 insertions(+), 503 deletions(-)
+
+diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx
+index 0f2629fe05ec..b33466bee46d 100644
+--- a/i18npool/qa/cppunit/test_breakiterator.cxx
++++ b/i18npool/qa/cppunit/test_breakiterator.cxx
+@@ -31,6 +31,7 @@ public:
+ 
+     void testLineBreaking();
+     void testWordBoundaries();
++    void testSentenceBoundaries();
+     void testGraphemeIteration();
+     void testWeak();
+     void testAsian();
+@@ -43,9 +44,18 @@ public:
+     void testJapanese();
+     void testChinese();
+ 
++    void testLegacyDictWordPrepostDash_de_DE();
++    void testLegacyDictWordPrepostDash_nds_DE();
++    void testLegacyDictWordPrepostDash_nl_NL();
++    void testLegacyDictWordPrepostDash_sv_SE();
++    void testLegacyHebrewQuoteInsideWord();
++    void testLegacySurrogatePairs();
++    void testLegacyWordCountCompat();
++
+     CPPUNIT_TEST_SUITE(TestBreakIterator);
+     CPPUNIT_TEST(testLineBreaking);
+     CPPUNIT_TEST(testWordBoundaries);
++    CPPUNIT_TEST(testSentenceBoundaries);
+     CPPUNIT_TEST(testGraphemeIteration);
+     CPPUNIT_TEST(testWeak);
+     CPPUNIT_TEST(testAsian);
+@@ -57,6 +67,13 @@ public:
+ #endif
+     CPPUNIT_TEST(testJapanese);
+     CPPUNIT_TEST(testChinese);
++    CPPUNIT_TEST(testLegacyDictWordPrepostDash_de_DE);
++    CPPUNIT_TEST(testLegacyDictWordPrepostDash_nds_DE);
++    CPPUNIT_TEST(testLegacyDictWordPrepostDash_nl_NL);
++    CPPUNIT_TEST(testLegacyDictWordPrepostDash_sv_SE);
++    CPPUNIT_TEST(testLegacyHebrewQuoteInsideWord);
++    CPPUNIT_TEST(testLegacySurrogatePairs);
++    CPPUNIT_TEST(testLegacyWordCountCompat);
+     CPPUNIT_TEST_SUITE_END();
+ 
+ private:
+@@ -118,6 +135,173 @@ void TestBreakIterator::testLineBreaking()
+         }
+     }
+ 
++    // i#22602: writer breaks word after dot immediately followed by a letter
++    {
++        aLocale.Language = "en";
++        aLocale.Country = "US";
++
++        {
++            //Here we want the line break to leave ./bar/baz clumped together on the next line
++            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
++                "foo ./bar/baz", strlen("foo ./bar/ba"), aLocale, 0, aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first period",
++                                         static_cast<sal_Int32>(4), aResult.breakIndex);
++        }
++    }
++
++    // i#81448: slash and backslash make non-breaking spaces of preceding spaces
++    {
++        aLocale.Language = "en";
++        aLocale.Country = "US";
++
++        {
++            // Per the bug, the line break should leave ...BE clumped together on the next line.
++            // However, the current behavior does not wrap the string at all. This test asserts the
++            // current behavior as a point of reference.
++            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
++                "THIS... ...BE", strlen("THIS... ...B"), aLocale, 0, aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aResult.breakIndex);
++        }
++    }
++
++    // i#81448: slash and backslash make non-breaking spaces of preceding spaces
++    {
++        aLocale.Language = "en";
++        aLocale.Country = "US";
++
++        {
++            // The line break should leave /BE clumped together on the next line.
++            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
++                "THIS... /BE", strlen("THIS... /B"), aLocale, 0, aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(8), aResult.breakIndex);
++        }
++    }
++
++    // i#80548: Bad word wrap between dash and word
++    {
++        aLocale.Language = "fi";
++        aLocale.Country = "FI";
++
++        {
++            // Per the bug, the line break should leave -bar clumped together on the next line.
++            // However, this change was reverted at some point. This test asserts the new behavior.
++            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
++                "foo -bar", strlen("foo -ba"), aLocale, 0, aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first dash",
++                                         static_cast<sal_Int32>(5), aResult.breakIndex);
++        }
++    }
++
++    // i#80645: Line erroneously breaks at backslash
++    {
++        aLocale.Language = "en";
++        aLocale.Country = "US";
++
++        {
++            // Here we want the line break to leave C:\Program Files\ on the first line
++            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
++                "C:\\Program Files\\LibreOffice", strlen("C:\\Program Files\\Libre"), aLocale, 0,
++                aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
++        }
++    }
++
++    // i#80841: Words separated by hyphens will always break to next line
++    {
++        aLocale.Language = "en";
++        aLocale.Country = "US";
++
++        {
++            // Here we want the line break to leave toll- on the first line
++            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
++                "toll-free", strlen("toll-fr"), aLocale, 0, aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex);
++        }
++    }
++
++    // i#83464: Line break between letter and $
++    {
++        aLocale.Language = "en";
++        aLocale.Country = "US";
++
++        {
++            // Here we want the line break to leave US$ clumped on the next line.
++            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
++                "word US$ 123", strlen("word U"), aLocale, 0, aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex);
++        }
++    }
++
++    // Unknown bug number: "fix line break problem of dot after letter and before number"
++    {
++        aLocale.Language = "en";
++        aLocale.Country = "US";
++
++        {
++            // Here we want the line break to leave L.5 clumped on the next line.
++            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
++                "word L.5 word", strlen("word L"), aLocale, 0, aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex);
++        }
++    }
++
++    // i#83229: Wrong line break when word contains a hyphen
++    {
++        aLocale.Language = "en";
++        aLocale.Country = "US";
++
++        {
++            // Here we want the line break to leave 100- clumped on the first line.
++            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
++                "word 100-199 word", strlen("word 100-1"), aLocale, 0, aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(9), aResult.breakIndex);
++        }
++    }
++
++    // i#83649: Line break should be between typographical quote and left bracket
++    {
++        aLocale.Language = "de";
++        aLocale.Country = "DE";
++
++        {
++            // Here we want the line break to leave »angetan werden« on the first line
++            const OUString str = u"»angetan werden« [Passiv]"_ustr;
++            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
++                str, strlen("Xangetan werdenX ["), aLocale, 0, aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
++        }
++    }
++
++    // i#72868: Writer/Impress line does not break after Chinese punctuation and Latin letters
++    {
++        aLocale.Language = "zh";
++        aLocale.Country = "HK";
++
++        {
++            // Per the bug, this should break at the ideographic comma. However, this change has
++            // been reverted at some point. This test only verifies current behavior.
++            const OUString str = u"word word、word word"_ustr;
++            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
++                str, strlen("word wordXwor"), aLocale, 0, aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(13), aResult.breakIndex);
++        }
++    }
++
++    // i#80891: Character in the forbidden list sometimes appears at the start of line
++    {
++        aLocale.Language = "zh";
++        aLocale.Country = "HK";
++
++        {
++            // Per the bug, the ideographic two-dot leader should be a forbidden character. However,
++            // this change seems to have been reverted or broken at some point.
++            const OUString str = u"電話︰電話"_ustr;
++            i18n::LineBreakResults aResult
++                = m_xBreak->getLineBreak(str, 2, aLocale, 0, aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(2), aResult.breakIndex);
++        }
++    }
++
+     //See https://bz.apache.org/ooo/show_bug.cgi?id=19716
+     {
+         aLocale.Language = "en";
+@@ -160,6 +344,20 @@ void TestBreakIterator::testLineBreaking()
+             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break don't split the Korean word!", static_cast<sal_Int32>(5), aResult.breakIndex);
+         }
+     }
++
++    // i#65267: Comma is badly broken at end of line
++    // - The word should be wrapped along with the comma
++    {
++        aLocale.Language = "de";
++        aLocale.Country = "DE";
++
++        {
++            auto res = m_xBreak->getLineBreak("Wort -prinzessinnen, wort",
++                                              strlen("Wort -prinzessinnen,"), aLocale, 0,
++                                              aHyphOptions, aUserOptions);
++            CPPUNIT_ASSERT_EQUAL(sal_Int32{ 6 }, res.breakIndex);
++        }
++    }
+ }
+ 
+ //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
+@@ -601,6 +799,174 @@ void TestBreakIterator::testWordBoundaries()
+         CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.startPos);
+         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
+     }
++
++    // i#55778: Words containing numbers get broken up
++    {
++        aLocale.Language = "en";
++        aLocale.Country = "US";
++
++        static constexpr OUString aTest = u"first i18n third"_ustr;
++
++        aBounds
++            = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.endPos);
++    }
++
++    // i#56347: "BreakIterator patch for Hungarian"
++    // Rules for Hungarian affixes after numbers and certain symbols
++    {
++        auto mode = i18n::WordType::DICTIONARY_WORD;
++        aLocale.Language = "hu";
++        aLocale.Country = "HU";
++
++        OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr;
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 18, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 34, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos);
++    }
++
++    // i#56348: Special chars in first pos not handled by spell checking in Writer (Hungarian)
++    // Rules for Hungarian affixes after numbers and certain symbols in edit mode.
++    // The patch was merged, but the original bug was never closed and the current behavior seems
++    // identical to the ICU default behavior. Added this test to ensure that doesn't change.
++    {
++        auto mode = i18n::WordType::ANY_WORD;
++        aLocale.Language = "hu";
++        aLocale.Country = "HU";
++
++        OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr;
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 13, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 17, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 19, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 20, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(21), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 21, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(21), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 24, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(26), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 26, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(26), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(27), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(27), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 30, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.endPos);
++
++        aBounds = m_xBreak->getWordBoundary(aTest, 31, aLocale, mode, true);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos);
++    }
++}
++
++void TestBreakIterator::testSentenceBoundaries()
++{
++    lang::Locale aLocale;
++    aLocale.Language = "en";
++    aLocale.Country = "US";
++
++    // Trivial characterization test for sentence boundary detection
++    {
++        OUString aTest("This is a sentence. This is a different sentence.");
++
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 5, aLocale));
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(19), m_xBreak->endOfSentence(aTest, 5, aLocale));
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(20), m_xBreak->beginOfSentence(aTest, 31, aLocale));
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(49), m_xBreak->endOfSentence(aTest, 31, aLocale));
++    }
++
++    // i#24098: i18n API beginOfSentence/endOfSentence
++    // fix beginOfSentence, ... when cursor is on the beginning of the sentence
++    {
++        OUString aTest("This is a sentence. This is a different sentence.");
++
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(20), m_xBreak->beginOfSentence(aTest, 20, aLocale));
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(49), m_xBreak->endOfSentence(aTest, 20, aLocale));
++    }
++
++    // i#24098: i18n API beginOfSentence/endOfSentence
++    // "skip preceding space for beginOfSentence"
++    {
++        OUString aTest("This is a sentence.     This is a different sentence.");
++
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 20, aLocale));
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(19), m_xBreak->endOfSentence(aTest, 20, aLocale));
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(24), m_xBreak->beginOfSentence(aTest, 26, aLocale));
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(53), m_xBreak->endOfSentence(aTest, 26, aLocale));
++    }
+ }
+ 
+ //See https://bugs.libreoffice.org/show_bug.cgi?id=40292
+@@ -1043,6 +1409,199 @@ void TestBreakIterator::testChinese()
+         CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
+     }
+ }
++
++void TestBreakIterator::testLegacyDictWordPrepostDash_de_DE()
++{
++    lang::Locale aLocale;
++    aLocale.Language = "de";
++    aLocale.Country = "DE";
++
++    {
++        auto aTest = u"Arbeits- -nehmer"_ustr;
++
++        i18n::Boundary aBounds
++            = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
++
++        aBounds
++            = m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
++    }
++}
++
++void TestBreakIterator::testLegacyDictWordPrepostDash_nds_DE()
++{
++    lang::Locale aLocale;
++    aLocale.Language = "nds";
++    aLocale.Country = "DE";
++
++    {
++        auto aTest = u"Arbeits- -nehmer"_ustr;
++
++        i18n::Boundary aBounds
++            = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
++
++        aBounds
++            = m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
++    }
++}
++
++void TestBreakIterator::testLegacyDictWordPrepostDash_nl_NL()
++{
++    lang::Locale aLocale;
++    aLocale.Language = "nl";
++    aLocale.Country = "NL";
++
++    {
++        auto aTest = u"Arbeits- -nehmer"_ustr;
++
++        i18n::Boundary aBounds
++            = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
++
++        aBounds
++            = m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
++    }
++}
++
++void TestBreakIterator::testLegacyDictWordPrepostDash_sv_SE()
++{
++    lang::Locale aLocale;
++    aLocale.Language = "sv";
++    aLocale.Country = "SE";
++
++    {
++        auto aTest = u"Arbeits- -nehmer"_ustr;
++
++        i18n::Boundary aBounds
++            = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
++
++        aBounds
++            = m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
++    }
++}
++
++void TestBreakIterator::testLegacyHebrewQuoteInsideWord()
++{
++    lang::Locale aLocale;
++
++    aLocale.Language = "he";
++    aLocale.Country = "IL";
++
++    {
++        auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr;
++
++        i18n::Boundary aBounds
++            = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
++
++        aBounds
++            = m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos);
++    }
++}
++
++void TestBreakIterator::testLegacySurrogatePairs()
++{
++    lang::Locale aLocale;
++
++    aLocale.Language = "ja";
++    aLocale.Country = "JP";
++
++    // i#75632: [surrogate pair] Japanese word break does not work properly for surrogate pairs.
++    // and many others to address bugs: i#75631 i#75633 i#75412 etc.
++    //
++    // BreakIterator supports surrogate pairs (UTF-16). This is a simple characterization test.
++    {
++        const sal_Unicode buf[] = { u"X 𠮟 X" };
++        OUString aTest(buf, SAL_N_ELEMENTS(buf));
++
++        auto aBounds
++            = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
++
++        aBounds
++            = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
++
++        aBounds
++            = m_xBreak->getWordBoundary(aTest, 5, aLocale, i18n::WordType::DICTIONARY_WORD, false);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
++        CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
++    }
++}
++
++void TestBreakIterator::testLegacyWordCountCompat()
++{
++    lang::Locale aLocale;
++
++    aLocale.Language = "en";
++    aLocale.Country = "US";
++
++    // i#80815: "Word count differs from MS Word"
++    // This is a characterization test for word count using test data from the linked bug.
++    {
++        const OUString str = u""
++                             "test data for word count issue #80815\n"
++                             "fo\\\'sforos\n"
++                             "archipi\\\'elago\n"
++                             "do\\^me\n"
++                             "f**k\n"
++                             "\n"
++                             "battery-driven\n"
++                             "and/or\n"
++                             "apple(s)\n"
++                             "money+opportunity\n"
++                             "Micro$oft\n"
++                             "\n"
++                             "300$\n"
++                             "I(not you)\n"
++                             "a****n\n"
++                             "1+3=4\n"
++                             "\n"
++                             "aaaaaaa.aaaaaaa\n"
++                             "aaaaaaa,aaaaaaa\n"
++                             "aaaaaaa;aaaaaaa\n"_ustr;
++
++        int num_words = 0;
++        sal_Int32 next_pos = 0;
++        int iter_guard = 0;
++        while (true)
++        {
++            CPPUNIT_ASSERT_MESSAGE("Tripped infinite loop check", ++iter_guard < 100);
++
++            auto aBounds = m_xBreak->nextWord(str, next_pos, aLocale, i18n::WordType::WORD_COUNT);
++
++            if (aBounds.endPos < next_pos)
++            {
++                break;
++            }
++
++            next_pos = aBounds.endPos;
++            ++num_words;
++        }
++
++        CPPUNIT_ASSERT_EQUAL(23, num_words);
++    }
++}
++
+ void TestBreakIterator::setUp()
+ {
+     BootstrapFixtureBase::setUp();
+diff --git a/i18npool/source/breakiterator/data/README b/i18npool/source/breakiterator/data/README
+index 6246b80ae77f..76e3e37c3faf 100644
+--- a/i18npool/source/breakiterator/data/README
++++ b/i18npool/source/breakiterator/data/README
+@@ -9,411 +9,108 @@ At various stages these copies have been customized and are now horribly out of
+ sync. It unclear which diffs from the base versions are deliberate and which
+ are now accidental :-(
+ 
+-We need to review the various issues referenced in the commits that caused
+-customizations and see if they're still relevant or not, write regression tests
+-for them, if any are still relevant then apply the changes back on top of the
+-latest versions.
++The various issues and customizations have been reviewed, with tests written for
++customizations that are still relevant. However, these files are still extremely
++out-of-date and need to be refreshed. Relevant customizations should be reapplied
++on top of a current version.
+ 
+-to-review, later are ok:
+-
+-commit e1ad946ef5db3f7c0a540207d0f0fd85799e3b66
+-Author: Release Engineers <releng@openoffice.org>
+-Date:   Thu Aug 6 18:13:57 2009 +0000
+-
+-    CWS-TOOLING: integrate CWS tl73
+-    2009-07-31 15:29:33 +0200 tl  r274535 : #i64400# dash/hyphen should not break words
+-
+-commit 9964a76ef58786bba47d409970512d7ded6c8889
+-Author: Rüdiger Timm <rt@openoffice.org>
+-Date:   Wed Jul 2 07:53:05 2008 +0000
+-
+-    INTEGRATION: CWS i18n41 (1.1.2); FILE ADDED
+-    2008/04/25 17:06:26 khong 1.1.2.3: i55063, make period a sentence delimiter
+-    2008/04/25 06:40:50 khong 1.1.2.2: i55063, make space as Thai sentence delimiter
+-    2008/04/24 03:19:10 khong 1.1.2.1: i55063, set Thai letters as sentence delimiter for Thai and English mixed text
+-
+-commit e4a6e4284dae1ca6fbfa7d1e43690dbf87d796cd
+-Author: Rüdiger Timm <rt@openoffice.org>
+-Date:   Wed Jul 2 07:52:44 2008 +0000
+-
+-    INTEGRATION: CWS i18n41 (1.9.12); FILE MERGED
+-    2008/06/17 20:22:30 khong 1.9.12.2: i83229 fix the problem of leading hyphen for numbers
+-    2008/04/23 06:20:16 khong 1.9.12.1: i72868, i80891, i83229, fix Chinese punctuations and hyphen for line breakiterator
+-
+-commit 55dff22611659a1567c968fbf9e512a2765ab62e
+-Author: Rüdiger Timm <rt@openoffice.org>
+-Date:   Wed Jul 2 07:52:07 2008 +0000
+-
+-    INTEGRATION: CWS i18n41 (1.33.36); FILE MERGED
+-    2008/06/05 22:18:29 khong 1.33.36.2: RESYNC: (1.33-1.35); FILE MERGED
+-    2008/04/23 06:11:55 khong 1.33.36.1: i55063, enable language specific sentence breakiterator
+-
+-commit 1c2b8095631a3c2d2f396bf50a8f0c62f49be65c
+-Author: Rüdiger Timm <rt@openoffice.org>
+-Date:   Wed Jul 2 07:51:12 2008 +0000
+-
+-    INTEGRATION: CWS i18n41 (1.12.140); FILE MERGED
+-    2008/06/05 22:18:26 khong 1.12.140.2: RESYNC: (1.12-1.13); FILE MERGED
+-    2008/04/23 06:04:53 khong 1.12.140.1: i87530 avoid breaking line before un-completed cell
+-
+-commit 9bbdb52df370c69c0f7eba387a2068ee80bd7994
+-Author: Rüdiger Timm <rt@openoffice.org>
+-Date:   Wed Jul 2 07:50:43 2008 +0000
+-
+-    INTEGRATION: CWS i18n41 (1.25.2); FILE MERGED
+-    2008/06/05 22:18:23 khong 1.25.2.2: RESYNC: (1.25-1.26); FILE MERGED
+-    2008/04/23 06:09:02 khong 1.25.2.1: i88041: avoid startPos goes back to nStartPos when switching between Latin and CJK scripts
+-
+-commit 8dcdd3ca268f78295731b86797c2b8cd447ba667
+-Author: Kurt Zenker <kz@openoffice.org>
+-Date:   Tue May 20 13:36:01 2008 +0000
+-
+-    INTEGRATION: CWS i18n43_DEV300 (1.33.38); FILE MERGED
+-    2008/04/29 21:51:51 khong 1.33.38.1: #i88411# apply the patch from Coleman Kane to fix icu setBreakType issue
+-
+-commit bedef98c24ef9ada6aaffe9bc5284d9759a31a9a
+-Author: Kurt Zenker <kz@openoffice.org>
+-Date:   Wed Apr 2 08:49:09 2008 +0000
+-
+-    INTEGRATION: CWS i18n40 (1.2.314); FILE MERGED
+-    2008/03/19 06:30:23 khong 1.2.314.2: #i80815# count dash like MS Word
+-    2008/03/15 07:32:44 khong 1.2.314.1: #i80815# count punctuation as word
+-
+-commit 59144104b3f91a2e6ed816f0bde0fdb91ea218d7
+-Author: Kurt Zenker <kz@openoffice.org>
+-Date:   Wed Apr 2 08:48:53 2008 +0000
+-
+-    INTEGRATION: CWS i18n40 (1.24.44); FILE MERGED
+-    2008/03/19 18:56:42 khong 1.24.44.2: i80815 make word count feature like MS Word
+-    2008/03/15 07:31:38 khong 1.24.44.1: #i80815# count punctuation as word
+-
+-commit 3f0b51776602c45e8aca991450fcbb30f2484ae5
+-Author: Vladimir Glazounov <vg@openoffice.org>
+-Date:   Mon Jan 28 14:33:46 2008 +0000
+-
+-    INTEGRATION: CWS i18n39 (1.8.4); FILE MERGED
+-    2007/12/12 17:45:45 khong 1.8.4.3: b6634800# fix line break problem of dot after letter and before number
+-    2007/12/08 01:05:52 khong 1.8.4.2: #i83649# fixed the problem of line break between quotation mark and open bracket
+-    2007/12/07 23:44:30 khong 1.8.4.1: #i83464# fix the problem of line break between letter and 1326
+-
+-commit 5d8ef209b1f63d1c8ea5014bdbef96660b355423
+-Author: Vladimir Glazounov <vg@openoffice.org>
+-Date:   Tue Oct 23 08:09:00 2007 +0000
+-
+-    INTEGRATION: CWS i18n38 (1.7.4); FILE MERGED
+-    2007/09/19 00:08:04 khong 1.7.4.3: i81448 fixed dot line break issue
+-    2007/09/10 23:57:12 khong 1.7.4.2: i81440 fix the problem of line break on punctuations
+-    2007/09/10 22:55:46 khong 1.7.4.1: i81448 fix problem of line break on symbols
+-
+-commit a2f3b48cacfcef338ca5e37acde34c83876e082e
+-Author: Vladimir Glazounov <vg@openoffice.org>
+-Date:   Tue Oct 23 08:08:47 2007 +0000
+-
+-    INTEGRATION: CWS i18n38 (1.32.10); FILE MERGED
+-    2007/09/18 20:32:39 khong 1.32.10.1: i81519 set break type icu breakiterator
+-
+-commit 1967d8fb182b3101dee4f715e78be384400bc1e8
+-Author: Kurt Zenker <kz@openoffice.org>
+-Date:   Wed Sep 5 16:37:28 2007 +0000
+-
+-    INTEGRATION: CWS i18n37 (1.22.6); FILE MERGED
+-    2007/09/03 18:27:39 khong 1.22.6.2: i8132 fixed a problem in skipping space for word breakiterator
+-    2007/08/31 21:30:30 khong 1.22.6.1: i81158 fix skipping space problem
+-
+-commit d2c2baf1a31d281d20e8b4d4c806dda027b2d5a3
+-Author: Vladimir Glazounov <vg@openoffice.org>
+-Date:   Tue Aug 28 11:46:45 2007 +0000
+-
+-    INTEGRATION: CWS i18n36_SRC680 (1.5.20.1.2); FILE MERGED
+-    2007/08/22 17:12:36 khong 1.5.20.1.2.1: i80841 fix hyphen line break problem
+-
+-commit d56bedfb425cf77f176f143455e4a9fb6ce65540
+-Author: Vladimir Glazounov <vg@openoffice.org>
+-Date:   Tue Aug 28 11:46:34 2007 +0000
+-
+-    INTEGRATION: CWS i18n36_SRC680 (1.21.2.1.2); FILE MERGED
+-    2007/08/22 20:02:28 khong 1.21.2.1.2.2: i80923 fix infinite loop problem
+-    2007/08/22 17:11:44 khong 1.21.2.1.2.1: i80923 fix a infinite loop
+-
+-commit 8a36b196925a5561eabde0a0ef293c73fcb5add3
+-Author: Ivo Hinkelmann <ihi@openoffice.org>
+-Date:   Fri Aug 17 13:58:48 2007 +0000
+-
+-    INTEGRATION: CWS i18n34 (1.5.22); FILE MERGED
+-    2007/08/13 22:26:12 khong 1.5.22.1: i80548 i80645 fix dash and backslash issues in line breakiterator
+-
+-commit c00b2b49bad765144f90552139e63d87d520d1cf
+-Author: Ivo Hinkelmann <ihi@openoffice.org>
+-Date:   Fri Aug 17 13:58:36 2007 +0000
+-
+-    INTEGRATION: CWS i18n34 (1.15.4); FILE MERGED
+-    2007/08/13 22:33:38 khong 1.15.4.1: i86439 fix surrogate characters handling issues
+-
+-commit 3fc5fbc71d4c244d7c8002aa530481741e585bd4
+-Author: Ivo Hinkelmann <ihi@openoffice.org>
+-Date:   Fri Aug 17 13:58:23 2007 +0000
+-
+-    INTEGRATION: CWS i18n34 (1.31.4); FILE MERGED
+-    2007/08/13 22:33:37 khong 1.31.4.1: i86439 fix surrogate characters handling issues
+-
+-commit ee44b43881e7c82c379931f111c452a477b73341
+-Author: Ivo Hinkelmann <ihi@openoffice.org>
+-Date:   Fri Aug 17 13:58:11 2007 +0000
+-
+-    INTEGRATION: CWS i18n34 (1.21.4); FILE MERGED
+-    2007/08/14 08:38:53 khong 1.21.4.2: i86439 fix surrogate characters handling issues
+-    2007/08/13 22:33:37 khong 1.21.4.1: i86439 fix surrogate characters handling issues
+-
+-commit f47369dbbc385f8968ad43e43cba293a29a4c2df
+-Author: Jens-Heiner Rechtien <hr@openoffice.org>
+-Date:   Tue Jul 31 16:09:13 2007 +0000
+-
+-    INTEGRATION: CWS i18n32 (1.29.14); FILE MERGED
+-    2007/07/24 20:39:44 khong 1.29.14.1: #i79148# fix a local word breakiterator rules loading issue
+-
+-commit 2791553b4e3fc5e04b96d0b2fd119d9fba1946bc
+-Author: Rüdiger Timm <rt@openoffice.org>
+-Date:   Thu Jul 26 08:08:51 2007 +0000
+-
+-    INTEGRATION: CWS i18n31 (1.14.60); FILE MERGED
+-    2007/07/16 22:18:44 khong 1.14.60.4: i75631 i75632 i75633 i75412 handle surrogate pair characters
+-    2007/07/13 20:37:32 khong 1.14.60.3: #i75632# use ICU characters properties
+-    2007/07/04 01:17:22 khong 1.14.60.2: i75631 i75632 i75633 i75412 handle surrogate pair characters
+-    2007/06/27 04:33:11 khong 1.14.60.1: i75631 i75632 i75633 i75412 handle surrogate pair characters
+-
+-commit 1c79a2bf1e89ac4eb409922ab7eb8ad3cacc688a
+-Author: Rüdiger Timm <rt@openoffice.org>
+-Date:   Thu Jul 26 08:08:39 2007 +0000
+-
+-    INTEGRATION: CWS i18n31 (1.8.60); FILE MERGED
+-    2007/06/27 04:33:11 khong 1.8.60.1: i75631 i75632 i75633 i75412 handle surrogate pair characters
+-
+-commit 517bbaddbaf81a5a6bb00979944cad13a1575d50
+-Author: Rüdiger Timm <rt@openoffice.org>
+-Date:   Thu Jul 26 08:08:27 2007 +0000
+-
+-    INTEGRATION: CWS i18n31 (1.28.14); FILE MERGED
+-    2007/07/13 20:37:32 khong 1.28.14.5: #i75632# use ICU characters properties
+-    2007/07/04 01:17:22 khong 1.28.14.4: i75631 i75632 i75633 i75412 handle surrogate pair characters
+-    2007/06/27 23:25:58 khong 1.28.14.3: i75412 handle surrogate pair characters
+-    2007/06/27 05:33:20 khong 1.28.14.2: RESYNC: (1.28-1.29); FILE MERGED
+-    2007/06/27 04:33:11 khong 1.28.14.1: i75631 i75632 i75633 i75412 handle surrogate pair characters
+-
+-commit 0154e3492f2527535c0d648274e7ff674674318b
+-Author: Rüdiger Timm <rt@openoffice.org>
+-Date:   Thu Jul 26 08:08:14 2007 +0000
+-
+-    INTEGRATION: CWS i18n31 (1.14.42); FILE MERGED
+-    2007/06/27 05:33:03 khong 1.14.42.2: RESYNC: (1.14-1.15); FILE MERGED
+-    2007/06/27 04:33:11 khong 1.14.42.1: i75631 i75632 i75633 i75412 handle surrogate pair characters
+-
+-commit e2a5a2532ee187669980adb7bfa747c7803c330a
+-Author: Rüdiger Timm <rt@openoffice.org>
+-Date:   Thu Jul 26 08:08:02 2007 +0000
+-
+-    INTEGRATION: CWS i18n31 (1.19.60); FILE MERGED
+-    2007/07/13 20:37:32 khong 1.19.60.4: #i75632# use ICU characters properties
+-    2007/07/04 01:17:22 khong 1.19.60.3: i75631 i75632 i75633 i75412 handle surrogate pair characters
+-    2007/06/27 05:00:48 khong 1.19.60.2: i75231 handle surrogate pair characters
+-    2007/06/27 04:33:11 khong 1.19.60.1: i75631 i75632 i75633 i75412 handle surrogate pair characters
+-
+-commit 80a26a7d4720b5b8cfa0acc624b28014c96d9948
+-Author: Jens-Heiner Rechtien <hr@openoffice.org>
+-Date:   Tue Jun 26 16:41:02 2007 +0000
+-
+-    INTEGRATION: CWS ause081 (1.2.332); FILE MERGED
+-    2007/06/21 10:53:19 hjs 1.2.332.1: #i78393# remove component_getDescriptionFunc from exports
+-
+-commit c2801db6b04bf6f0dbb07727c91b2c66e7e027b8
+-Author: Ivo Hinkelmann <ihi@openoffice.org>
+-Date:   Wed Jun 6 11:17:38 2007 +0000
+-
+-    INTEGRATION: CWS i18n30 (1.4.24); FILE MERGED
+-    2007/05/08 21:32:18 khong 1.4.24.1: #i73903# update line breakiterator rule to icu3.6 style
+-
+-commit ea290668f78475c3b277c9e44bf5622ccb4dcec8
+-Author: Ivo Hinkelmann <ihi@openoffice.org>
+-Date:   Wed Jun 6 11:17:25 2007 +0000
+-
+-    INTEGRATION: CWS i18n30 (1.28.4); FILE MERGED
+-    2007/05/08 21:47:00 khong 1.28.4.3: #i75412# remove fix from cws i18n30, move it to other cws to fix with other Japanese surrogate issues
+-    2007/03/20 18:39:58 khong 1.28.4.2: #i72589# fixed BS problem for surrogate characters
+-    2007/03/13 19:11:44 khong 1.28.4.1: #i75319# fixed ANY_WORD rule loading problem
+-
+-commit b6308a6e322fd4eaa7845793beb70900624f351c
+-Author: Ivo Hinkelmann <ihi@openoffice.org>
+-Date:   Wed Jun 6 11:17:12 2007 +0000
+-
+-    INTEGRATION: CWS i18n30 (1.14.32); FILE MERGED
+-    2007/05/08 21:44:15 khong 1.14.32.1: #i76706# fix infinite loop for CJK word breakiterator for text mixed with Latin and CJK characters
+-
+-commit e068e0e9aa9405ea4016ad19e9a963129adfed79
+-Author: Rüdiger Timm <rt@openoffice.org>
+-Date:   Thu Jan 25 08:35:42 2007 +0000
+-
+-    INTEGRATION: CWS i18n28 (1.1.2); FILE ADDED
+-    2006/12/06 05:52:39 khong 1.1.2.1: #i64400# add an optional breakiterator entry in localedata
+-
+-commit 8d6f35a46085bb420e8896505504b376d17b842a
+-Author: Rüdiger Timm <rt@openoffice.org>
+-Date:   Thu Jan 25 08:35:31 2007 +0000
+-
+-    INTEGRATION: CWS i18n28 (1.24.36); FILE MERGED
+-    2006/12/19 17:27:58 khong 1.24.36.2: RESYNC: (1.24-1.25); FILE MERGED
+-    2006/12/06 05:52:38 khong 1.24.36.1: #i64400# add an optional breakiterator entry in localedata
+-
+-commit 633d34fa33330339ab6795ce3703477216e0062e
+-Author: Kurt Zenker <kz@openoffice.org>
+-Date:   Tue Dec 12 15:14:36 2006 +0000
+-
+-    INTEGRATION: CWS icuupgrade (1.9.24); FILE MERGED
+-    2006/10/11 06:11:11 khong 1.9.24.4: RESYNC: (1.10-1.11); FILE MERGED
+-    2006/07/07 10:57:40 hdu 1.9.24.3: RESYNC: (1.9-1.10); FILE MERGED
+-    2006/06/30 01:31:40 khong 1.9.24.2: #i53388# upgrade icu to 3.4.1
+-    2006/06/15 19:16:55 khong 1.9.24.1: #i60645# upgrade icu to 3.4.1
+-
+-commit 5d46dabe95271c846601a2575d3304fd5b4b24f1
+-Author: Kurt Zenker <kz@openoffice.org>
+-Date:   Tue Dec 12 15:14:05 2006 +0000
+-
+-    INTEGRATION: CWS icuupgrade (1.22.20); FILE MERGED
+-    2006/11/11 07:12:47 khong 1.22.20.6: #142664# fix breakiterator crash problem
+-    2006/10/11 06:10:51 khong 1.22.20.5: RESYNC: (1.23-1.24); FILE MERGED
+-    2006/09/06 01:00:31 khong 1.22.20.4: #i60645# upgrade to icu 3.6
+-    2006/07/07 10:57:32 hdu 1.22.20.3: RESYNC: (1.22-1.23); FILE MERGED
+-    2006/06/30 01:31:40 khong 1.22.20.2: #i53388# upgrade icu to 3.4.1
+-    2006/06/20 14:27:26 hdu 1.22.20.1: #i60645# fix crash when udata_open failed
+-
+-commit 7431d816cdfc47b08978c0afd1f6503644bb11b8
+-Author: Kurt Zenker <kz@openoffice.org>
+-Date:   Mon Nov 6 13:40:05 2006 +0000
+-
+-    INTEGRATION: CWS i18n27 (1.3.142); FILE MERGED
+-    2006/10/10 21:10:57 khong 1.3.142.1: #i65267# fix line break rule
+-
+-commit d7471e1462ffd9baeb3449eb86ccbb649e32b233
+-Author: Kurt Zenker <kz@openoffice.org>
+-Date:   Mon Nov 6 13:39:52 2006 +0000
+-
+-    INTEGRATION: CWS i18n27 (1.1.2); FILE ADDED
+-    2006/10/10 21:08:55 khong 1.1.2.1: #i56348# add Hungarian word break rule for edit mode
+-
+-commit 1b65b0b886e2cb16382bc11770230fb6a140f33b
+-Author: Jens-Heiner Rechtien <hr@openoffice.org>
+-Date:   Tue Oct 24 12:53:13 2006 +0000
+-
+-    INTEGRATION: CWS tl29 (1.12.24); FILE MERGED
+-    2006/09/20 01:24:53 khong 1.12.24.1: #i69482# fixed mismatch of nextWord and getWordBoundary
+-
+-commit 97d89862a2285071202cc8010d888ffcbf96279a
+-Author: Jens-Heiner Rechtien <hr@openoffice.org>
+-Date:   Thu Nov 17 19:30:35 2005 +0000
+-
+-    INTEGRATION: CWS i18n23 (1.20.22); FILE MERGED
+-    2005/11/17 20:00:37 khong 1.20.22.3: RESYNC: (1.20-1.21); FILE MERGED
+-    2005/11/17 19:45:05 khong 1.20.22.2: #i57866# merge cws i18n23 and thaiissues
+-    2005/11/15 21:10:24 khong 1.20.22.1: #i57866# fix line breakiterator problem
+-
+-commit 05fadde6f025bcaafca4f3093e88be3cc1bb6836
+-Author: Oliver Bolte <obo@openoffice.org>
+-Date:   Wed Nov 16 09:18:37 2005 +0000
+-
+-    INTEGRATION: CWS thaiissues (1.20.6); FILE MERGED
+-    2005/10/26 20:42:40 khong 1.20.6.2: use icu thai linke break algorithm for thai breakiterator
+-    2005/10/26 13:36:24 fme 1.20.6.1: #i55716# Handling of WORDJOINER
+-
+-commit a10b0e70c641d7438c557ef718c6942b3abffaec
+-Author: Oliver Bolte <obo@openoffice.org>
+-Date:   Wed Nov 16 09:18:25 2005 +0000
+-
+-    INTEGRATION: CWS thaiissues (1.8.6); FILE MERGED
+-    2005/10/26 20:42:39 khong 1.8.6.1: use icu thai linke break algorithm for thai breakiterator
+-
+-commit 4a1f1586173839d532f90507c72306bc9e2aec56
+-Author: Oliver Bolte <obo@openoffice.org>
+-Date:   Wed Nov 16 09:18:11 2005 +0000
+-
+-    INTEGRATION: CWS thaiissues (1.9.4); FILE MERGED
+-    2005/10/28 17:54:39 khong 1.9.4.1: Fix a bug in ctl line break when there is word joiner character
+-
+-commit beb2a536738ba761a92f8266570f1859c85f94ae
+-Author: Rüdiger Timm <rt@openoffice.org>
+-Date:   Tue Nov 8 15:59:16 2005 +0000
+-
+-    INTEGRATION: CWS siloch (1.3.50); FILE MERGED
+-    2005/10/26 10:55:05 er 1.3.50.1: #i56347# apply patch to recognize suffixes of numbers in Hungarian spellchecking; contributed by Nemeth Laszlo <nemeth@ooo>
+-
+-commit 939e7c2bc93c13b6740051beeb08c5883b65ffce
+-Author: Kurt Zenker <kz@openoffice.org>
+-Date:   Fri Nov 4 14:33:30 2005 +0000
+-
+-    INTEGRATION: CWS i18n21 (1.3.46); FILE MERGED
+-    2005/10/21 00:35:09 khong 1.3.46.1: #i55778 reverse back last change, treat letter and number combination as one word.
+-
+-commit 51594ef552a872b9868e5c7a025a68665488a016
+-Author: Kurt Zenker <kz@openoffice.org>
+-Date:   Fri Nov 4 14:33:16 2005 +0000
+-
+-    INTEGRATION: CWS i18n21 (1.2.2); FILE MERGED
+-    2005/10/21 00:35:08 khong 1.2.2.1: #i55778 reverse back last change, treat letter and number combination as one word.
+-
+-commit f4fe39909c7ed645a8b387cf66de249572226ad6
+-Author: Kurt Zenker <kz@openoffice.org>
+-Date:   Fri Nov 4 14:33:03 2005 +0000
+-
+-    INTEGRATION: CWS i18n21 (1.3.46); FILE MERGED
+-    2005/10/21 00:35:08 khong 1.3.46.1: #i55778 reverse back last change, treat letter and number combination as one word.
+-
+-commit 7f8af14611e66655ea7354083eafd71afc9703e3
+-Author: Kurt Zenker <kz@openoffice.org>
+-Date:   Fri Nov 4 14:32:41 2005 +0000
+-
+-    INTEGRATION: CWS i18n21 (1.4.46); FILE MERGED
+-    2005/10/21 00:35:07 khong 1.4.46.1: #i55778 reverse back last change, treat letter and number combination as one word.
+-
+-commit 924e158b9d871fbf7500e9215540e26aa95b3b20
+-Author: Rüdiger Timm <rt@openoffice.org>
+-Date:   Mon Oct 17 14:43:17 2005 +0000
+-
+-    INTEGRATION: CWS i18n20 (1.1.2); FILE ADDED
+-    2005/09/22 23:47:49 khong 1.1.2.1: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule.
+-
+-commit a428a8927006a10ccfe7182e6fe5a8b677281eca
+-Author: Rüdiger Timm <rt@openoffice.org>
+-Date:   Mon Oct 17 14:42:30 2005 +0000
+-
+-    INTEGRATION: CWS i18n20 (1.18.32); FILE MERGED
+-    2005/09/23 15:59:13 khong 1.18.32.6: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule.
+-    2005/09/23 08:09:54 khong 1.18.32.5: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule.
+-    2005/09/23 07:38:03 khong 1.18.32.4: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule
+-    2005/09/22 23:47:48 khong 1.18.32.3: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule.
+-    2005/08/26 23:34:37 khong 1.18.32.2: #i50172# add cell breakiterator rule for Tamil
+-    2005/08/26 23:31:59 khong 1.18.32.1: #i50172# add cell breakiterator rule for Tamil
+-
+-commit f518f78557931b81e06fd7b31bb22c6639e5e553
+-Author: Rüdiger Timm <rt@openoffice.org>
+-Date:   Mon Oct 17 14:42:14 2005 +0000
+-
+-    INTEGRATION: CWS i18n20 (1.6.32); FILE MERGED
+-    2005/09/23 15:59:13 khong 1.6.32.3: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule.
+-    2005/09/23 07:38:02 khong 1.6.32.2: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule
+-    2005/09/22 23:47:48 khong 1.6.32.1: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule.
+-
+-commit 9b870055ecd043d1d4fadeacd351f8739e1979a0
+-Author: Vladimir Glazounov <vg@openoffice.org>
+-Date:   Fri Feb 25 09:08:13 2005 +0000
+-
+-    INTEGRATION: CWS i18n16 (1.16.22); FILE MERGED
+-    2005/02/04 19:05:45 khong 1.16.22.3: #i41671# use ICU rules for Thai breakiterator
+-    2005/01/24 21:56:34 khong 1.16.22.2: #i35285# merge cws i18n16 with top version 1.17
+-    2005/01/12 01:12:41 khong 1.16.22.1: #i35285# remove uprv_malloc, use udata_open for loading icu rule breakiterator
+-
+-commit 29b9e86f5dac388d7aaced24d3826ac9331b03e3
+-Author: Vladimir Glazounov <vg@openoffice.org>
+-Date:   Fri Feb 25 09:07:59 2005 +0000
++done, regression tests added:
+ 
+-    INTEGRATION: CWS i18n16 (1.5.22); FILE MERGED
+-    2005/02/04 19:05:45 khong 1.5.22.1: #i41671# use ICU rules for Thai breakiterator
++#112623# update Japanese word breakiterator dictionary
++#i50172# add cell breakiterator rule for Tamil
++#i80412# indic cursoring
++#i107843# em-dash/en-dash breakiterator fix for spell checking
++#i103552# Japanese word for 'shutdown' added to ja.dic
++#i113785# ligatures for spell checking will no longer break words
++An opening quote should not be counted as a word by word count tool (regression test in writer)
++fdo#31271 wrong line break with (
++#i89042# word count fix (regression test is in writer)
++#i58513# add break iterator rules for Finish
++#i19716# fix wrong line break on bracket characters
++#i21290# extend Greek script type
++#i21907# fix isBeginWord and isEndWord problem
++#i85411# Apply patch for ZWSP
++#i17155# fix line breakiterator rule to make slash and hyphen as part of word when doing line break
++#i13451# add '-' as midLetter for Catalan dictionary word breakiterator
++#i13494# fix word breakiterator rule to handle punctuations and signs correctly
++#i29548# Fix Thai word breakiterator problem
++#i11993# #i14904# fix word breakiterator issues
++#i64400# dash/hyphen should not break words (de/nds/nl/sv)
++#i22602# make dot stick on beginning of a word when doing line break
++#i24098# skip preceding space for beginOfSentence
++#i24098# fix beginOfSentence, which did not work correctly when cursor is on the beginning of the sentence
++#i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule.
++#i50172# add cell breakiterator rule for Tamil
++#i55778# reverse back last change, treat letter and number combination as one word.
++#i56347# apply patch to recognize suffixes of numbers in Hungarian spellchecking
++#i56348# add Hungarian word break rule for edit mode
++#i65267# fix line break rule
++#i86439# many changes to implement, tweak, debug UTF-16 surrogate pair handling
++#i75631#         "
++#i75632#         "
++#i75633#         "
++#i75412#         "
++#i80645# fix backslash issues in line breakiterator
++#i80841# fix hyphen line break problem
++#i81448# fixed dot line break issue
++#i81448# fix the problem of line break on punctuations (commit message says i81440)
++#i81448# fix problem of line break on symbols
++#i83649# fixed the problem of line break between quotation mark and open bracket
++#i83464# fix the problem of line break between letter and 1326
++b6634800# fix line break problem of dot after letter and before number
++#i83229# fix the problem of leading hyphen for numbers
++#i80815# count words like MS Word
++
++likely superseded:
++
++#i21392# Obscure line break behavior mismatch in string of symbols between MSO and LO.
++#i80548# "fix dash issues in line breakiterator" - fix no longer works
++#i72868# "fix Chinese punctuation for line breakiterator" - fix no longer works
++#i80891# "fix Chinese punctuation for line breakiterator" - fix no longer works
++
++#i27711# Adding/tweaking/removing languages later added to ICU.
++#i33756#         "
++#i41671#         "
++#i41671#         "
++#i55063#         "
++#i24850# ICU upgrades, internal bug fixes, or other work-arounds.
++#i24098#         "
++#112772#         "
++#i35285#         "
++4a1f1586173839d532f90507c72306bc9e2aec56        "
++a10b0e70c641d7438c557ef718c6942b3abffaec        "
++05fadde6f025bcaafca4f3093e88be3cc1bb6836        "
++#i57866#         "
++#i57866#         "
++#i69482#         "
++#142664#         "
++#i60645#         "
++#i53388#         "
++#i60645#         "
++#i78393#         "
++#i73903#         "
++#i75412#         "
++#i72589#         "
++#i75319#         "
++#i76706#         "
++#i64400#         "
++#i64400#         "
++#i79148#         "
++#i55063#         "
++#i87530#         "
++#i88041#         "
++#i88411#         "
++#i80923#         "
++#i80923#         "
++#i81519#         "
++
++
++suspect:
++
++
++- The intentions behind the following commits are unclear, as the referenced bugs were in the
++StarOffice internal bug tracker. These changes are contemporaneous with TR14 Revision 17, and seem
++to be part of an effort to backport upstream rule changes across multiple language customizations.
+ 
+ commit 746ea3d8c29b27b23af3433446f66db0ad3096d6
+ Author: Oliver Bolte <obo@openoffice.org>
+@@ -436,108 +133,17 @@ Date:   Tue Jan 11 10:18:51 2005 +0000
+     INTEGRATION: CWS i18n15 (1.3.36); FILE MERGED
+     2004/09/04 02:03:53 khong 1.3.36.1: #117685# make dictionary word contain only letter or only number, dot can be in middle or end of a word, but only one.
+ 
+-commit e5a62ce85bebcc9fb2bf0e5b9aced5fc7748055b
+-Author: Oliver Bolte <obo@openoffice.org>
+-Date:   Tue Jan 11 10:18:37 2005 +0000
+-
+-    INTEGRATION: CWS i18n15 (1.16.4); FILE MERGED
+-    2004/10/07 18:19:11 khong 1.16.4.1: #i33756# update Hungarian breakiterator
+-
+-commit d2a6a31e6981800c2a920f8c6ff901c341a0466e
+-Author: Kurt Zenker <kz@openoffice.org>
+-Date:   Fri Jul 30 13:38:57 2004 +0000
+-
+-    INTEGRATION: CWS i18n13 (1.8.92); FILE MERGED
+-    2004/06/14 23:24:16 khong 1.8.92.2: #112772# Japanese word breakiterator is not correct
+-    2004/06/11 19:23:04 khong 1.8.92.1: #112772# Japanese word breakiterator is not correct
+ 
+-commit d6b8dabc3dc4811e1152d411a8428ccb334d16ab
+-Author: Kurt Zenker <kz@openoffice.org>
+-Date:   Fri Jul 30 13:38:17 2004 +0000
+-
+-    INTEGRATION: CWS i18n13 (1.7.162); FILE MERGED
+-    2004/06/11 19:23:04 khong 1.7.162.1: #112772# Japanese word breakiterator is not correct
+-
+-commit 9ea4c16a699ac7cf5e255a19653651ac993f022b
+-Author: Kurt Zenker <kz@openoffice.org>
+-Date:   Fri Jul 30 13:38:05 2004 +0000
+-
+-    INTEGRATION: CWS i18n13 (1.9.92); FILE MERGED
+-    2004/06/11 19:23:04 khong 1.9.92.1: #112772# Japanese word breakiterator is not correct
++- The intention behind the following commit is unclear, as the bug references are incorrect and no
++good candidates were immediately apparent. Based on the text of the commit, however, it appears to
++be a simple bug fix for skipSpace(). This function has also had a great deal of churn since this
++commit, further suggesting it is no longer pertinent.
+ 
+-commit 2887ecb5554eee699e1dce4ffbc2dfcf71a54a41
++commit 1967d8fb182b3101dee4f715e78be384400bc1e8
+ Author: Kurt Zenker <kz@openoffice.org>
+-Date:   Fri Jul 30 13:37:54 2004 +0000
+-
+-    INTEGRATION: CWS i18n13 (1.15.18); FILE MERGED
+-    2004/06/17 20:29:38 khong 1.15.18.2: #
+-    2004/06/02 04:54:24 khong 1.15.18.1: #i11993# fix getWordBoundary problem when position is on the end of the word.
+-
+-commit 606556eed208d1218f950df2200510a7e19af1d9
+-Author: Oliver Bolte <obo@openoffice.org>
+-Date:   Fri May 28 15:33:28 2004 +0000
+-
+-    INTEGRATION: CWS i18n12 (1.1.2); FILE ADDED
+-    2004/04/30 14:37:52 er 1.1.2.1: #i27711# Hungarian breakiterator (provided by Timar Andras)
+-
+-commit 9710ca90166c18c0a92f7f0246a7c2f7dae87ebc
+-Author: Oliver Bolte <obo@openoffice.org>
+-Date:   Fri May 28 15:33:17 2004 +0000
+-
+-    INTEGRATION: CWS i18n12 (1.4.22); FILE MERGED
+-    2004/04/13 11:55:32 er 1.4.22.1: #i27711# Hungarian breakiterator
+-
+-commit b138663ef4f4ade38fb42f8a2f567527cf15949b
+-Author: Oliver Bolte <obo@openoffice.org>
+-Date:   Fri May 28 15:33:02 2004 +0000
+-
+-    INTEGRATION: CWS i18n12 (1.13.22); FILE MERGED
+-    2004/04/30 11:25:47 er 1.13.22.2: RESYNC: (1.13-1.14); FILE MERGED
+-    2004/04/13 11:55:32 er 1.13.22.1: #i27711# Hungarian breakiterator
+-
+-commit f5bc5f04e4de8fa502d498a99f4ef6a340d796c0
+-Author: Oliver Bolte <obo@openoffice.org>
+-Date:   Wed Mar 17 08:02:14 2004 +0000
+-
+-    INTEGRATION: CWS i18n11 (1.13.14); FILE MERGED
+-    2004/02/04 02:09:04 khong 1.13.14.2: #i24098# skip preceding space for beginOfSentence
+-    2004/01/06 19:41:49 khong 1.13.14.1: #i24098# fix beginOfSentence, which did not work correctly when cursor is on the beginning of the sentence
+-
+-commit 16401a5b865b5da8a2dd70057e8b048e9b797d5a
+-Author: Oliver Bolte <obo@openoffice.org>
+-Date:   Wed Mar 17 08:02:01 2004 +0000
+-
+-    INTEGRATION: CWS i18n11 (1.12.14); FILE MERGED
+-    2004/02/10 14:21:13 er 1.12.14.3: RESYNC: (1.12-1.13); FILE MERGED
+-    2004/02/05 16:45:30 khong 1.12.14.2: #i24850# fix the problem in previousCharBlock, when target char block is in position 1
+-    2004/02/04 02:13:48 khong 1.12.14.1: #i24098# check boundary condition for Sentence, Script, CharBlock breakiterator
+-
+-commit 4da98b648497af30de0fcf1a16e649ce18b0564f
+-Author: Jens-Heiner Rechtien <hr@openoffice.org>
+-Date:   Mon Mar 8 16:17:05 2004 +0000
+-
+-    INTEGRATION: CWS i18n09 (1.2.2); FILE MERGED
+-    2003/12/04 23:45:37 khong 1.2.2.3: #i22602# make dot stick on beginning of a word when doing line break
+-    2003/12/04 23:12:37 khong 1.2.2.2: #i21392# change line break rule to match with MS office
++Date:   Wed Sep 5 16:37:28 2007 +0000
+ 
+-done, regression tests added:
++    INTEGRATION: CWS i18n37 (1.22.6); FILE MERGED
++    2007/09/03 18:27:39 khong 1.22.6.2: i8132 fixed a problem in skipping space for word breakiterator
++    2007/08/31 21:30:30 khong 1.22.6.1: i81158 fix skipping space problem
+ 
+-#112623# update Japanese word breakiterator dictionary
+-#i50172# add cell breakiterator rule for Tamil
+-#i80412# indic cursoring
+-#i107843# em-dash/en-dash breakiterator fix for spell checking
+-#i103552# Japanese word for 'shutdown' added to ja.dic
+-#i113785# ligatures for spell checking will no longer break words
+-An opening quote should not be counted as a word by word count tool (regression test in writer)
+-fdo#31271 wrong line break with (
+-#i89042# word count fix (regression test is in writer)
+-#i58513# add break iterator rules for Finish
+-#i19716# fix wrong line break on bracket characters
+-#i21290# extend Greek script type
+-#i21907# fix isBeginWord and isEndWord problem
+-#i85411# Apply patch for ZWSP
+-#i17155# fix line breakiterator rule to make slash and hyphen as part of word when doing line break
+-#i13451# add '-' as midLetter for Catalan dictionary word breakiterator
+-#i13494# fix word breakiterator rule to handle punctuations and signs correctly
+-#i29548# Fix Thai word breakiterator problem
+-#i11993# #i14904# fix word breakiterator issues
+-- 
+2.39.2
+
diff --git a/debian/patches/series b/debian/patches/series
index 5ca46579bf..65bdb5df83 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -50,3 +50,7 @@ fix-system-abseil-build.diff
 fix-riscv64-bridge.diff
 pdfium-ports.diff
 split-sdbc-firebird-mariadb.diff
+use-PyConfig.diff
+reviewed-breakIterator-customizations.diff
+breakiterator-updates.diff
+icu-74.1.diff
diff --git a/debian/patches/use-PyConfig.diff b/debian/patches/use-PyConfig.diff
new file mode 100644
index 0000000000..2cf1270d21
--- /dev/null
+++ b/debian/patches/use-PyConfig.diff
@@ -0,0 +1,80 @@
+From da0e9240bf6505ac3a67ff985705950566c66144 Mon Sep 17 00:00:00 2001
+From: Ilmari Lauhakangas <ilmari.lauhakangas@libreoffice.org>
+Date: Thu, 21 Dec 2023 12:01:50 +0200
+Subject: tdf#158447 Use PyConfig for setting Python home directory with Python
+ >= 3.8
+
+Change-Id: Ic5b7c60613b22f5215cb1a2a13fecf3e0946ca49
+Reviewed-on: https://gerrit.libreoffice.org/c/core/+/161089
+Reviewed-by: Ilmari Lauhakangas <ilmari.lauhakangas@libreoffice.org>
+Tested-by: Jenkins
+Reviewed-by: Noel Grandin <noel.grandin@collabora.co.uk>
+Tested-by: Ilmari Lauhakangas <ilmari.lauhakangas@libreoffice.org>
+---
+ pyuno/source/loader/pyuno_loader.cxx | 22 +++++++++++++++++++---
+ 1 file changed, 19 insertions(+), 3 deletions(-)
+
+diff --git a/pyuno/source/loader/pyuno_loader.cxx b/pyuno/source/loader/pyuno_loader.cxx
+index 008d58634947..1e00773761e7 100644
+--- a/pyuno/source/loader/pyuno_loader.cxx
++++ b/pyuno/source/loader/pyuno_loader.cxx
+@@ -114,7 +114,11 @@ static PyRef getObjectFromLoaderModule( const char * func )
+     return object;
+ }
+ 
++#if PY_VERSION_HEX >= 0x03080000
++static void setPythonHome ( const OUString & pythonHome, PyConfig * config )
++#else
+ static void setPythonHome ( const OUString & pythonHome )
++#endif
+ {
+     OUString systemPythonHome;
+     osl_getSystemPathFromFileURL( pythonHome.pData, &(systemPythonHome.pData) );
+@@ -138,9 +142,11 @@ static void setPythonHome ( const OUString & pythonHome )
+         PyErr_SetString(PyExc_SystemError, "python home path is too long");
+         return;
+     }
+-SAL_WNODEPRECATED_DECLARATIONS_PUSH
+-    Py_SetPythonHome(wide); // deprecated since python 3.11
+-SAL_WNODEPRECATED_DECLARATIONS_POP
++#if PY_VERSION_HEX >= 0x03080000
++    config->home = wide;
++#else
++    Py_SetPythonHome(wide);
++#endif
+ }
+ 
+ static void prependPythonPath( std::u16string_view pythonPathBootstrap )
+@@ -192,11 +198,17 @@ void pythonInit() {
+     if ( Py_IsInitialized()) // may be inited by getComponentContext() already
+         return;
+ 
++#if PY_VERSION_HEX >= 0x03080000
++    PyConfig config;
++#endif
+     OUString pythonPath;
+     OUString pythonHome;
+     OUString path( "$BRAND_BASE_DIR/" LIBO_ETC_FOLDER "/" SAL_CONFIGFILE("pythonloader.uno" ));
+     rtl::Bootstrap::expandMacros(path); //TODO: detect failure
+     rtl::Bootstrap bootstrap(path);
++#if PY_VERSION_HEX >= 0x03080000
++    PyConfig_InitPythonConfig( &config );
++#endif
+ 
+     // look for pythonhome
+     bootstrap.getFrom( "PYUNO_LOADER_PYTHONHOME", pythonHome );
+@@ -205,7 +217,11 @@ void pythonInit() {
+     // pythonhome+pythonpath must be set before Py_Initialize(), otherwise there appear warning on the console
+     // sadly, there is no api for setting the pythonpath, we have to use the environment variable
+     if( !pythonHome.isEmpty() )
++#if PY_VERSION_HEX >= 0x03080000
++        setPythonHome( pythonHome, &config );
++#else
+         setPythonHome( pythonHome );
++#endif
+ 
+     if( !pythonPath.isEmpty() )
+         prependPythonPath( pythonPath );
+-- 
+cgit v1.2.3
+
diff --git a/debian/rules b/debian/rules
index b43cc79c3c..346905ec1f 100755
--- a/debian/rules
+++ b/debian/rules
@@ -206,27 +206,33 @@ ifeq "$(ENABLE_GUI)" "y"
 BUILD_PLASMA=y
 # let's assume we won't ship two plasmas parallel..
 PLASMA_VERSION=5
+PLASMA_KF_VERSION=$(PLASMA_VERSION)
 ENABLE_QT5=n
 #QT5_MINVER=
+ENABLE_KF5=y
+ENABLE_QT6=y
+ENABLE_KF6=n
 endif
+# make sure: if we want plasma, we want kfX, too, and if we want that
+# one, we want qtX, too.
 ifeq "$(BUILD_PLASMA)" "y"
    ifeq "$(PLASMA_VERSION)" "5"
      ENABLE_KF5=y
-     # KF5 depends on Qt5
-     ifeq "$(ENABLE_KF5)" "y"
-       ENABLE_QT5=y
-       KF5_QT5_DEPENDS := libreoffice-qt5 (= $${binary:Version})
-     endif
-  else
+   endif
+   ifeq "$(PLASMA_VERSION)" "6"
 	ENABLE_KF6=y
-	# KF6 depends on Qt6
-     ifeq "$(ENABLE_KF6)" "y"
-       ENABLE_QT6=y
-       KF6_QT6_DEPENDS := libreoffice-qt6 (= $${binary:Version})
-     endif
   endif
 endif
-ENABLE_QT6=y
+# KF5 depends on Qt5
+ifeq "$(ENABLE_KF5)" "y"
+  ENABLE_QT5=y
+  KF5_QT5_DEPENDS := libreoffice-qt5 (= $${binary:Version})
+endif
+# KF6 depends on Qt6
+ifeq "$(ENABLE_KF6)" "y"
+  ENABLE_QT6=y
+  KF6_QT6_DEPENDS := libreoffice-qt6 (= $${binary:Version})
+endif
 # https://www.debian.org/doc/debian-policy/ says this is not defined and must
 # be ignored, but dh_strip mentions (and honours) it, so...
 ifneq (noautodbgsym,$(findstring noautodbgsym,$(DEB_BUILD_OPTIONS)))
@@ -619,7 +625,7 @@ OOO_CHECK_FATAL_ARCHS += armhf
 OOO_SMOKETEST_FATAL_ARCHS := $(OOO_ARCHS)
 
 # archs where the archive/ci runs autopkgtests
-OOO_AUTOPKGTEST_ARCHS := $(OOO_CHECK_ARCHS) i386 ppc64el s390x
+OOO_AUTOPKGTEST_ARCHS := $(OOO_CHECK_ARCHS) armel i386 ppc64el s390x riscv64
 
 ifeq (alpha,$(findstring $(DEB_HOST_ARCH),$(OOO_ARCHS)))
 PACKAGE_GEN=n
@@ -2185,7 +2191,13 @@ ifeq "$(ENABLE_QT6)" "y"
 endif
 ifeq "$(BUILD_PLASMA)" "y"
 	sed -e "s|@PLASMA_VERSION@|$(PLASMA_VERSION)|g" \
+		-e "s|@PLASMA_KF_VERSION@|$(PLASMA_KF_VERSION)|g" \
 	        >> debian/control.new < debian/control.plasma.in
+	# no kf6be here
+ifeq "$(PLASMA_VERSION)" "6"
+	sed -i "s/and a KDE\/KF$(PLASMA_KF_VERSION) configuration backend//" \
+		debian/control.new
+endif
 endif
 	cat debian/control.postgresql.in >> debian/control.new
 ifeq "$(ENABLE_EVO2)" "y"
@@ -2220,6 +2232,14 @@ else
 	perl -pi -e 's/%LO-DESKTOP-INTEGRATION%/libreoffice-gnome/' debian/control.new
 endif
 
+ifeq "$(ENABLE_KF5)" "y"
+ifneq "$(BOOKWORM_BACKPORT)" "y"
+	perl -pi -e 's/plasma-iconset-dep}/plasma-iconset-dep}, kio (>> 5.115.0-5)/' debian/control.new
+else # BOOKWORM_BACKPORT=y: same substitution with the lower kio version bound
+	perl -pi -e 's/plasma-iconset-dep}/plasma-iconset-dep}, kio (>> 5.103.0-1)/' debian/control.new
+endif
+endif
+
 ifeq (sk,$(findstring sk,$(HELPISOS)))
 	perl -pi -e 's/(Depends:.*)libreoffice-l10n-sk(.*)$$/\1libreoffice-l10n-sk, libreoffice-help-cs\2/' debian/control.new
 endif
@@ -2678,13 +2698,15 @@ ifeq "$(ENABLE_QT6)" "y"
 endif
 ifeq "$(shell echo $(ENABLE_KF5)$(ENABLE_KF6) | grep -q y && echo true)" "true"
 ifeq "$(BUILD_PLASMA)" "y"
-	mkdir -p debian/tmp/pkg/libreoffice-plasma/$(OODIR)/program
-	mv debian/tmp/pkg/libreoffice-kde/$(OODIR)/program/libkf$(PLASMA_VERSION)be1lo.so \
-		debian/tmp/pkg/libreoffice-plasma/$(OODIR)/program
+	if [ -f debian/tmp/pkg/libreoffice-kde/$(OODIR)/program/libkf$(PLASMA_VERSION)be1lo.so ]; then \
+		mkdir -p debian/tmp/pkg/libreoffice-plasma/$(OODIR)/program; \
+		mv debian/tmp/pkg/libreoffice-kde/$(OODIR)/program/libkf$(PLASMA_VERSION)be1lo.so \
+			debian/tmp/pkg/libreoffice-plasma/$(OODIR)/program; \
+	fi
 	# remove other ones maybe built by --enable-kfX
-	rm -f debian/tmp/pkg/libreoffice-kde/$(OODIR)/libkf*be1lo.so 
+	rm -f debian/tmp/pkg/libreoffice-kde/$(OODIR)/program/libkf*be1lo.so 
 else
-	rm -f debian/tmp/pkg/libreoffice-kde/$(OODIR)/libkf*be1lo.so
+	rm -f debian/tmp/pkg/libreoffice-kde/$(OODIR)/program/libkf*be1lo.so
 endif
 ifeq "$(ENABLE_KF5)" "y"
 	mkdir -p debian/tmp/pkg/libreoffice-kf5/$(OODIR)/program
diff --git a/debian/shlibs.override.icu b/debian/shlibs.override.icu
index 6db96d608f..f787723810 100644
--- a/debian/shlibs.override.icu
+++ b/debian/shlibs.override.icu
@@ -1,5 +1,5 @@
-libicudata	67
-libicui18n	67
-libicule	67
-libicuuc	67
-libicutu	67
+libicudata	73
+libicui18n	73
+libicule	73
+libicuuc	73
+libicutu	73