summaryrefslogtreecommitdiffstats
path: root/external/icu
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--external/icu/ExternalPackage_icu.mk42
-rw-r--r--external/icu/ExternalPackage_icu_ure.mk48
-rw-r--r--external/icu/ExternalProject_icu.mk100
-rw-r--r--external/icu/Makefile7
-rw-r--r--external/icu/Module_icu.mk19
-rw-r--r--external/icu/README1
-rw-r--r--external/icu/UnpackedTarball_icu.mk47
-rw-r--r--external/icu/Wdeprecated-copy-dtor.patch25
-rw-r--r--external/icu/c++20-comparison.patch.182
-rwxr-xr-xexternal/icu/cross-bin/icu-config12
-rw-r--r--external/icu/gcc9.patch26
-rw-r--r--external/icu/icu4c-android.patch.175
-rw-r--r--external/icu/icu4c-build.patch.191
-rw-r--r--external/icu/icu4c-clang-cl.patch.128
-rw-r--r--external/icu/icu4c-emscripten-cross.patch.199
-rw-r--r--external/icu/icu4c-icudata-stdlibs.patch.114
-rw-r--r--external/icu/icu4c-khmerbreakengine.patch.1837
-rw-r--r--external/icu/icu4c-macosx.patch.120
-rw-r--r--external/icu/icu4c-mkdir.patch.117
-rw-r--r--external/icu/icu4c-rpath.patch.136
-rw-r--r--external/icu/icu4c-rtti.patch.112
-rw-r--r--external/icu/icu4c-scriptrun.patch.160
-rw-r--r--external/icu/icu4c-solarisgcc.patch.112
-rw-r--r--external/icu/icu4c-ubsan.patch.114
-rw-r--r--external/icu/icu4c-use-pkgdata-single-ccode-file-mode.patch.112
-rw-r--r--external/icu/icu4c-warnings.patch.111
-rw-r--r--external/icu/icu4c-windows-cygwin-cross.patch.1131
-rw-r--r--external/icu/khmerdict.dictbin0 -> 263537 bytes
28 files changed, 1878 insertions, 0 deletions
diff --git a/external/icu/ExternalPackage_icu.mk b/external/icu/ExternalPackage_icu.mk
new file mode 100644
index 0000000000..dcd4da2169
--- /dev/null
+++ b/external/icu/ExternalPackage_icu.mk
@@ -0,0 +1,42 @@
+# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
+#
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+icu_VERSION := $(ICU_MAJOR).$(ICU_MINOR)$(if $(ICU_MICRO),.$(ICU_MICRO))
+# Package "icu": the ICU i18n library taken from the "icu" external project and delivered to $(LIBO_LIB_FOLDER).
+$(eval $(call gb_ExternalPackage_ExternalPackage,icu,icu))
+
+$(eval $(call gb_ExternalPackage_use_external_project,icu,icu))
+# Nothing is delivered when DISABLE_DYNLOADING is TRUE; otherwise the file name depends on the OS.
+ifneq ($(DISABLE_DYNLOADING),TRUE)
+ifeq ($(OS),WNT)
+# Windows: icuin<ICU_MAJOR>.dll; non-GCC compilers add a "d" infix when MSVC_USE_DEBUG_RUNTIME is set.
+ifeq ($(COM),GCC)
+$(eval $(call gb_ExternalPackage_add_files,icu,$(LIBO_LIB_FOLDER),\
+ source/lib/icuin$(ICU_MAJOR).dll \
+))
+else
+$(eval $(call gb_ExternalPackage_add_files,icu,$(LIBO_LIB_FOLDER),\
+ source/lib/icuin$(if $(MSVC_USE_DEBUG_RUNTIME),d)$(ICU_MAJOR).dll \
+))
+endif # $(COM)
+
+else ifeq ($(OS),ANDROID)
+# Android: the library file name carries an "lo" suffix.
+$(eval $(call gb_ExternalPackage_add_files,icu,$(LIBO_LIB_FOLDER),\
+ source/lib/libicui18nlo.so \
+))
+
+else # $(OS) != WNT/ANDROID
+# Other OSes: source/lib/libicui18n<ext>.<icu_VERSION> is delivered under the shorter <ext>.<ICU_MAJOR> name.
+$(eval $(call gb_ExternalPackage_add_file,icu,$(LIBO_LIB_FOLDER)/libicui18n$(gb_Library_DLLEXT).$(ICU_MAJOR),source/lib/libicui18n$(gb_Library_DLLEXT).$(icu_VERSION)))
+
+endif # $(OS)
+endif # DISABLE_DYNLOADING
+
+# vim: set noet sw=4 ts=4:
diff --git a/external/icu/ExternalPackage_icu_ure.mk b/external/icu/ExternalPackage_icu_ure.mk
new file mode 100644
index 0000000000..fefe71afdc
--- /dev/null
+++ b/external/icu/ExternalPackage_icu_ure.mk
@@ -0,0 +1,48 @@
+# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
+#
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+# libxml2 is in URE and depends on icuuc*.dll on Windows; the i18nlangtag lib is
+# in URE and depends on the icuuc lib (which in turn depends on the icudata lib)
+# on all platforms:
+
+$(eval $(call gb_ExternalPackage_ExternalPackage,icu_ure,icu))
+
+$(eval $(call gb_ExternalPackage_use_external_project,icu_ure,icu))
+# Nothing is delivered when DISABLE_DYNLOADING is TRUE; otherwise the file names depend on the OS.
+ifneq ($(DISABLE_DYNLOADING),TRUE)
+ifeq ($(OS),WNT)
+# Windows: icudt/icuuc DLLs; non-GCC compilers add a "d" infix when MSVC_USE_DEBUG_RUNTIME is set.
+ifeq ($(COM),GCC)
+$(eval $(call gb_ExternalPackage_add_files,icu_ure,$(LIBO_URE_LIB_FOLDER),\
+ source/lib/icudt$(ICU_MAJOR).dll \
+ source/lib/icuuc$(ICU_MAJOR).dll \
+))
+else
+$(eval $(call gb_ExternalPackage_add_files,icu_ure,$(LIBO_URE_LIB_FOLDER),\
+ source/lib/icudt$(if $(MSVC_USE_DEBUG_RUNTIME),d)$(ICU_MAJOR).dll \
+ source/lib/icuuc$(if $(MSVC_USE_DEBUG_RUNTIME),d)$(ICU_MAJOR).dll \
+))
+endif # $(COM)
+
+else ifeq ($(OS),ANDROID)
+# Android: both library file names carry an "lo" suffix.
+$(eval $(call gb_ExternalPackage_add_files,icu_ure,$(LIBO_URE_LIB_FOLDER),\
+ source/lib/libicudatalo.so \
+ source/lib/libicuuclo.so \
+))
+
+else # $(OS) != WNT/ANDROID
+# NOTE(review): icu_VERSION is not assigned in this file; presumably it is still set from ExternalPackage_icu.mk being read earlier - confirm.
+$(eval $(call gb_ExternalPackage_add_file,icu_ure,$(LIBO_URE_LIB_FOLDER)/libicudata$(gb_Library_DLLEXT).$(ICU_MAJOR),source/lib/libicudata$(gb_Library_DLLEXT).$(icu_VERSION)))
+$(eval $(call gb_ExternalPackage_add_file,icu_ure,$(LIBO_URE_LIB_FOLDER)/libicuuc$(gb_Library_DLLEXT).$(ICU_MAJOR),source/lib/libicuuc$(gb_Library_DLLEXT).$(icu_VERSION)))
+
+endif # $(OS)
+endif # DISABLE_DYNLOADING
+
+# vim: set noet sw=4 ts=4:
diff --git a/external/icu/ExternalProject_icu.mk b/external/icu/ExternalProject_icu.mk
new file mode 100644
index 0000000000..5388eee589
--- /dev/null
+++ b/external/icu/ExternalProject_icu.mk
@@ -0,0 +1,100 @@
+# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
+#
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+$(eval $(call gb_ExternalProject_ExternalProject,icu))
+# Declares one state target, "build"; its recipe is chosen per OS further down.
+$(eval $(call gb_ExternalProject_register_targets,icu,\
+ build \
+))
+# HAVE_GCC_ATOMICS is 1 only when GCC_HAVE_BUILTIN_ATOMIC is TRUE; the double quotes are part of the value. Only the non-Windows recipe uses it.
+icu_CPPFLAGS:="-DHAVE_GCC_ATOMICS=$(if $(filter TRUE,$(GCC_HAVE_BUILTIN_ATOMIC)),1,0)"
+
+ifeq ($(OS),WNT)
+# Windows recipe: gb_ICU_XFLAGS is exported as a shell variable, then read back as $${gb_ICU_XFLAGS} for CFLAGS and CXXFLAGS.
+$(call gb_ExternalProject_get_state_target,icu,build) :
+ $(call gb_Trace_StartRange,icu,EXTERNAL)
+ $(call gb_ExternalProject_run,build,\
+ autoconf -f \
+ && export LIB="$(ILIB)" PYTHONWARNINGS="default" \
+ gb_ICU_XFLAGS="-FS $(SOLARINC) $(gb_DEBUGINFO_FLAGS) $(if $(MSVC_USE_DEBUG_RUNTIME),-MDd,-MD -Gy)" \
+ && CFLAGS="$${gb_ICU_XFLAGS}" CPPFLAGS="$(SOLARINC)" CXXFLAGS="$${gb_ICU_XFLAGS}" \
+ INSTALL=`cygpath -m /usr/bin/install` $(if $(MSVC_USE_DEBUG_RUNTIME),LDFLAGS="-DEBUG") \
+ $(gb_RUN_CONFIGURE) ./configure \
+ $(if $(MSVC_USE_DEBUG_RUNTIME),--enable-debug --disable-release) \
+ $(gb_CONFIGURE_PLATFORMS) \
+ $(if $(CROSS_COMPILING), \
+ --with-cross-build=$(WORKDIR_FOR_BUILD)/UnpackedTarball/icu/source \
+ --disable-tools --disable-extras) \
+ && $(MAKE) $(if $(CROSS_COMPILING),DATASUBDIR=data) $(if $(verbose),VERBOSE=1) \
+ ,source)
+ $(call gb_Trace_EndRange,icu,EXTERNAL)
+
+else # $(OS)
+# Non-Windows flag strings: each value includes its own double quotes, so the recipe can pass it as VAR=$(icu_VAR) unquoted.
+icu_CFLAGS:="$(CFLAGS) \
+ $(if $(filter iOS,$(OS)),-DUCONFIG_NO_FILE_IO) \
+ $(if $(SYSBASE),-I$(SYSBASE)/usr/include) \
+ $(call gb_ExternalProject_get_build_flags,icu) \
+ $(if $(ENABLE_LTO),$(gb_LTOFLAGS)) \
+ $(if $(filter GCC,$(COM)),-fno-strict-aliasing) \
+ $(if $(filter FUZZERS,$(BUILD_TYPE)),-DU_USE_STRTOD_L=0) \
+ $(if $(filter ANDROID,$(OS)),-fvisibility=hidden -fno-omit-frame-pointer)"
+icu_CXXFLAGS:="$(CXXFLAGS) $(CXXFLAGS_CXX11) \
+ $(if $(filter iOS,$(OS)),-DUCONFIG_NO_FILE_IO) \
+ $(call gb_ExternalProject_get_build_flags,icu) \
+ $(if $(ENABLE_LTO),$(gb_LTOFLAGS)) \
+ $(if $(filter GCC,$(COM)),-fno-strict-aliasing) \
+ $(if $(filter FUZZERS,$(BUILD_TYPE)),-DU_USE_STRTOD_L=0) \
+ $(if $(filter ANDROID,$(OS)),-fvisibility=hidden -fno-omit-frame-pointer -I$(SRCDIR)/include)"
+icu_LDFLAGS:=" \
+ $(if $(ENABLE_LTO),$(gb_LTOFLAGS)) \
+ $(call gb_ExternalProject_get_link_flags,icu) \
+ $(if $(filter TRUE,$(HAVE_LD_HASH_STYLE)),-Wl$(COMMA)--hash-style=$(WITH_LINKER_HASH_STYLE)) \
+ $(if $(SYSBASE),-L../lib -L../../lib -L../stubdata -L../../stubdata -L$(SYSBASE)/usr/lib) \
+ $(if $(filter TRUE,$(HAVE_LD_BSYMBOLIC_FUNCTIONS)), -Wl$(COMMA)-Bsymbolic-functions) \
+ $(if $(filter ANDROID,$(OS)),$(gb_STDLIBS))"
+
+# DATASUBDIR=data when cross-compiling: --disable-tools makes icu/source/Makefile.in skip the
+# data directory, so it is added back. On macOS each library is given the location argument of
+# the package that delivers it: icuuc and icudata -> URELIB (icu_ure), icui18n -> OOO (icu).
+$(call gb_ExternalProject_get_state_target,icu,build) :
+ $(call gb_Trace_StartRange,icu,EXTERNAL)
+ $(call gb_ExternalProject_run,build,\
+ autoconf -f && \
+ CPPFLAGS=$(icu_CPPFLAGS) CFLAGS=$(icu_CFLAGS) \
+ CXXFLAGS=$(icu_CXXFLAGS) LDFLAGS=$(icu_LDFLAGS) \
+ PYTHONWARNINGS="default" \
+ $(gb_RUN_CONFIGURE) ./configure \
+ --disable-layout --disable-samples \
+ $(if $(filter FUZZERS,$(BUILD_TYPE)),--disable-release) \
+ $(if $(filter EMSCRIPTEN ANDROID,$(OS)),--disable-strict ac_cv_c_bigendian=no) \
+ $(if $(filter SOLARIS,$(OS)),--disable-64bit-libs) \
+ $(if $(filter TRUE,$(DISABLE_DYNLOADING)),\
+ --with-data-packaging=static --enable-static --disable-shared --disable-dyload,\
+ --disable-static --enable-shared $(if $(filter ANDROID,$(OS)),--with-library-suffix=lo)) \
+ $(gb_CONFIGURE_PLATFORMS) \
+ $(if $(CROSS_COMPILING), \
+ --with-cross-build=$(WORKDIR_FOR_BUILD)/UnpackedTarball/icu/source \
+ --disable-tools --disable-extras) \
+ AR="$(AR)" RANLIB="$(RANLIB)" \
+ && $(MAKE) $(if $(CROSS_COMPILING),DATASUBDIR=data) $(if $(verbose),VERBOSE=1) \
+ $(if $(filter MACOSX,$(OS)), \
+ && $(PERL) $(SRCDIR)/solenv/bin/macosx-change-install-names.pl shl \
+ URELIB \
+ $(EXTERNAL_WORKDIR)/source/lib/libicuuc$(gb_Library_DLLEXT).$(icu_VERSION) \
+ $(EXTERNAL_WORKDIR)/source/lib/libicudata$(gb_Library_DLLEXT).$(icu_VERSION) \
+ && $(PERL) $(SRCDIR)/solenv/bin/macosx-change-install-names.pl shl \
+ OOO \
+ $(EXTERNAL_WORKDIR)/source/lib/libicui18n$(gb_Library_DLLEXT).$(icu_VERSION)) \
+ ,source)
+ $(call gb_Trace_EndRange,icu,EXTERNAL)
+
+endif
+
+# vim: set noet sw=4 ts=4:
diff --git a/external/icu/Makefile b/external/icu/Makefile
new file mode 100644
index 0000000000..e4968cf85f
--- /dev/null
+++ b/external/icu/Makefile
@@ -0,0 +1,7 @@
+# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
+
+module_directory:=$(dir $(realpath $(firstword $(MAKEFILE_LIST))))
+# module_directory is the realpath-resolved directory of the first makefile read; partial_build.mk is included from solenv/gbuild two levels above it.
+include $(module_directory)/../../solenv/gbuild/partial_build.mk
+
+# vim: set noet sw=4 ts=4:
diff --git a/external/icu/Module_icu.mk b/external/icu/Module_icu.mk
new file mode 100644
index 0000000000..5c99b930fc
--- /dev/null
+++ b/external/icu/Module_icu.mk
@@ -0,0 +1,19 @@
+# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
+#
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+$(eval $(call gb_Module_Module,icu))
+# Targets that make up module "icu"; each name matches a <name>.mk file in external/icu.
+$(eval $(call gb_Module_add_targets,icu,\
+ UnpackedTarball_icu \
+ ExternalPackage_icu \
+ ExternalPackage_icu_ure \
+ ExternalProject_icu \
+))
+
+# vim: set noet sw=4 ts=4:
diff --git a/external/icu/README b/external/icu/README
new file mode 100644
index 0000000000..23cf5f0524
--- /dev/null
+++ b/external/icu/README
@@ -0,0 +1 @@
+Library providing Unicode support, from [https://icu.unicode.org/].
diff --git a/external/icu/UnpackedTarball_icu.mk b/external/icu/UnpackedTarball_icu.mk
new file mode 100644
index 0000000000..655614447d
--- /dev/null
+++ b/external/icu/UnpackedTarball_icu.mk
@@ -0,0 +1,47 @@
+# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
+#
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+$(eval $(call gb_UnpackedTarball_UnpackedTarball,icu))
+
+$(eval $(call gb_UnpackedTarball_set_tarball,icu,$(ICU_TARBALL)))
+
+$(eval $(call gb_UnpackedTarball_update_autoconf_configs,icu,source))
+
+# Data zip contains data/... and needs to end up in icu/source/data/...
+# Only data/misc/icudata.rc is needed for a Cygwin/MSVC build.
+$(eval $(call gb_UnpackedTarball_set_pre_action,icu,\
+ unzip -q -d source -o $(gb_UnpackedTarget_TARFILE_LOCATION)/$(ICU_DATA_TARBALL) data/misc/icudata.rc \
+))
+# Default patch level is 0. NOTE(review): the *.patch.1 files use paths with one extra leading component (icu/...); presumably the ".1" suffix selects -p1 for them - confirm in gbuild.
+$(eval $(call gb_UnpackedTarball_set_patchlevel,icu,0))
+# The last two list entries are OS-dependent: Android gets icu4c-android.patch.1 in place of icu4c-rpath.patch.1, and icu4c-icudata-stdlibs.patch.1 is listed on every OS except Android.
+$(eval $(call gb_UnpackedTarball_add_patches,icu,\
+ external/icu/icu4c-build.patch.1 \
+ external/icu/icu4c-warnings.patch.1 \
+ external/icu/icu4c-macosx.patch.1 \
+ external/icu/icu4c-solarisgcc.patch.1 \
+ external/icu/icu4c-mkdir.patch.1 \
+ external/icu/icu4c-ubsan.patch.1 \
+ external/icu/icu4c-scriptrun.patch.1 \
+ external/icu/icu4c-rtti.patch.1 \
+ external/icu/icu4c-clang-cl.patch.1 \
+ external/icu/gcc9.patch \
+ external/icu/c++20-comparison.patch.1 \
+ external/icu/Wdeprecated-copy-dtor.patch \
+ external/icu/icu4c-windows-cygwin-cross.patch.1 \
+ external/icu/icu4c-emscripten-cross.patch.1 \
+ external/icu/icu4c-use-pkgdata-single-ccode-file-mode.patch.1 \
+ external/icu/icu4c-khmerbreakengine.patch.1 \
+ external/icu/icu4c-$(if $(filter ANDROID,$(OS)),android,rpath).patch.1 \
+ $(if $(filter-out ANDROID,$(OS)),external/icu/icu4c-icudata-stdlibs.patch.1) \
+))
+# Adds external/icu/khmerdict.dict to the unpacked tree as source/data/brkitr/khmerdict.dict.
+$(eval $(call gb_UnpackedTarball_add_file,icu,source/data/brkitr/khmerdict.dict,external/icu/khmerdict.dict))
+
+# vim: set noet sw=4 ts=4:
diff --git a/external/icu/Wdeprecated-copy-dtor.patch b/external/icu/Wdeprecated-copy-dtor.patch
new file mode 100644
index 0000000000..67078ef1bb
--- /dev/null
+++ b/external/icu/Wdeprecated-copy-dtor.patch
@@ -0,0 +1,25 @@
+--- source/common/unicode/uobject.h
++++ source/common/unicode/uobject.h
+@@ -245,10 +245,10 @@
+ // direct use of UObject itself
+
+ // default constructor
+- // inline UObject() {}
++ UObject() = default;
+
+ // copy constructor
+- // inline UObject(const UObject &other) {}
++ UObject(const UObject &other) = default;
+
+ #if 0
+ // TODO Sometime in the future. Implement operator==().
+@@ -280,8 +280,8 @@
+ * Subclasses need this assignment operator if they use compiler-provided
+ * assignment operators of their own. An alternative to not declaring one
+ * here would be to declare and empty-implement a protected or public one.
+- UObject &UObject::operator=(const UObject &);
+ */
++ UObject &operator=(const UObject &) = default;
+ };
+
+ #ifndef U_HIDE_INTERNAL_API
diff --git a/external/icu/c++20-comparison.patch.1 b/external/icu/c++20-comparison.patch.1
new file mode 100644
index 0000000000..fa10b048ce
--- /dev/null
+++ b/external/icu/c++20-comparison.patch.1
@@ -0,0 +1,82 @@
+diff -ur icu.org/source/i18n/unicode/rbtz.h icu/source/i18n/unicode/rbtz.h
+--- icu.org/source/i18n/unicode/rbtz.h 2022-10-19 02:53:21.000000000 +0200
++++ icu/source/i18n/unicode/rbtz.h 2022-10-24 22:20:10.889969185 +0200
+@@ -87,6 +87,7 @@
+ * @stable ICU 3.8
+ */
+ virtual bool operator!=(const TimeZone& that) const;
++ bool operator!=(const RuleBasedTimeZone& that) const {return !operator==(that);}
+
+ /**
+ * Adds the `TimeZoneRule` which represents time transitions.
+diff -ur icu.org/source/i18n/unicode/simpletz.h icu/source/i18n/unicode/simpletz.h
+--- icu.org/source/i18n/unicode/simpletz.h 2022-10-19 02:53:21.000000000 +0200
++++ icu/source/i18n/unicode/simpletz.h 2022-10-24 22:20:10.890969183 +0200
+@@ -112,6 +112,7 @@
+ * @stable ICU 2.0
+ */
+ virtual bool operator==(const TimeZone& that) const override;
++ bool operator!=(const SimpleTimeZone& that) const {return !operator==(that);}
+
+ /**
+ * Constructs a SimpleTimeZone with the given raw GMT offset and time zone ID,
+diff -ur icu.org/source/i18n/unicode/smpdtfmt.h icu/source/i18n/unicode/smpdtfmt.h
+--- icu.org/source/i18n/unicode/smpdtfmt.h 2022-10-19 02:53:21.000000000 +0200
++++ icu/source/i18n/unicode/smpdtfmt.h 2022-10-24 22:20:10.891969181 +0200
+@@ -877,6 +877,7 @@
+ * @stable ICU 2.0
+ */
+ virtual bool operator==(const Format& other) const override;
++ bool operator!=(const SimpleDateFormat& that) const {return !operator==(that);}
+
+
+ using DateFormat::format;
+diff -ur icu.org/source/i18n/unicode/stsearch.h icu/source/i18n/unicode/stsearch.h
+--- icu.org/source/i18n/unicode/stsearch.h 2022-10-19 02:53:21.000000000 +0200
++++ icu/source/i18n/unicode/stsearch.h 2022-10-24 22:20:10.892969178 +0200
+@@ -298,6 +298,7 @@
+ * @stable ICU 2.0
+ */
+ virtual bool operator==(const SearchIterator &that) const override;
++ bool operator!=(const StringSearch &that) const {return !operator==(that);}
+
+ // public get and set methods ----------------------------------------
+
+diff -ur icu.org/source/i18n/unicode/tzrule.h icu/source/i18n/unicode/tzrule.h
+--- icu.org/source/i18n/unicode/tzrule.h 2022-10-19 02:53:21.000000000 +0200
++++ icu/source/i18n/unicode/tzrule.h 2022-10-24 22:30:23.298744116 +0200
+@@ -257,6 +257,7 @@
+ * @stable ICU 3.8
+ */
+ virtual bool operator!=(const TimeZoneRule& that) const override;
++ bool operator!=(const InitialTimeZoneRule& that) const {return !operator==(that);}
+
+ /**
+ * Returns if this rule represents the same rule and offsets as another.
+@@ -454,6 +455,7 @@
+ * @stable ICU 3.8
+ */
+ virtual bool operator!=(const TimeZoneRule& that) const override;
++ bool operator!=(const AnnualTimeZoneRule& that) const {return !operator==(that);}
+
+ /**
+ * Gets the start date/time rule used by this rule.
+@@ -670,6 +672,7 @@
+ * @stable ICU 3.8
+ */
+ virtual bool operator!=(const TimeZoneRule& that) const override;
++ bool operator!=(const TimeArrayTimeZoneRule& that) const {return !operator==(that);}
+
+ /**
+ * Gets the time type of the start times used by this rule. The return value
+diff -ur icu.org/source/i18n/unicode/vtzone.h icu/source/i18n/unicode/vtzone.h
+--- icu.org/source/i18n/unicode/vtzone.h 2022-10-19 02:53:21.000000000 +0200
++++ icu/source/i18n/unicode/vtzone.h 2022-10-24 22:20:10.895969172 +0200
+@@ -83,6 +83,7 @@
+ * @stable ICU 3.8
+ */
+ virtual bool operator!=(const TimeZone& that) const;
++ bool operator!=(const VTimeZone& that) const {return !operator==(that);}
+
+ /**
+ * Create a <code>VTimeZone</code> instance by the time zone ID.
diff --git a/external/icu/cross-bin/icu-config b/external/icu/cross-bin/icu-config
new file mode 100755
index 0000000000..8ccf94f9bd
--- /dev/null
+++ b/external/icu/cross-bin/icu-config
@@ -0,0 +1,12 @@
+#!/bin/sh
+# Stub icu-config: answers only --version, --cppflags and --ldflags-searchpath, echoing a placeholder or the ICU_CFLAGS / ICU_LIBS environment variables; any other argument prints nothing.
+case $1 in
+--version)
+ echo whatever
+ ;;
+--cppflags)
+ echo ${ICU_CFLAGS}
+ ;;
+--ldflags-searchpath)
+ echo ${ICU_LIBS}
+esac
diff --git a/external/icu/gcc9.patch b/external/icu/gcc9.patch
new file mode 100644
index 0000000000..5c9808f8c3
--- /dev/null
+++ b/external/icu/gcc9.patch
@@ -0,0 +1,26 @@
+--- source/i18n/unicode/format.h
++++ source/i18n/unicode/format.h
+@@ -22,6 +22,13 @@
+
+ #ifndef FORMAT_H
+ #define FORMAT_H
++
++#ifdef __GNUC__
++#pragma GCC diagnostic push
++#pragma GCC diagnostic ignored "-Wpragmas" // for old GCC
++#pragma GCC diagnostic ignored "-Wunknown-warning-option" // for Clang
++#pragma GCC diagnostic ignored "-Wdeprecated-copy"
++#endif
+
+
+ #include "unicode/utypes.h"
+@@ -314,5 +314,9 @@
+
+ #endif /* U_SHOW_CPLUSPLUS_API */
+
++#ifdef __GNUC__
++#pragma GCC diagnostic pop
++#endif
++
+ #endif // _FORMAT
+ //eof
diff --git a/external/icu/icu4c-android.patch.1 b/external/icu/icu4c-android.patch.1
new file mode 100644
index 0000000000..9ba252b402
--- /dev/null
+++ b/external/icu/icu4c-android.patch.1
@@ -0,0 +1,75 @@
+diff -ur icu.org/source/common/unicode/platform.h icu/source/common/unicode/platform.h
+--- icu.org/source/common/unicode/platform.h 2021-10-28 18:04:57.000000000 +0200
++++ icu/source/common/unicode/platform.h 2021-11-15 21:03:11.474638494 +0100
+@@ -818,7 +818,7 @@
+ UPRV_HAS_DECLSPEC_ATTRIBUTE(__dllimport__))
+ # define U_EXPORT __declspec(dllexport)
+ #elif defined(__GNUC__)
+-# define U_EXPORT __attribute__((visibility("default")))
++# define U_EXPORT
+ #elif (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x550) \
+ || (defined(__SUNPRO_C) && __SUNPRO_C >= 0x550)
+ # define U_EXPORT __global
+diff -ur icu.org/source/config/mh-linux icu/source/config/mh-linux
+--- icu.org/source/config/mh-linux 2021-11-15 20:56:39.460705065 +0100
++++ icu/source/config/mh-linux 2021-11-15 21:03:11.474638494 +0100
+@@ -27,7 +27,7 @@
+
+ ## Compiler switch to embed a library name
+ # The initial tab in the next line is to prevent icu-config from reading it.
+- LD_SONAME = -Wl,-soname -Wl,$(notdir $(MIDDLE_SO_TARGET))
++ #LD_SONAME = -Wl,-soname -Wl,$(notdir $(MIDDLE_SO_TARGET))
+ #SH# # We can't depend on MIDDLE_SO_TARGET being set.
+ #SH# LD_SONAME=
+
+diff -ur icu.org/source/configure icu/source/configure
+--- icu.org/source/configure 2021-11-15 20:56:39.875703936 +0100
++++ icu/source/configure 2021-11-15 21:03:11.475638491 +0100
+@@ -5272,7 +5273,7 @@
+ else
+ icu_cv_host_frag=mh-linux-va
+ fi ;;
+-*-*-linux*|*-*-gnu|*-*-k*bsd*-gnu|*-*-kopensolaris*-gnu) icu_cv_host_frag=mh-linux ;;
++*-*-linux*|*-*-gnu|*-*-k*bsd*-gnu|*-*-kopensolaris*-gnu|*-*-*-androideabi*) icu_cv_host_frag=mh-linux ;;
+ i[34567]86-*-cygwin)
+ if test "$GCC" = yes; then
+ icu_cv_host_frag=mh-cygwin
+@@ -6472,6 +6466,10 @@
+ # Check to see if genccode can generate simple assembly.
+ GENCCODE_ASSEMBLY=
+ case "${host}" in
++arm-*-linux-androideabi)
++ if test "$GCC" = yes; then
++ GENCCODE_ASSEMBLY="-a gcc-android-arm"
++ fi ;;
+ *-linux*|*-kfreebsd*-gnu*|i*86-*-*bsd*|i*86-pc-gnu)
+ if test "$GCC" = yes; then
+ # We're using gcc, and the simple -a gcc command line works for genccode
+@@ -7594,6 +7592,10 @@
+ # wchar_t can be used
+ CHECK_UTF16_STRING_RESULT="available"
+ ;;
++*-*-*-androideabi|mips-unknown-linux-android)
++ # no UTF-16 strings thanks, I think, this is to avoid the -std=c++0x which causes trouble with uint64_t
++ CHECK_UTF16_STRING_RESULT="nope"
++ ;;
+ *)
+ ;;
+ esac
+diff -ur icu.org/source/i18n/decimfmt.cpp icu/source/i18n/decimfmt.cpp
+--- icu.org/source/i18n/decimfmt.cpp 2021-10-28 18:04:57.000000000 +0200
++++ icu/source/i18n/decimfmt.cpp 2021-11-15 21:03:11.476638489 +0100
+@@ -9,6 +9,13 @@
+ // Helpful in toString methods and elsewhere.
+ #define UNISTR_FROM_STRING_EXPLICIT
+
++#ifdef __ANDROID__
++#ifndef ARM
++#define ARM
++#endif
++#include <android/compatibility.hxx>
++#endif
++
+ #include <cmath>
+ #include <cstdlib>
+ #include <stdlib.h>
diff --git a/external/icu/icu4c-build.patch.1 b/external/icu/icu4c-build.patch.1
new file mode 100644
index 0000000000..a878de7323
--- /dev/null
+++ b/external/icu/icu4c-build.patch.1
@@ -0,0 +1,91 @@
+diff -ur icu.org/source/config/mh-darwin icu/source/config/mh-darwin
+--- icu.org/source/config/mh-darwin 2016-06-15 20:58:17.000000000 +0200
++++ icu/source/config/mh-darwin 2017-04-21 21:30:23.584568210 +0200
+@@ -30,11 +30,7 @@
+ SHLIB.cc= $(CXX) -dynamiclib -dynamic $(CXXFLAGS) $(LDFLAGS) $(LD_SOOPTIONS)
+
+ ## Compiler switches to embed a library name and version information
+-ifeq ($(ENABLE_RPATH),YES)
+-LD_SONAME = -Wl,-compatibility_version -Wl,$(SO_TARGET_VERSION_MAJOR) -Wl,-current_version -Wl,$(SO_TARGET_VERSION) -install_name $(libdir)/$(notdir $(MIDDLE_SO_TARGET))
+-else
+-LD_SONAME = -Wl,-compatibility_version -Wl,$(SO_TARGET_VERSION_MAJOR) -Wl,-current_version -Wl,$(SO_TARGET_VERSION) -install_name $(notdir $(MIDDLE_SO_TARGET)) $(PKGDATA_TRAILING_SPACE)
+-endif
++LD_SONAME = -Wl,-compatibility_version -Wl,$(SO_TARGET_VERSION_MAJOR) -Wl,-current_version -Wl,$(SO_TARGET_VERSION) -install_name @__________________________________________________URELIB/$(notdir $(MIDDLE_SO_TARGET))
+
+ ## Compiler switch to embed a runtime search path
+ LD_RPATH=
+@@ -50,10 +46,6 @@
+ ## Non-shared intermediate object suffix
+ STATIC_O = ao
+
+-## Override Versioned target for a shared library.
+-FINAL_SO_TARGET= $(basename $(SO_TARGET)).$(SO_TARGET_VERSION).$(SO)
+-MIDDLE_SO_TARGET= $(basename $(SO_TARGET)).$(SO_TARGET_VERSION_MAJOR).$(SO)
+-
+ ## Compilation and dependency rules
+ %.$(STATIC_O): $(srcdir)/%.c
+ $(call SILENT_COMPILE,$(strip $(COMPILE.c) $(STATICCPPFLAGS) $(STATICCFLAGS)) -MMD -MT "$*.d $*.o $*.$(STATIC_O)" -o $@ $<)
+@@ -67,16 +59,10 @@
+
+ ## Versioned libraries rules
+
+-%.$(SO_TARGET_VERSION_MAJOR).$(SO): %.$(SO_TARGET_VERSION).$(SO)
++%.$(SO).$(SO_TARGET_VERSION_MAJOR): %.$(SO).$(SO_TARGET_VERSION)
+ $(RM) $@ && ln -s ${<F} $@
+-%.$(SO): %.$(SO_TARGET_VERSION_MAJOR).$(SO)
+- $(RM) $@ && ln -s ${*F}.$(SO_TARGET_VERSION).$(SO) $@
+-
+-# tzcode option
+-TZORIG_EXTRA_CFLAGS=-DSTD_INSPIRED
+-
+-# genren opts
+-GENREN_PL_OPTS=-x Mach-O -n '-g' -p '| c++filt'
++%.$(SO): %.$(SO).$(SO_TARGET_VERSION_MAJOR)
++ $(RM) $@ && ln -s ${*F}.$(SO).$(SO_TARGET_VERSION) $@
+
+ ## Remove shared library 's'
+ STATIC_PREFIX_WHEN_USED =
+diff -ur icu.org/source/tools/toolutil/pkg_genc.cpp icu/source/tools/toolutil/pkg_genc.cpp
+--- icu.org/source/tools/toolutil/pkg_genc.cpp 2017-04-13 11:46:02.000000000 +0200
++++ icu/source/tools/toolutil/pkg_genc.cpp 2017-04-21 21:30:23.583568212 +0200
+@@ -160,6 +160,28 @@
+
+ ".long ","",HEX_0X
+ },
++ {"gcc-android-arm",
++ "\t.arch armv5te\n"
++ "\t.fpu softvfp\n"
++ "\t.eabi_attribute 20, 1\n"
++ "\t.eabi_attribute 21, 1\n"
++ "\t.eabi_attribute 23, 3\n"
++ "\t.eabi_attribute 24, 1\n"
++ "\t.eabi_attribute 25, 1\n"
++ "\t.eabi_attribute 26, 2\n"
++ "\t.eabi_attribute 30, 6\n"
++ "\t.eabi_attribute 18, 4\n"
++ "\t.file \"%s.s\"\n"
++ "\t.global %s\n"
++ "\t.section .rodata\n"
++ "\t.align 2\n"
++ "\t.type %s, %%object\n"
++ "%s:\n",
++
++ "\t.word ",
++ "\t.section .note.GNU-stack,\"\",%%progbits\n",
++ HEX_0X
++ },
+ /* 16 bytes alignment. */
+ /* http://docs.oracle.com/cd/E19641-01/802-1947/802-1947.pdf */
+ {"sun",
+diff -ur icu.org/source/tools/toolutil/pkg_genc.h icu/source/tools/toolutil/pkg_genc.h
+--- icu.org/source/tools/toolutil/pkg_genc.h 2017-01-20 01:20:31.000000000 +0100
++++ icu/source/tools/toolutil/pkg_genc.h 2017-04-21 21:30:23.582568215 +0200
+@@ -60,7 +60,7 @@
+ #endif
+
+ #define LARGE_BUFFER_MAX_SIZE 2048
+-#define SMALL_BUFFER_MAX_SIZE 512
++#define SMALL_BUFFER_MAX_SIZE 2048
+ #define SMALL_BUFFER_FLAG_NAMES 32
+ #define BUFFER_PADDING_SIZE 20
+
diff --git a/external/icu/icu4c-clang-cl.patch.1 b/external/icu/icu4c-clang-cl.patch.1
new file mode 100644
index 0000000000..a111a0df99
--- /dev/null
+++ b/external/icu/icu4c-clang-cl.patch.1
@@ -0,0 +1,28 @@
+diff -ur icu.org/source/config/mh-cygwin-msvc icu/source/config/mh-cygwin-msvc
+--- icu.org/source/config/mh-cygwin-msvc 2017-01-23 01:38:28.000000000 +0100
++++ icu/source/config/mh-cygwin-msvc 2017-04-21 23:07:28.482892025 +0200
+@@ -55,8 +55,8 @@
+ LDFLAGS+=-nologo
+
+ # Commands to compile
+-COMPILE.c= $(CC) $(CPPFLAGS) $(DEFS) $(CFLAGS) -c
+-COMPILE.cc= $(CXX) $(CPPFLAGS) $(DEFS) $(CXXFLAGS) -c
++COMPILE.c= true && $(CC) $(CPPFLAGS) $(DEFS) $(CFLAGS) -c
++COMPILE.cc= true && $(CXX) $(CPPFLAGS) $(DEFS) $(CXXFLAGS) -c
+
+ # Commands to link
+ LINK.c= LINK.EXE -subsystem:console $(LDFLAGS)
+diff -ur icu.org/source/runConfigureICU icu/source/runConfigureICU
+--- icu.org/source/runConfigureICU 2017-01-23 01:38:28.000000000 +0100
++++ icu/source/runConfigureICU 2017-04-21 23:07:28.482892025 +0200
+@@ -261,8 +261,8 @@
+ Cygwin/MSVC)
+ THE_OS="Windows with Cygwin"
+ THE_COMP="Microsoft Visual C++"
+- CC=cl; export CC
+- CXX=cl; export CXX
++ CC=${CC-cl}; export CC
++ CXX=${CXX-cl}; export CXX
+ RELEASE_CFLAGS='-Gy -MD'
+ RELEASE_CXXFLAGS='-Gy -MD'
+ DEBUG_CFLAGS='-FS -Zi -MDd'
diff --git a/external/icu/icu4c-emscripten-cross.patch.1 b/external/icu/icu4c-emscripten-cross.patch.1
new file mode 100644
index 0000000000..84c88a68a8
--- /dev/null
+++ b/external/icu/icu4c-emscripten-cross.patch.1
@@ -0,0 +1,99 @@
+--- icu/source/acinclude.m4.orig 2020-04-22 22:04:20.000000000 +0200
++++ icu/source/acinclude.m4 2020-11-04 06:10:29.993070072 +0100
+@@ -84,6 +84,7 @@
+ *-dec-osf*) icu_cv_host_frag=mh-alpha-osf ;;
+ *-*-nto*) icu_cv_host_frag=mh-qnx ;;
+ *-ncr-*) icu_cv_host_frag=mh-mpras ;;
++wasm*-*-emscripten*) icu_cv_host_frag=mh-emscripten ;;
+ *) icu_cv_host_frag=mh-unknown ;;
+ esac
+ ]
+--- /dev/null
++++ icu/source/config/mh-emscripten 2015-10-06 12:01:00.497972406 +0200
+@@ -0,0 +1,86 @@
++## Emscripten-specific setup
++## Copyright (c) 1999-2013, International Business Machines Corporation and
++## others. All Rights Reserved.
++## Commands to generate dependency files
++GEN_DEPS.c= $(CC) -E -MM $(DEFS) $(CPPFLAGS)
++GEN_DEPS.cc= $(CXX) -E -MM $(DEFS) $(CPPFLAGS) $(CXXFLAGS)
++
++## Flags for position independent code
++SHAREDLIBCFLAGS = -fPIC
++SHAREDLIBCXXFLAGS = -fPIC
++SHAREDLIBCPPFLAGS = -DPIC
++
++## Additional flags when building libraries and with threads
++THREADSCPPFLAGS = -D_REENTRANT
++LIBCPPFLAGS =
++
++## Compiler switch to embed a runtime search path
++LD_RPATH= -Wl,-zorigin,-rpath,'$$'ORIGIN
++LD_RPATH_PRE = -Wl,-rpath,
++
++## Force RPATH=$ORIGIN to locate own dependencies w/o need for LD_LIBRARY_PATH:
++ENABLE_RPATH=YES
++RPATHLDFLAGS=${LD_RPATH_PRE}'$$ORIGIN'
++
++## These are the library specific LDFLAGS
++#LDFLAGSICUDT=-nodefaultlibs -nostdlib
++# Debian change: linking icudata as data only causes too many problems.
++LDFLAGSICUDT=
++
++## Compiler switch to embed a library name
++# The initial tab in the next line is to prevent icu-config from reading it.
++ LD_SONAME = -Wl,-soname -Wl,$(notdir $(MIDDLE_SO_TARGET))
++#SH# # We can't depend on MIDDLE_SO_TARGET being set.
++#SH# LD_SONAME=
++
++## Shared library options
++LD_SOOPTIONS= -Wl,-Bsymbolic-functions
++
++## Shared object suffix
++SO = so
++## Non-shared intermediate object suffix
++STATIC_O = o
++
++## Compilation rules
++# WASM needs -pthread for atomics support
++%.$(STATIC_O): $(srcdir)/%.c
++ $(call SILENT_COMPILE,$(strip $(COMPILE.c) $(STATICCPPFLAGS) $(STATICCFLAGS)) -pthread -o $@ $<)
++
++%.$(STATIC_O): $(srcdir)/%.cpp
++ $(call SILENT_COMPILE,$(strip $(COMPILE.cc) $(STATICCPPFLAGS) $(STATICCXXFLAGS)) -pthread -o $@ $<)
++
++
++## Dependency rules
++%.d: $(srcdir)/%.c
++ $(call ICU_MSG,(deps)) $<
++ @$(SHELL) -ec '$(GEN_DEPS.c) $< \
++ | sed '\''s%\($*\)\.o[ :]*%\1.o $@ : %g'\'' > $@; \
++ [ -s $@ ] || rm -f $@'
++
++%.d: $(srcdir)/%.cpp
++ $(call ICU_MSG,(deps)) $<
++ @$(SHELL) -ec '$(GEN_DEPS.cc) $< \
++ | sed '\''s%\($*\)\.o[ :]*%\1.o $@ : %g'\'' > $@; \
++ [ -s $@ ] || rm -f $@'
++
++## Versioned libraries rules
++
++%.$(SO).$(SO_TARGET_VERSION_MAJOR): %.$(SO).$(SO_TARGET_VERSION)
++ $(RM) $@ && ln -s ${<F} $@
++%.$(SO): %.$(SO).$(SO_TARGET_VERSION_MAJOR)
++ $(RM) $@ && ln -s ${*F}.$(SO).$(SO_TARGET_VERSION) $@
++
++## Bind internal references
++
++# LDflags that pkgdata will use
++BIR_LDFLAGS= -Wl,-Bsymbolic
++
++# Dependencies [i.e. map files] for the final library
++BIR_DEPS=
++
++## Remove shared library 's'
++STATIC_PREFIX_WHEN_USED =
++STATIC_PREFIX =
++
++## without assembly
++PKGDATA_OPTS = -O $(top_builddir)/data/icupkg.inc -w
diff --git a/external/icu/icu4c-icudata-stdlibs.patch.1 b/external/icu/icu4c-icudata-stdlibs.patch.1
new file mode 100644
index 0000000000..c8d66c6ed0
--- /dev/null
+++ b/external/icu/icu4c-icudata-stdlibs.patch.1
@@ -0,0 +1,14 @@
+diff -ur icu.org/source/config/mh-linux icu/source/config/mh-linux
+--- icu.org/source/config/mh-linux 2017-04-21 23:09:57.588533707 +0200
++++ icu/source/config/mh-linux 2017-04-21 23:11:38.075292226 +0200
+@@ -27,7 +27,9 @@
+ RPATHLDFLAGS=${LD_RPATH_PRE}'$$ORIGIN'
+
+ ## These are the library specific LDFLAGS
+-LDFLAGSICUDT=-nodefaultlibs -nostdlib
++#LDFLAGSICUDT=-nodefaultlibs -nostdlib
++# Debian change: linking icudata as data only causes too many problems.
++LDFLAGSICUDT=
+
+ ## Compiler switch to embed a library name
+ # The initial tab in the next line is to prevent icu-config from reading it.
diff --git a/external/icu/icu4c-khmerbreakengine.patch.1 b/external/icu/icu4c-khmerbreakengine.patch.1
new file mode 100644
index 0000000000..605914014e
--- /dev/null
+++ b/external/icu/icu4c-khmerbreakengine.patch.1
@@ -0,0 +1,837 @@
+diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp
+--- icu.org/source/common/dictbe.cpp 2023-06-14 06:23:55.000000000 +0900
++++ icu/source/common/dictbe.cpp 2023-06-26 17:43:53.034173100 +0900
+@@ -35,7 +35,19 @@
+ ******************************************************************
+ */
+
+-DictionaryBreakEngine::DictionaryBreakEngine() {
++DictionaryBreakEngine::DictionaryBreakEngine()
++ : fTypes(0), clusterLimit(0) {
++}
++
++DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes)
++ : fTypes(breakTypes), clusterLimit(3) {
++ UErrorCode status = U_ZERO_ERROR;
++ fViramaSet.applyPattern(UnicodeString(u"[[:ccc=VR:]]"), status);
++
++ // note Skip Sets contain fIgnoreSet characters too.
++ fSkipStartSet.applyPattern(UnicodeString(u"[[:lb=OP:][:lb=QU:]\\u200C\\u200D\\u2060]"), status);
++ fSkipEndSet.applyPattern(UnicodeString(u"[[:lb=CP:][:lb=QU:][:lb=EX:][:lb=CL:]\\u200C\\u200D\\u2060]"), status);
++ fNBeforeSet.applyPattern(UnicodeString(u"[[:lb=CR:][:lb=LF:][:lb=NL:][:lb=SP:][:lb=ZW:][:lb=IS:][:lb=BA:][:lb=NS:]]"), status);
+ }
+
+ DictionaryBreakEngine::~DictionaryBreakEngine() {
+@@ -85,6 +97,169 @@
+ fSet.compact();
+ }
+
++bool
++DictionaryBreakEngine::scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const {
++ UErrorCode status = U_ZERO_ERROR;
++ UText* ut = utext_clone(NULL, text, false, true, &status);
++ utext_setNativeIndex(ut, start);
++ UChar32 c = utext_current32(ut);
++ bool res = false;
++ doBreak = true;
++ while (start >= 0) {
++ if (!fSkipStartSet.contains(c)) {
++ res = (c == ZWSP);
++ break;
++ }
++ --start;
++ c = utext_previous32(ut);
++ doBreak = false;
++ }
++ utext_close(ut);
++ return res;
++}
++
++bool
++DictionaryBreakEngine::scanAfterEnd(UText *text, int32_t textEnd, int32_t& end, bool &doBreak) const {
++ UErrorCode status = U_ZERO_ERROR;
++ UText* ut = utext_clone(NULL, text, false, true, &status);
++ utext_setNativeIndex(ut, end);
++ UChar32 c = utext_current32(ut);
++ bool res = false;
++ doBreak = !fNBeforeSet.contains(c);
++ while (end < textEnd) {
++ if (!fSkipEndSet.contains(c)) {
++ res = (c == ZWSP);
++ break;
++ }
++ ++end;
++ c = utext_next32(ut);
++ doBreak = false;
++ }
++ utext_close(ut);
++ return res;
++}
++
++void
++DictionaryBreakEngine::scanBackClusters(UText *text, int32_t textStart, int32_t& start) const {
++ UChar32 c = 0;
++ start = utext_getNativeIndex(text);
++ while (start > textStart) {
++ c = utext_previous32(text);
++ --start;
++ if (!fSkipEndSet.contains(c))
++ break;
++ }
++ for (int i = 0; i < clusterLimit; ++i) { // scan backwards clusterLimit clusters
++ while (start > textStart) {
++ while (fIgnoreSet.contains(c))
++ c = utext_previous32(text);
++ if (!fMarkSet.contains(c)) {
++ if (fBaseSet.contains(c)) {
++ c = utext_previous32(text);
++ if (!fViramaSet.contains(c)) { // No virama (e.g. coeng) precedes the base: cluster start found. Otherwise virama + base is treated as a mark
++ utext_next32(text);
++ c = utext_current32(text);
++ break;
++ } else {
++ --start;
++ }
++ } else {
++ break;
++ }
++ }
++ c = utext_previous32(text);
++ --start;
++ }
++ if (!fBaseSet.contains(c) || start < textStart) { // not a cluster start so finish
++ break;
++ }
++ c = utext_previous32(text);
++ --start; // go round again
++ } // ignore hitting previous inhibitor since scanning for it should have found us!
++ ++start; // counteract --before
++}
++
++void
++DictionaryBreakEngine::scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const {
++ UChar32 c = utext_current32(text);
++ end = utext_getNativeIndex(text);
++ while (end < textEnd) {
++ if (!fSkipStartSet.contains(c))
++ break;
++ utext_next32(text);
++ c = utext_current32(text);
++ ++end;
++ }
++ for (int i = 0; i < clusterLimit; ++i) { // scan forwards clusterLimit clusters
++ while (fIgnoreSet.contains(c)) {
++ utext_next32(text);
++ c = utext_current32(text);
++ }
++ if (fBaseSet.contains(c)) {
++ while (end < textEnd) {
++ utext_next32(text);
++ c = utext_current32(text);
++ ++end;
++ if (!fMarkSet.contains(c))
++ break;
++ else if (fViramaSet.contains(c)) { // handle coeng + base as mark
++ utext_next32(text);
++ c = utext_current32(text);
++ ++end;
++ if (!fBaseSet.contains(c))
++ break;
++ }
++ }
++ } else {
++ --end; // bad char so break after char before it
++ break;
++ }
++ }
++}
++
++bool
++DictionaryBreakEngine::scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const {
++ UErrorCode status = U_ZERO_ERROR;
++ UText* ut = utext_clone(NULL, text, false, true, &status);
++ int32_t nat = start;
++ utext_setNativeIndex(ut, nat);
++ bool foundFirst = true;
++ int32_t curr = start;
++ while (nat < end) {
++ UChar32 c = utext_current32(ut);
++ if (c == ZWSP || c == WJ) {
++ curr = nat + 1;
++ if (foundFirst) // only scan backwards for first inhibitor
++ scanBackClusters(ut, start, before);
++ foundFirst = false; // don't scan backwards if we go around again. Also marks found something
++
++ utext_next32(ut);
++ scanFwdClusters(ut, end, after);
++ nat = after + 1;
++
++ if (c == ZWSP || c == WJ) { // did we hit another one? NOTE(review): c is not re-read after the scan, so this is always true here
++ continue;
++ } else {
++ break;
++ }
++ }
++
++ ++nat; // keep hunting
++ utext_next32(ut);
++ }
++
++ utext_close(ut);
++
++ if (nat >= end && foundFirst) {
++ start = before = after = nat;
++ return false; // failed to find anything
++ }
++ else {
++ start = curr;
++ }
++ return true; // yup hit one
++}
++
+ /*
+ ******************************************************************
+ * PossibleWord
+@@ -114,7 +289,7 @@
+ ~PossibleWord() {}
+
+ // Fill the list of candidates if needed, select the longest, and return the number found
+- int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
++ int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet = NULL, int32_t minLength = 0 );
+
+ // Select the currently marked candidate, point after it in the text, and invalidate self
+ int32_t acceptMarked( UText *text );
+@@ -135,12 +310,12 @@
+ };
+
+
+-int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
++int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet, int32_t minLength) {
+ // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
+ int32_t start = (int32_t)utext_getNativeIndex(text);
+ if (start != offset) {
+ offset = start;
+- count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, nullptr, &prefix);
++ count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, nullptr, &prefix, ignoreSet, minLength);
+ // Dictionary leaves text after longest prefix, not longest word. Back up.
+ if (count <= 0) {
+ utext_setNativeIndex(text, start);
+@@ -814,53 +989,30 @@
+ * KhmerBreakEngine
+ */
+
+-// How many words in a row are "good enough"?
+-static const int32_t KHMER_LOOKAHEAD = 3;
+-
+-// Will not combine a non-word with a preceding dictionary word longer than this
+-static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
+-
+-// Will not combine a non-word that shares at least this much prefix with a
+-// dictionary word, with a preceding word
+-static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
+-
+-// Minimum word size
+-static const int32_t KHMER_MIN_WORD = 2;
+-
+-// Minimum number of characters for two words
+-static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
+-
+ KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
+- : DictionaryBreakEngine(),
++ : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
+ fDictionary(adoptDictionary)
+ {
+ UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
+ UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
+- UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status);
++
++ clusterLimit = 3;
++
++ UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]\\u2060\\u200C\\u200D]"), status);
+ if (U_SUCCESS(status)) {
+ setCharacters(khmerWordSet);
+ }
+ fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
+- fMarkSet.add(0x0020);
+- fEndWordSet = khmerWordSet;
+- fBeginWordSet.add(0x1780, 0x17B3);
+- //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels
+- //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word
+- //fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word
+- fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters
+- //fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels
+-// fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
+-// fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
+-// fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
+-// fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
+-// fSuffixSet.add(THAI_PAIYANNOI);
+-// fSuffixSet.add(THAI_MAIYAMOK);
++ fIgnoreSet.add(0x2060); // WJ
++ fIgnoreSet.add(0x200C, 0x200D); // ZWNJ, ZWJ
++ fBaseSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:lb=SA:]&[:^M:]]"), status);
++ fPuncSet.applyPattern(UnicodeString(u"[\\u17D4\\u17D5\\u17D6\\u17D7\\u17D9:]"), status);
+
+ // Compact for caching.
+ fMarkSet.compact();
+- fEndWordSet.compact();
+- fBeginWordSet.compact();
+-// fSuffixSet.compact();
++ fIgnoreSet.compact();
++ fBaseSet.compact();
++ fPuncSet.compact();
+ UTRACE_EXIT_STATUS(status);
+ }
+
+@@ -876,175 +1028,205 @@
+ UBool /* isPhraseBreaking */,
+ UErrorCode& status ) const {
+ if (U_FAILURE(status)) return 0;
+- if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
+- return 0; // Not enough characters for two words
++ uint32_t wordsFound = foundBreaks.size();
++ int32_t before = 0;
++ int32_t after = 0;
++ int32_t finalBefore = 0;
++ int32_t initAfter = 0;
++ int32_t scanStart = rangeStart;
++ int32_t scanEnd = rangeEnd;
++
++ bool startZwsp = false;
++ bool breakStart = false;
++ bool breakEnd = false;
++
++ if (rangeStart > 0) {
++ --scanStart;
++ startZwsp = scanBeforeStart(text, scanStart, breakStart);
+ }
+
+- uint32_t wordsFound = 0;
+- int32_t cpWordLength = 0;
+- int32_t cuWordLength = 0;
+- int32_t current;
+- PossibleWord words[KHMER_LOOKAHEAD];
+-
+ utext_setNativeIndex(text, rangeStart);
++ scanFwdClusters(text, rangeEnd, initAfter);
++ bool endZwsp = scanAfterEnd(text, utext_nativeLength(text), scanEnd, breakEnd);
++ utext_setNativeIndex(text, rangeEnd - 1);
++ scanBackClusters(text, rangeStart, finalBefore);
++ if (finalBefore < initAfter) { // the whole run is tented so no breaks
++ if (breakStart || fTypes < UBRK_LINE)
++ foundBreaks.push(rangeStart, status);
++ if (breakEnd || fTypes < UBRK_LINE)
++ foundBreaks.push(rangeEnd, status);
++ return foundBreaks.size() - wordsFound;
++ }
+
+- while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
+- cuWordLength = 0;
+- cpWordLength = 0;
+-
+- // Look for candidate words at the current position
+- int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
+-
+- // If we found exactly one, use that
+- if (candidates == 1) {
+- cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
+- cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
+- wordsFound += 1;
+- }
++ scanStart = rangeStart;
++ scanWJ(text, scanStart, rangeEnd, before, after);
++ if (startZwsp || initAfter >= before) {
++ after = initAfter;
++ before = 0;
++ }
++ if (!endZwsp && after > finalBefore && after < rangeEnd)
++ endZwsp = true;
++ if (endZwsp && before > finalBefore)
++ before = finalBefore;
+
+- // If there was more than one, see which one can take us forward the most words
+- else if (candidates > 1) {
+- // If we're already at the end of the range, we're done
+- if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
+- goto foundBest;
+- }
+- do {
+- if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
+- // Followed by another dictionary word; mark first word as a good candidate
+- words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
++ utext_setNativeIndex(text, rangeStart);
++ int32_t numCodePts = rangeEnd - rangeStart;
++ // bestSnlp[i] is the snlp of the best segmentation of the first i
++ // code points in the range to be matched.
++ UVector32 bestSnlp(numCodePts + 1, status);
++ bestSnlp.addElement(0, status);
++ for(int32_t i = 1; i <= numCodePts; i++) {
++ bestSnlp.addElement(kuint32max, status);
++ }
+
+- // If we're already at the end of the range, we're done
+- if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
+- goto foundBest;
+- }
++ // prev[i] is the index of the last code point in the previous word in
++ // the best segmentation of the first i characters. Note negative implies
++ // that the code point is part of an unknown word.
++ UVector32 prev(numCodePts + 1, status);
++ for(int32_t i = 0; i <= numCodePts; i++) {
++ prev.addElement(kuint32max, status);
++ }
+
+- // See if any of the possible second words is followed by a third word
+- do {
+- // If we find a third word, stop right away
+- if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
+- words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
+- goto foundBest;
+- }
+- }
+- while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
+- }
++ const int32_t maxWordSize = 20;
++ UVector32 values(maxWordSize, status);
++ values.setSize(maxWordSize);
++ UVector32 lengths(maxWordSize, status);
++ lengths.setSize(maxWordSize);
++
++ // Dynamic programming to find the best segmentation.
++
++ // In outer loop, i is the code point index,
++ // ix is the corresponding string (code unit) index.
++ // They differ when the string contains supplementary characters.
++ int32_t ix = rangeStart;
++ for (int32_t i = 0; i < numCodePts; ++i, utext_setNativeIndex(text, ++ix)) {
++ if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
++ continue;
++ }
++
++ int32_t count;
++ count = fDictionary->matches(text, numCodePts - i, maxWordSize,
++ NULL, lengths.getBuffer(), values.getBuffer(), NULL, &fIgnoreSet, 2);
++ // Note: lengths is filled with code point lengths
++ // The NULL parameter is the ignored code unit lengths.
++
++ for (int32_t j = 0; j < count; j++) {
++ int32_t ln = lengths.elementAti(j);
++ if (ln + i >= numCodePts)
++ continue;
++ utext_setNativeIndex(text, ln+ix);
++ int32_t c = utext_current32(text);
++ if (fMarkSet.contains(c) || c == 0x17D2) { // Coeng
++ lengths.removeElementAt(j);
++ values.removeElementAt(j);
++ --j;
++ --count;
+ }
+- while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
+-foundBest:
+- cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
+- cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
+- wordsFound += 1;
+ }
+-
+- // We come here after having either found a word or not. We look ahead to the
+- // next word. If it's not a dictionary word, we will combine it with the word we
+- // just found (if there is one), but only if the preceding word does not exceed
+- // the threshold.
+- // The text iterator should now be positioned at the end of the word we found.
+- if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
+- // if it is a dictionary word, do nothing. If it isn't, then if there is
+- // no preceding word, or the non-word shares less than the minimum threshold
+- // of characters with a dictionary word, then scan to resynchronize
+- if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
+- && (cuWordLength == 0
+- || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
+- // Look for a plausible word boundary
+- int32_t remaining = rangeEnd - (current+cuWordLength);
+- UChar32 pc;
+- UChar32 uc;
+- int32_t chars = 0;
+- for (;;) {
+- int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
+- pc = utext_next32(text);
+- int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
+- chars += pcSize;
+- remaining -= pcSize;
+- if (remaining <= 0) {
++ if (count == 0) {
++ utext_setNativeIndex(text, ix);
++ int32_t c = utext_current32(text);
++ if (fPuncSet.contains(c) || fIgnoreSet.contains(c) || c == ZWSP) {
++ values.setElementAt(0, count);
++ lengths.setElementAt(1, count++);
++ } else if (fBaseSet.contains(c)) {
++ int32_t currix = utext_getNativeIndex(text);
++ do {
++ utext_next32(text);
++ c = utext_current32(text);
++ if (utext_getNativeIndex(text) >= rangeEnd)
+ break;
+- }
+- uc = utext_current32(text);
+- if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
+- // Maybe. See if it's in the dictionary.
+- int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
+- utext_setNativeIndex(text, current+cuWordLength+chars);
+- if (num_candidates > 0) {
++ if (c == 0x17D2) { // Coeng
++ utext_next32(text);
++ c = utext_current32(text);
++ if (!fBaseSet.contains(c) || utext_getNativeIndex(text) >= rangeEnd) {
+ break;
++ } else {
++ utext_next32(text);
++ c = utext_current32(text);
++ if (utext_getNativeIndex(text) >= rangeEnd)
++ break;
+ }
+ }
+- }
+-
+- // Bump the word count if there wasn't already one
+- if (cuWordLength <= 0) {
+- wordsFound += 1;
+- }
++ } while (fMarkSet.contains(c) || fIgnoreSet.contains(c));
++ values.setElementAt(BADSNLP, count);
++ lengths.setElementAt(utext_getNativeIndex(text) - currix, count++);
++ } else {
++ values.setElementAt(BADSNLP, count);
++ lengths.setElementAt(1, count++);
++ }
++ }
+
+- // Update the length with the passed-over characters
+- cuWordLength += chars;
++ for (int32_t j = 0; j < count; j++) {
++ uint32_t v = values.elementAti(j);
++ int32_t newSnlp = bestSnlp.elementAti(i) + v;
++ int32_t ln = lengths.elementAti(j);
++ utext_setNativeIndex(text, ln+ix);
++ int32_t c = utext_current32(text);
++ while ((fPuncSet.contains(c) || fIgnoreSet.contains(c)) && ln + i < numCodePts) {
++ ++ln;
++ utext_next32(text);
++ c = utext_current32(text);
+ }
+- else {
+- // Back up to where we were for next iteration
+- utext_setNativeIndex(text, current+cuWordLength);
++ int32_t ln_j_i = ln + i; // yes really i!
++ if (newSnlp < bestSnlp.elementAti(ln_j_i)) {
++ if (v == BADSNLP) {
++ int32_t p = prev.elementAti(i);
++ if (p < 0)
++ prev.setElementAt(p, ln_j_i);
++ else
++ prev.setElementAt(-i, ln_j_i);
++ }
++ else
++ prev.setElementAt(i, ln_j_i);
++ bestSnlp.setElementAt(newSnlp, ln_j_i);
+ }
+ }
+-
+- // Never stop before a combining mark.
+- int32_t currPos;
+- while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
+- utext_next32(text);
+- cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
++ }
++ // Start pushing the optimal offset index into t_boundary (t for tentative).
++ // prev[numCodePts] is guaranteed to be meaningful.
++ // We'll first push in the reverse order, i.e.,
++ // t_boundary[0] = numCodePts, and afterwards do a swap.
++ UVector32 t_boundary(numCodePts+1, status);
++
++ int32_t numBreaks = 0;
++ // No segmentation found, set boundary to end of range
++ while (numCodePts >= 0 && (uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
++ --numCodePts;
++ }
++ if (numCodePts < 0) {
++ t_boundary.addElement(numCodePts, status);
++ numBreaks++;
++ } else {
++ for (int32_t i = numCodePts; (uint32_t)i != kuint32max; i = prev.elementAti(i)) {
++ if (i < 0) i = -i;
++ t_boundary.addElement(i, status);
++ numBreaks++;
+ }
++ // U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
++ }
+
+- // Look ahead for possible suffixes if a dictionary word does not follow.
+- // We do this in code rather than using a rule so that the heuristic
+- // resynch continues to function. For example, one of the suffix characters
+- // could be a typo in the middle of a word.
+-// if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
+-// if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
+-// && fSuffixSet.contains(uc = utext_current32(text))) {
+-// if (uc == KHMER_PAIYANNOI) {
+-// if (!fSuffixSet.contains(utext_previous32(text))) {
+-// // Skip over previous end and PAIYANNOI
+-// utext_next32(text);
+-// utext_next32(text);
+-// wordLength += 1; // Add PAIYANNOI to word
+-// uc = utext_current32(text); // Fetch next character
+-// }
+-// else {
+-// // Restore prior position
+-// utext_next32(text);
+-// }
+-// }
+-// if (uc == KHMER_MAIYAMOK) {
+-// if (utext_previous32(text) != KHMER_MAIYAMOK) {
+-// // Skip over previous end and MAIYAMOK
+-// utext_next32(text);
+-// utext_next32(text);
+-// wordLength += 1; // Add MAIYAMOK to word
+-// }
+-// else {
+-// // Restore prior position
+-// utext_next32(text);
+-// }
+-// }
+-// }
+-// else {
+-// utext_setNativeIndex(text, current+wordLength);
+-// }
+-// }
+-
+- // Did we find a word on this iteration? If so, push it on the break stack
+- if (cuWordLength > 0) {
+- foundBreaks.push((current+cuWordLength), status);
++ // Now that we're done, convert positions in t_boundary[] (offsets from
++ // rangeStart) back to indices in the original input UText
++ // while reversing t_boundary and pushing values to foundBreaks.
++ for (int32_t i = numBreaks-1; i >= 0; i--) {
++ int32_t cpPos = t_boundary.elementAti(i);
++ if (cpPos == 0 && !breakStart && fTypes >= UBRK_LINE) continue;
++ int32_t utextPos = cpPos + rangeStart;
++ while (utextPos > after && scanWJ(text, utextPos, scanEnd, before, after));
++ if (utextPos < before) {
++ // Boundaries are added to foundBreaks output in ascending order.
++ U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos);
++ foundBreaks.push(utextPos, status);
+ }
+ }
+-
++
+ // Don't return a break for the end of the dictionary range if there is one there.
+- if (foundBreaks.peeki() >= rangeEnd) {
++ if (!breakEnd && fTypes >= UBRK_LINE && foundBreaks.peeki() >= rangeEnd) {
+ (void) foundBreaks.popi();
+- wordsFound -= 1;
+ }
+
+- return wordsFound;
++ return foundBreaks.size() - wordsFound;
+ }
+
+ #if !UCONFIG_NO_NORMALIZATION
+diff -ur icu.org/source/common/dictbe.h icu/source/common/dictbe.h
+--- icu.org/source/common/dictbe.h 2022-04-08 00:41:55.000000000 +0200
++++ icu/source/common/dictbe.h 2022-05-16 13:49:33.820459894 +0200
+@@ -35,7 +35,8 @@
+ * threads without synchronization.</p>
+ */
+ class DictionaryBreakEngine : public LanguageBreakEngine {
+- private:
++ protected:
++
+ /**
+ * The set of characters handled by this engine
+ * @internal
+@@ -43,14 +44,84 @@
+
+ UnicodeSet fSet;
+
++ const int32_t WJ = 0x2060;
++ const int32_t ZWSP = 0x200B;
++
++ /**
++ * The break types it was constructed with
++ * @internal
++ */
++ uint32_t fTypes;
++
++ /**
++ * A Unicode set of all viramas
++ * @internal
++ */
++ UnicodeSet fViramaSet;
++
++ /**
++ * A Unicode set of all base characters
++ * @internal
++ */
++ UnicodeSet fBaseSet;
++
++ /**
++ * A Unicode set of all marks
++ * @internal
++ */
++ UnicodeSet fMarkSet;
++
++ /**
++ * A Unicode set of all characters ignored in dictionary matching
++ * @internal
++ */
++ UnicodeSet fIgnoreSet;
++
++ /**
++ * A Unicode set of characters skipped over at the start of a range (opening punctuation, quotes, and ignored characters)
++ * @internal
++ */
++ UnicodeSet fSkipStartSet;
++
++ /**
++ * A Unicode set of characters skipped over at the end of a range (closing punctuation, quotes, exclamations, and ignored characters)
++ * @internal
++ */
++ UnicodeSet fSkipEndSet;
++
++ /**
++ * A Unicode set of all characters that should not be broken before
++ * @internal
++ */
++ UnicodeSet fNBeforeSet;
++
++ /**
++ * The number of clusters within which breaks are inhibited
++ * @internal
++ */
++ int32_t clusterLimit;
++
++ bool scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const;
++
++ bool scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const;
++ bool scanAfterEnd(UText *text, int32_t rangeEnd, int32_t& end, bool &doBreak) const;
++ void scanBackClusters(UText *text, int32_t textStart, int32_t& start) const;
++ void scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const;
++
+ public:
+
+ /**
+- * <p>Constructor </p>
++ * <p>Default constructor.</p>
++ *
+ */
+ DictionaryBreakEngine();
+
+ /**
++ * <p>Constructor with break types.</p>
++ */
++ explicit DictionaryBreakEngine(uint32_t breakTypes);
++
++ /**
+ * <p>Virtual destructor.</p>
+ */
+ virtual ~DictionaryBreakEngine();
+@@ -305,10 +376,12 @@
+ * @internal
+ */
+
+- UnicodeSet fEndWordSet;
+ UnicodeSet fBeginWordSet;
+- UnicodeSet fMarkSet;
+- DictionaryMatcher *fDictionary;
++ UnicodeSet fPuncSet;
++ DictionaryMatcher *fDictionary;
++
++ const uint32_t BADSNLP = 256 * 20;
++ const uint32_t kuint32max = 0x7FFFFFFF;
+
+ public:
+
+diff -ur icu.org/source/common/dictionarydata.cpp icu/source/common/dictionarydata.cpp
+--- icu.org/source/common/dictionarydata.cpp 2023-06-14 06:23:55.000000000 +0900
++++ icu/source/common/dictionarydata.cpp 2023-06-26 02:18:05.709454400 +0900
+@@ -44,7 +44,7 @@
+
+ int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
+ int32_t *lengths, int32_t *cpLengths, int32_t *values,
+- int32_t *prefix) const {
++ int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const {
+
+ UCharsTrie uct(characters);
+ int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
+@@ -55,7 +55,13 @@
+ UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
+ int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
+ codePointsMatched += 1;
++ if (ignoreSet != NULL && ignoreSet->contains(c)) {
++ continue;
++ }
+ if (USTRINGTRIE_HAS_VALUE(result)) {
++ if (codePointsMatched < minLength) {
++ continue;
++ }
+ if (wordCount < limit) {
+ if (values != nullptr) {
+ values[wordCount] = uct.getValue();
+@@ -112,7 +118,7 @@
+
+ int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
+ int32_t *lengths, int32_t *cpLengths, int32_t *values,
+- int32_t *prefix) const {
++ int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const {
+ BytesTrie bt(characters);
+ int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
+ int32_t wordCount = 0;
+@@ -122,7 +128,13 @@
+ UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
+ int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
+ codePointsMatched += 1;
++ if (ignoreSet != NULL && ignoreSet->contains(c)) {
++ continue;
++ }
+ if (USTRINGTRIE_HAS_VALUE(result)) {
++ if (codePointsMatched < minLength) {
++ continue;
++ }
+ if (wordCount < limit) {
+ if (values != nullptr) {
+ values[wordCount] = bt.getValue();
+
+diff -ur icu.org/source/common/dictionarydata.h icu/source/common/dictionarydata.h
+--- icu.org/source/common/dictionarydata.h 2023-06-14 06:23:55.000000000 +0900
++++ icu/source/common/dictionarydata.h 2023-06-26 17:43:53.097724900 +0900
+@@ -21,6 +21,7 @@
+ #include "unicode/utext.h"
+ #include "unicode/udata.h"
+ #include "udataswp.h"
++#include "unicode/uniset.h"
+ #include "unicode/uobject.h"
+ #include "unicode/ustringtrie.h"
+
+@@ -92,7 +93,7 @@
+ */
+ virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
+ int32_t *lengths, int32_t *cpLengths, int32_t *values,
+- int32_t *prefix) const = 0;
++ int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const = 0;
+
+ /** @return DictionaryData::TRIE_TYPE_XYZ */
+ virtual int32_t getType() const = 0;
+@@ -107,7 +108,7 @@
+ virtual ~UCharsDictionaryMatcher();
+ virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
+ int32_t *lengths, int32_t *cpLengths, int32_t *values,
+- int32_t *prefix) const override;
++ int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const override;
+ virtual int32_t getType() const override;
+ private:
+ const char16_t *characters;
+@@ -125,7 +126,7 @@
+ virtual ~BytesDictionaryMatcher();
+ virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
+ int32_t *lengths, int32_t *cpLengths, int32_t *values,
+- int32_t *prefix) const override;
++ int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const override;
+ virtual int32_t getType() const override;
+ private:
+ UChar32 transform(UChar32 c) const;
diff --git a/external/icu/icu4c-macosx.patch.1 b/external/icu/icu4c-macosx.patch.1
new file mode 100644
index 0000000000..fee08eb057
--- /dev/null
+++ b/external/icu/icu4c-macosx.patch.1
@@ -0,0 +1,20 @@
+diff -ur icu.org/source/common/putil.cpp icu/source/common/putil.cpp
+--- icu.org/source/common/putil.cpp 2017-04-10 16:22:16.000000000 +0200
++++ icu/source/common/putil.cpp 2017-04-21 22:14:09.940217733 +0200
+@@ -1198,8 +1198,16 @@
+ static const time_t decemberSolstice=1198332540; /*2007-12-22 06:09 UT*/
+
+ /* This probing will tell us when daylight savings occurs. */
++#if U_PLATFORM_IS_DARWIN_BASED
++ struct tm *tmp;
++ tmp = localtime(&juneSolstice);
++ juneSol = *tmp;
++ tmp = localtime(&decemberSolstice);
++ decemberSol = *tmp;
++#else
+ localtime_r(&juneSolstice, &juneSol);
+ localtime_r(&decemberSolstice, &decemberSol);
++#endif
+ if(decemberSol.tm_isdst > 0) {
+ daylightType = U_DAYLIGHT_DECEMBER;
+ } else if(juneSol.tm_isdst > 0) {
diff --git a/external/icu/icu4c-mkdir.patch.1 b/external/icu/icu4c-mkdir.patch.1
new file mode 100644
index 0000000000..0cdcf2b078
--- /dev/null
+++ b/external/icu/icu4c-mkdir.patch.1
@@ -0,0 +1,17 @@
+diff -ur icu.org/source/data/Makefile.in icu/source/data/Makefile.in
+--- icu.org/source/data/Makefile.in 2020-10-28 22:21:12.000000000 +0100
++++ icu/source/data/Makefile.in 2020-11-17 10:18:37.960032668 +0100
+@@ -239,6 +239,13 @@
+
+ ifeq ($(ENABLE_SO_VERSION_DATA),1)
+ ifeq ($(PKGDATA_MODE),dll)
++
++# This should be in the included rules.mk but that is generated empty by
++# configure because we have no data/locales/root.txt with prebuilt data/in/
++$(TMP_DIR)/dirs.timestamp:
++ $(MKINSTALLDIRS) $(OUTTMPDIR) $(TMP_DIR)
++ echo timestamp > $@
++
+ SO_VERSION_DATA = $(OUTTMPDIR)/icudata.res
+ $(SO_VERSION_DATA) : $(MISCSRCDIR)/icudata.rc | $(TMP_DIR)/dirs.timestamp
+ ifeq ($(MSYS_RC_MODE),1)
diff --git a/external/icu/icu4c-rpath.patch.1 b/external/icu/icu4c-rpath.patch.1
new file mode 100644
index 0000000000..35a5457780
--- /dev/null
+++ b/external/icu/icu4c-rpath.patch.1
@@ -0,0 +1,36 @@
+diff -ur icu.org/source/config/mh-linux icu/source/config/mh-linux
+--- icu.org/source/config/mh-linux 2016-06-15 20:58:17.000000000 +0200
++++ icu/source/config/mh-linux 2017-04-21 22:38:18.893927819 +0200
+@@ -22,6 +22,10 @@
+ LD_RPATH= -Wl,-zorigin,-rpath,'$$'ORIGIN
+ LD_RPATH_PRE = -Wl,-rpath,
+
++## Force RPATH=$ORIGIN to locate own dependencies w/o need for LD_LIBRARY_PATH:
++ENABLE_RPATH=YES
++RPATHLDFLAGS=${LD_RPATH_PRE}'$$ORIGIN'
++
+ ## These are the library specific LDFLAGS
+ LDFLAGSICUDT=-nodefaultlibs -nostdlib
+
+diff -ur icu.org/source/data/pkgdataMakefile.in icu/source/data/pkgdataMakefile.in
+--- icu.org/source/data/pkgdataMakefile.in 2016-06-15 20:58:17.000000000 +0200
++++ icu/source/data/pkgdataMakefile.in 2017-04-21 22:38:18.892927822 +0200
+@@ -18,6 +18,9 @@
+ MIDDLE_SO_TARGET=
+ PKGDATA_TRAILING_SPACE=" "
+
++# escape $ with \ when passing to echo; needed to preserve $ORIGIN
++SHLIB.c.shell := $(subst $$,\$$,$(SHLIB.c))
++
+ all : clean
+ @echo GENCCODE_ASSEMBLY_TYPE=$(GENCCODE_ASSEMBLY) >> $(OUTPUTFILE)
+ @echo SO=$(SO) >> $(OUTPUTFILE)
+@@ -26,7 +29,7 @@
+ @echo LIB_EXT_ORDER=$(FINAL_SO_TARGET) >> $(OUTPUTFILE)
+ @echo COMPILE="$(COMPILE.c)" >> $(OUTPUTFILE)
+ @echo LIBFLAGS="-I$(top_srcdir)/common -I$(top_builddir)/common $(SHAREDLIBCPPFLAGS) $(SHAREDLIBCFLAGS)" >> $(OUTPUTFILE)
+- @echo GENLIB="$(SHLIB.c)" >> $(OUTPUTFILE)
++ @echo GENLIB="$(SHLIB.c.shell)" >> $(OUTPUTFILE)
+ @echo LDICUDTFLAGS=$(LDFLAGSICUDT) >> $(OUTPUTFILE)
+ @echo LD_SONAME=$(LD_SONAME) >> $(OUTPUTFILE)
+ @echo RPATH_FLAGS=$(RPATH_FLAGS) >> $(OUTPUTFILE)
diff --git a/external/icu/icu4c-rtti.patch.1 b/external/icu/icu4c-rtti.patch.1
new file mode 100644
index 0000000000..c058c7f3c8
--- /dev/null
+++ b/external/icu/icu4c-rtti.patch.1
@@ -0,0 +1,12 @@
+diff -ur icu.org/source/config/mh-linux icu/source/config/mh-linux
+--- icu.org/source/config/mh-linux 2017-04-21 23:01:23.257769703 +0200
++++ icu/source/config/mh-linux 2017-04-21 23:03:23.166481552 +0200
+@@ -36,7 +36,7 @@
+ #SH# LD_SONAME=
+
+ ## Shared library options
+-LD_SOOPTIONS= -Wl,-Bsymbolic
++LD_SOOPTIONS= -Wl,-Bsymbolic-functions
+
+ ## Shared object suffix
+ SO = so
diff --git a/external/icu/icu4c-scriptrun.patch.1 b/external/icu/icu4c-scriptrun.patch.1
new file mode 100644
index 0000000000..f2f2cf9f3b
--- /dev/null
+++ b/external/icu/icu4c-scriptrun.patch.1
@@ -0,0 +1,60 @@
+diff -ur icu.org/source/extra/scrptrun/scrptrun.cpp icu/source/extra/scrptrun/scrptrun.cpp
+--- icu.org/source/extra/scrptrun/scrptrun.cpp 2017-01-20 01:20:31.000000000 +0100
++++ icu/source/extra/scrptrun/scrptrun.cpp 2017-04-21 22:59:31.708037770 +0200
+@@ -151,7 +151,11 @@
+ // characters above it on the stack will be poped.
+ if (pairIndex >= 0) {
+ if ((pairIndex & 1) == 0) {
+- parenStack[++parenSP].pairIndex = pairIndex;
++ ++parenSP;
++ int32_t nVecSize = parenStack.size();
++ if (parenSP == nVecSize)
++ parenStack.resize(nVecSize + 128);
++ parenStack[parenSP].pairIndex = pairIndex;
+ parenStack[parenSP].scriptCode = scriptCode;
+ } else if (parenSP >= 0) {
+ int32_t pi = pairIndex & ~1;
+@@ -185,7 +189,14 @@
+ // pop it from the stack
+ if (pairIndex >= 0 && (pairIndex & 1) != 0 && parenSP >= 0) {
+ parenSP -= 1;
+- startSP -= 1;
++ /* Decrement startSP only if it is >= 0;
++ decrementing it unnecessarily will lead to memory corruption
++ while processing the above while block,
++ e.g. when startSP = -4 and parenSP = -1.
++ */
++ if (startSP >= 0) {
++ startSP -= 1;
++ }
+ }
+ } else {
+ // if the run broke on a surrogate pair,
+diff -ur icu.org/source/extra/scrptrun/scrptrun.h icu/source/extra/scrptrun/scrptrun.h
+--- icu.org/source/extra/scrptrun/scrptrun.h 2017-01-20 01:20:31.000000000 +0100
++++ icu/source/extra/scrptrun/scrptrun.h 2017-04-21 22:59:31.708037770 +0200
+@@ -19,6 +19,7 @@
+ #include "unicode/utypes.h"
+ #include "unicode/uobject.h"
+ #include "unicode/uscript.h"
++#include <vector>
+
+ U_NAMESPACE_BEGIN
+
+@@ -81,7 +82,7 @@
+ int32_t scriptEnd;
+ UScriptCode scriptCode;
+
+- ParenStackEntry parenStack[128];
++ std::vector<ParenStackEntry> parenStack;
+ int32_t parenSP;
+
+ static int8_t highBit(int32_t value);
+@@ -135,6 +136,7 @@
+ scriptEnd = charStart;
+ scriptCode = USCRIPT_INVALID_CODE;
+ parenSP = -1;
++ parenStack.resize(128);
+ }
+
+ inline void ScriptRun::reset(int32_t start, int32_t length)
diff --git a/external/icu/icu4c-solarisgcc.patch.1 b/external/icu/icu4c-solarisgcc.patch.1
new file mode 100644
index 0000000000..6000ed0cb9
--- /dev/null
+++ b/external/icu/icu4c-solarisgcc.patch.1
@@ -0,0 +1,12 @@
+diff -ur icu.org/source/common/uposixdefs.h icu/source/common/uposixdefs.h
+--- icu.org/source/common/uposixdefs.h 2017-03-09 03:12:45.000000000 +0100
++++ icu/source/common/uposixdefs.h 2017-04-21 22:23:11.857926971 +0200
+@@ -54,7 +54,7 @@
+ *
+ * z/OS needs this definition for timeval and to get usleep.
+ */
+-#if !defined(_XOPEN_SOURCE_EXTENDED) && defined(__TOS_MVS__)
++#if !defined(_XOPEN_SOURCE_EXTENDED) && (defined(__TOS_MVS__) || defined(__IBMC__) || defined(__IBMCPP__))
+ # define _XOPEN_SOURCE_EXTENDED 1
+ #endif
+
diff --git a/external/icu/icu4c-ubsan.patch.1 b/external/icu/icu4c-ubsan.patch.1
new file mode 100644
index 0000000000..7b0c2efc92
--- /dev/null
+++ b/external/icu/icu4c-ubsan.patch.1
@@ -0,0 +1,14 @@
+diff -ur icu.org/source/common/ubidiimp.h icu/source/common/ubidiimp.h
+--- icu.org/source/common/ubidiimp.h 2019-10-03 13:16:41.000000000 +0200
++++ icu/source/common/ubidiimp.h 2019-10-28 19:08:13.533284618 +0100
+@@ -198,8 +198,8 @@
+ /* in a Run, logicalStart will get this bit set if the run level is odd */
+ #define INDEX_ODD_BIT (1UL<<31)
+
+-#define MAKE_INDEX_ODD_PAIR(index, level) ((index)|((int32_t)((level)&1)<<31))
+-#define ADD_ODD_BIT_FROM_LEVEL(x, level) ((x)|=((int32_t)((level)&1)<<31))
++#define MAKE_INDEX_ODD_PAIR(index, level) ((index)|((uint32_t)((level)&1)<<31))
++#define ADD_ODD_BIT_FROM_LEVEL(x, level) ((x)|=((uint32_t)((level)&1)<<31))
+ #define REMOVE_ODD_BIT(x) ((x)&=~INDEX_ODD_BIT)
+
+ #define GET_INDEX(x) ((x)&~INDEX_ODD_BIT)
diff --git a/external/icu/icu4c-use-pkgdata-single-ccode-file-mode.patch.1 b/external/icu/icu4c-use-pkgdata-single-ccode-file-mode.patch.1
new file mode 100644
index 0000000000..237e554b8a
--- /dev/null
+++ b/external/icu/icu4c-use-pkgdata-single-ccode-file-mode.patch.1
@@ -0,0 +1,12 @@
+--- icu/source/tools/toolutil/pkg_genc.h.orig 2022-01-11 06:02:29.694678787 +0100
++++ icu/source/tools/toolutil/pkg_genc.h 2022-01-11 06:02:41.602640965 +0100
+@@ -48,9 +48,7 @@
+ * the data to generate the final data library. This can
+ * increase the performance of the pkdata tool.
+ */
+-#if U_PLATFORM == U_PF_OS400
+ #define USE_SINGLE_CCODE_FILE
+-#endif
+
+ /* Need to fix the file seperator character when using MinGW. */
+ #if defined(WINDOWS_WITH_GNUC) || defined(USING_CYGWIN)
diff --git a/external/icu/icu4c-warnings.patch.1 b/external/icu/icu4c-warnings.patch.1
new file mode 100644
index 0000000000..d8df0e14e9
--- /dev/null
+++ b/external/icu/icu4c-warnings.patch.1
@@ -0,0 +1,11 @@
+diff -ur icu.org/source/common/unicode/utf16.h icu/source/common/unicode/utf16.h
+--- icu.org/source/common/unicode/utf16.h 2020-10-28 22:21:12.000000000 +0100
++++ icu/source/common/unicode/utf16.h 2020-11-16 19:31:03.356478154 +0100
+@@ -398,6 +398,7 @@
+ (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
+ } else /* c>0x10ffff or not enough space */ { \
+ (isError)=true; \
++ (void)(isError); \
+ } \
+ } UPRV_BLOCK_MACRO_END
+
diff --git a/external/icu/icu4c-windows-cygwin-cross.patch.1 b/external/icu/icu4c-windows-cygwin-cross.patch.1
new file mode 100644
index 0000000000..dd6b47c172
--- /dev/null
+++ b/external/icu/icu4c-windows-cygwin-cross.patch.1
@@ -0,0 +1,131 @@
+diff -ur icu.org/source/acinclude.m4 icu/source/acinclude.m4
+--- icu.org/source/acinclude.m4 2020-04-10 16:22:16.000000000 +0200
++++ icu/source/acinclude.m4 2020-04-21 22:14:09.940217733 +0200
+@@ -52,6 +52,12 @@
+ else
+ icu_cv_host_frag=mh-cygwin-msvc
+ fi ;;
++aarch64-*-cygwin)
++ if test "$GCC" = yes; then
++ icu_cv_host_frag=mh-cygwin64
++ else
++ icu_cv_host_frag=mh-cygwin-msvc
++ fi ;;
+ *-*-mingw*)
+ if test "$GCC" = yes; then
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+--- icu/source/configure.ac.orig 2020-04-22 22:04:20.000000000 +0200
++++ icu/source/configure.ac 2020-10-01 09:39:05.570900400 +0200
+@@ -213,23 +213,33 @@
+ [cross_buildroot="${withval}"],
+ [cross_buildroot=""])
+
++cross_mixed_buildroot="$cross_buildroot"
++cross_unix_buildroot="$cross_buildroot"
+ if test "X$cross_buildroot" = "X"; then
+ if test "$cross_compiling" = "yes"; then
+ AC_MSG_ERROR([Error! Cross compiling but no --with-cross-build option specified - please supply the path to an executable ICU's build root])
+ dnl '
+ fi
+ else
+- if test -f "${cross_buildroot}/config/icucross.mk"; then
++ case "${host}" in
++ *-*-cygwin*)
++ #M# -m isn't used because it doesn't work on Win98
++ cross_mixed_buildroot=$(cygpath -ad "$cross_buildroot" | tr '\\' '/')
++ cross_unix_buildroot=$(cygpath -au "$cross_buildroot")
++ ;;
++ esac
++ if test -f "${cross_mixed_buildroot}/config/icucross.mk"; then
+ AC_MSG_RESULT([Using cross buildroot: $cross_buildroot])
+ else
+- if test -d "${cross_buildroot}"; then
+- AC_MSG_ERROR([${cross_buildroot}/config/icucross.mk not found. Please build ICU in ${cross_buildroot} first.])
++ if test -d "${cross_mixed_buildroot}"; then
++ AC_MSG_ERROR([${cross_mixed_buildroot}/config/icucross.mk not found. Please build ICU in ${cross_mixed_buildroot} first.])
+ else
+- AC_MSG_ERROR([No such directory ${cross_buildroot} supplied as the argument to --with-cross-build. Use an absolute path.])
++ AC_MSG_ERROR([No such directory ${cross_mixed_buildroot} supplied as the argument to --with-cross-build. Use an absolute path.])
+ fi
+ fi
+ fi
+-AC_SUBST(cross_buildroot)
++AC_SUBST(cross_mixed_buildroot)
++AC_SUBST(cross_unix_buildroot)
+
+ # Check for doxygen to generate documentation
+ AC_PATH_PROG(DOXYGEN,doxygen,,$PATH:/usr/local/bin:/usr/bin)
+--- icu/source/test/testdata/Makefile.in.orig 2020-10-01 09:37:25.847888900 +0200
++++ icu/source/test/testdata/Makefile.in 2020-10-01 09:36:41.859996500 +0200
+@@ -82,7 +82,7 @@
+ # relative lib links from pkgdata are the same as for tmp
+ GENRBOPTS=-k
+ # use the cross root, in case we are cross compiling. Otherwise it is equal to top_builddir
+-TOOLDIR=$(cross_buildroot)/tools
++TOOLDIR=$(cross_mixed_buildroot)/tools
+ SRCDATADIR=$(top_srcdir)/data
+ UNICODEDATADIR=$(SRCDATADIR)/unidata
+ OUTDIR=$(top_builddir)/data/out
+--- icu/source/Makefile.in.orig 2020-04-22 22:04:20.000000000 +0200
++++ icu/source/Makefile.in 2020-10-01 09:29:36.642364000 +0200
+@@ -255,16 +255,16 @@
+ @(echo "CROSS_ICU_VERSION=$(VERSION)" ;\
+ echo "TOOLEXEEXT=$(EXEEXT)" \
+ ) > $@
+- @(echo 'TOOLBINDIR=$$(cross_buildroot)/bin' ;\
+- echo 'TOOLLIBDIR=$$(cross_buildroot)/lib' ;\
+- echo "INVOKE=$(LDLIBRARYPATH_ENVVAR)=$(LIBRARY_PATH_PREFIX)"'$$(TOOLLIBDIR):$$(cross_buildroot)/stubdata:$$(cross_buildroot)/tools/ctestfw:$$$$'"$(LDLIBRARYPATH_ENVVAR)" ;\
+- echo "PKGDATA_INVOKE=$(LDLIBRARYPATH_ENVVAR)=$(LIBRARY_PATH_PREFIX)"'$$(cross_buildroot)/stubdata:$$(cross_buildroot)/tools/ctestfw:$$(TOOLLIBDIR):$$$$'"$(LDLIBRARYPATH_ENVVAR) " ;\
++ @(echo 'TOOLBINDIR=$$(cross_mixed_buildroot)/bin' ;\
++ echo 'TOOLLIBDIR=$$(cross_mixed_buildroot)/lib' ;\
++ echo "INVOKE=$(LDLIBRARYPATH_ENVVAR)=$(LIBRARY_PATH_PREFIX)"'$$(cross_unix_buildroot)/lib:$$(cross_unix_buildroot)/stubdata:$$(cross_unix_buildroot)/tools/ctestfw:$$$$'"$(LDLIBRARYPATH_ENVVAR)" ;\
++ echo "PKGDATA_INVOKE=$(LDLIBRARYPATH_ENVVAR)=$(LIBRARY_PATH_PREFIX)"'$$(cross_unix_buildroot)/stubdata:$$(cross_unix_buildroot)/tools/ctestfw:$$(cross_unix_buildroot)/lib:$$$$'"$(LDLIBRARYPATH_ENVVAR) " ;\
+ echo ) >> $@
+
+ config/icucross.inc: $(top_builddir)/icudefs.mk $(top_builddir)/Makefile @platform_make_fragment@
+ @echo rebuilding $@
+- @(grep '^CURR_FULL_DIR' $(top_builddir)/icudefs.mk ; \
+- grep '^CURR_FULL_DIR' @platform_make_fragment@ || echo ""; \
++ @(grep '^CURR_FULL_DIR' @platform_make_fragment@ || echo ""; \
++ grep '^CURR_FULL_DIR' $(top_builddir)/icudefs.mk ; \
+ ) > $@
+
+ config/icu.pc: $(srcdir)/config/icu.pc.in
+--- icu/source/icudefs.mk.in.orig 2020-04-22 22:04:20.000000000 +0200
++++ icu/source/icudefs.mk.in 2020-10-01 09:35:54.418128800 +0200
+@@ -35,7 +35,8 @@
+ sysconfdir = @sysconfdir@
+ # controls the include of $(top_builddir)/icucross.mk at bottom of file
+ cross_compiling = @cross_compiling@
+-cross_buildroot = @cross_buildroot@
++cross_mixed_buildroot = @cross_mixed_buildroot@
++cross_unix_buildroot = @cross_unix_buildroot@
+
+ # Package information
+
+@@ -303,8 +304,8 @@
+ INSTALLED_INVOKE = $(LDLIBRARYPATH_ENVVAR)=$(libdir):$$$(LDLIBRARYPATH_ENVVAR)
+
+ # Current full path directory for cross compilation
+-ifneq ($(strip $(cross_buildroot)),)
+-include $(cross_buildroot)/config/icucross.inc
++ifneq ($(strip $(cross_mixed_buildroot)),)
++include $(cross_mixed_buildroot)/config/icucross.inc
+ endif
+
+ # Platform-specific setup
+@@ -323,10 +324,11 @@
+
+ # some imported things from the cross env
+ TOOLEXEEXT = $(EXEEXT)
+-ifneq ($(strip $(cross_buildroot)),)
+-include $(cross_buildroot)/config/icucross.mk
++ifneq ($(strip $(cross_mixed_buildroot)),)
++include $(cross_mixed_buildroot)/config/icucross.mk
+ else
+-cross_buildroot = $(top_builddir)
++cross_mixed_buildroot = $(top_builddir)
++cross_unix_buildroot = $(top_builddir)
+ endif
+
+ # for tests
diff --git a/external/icu/khmerdict.dict b/external/icu/khmerdict.dict
new file mode 100644
index 0000000000..52605b6546
--- /dev/null
+++ b/external/icu/khmerdict.dict
Binary files differ