28 files changed, 1878 insertions, 0 deletions
diff --git a/external/icu/ExternalPackage_icu.mk b/external/icu/ExternalPackage_icu.mk
new file mode 100644
index 0000000000..dcd4da2169
--- /dev/null
+++ b/external/icu/ExternalPackage_icu.mk
@@ -0,0 +1,42 @@
+# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
+#
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+# Full ICU version (MAJOR.MINOR, plus .MICRO when set) as it appears in versioned library file names.
+icu_VERSION := $(ICU_MAJOR).$(ICU_MINOR)$(if $(ICU_MICRO),.$(ICU_MICRO))
+
+$(eval $(call gb_ExternalPackage_ExternalPackage,icu,icu))
+$(eval $(call gb_ExternalPackage_use_external_project,icu,icu))
+
+# Nothing is added to the package when dynamic loading is disabled.
+ifneq ($(DISABLE_DYNLOADING),TRUE)
+ifeq ($(OS),WNT)
+# Windows: non-GCC builds insert "d" into the DLL name when MSVC_USE_DEBUG_RUNTIME is set.
+ifeq ($(COM),GCC)
+$(eval $(call gb_ExternalPackage_add_files,icu,$(LIBO_LIB_FOLDER),\
+	source/lib/icuin$(ICU_MAJOR).dll \
+))
+else
+$(eval $(call gb_ExternalPackage_add_files,icu,$(LIBO_LIB_FOLDER),\
+	source/lib/icuin$(if $(MSVC_USE_DEBUG_RUNTIME),d)$(ICU_MAJOR).dll \
+))
+endif # $(COM)
+
+else ifeq ($(OS),ANDROID)
+# Android: one unversioned library whose name carries the "lo" suffix.
+$(eval $(call gb_ExternalPackage_add_files,icu,$(LIBO_LIB_FOLDER),\
+	source/lib/libicui18nlo.so \
+))
+
+else # $(OS) != WNT/ANDROID
+# Elsewhere: the fully versioned file is (presumably dest first, source second) packaged under its major-only name.
+$(eval $(call gb_ExternalPackage_add_file,icu,$(LIBO_LIB_FOLDER)/libicui18n$(gb_Library_DLLEXT).$(ICU_MAJOR),source/lib/libicui18n$(gb_Library_DLLEXT).$(icu_VERSION)))
+endif # $(OS)
+endif # DISABLE_DYNLOADING
+
+# vim: set noet sw=4 ts=4:
diff --git a/external/icu/ExternalPackage_icu_ure.mk b/external/icu/ExternalPackage_icu_ure.mk
new file mode 100644
index 0000000000..fefe71afdc
--- /dev/null
+++ b/external/icu/ExternalPackage_icu_ure.mk
@@ -0,0 +1,48 @@
+# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
+#
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+# libxml2 is in URE and depends on icuuc*.dll on Windows; the i18nlangtag lib is
+# in URE and depends on the icuuc lib (which in turn depends on the icudata lib)
+# on all platforms:
+
+# Expanded at parse time by the add_file calls below, so define it here as well
+# rather than relying on ExternalPackage_icu.mk having been read first.
+icu_VERSION := $(ICU_MAJOR).$(ICU_MINOR)$(if $(ICU_MICRO),.$(ICU_MICRO))
+
+$(eval $(call gb_ExternalPackage_ExternalPackage,icu_ure,icu))
+$(eval $(call gb_ExternalPackage_use_external_project,icu_ure,icu))
+
+ifneq ($(DISABLE_DYNLOADING),TRUE)
+ifeq ($(OS),WNT)
+ifeq ($(COM),GCC)
+$(eval $(call gb_ExternalPackage_add_files,icu_ure,$(LIBO_URE_LIB_FOLDER),\
+	source/lib/icudt$(ICU_MAJOR).dll \
+	source/lib/icuuc$(ICU_MAJOR).dll \
+))
+else
+$(eval $(call gb_ExternalPackage_add_files,icu_ure,$(LIBO_URE_LIB_FOLDER),\
+	source/lib/icudt$(if $(MSVC_USE_DEBUG_RUNTIME),d)$(ICU_MAJOR).dll \
+	source/lib/icuuc$(if $(MSVC_USE_DEBUG_RUNTIME),d)$(ICU_MAJOR).dll \
+))
+endif # $(COM)
+
+else ifeq ($(OS),ANDROID)
+$(eval $(call gb_ExternalPackage_add_files,icu_ure,$(LIBO_URE_LIB_FOLDER),\
+	source/lib/libicudatalo.so \
+	source/lib/libicuuclo.so \
+))
+
+else # $(OS) != WNT/ANDROID
+$(eval $(call gb_ExternalPackage_add_file,icu_ure,$(LIBO_URE_LIB_FOLDER)/libicudata$(gb_Library_DLLEXT).$(ICU_MAJOR),source/lib/libicudata$(gb_Library_DLLEXT).$(icu_VERSION)))
+$(eval $(call gb_ExternalPackage_add_file,icu_ure,$(LIBO_URE_LIB_FOLDER)/libicuuc$(gb_Library_DLLEXT).$(ICU_MAJOR),source/lib/libicuuc$(gb_Library_DLLEXT).$(icu_VERSION)))
+
+endif # $(OS)
+endif # DISABLE_DYNLOADING
+
+# vim: set noet sw=4 ts=4:
diff --git a/external/icu/ExternalProject_icu.mk b/external/icu/ExternalProject_icu.mk
new file mode 100644
index 0000000000..5388eee589
--- /dev/null
+++ b/external/icu/ExternalProject_icu.mk
@@ -0,0 +1,100 @@
+# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
+#
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+$(eval $(call gb_ExternalProject_ExternalProject,icu))
+$(eval $(call gb_ExternalProject_register_targets,icu,\
+	build \
+))
+
+# HAVE_GCC_ATOMICS=1/0 per GCC_HAVE_BUILTIN_ATOMIC; used (quotes included) by the non-Windows configure.
+icu_CPPFLAGS:="-DHAVE_GCC_ATOMICS=$(if $(filter TRUE,$(GCC_HAVE_BUILTIN_ATOMIC)),1,0)"
+
+ifeq ($(OS),WNT)
+# Windows: autoconf, configure and make in one shell chain; gb_ICU_XFLAGS is exported first so CFLAGS and CXXFLAGS share it.
+$(call gb_ExternalProject_get_state_target,icu,build) :
+	$(call gb_Trace_StartRange,icu,EXTERNAL)
+	$(call gb_ExternalProject_run,build,\
+		autoconf -f \
+		&& export LIB="$(ILIB)" PYTHONWARNINGS="default" \
+			gb_ICU_XFLAGS="-FS $(SOLARINC) $(gb_DEBUGINFO_FLAGS) $(if $(MSVC_USE_DEBUG_RUNTIME),-MDd,-MD -Gy)" \
+		&& CFLAGS="$${gb_ICU_XFLAGS}" CPPFLAGS="$(SOLARINC)" CXXFLAGS="$${gb_ICU_XFLAGS}" \
+			INSTALL=`cygpath -m /usr/bin/install` $(if $(MSVC_USE_DEBUG_RUNTIME),LDFLAGS="-DEBUG") \
+			$(gb_RUN_CONFIGURE) ./configure \
+				$(if $(MSVC_USE_DEBUG_RUNTIME),--enable-debug --disable-release) \
+				$(gb_CONFIGURE_PLATFORMS) \
+				$(if $(CROSS_COMPILING), \
+					--with-cross-build=$(WORKDIR_FOR_BUILD)/UnpackedTarball/icu/source \
+					--disable-tools --disable-extras) \
+		&& $(MAKE) $(if $(CROSS_COMPILING),DATASUBDIR=data) $(if $(verbose),VERBOSE=1) \
+	,source)
+	$(call gb_Trace_EndRange,icu,EXTERNAL)
+
+else # $(OS)
+# Flags for ICU's configure; each value includes its own double quotes so the shell sees one word.
+icu_CFLAGS:="$(CFLAGS) \
+	$(if $(filter iOS,$(OS)),-DUCONFIG_NO_FILE_IO) \
+	$(if $(SYSBASE),-I$(SYSBASE)/usr/include) \
+	$(call gb_ExternalProject_get_build_flags,icu) \
+	$(if $(ENABLE_LTO),$(gb_LTOFLAGS)) \
+	$(if $(filter GCC,$(COM)),-fno-strict-aliasing) \
+	$(if $(filter FUZZERS,$(BUILD_TYPE)),-DU_USE_STRTOD_L=0) \
+	$(if $(filter ANDROID,$(OS)),-fvisibility=hidden -fno-omit-frame-pointer)"
+icu_CXXFLAGS:="$(CXXFLAGS) $(CXXFLAGS_CXX11) \
+	$(if $(filter iOS,$(OS)),-DUCONFIG_NO_FILE_IO) \
+	$(call gb_ExternalProject_get_build_flags,icu) \
+	$(if $(ENABLE_LTO),$(gb_LTOFLAGS)) \
+	$(if $(filter GCC,$(COM)),-fno-strict-aliasing) \
+	$(if $(filter FUZZERS,$(BUILD_TYPE)),-DU_USE_STRTOD_L=0) \
+	$(if $(filter ANDROID,$(OS)),-fvisibility=hidden -fno-omit-frame-pointer -I$(SRCDIR)/include)"
+icu_LDFLAGS:=" \
+	$(if $(ENABLE_LTO),$(gb_LTOFLAGS)) \
+	$(call gb_ExternalProject_get_link_flags,icu) \
+	$(if $(filter TRUE,$(HAVE_LD_HASH_STYLE)),-Wl$(COMMA)--hash-style=$(WITH_LINKER_HASH_STYLE)) \
+	$(if $(SYSBASE),-L../lib -L../../lib -L../stubdata -L../../stubdata -L$(SYSBASE)/usr/lib) \
+	$(if $(filter TRUE,$(HAVE_LD_BSYMBOLIC_FUNCTIONS)), -Wl$(COMMA)-Bsymbolic-functions) \
+	$(if $(filter ANDROID,$(OS)),$(gb_STDLIBS))"
+
+# When cross-compiling, pass DATASUBDIR=data to make: --disable-tools makes
+# icu/source/Makefile.in skip the data directory entirely, so the requested data
+# library is not built unless that subdirectory is added back to the build list.
+$(call gb_ExternalProject_get_state_target,icu,build) :
+	$(call gb_Trace_StartRange,icu,EXTERNAL)
+	$(call gb_ExternalProject_run,build,\
+		autoconf -f && \
+		CPPFLAGS=$(icu_CPPFLAGS) CFLAGS=$(icu_CFLAGS) \
+		CXXFLAGS=$(icu_CXXFLAGS) LDFLAGS=$(icu_LDFLAGS) \
+		PYTHONWARNINGS="default" \
+		$(gb_RUN_CONFIGURE) ./configure \
+			--disable-layout --disable-samples \
+			$(if $(filter FUZZERS,$(BUILD_TYPE)),--disable-release) \
+			$(if $(filter EMSCRIPTEN ANDROID,$(OS)),--disable-strict ac_cv_c_bigendian=no) \
+			$(if $(filter SOLARIS,$(OS)),--disable-64bit-libs) \
+			$(if $(filter TRUE,$(DISABLE_DYNLOADING)),\
+				--with-data-packaging=static --enable-static --disable-shared --disable-dyload,\
+				--disable-static --enable-shared $(if $(filter ANDROID,$(OS)),--with-library-suffix=lo)) \
+			$(gb_CONFIGURE_PLATFORMS) \
+			$(if $(CROSS_COMPILING), \
+				--with-cross-build=$(WORKDIR_FOR_BUILD)/UnpackedTarball/icu/source \
+				--disable-tools --disable-extras) \
+			AR="$(AR)" RANLIB="$(RANLIB)" \
+		&& $(MAKE) $(if $(CROSS_COMPILING),DATASUBDIR=data) $(if $(verbose),VERBOSE=1) \
+		$(if $(filter MACOSX,$(OS)), \
+			&& $(PERL) $(SRCDIR)/solenv/bin/macosx-change-install-names.pl shl \
+				URELIB \
+				$(EXTERNAL_WORKDIR)/source/lib/libicuuc$(gb_Library_DLLEXT).$(icu_VERSION) \
+				$(EXTERNAL_WORKDIR)/source/lib/libicui18n$(gb_Library_DLLEXT).$(icu_VERSION) \
+			&& $(PERL) $(SRCDIR)/solenv/bin/macosx-change-install-names.pl shl \
+				OOO \
+				$(EXTERNAL_WORKDIR)/source/lib/libicudata$(gb_Library_DLLEXT).$(icu_VERSION)) \
+	,source)
+	$(call gb_Trace_EndRange,icu,EXTERNAL)
+
+endif
+
+# vim: set noet sw=4 ts=4:
diff --git a/external/icu/Makefile b/external/icu/Makefile
new file mode 100644
index 0000000000..e4968cf85f
--- /dev/null
+++ b/external/icu/Makefile
@@ -0,0 +1,7 @@
+# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
+
+module_directory:=$(dir $(realpath $(firstword $(MAKEFILE_LIST))))
+# Include gbuild's partial_build.mk, located relative to this Makefile's (symlink-resolved) directory.
+include $(module_directory)/../../solenv/gbuild/partial_build.mk
+
+# vim: set noet sw=4 ts=4:
diff --git a/external/icu/Module_icu.mk b/external/icu/Module_icu.mk
new file mode 100644
index 0000000000..5c99b930fc
--- /dev/null
+++ b/external/icu/Module_icu.mk
@@ -0,0 +1,19 @@
+# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
+#
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+# Declare the icu module and list the gbuild targets (the sibling .mk files) that belong to it.
+$(eval $(call gb_Module_Module,icu))
+$(eval $(call gb_Module_add_targets,icu,\
+	UnpackedTarball_icu \
+	ExternalPackage_icu \
+	ExternalPackage_icu_ure \
+	ExternalProject_icu \
+))
+
+# vim: set noet sw=4 ts=4:
diff --git a/external/icu/README b/external/icu/README
new file mode 100644
index 0000000000..23cf5f0524
--- /dev/null
+++ b/external/icu/README
@@ -0,0 +1 @@
+Library providing Unicode support, from [https://icu.unicode.org/].
diff --git a/external/icu/UnpackedTarball_icu.mk b/external/icu/UnpackedTarball_icu.mk
new file mode 100644
index 0000000000..655614447d
--- /dev/null
+++ b/external/icu/UnpackedTarball_icu.mk
@@ -0,0 +1,47 @@
+# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
+#
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+$(eval $(call gb_UnpackedTarball_UnpackedTarball,icu))
+$(eval $(call gb_UnpackedTarball_set_tarball,icu,$(ICU_TARBALL)))
+$(eval $(call gb_UnpackedTarball_update_autoconf_configs,icu,source))
+
+# The data zip contains data/...; it is unpacked into source/ so that it ends up in icu/source/data/...
+# Only data/misc/icudata.rc is extracted; it is needed for a Cygwin/MSVC build.
+$(eval $(call gb_UnpackedTarball_set_pre_action,icu,\
+	unzip -q -d source -o $(gb_UnpackedTarget_TARFILE_LOCATION)/$(ICU_DATA_TARBALL) data/misc/icudata.rc \
+))
+
+# Default strip level 0; NOTE(review): *.patch.1 files presumably get -p1 from their suffix - confirm.
+$(eval $(call gb_UnpackedTarball_set_patchlevel,icu,0))
+# Android gets icu4c-android.patch.1; every other OS gets icu4c-rpath.patch.1 plus icu4c-icudata-stdlibs.patch.1.
+$(eval $(call gb_UnpackedTarball_add_patches,icu,\
+	external/icu/icu4c-build.patch.1 \
+	external/icu/icu4c-warnings.patch.1 \
+	external/icu/icu4c-macosx.patch.1 \
+	external/icu/icu4c-solarisgcc.patch.1 \
+	external/icu/icu4c-mkdir.patch.1 \
+	external/icu/icu4c-ubsan.patch.1 \
+	external/icu/icu4c-scriptrun.patch.1 \
+	external/icu/icu4c-rtti.patch.1 \
+	external/icu/icu4c-clang-cl.patch.1 \
+	external/icu/gcc9.patch \
+	external/icu/c++20-comparison.patch.1 \
+	external/icu/Wdeprecated-copy-dtor.patch \
+	external/icu/icu4c-windows-cygwin-cross.patch.1 \
+	external/icu/icu4c-emscripten-cross.patch.1 \
+	external/icu/icu4c-use-pkgdata-single-ccode-file-mode.patch.1 \
+	external/icu/icu4c-khmerbreakengine.patch.1 \
+	external/icu/icu4c-$(if $(filter ANDROID,$(OS)),android,rpath).patch.1 \
+	$(if $(filter-out ANDROID,$(OS)),external/icu/icu4c-icudata-stdlibs.patch.1) \
+))
+
+# Adds the project's own external/icu/khmerdict.dict to the unpacked tree at source/data/brkitr/.
+$(eval $(call gb_UnpackedTarball_add_file,icu,source/data/brkitr/khmerdict.dict,external/icu/khmerdict.dict))
+
+# vim: set noet sw=4 ts=4:
diff --git a/external/icu/Wdeprecated-copy-dtor.patch b/external/icu/Wdeprecated-copy-dtor.patch
new file mode 100644
index 0000000000..67078ef1bb
--- /dev/null
+++ b/external/icu/Wdeprecated-copy-dtor.patch
@@ -0,0 +1,25 @@
+--- source/common/unicode/uobject.h
++++ source/common/unicode/uobject.h
+@@ -245,10 +245,10 @@
+     // direct use of UObject itself
+ 
+     // default constructor
+-    // inline UObject() {}
++    UObject() = default;
+ 
+     // copy constructor
+-    // inline UObject(const UObject &other) {}
++    UObject(const UObject &other) = default;
+ 
+ #if 0
+     // TODO Sometime in the future. Implement operator==().
+@@ -280,8 +280,8 @@
+      * Subclasses need this assignment operator if they use compiler-provided
+      * assignment operators of their own. An alternative to not declaring one
+      * here would be to declare and empty-implement a protected or public one.
+-    UObject &UObject::operator=(const UObject &);
+      */
++    UObject &operator=(const UObject &) = default;
+ };
+ 
+ #ifndef U_HIDE_INTERNAL_API
diff --git a/external/icu/c++20-comparison.patch.1 b/external/icu/c++20-comparison.patch.1
new file mode 100644
index 0000000000..fa10b048ce
--- /dev/null
+++ b/external/icu/c++20-comparison.patch.1
@@ -0,0 +1,82 @@
+diff -ur icu.org/source/i18n/unicode/rbtz.h icu/source/i18n/unicode/rbtz.h
+--- icu.org/source/i18n/unicode/rbtz.h	2022-10-19 02:53:21.000000000 +0200
++++ icu/source/i18n/unicode/rbtz.h	2022-10-24 22:20:10.889969185 +0200
+@@ -87,6 +87,7 @@
+      * @stable ICU 3.8
+      */
+     virtual bool operator!=(const TimeZone& that) const;
++    bool operator!=(const RuleBasedTimeZone& that) const {return !operator==(that);}
+ 
+     /**
+      * Adds the `TimeZoneRule` which represents time transitions.
+diff -ur icu.org/source/i18n/unicode/simpletz.h icu/source/i18n/unicode/simpletz.h
+--- icu.org/source/i18n/unicode/simpletz.h	2022-10-19 02:53:21.000000000 +0200
++++ icu/source/i18n/unicode/simpletz.h	2022-10-24 22:20:10.890969183 +0200
+@@ -112,6 +112,7 @@
+      * @stable ICU 2.0
+      */
+     virtual bool operator==(const TimeZone& that) const override;
++    bool operator!=(const SimpleTimeZone& that) const {return !operator==(that);}
+ 
+     /**
+      * Constructs a SimpleTimeZone with the given raw GMT offset and time zone ID,
+diff -ur icu.org/source/i18n/unicode/smpdtfmt.h icu/source/i18n/unicode/smpdtfmt.h
+--- icu.org/source/i18n/unicode/smpdtfmt.h	2022-10-19 02:53:21.000000000 +0200
++++ icu/source/i18n/unicode/smpdtfmt.h	2022-10-24 22:20:10.891969181 +0200
+@@ -877,6 +877,7 @@
+      * @stable ICU 2.0
+      */
+     virtual bool operator==(const Format& other) const override;
++    bool operator!=(const SimpleDateFormat& that) const {return !operator==(that);}
+ 
+ 
+     using DateFormat::format;
+diff -ur icu.org/source/i18n/unicode/stsearch.h icu/source/i18n/unicode/stsearch.h
+--- icu.org/source/i18n/unicode/stsearch.h	2022-10-19 02:53:21.000000000 +0200
++++ icu/source/i18n/unicode/stsearch.h	2022-10-24 22:20:10.892969178 +0200
+@@ -298,6 +298,7 @@
+      * @stable ICU 2.0
+      */
+     virtual bool operator==(const SearchIterator &that) const override;
++    bool operator!=(const StringSearch &that) const {return !operator==(that);}
+ 
+     // public get and set methods ----------------------------------------
+ 
+diff -ur icu.org/source/i18n/unicode/tzrule.h icu/source/i18n/unicode/tzrule.h
+--- icu.org/source/i18n/unicode/tzrule.h	2022-10-19 02:53:21.000000000 +0200
++++ icu/source/i18n/unicode/tzrule.h	2022-10-24 22:30:23.298744116 +0200
+@@ -257,6 +257,7 @@
+      * @stable ICU 3.8
+      */
+     virtual bool operator!=(const TimeZoneRule& that) const override;
++    bool operator!=(const InitialTimeZoneRule& that) const {return !operator==(that);}
+ 
+     /**
+      * Returns if this rule represents the same rule and offsets as another.
+@@ -454,6 +455,7 @@
+      * @stable ICU 3.8
+      */
+     virtual bool operator!=(const TimeZoneRule& that) const override;
++    bool operator!=(const AnnualTimeZoneRule& that) const {return !operator==(that);}
+ 
+     /**
+      * Gets the start date/time rule used by this rule.
+@@ -670,6 +672,7 @@
+      * @stable ICU 3.8
+      */
+     virtual bool operator!=(const TimeZoneRule& that) const override;
++    bool operator!=(const TimeArrayTimeZoneRule& that) const {return !operator==(that);}
+ 
+     /**
+      * Gets the time type of the start times used by this rule.  The return value
+diff -ur icu.org/source/i18n/unicode/vtzone.h icu/source/i18n/unicode/vtzone.h
+--- icu.org/source/i18n/unicode/vtzone.h	2022-10-19 02:53:21.000000000 +0200
++++ icu/source/i18n/unicode/vtzone.h	2022-10-24 22:20:10.895969172 +0200
+@@ -83,6 +83,7 @@
+      * @stable ICU 3.8
+      */
+     virtual bool operator!=(const TimeZone& that) const;
++    bool operator!=(const VTimeZone& that) const {return !operator==(that);}
+ 
+     /**
+      * Create a <code>VTimeZone</code> instance by the time zone ID.
diff --git a/external/icu/cross-bin/icu-config b/external/icu/cross-bin/icu-config
new file mode 100755
index 0000000000..8ccf94f9bd
--- /dev/null
+++ b/external/icu/cross-bin/icu-config
@@ -0,0 +1,12 @@
+#!/bin/sh
+# Stub icu-config (in cross-bin, so presumably for cross builds): dummy version, flags from ICU_CFLAGS / ICU_LIBS.
+case $1 in
+--version)
+    echo whatever
+    ;;
+--cppflags)
+    echo ${ICU_CFLAGS}
+    ;;
+--ldflags-searchpath)
+    echo ${ICU_LIBS}
+esac
diff --git a/external/icu/gcc9.patch b/external/icu/gcc9.patch
new file mode 100644
index 0000000000..5c9808f8c3
--- /dev/null
+++ b/external/icu/gcc9.patch
@@ -0,0 +1,26 @@
+--- source/i18n/unicode/format.h
++++ source/i18n/unicode/format.h
+@@ -22,6 +22,13 @@
+ 
+ #ifndef FORMAT_H
+ #define FORMAT_H
++
++#ifdef __GNUC__
++#pragma GCC diagnostic push
++#pragma GCC diagnostic ignored "-Wpragmas" // for old GCC
++#pragma GCC diagnostic ignored "-Wunknown-warning-option" // for Clang
++#pragma GCC diagnostic ignored "-Wdeprecated-copy"
++#endif
+ 
+ 
+ #include "unicode/utypes.h"
+@@ -314,5 +314,9 @@
+ 
+ #endif /* U_SHOW_CPLUSPLUS_API */
+ 
++#ifdef __GNUC__
++#pragma GCC diagnostic pop
++#endif
++
+ #endif // _FORMAT
+ //eof
diff --git a/external/icu/icu4c-android.patch.1 b/external/icu/icu4c-android.patch.1
new file mode 100644
index 0000000000..9ba252b402
--- /dev/null
+++ b/external/icu/icu4c-android.patch.1
@@ -0,0 +1,75 @@
+diff -ur icu.org/source/common/unicode/platform.h icu/source/common/unicode/platform.h
+--- icu.org/source/common/unicode/platform.h	2021-10-28 18:04:57.000000000 +0200
++++ icu/source/common/unicode/platform.h	2021-11-15 21:03:11.474638494 +0100
+@@ -818,7 +818,7 @@
+                             UPRV_HAS_DECLSPEC_ATTRIBUTE(__dllimport__))
+ #   define U_EXPORT __declspec(dllexport)
+ #elif defined(__GNUC__)
+-#   define U_EXPORT __attribute__((visibility("default")))
++#   define U_EXPORT
+ #elif (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x550) \
+    || (defined(__SUNPRO_C) && __SUNPRO_C >= 0x550) 
+ #   define U_EXPORT __global
+diff -ur icu.org/source/config/mh-linux icu/source/config/mh-linux
+--- icu.org/source/config/mh-linux	2021-11-15 20:56:39.460705065 +0100
++++ icu/source/config/mh-linux	2021-11-15 21:03:11.474638494 +0100
+@@ -27,7 +27,7 @@
+ 
+ ## Compiler switch to embed a library name
+ # The initial tab in the next line is to prevent icu-config from reading it.
+-	LD_SONAME = -Wl,-soname -Wl,$(notdir $(MIDDLE_SO_TARGET))
++	#LD_SONAME = -Wl,-soname -Wl,$(notdir $(MIDDLE_SO_TARGET))
+ #SH# # We can't depend on MIDDLE_SO_TARGET being set.
+ #SH# LD_SONAME=
+ 
+diff -ur icu.org/source/configure icu/source/configure
+--- icu.org/source/configure	2021-11-15 20:56:39.875703936 +0100
++++ icu/source/configure	2021-11-15 21:03:11.475638491 +0100
+@@ -5272,7 +5273,7 @@
+ 	else
+ 		icu_cv_host_frag=mh-linux-va
+ 	fi ;;
+-*-*-linux*|*-*-gnu|*-*-k*bsd*-gnu|*-*-kopensolaris*-gnu) icu_cv_host_frag=mh-linux ;;
++*-*-linux*|*-*-gnu|*-*-k*bsd*-gnu|*-*-kopensolaris*-gnu|*-*-*-androideabi*) icu_cv_host_frag=mh-linux ;;
+ i[34567]86-*-cygwin)
+ 	if test "$GCC" = yes; then
+ 		icu_cv_host_frag=mh-cygwin
+@@ -6472,6 +6466,10 @@
+ # Check to see if genccode can generate simple assembly.
+ GENCCODE_ASSEMBLY=
+ case "${host}" in
++arm-*-linux-androideabi)
++    if test "$GCC" = yes; then
++        GENCCODE_ASSEMBLY="-a gcc-android-arm"
++    fi ;;
+ *-linux*|*-kfreebsd*-gnu*|i*86-*-*bsd*|i*86-pc-gnu)
+     if test "$GCC" = yes; then
+         # We're using gcc, and the simple -a gcc command line works for genccode
+@@ -7594,6 +7592,10 @@
+     # wchar_t can be used
+     CHECK_UTF16_STRING_RESULT="available"
+     ;;
++*-*-*-androideabi|mips-unknown-linux-android)
++    # No UTF-16 strings, thanks; I think this is to avoid -std=c++0x, which causes trouble with uint64_t
++    CHECK_UTF16_STRING_RESULT="nope"
++    ;;
+ *)
+     ;;
+ esac
+diff -ur icu.org/source/i18n/decimfmt.cpp icu/source/i18n/decimfmt.cpp
+--- icu.org/source/i18n/decimfmt.cpp	2021-10-28 18:04:57.000000000 +0200
++++ icu/source/i18n/decimfmt.cpp	2021-11-15 21:03:11.476638489 +0100
+@@ -9,6 +9,13 @@
+ // Helpful in toString methods and elsewhere.
+ #define UNISTR_FROM_STRING_EXPLICIT
+ 
++#ifdef __ANDROID__
++#ifndef ARM
++#define ARM
++#endif
++#include <android/compatibility.hxx>
++#endif
++
+ #include <cmath>
+ #include <cstdlib>
+ #include <stdlib.h>
diff --git a/external/icu/icu4c-build.patch.1 b/external/icu/icu4c-build.patch.1
new file mode 100644
index 0000000000..a878de7323
--- /dev/null
+++ b/external/icu/icu4c-build.patch.1
@@ -0,0 +1,91 @@
+diff -ur icu.org/source/config/mh-darwin icu/source/config/mh-darwin
+--- icu.org/source/config/mh-darwin	2016-06-15 20:58:17.000000000 +0200
++++ icu/source/config/mh-darwin	2017-04-21 21:30:23.584568210 +0200
+@@ -30,11 +30,7 @@
+ SHLIB.cc=	$(CXX) -dynamiclib -dynamic $(CXXFLAGS) $(LDFLAGS) $(LD_SOOPTIONS)
+ 
+ ## Compiler switches to embed a library name and version information
+-ifeq ($(ENABLE_RPATH),YES)
+-LD_SONAME = -Wl,-compatibility_version -Wl,$(SO_TARGET_VERSION_MAJOR) -Wl,-current_version -Wl,$(SO_TARGET_VERSION) -install_name $(libdir)/$(notdir $(MIDDLE_SO_TARGET))
+-else
+-LD_SONAME = -Wl,-compatibility_version -Wl,$(SO_TARGET_VERSION_MAJOR) -Wl,-current_version -Wl,$(SO_TARGET_VERSION) -install_name $(notdir $(MIDDLE_SO_TARGET)) $(PKGDATA_TRAILING_SPACE)
+-endif
++LD_SONAME = -Wl,-compatibility_version -Wl,$(SO_TARGET_VERSION_MAJOR) -Wl,-current_version -Wl,$(SO_TARGET_VERSION) -install_name @__________________________________________________URELIB/$(notdir $(MIDDLE_SO_TARGET))
+ 
+ ## Compiler switch to embed a runtime search path
+ LD_RPATH=
+@@ -50,10 +46,6 @@
+ ## Non-shared intermediate object suffix
+ STATIC_O = ao
+ 
+-## Override Versioned target for a shared library.
+-FINAL_SO_TARGET=  $(basename $(SO_TARGET)).$(SO_TARGET_VERSION).$(SO)
+-MIDDLE_SO_TARGET= $(basename $(SO_TARGET)).$(SO_TARGET_VERSION_MAJOR).$(SO)
+-
+ ## Compilation and dependency rules
+ %.$(STATIC_O): $(srcdir)/%.c
+ 	$(call SILENT_COMPILE,$(strip $(COMPILE.c) $(STATICCPPFLAGS) $(STATICCFLAGS)) -MMD -MT "$*.d $*.o $*.$(STATIC_O)" -o $@ $<)
+@@ -67,16 +59,10 @@
+ 
+ ## Versioned libraries rules
+ 
+-%.$(SO_TARGET_VERSION_MAJOR).$(SO): %.$(SO_TARGET_VERSION).$(SO)
++%.$(SO).$(SO_TARGET_VERSION_MAJOR): %.$(SO).$(SO_TARGET_VERSION)
+ 	$(RM) $@ && ln -s ${<F} $@
+-%.$(SO): %.$(SO_TARGET_VERSION_MAJOR).$(SO)
+-	$(RM) $@ && ln -s ${*F}.$(SO_TARGET_VERSION).$(SO) $@
+-
+-# tzcode option
+-TZORIG_EXTRA_CFLAGS=-DSTD_INSPIRED
+-
+-# genren opts
+-GENREN_PL_OPTS=-x Mach-O -n '-g' -p '| c++filt'
++%.$(SO): %.$(SO).$(SO_TARGET_VERSION_MAJOR)
++	$(RM) $@ && ln -s ${*F}.$(SO).$(SO_TARGET_VERSION) $@
+ 
+ ## Remove shared library 's'
+ STATIC_PREFIX_WHEN_USED = 
+diff -ur icu.org/source/tools/toolutil/pkg_genc.cpp icu/source/tools/toolutil/pkg_genc.cpp
+--- icu.org/source/tools/toolutil/pkg_genc.cpp	2017-04-13 11:46:02.000000000 +0200
++++ icu/source/tools/toolutil/pkg_genc.cpp	2017-04-21 21:30:23.583568212 +0200
+@@ -160,6 +160,28 @@
+ 
+         ".long ","",HEX_0X
+     },
++    {"gcc-android-arm",
++        "\t.arch armv5te\n"
++        "\t.fpu softvfp\n"
++        "\t.eabi_attribute 20, 1\n"
++        "\t.eabi_attribute 21, 1\n"
++        "\t.eabi_attribute 23, 3\n"
++        "\t.eabi_attribute 24, 1\n"
++        "\t.eabi_attribute 25, 1\n"
++        "\t.eabi_attribute 26, 2\n"
++        "\t.eabi_attribute 30, 6\n"
++        "\t.eabi_attribute 18, 4\n"
++        "\t.file \"%s.s\"\n"
++        "\t.global %s\n"
++        "\t.section .rodata\n"
++        "\t.align 2\n"
++        "\t.type %s, %%object\n"
++        "%s:\n",
++
++        "\t.word ",
++        "\t.section .note.GNU-stack,\"\",%%progbits\n",
++        HEX_0X
++    },
+ /* 16 bytes alignment. */
+ /* http://docs.oracle.com/cd/E19641-01/802-1947/802-1947.pdf */
+     {"sun",
+diff -ur icu.org/source/tools/toolutil/pkg_genc.h icu/source/tools/toolutil/pkg_genc.h
+--- icu.org/source/tools/toolutil/pkg_genc.h	2017-01-20 01:20:31.000000000 +0100
++++ icu/source/tools/toolutil/pkg_genc.h	2017-04-21 21:30:23.582568215 +0200
+@@ -60,7 +60,7 @@
+ #endif
+ 
+ #define LARGE_BUFFER_MAX_SIZE 2048
+-#define SMALL_BUFFER_MAX_SIZE 512
++#define SMALL_BUFFER_MAX_SIZE 2048
+ #define SMALL_BUFFER_FLAG_NAMES 32
+ #define BUFFER_PADDING_SIZE 20
+ 
diff --git a/external/icu/icu4c-clang-cl.patch.1 b/external/icu/icu4c-clang-cl.patch.1
new file mode 100644
index 0000000000..a111a0df99
--- /dev/null
+++ b/external/icu/icu4c-clang-cl.patch.1
@@ -0,0 +1,28 @@
+diff -ur icu.org/source/config/mh-cygwin-msvc icu/source/config/mh-cygwin-msvc
+--- icu.org/source/config/mh-cygwin-msvc	2017-01-23 01:38:28.000000000 +0100
++++ icu/source/config/mh-cygwin-msvc	2017-04-21 23:07:28.482892025 +0200
+@@ -55,8 +55,8 @@
+ LDFLAGS+=-nologo
+ 
+ # Commands to compile
+-COMPILE.c=	$(CC) $(CPPFLAGS) $(DEFS) $(CFLAGS) -c
+-COMPILE.cc=	$(CXX) $(CPPFLAGS) $(DEFS) $(CXXFLAGS) -c
++COMPILE.c=	true && $(CC) $(CPPFLAGS) $(DEFS) $(CFLAGS) -c
++COMPILE.cc=	true && $(CXX) $(CPPFLAGS) $(DEFS) $(CXXFLAGS) -c
+ 
+ # Commands to link
+ LINK.c=		LINK.EXE -subsystem:console $(LDFLAGS)
+diff -ur icu.org/source/runConfigureICU icu/source/runConfigureICU
+--- icu.org/source/runConfigureICU	2017-01-23 01:38:28.000000000 +0100
++++ icu/source/runConfigureICU	2017-04-21 23:07:28.482892025 +0200
+@@ -261,8 +261,8 @@
+     Cygwin/MSVC)
+         THE_OS="Windows with Cygwin"
+         THE_COMP="Microsoft Visual C++"
+-        CC=cl; export CC
+-        CXX=cl; export CXX
++        CC=${CC-cl}; export CC
++        CXX=${CXX-cl}; export CXX
+         RELEASE_CFLAGS='-Gy -MD'
+         RELEASE_CXXFLAGS='-Gy -MD'
+         DEBUG_CFLAGS='-FS -Zi -MDd'
diff --git a/external/icu/icu4c-emscripten-cross.patch.1 b/external/icu/icu4c-emscripten-cross.patch.1
new file mode 100644
index 0000000000..84c88a68a8
--- /dev/null
+++ b/external/icu/icu4c-emscripten-cross.patch.1
@@ -0,0 +1,99 @@
+--- icu/source/acinclude.m4.orig	2020-04-22 22:04:20.000000000 +0200
++++ icu/source/acinclude.m4	2020-11-04 06:10:29.993070072 +0100
+@@ -84,6 +84,7 @@
+ *-dec-osf*) icu_cv_host_frag=mh-alpha-osf ;;
+ *-*-nto*)	icu_cv_host_frag=mh-qnx ;;
+ *-ncr-*)	icu_cv_host_frag=mh-mpras ;;
++wasm*-*-emscripten*)	icu_cv_host_frag=mh-emscripten ;;
+ *) 		icu_cv_host_frag=mh-unknown ;;
+ esac
+ 		]
+--- /dev/null
++++ icu/source/config/mh-emscripten	2015-10-06 12:01:00.497972406 +0200
+@@ -0,0 +1,86 @@
++## Emscripten-specific setup
++## Copyright (c) 1999-2013, International Business Machines Corporation and
++## others. All Rights Reserved.
++## Commands to generate dependency files
++GEN_DEPS.c=  $(CC) -E -MM $(DEFS) $(CPPFLAGS)
++GEN_DEPS.cc= $(CXX) -E -MM $(DEFS) $(CPPFLAGS) $(CXXFLAGS)
++ 
++## Flags for position independent code
++SHAREDLIBCFLAGS = -fPIC
++SHAREDLIBCXXFLAGS = -fPIC
++SHAREDLIBCPPFLAGS = -DPIC
++
++## Additional flags when building libraries and with threads
++THREADSCPPFLAGS = -D_REENTRANT
++LIBCPPFLAGS =
++
++## Compiler switch to embed a runtime search path
++LD_RPATH= -Wl,-zorigin,-rpath,'$$'ORIGIN
++LD_RPATH_PRE = -Wl,-rpath,
++
++## Force RPATH=$ORIGIN to locate own dependencies w/o need for LD_LIBRARY_PATH:
++ENABLE_RPATH=YES
++RPATHLDFLAGS=${LD_RPATH_PRE}'$$ORIGIN'
++
++## These are the library specific LDFLAGS
++#LDFLAGSICUDT=-nodefaultlibs -nostdlib
++# Debian change: linking icudata as data only causes too many problems.
++LDFLAGSICUDT=
++
++## Compiler switch to embed a library name
++# The initial tab in the next line is to prevent icu-config from reading it.
++	LD_SONAME = -Wl,-soname -Wl,$(notdir $(MIDDLE_SO_TARGET))
++#SH# # We can't depend on MIDDLE_SO_TARGET being set.
++#SH# LD_SONAME=
++
++## Shared library options
++LD_SOOPTIONS= -Wl,-Bsymbolic-functions
++
++## Shared object suffix
++SO = so
++## Non-shared intermediate object suffix
++STATIC_O = o
++
++## Compilation rules
++# WASM needs -pthread for atomics support
++%.$(STATIC_O): $(srcdir)/%.c
++	$(call SILENT_COMPILE,$(strip $(COMPILE.c) $(STATICCPPFLAGS) $(STATICCFLAGS)) -pthread -o $@ $<)
++
++%.$(STATIC_O): $(srcdir)/%.cpp
++	$(call SILENT_COMPILE,$(strip $(COMPILE.cc) $(STATICCPPFLAGS) $(STATICCXXFLAGS)) -pthread -o $@ $<)
++
++
++## Dependency rules
++%.d: $(srcdir)/%.c
++	$(call ICU_MSG,(deps)) $<
++	@$(SHELL) -ec '$(GEN_DEPS.c) $< \
++		| sed '\''s%\($*\)\.o[ :]*%\1.o $@ : %g'\'' > $@; \
++		[ -s $@ ] || rm -f $@'
++
++%.d: $(srcdir)/%.cpp
++	$(call ICU_MSG,(deps)) $<
++	@$(SHELL) -ec '$(GEN_DEPS.cc) $< \
++		| sed '\''s%\($*\)\.o[ :]*%\1.o $@ : %g'\'' > $@; \
++		[ -s $@ ] || rm -f $@'
++
++## Versioned libraries rules
++
++%.$(SO).$(SO_TARGET_VERSION_MAJOR): %.$(SO).$(SO_TARGET_VERSION)
++	$(RM) $@ && ln -s ${<F} $@
++%.$(SO): %.$(SO).$(SO_TARGET_VERSION_MAJOR)
++	$(RM) $@ && ln -s ${*F}.$(SO).$(SO_TARGET_VERSION) $@
++
++##  Bind internal references
++
++# LDflags that pkgdata will use
++BIR_LDFLAGS= -Wl,-Bsymbolic
++
++# Dependencies [i.e. map files] for the final library
++BIR_DEPS=
++
++## Remove shared library 's'
++STATIC_PREFIX_WHEN_USED =
++STATIC_PREFIX =
++
++## without assembly
++PKGDATA_OPTS = -O $(top_builddir)/data/icupkg.inc -w
diff --git a/external/icu/icu4c-icudata-stdlibs.patch.1 b/external/icu/icu4c-icudata-stdlibs.patch.1
new file mode 100644
index 0000000000..c8d66c6ed0
--- /dev/null
+++ b/external/icu/icu4c-icudata-stdlibs.patch.1
@@ -0,0 +1,14 @@
+diff -ur icu.org/source/config/mh-linux icu/source/config/mh-linux
+--- icu.org/source/config/mh-linux	2017-04-21 23:09:57.588533707 +0200
++++ icu/source/config/mh-linux	2017-04-21 23:11:38.075292226 +0200
+@@ -27,7 +27,9 @@
+ RPATHLDFLAGS=${LD_RPATH_PRE}'$$ORIGIN'
+ 
+ ## These are the library specific LDFLAGS
+-LDFLAGSICUDT=-nodefaultlibs -nostdlib
++#LDFLAGSICUDT=-nodefaultlibs -nostdlib
++# Debian change: linking icudata as data only causes too many problems.
++LDFLAGSICUDT=
+ 
+ ## Compiler switch to embed a library name
+ # The initial tab in the next line is to prevent icu-config from reading it.
diff --git a/external/icu/icu4c-khmerbreakengine.patch.1 b/external/icu/icu4c-khmerbreakengine.patch.1
new file mode 100644
index 0000000000..605914014e
--- /dev/null
+++ b/external/icu/icu4c-khmerbreakengine.patch.1
@@ -0,0 +1,837 @@
+diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp
+--- icu.org/source/common/dictbe.cpp	2023-06-14 06:23:55.000000000 +0900
++++ icu/source/common/dictbe.cpp	2023-06-26 17:43:53.034173100 +0900
+@@ -35,7 +35,19 @@
+  ******************************************************************
+  */
+ 
+-DictionaryBreakEngine::DictionaryBreakEngine() {
++DictionaryBreakEngine::DictionaryBreakEngine()  // default: fTypes and clusterLimit are zero, the scan sets stay empty
++    : fTypes(0), clusterLimit(0) {
++}
++
++DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes)  // stores breakTypes and fills the virama/skip/no-break-before sets
++    : fTypes(breakTypes), clusterLimit(3) {
++    UErrorCode status = U_ZERO_ERROR;  // NOTE(review): local status is never inspected after the applyPattern() calls below
++    fViramaSet.applyPattern(UnicodeString(u"[[:ccc=VR:]]"), status);
++
++    // Note: both skip sets also list U+200C, U+200D and U+2060 explicitly.
++    fSkipStartSet.applyPattern(UnicodeString(u"[[:lb=OP:][:lb=QU:]\\u200C\\u200D\\u2060]"), status);
++    fSkipEndSet.applyPattern(UnicodeString(u"[[:lb=CP:][:lb=QU:][:lb=EX:][:lb=CL:]\\u200C\\u200D\\u2060]"), status);
++    fNBeforeSet.applyPattern(UnicodeString(u"[[:lb=CR:][:lb=LF:][:lb=NL:][:lb=SP:][:lb=ZW:][:lb=IS:][:lb=BA:][:lb=NS:]]"), status);
+ }
+ 
+ DictionaryBreakEngine::~DictionaryBreakEngine() {
+@@ -85,6 +97,169 @@
+     fSet.compact();
+ }
+ 
++bool  // Steps backwards from 'start' over fSkipStartSet characters; returns true iff the first non-skipped character is ZWSP.
++DictionaryBreakEngine::scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const {  // start: in/out index; doBreak: out
++    UErrorCode status = U_ZERO_ERROR;  // NOTE(review): never checked; a failed clone is not handled
++    UText* ut = utext_clone(NULL, text, false, true, &status);  // shallow, read-only clone; 'text' itself is not touched here
++    utext_setNativeIndex(ut, start);
++    UChar32 c = utext_current32(ut);
++    bool res = false;
++    doBreak = true;  // stays true only if the character at the initial 'start' is not skippable
++    while (start >= 0) {
++        if (!fSkipStartSet.contains(c)) {
++            res = (c == ZWSP);  // stopped on a non-skippable character: report whether it is ZWSP
++            break;
++        }
++        --start;
++        c = utext_previous32(ut);  // fetch the character before the current one
++        doBreak = false;  // at least one character was skipped
++    }
++    utext_close(ut);
++    return res;  // false also when the loop ran past index 0 without finding a non-skippable character
++}
++
++bool  // Steps forwards from 'end' over fSkipEndSet characters (up to textEnd); returns true iff the first non-skipped character is ZWSP.
++DictionaryBreakEngine::scanAfterEnd(UText *text, int32_t textEnd, int32_t& end, bool &doBreak) const {  // end: in/out index; doBreak: out
++    UErrorCode status = U_ZERO_ERROR;  // NOTE(review): never checked; a failed clone is not handled
++    UText* ut = utext_clone(NULL, text, false, true, &status);  // shallow, read-only clone; 'text' itself is not touched here
++    utext_setNativeIndex(ut, end);
++    UChar32 c = utext_current32(ut);
++    bool res = false;
++    doBreak = !fNBeforeSet.contains(c);  // initial answer: a break is allowed unless the character at 'end' is in fNBeforeSet
++    while (end < textEnd) {
++        if (!fSkipEndSet.contains(c)) {
++            res = (c == ZWSP);  // stopped on a non-skippable character: report whether it is ZWSP
++            break;
++        }
++        ++end;
++        c = utext_next32(ut);  // NOTE(review): utext_next32 returns the character at the current index before advancing - confirm c tracks 'end'
++        doBreak = false;  // at least one character was skipped
++    }
++    utext_close(ut);
++    return res;  // false also when textEnd was reached without finding a non-skippable character
++}
++
++void  // Scans backwards from the current position of 'text' over up to clusterLimit clusters, not before textStart; result index goes to 'start'.
++DictionaryBreakEngine::scanBackClusters(UText *text, int32_t textStart, int32_t& start) const {  // moves the iterator of 'text' itself
++    UChar32 c = 0;
++    start = utext_getNativeIndex(text);
++    while (start > textStart) {  // first step back over trailing fSkipEndSet characters
++        c = utext_previous32(text);
++        --start;
++        if (!fSkipEndSet.contains(c))
++            break;
++    }
++    for (int i = 0; i < clusterLimit; ++i) { // scan backwards clusterLimit clusters
++        while (start > textStart) {
++            while (fIgnoreSet.contains(c))  // NOTE(review): iterator moves here but 'start' is not decremented - confirm intended
++                c = utext_previous32(text);
++            if (!fMarkSet.contains(c)) {
++                if (fBaseSet.contains(c)) {
++                    c = utext_previous32(text);
++                    if (!fViramaSet.contains(c)) { // no virama before this base: undo the look-behind and stop at the base
++                        utext_next32(text);
++                        c = utext_current32(text);
++                        break;
++                    } else {  // Virama (e.g. coeng) preceding base. Treat sequence as a mark
++                        --start;
++                    }
++                } else {
++                    break;  // neither mark nor base: stop scanning this cluster
++                }
++            }
++            c = utext_previous32(text);
++            --start;
++        }
++        if (!fBaseSet.contains(c) || start < textStart) {  // not a cluster start so finish
++            break;
++        }
++        c = utext_previous32(text);
++        --start;        // go round again
++    }                   // ignore hitting previous inhibitor since scanning for it should have found us!
++    ++start;            // counteract --before
++}
++
++void  // Scans forwards from the current position of 'text' over up to clusterLimit clusters, not past textEnd; result index goes to 'end'.
++DictionaryBreakEngine::scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const {  // moves the iterator of 'text' itself
++    UChar32 c = utext_current32(text);
++    end = utext_getNativeIndex(text);
++    while (end < textEnd) {  // first step over leading fSkipStartSet characters
++        if (!fSkipStartSet.contains(c))
++            break;
++        utext_next32(text);
++        c = utext_current32(text);
++        ++end;
++    }
++    for (int i = 0; i < clusterLimit; ++i) { // scan forwards clusterLimit clusters
++        while (fIgnoreSet.contains(c)) {  // NOTE(review): iterator advances here but 'end' is not incremented and textEnd is not checked - confirm intended
++            utext_next32(text);
++            c = utext_current32(text);
++        }
++        if (fBaseSet.contains(c)) {
++            while (end < textEnd) {  // consume the marks that follow the base
++                utext_next32(text);
++                c = utext_current32(text);
++                ++end;
++                if (!fMarkSet.contains(c))
++                    break;
++                else if (fViramaSet.contains(c)) {  // handle coeng + base as mark
++                    utext_next32(text);
++                    c = utext_current32(text);
++                    ++end;
++                    if (!fBaseSet.contains(c))
++                        break;
++                }
++            }
++        } else {
++            --end;    // bad char so break after char before it
++            break;
++        }
++    }
++}
++
++bool  // Looks for ZWSP/WJ in [start, end); on a hit fills 'before'/'after' via the cluster scans, moves 'start' past the last hit and returns true.
++DictionaryBreakEngine::scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const {
++    UErrorCode status = U_ZERO_ERROR;  // NOTE(review): never checked; a failed clone is not handled
++    UText* ut = utext_clone(NULL, text, false, true, &status);  // shallow, read-only clone; 'text' itself is not touched here
++    int32_t nat = start;  // native index this loop believes it is at
++    utext_setNativeIndex(ut, nat);
++    bool foundFirst = true;  // true while no inhibitor has been found yet
++    int32_t curr = start;  // index just past the most recent inhibitor
++    while (nat < end) {
++        UChar32 c = utext_current32(ut);
++        if (c == ZWSP || c == WJ) {
++            curr = nat + 1;
++            if (foundFirst)     // only scan backwards for first inhibitor
++                scanBackClusters(ut, start, before);
++            foundFirst = false; // don't scan backwards if we go around again. Also marks found something
++
++            utext_next32(ut);  // NOTE(review): advances from wherever scanBackClusters left ut, not necessarily from nat - confirm
++            scanFwdClusters(ut, end, after);
++            nat = after + 1;  // NOTE(review): ut is left where scanFwdClusters stopped; confirm nat and ut stay in step
++
++            if (c == ZWSP || c == WJ) {  // did we hit another one?  NOTE(review): c is not re-read, so this is always true here
++                continue;
++            } else {
++                break;  // unreachable while c is not refreshed after the forward scan
++            }
++        }
++
++        ++nat;                  // keep hunting
++        utext_next32(ut);
++    }
++
++    utext_close(ut);
++
++    if (nat >= end && foundFirst) {
++        start = before = after = nat;  // nothing found: collapse all three outputs onto the stop index
++        return false;           // failed to find anything
++    }
++    else {
++        start = curr;  // resume point for the caller: just past the last inhibitor seen
++    }
++    return true;                // yup hit one
++}
++
+ /*
+  ******************************************************************
+  * PossibleWord
+@@ -114,7 +289,7 @@
+     ~PossibleWord() {}
+   
+     // Fill the list of candidates if needed, select the longest, and return the number found
+-    int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
++    int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet = NULL, int32_t minLength = 0 );
+   
+     // Select the currently marked candidate, point after it in the text, and invalidate self
+     int32_t   acceptMarked( UText *text );
+@@ -135,12 +310,12 @@
+ };
+ 
+ 
+-int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
++int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet, int32_t minLength) {
+     // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
+     int32_t start = (int32_t)utext_getNativeIndex(text);
+     if (start != offset) {
+         offset = start;
+-        count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, nullptr, &prefix);
++        count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, nullptr, &prefix, ignoreSet, minLength);
+         // Dictionary leaves text after longest prefix, not longest word. Back up.
+         if (count <= 0) {
+             utext_setNativeIndex(text, start);
+@@ -814,53 +989,30 @@
+  * KhmerBreakEngine
+  */
+ 
+-// How many words in a row are "good enough"?
+-static const int32_t KHMER_LOOKAHEAD = 3;
+-
+-// Will not combine a non-word with a preceding dictionary word longer than this
+-static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
+-
+-// Will not combine a non-word that shares at least this much prefix with a
+-// dictionary word, with a preceding word
+-static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
+-
+-// Minimum word size
+-static const int32_t KHMER_MIN_WORD = 2;
+-
+-// Minimum number of characters for two words
+-static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
+-
+ KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
+-    : DictionaryBreakEngine(),
++    : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
+       fDictionary(adoptDictionary)
+ {
+     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
+     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
+-    UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status);
++
++    clusterLimit = 3;
++
++    UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]\\u2060\\u200C\\u200D]"), status);
+     if (U_SUCCESS(status)) {
+         setCharacters(khmerWordSet);
+     }
+     fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
+-    fMarkSet.add(0x0020);
+-    fEndWordSet = khmerWordSet;
+-    fBeginWordSet.add(0x1780, 0x17B3);
+-    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
+-    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
+-    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
+-    fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
+-    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
+-//    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
+-//    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
+-//    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
+-//    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
+-//    fSuffixSet.add(THAI_PAIYANNOI);
+-//    fSuffixSet.add(THAI_MAIYAMOK);
++    fIgnoreSet.add(0x2060);         // WJ
++    fIgnoreSet.add(0x200C, 0x200D); // ZWNJ, ZWJ
++    fBaseSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:lb=SA:]&[:^M:]]"), status);
++    fPuncSet.applyPattern(UnicodeString(u"[\\u17D4\\u17D5\\u17D6\\u17D7\\u17D9:]"), status);
+ 
+     // Compact for caching.
+     fMarkSet.compact();
+-    fEndWordSet.compact();
+-    fBeginWordSet.compact();
+-//    fSuffixSet.compact();
++    fIgnoreSet.compact();
++    fBaseSet.compact();
++    fPuncSet.compact();
+     UTRACE_EXIT_STATUS(status);
+ }
+ 
+@@ -876,175 +1028,205 @@
+                                                 UBool /* isPhraseBreaking */,
+                                                 UErrorCode& status ) const {
+     if (U_FAILURE(status)) return 0;
+-    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
+-        return 0;       // Not enough characters for two words
++    uint32_t wordsFound = foundBreaks.size();
++    int32_t before = 0;
++    int32_t after = 0;
++    int32_t finalBefore = 0;
++    int32_t initAfter = 0;
++    int32_t scanStart = rangeStart;
++    int32_t scanEnd = rangeEnd;
++
++    bool startZwsp = false;
++    bool breakStart = false;
++    bool breakEnd = false;
++
++    if (rangeStart > 0) {
++        --scanStart;
++        startZwsp = scanBeforeStart(text, scanStart, breakStart);
+     }
+ 
+-    uint32_t wordsFound = 0;
+-    int32_t cpWordLength = 0;
+-    int32_t cuWordLength = 0;
+-    int32_t current;
+-    PossibleWord words[KHMER_LOOKAHEAD];
+-
+     utext_setNativeIndex(text, rangeStart);
++    scanFwdClusters(text, rangeEnd, initAfter);
++    bool endZwsp = scanAfterEnd(text, utext_nativeLength(text), scanEnd, breakEnd);
++    utext_setNativeIndex(text, rangeEnd - 1);
++    scanBackClusters(text, rangeStart, finalBefore);
++    if (finalBefore < initAfter) {   // the whole run is tented so no breaks
++        if (breakStart || fTypes < UBRK_LINE)
++            foundBreaks.push(rangeStart, status);
++        if (breakEnd || fTypes < UBRK_LINE)
++            foundBreaks.push(rangeEnd, status);
++        return foundBreaks.size() - wordsFound;
++    }
+ 
+-    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
+-        cuWordLength = 0;
+-        cpWordLength = 0;
+-
+-        // Look for candidate words at the current position
+-        int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
+-
+-        // If we found exactly one, use that
+-        if (candidates == 1) {
+-            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
+-            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
+-            wordsFound += 1;
+-        }
++    scanStart = rangeStart;
++    scanWJ(text, scanStart, rangeEnd, before, after);
++    if (startZwsp || initAfter >= before) {
++        after = initAfter;
++        before = 0;
++    }
++    if (!endZwsp && after > finalBefore && after < rangeEnd)
++        endZwsp = true;
++    if (endZwsp && before > finalBefore)
++        before = finalBefore;
+ 
+-        // If there was more than one, see which one can take us forward the most words
+-        else if (candidates > 1) {
+-            // If we're already at the end of the range, we're done
+-            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
+-                goto foundBest;
+-            }
+-            do {
+-                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
+-                    // Followed by another dictionary word; mark first word as a good candidate
+-                    words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
++    utext_setNativeIndex(text, rangeStart);
++    int32_t numCodePts = rangeEnd - rangeStart;
++    // bestSnlp[i] is the snlp of the best segmentation of the first i
++    // code points in the range to be matched.
++    UVector32 bestSnlp(numCodePts + 1, status);
++    bestSnlp.addElement(0, status);
++    for(int32_t i = 1; i <= numCodePts; i++) {
++        bestSnlp.addElement(kuint32max, status);
++    }
+ 
+-                    // If we're already at the end of the range, we're done
+-                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
+-                        goto foundBest;
+-                    }
++    // prev[i] is the index of the last code point in the previous word in
++    // the best segmentation of the first i characters. A negative value
++    // means that the code point is part of an unknown word.
++    UVector32 prev(numCodePts + 1, status);
++    for(int32_t i = 0; i <= numCodePts; i++) {
++        prev.addElement(kuint32max, status);
++    }
+ 
+-                    // See if any of the possible second words is followed by a third word
+-                    do {
+-                        // If we find a third word, stop right away
+-                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
+-                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
+-                            goto foundBest;
+-                        }
+-                    }
+-                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
+-                }
++    const int32_t maxWordSize = 20;
++    UVector32 values(maxWordSize, status);
++    values.setSize(maxWordSize);
++    UVector32 lengths(maxWordSize, status);
++    lengths.setSize(maxWordSize);
++
++    // Dynamic programming to find the best segmentation.
++
++    // In outer loop, i  is the code point index,
++    //                ix is the corresponding string (code unit) index.
++    //    They differ when the string contains supplementary characters.
++    int32_t ix = rangeStart;
++    for (int32_t i = 0;  i < numCodePts;  ++i, utext_setNativeIndex(text, ++ix)) {
++        if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
++            continue;
++        }
++
++        int32_t count;
++        count = fDictionary->matches(text, numCodePts - i, maxWordSize,
++                             NULL, lengths.getBuffer(), values.getBuffer(), NULL, &fIgnoreSet, 2);
++                             // Note: lengths is filled with code point lengths
++                             //       The NULL parameter is the ignored code unit lengths.
++
++        for (int32_t j = 0; j < count; j++) {
++            int32_t ln = lengths.elementAti(j);
++            if (ln + i >= numCodePts)
++                continue;
++            utext_setNativeIndex(text, ln+ix);
++            int32_t c = utext_current32(text);
++            if (fMarkSet.contains(c) || c == 0x17D2) { // Coeng
++                lengths.removeElementAt(j);
++                values.removeElementAt(j);
++                --j;
++                --count;
+             }
+-            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
+-foundBest:
+-            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
+-            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
+-            wordsFound += 1;
+         }
+-
+-        // We come here after having either found a word or not. We look ahead to the
+-        // next word. If it's not a dictionary word, we will combine it with the word we
+-        // just found (if there is one), but only if the preceding word does not exceed
+-        // the threshold.
+-        // The text iterator should now be positioned at the end of the word we found.
+-        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
+-            // if it is a dictionary word, do nothing. If it isn't, then if there is
+-            // no preceding word, or the non-word shares less than the minimum threshold
+-            // of characters with a dictionary word, then scan to resynchronize
+-            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
+-                  && (cuWordLength == 0
+-                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
+-                // Look for a plausible word boundary
+-                int32_t remaining = rangeEnd - (current+cuWordLength);
+-                UChar32 pc;
+-                UChar32 uc;
+-                int32_t chars = 0;
+-                for (;;) {
+-                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
+-                    pc = utext_next32(text);
+-                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
+-                    chars += pcSize;
+-                    remaining -= pcSize;
+-                    if (remaining <= 0) {
++        if (count == 0) {
++            utext_setNativeIndex(text, ix);
++            int32_t c = utext_current32(text);
++            if (fPuncSet.contains(c) || fIgnoreSet.contains(c) || c == ZWSP) {
++                values.setElementAt(0, count);
++                lengths.setElementAt(1, count++);
++            } else if (fBaseSet.contains(c)) {
++                int32_t currix = utext_getNativeIndex(text);
++                do {
++                    utext_next32(text);
++                    c = utext_current32(text);
++                    if (utext_getNativeIndex(text) >= rangeEnd)
+                         break;
+-                    }
+-                    uc = utext_current32(text);
+-                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
+-                        // Maybe. See if it's in the dictionary.
+-                        int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
+-                        utext_setNativeIndex(text, current+cuWordLength+chars);
+-                        if (num_candidates > 0) {
++                    if (c == 0x17D2) { // Coeng
++                        utext_next32(text);
++                        c = utext_current32(text);
++                        if (!fBaseSet.contains(c) || utext_getNativeIndex(text) >= rangeEnd) {
+                             break;
++                        } else {
++                            utext_next32(text);
++                            c = utext_current32(text);
++                            if (utext_getNativeIndex(text) >= rangeEnd)
++                                break;
+                         }
+                     }
+-                }
+-
+-                // Bump the word count if there wasn't already one
+-                if (cuWordLength <= 0) {
+-                    wordsFound += 1;
+-                }
++                } while (fMarkSet.contains(c) || fIgnoreSet.contains(c));
++                values.setElementAt(BADSNLP, count);
++                lengths.setElementAt(utext_getNativeIndex(text) - currix, count++);
++            } else {
++                values.setElementAt(BADSNLP, count);
++                lengths.setElementAt(1, count++);
++            }
++        }
+ 
+-                // Update the length with the passed-over characters
+-                cuWordLength += chars;
++        for (int32_t j = 0; j < count; j++) {
++            uint32_t v = values.elementAti(j);
++            int32_t newSnlp = bestSnlp.elementAti(i) + v;
++            int32_t ln = lengths.elementAti(j);
++            utext_setNativeIndex(text, ln+ix);
++            int32_t c = utext_current32(text);
++            while ((fPuncSet.contains(c) || fIgnoreSet.contains(c)) && ln + i < numCodePts) {
++                ++ln;
++                utext_next32(text);
++                c = utext_current32(text);
+             }
+-            else {
+-                // Back up to where we were for next iteration
+-                utext_setNativeIndex(text, current+cuWordLength);
++            int32_t ln_j_i = ln + i;   // yes really i!
++            if (newSnlp < bestSnlp.elementAti(ln_j_i)) {
++                if (v == BADSNLP) {
++                    int32_t p = prev.elementAti(i);
++                    if (p < 0)
++                        prev.setElementAt(p, ln_j_i);
++                    else
++                        prev.setElementAt(-i, ln_j_i);
++                }
++                else
++                    prev.setElementAt(i, ln_j_i);
++                bestSnlp.setElementAt(newSnlp, ln_j_i);
+             }
+         }
+-
+-        // Never stop before a combining mark.
+-        int32_t currPos;
+-        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
+-            utext_next32(text);
+-            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
++    }
++    // Start pushing the optimal offset index into t_boundary (t for tentative).
++    // prev[numCodePts] is guaranteed to be meaningful.
++    // We'll first push in the reverse order, i.e.,
++    // t_boundary[0] = numCodePts, and afterwards do a swap.
++    UVector32 t_boundary(numCodePts+1, status);
++
++    int32_t numBreaks = 0;
++    // No segmentation found, set boundary to end of range
++    while (numCodePts >= 0 && (uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
++        --numCodePts;
++    }
++    if (numCodePts < 0) {
++        t_boundary.addElement(numCodePts, status);
++        numBreaks++;
++    } else {
++        for (int32_t i = numCodePts; (uint32_t)i != kuint32max; i = prev.elementAti(i)) {
++            if (i < 0) i = -i;
++            t_boundary.addElement(i, status);
++            numBreaks++;
+         }
++        // U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
++    }
+ 
+-        // Look ahead for possible suffixes if a dictionary word does not follow.
+-        // We do this in code rather than using a rule so that the heuristic
+-        // resynch continues to function. For example, one of the suffix characters
+-        // could be a typo in the middle of a word.
+-//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
+-//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
+-//                && fSuffixSet.contains(uc = utext_current32(text))) {
+-//                if (uc == KHMER_PAIYANNOI) {
+-//                    if (!fSuffixSet.contains(utext_previous32(text))) {
+-//                        // Skip over previous end and PAIYANNOI
+-//                        utext_next32(text);
+-//                        utext_next32(text);
+-//                        wordLength += 1;            // Add PAIYANNOI to word
+-//                        uc = utext_current32(text);     // Fetch next character
+-//                    }
+-//                    else {
+-//                        // Restore prior position
+-//                        utext_next32(text);
+-//                    }
+-//                }
+-//                if (uc == KHMER_MAIYAMOK) {
+-//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
+-//                        // Skip over previous end and MAIYAMOK
+-//                        utext_next32(text);
+-//                        utext_next32(text);
+-//                        wordLength += 1;            // Add MAIYAMOK to word
+-//                    }
+-//                    else {
+-//                        // Restore prior position
+-//                        utext_next32(text);
+-//                    }
+-//                }
+-//            }
+-//            else {
+-//                utext_setNativeIndex(text, current+wordLength);
+-//            }
+-//        }
+-
+-        // Did we find a word on this iteration? If so, push it on the break stack
+-        if (cuWordLength > 0) {
+-            foundBreaks.push((current+cuWordLength), status);
++    // Now that we're done, convert positions in t_boundary[] (indices in
++    // the normalized input string) back to indices in the original input UText
++    // while reversing t_boundary and pushing values to foundBreaks.
++    for (int32_t i = numBreaks-1; i >= 0; i--) {
++        int32_t cpPos = t_boundary.elementAti(i);
++        if (cpPos == 0 && !breakStart && fTypes >= UBRK_LINE) continue;
++        int32_t utextPos = cpPos + rangeStart;
++        while (utextPos > after && scanWJ(text, utextPos, scanEnd, before, after));
++        if (utextPos < before) {
++        // Boundaries are added to foundBreaks output in ascending order.
++            U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos);
++            foundBreaks.push(utextPos, status);
+         }
+     }
+-    
++
+     // Don't return a break for the end of the dictionary range if there is one there.
+-    if (foundBreaks.peeki() >= rangeEnd) {
++    if (!breakEnd && fTypes >= UBRK_LINE && foundBreaks.peeki() >= rangeEnd) {
+         (void) foundBreaks.popi();
+-        wordsFound -= 1;
+     }
+ 
+-    return wordsFound;
++    return foundBreaks.size() - wordsFound;
+ }
+ 
+ #if !UCONFIG_NO_NORMALIZATION
+diff -ur icu.org/source/common/dictbe.h icu/source/common/dictbe.h
+--- icu.org/source/common/dictbe.h	2022-04-08 00:41:55.000000000 +0200
++++ icu/source/common/dictbe.h	2022-05-16 13:49:33.820459894 +0200
+@@ -35,7 +35,8 @@
+  * threads without synchronization.</p>
+  */
+ class DictionaryBreakEngine : public LanguageBreakEngine {
+- private:
++ protected:
++
+     /**
+      * The set of characters handled by this engine
+      * @internal
+@@ -43,14 +44,84 @@
+ 
+   UnicodeSet    fSet;
+ 
++  const int32_t WJ   = 0x2060;  // U+2060 WORD JOINER
++  const int32_t ZWSP = 0x200B;  // U+200B ZERO WIDTH SPACE
++
++  /**
++   * The break types it was constructed with
++   * @internal
++   */
++  uint32_t      fTypes;  // NOTE(review): KhmerBreakEngine passes a (1 << UBRK_*) mask, yet dictbe.cpp compares this numerically with UBRK_LINE - confirm
++
++  /**
++   * A Unicode set of all viramas
++   * @internal
++   */
++  UnicodeSet    fViramaSet;
++
++  /**
++   * A Unicode set of all base characters
++   * @internal
++   */
++  UnicodeSet    fBaseSet;
++
++  /**
++   * A Unicode set of all marks
++   * @internal
++   */
++  UnicodeSet    fMarkSet;
++
++  /**
++   * A Unicode set of all characters ignored in dictionary matching
++   * @internal
++   */
++  UnicodeSet    fIgnoreSet;
++
++  /**
++   * A Unicode set of characters skipped by scanBeforeStart and at the start of scanFwdClusters
++   * @internal
++   */
++  UnicodeSet    fSkipStartSet;
++
++  /**
++   * A Unicode set of characters skipped by scanAfterEnd and at the start of scanBackClusters
++   * @internal
++   */
++  UnicodeSet    fSkipEndSet;
++
++  /**
++   * A Unicode set of all characters that should not be broken before
++   * @internal
++   */
++  UnicodeSet    fNBeforeSet;
++
++  /**
++   * The number of clusters within which breaks are inhibited
++   * @internal
++   */
++  int32_t clusterLimit;
++
++  bool scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const;
++
++  bool scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const;
++  bool scanAfterEnd(UText *text, int32_t rangeEnd, int32_t& end, bool &doBreak) const;
++  void scanBackClusters(UText *text, int32_t textStart, int32_t& start) const;
++  void scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const;
++  void scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const;
++
+  public:
+ 
+   /**
+-   * <p>Constructor </p>
++   * <p>Default constructor.</p>
++   *
+    */
+   DictionaryBreakEngine();
+ 
+   /**
++   * <p>Constructor with break types.</p>
++   */
++  explicit DictionaryBreakEngine(uint32_t breakTypes);
++
++  /**
+    * <p>Virtual destructor.</p>
+    */
+   virtual ~DictionaryBreakEngine();
+@@ -305,10 +376,12 @@
+      * @internal
+      */
+ 
+-  UnicodeSet                fEndWordSet;
+   UnicodeSet                fBeginWordSet;
+-  UnicodeSet                fMarkSet;
+-  DictionaryMatcher  *fDictionary;
++  UnicodeSet                fPuncSet;
++  DictionaryMatcher        *fDictionary;
++
++  const uint32_t BADSNLP = 256 * 20;  // cost value stored for a span that had no dictionary match
++  const uint32_t kuint32max = 0x7FFFFFFF;  // "not reached" sentinel for bestSnlp/prev; NOTE: the value is INT32_MAX, not UINT32_MAX
+ 
+  public:
+ 
+diff -ur icu.org/source/common/dictionarydata.cpp icu/source/common/dictionarydata.cpp
+--- icu.org/source/common/dictionarydata.cpp	2023-06-14 06:23:55.000000000 +0900
++++ icu/source/common/dictionarydata.cpp	2023-06-26 02:18:05.709454400 +0900
+@@ -44,7 +44,7 @@
+ 
+ int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
+                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
+-                            int32_t *prefix) const {
++                            int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const {
+ 
+     UCharsTrie uct(characters);
+     int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
+@@ -55,7 +55,13 @@
+         UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
+         int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
+         codePointsMatched += 1;
++        if (ignoreSet != NULL && ignoreSet->contains(c)) {
++            continue;
++        }
+         if (USTRINGTRIE_HAS_VALUE(result)) {
++            if (codePointsMatched < minLength) {
++                continue;
++            }
+             if (wordCount < limit) {
+                 if (values != nullptr) {
+                     values[wordCount] = uct.getValue();
+@@ -112,7 +118,7 @@
+ 
+ int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
+                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
+-                            int32_t *prefix) const {
++                            int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const {
+     BytesTrie bt(characters);
+     int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
+     int32_t wordCount = 0;
+@@ -122,7 +128,13 @@
+         UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
+         int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
+         codePointsMatched += 1;
++        if (ignoreSet != NULL && ignoreSet->contains(c)) {
++            continue;
++        }
+         if (USTRINGTRIE_HAS_VALUE(result)) {
++            if (codePointsMatched < minLength) {
++                continue;
++            }
+             if (wordCount < limit) {
+                 if (values != nullptr) {
+                     values[wordCount] = bt.getValue();
+
+diff -ur icu.org/source/common/dictionarydata.h icu/source/common/dictionarydata.h
+--- icu.org/source/common/dictionarydata.h	2023-06-14 06:23:55.000000000 +0900
++++ icu/source/common/dictionarydata.h	2023-06-26 17:43:53.097724900 +0900
+@@ -21,6 +21,7 @@
+ #include "unicode/utext.h"
+ #include "unicode/udata.h"
+ #include "udataswp.h"
++#include "unicode/uniset.h"
+ #include "unicode/uobject.h"
+ #include "unicode/ustringtrie.h"
+ 
+@@ -92,7 +93,7 @@
+      */
+     virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
+                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
+-                            int32_t *prefix) const = 0;
++                            int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const = 0;
+ 
+     /** @return DictionaryData::TRIE_TYPE_XYZ */
+     virtual int32_t getType() const = 0;
+@@ -107,7 +108,7 @@
+     virtual ~UCharsDictionaryMatcher();
+     virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
+                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
+-                            int32_t *prefix) const override;
++                            int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const override;
+     virtual int32_t getType() const override;
+ private:
+     const char16_t *characters;
+@@ -125,7 +126,7 @@
+     virtual ~BytesDictionaryMatcher();
+     virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
+                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
+-                            int32_t *prefix) const override;
++                            int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const override;
+     virtual int32_t getType() const override;
+ private:
+     UChar32 transform(UChar32 c) const;
diff --git a/external/icu/icu4c-macosx.patch.1 b/external/icu/icu4c-macosx.patch.1
new file mode 100644
index 0000000000..fee08eb057
--- /dev/null
+++ b/external/icu/icu4c-macosx.patch.1
@@ -0,0 +1,20 @@
+diff -ur icu.org/source/common/putil.cpp icu/source/common/putil.cpp
+--- icu.org/source/common/putil.cpp	2017-04-10 16:22:16.000000000 +0200
++++ icu/source/common/putil.cpp	2017-04-21 22:14:09.940217733 +0200
+@@ -1198,8 +1198,16 @@
+         static const time_t decemberSolstice=1198332540; /*2007-12-22 06:09 UT*/
+ 
+         /* This probing will tell us when daylight savings occurs.  */
++#if U_PLATFORM_IS_DARWIN_BASED
++        struct tm *tmp;
++        tmp = localtime(&juneSolstice);
++        juneSol = *tmp;
++        tmp = localtime(&decemberSolstice);
++        decemberSol = *tmp;
++#else
+         localtime_r(&juneSolstice, &juneSol);
+         localtime_r(&decemberSolstice, &decemberSol);
++#endif
+         if(decemberSol.tm_isdst > 0) {
+           daylightType = U_DAYLIGHT_DECEMBER;
+         } else if(juneSol.tm_isdst > 0) {
diff --git a/external/icu/icu4c-mkdir.patch.1 b/external/icu/icu4c-mkdir.patch.1
new file mode 100644
index 0000000000..0cdcf2b078
--- /dev/null
+++ b/external/icu/icu4c-mkdir.patch.1
@@ -0,0 +1,17 @@
+diff -ur icu.org/source/data/Makefile.in icu/source/data/Makefile.in
+--- icu.org/source/data/Makefile.in	2020-10-28 22:21:12.000000000 +0100
++++ icu/source/data/Makefile.in	2020-11-17 10:18:37.960032668 +0100
+@@ -239,6 +239,13 @@
+ 
+ ifeq ($(ENABLE_SO_VERSION_DATA),1)
+ ifeq ($(PKGDATA_MODE),dll)
++
++# This should be in the included rules.mk but that is generated empty by
++# configure because we have no data/locales/root.txt with prebuilt data/in/
++$(TMP_DIR)/dirs.timestamp:
++	$(MKINSTALLDIRS) $(OUTTMPDIR) $(TMP_DIR)
++	echo timestamp > $@
++
+ SO_VERSION_DATA = $(OUTTMPDIR)/icudata.res
+ $(SO_VERSION_DATA) : $(MISCSRCDIR)/icudata.rc | $(TMP_DIR)/dirs.timestamp
+ ifeq ($(MSYS_RC_MODE),1)
diff --git a/external/icu/icu4c-rpath.patch.1 b/external/icu/icu4c-rpath.patch.1
new file mode 100644
index 0000000000..35a5457780
--- /dev/null
+++ b/external/icu/icu4c-rpath.patch.1
@@ -0,0 +1,36 @@
+diff -ur icu.org/source/config/mh-linux icu/source/config/mh-linux
+--- icu.org/source/config/mh-linux	2016-06-15 20:58:17.000000000 +0200
++++ icu/source/config/mh-linux	2017-04-21 22:38:18.893927819 +0200
+@@ -22,6 +22,10 @@
+ LD_RPATH= -Wl,-zorigin,-rpath,'$$'ORIGIN 
+ LD_RPATH_PRE = -Wl,-rpath,
+ 
++## Force RPATH=$ORIGIN to locate own dependencies w/o need for LD_LIBRARY_PATH:
++ENABLE_RPATH=YES
++RPATHLDFLAGS=${LD_RPATH_PRE}'$$ORIGIN'
++
+ ## These are the library specific LDFLAGS
+ LDFLAGSICUDT=-nodefaultlibs -nostdlib
+ 
+diff -ur icu.org/source/data/pkgdataMakefile.in icu/source/data/pkgdataMakefile.in
+--- icu.org/source/data/pkgdataMakefile.in	2016-06-15 20:58:17.000000000 +0200
++++ icu/source/data/pkgdataMakefile.in	2017-04-21 22:38:18.892927822 +0200
+@@ -18,6 +18,9 @@
+ MIDDLE_SO_TARGET=
+ PKGDATA_TRAILING_SPACE=" "
+ 
++# escape $ with \ when passing to echo; needed to preserve $ORIGIN
++SHLIB.c.shell := $(subst $$,\$$,$(SHLIB.c))
++
+ all : clean 
+ 	@echo GENCCODE_ASSEMBLY_TYPE=$(GENCCODE_ASSEMBLY) >> $(OUTPUTFILE)
+ 	@echo SO=$(SO) >> $(OUTPUTFILE)
+@@ -26,7 +29,7 @@
+ 	@echo LIB_EXT_ORDER=$(FINAL_SO_TARGET) >> $(OUTPUTFILE)
+ 	@echo COMPILE="$(COMPILE.c)" >> $(OUTPUTFILE)
+ 	@echo LIBFLAGS="-I$(top_srcdir)/common -I$(top_builddir)/common $(SHAREDLIBCPPFLAGS) $(SHAREDLIBCFLAGS)" >> $(OUTPUTFILE)
+-	@echo GENLIB="$(SHLIB.c)" >> $(OUTPUTFILE)
++	@echo GENLIB="$(SHLIB.c.shell)" >> $(OUTPUTFILE)
+ 	@echo LDICUDTFLAGS=$(LDFLAGSICUDT) >> $(OUTPUTFILE)
+ 	@echo LD_SONAME=$(LD_SONAME) >> $(OUTPUTFILE)
+ 	@echo RPATH_FLAGS=$(RPATH_FLAGS) >> $(OUTPUTFILE)
diff --git a/external/icu/icu4c-rtti.patch.1 b/external/icu/icu4c-rtti.patch.1
new file mode 100644
index 0000000000..c058c7f3c8
--- /dev/null
+++ b/external/icu/icu4c-rtti.patch.1
@@ -0,0 +1,12 @@
+diff -ur icu.org/source/config/mh-linux icu/source/config/mh-linux
+--- icu.org/source/config/mh-linux	2017-04-21 23:01:23.257769703 +0200
++++ icu/source/config/mh-linux	2017-04-21 23:03:23.166481552 +0200
+@@ -36,7 +36,7 @@
+ #SH# LD_SONAME=
+ 
+ ## Shared library options
+-LD_SOOPTIONS= -Wl,-Bsymbolic
++LD_SOOPTIONS= -Wl,-Bsymbolic-functions
+ 
+ ## Shared object suffix
+ SO = so
diff --git a/external/icu/icu4c-scriptrun.patch.1 b/external/icu/icu4c-scriptrun.patch.1
new file mode 100644
index 0000000000..f2f2cf9f3b
--- /dev/null
+++ b/external/icu/icu4c-scriptrun.patch.1
@@ -0,0 +1,60 @@
+diff -ur icu.org/source/extra/scrptrun/scrptrun.cpp icu/source/extra/scrptrun/scrptrun.cpp
+--- icu.org/source/extra/scrptrun/scrptrun.cpp	2017-01-20 01:20:31.000000000 +0100
++++ icu/source/extra/scrptrun/scrptrun.cpp	2017-04-21 22:59:31.708037770 +0200
+@@ -151,7 +151,11 @@
+         // characters above it on the stack will be poped.
+         if (pairIndex >= 0) {
+             if ((pairIndex & 1) == 0) {
+-                parenStack[++parenSP].pairIndex = pairIndex;
++                ++parenSP;
++                int32_t nVecSize = parenStack.size();
++                if (parenSP == nVecSize)
++                    parenStack.resize(nVecSize + 128);
++                parenStack[parenSP].pairIndex = pairIndex;
+                 parenStack[parenSP].scriptCode  = scriptCode;
+             } else if (parenSP >= 0) {
+                 int32_t pi = pairIndex & ~1;
+@@ -185,7 +189,14 @@
+             // pop it from the stack
+             if (pairIndex >= 0 && (pairIndex & 1) != 0 && parenSP >= 0) {
+                 parenSP -= 1;
+-                startSP -= 1;
++                /* Decrement startSP only if it is >= 0.
++                   Decrementing it unnecessarily leads to memory corruption
++                   while processing the while block above,
++                   e.g. with startSP = -4 and parenSP = -1.
++                */
++                if (startSP >= 0) {
++                    startSP -= 1;
++                }
+             }
+         } else {
+             // if the run broke on a surrogate pair,
+diff -ur icu.org/source/extra/scrptrun/scrptrun.h icu/source/extra/scrptrun/scrptrun.h
+--- icu.org/source/extra/scrptrun/scrptrun.h	2017-01-20 01:20:31.000000000 +0100
++++ icu/source/extra/scrptrun/scrptrun.h	2017-04-21 22:59:31.708037770 +0200
+@@ -19,6 +19,7 @@
+ #include "unicode/utypes.h"
+ #include "unicode/uobject.h"
+ #include "unicode/uscript.h"
++#include <vector>
+ 
+ U_NAMESPACE_BEGIN
+ 
+@@ -81,7 +82,7 @@
+     int32_t scriptEnd;
+     UScriptCode scriptCode;
+ 
+-    ParenStackEntry parenStack[128];
++    std::vector<ParenStackEntry> parenStack;
+     int32_t parenSP;
+ 
+     static int8_t highBit(int32_t value);
+@@ -135,6 +136,7 @@
+     scriptEnd   = charStart;
+     scriptCode  = USCRIPT_INVALID_CODE;
+     parenSP     = -1;
++    parenStack.resize(128);
+ }
+ 
+ inline void ScriptRun::reset(int32_t start, int32_t length)
diff --git a/external/icu/icu4c-solarisgcc.patch.1 b/external/icu/icu4c-solarisgcc.patch.1
new file mode 100644
index 0000000000..6000ed0cb9
--- /dev/null
+++ b/external/icu/icu4c-solarisgcc.patch.1
@@ -0,0 +1,12 @@
+diff -ur icu.org/source/common/uposixdefs.h icu/source/common/uposixdefs.h
+--- icu.org/source/common/uposixdefs.h	2017-03-09 03:12:45.000000000 +0100
++++ icu/source/common/uposixdefs.h	2017-04-21 22:23:11.857926971 +0200
+@@ -54,7 +54,7 @@
+  *
+  * z/OS needs this definition for timeval and to get usleep.
+  */
+-#if !defined(_XOPEN_SOURCE_EXTENDED) && defined(__TOS_MVS__)
++#if !defined(_XOPEN_SOURCE_EXTENDED) && (defined(__TOS_MVS__) || defined(__IBMC__) || defined(__IBMCPP__))
+ #   define _XOPEN_SOURCE_EXTENDED 1
+ #endif
+ 
diff --git a/external/icu/icu4c-ubsan.patch.1 b/external/icu/icu4c-ubsan.patch.1
new file mode 100644
index 0000000000..7b0c2efc92
--- /dev/null
+++ b/external/icu/icu4c-ubsan.patch.1
@@ -0,0 +1,14 @@
+diff -ur icu.org/source/common/ubidiimp.h icu/source/common/ubidiimp.h
+--- icu.org/source/common/ubidiimp.h	2019-10-03 13:16:41.000000000 +0200
++++ icu/source/common/ubidiimp.h	2019-10-28 19:08:13.533284618 +0100
+@@ -198,8 +198,8 @@
+ /* in a Run, logicalStart will get this bit set if the run level is odd */
+ #define INDEX_ODD_BIT (1UL<<31)
+ 
+-#define MAKE_INDEX_ODD_PAIR(index, level) ((index)|((int32_t)((level)&1)<<31))
+-#define ADD_ODD_BIT_FROM_LEVEL(x, level)  ((x)|=((int32_t)((level)&1)<<31))
++#define MAKE_INDEX_ODD_PAIR(index, level) ((index)|((uint32_t)((level)&1)<<31))
++#define ADD_ODD_BIT_FROM_LEVEL(x, level)  ((x)|=((uint32_t)((level)&1)<<31))
+ #define REMOVE_ODD_BIT(x)                 ((x)&=~INDEX_ODD_BIT)
+ 
+ #define GET_INDEX(x)   ((x)&~INDEX_ODD_BIT)
diff --git a/external/icu/icu4c-use-pkgdata-single-ccode-file-mode.patch.1 b/external/icu/icu4c-use-pkgdata-single-ccode-file-mode.patch.1
new file mode 100644
index 0000000000..237e554b8a
--- /dev/null
+++ b/external/icu/icu4c-use-pkgdata-single-ccode-file-mode.patch.1
@@ -0,0 +1,12 @@
+--- icu/source/tools/toolutil/pkg_genc.h.orig	2022-01-11 06:02:29.694678787 +0100
++++ icu/source/tools/toolutil/pkg_genc.h	2022-01-11 06:02:41.602640965 +0100
+@@ -48,9 +48,7 @@
+  * the data to generate the final data library. This can
+  * increase the performance of the pkdata tool.
+  */
+-#if U_PLATFORM == U_PF_OS400
+ #define USE_SINGLE_CCODE_FILE
+-#endif
+ 
+ /* Need to fix the file seperator character when using MinGW. */
+ #if defined(WINDOWS_WITH_GNUC) || defined(USING_CYGWIN)
diff --git a/external/icu/icu4c-warnings.patch.1 b/external/icu/icu4c-warnings.patch.1
new file mode 100644
index 0000000000..d8df0e14e9
--- /dev/null
+++ b/external/icu/icu4c-warnings.patch.1
@@ -0,0 +1,11 @@
+diff -ur icu.org/source/common/unicode/utf16.h icu/source/common/unicode/utf16.h
+--- icu.org/source/common/unicode/utf16.h	2020-10-28 22:21:12.000000000 +0100
++++ icu/source/common/unicode/utf16.h	2020-11-16 19:31:03.356478154 +0100
+@@ -398,6 +398,7 @@
+         (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
+     } else /* c>0x10ffff or not enough space */ { \
+         (isError)=true; \
++        (void)(isError); \
+     } \
+ } UPRV_BLOCK_MACRO_END
+ 
diff --git a/external/icu/icu4c-windows-cygwin-cross.patch.1 b/external/icu/icu4c-windows-cygwin-cross.patch.1
new file mode 100644
index 0000000000..dd6b47c172
--- /dev/null
+++ b/external/icu/icu4c-windows-cygwin-cross.patch.1
@@ -0,0 +1,131 @@
+diff -ur icu.org/source/acinclude.m4 icu/source/acinclude.m4
+--- icu.org/source/acinclude.m4     2020-04-10 16:22:16.000000000 +0200
++++ icu/source/acinclude.m4 2020-04-21 22:14:09.940217733 +0200
+@@ -52,6 +52,12 @@
+ 	else
+ 		icu_cv_host_frag=mh-cygwin-msvc
+ 	fi ;;
++aarch64-*-cygwin)
++	if test "$GCC" = yes; then
++		icu_cv_host_frag=mh-cygwin64
++	else
++		icu_cv_host_frag=mh-cygwin-msvc
++	fi ;;
+ *-*-mingw*)
+ 	if test "$GCC" = yes; then
+                 AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+--- icu/source/configure.ac.orig	2020-04-22 22:04:20.000000000 +0200
++++ icu/source/configure.ac	2020-10-01 09:39:05.570900400 +0200
+@@ -213,23 +213,33 @@
+ 	[cross_buildroot="${withval}"],
+         [cross_buildroot=""])
+ 
++cross_mixed_buildroot="$cross_buildroot"
++cross_unix_buildroot="$cross_buildroot"
+ if test "X$cross_buildroot" = "X"; then
+     if test "$cross_compiling" = "yes"; then
+         AC_MSG_ERROR([Error! Cross compiling but no --with-cross-build option specified - please supply the path to an executable ICU's build root])
+ 	dnl '
+     fi
+ else
+-    if test -f "${cross_buildroot}/config/icucross.mk"; then
++    case "${host}" in
++    *-*-cygwin*)
++        #M# cygpath -m isn't used because it doesn't work on Win98
++        cross_mixed_buildroot=$(cygpath -ad "$cross_buildroot" | tr '\\' '/')
++        cross_unix_buildroot=$(cygpath -au "$cross_buildroot")
++	;;
++    esac
++    if test -f "${cross_mixed_buildroot}/config/icucross.mk"; then
+         AC_MSG_RESULT([Using cross buildroot: $cross_buildroot])
+     else
+-        if test -d "${cross_buildroot}"; then
+-            AC_MSG_ERROR([${cross_buildroot}/config/icucross.mk not found. Please build ICU in ${cross_buildroot} first.])
++        if test -d "${cross_mixed_buildroot}"; then
++            AC_MSG_ERROR([${cross_mixed_buildroot}/config/icucross.mk not found. Please build ICU in ${cross_mixed_buildroot} first.])
+         else
+-            AC_MSG_ERROR([No such directory ${cross_buildroot} supplied as the argument to --with-cross-build. Use an absolute path.])
++            AC_MSG_ERROR([No such directory ${cross_mixed_buildroot} supplied as the argument to --with-cross-build. Use an absolute path.])
+         fi
+     fi
+ fi
+-AC_SUBST(cross_buildroot)
++AC_SUBST(cross_mixed_buildroot)
++AC_SUBST(cross_unix_buildroot)
+ 
+ # Check for doxygen to generate documentation
+ AC_PATH_PROG(DOXYGEN,doxygen,,$PATH:/usr/local/bin:/usr/bin)
+--- icu/source/test/testdata/Makefile.in.orig	2020-10-01 09:37:25.847888900 +0200
++++ icu/source/test/testdata/Makefile.in	2020-10-01 09:36:41.859996500 +0200
+@@ -82,7 +82,7 @@
+ # relative lib links from pkgdata are the same as for tmp
+ GENRBOPTS=-k
+ # use the cross root, in case we are cross compiling. Otherwise it is equal to top_builddir
+-TOOLDIR=$(cross_buildroot)/tools
++TOOLDIR=$(cross_mixed_buildroot)/tools
+ SRCDATADIR=$(top_srcdir)/data
+ UNICODEDATADIR=$(SRCDATADIR)/unidata
+ OUTDIR=$(top_builddir)/data/out
+--- icu/source/Makefile.in.orig	2020-04-22 22:04:20.000000000 +0200
++++ icu/source/Makefile.in	2020-10-01 09:29:36.642364000 +0200
+@@ -255,16 +255,16 @@
+ 	@(echo "CROSS_ICU_VERSION=$(VERSION)" ;\
+ 	  echo "TOOLEXEEXT=$(EXEEXT)" \
+ 	   ) > $@
+-	@(echo 'TOOLBINDIR=$$(cross_buildroot)/bin' ;\
+-	  echo 'TOOLLIBDIR=$$(cross_buildroot)/lib' ;\
+-	  echo "INVOKE=$(LDLIBRARYPATH_ENVVAR)=$(LIBRARY_PATH_PREFIX)"'$$(TOOLLIBDIR):$$(cross_buildroot)/stubdata:$$(cross_buildroot)/tools/ctestfw:$$$$'"$(LDLIBRARYPATH_ENVVAR)" ;\
+-	  echo "PKGDATA_INVOKE=$(LDLIBRARYPATH_ENVVAR)=$(LIBRARY_PATH_PREFIX)"'$$(cross_buildroot)/stubdata:$$(cross_buildroot)/tools/ctestfw:$$(TOOLLIBDIR):$$$$'"$(LDLIBRARYPATH_ENVVAR) " ;\
++	@(echo 'TOOLBINDIR=$$(cross_mixed_buildroot)/bin' ;\
++	  echo 'TOOLLIBDIR=$$(cross_mixed_buildroot)/lib' ;\
++	  echo "INVOKE=$(LDLIBRARYPATH_ENVVAR)=$(LIBRARY_PATH_PREFIX)"'$$(cross_unix_buildroot)/lib:$$(cross_unix_buildroot)/stubdata:$$(cross_unix_buildroot)/tools/ctestfw:$$$$'"$(LDLIBRARYPATH_ENVVAR)" ;\
++	  echo "PKGDATA_INVOKE=$(LDLIBRARYPATH_ENVVAR)=$(LIBRARY_PATH_PREFIX)"'$$(cross_unix_buildroot)/stubdata:$$(cross_unix_buildroot)/tools/ctestfw:$$(cross_unix_buildroot)/lib:$$$$'"$(LDLIBRARYPATH_ENVVAR) " ;\
+ 	  echo ) >> $@
+ 
+ config/icucross.inc: $(top_builddir)/icudefs.mk  $(top_builddir)/Makefile @platform_make_fragment@
+ 	@echo rebuilding $@
+-	@(grep '^CURR_FULL_DIR' $(top_builddir)/icudefs.mk ; \
+-	  grep '^CURR_FULL_DIR' @platform_make_fragment@ || echo ""; \
++	@(grep '^CURR_FULL_DIR' @platform_make_fragment@ || echo ""; \
++	  grep '^CURR_FULL_DIR' $(top_builddir)/icudefs.mk ; \
+ 	   ) > $@
+ 
+ config/icu.pc: $(srcdir)/config/icu.pc.in
+--- icu/source/icudefs.mk.in.orig	2020-04-22 22:04:20.000000000 +0200
++++ icu/source/icudefs.mk.in	2020-10-01 09:35:54.418128800 +0200
+@@ -35,7 +35,8 @@
+ sysconfdir = @sysconfdir@
+ # controls the include of $(top_builddir)/icucross.mk at bottom of file
+ cross_compiling = @cross_compiling@
+-cross_buildroot = @cross_buildroot@
++cross_mixed_buildroot = @cross_mixed_buildroot@
++cross_unix_buildroot = @cross_unix_buildroot@
+ 
+ # Package information
+ 
+@@ -303,8 +304,8 @@
+ INSTALLED_INVOKE = $(LDLIBRARYPATH_ENVVAR)=$(libdir):$$$(LDLIBRARYPATH_ENVVAR)
+ 
+ # Current full path directory for cross compilation
+-ifneq ($(strip $(cross_buildroot)),)
+-include $(cross_buildroot)/config/icucross.inc
++ifneq ($(strip $(cross_mixed_buildroot)),)
++include $(cross_mixed_buildroot)/config/icucross.inc
+ endif
+ 
+ # Platform-specific setup
+@@ -323,10 +324,11 @@
+ 
+ # some imported things from the cross env
+ TOOLEXEEXT = $(EXEEXT)
+-ifneq ($(strip $(cross_buildroot)),)
+-include $(cross_buildroot)/config/icucross.mk
++ifneq ($(strip $(cross_mixed_buildroot)),)
++include $(cross_mixed_buildroot)/config/icucross.mk
+ else
+-cross_buildroot = $(top_builddir)
++cross_mixed_buildroot = $(top_builddir)
++cross_unix_buildroot = $(top_builddir)
+ endif
+ 
+ # for tests
diff --git a/external/icu/khmerdict.dict b/external/icu/khmerdict.dict
new file mode 100644
index 0000000000..52605b6546
--- /dev/null
+++ b/external/icu/khmerdict.dict