diff options
Diffstat (limited to 'external/icu')
28 files changed, 1878 insertions, 0 deletions
diff --git a/external/icu/ExternalPackage_icu.mk b/external/icu/ExternalPackage_icu.mk new file mode 100644 index 0000000000..dcd4da2169 --- /dev/null +++ b/external/icu/ExternalPackage_icu.mk @@ -0,0 +1,42 @@ +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# + +icu_VERSION := $(ICU_MAJOR).$(ICU_MINOR)$(if $(ICU_MICRO),.$(ICU_MICRO)) + +$(eval $(call gb_ExternalPackage_ExternalPackage,icu,icu)) + +$(eval $(call gb_ExternalPackage_use_external_project,icu,icu)) + +ifneq ($(DISABLE_DYNLOADING),TRUE) +ifeq ($(OS),WNT) + +ifeq ($(COM),GCC) +$(eval $(call gb_ExternalPackage_add_files,icu,$(LIBO_LIB_FOLDER),\ + source/lib/icuin$(ICU_MAJOR).dll \ +)) +else +$(eval $(call gb_ExternalPackage_add_files,icu,$(LIBO_LIB_FOLDER),\ + source/lib/icuin$(if $(MSVC_USE_DEBUG_RUNTIME),d)$(ICU_MAJOR).dll \ +)) +endif # $(COM) + +else ifeq ($(OS),ANDROID) + +$(eval $(call gb_ExternalPackage_add_files,icu,$(LIBO_LIB_FOLDER),\ + source/lib/libicui18nlo.so \ +)) + +else # $(OS) != WNT/ANDROID + +$(eval $(call gb_ExternalPackage_add_file,icu,$(LIBO_LIB_FOLDER)/libicui18n$(gb_Library_DLLEXT).$(ICU_MAJOR),source/lib/libicui18n$(gb_Library_DLLEXT).$(icu_VERSION))) + +endif # $(OS) +endif # DISABLE_DYNLOADING + +# vim: set noet sw=4 ts=4: diff --git a/external/icu/ExternalPackage_icu_ure.mk b/external/icu/ExternalPackage_icu_ure.mk new file mode 100644 index 0000000000..fefe71afdc --- /dev/null +++ b/external/icu/ExternalPackage_icu_ure.mk @@ -0,0 +1,48 @@ +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# + +# libxml2 is in URE and depends on icuuc*.dll on Windows; the i18nlangtag lib is +# in URE and depends on the icuuc lib (which in turn depends on the icudata lib) +# on all platforms: + +$(eval $(call gb_ExternalPackage_ExternalPackage,icu_ure,icu)) + +$(eval $(call gb_ExternalPackage_use_external_project,icu_ure,icu)) + +ifneq ($(DISABLE_DYNLOADING),TRUE) +ifeq ($(OS),WNT) + +ifeq ($(COM),GCC) +$(eval $(call gb_ExternalPackage_add_files,icu_ure,$(LIBO_URE_LIB_FOLDER),\ + source/lib/icudt$(ICU_MAJOR).dll \ + source/lib/icuuc$(ICU_MAJOR).dll \ +)) +else +$(eval $(call gb_ExternalPackage_add_files,icu_ure,$(LIBO_URE_LIB_FOLDER),\ + source/lib/icudt$(if $(MSVC_USE_DEBUG_RUNTIME),d)$(ICU_MAJOR).dll \ + source/lib/icuuc$(if $(MSVC_USE_DEBUG_RUNTIME),d)$(ICU_MAJOR).dll \ +)) +endif # $(COM) + +else ifeq ($(OS),ANDROID) + +$(eval $(call gb_ExternalPackage_add_files,icu_ure,$(LIBO_URE_LIB_FOLDER),\ + source/lib/libicudatalo.so \ + source/lib/libicuuclo.so \ +)) + +else # $(OS) != WNT/ANDROID + +$(eval $(call gb_ExternalPackage_add_file,icu_ure,$(LIBO_URE_LIB_FOLDER)/libicudata$(gb_Library_DLLEXT).$(ICU_MAJOR),source/lib/libicudata$(gb_Library_DLLEXT).$(icu_VERSION))) +$(eval $(call gb_ExternalPackage_add_file,icu_ure,$(LIBO_URE_LIB_FOLDER)/libicuuc$(gb_Library_DLLEXT).$(ICU_MAJOR),source/lib/libicuuc$(gb_Library_DLLEXT).$(icu_VERSION))) + +endif # $(OS) +endif # DISABLE_DYNLOADING + +# vim: set noet sw=4 ts=4: diff --git a/external/icu/ExternalProject_icu.mk b/external/icu/ExternalProject_icu.mk new file mode 100644 index 0000000000..5388eee589 --- /dev/null +++ b/external/icu/ExternalProject_icu.mk @@ -0,0 +1,100 @@ +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# + +$(eval $(call gb_ExternalProject_ExternalProject,icu)) + +$(eval $(call gb_ExternalProject_register_targets,icu,\ + build \ +)) + +icu_CPPFLAGS:="-DHAVE_GCC_ATOMICS=$(if $(filter TRUE,$(GCC_HAVE_BUILTIN_ATOMIC)),1,0)" + +ifeq ($(OS),WNT) + +$(call gb_ExternalProject_get_state_target,icu,build) : + $(call gb_Trace_StartRange,icu,EXTERNAL) + $(call gb_ExternalProject_run,build,\ + autoconf -f \ + && export LIB="$(ILIB)" PYTHONWARNINGS="default" \ + gb_ICU_XFLAGS="-FS $(SOLARINC) $(gb_DEBUGINFO_FLAGS) $(if $(MSVC_USE_DEBUG_RUNTIME),-MDd,-MD -Gy)" \ + && CFLAGS="$${gb_ICU_XFLAGS}" CPPFLAGS="$(SOLARINC)" CXXFLAGS="$${gb_ICU_XFLAGS}" \ + INSTALL=`cygpath -m /usr/bin/install` $(if $(MSVC_USE_DEBUG_RUNTIME),LDFLAGS="-DEBUG") \ + $(gb_RUN_CONFIGURE) ./configure \ + $(if $(MSVC_USE_DEBUG_RUNTIME),--enable-debug --disable-release) \ + $(gb_CONFIGURE_PLATFORMS) \ + $(if $(CROSS_COMPILING), \ + --with-cross-build=$(WORKDIR_FOR_BUILD)/UnpackedTarball/icu/source \ + --disable-tools --disable-extras) \ + && $(MAKE) $(if $(CROSS_COMPILING),DATASUBDIR=data) $(if $(verbose),VERBOSE=1) \ + ,source) + $(call gb_Trace_EndRange,icu,EXTERNAL) + +else # $(OS) + +icu_CFLAGS:="$(CFLAGS) \ + $(if $(filter iOS,$(OS)),-DUCONFIG_NO_FILE_IO) \ + $(if $(SYSBASE),-I$(SYSBASE)/usr/include) \ + $(call gb_ExternalProject_get_build_flags,icu) \ + $(if $(ENABLE_LTO),$(gb_LTOFLAGS)) \ + $(if $(filter GCC,$(COM)),-fno-strict-aliasing) \ + $(if $(filter FUZZERS,$(BUILD_TYPE)),-DU_USE_STRTOD_L=0) \ + $(if $(filter ANDROID,$(OS)),-fvisibility=hidden -fno-omit-frame-pointer)" +icu_CXXFLAGS:="$(CXXFLAGS) $(CXXFLAGS_CXX11) \ + $(if $(filter iOS,$(OS)),-DUCONFIG_NO_FILE_IO) \ + $(call gb_ExternalProject_get_build_flags,icu) \ + $(if $(ENABLE_LTO),$(gb_LTOFLAGS)) \ + $(if $(filter GCC,$(COM)),-fno-strict-aliasing) \ + $(if $(filter FUZZERS,$(BUILD_TYPE)),-DU_USE_STRTOD_L=0) \ + $(if $(filter ANDROID,$(OS)),-fvisibility=hidden -fno-omit-frame-pointer -I$(SRCDIR)/include)" +icu_LDFLAGS:=" \ + $(if $(ENABLE_LTO),$(gb_LTOFLAGS)) \ + $(call gb_ExternalProject_get_link_flags,icu) \ + $(if $(filter TRUE,$(HAVE_LD_HASH_STYLE)),-Wl$(COMMA)--hash-style=$(WITH_LINKER_HASH_STYLE)) \ + $(if $(SYSBASE),-L../lib -L../../lib -L../stubdata -L../../stubdata -L$(SYSBASE)/usr/lib) \ + $(if $(filter TRUE,$(HAVE_LD_BSYMBOLIC_FUNCTIONS)), -Wl$(COMMA)-Bsymbolic-functions) \ + $(if $(filter ANDROID,$(OS)),$(gb_STDLIBS))" + +# DATASUBDIR=data in cross-compiling case, because --disable-tools completely skips the +# data directory/doesn't build the requested library in that case (icu/source/Makefile.in) +# so we need to add it back to the list of subdirectories to build +$(call gb_ExternalProject_get_state_target,icu,build) : + $(call gb_Trace_StartRange,icu,EXTERNAL) + $(call gb_ExternalProject_run,build,\ + autoconf -f && \ + CPPFLAGS=$(icu_CPPFLAGS) CFLAGS=$(icu_CFLAGS) \ + CXXFLAGS=$(icu_CXXFLAGS) LDFLAGS=$(icu_LDFLAGS) \ + PYTHONWARNINGS="default" \ + $(gb_RUN_CONFIGURE) ./configure \ + --disable-layout --disable-samples \ + $(if $(filter FUZZERS,$(BUILD_TYPE)),--disable-release) \ + $(if $(filter EMSCRIPTEN ANDROID,$(OS)),--disable-strict ac_cv_c_bigendian=no) \ + $(if $(filter SOLARIS,$(OS)),--disable-64bit-libs) \ + $(if $(filter TRUE,$(DISABLE_DYNLOADING)),\ + --with-data-packaging=static --enable-static --disable-shared --disable-dyload,\ + --disable-static --enable-shared $(if $(filter ANDROID,$(OS)),--with-library-suffix=lo)) \ + $(gb_CONFIGURE_PLATFORMS) \ + $(if $(CROSS_COMPILING), \ + --with-cross-build=$(WORKDIR_FOR_BUILD)/UnpackedTarball/icu/source \ + --disable-tools --disable-extras) \ + AR="$(AR)" RANLIB="$(RANLIB)" \ + && $(MAKE) $(if $(CROSS_COMPILING),DATASUBDIR=data) $(if $(verbose),VERBOSE=1) \ + $(if $(filter MACOSX,$(OS)), \ + && $(PERL) $(SRCDIR)/solenv/bin/macosx-change-install-names.pl shl \ + URELIB \ + $(EXTERNAL_WORKDIR)/source/lib/libicuuc$(gb_Library_DLLEXT).$(icu_VERSION) \ + $(EXTERNAL_WORKDIR)/source/lib/libicui18n$(gb_Library_DLLEXT).$(icu_VERSION) \ + && $(PERL) $(SRCDIR)/solenv/bin/macosx-change-install-names.pl shl \ + OOO \ + $(EXTERNAL_WORKDIR)/source/lib/libicudata$(gb_Library_DLLEXT).$(icu_VERSION)) \ + ,source) + $(call gb_Trace_EndRange,icu,EXTERNAL) + +endif + +# vim: set noet sw=4 ts=4: diff --git a/external/icu/Makefile b/external/icu/Makefile new file mode 100644 index 0000000000..e4968cf85f --- /dev/null +++ b/external/icu/Makefile @@ -0,0 +1,7 @@ +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- + +module_directory:=$(dir $(realpath $(firstword $(MAKEFILE_LIST)))) + +include $(module_directory)/../../solenv/gbuild/partial_build.mk + +# vim: set noet sw=4 ts=4: diff --git a/external/icu/Module_icu.mk b/external/icu/Module_icu.mk new file mode 100644 index 0000000000..5c99b930fc --- /dev/null +++ b/external/icu/Module_icu.mk @@ -0,0 +1,19 @@ +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# + +$(eval $(call gb_Module_Module,icu)) + +$(eval $(call gb_Module_add_targets,icu,\ + UnpackedTarball_icu \ + ExternalPackage_icu \ + ExternalPackage_icu_ure \ + ExternalProject_icu \ +)) + +# vim: set noet sw=4 ts=4: diff --git a/external/icu/README b/external/icu/README new file mode 100644 index 0000000000..23cf5f0524 --- /dev/null +++ b/external/icu/README @@ -0,0 +1 @@ +Library providing Unicode support, from [https://icu.unicode.org/]. diff --git a/external/icu/UnpackedTarball_icu.mk b/external/icu/UnpackedTarball_icu.mk new file mode 100644 index 0000000000..655614447d --- /dev/null +++ b/external/icu/UnpackedTarball_icu.mk @@ -0,0 +1,47 @@ +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# + +$(eval $(call gb_UnpackedTarball_UnpackedTarball,icu)) + +$(eval $(call gb_UnpackedTarball_set_tarball,icu,$(ICU_TARBALL))) + +$(eval $(call gb_UnpackedTarball_update_autoconf_configs,icu,source)) + +# Data zip contains data/... and needs to end up in icu/source/data/... +# Only data/misc/icudata.rc is needed for a Cygwin/MSVC build. +$(eval $(call gb_UnpackedTarball_set_pre_action,icu,\ + unzip -q -d source -o $(gb_UnpackedTarget_TARFILE_LOCATION)/$(ICU_DATA_TARBALL) data/misc/icudata.rc \ +)) + +$(eval $(call gb_UnpackedTarball_set_patchlevel,icu,0)) + +$(eval $(call gb_UnpackedTarball_add_patches,icu,\ + external/icu/icu4c-build.patch.1 \ + external/icu/icu4c-warnings.patch.1 \ + external/icu/icu4c-macosx.patch.1 \ + external/icu/icu4c-solarisgcc.patch.1 \ + external/icu/icu4c-mkdir.patch.1 \ + external/icu/icu4c-ubsan.patch.1 \ + external/icu/icu4c-scriptrun.patch.1 \ + external/icu/icu4c-rtti.patch.1 \ + external/icu/icu4c-clang-cl.patch.1 \ + external/icu/gcc9.patch \ + external/icu/c++20-comparison.patch.1 \ + external/icu/Wdeprecated-copy-dtor.patch \ + external/icu/icu4c-windows-cygwin-cross.patch.1 \ + external/icu/icu4c-emscripten-cross.patch.1 \ + external/icu/icu4c-use-pkgdata-single-ccode-file-mode.patch.1 \ + external/icu/icu4c-khmerbreakengine.patch.1 \ + external/icu/icu4c-$(if $(filter ANDROID,$(OS)),android,rpath).patch.1 \ + $(if $(filter-out ANDROID,$(OS)),external/icu/icu4c-icudata-stdlibs.patch.1) \ +)) + +$(eval $(call gb_UnpackedTarball_add_file,icu,source/data/brkitr/khmerdict.dict,external/icu/khmerdict.dict)) + +# vim: set noet sw=4 ts=4: diff --git a/external/icu/Wdeprecated-copy-dtor.patch b/external/icu/Wdeprecated-copy-dtor.patch new file mode 100644 index 0000000000..67078ef1bb --- /dev/null +++ b/external/icu/Wdeprecated-copy-dtor.patch @@ -0,0 +1,25 @@ +--- source/common/unicode/uobject.h ++++ source/common/unicode/uobject.h +@@ -245,10 +245,10 @@ + // direct use of UObject itself + + // default constructor +- // inline UObject() {} ++ UObject() = default; + + // copy constructor +- // inline UObject(const UObject &other) {} ++ UObject(const UObject &other) = default; + + #if 0 + // TODO Sometime in the future. Implement operator==(). +@@ -280,8 +280,8 @@ + * Subclasses need this assignment operator if they use compiler-provided + * assignment operators of their own. An alternative to not declaring one + * here would be to declare and empty-implement a protected or public one. +- UObject &UObject::operator=(const UObject &); + */ ++ UObject &operator=(const UObject &) = default; + }; + + #ifndef U_HIDE_INTERNAL_API diff --git a/external/icu/c++20-comparison.patch.1 b/external/icu/c++20-comparison.patch.1 new file mode 100644 index 0000000000..fa10b048ce --- /dev/null +++ b/external/icu/c++20-comparison.patch.1 @@ -0,0 +1,82 @@ +diff -ur icu.org/source/i18n/unicode/rbtz.h icu/source/i18n/unicode/rbtz.h +--- icu.org/source/i18n/unicode/rbtz.h 2022-10-19 02:53:21.000000000 +0200 ++++ icu/source/i18n/unicode/rbtz.h 2022-10-24 22:20:10.889969185 +0200 +@@ -87,6 +87,7 @@ + * @stable ICU 3.8 + */ + virtual bool operator!=(const TimeZone& that) const; ++ bool operator!=(const RuleBasedTimeZone& that) const {return !operator==(that);} + + /** + * Adds the `TimeZoneRule` which represents time transitions. +diff -ur icu.org/source/i18n/unicode/simpletz.h icu/source/i18n/unicode/simpletz.h +--- icu.org/source/i18n/unicode/simpletz.h 2022-10-19 02:53:21.000000000 +0200 ++++ icu/source/i18n/unicode/simpletz.h 2022-10-24 22:20:10.890969183 +0200 +@@ -112,6 +112,7 @@ + * @stable ICU 2.0 + */ + virtual bool operator==(const TimeZone& that) const override; ++ bool operator!=(const SimpleTimeZone& that) const {return !operator==(that);} + + /** + * Constructs a SimpleTimeZone with the given raw GMT offset and time zone ID, +diff -ur icu.org/source/i18n/unicode/smpdtfmt.h icu/source/i18n/unicode/smpdtfmt.h +--- icu.org/source/i18n/unicode/smpdtfmt.h 2022-10-19 02:53:21.000000000 +0200 ++++ icu/source/i18n/unicode/smpdtfmt.h 2022-10-24 22:20:10.891969181 +0200 +@@ -877,6 +877,7 @@ + * @stable ICU 2.0 + */ + virtual bool operator==(const Format& other) const override; ++ bool operator!=(const SimpleDateFormat& that) const {return !operator==(that);} + + + using DateFormat::format; +diff -ur icu.org/source/i18n/unicode/stsearch.h icu/source/i18n/unicode/stsearch.h +--- icu.org/source/i18n/unicode/stsearch.h 2022-10-19 02:53:21.000000000 +0200 ++++ icu/source/i18n/unicode/stsearch.h 2022-10-24 22:20:10.892969178 +0200 +@@ -298,6 +298,7 @@ + * @stable ICU 2.0 + */ + virtual bool operator==(const SearchIterator &that) const override; ++ bool operator!=(const StringSearch &that) const {return !operator==(that);} + + // public get and set methods ---------------------------------------- + +diff -ur icu.org/source/i18n/unicode/tzrule.h icu/source/i18n/unicode/tzrule.h +--- icu.org/source/i18n/unicode/tzrule.h 2022-10-19 02:53:21.000000000 +0200 ++++ icu/source/i18n/unicode/tzrule.h 2022-10-24 22:30:23.298744116 +0200 +@@ -257,6 +257,7 @@ + * @stable ICU 3.8 + */ + virtual bool operator!=(const TimeZoneRule& that) const override; ++ bool operator!=(const InitialTimeZoneRule& that) const {return !operator==(that);} + + /** + * Returns if this rule represents the same rule and offsets as another. +@@ -454,6 +455,7 @@ + * @stable ICU 3.8 + */ + virtual bool operator!=(const TimeZoneRule& that) const override; ++ bool operator!=(const AnnualTimeZoneRule& that) const {return !operator==(that);} + + /** + * Gets the start date/time rule used by this rule. +@@ -670,6 +672,7 @@ + * @stable ICU 3.8 + */ + virtual bool operator!=(const TimeZoneRule& that) const override; ++ bool operator!=(const TimeArrayTimeZoneRule& that) const {return !operator==(that);} + + /** + * Gets the time type of the start times used by this rule. The return value +diff -ur icu.org/source/i18n/unicode/vtzone.h icu/source/i18n/unicode/vtzone.h +--- icu.org/source/i18n/unicode/vtzone.h 2022-10-19 02:53:21.000000000 +0200 ++++ icu/source/i18n/unicode/vtzone.h 2022-10-24 22:20:10.895969172 +0200 +@@ -83,6 +83,7 @@ + * @stable ICU 3.8 + */ + virtual bool operator!=(const TimeZone& that) const; ++ bool operator!=(const VTimeZone& that) const {return !operator==(that);} + + /** + * Create a <code>VTimeZone</code> instance by the time zone ID. diff --git a/external/icu/cross-bin/icu-config b/external/icu/cross-bin/icu-config new file mode 100755 index 0000000000..8ccf94f9bd --- /dev/null +++ b/external/icu/cross-bin/icu-config @@ -0,0 +1,12 @@ +#!/bin/sh + +case $1 in +--version) + echo whatever + ;; +--cppflags) + echo ${ICU_CFLAGS} + ;; +--ldflags-searchpath) + echo ${ICU_LIBS} +esac diff --git a/external/icu/gcc9.patch b/external/icu/gcc9.patch new file mode 100644 index 0000000000..5c9808f8c3 --- /dev/null +++ b/external/icu/gcc9.patch @@ -0,0 +1,26 @@ +--- source/i18n/unicode/format.h ++++ source/i18n/unicode/format.h +@@ -22,6 +22,13 @@ + + #ifndef FORMAT_H + #define FORMAT_H ++ ++#ifdef __GNUC__ ++#pragma GCC diagnostic push ++#pragma GCC diagnostic ignored "-Wpragmas" // for old GCC ++#pragma GCC diagnostic ignored "-Wunknown-warning-option" // for Clang ++#pragma GCC diagnostic ignored "-Wdeprecated-copy" ++#endif + + + #include "unicode/utypes.h" +@@ -314,5 +314,9 @@ + + #endif /* U_SHOW_CPLUSPLUS_API */ + ++#ifdef __GNUC__ ++#pragma GCC diagnostic pop ++#endif ++ + #endif // _FORMAT + //eof diff --git a/external/icu/icu4c-android.patch.1 b/external/icu/icu4c-android.patch.1 new file mode 100644 index 0000000000..9ba252b402 --- /dev/null +++ b/external/icu/icu4c-android.patch.1 @@ -0,0 +1,75 @@ +diff -ur icu.org/source/common/unicode/platform.h icu/source/common/unicode/platform.h +--- icu.org/source/common/unicode/platform.h 2021-10-28 18:04:57.000000000 +0200 ++++ icu/source/common/unicode/platform.h 2021-11-15 21:03:11.474638494 +0100 +@@ -818,7 +818,7 @@ + UPRV_HAS_DECLSPEC_ATTRIBUTE(__dllimport__)) + # define U_EXPORT __declspec(dllexport) + #elif defined(__GNUC__) +-# define U_EXPORT __attribute__((visibility("default"))) ++# define U_EXPORT + #elif (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x550) \ + || (defined(__SUNPRO_C) && __SUNPRO_C >= 0x550) + # define U_EXPORT __global +diff -ur icu.org/source/config/mh-linux icu/source/config/mh-linux +--- icu.org/source/config/mh-linux 2021-11-15 20:56:39.460705065 +0100 ++++ icu/source/config/mh-linux 2021-11-15 21:03:11.474638494 +0100 +@@ -27,7 +27,7 @@ + + ## Compiler switch to embed a library name + # The initial tab in the next line is to prevent icu-config from reading it. +- LD_SONAME = -Wl,-soname -Wl,$(notdir $(MIDDLE_SO_TARGET)) ++ #LD_SONAME = -Wl,-soname -Wl,$(notdir $(MIDDLE_SO_TARGET)) + #SH# # We can't depend on MIDDLE_SO_TARGET being set. + #SH# LD_SONAME= + +diff -ur icu.org/source/configure icu/source/configure +--- icu.org/source/configure 2021-11-15 20:56:39.875703936 +0100 ++++ icu/source/configure 2021-11-15 21:03:11.475638491 +0100 +@@ -5272,7 +5273,7 @@ + else + icu_cv_host_frag=mh-linux-va + fi ;; +-*-*-linux*|*-*-gnu|*-*-k*bsd*-gnu|*-*-kopensolaris*-gnu) icu_cv_host_frag=mh-linux ;; ++*-*-linux*|*-*-gnu|*-*-k*bsd*-gnu|*-*-kopensolaris*-gnu|*-*-*-androideabi*) icu_cv_host_frag=mh-linux ;; + i[34567]86-*-cygwin) + if test "$GCC" = yes; then + icu_cv_host_frag=mh-cygwin +@@ -6472,6 +6466,10 @@ + # Check to see if genccode can generate simple assembly. + GENCCODE_ASSEMBLY= + case "${host}" in ++arm-*-linux-androideabi) ++ if test "$GCC" = yes; then ++ GENCCODE_ASSEMBLY="-a gcc-android-arm" ++ fi ;; + *-linux*|*-kfreebsd*-gnu*|i*86-*-*bsd*|i*86-pc-gnu) + if test "$GCC" = yes; then + # We're using gcc, and the simple -a gcc command line works for genccode +@@ -7594,6 +7592,10 @@ + # wchar_t can be used + CHECK_UTF16_STRING_RESULT="available" + ;; ++*-*-*-androideabi|mips-unknown-linux-android) ++ # no UTF-16 strings thanks, I think, this is to avoid the -std=c++0x which causes trouble with uint64_t ++ CHECK_UTF16_STRING_RESULT="nope" ++ ;; + *) + ;; + esac +diff -ur icu.org/source/i18n/decimfmt.cpp icu/source/i18n/decimfmt.cpp +--- icu.org/source/i18n/decimfmt.cpp 2021-10-28 18:04:57.000000000 +0200 ++++ icu/source/i18n/decimfmt.cpp 2021-11-15 21:03:11.476638489 +0100 +@@ -9,6 +9,13 @@ + // Helpful in toString methods and elsewhere. + #define UNISTR_FROM_STRING_EXPLICIT + ++#ifdef __ANDROID__ ++#ifndef ARM ++#define ARM ++#endif ++#include <android/compatibility.hxx> ++#endif ++ + #include <cmath> + #include <cstdlib> + #include <stdlib.h> diff --git a/external/icu/icu4c-build.patch.1 b/external/icu/icu4c-build.patch.1 new file mode 100644 index 0000000000..a878de7323 --- /dev/null +++ b/external/icu/icu4c-build.patch.1 @@ -0,0 +1,91 @@ +diff -ur icu.org/source/config/mh-darwin icu/source/config/mh-darwin +--- icu.org/source/config/mh-darwin 2016-06-15 20:58:17.000000000 +0200 ++++ icu/source/config/mh-darwin 2017-04-21 21:30:23.584568210 +0200 +@@ -30,11 +30,7 @@ + SHLIB.cc= $(CXX) -dynamiclib -dynamic $(CXXFLAGS) $(LDFLAGS) $(LD_SOOPTIONS) + + ## Compiler switches to embed a library name and version information +-ifeq ($(ENABLE_RPATH),YES) +-LD_SONAME = -Wl,-compatibility_version -Wl,$(SO_TARGET_VERSION_MAJOR) -Wl,-current_version -Wl,$(SO_TARGET_VERSION) -install_name $(libdir)/$(notdir $(MIDDLE_SO_TARGET)) +-else +-LD_SONAME = -Wl,-compatibility_version -Wl,$(SO_TARGET_VERSION_MAJOR) -Wl,-current_version -Wl,$(SO_TARGET_VERSION) -install_name $(notdir $(MIDDLE_SO_TARGET)) $(PKGDATA_TRAILING_SPACE) +-endif ++LD_SONAME = -Wl,-compatibility_version -Wl,$(SO_TARGET_VERSION_MAJOR) -Wl,-current_version -Wl,$(SO_TARGET_VERSION) -install_name @__________________________________________________URELIB/$(notdir $(MIDDLE_SO_TARGET)) + + ## Compiler switch to embed a runtime search path + LD_RPATH= +@@ -50,10 +46,6 @@ + ## Non-shared intermediate object suffix + STATIC_O = ao + +-## Override Versioned target for a shared library. +-FINAL_SO_TARGET= $(basename $(SO_TARGET)).$(SO_TARGET_VERSION).$(SO) +-MIDDLE_SO_TARGET= $(basename $(SO_TARGET)).$(SO_TARGET_VERSION_MAJOR).$(SO) +- + ## Compilation and dependency rules + %.$(STATIC_O): $(srcdir)/%.c + $(call SILENT_COMPILE,$(strip $(COMPILE.c) $(STATICCPPFLAGS) $(STATICCFLAGS)) -MMD -MT "$*.d $*.o $*.$(STATIC_O)" -o $@ $<) +@@ -67,16 +59,10 @@ + + ## Versioned libraries rules + +-%.$(SO_TARGET_VERSION_MAJOR).$(SO): %.$(SO_TARGET_VERSION).$(SO) ++%.$(SO).$(SO_TARGET_VERSION_MAJOR): %.$(SO).$(SO_TARGET_VERSION) + $(RM) $@ && ln -s ${<F} $@ +-%.$(SO): %.$(SO_TARGET_VERSION_MAJOR).$(SO) +- $(RM) $@ && ln -s ${*F}.$(SO_TARGET_VERSION).$(SO) $@ +- +-# tzcode option +-TZORIG_EXTRA_CFLAGS=-DSTD_INSPIRED +- +-# genren opts +-GENREN_PL_OPTS=-x Mach-O -n '-g' -p '| c++filt' ++%.$(SO): %.$(SO).$(SO_TARGET_VERSION_MAJOR) ++ $(RM) $@ && ln -s ${*F}.$(SO).$(SO_TARGET_VERSION) $@ + + ## Remove shared library 's' + STATIC_PREFIX_WHEN_USED = +diff -ur icu.org/source/tools/toolutil/pkg_genc.cpp icu/source/tools/toolutil/pkg_genc.cpp +--- icu.org/source/tools/toolutil/pkg_genc.cpp 2017-04-13 11:46:02.000000000 +0200 ++++ icu/source/tools/toolutil/pkg_genc.cpp 2017-04-21 21:30:23.583568212 +0200 +@@ -160,6 +160,28 @@ + + ".long ","",HEX_0X + }, ++ {"gcc-android-arm", ++ "\t.arch armv5te\n" ++ "\t.fpu softvfp\n" ++ "\t.eabi_attribute 20, 1\n" ++ "\t.eabi_attribute 21, 1\n" ++ "\t.eabi_attribute 23, 3\n" ++ "\t.eabi_attribute 24, 1\n" ++ "\t.eabi_attribute 25, 1\n" ++ "\t.eabi_attribute 26, 2\n" ++ "\t.eabi_attribute 30, 6\n" ++ "\t.eabi_attribute 18, 4\n" ++ "\t.file \"%s.s\"\n" ++ "\t.global %s\n" ++ "\t.section .rodata\n" ++ "\t.align 2\n" ++ "\t.type %s, %%object\n" ++ "%s:\n", ++ ++ "\t.word ", ++ "\t.section .note.GNU-stack,\"\",%%progbits\n", ++ HEX_0X ++ }, + /* 16 bytes alignment. */ + /* http://docs.oracle.com/cd/E19641-01/802-1947/802-1947.pdf */ + {"sun", +diff -ur icu.org/source/tools/toolutil/pkg_genc.h icu/source/tools/toolutil/pkg_genc.h +--- icu.org/source/tools/toolutil/pkg_genc.h 2017-01-20 01:20:31.000000000 +0100 ++++ icu/source/tools/toolutil/pkg_genc.h 2017-04-21 21:30:23.582568215 +0200 +@@ -60,7 +60,7 @@ + #endif + + #define LARGE_BUFFER_MAX_SIZE 2048 +-#define SMALL_BUFFER_MAX_SIZE 512 ++#define SMALL_BUFFER_MAX_SIZE 2048 + #define SMALL_BUFFER_FLAG_NAMES 32 + #define BUFFER_PADDING_SIZE 20 + diff --git a/external/icu/icu4c-clang-cl.patch.1 b/external/icu/icu4c-clang-cl.patch.1 new file mode 100644 index 0000000000..a111a0df99 --- /dev/null +++ b/external/icu/icu4c-clang-cl.patch.1 @@ -0,0 +1,28 @@ +diff -ur icu.org/source/config/mh-cygwin-msvc icu/source/config/mh-cygwin-msvc +--- icu.org/source/config/mh-cygwin-msvc 2017-01-23 01:38:28.000000000 +0100 ++++ icu/source/config/mh-cygwin-msvc 2017-04-21 23:07:28.482892025 +0200 +@@ -55,8 +55,8 @@ + LDFLAGS+=-nologo + + # Commands to compile +-COMPILE.c= $(CC) $(CPPFLAGS) $(DEFS) $(CFLAGS) -c +-COMPILE.cc= $(CXX) $(CPPFLAGS) $(DEFS) $(CXXFLAGS) -c ++COMPILE.c= true && $(CC) $(CPPFLAGS) $(DEFS) $(CFLAGS) -c ++COMPILE.cc= true && $(CXX) $(CPPFLAGS) $(DEFS) $(CXXFLAGS) -c + + # Commands to link + LINK.c= LINK.EXE -subsystem:console $(LDFLAGS) +diff -ur icu.org/source/runConfigureICU icu/source/runConfigureICU +--- icu.org/source/runConfigureICU 2017-01-23 01:38:28.000000000 +0100 ++++ icu/source/runConfigureICU 2017-04-21 23:07:28.482892025 +0200 +@@ -261,8 +261,8 @@ + Cygwin/MSVC) + THE_OS="Windows with Cygwin" + THE_COMP="Microsoft Visual C++" +- CC=cl; export CC +- CXX=cl; export CXX ++ CC=${CC-cl}; export CC ++ CXX=${CXX-cl}; export CXX + RELEASE_CFLAGS='-Gy -MD' + RELEASE_CXXFLAGS='-Gy -MD' + DEBUG_CFLAGS='-FS -Zi -MDd' diff --git a/external/icu/icu4c-emscripten-cross.patch.1 b/external/icu/icu4c-emscripten-cross.patch.1 new file mode 100644 index 0000000000..84c88a68a8 --- /dev/null +++ b/external/icu/icu4c-emscripten-cross.patch.1 @@ -0,0 +1,99 @@ +--- icu/source/acinclude.m4.orig 2020-04-22 22:04:20.000000000 +0200 ++++ icu/source/acinclude.m4 2020-11-04 06:10:29.993070072 +0100 +@@ -84,6 +84,7 @@ + *-dec-osf*) icu_cv_host_frag=mh-alpha-osf ;; + *-*-nto*) icu_cv_host_frag=mh-qnx ;; + *-ncr-*) icu_cv_host_frag=mh-mpras ;; ++wasm*-*-emscripten*) icu_cv_host_frag=mh-emscripten ;; + *) icu_cv_host_frag=mh-unknown ;; + esac + ] +--- /dev/null ++++ icu/source/config/mh-emscripten 2015-10-06 12:01:00.497972406 +0200 +@@ -0,0 +1,86 @@ ++## Emscripten-specific setup ++## Copyright (c) 1999-2013, International Business Machines Corporation and ++## others. All Rights Reserved. ++## Commands to generate dependency files ++GEN_DEPS.c= $(CC) -E -MM $(DEFS) $(CPPFLAGS) ++GEN_DEPS.cc= $(CXX) -E -MM $(DEFS) $(CPPFLAGS) $(CXXFLAGS) ++ ++## Flags for position independent code ++SHAREDLIBCFLAGS = -fPIC ++SHAREDLIBCXXFLAGS = -fPIC ++SHAREDLIBCPPFLAGS = -DPIC ++ ++## Additional flags when building libraries and with threads ++THREADSCPPFLAGS = -D_REENTRANT ++LIBCPPFLAGS = ++ ++## Compiler switch to embed a runtime search path ++LD_RPATH= -Wl,-zorigin,-rpath,'$$'ORIGIN ++LD_RPATH_PRE = -Wl,-rpath, ++ ++## Force RPATH=$ORIGIN to locate own dependencies w/o need for LD_LIBRARY_PATH: ++ENABLE_RPATH=YES ++RPATHLDFLAGS=${LD_RPATH_PRE}'$$ORIGIN' ++ ++## These are the library specific LDFLAGS ++#LDFLAGSICUDT=-nodefaultlibs -nostdlib ++# Debian change: linking icudata as data only causes too many problems. ++LDFLAGSICUDT= ++ ++## Compiler switch to embed a library name ++# The initial tab in the next line is to prevent icu-config from reading it. ++ LD_SONAME = -Wl,-soname -Wl,$(notdir $(MIDDLE_SO_TARGET)) ++#SH# # We can't depend on MIDDLE_SO_TARGET being set. ++#SH# LD_SONAME= ++ ++## Shared library options ++LD_SOOPTIONS= -Wl,-Bsymbolic-functions ++ ++## Shared object suffix ++SO = so ++## Non-shared intermediate object suffix ++STATIC_O = o ++ ++## Compilation rules ++# WASM needs -pthread for atomics support ++%.$(STATIC_O): $(srcdir)/%.c ++ $(call SILENT_COMPILE,$(strip $(COMPILE.c) $(STATICCPPFLAGS) $(STATICCFLAGS)) -pthread -o $@ $<) ++ ++%.$(STATIC_O): $(srcdir)/%.cpp ++ $(call SILENT_COMPILE,$(strip $(COMPILE.cc) $(STATICCPPFLAGS) $(STATICCXXFLAGS)) -pthread -o $@ $<) ++ ++ ++## Dependency rules ++%.d: $(srcdir)/%.c ++ $(call ICU_MSG,(deps)) $< ++ @$(SHELL) -ec '$(GEN_DEPS.c) $< \ ++ | sed '\''s%\($*\)\.o[ :]*%\1.o $@ : %g'\'' > $@; \ ++ [ -s $@ ] || rm -f $@' ++ ++%.d: $(srcdir)/%.cpp ++ $(call ICU_MSG,(deps)) $< ++ @$(SHELL) -ec '$(GEN_DEPS.cc) $< \ ++ | sed '\''s%\($*\)\.o[ :]*%\1.o $@ : %g'\'' > $@; \ ++ [ -s $@ ] || rm -f $@' ++ ++## Versioned libraries rules ++ ++%.$(SO).$(SO_TARGET_VERSION_MAJOR): %.$(SO).$(SO_TARGET_VERSION) ++ $(RM) $@ && ln -s ${<F} $@ ++%.$(SO): %.$(SO).$(SO_TARGET_VERSION_MAJOR) ++ $(RM) $@ && ln -s ${*F}.$(SO).$(SO_TARGET_VERSION) $@ ++ ++## Bind internal references ++ ++# LDflags that pkgdata will use ++BIR_LDFLAGS= -Wl,-Bsymbolic ++ ++# Dependencies [i.e. map files] for the final library ++BIR_DEPS= ++ ++## Remove shared library 's' ++STATIC_PREFIX_WHEN_USED = ++STATIC_PREFIX = ++ ++## without assembly ++PKGDATA_OPTS = -O $(top_builddir)/data/icupkg.inc -w diff --git a/external/icu/icu4c-icudata-stdlibs.patch.1 b/external/icu/icu4c-icudata-stdlibs.patch.1 new file mode 100644 index 0000000000..c8d66c6ed0 --- /dev/null +++ b/external/icu/icu4c-icudata-stdlibs.patch.1 @@ -0,0 +1,14 @@ +diff -ur icu.org/source/config/mh-linux icu/source/config/mh-linux +--- icu.org/source/config/mh-linux 2017-04-21 23:09:57.588533707 +0200 ++++ icu/source/config/mh-linux 2017-04-21 23:11:38.075292226 +0200 +@@ -27,7 +27,9 @@ + RPATHLDFLAGS=${LD_RPATH_PRE}'$$ORIGIN' + + ## These are the library specific LDFLAGS +-LDFLAGSICUDT=-nodefaultlibs -nostdlib ++#LDFLAGSICUDT=-nodefaultlibs -nostdlib ++# Debian change: linking icudata as data only causes too many problems. ++LDFLAGSICUDT= + + ## Compiler switch to embed a library name + # The initial tab in the next line is to prevent icu-config from reading it. diff --git a/external/icu/icu4c-khmerbreakengine.patch.1 b/external/icu/icu4c-khmerbreakengine.patch.1 new file mode 100644 index 0000000000..605914014e --- /dev/null +++ b/external/icu/icu4c-khmerbreakengine.patch.1 @@ -0,0 +1,837 @@ +diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp +--- icu.org/source/common/dictbe.cpp 2023-06-14 06:23:55.000000000 +0900 ++++ icu/source/common/dictbe.cpp 2023-06-26 17:43:53.034173100 +0900 +@@ -35,7 +35,19 @@ + ****************************************************************** + */ + +-DictionaryBreakEngine::DictionaryBreakEngine() { ++DictionaryBreakEngine::DictionaryBreakEngine() ++ : fTypes(0), clusterLimit(0) { ++} ++ ++DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) ++ : fTypes(breakTypes), clusterLimit(3) { ++ UErrorCode status = U_ZERO_ERROR; ++ fViramaSet.applyPattern(UnicodeString(u"[[:ccc=VR:]]"), status); ++ ++ // note Skip Sets contain fIgnoreSet characters too. ++ fSkipStartSet.applyPattern(UnicodeString(u"[[:lb=OP:][:lb=QU:]\\u200C\\u200D\\u2060]"), status); ++ fSkipEndSet.applyPattern(UnicodeString(u"[[:lb=CP:][:lb=QU:][:lb=EX:][:lb=CL:]\\u200C\\u200D\\u2060]"), status); ++ fNBeforeSet.applyPattern(UnicodeString(u"[[:lb=CR:][:lb=LF:][:lb=NL:][:lb=SP:][:lb=ZW:][:lb=IS:][:lb=BA:][:lb=NS:]]"), status); + } + + DictionaryBreakEngine::~DictionaryBreakEngine() { +@@ -85,6 +97,169 @@ + fSet.compact(); + } + ++bool ++DictionaryBreakEngine::scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const { ++ UErrorCode status = U_ZERO_ERROR; ++ UText* ut = utext_clone(NULL, text, false, true, &status); ++ utext_setNativeIndex(ut, start); ++ UChar32 c = utext_current32(ut); ++ bool res = false; ++ doBreak = true; ++ while (start >= 0) { ++ if (!fSkipStartSet.contains(c)) { ++ res = (c == ZWSP); ++ break; ++ } ++ --start; ++ c = utext_previous32(ut); ++ doBreak = false; ++ } ++ utext_close(ut); ++ return res; ++} ++ ++bool ++DictionaryBreakEngine::scanAfterEnd(UText *text, int32_t textEnd, int32_t& end, bool &doBreak) const { ++ UErrorCode status = U_ZERO_ERROR; ++ UText* ut = utext_clone(NULL, text, false, true, &status); ++ utext_setNativeIndex(ut, end); ++ UChar32 c = utext_current32(ut); ++ bool res = false; ++ doBreak = !fNBeforeSet.contains(c); ++ while (end < textEnd) { ++ if (!fSkipEndSet.contains(c)) { ++ res = (c == ZWSP); ++ break; ++ } ++ ++end; ++ c = utext_next32(ut); ++ doBreak = false; ++ } ++ utext_close(ut); ++ return res; ++} ++ ++void ++DictionaryBreakEngine::scanBackClusters(UText *text, int32_t textStart, int32_t& start) const { ++ UChar32 c = 0; ++ start = utext_getNativeIndex(text); ++ while (start > textStart) { ++ c = utext_previous32(text); ++ --start; ++ if (!fSkipEndSet.contains(c)) ++ break; ++ } ++ for (int i = 0; i < clusterLimit; ++i) { // scan backwards clusterLimit clusters ++ while (start > textStart) { ++ while (fIgnoreSet.contains(c)) ++ c = utext_previous32(text); ++ if (!fMarkSet.contains(c)) { ++ if (fBaseSet.contains(c)) { ++ c = utext_previous32(text); ++ if (!fViramaSet.contains(c)) { // Virama (e.g. coeng) preceding base. Treat sequence as a mark ++ utext_next32(text); ++ c = utext_current32(text); ++ break; ++ } else { ++ --start; ++ } ++ } else { ++ break; ++ } ++ } ++ c = utext_previous32(text); ++ --start; ++ } ++ if (!fBaseSet.contains(c) || start < textStart) { // not a cluster start so finish ++ break; ++ } ++ c = utext_previous32(text); ++ --start; // go round again ++ } // ignore hitting previous inhibitor since scanning for it should have found us! ++ ++start; // counteract --before ++} ++ ++void ++DictionaryBreakEngine::scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const { ++ UChar32 c = utext_current32(text); ++ end = utext_getNativeIndex(text); ++ while (end < textEnd) { ++ if (!fSkipStartSet.contains(c)) ++ break; ++ utext_next32(text); ++ c = utext_current32(text); ++ ++end; ++ } ++ for (int i = 0; i < clusterLimit; ++i) { // scan forwards clusterLimit clusters ++ while (fIgnoreSet.contains(c)) { ++ utext_next32(text); ++ c = utext_current32(text); ++ } ++ if (fBaseSet.contains(c)) { ++ while (end < textEnd) { ++ utext_next32(text); ++ c = utext_current32(text); ++ ++end; ++ if (!fMarkSet.contains(c)) ++ break; ++ else if (fViramaSet.contains(c)) { // handle coeng + base as mark ++ utext_next32(text); ++ c = utext_current32(text); ++ ++end; ++ if (!fBaseSet.contains(c)) ++ break; ++ } ++ } ++ } else { ++ --end; // bad char so break after char before it ++ break; ++ } ++ } ++} ++ ++bool ++DictionaryBreakEngine::scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const { ++ UErrorCode status = U_ZERO_ERROR; ++ UText* ut = utext_clone(NULL, text, false, true, &status); ++ int32_t nat = start; ++ utext_setNativeIndex(ut, nat); ++ bool foundFirst = true; ++ int32_t curr = start; ++ while (nat < end) { ++ UChar32 c = utext_current32(ut); ++ if (c == ZWSP || c == WJ) { ++ curr = nat + 1; ++ if (foundFirst) // only scan backwards for first inhibitor ++ scanBackClusters(ut, start, before); ++ foundFirst = false; // don't scan backwards if we go around again. Also marks found something ++ ++ utext_next32(ut); ++ scanFwdClusters(ut, end, after); ++ nat = after + 1; ++ ++ if (c == ZWSP || c == WJ) { // did we hit another one? ++ continue; ++ } else { ++ break; ++ } ++ } ++ ++ ++nat; // keep hunting ++ utext_next32(ut); ++ } ++ ++ utext_close(ut); ++ ++ if (nat >= end && foundFirst) { ++ start = before = after = nat; ++ return false; // failed to find anything ++ } ++ else { ++ start = curr; ++ } ++ return true; // yup hit one ++} ++ + /* + ****************************************************************** + * PossibleWord +@@ -114,7 +289,7 @@ + ~PossibleWord() {} + + // Fill the list of candidates if needed, select the longest, and return the number found +- int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ); ++ int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet = NULL, int32_t minLength = 0 ); + + // Select the currently marked candidate, point after it in the text, and invalidate self + int32_t acceptMarked( UText *text ); +@@ -135,12 +310,12 @@ + }; + + +-int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) { ++int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet, int32_t minLength) { + // TODO: If getIndex is too slow, use offset < 0 and add discardAll() + int32_t start = (int32_t)utext_getNativeIndex(text); + if (start != offset) { + offset = start; +- count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, nullptr, &prefix); ++ count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, nullptr, &prefix, ignoreSet, minLength); + // Dictionary leaves text after longest prefix, not longest word. Back up. + if (count <= 0) { + utext_setNativeIndex(text, start); +@@ -814,53 +989,30 @@ + * KhmerBreakEngine + */ + +-// How many words in a row are "good enough"? +-static const int32_t KHMER_LOOKAHEAD = 3; +- +-// Will not combine a non-word with a preceding dictionary word longer than this +-static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3; +- +-// Will not combine a non-word that shares at least this much prefix with a +-// dictionary word, with a preceding word +-static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3; +- +-// Minimum word size +-static const int32_t KHMER_MIN_WORD = 2; +- +-// Minimum number of characters for two words +-static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2; +- + KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) +- : DictionaryBreakEngine(), ++ : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)), + fDictionary(adoptDictionary) + { + UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); + UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr"); +- UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status); ++ ++ clusterLimit = 3; ++ ++ UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]\\u2060\\u200C\\u200D]"), status); + if (U_SUCCESS(status)) { + setCharacters(khmerWordSet); + } + fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); +- fMarkSet.add(0x0020); +- fEndWordSet = khmerWordSet; +- fBeginWordSet.add(0x1780, 0x17B3); +- //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels +- //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word +- //fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word +- fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters +- //fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels +-// fEndWordSet.remove(0x0E31); // MAI HAN-AKAT +-// fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI +-// fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK +-// fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI +-// fSuffixSet.add(THAI_PAIYANNOI); +-// fSuffixSet.add(THAI_MAIYAMOK); ++ fIgnoreSet.add(0x2060); // WJ ++ fIgnoreSet.add(0x200C, 0x200D); // ZWJ, ZWNJ ++ fBaseSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:lb=SA:]&[:^M:]]"), status); ++ fPuncSet.applyPattern(UnicodeString(u"[\\u17D4\\u17D5\\u17D6\\u17D7\\u17D9:]"), status); + + // Compact for caching. + fMarkSet.compact(); +- fEndWordSet.compact(); +- fBeginWordSet.compact(); +-// fSuffixSet.compact(); ++ fIgnoreSet.compact(); ++ fBaseSet.compact(); ++ fPuncSet.compact(); + UTRACE_EXIT_STATUS(status); + } + +@@ -876,175 +1028,205 @@ + UBool /* isPhraseBreaking */, + UErrorCode& status ) const { + if (U_FAILURE(status)) return 0; +- if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { +- return 0; // Not enough characters for two words ++ uint32_t wordsFound = foundBreaks.size(); ++ int32_t before = 0; ++ int32_t after = 0; ++ int32_t finalBefore = 0; ++ int32_t initAfter = 0; ++ int32_t scanStart = rangeStart; ++ int32_t scanEnd = rangeEnd; ++ ++ bool startZwsp = false; ++ bool breakStart = false; ++ bool breakEnd = false; ++ ++ if (rangeStart > 0) { ++ --scanStart; ++ startZwsp = scanBeforeStart(text, scanStart, breakStart); + } + +- uint32_t wordsFound = 0; +- int32_t cpWordLength = 0; +- int32_t cuWordLength = 0; +- int32_t current; +- PossibleWord words[KHMER_LOOKAHEAD]; +- + utext_setNativeIndex(text, rangeStart); ++ scanFwdClusters(text, rangeEnd, initAfter); ++ bool endZwsp = scanAfterEnd(text, utext_nativeLength(text), scanEnd, breakEnd); ++ utext_setNativeIndex(text, rangeEnd - 1); ++ scanBackClusters(text, rangeStart, finalBefore); ++ if (finalBefore < initAfter) { // the whole run is tented so no breaks ++ if (breakStart || fTypes < UBRK_LINE) ++ foundBreaks.push(rangeStart, status); ++ if (breakEnd || fTypes < UBRK_LINE) ++ foundBreaks.push(rangeEnd, status); ++ return foundBreaks.size() - wordsFound; ++ } + +- while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { +- cuWordLength = 0; +- cpWordLength = 0; +- +- // Look for candidate words at the current position +- int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); +- +- // If we found exactly one, use that +- if (candidates == 1) { +- cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text); +- cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength(); +- wordsFound += 1; +- } ++ scanStart = rangeStart; ++ scanWJ(text, scanStart, rangeEnd, before, after); ++ if (startZwsp || initAfter >= before) { ++ after = initAfter; ++ before = 0; ++ } ++ if (!endZwsp && after > finalBefore && after < rangeEnd) ++ endZwsp = true; ++ if (endZwsp && before > finalBefore) ++ before = finalBefore; + +- // If there was more than one, see which one can take us forward the most words +- else if (candidates > 1) { +- // If we're already at the end of the range, we're done +- if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { +- goto foundBest; +- } +- do { +- if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { +- // Followed by another dictionary word; mark first word as a good candidate +- words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); ++ utext_setNativeIndex(text, rangeStart); ++ int32_t numCodePts = rangeEnd - rangeStart; ++ // bestSnlp[i] is the snlp of the best segmentation of the first i ++ // code points in the range to be matched. ++ UVector32 bestSnlp(numCodePts + 1, status); ++ bestSnlp.addElement(0, status); ++ for(int32_t i = 1; i <= numCodePts; i++) { ++ bestSnlp.addElement(kuint32max, status); ++ } + +- // If we're already at the end of the range, we're done +- if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { +- goto foundBest; +- } ++ // prev[i] is the index of the last code point in the previous word in ++ // the best segmentation of the first i characters. Note negative implies ++ // that the code point is part of an unknown word. ++ UVector32 prev(numCodePts + 1, status); ++ for(int32_t i = 0; i <= numCodePts; i++) { ++ prev.addElement(kuint32max, status); ++ } + +- // See if any of the possible second words is followed by a third word +- do { +- // If we find a third word, stop right away +- if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { +- words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); +- goto foundBest; +- } +- } +- while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text)); +- } ++ const int32_t maxWordSize = 20; ++ UVector32 values(maxWordSize, status); ++ values.setSize(maxWordSize); ++ UVector32 lengths(maxWordSize, status); ++ lengths.setSize(maxWordSize); ++ ++ // Dynamic programming to find the best segmentation. ++ ++ // In outer loop, i is the code point index, ++ // ix is the corresponding string (code unit) index. ++ // They differ when the string contains supplementary characters. ++ int32_t ix = rangeStart; ++ for (int32_t i = 0; i < numCodePts; ++i, utext_setNativeIndex(text, ++ix)) { ++ if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) { ++ continue; ++ } ++ ++ int32_t count; ++ count = fDictionary->matches(text, numCodePts - i, maxWordSize, ++ NULL, lengths.getBuffer(), values.getBuffer(), NULL, &fIgnoreSet, 2); ++ // Note: lengths is filled with code point lengths ++ // The NULL parameter is the ignored code unit lengths. ++ ++ for (int32_t j = 0; j < count; j++) { ++ int32_t ln = lengths.elementAti(j); ++ if (ln + i >= numCodePts) ++ continue; ++ utext_setNativeIndex(text, ln+ix); ++ int32_t c = utext_current32(text); ++ if (fMarkSet.contains(c) || c == 0x17D2) { // Coeng ++ lengths.removeElementAt(j); ++ values.removeElementAt(j); ++ --j; ++ --count; + } +- while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text)); +-foundBest: +- cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text); +- cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength(); +- wordsFound += 1; + } +- +- // We come here after having either found a word or not. We look ahead to the +- // next word. If it's not a dictionary word, we will combine it with the word we +- // just found (if there is one), but only if the preceding word does not exceed +- // the threshold. +- // The text iterator should now be positioned at the end of the word we found. +- if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) { +- // if it is a dictionary word, do nothing. If it isn't, then if there is +- // no preceding word, or the non-word shares less than the minimum threshold +- // of characters with a dictionary word, then scan to resynchronize +- if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 +- && (cuWordLength == 0 +- || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) { +- // Look for a plausible word boundary +- int32_t remaining = rangeEnd - (current+cuWordLength); +- UChar32 pc; +- UChar32 uc; +- int32_t chars = 0; +- for (;;) { +- int32_t pcIndex = (int32_t)utext_getNativeIndex(text); +- pc = utext_next32(text); +- int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex; +- chars += pcSize; +- remaining -= pcSize; +- if (remaining <= 0) { ++ if (count == 0) { ++ utext_setNativeIndex(text, ix); ++ int32_t c = utext_current32(text); ++ if (fPuncSet.contains(c) || fIgnoreSet.contains(c) || c == ZWSP) { ++ values.setElementAt(0, count); ++ lengths.setElementAt(1, count++); ++ } else if (fBaseSet.contains(c)) { ++ int32_t currix = utext_getNativeIndex(text); ++ do { ++ utext_next32(text); ++ c = utext_current32(text); ++ if (utext_getNativeIndex(text) >= rangeEnd) + break; +- } +- uc = utext_current32(text); +- if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { +- // Maybe. See if it's in the dictionary. +- int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); +- utext_setNativeIndex(text, current+cuWordLength+chars); +- if (num_candidates > 0) { ++ if (c == 0x17D2) { // Coeng ++ utext_next32(text); ++ c = utext_current32(text); ++ if (!fBaseSet.contains(c) || utext_getNativeIndex(text) >= rangeEnd) { + break; ++ } else { ++ utext_next32(text); ++ c = utext_current32(text); ++ if (utext_getNativeIndex(text) >= rangeEnd) ++ break; + } + } +- } +- +- // Bump the word count if there wasn't already one +- if (cuWordLength <= 0) { +- wordsFound += 1; +- } ++ } while (fMarkSet.contains(c) || fIgnoreSet.contains(c)); ++ values.setElementAt(BADSNLP, count); ++ lengths.setElementAt(utext_getNativeIndex(text) - currix, count++); ++ } else { ++ values.setElementAt(BADSNLP, count); ++ lengths.setElementAt(1, count++); ++ } ++ } + +- // Update the length with the passed-over characters +- cuWordLength += chars; ++ for (int32_t j = 0; j < count; j++) { ++ uint32_t v = values.elementAti(j); ++ int32_t newSnlp = bestSnlp.elementAti(i) + v; ++ int32_t ln = lengths.elementAti(j); ++ utext_setNativeIndex(text, ln+ix); ++ int32_t c = utext_current32(text); ++ while ((fPuncSet.contains(c) || fIgnoreSet.contains(c)) && ln + i < numCodePts) { ++ ++ln; ++ utext_next32(text); ++ c = utext_current32(text); + } +- else { +- // Back up to where we were for next iteration +- utext_setNativeIndex(text, current+cuWordLength); ++ int32_t ln_j_i = ln + i; // yes really i! ++ if (newSnlp < bestSnlp.elementAti(ln_j_i)) { ++ if (v == BADSNLP) { ++ int32_t p = prev.elementAti(i); ++ if (p < 0) ++ prev.setElementAt(p, ln_j_i); ++ else ++ prev.setElementAt(-i, ln_j_i); ++ } ++ else ++ prev.setElementAt(i, ln_j_i); ++ bestSnlp.setElementAt(newSnlp, ln_j_i); + } + } +- +- // Never stop before a combining mark. +- int32_t currPos; +- while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { +- utext_next32(text); +- cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos; ++ } ++ // Start pushing the optimal offset index into t_boundary (t for tentative). ++ // prev[numCodePts] is guaranteed to be meaningful. ++ // We'll first push in the reverse order, i.e., ++ // t_boundary[0] = numCodePts, and afterwards do a swap. ++ UVector32 t_boundary(numCodePts+1, status); ++ ++ int32_t numBreaks = 0; ++ // No segmentation found, set boundary to end of range ++ while (numCodePts >= 0 && (uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) { ++ --numCodePts; ++ } ++ if (numCodePts < 0) { ++ t_boundary.addElement(numCodePts, status); ++ numBreaks++; ++ } else { ++ for (int32_t i = numCodePts; (uint32_t)i != kuint32max; i = prev.elementAti(i)) { ++ if (i < 0) i = -i; ++ t_boundary.addElement(i, status); ++ numBreaks++; + } ++ // U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0); ++ } + +- // Look ahead for possible suffixes if a dictionary word does not follow. +- // We do this in code rather than using a rule so that the heuristic +- // resynch continues to function. For example, one of the suffix characters +- // could be a typo in the middle of a word. +-// if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { +-// if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 +-// && fSuffixSet.contains(uc = utext_current32(text))) { +-// if (uc == KHMER_PAIYANNOI) { +-// if (!fSuffixSet.contains(utext_previous32(text))) { +-// // Skip over previous end and PAIYANNOI +-// utext_next32(text); +-// utext_next32(text); +-// wordLength += 1; // Add PAIYANNOI to word +-// uc = utext_current32(text); // Fetch next character +-// } +-// else { +-// // Restore prior position +-// utext_next32(text); +-// } +-// } +-// if (uc == KHMER_MAIYAMOK) { +-// if (utext_previous32(text) != KHMER_MAIYAMOK) { +-// // Skip over previous end and MAIYAMOK +-// utext_next32(text); +-// utext_next32(text); +-// wordLength += 1; // Add MAIYAMOK to word +-// } +-// else { +-// // Restore prior position +-// utext_next32(text); +-// } +-// } +-// } +-// else { +-// utext_setNativeIndex(text, current+wordLength); +-// } +-// } +- +- // Did we find a word on this iteration? If so, push it on the break stack +- if (cuWordLength > 0) { +- foundBreaks.push((current+cuWordLength), status); ++ // Now that we're done, convert positions in t_boundary[] (indices in ++ // the normalized input string) back to indices in the original input UText ++ // while reversing t_boundary and pushing values to foundBreaks. ++ for (int32_t i = numBreaks-1; i >= 0; i--) { ++ int32_t cpPos = t_boundary.elementAti(i); ++ if (cpPos == 0 && !breakStart && fTypes >= UBRK_LINE) continue; ++ int32_t utextPos = cpPos + rangeStart; ++ while (utextPos > after && scanWJ(text, utextPos, scanEnd, before, after)); ++ if (utextPos < before) { ++ // Boundaries are added to foundBreaks output in ascending order. ++ U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos); ++ foundBreaks.push(utextPos, status); + } + } +- ++ + // Don't return a break for the end of the dictionary range if there is one there. +- if (foundBreaks.peeki() >= rangeEnd) { ++ if (!breakEnd && fTypes >= UBRK_LINE && foundBreaks.peeki() >= rangeEnd) { + (void) foundBreaks.popi(); +- wordsFound -= 1; + } + +- return wordsFound; ++ return foundBreaks.size() - wordsFound; + } + + #if !UCONFIG_NO_NORMALIZATION +diff -ur icu.org/source/common/dictbe.h icu/source/common/dictbe.h +--- icu.org/source/common/dictbe.h 2022-04-08 00:41:55.000000000 +0200 ++++ icu/source/common/dictbe.h 2022-05-16 13:49:33.820459894 +0200 +@@ -35,7 +35,8 @@ + * threads without synchronization.</p> + */ + class DictionaryBreakEngine : public LanguageBreakEngine { +- private: ++ protected: ++ + /** + * The set of characters handled by this engine + * @internal +@@ -43,14 +44,84 @@ + + UnicodeSet fSet; + ++ const int32_t WJ = 0x2060; ++ const int32_t ZWSP = 0x200B; ++ ++ /** ++ * The break types it was constructed with ++ * @internal ++ */ ++ uint32_t fTypes; ++ ++ /** ++ * A Unicode set of all viramas ++ * @internal ++ */ ++ UnicodeSet fViramaSet; ++ ++ /** ++ * A Unicode set of all base characters ++ * @internal ++ */ ++ UnicodeSet fBaseSet; ++ ++ /** ++ * A Unicode set of all marks ++ * @internal ++ */ ++ UnicodeSet fMarkSet; ++ ++ /** ++ * A Unicode set of all characters ignored ignored in dictionary matching ++ * @internal ++ */ ++ UnicodeSet fIgnoreSet; ++ ++ /** ++ * A Unicode set of all characters ignored ignored in dictionary matching ++ * @internal ++ */ ++ UnicodeSet fSkipStartSet; ++ ++ /** ++ * A Unicode set of all characters ignored ignored in dictionary matching ++ * @internal ++ */ ++ UnicodeSet fSkipEndSet; ++ ++ /** ++ * A Unicode set of all characters that should not be broken before ++ * @internal ++ */ ++ UnicodeSet fNBeforeSet; ++ ++ /** ++ * The number of clusters within which breaks are inhibited ++ * @internal ++ */ ++ int32_t clusterLimit; ++ ++ bool scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const; ++ ++ bool scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const; ++ bool scanAfterEnd(UText *text, int32_t rangeEnd, int32_t& end, bool &doBreak) const; ++ void scanBackClusters(UText *text, int32_t textStart, int32_t& start) const; ++ void scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const; ++ + public: + + /** +- * <p>Constructor </p> ++ * <p>Default constructor.</p> ++ * + */ + DictionaryBreakEngine(); + + /** ++ * <p>Constructor with break types.</p> ++ */ ++ explicit DictionaryBreakEngine(uint32_t breakTypes); ++ ++ /** + * <p>Virtual destructor.</p> + */ + virtual ~DictionaryBreakEngine(); +@@ -305,10 +376,12 @@ + * @internal + */ + +- UnicodeSet fEndWordSet; + UnicodeSet fBeginWordSet; +- UnicodeSet fMarkSet; +- DictionaryMatcher *fDictionary; ++ UnicodeSet fPuncSet; ++ DictionaryMatcher *fDictionary; ++ ++ const uint32_t BADSNLP = 256 * 20; ++ const uint32_t kuint32max = 0x7FFFFFFF; + + public: + +diff -ur icu.org/source/common/dictionarydata.cpp icu/source/common/dictionarydata.cpp +--- icu.org/source/common/dictionarydata.cpp 2023-06-14 06:23:55.000000000 +0900 ++++ icu/source/common/dictionarydata.cpp 2023-06-26 02:18:05.709454400 +0900 +@@ -44,7 +44,7 @@ + + int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, + int32_t *lengths, int32_t *cpLengths, int32_t *values, +- int32_t *prefix) const { ++ int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const { + + UCharsTrie uct(characters); + int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text); +@@ -55,7 +55,13 @@ + UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c); + int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex; + codePointsMatched += 1; ++ if (ignoreSet != NULL && ignoreSet->contains(c)) { ++ continue; ++ } + if (USTRINGTRIE_HAS_VALUE(result)) { ++ if (codePointsMatched < minLength) { ++ continue; ++ } + if (wordCount < limit) { + if (values != nullptr) { + values[wordCount] = uct.getValue(); +@@ -112,7 +118,7 @@ + + int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, + int32_t *lengths, int32_t *cpLengths, int32_t *values, +- int32_t *prefix) const { ++ int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const { + BytesTrie bt(characters); + int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text); + int32_t wordCount = 0; +@@ -122,7 +128,13 @@ + UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c)); + int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex; + codePointsMatched += 1; ++ if (ignoreSet != NULL && ignoreSet->contains(c)) { ++ continue; ++ } + if (USTRINGTRIE_HAS_VALUE(result)) { ++ if (codePointsMatched < minLength) { ++ continue; ++ } + if (wordCount < limit) { + if (values != nullptr) { + values[wordCount] = bt.getValue(); + +diff -ur icu.org/source/common/dictionarydata.h icu/source/common/dictionarydata.h +--- icu.org/source/common/dictionarydata.h 2023-06-14 06:23:55.000000000 +0900 ++++ icu/source/common/dictionarydata.h 2023-06-26 17:43:53.097724900 +0900 +@@ -21,6 +21,7 @@ + #include "unicode/utext.h" + #include "unicode/udata.h" + #include "udataswp.h" ++#include "unicode/uniset.h" + #include "unicode/uobject.h" + #include "unicode/ustringtrie.h" + +@@ -92,7 +93,7 @@ + */ + virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit, + int32_t *lengths, int32_t *cpLengths, int32_t *values, +- int32_t *prefix) const = 0; ++ int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const = 0; + + /** @return DictionaryData::TRIE_TYPE_XYZ */ + virtual int32_t getType() const = 0; +@@ -107,7 +108,7 @@ + virtual ~UCharsDictionaryMatcher(); + virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit, + int32_t *lengths, int32_t *cpLengths, int32_t *values, +- int32_t *prefix) const override; ++ int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const override; + virtual int32_t getType() const override; + private: + const char16_t *characters; +@@ -125,7 +126,7 @@ + virtual ~BytesDictionaryMatcher(); + virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit, + int32_t *lengths, int32_t *cpLengths, int32_t *values, +- int32_t *prefix) const override; ++ int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const override; + virtual int32_t getType() const override; + private: + UChar32 transform(UChar32 c) const; diff --git a/external/icu/icu4c-macosx.patch.1 b/external/icu/icu4c-macosx.patch.1 new file mode 100644 index 0000000000..fee08eb057 --- /dev/null +++ b/external/icu/icu4c-macosx.patch.1 @@ -0,0 +1,20 @@ +diff -ur icu.org/source/common/putil.cpp icu/source/common/putil.cpp +--- icu.org/source/common/putil.cpp 2017-04-10 16:22:16.000000000 +0200 ++++ icu/source/common/putil.cpp 2017-04-21 22:14:09.940217733 +0200 +@@ -1198,8 +1198,16 @@ + static const time_t decemberSolstice=1198332540; /*2007-12-22 06:09 UT*/ + + /* This probing will tell us when daylight savings occurs. */ ++#if U_PLATFORM_IS_DARWIN_BASED ++ struct tm *tmp; ++ tmp = localtime(&juneSolstice); ++ juneSol = *tmp; ++ tmp = localtime(&decemberSolstice); ++ decemberSol = *tmp; ++#else + localtime_r(&juneSolstice, &juneSol); + localtime_r(&decemberSolstice, &decemberSol); ++#endif + if(decemberSol.tm_isdst > 0) { + daylightType = U_DAYLIGHT_DECEMBER; + } else if(juneSol.tm_isdst > 0) { diff --git a/external/icu/icu4c-mkdir.patch.1 b/external/icu/icu4c-mkdir.patch.1 new file mode 100644 index 0000000000..0cdcf2b078 --- /dev/null +++ b/external/icu/icu4c-mkdir.patch.1 @@ -0,0 +1,17 @@ +diff -ur icu.org/source/data/Makefile.in icu/source/data/Makefile.in +--- icu.org/source/data/Makefile.in 2020-10-28 22:21:12.000000000 +0100 ++++ icu/source/data/Makefile.in 2020-11-17 10:18:37.960032668 +0100 +@@ -239,6 +239,13 @@ + + ifeq ($(ENABLE_SO_VERSION_DATA),1) + ifeq ($(PKGDATA_MODE),dll) ++ ++# This should be in the included rules.mk but that is generated empty by ++# configure because we have no data/locales/root.txt with prebuilt data/in/ ++$(TMP_DIR)/dirs.timestamp: ++ $(MKINSTALLDIRS) $(OUTTMPDIR) $(TMP_DIR) ++ echo timestamp > $@ ++ + SO_VERSION_DATA = $(OUTTMPDIR)/icudata.res + $(SO_VERSION_DATA) : $(MISCSRCDIR)/icudata.rc | $(TMP_DIR)/dirs.timestamp + ifeq ($(MSYS_RC_MODE),1) diff --git a/external/icu/icu4c-rpath.patch.1 b/external/icu/icu4c-rpath.patch.1 new file mode 100644 index 0000000000..35a5457780 --- /dev/null +++ b/external/icu/icu4c-rpath.patch.1 @@ -0,0 +1,36 @@ +diff -ur icu.org/source/config/mh-linux icu/source/config/mh-linux +--- icu.org/source/config/mh-linux 2016-06-15 20:58:17.000000000 +0200 ++++ icu/source/config/mh-linux 2017-04-21 22:38:18.893927819 +0200 +@@ -22,6 +22,10 @@ + LD_RPATH= -Wl,-zorigin,-rpath,'$$'ORIGIN + LD_RPATH_PRE = -Wl,-rpath, + ++## Force RPATH=$ORIGIN to locate own dependencies w/o need for LD_LIBRARY_PATH: ++ENABLE_RPATH=YES ++RPATHLDFLAGS=${LD_RPATH_PRE}'$$ORIGIN' ++ + ## These are the library specific LDFLAGS + LDFLAGSICUDT=-nodefaultlibs -nostdlib + +diff -ur icu.org/source/data/pkgdataMakefile.in icu/source/data/pkgdataMakefile.in +--- icu.org/source/data/pkgdataMakefile.in 2016-06-15 20:58:17.000000000 +0200 ++++ icu/source/data/pkgdataMakefile.in 2017-04-21 22:38:18.892927822 +0200 +@@ -18,6 +18,9 @@ + MIDDLE_SO_TARGET= + PKGDATA_TRAILING_SPACE=" " + ++# escape $ with \ when passing to echo; needed to preserve $ORIGIN ++SHLIB.c.shell := $(subst $$,\$$,$(SHLIB.c)) ++ + all : clean + @echo GENCCODE_ASSEMBLY_TYPE=$(GENCCODE_ASSEMBLY) >> $(OUTPUTFILE) + @echo SO=$(SO) >> $(OUTPUTFILE) +@@ -26,7 +29,7 @@ + @echo LIB_EXT_ORDER=$(FINAL_SO_TARGET) >> $(OUTPUTFILE) + @echo COMPILE="$(COMPILE.c)" >> $(OUTPUTFILE) + @echo LIBFLAGS="-I$(top_srcdir)/common -I$(top_builddir)/common $(SHAREDLIBCPPFLAGS) $(SHAREDLIBCFLAGS)" >> $(OUTPUTFILE) +- @echo GENLIB="$(SHLIB.c)" >> $(OUTPUTFILE) ++ @echo GENLIB="$(SHLIB.c.shell)" >> $(OUTPUTFILE) + @echo LDICUDTFLAGS=$(LDFLAGSICUDT) >> $(OUTPUTFILE) + @echo LD_SONAME=$(LD_SONAME) >> $(OUTPUTFILE) + @echo RPATH_FLAGS=$(RPATH_FLAGS) >> $(OUTPUTFILE) diff --git a/external/icu/icu4c-rtti.patch.1 b/external/icu/icu4c-rtti.patch.1 new file mode 100644 index 0000000000..c058c7f3c8 --- /dev/null +++ b/external/icu/icu4c-rtti.patch.1 @@ -0,0 +1,12 @@ +diff -ur icu.org/source/config/mh-linux icu/source/config/mh-linux +--- icu.org/source/config/mh-linux 2017-04-21 23:01:23.257769703 +0200 ++++ icu/source/config/mh-linux 2017-04-21 23:03:23.166481552 +0200 +@@ -36,7 +36,7 @@ + #SH# LD_SONAME= + + ## Shared library options +-LD_SOOPTIONS= -Wl,-Bsymbolic ++LD_SOOPTIONS= -Wl,-Bsymbolic-functions + + ## Shared object suffix + SO = so diff --git a/external/icu/icu4c-scriptrun.patch.1 b/external/icu/icu4c-scriptrun.patch.1 new file mode 100644 index 0000000000..f2f2cf9f3b --- /dev/null +++ b/external/icu/icu4c-scriptrun.patch.1 @@ -0,0 +1,60 @@ +diff -ur icu.org/source/extra/scrptrun/scrptrun.cpp icu/source/extra/scrptrun/scrptrun.cpp +--- icu.org/source/extra/scrptrun/scrptrun.cpp 2017-01-20 01:20:31.000000000 +0100 ++++ icu/source/extra/scrptrun/scrptrun.cpp 2017-04-21 22:59:31.708037770 +0200 +@@ -151,7 +151,11 @@ + // characters above it on the stack will be poped. + if (pairIndex >= 0) { + if ((pairIndex & 1) == 0) { +- parenStack[++parenSP].pairIndex = pairIndex; ++ ++parenSP; ++ int32_t nVecSize = parenStack.size(); ++ if (parenSP == nVecSize) ++ parenStack.resize(nVecSize + 128); ++ parenStack[parenSP].pairIndex = pairIndex; + parenStack[parenSP].scriptCode = scriptCode; + } else if (parenSP >= 0) { + int32_t pi = pairIndex & ~1; +@@ -185,7 +189,14 @@ + // pop it from the stack + if (pairIndex >= 0 && (pairIndex & 1) != 0 && parenSP >= 0) { + parenSP -= 1; +- startSP -= 1; ++ /* decrement startSP only if it is >= 0, ++ decrementing it unnecessarily will lead to memory corruption ++ while processing the above while block. ++ e.g. startSP = -4 , parenSP = -1 ++ */ ++ if (startSP >= 0) { ++ startSP -= 1; ++ } + } + } else { + // if the run broke on a surrogate pair, +diff -ur icu.org/source/extra/scrptrun/scrptrun.h icu/source/extra/scrptrun/scrptrun.h +--- icu.org/source/extra/scrptrun/scrptrun.h 2017-01-20 01:20:31.000000000 +0100 ++++ icu/source/extra/scrptrun/scrptrun.h 2017-04-21 22:59:31.708037770 +0200 +@@ -19,6 +19,7 @@ + #include "unicode/utypes.h" + #include "unicode/uobject.h" + #include "unicode/uscript.h" ++#include <vector> + + U_NAMESPACE_BEGIN + +@@ -81,7 +82,7 @@ + int32_t scriptEnd; + UScriptCode scriptCode; + +- ParenStackEntry parenStack[128]; ++ std::vector<ParenStackEntry> parenStack; + int32_t parenSP; + + static int8_t highBit(int32_t value); +@@ -135,6 +136,7 @@ + scriptEnd = charStart; + scriptCode = USCRIPT_INVALID_CODE; + parenSP = -1; ++ parenStack.resize(128); + } + + inline void ScriptRun::reset(int32_t start, int32_t length) diff --git a/external/icu/icu4c-solarisgcc.patch.1 b/external/icu/icu4c-solarisgcc.patch.1 new file mode 100644 index 0000000000..6000ed0cb9 --- /dev/null +++ b/external/icu/icu4c-solarisgcc.patch.1 @@ -0,0 +1,12 @@ +diff -ur icu.org/source/common/uposixdefs.h icu/source/common/uposixdefs.h +--- icu.org/source/common/uposixdefs.h 2017-03-09 03:12:45.000000000 +0100 ++++ icu/source/common/uposixdefs.h 2017-04-21 22:23:11.857926971 +0200 +@@ -54,7 +54,7 @@ + * + * z/OS needs this definition for timeval and to get usleep. + */ +-#if !defined(_XOPEN_SOURCE_EXTENDED) && defined(__TOS_MVS__) ++#if !defined(_XOPEN_SOURCE_EXTENDED) && (defined(__TOS_MVS__) || defined(__IBMC__) || defined(__IBMCPP__)) + # define _XOPEN_SOURCE_EXTENDED 1 + #endif + diff --git a/external/icu/icu4c-ubsan.patch.1 b/external/icu/icu4c-ubsan.patch.1 new file mode 100644 index 0000000000..7b0c2efc92 --- /dev/null +++ b/external/icu/icu4c-ubsan.patch.1 @@ -0,0 +1,14 @@ +diff -ur icu.org/source/common/ubidiimp.h icu/source/common/ubidiimp.h +--- icu.org/source/common/ubidiimp.h 2019-10-03 13:16:41.000000000 +0200 ++++ icu/source/common/ubidiimp.h 2019-10-28 19:08:13.533284618 +0100 +@@ -198,8 +198,8 @@ + /* in a Run, logicalStart will get this bit set if the run level is odd */ + #define INDEX_ODD_BIT (1UL<<31) + +-#define MAKE_INDEX_ODD_PAIR(index, level) ((index)|((int32_t)((level)&1)<<31)) +-#define ADD_ODD_BIT_FROM_LEVEL(x, level) ((x)|=((int32_t)((level)&1)<<31)) ++#define MAKE_INDEX_ODD_PAIR(index, level) ((index)|((uint32_t)((level)&1)<<31)) ++#define ADD_ODD_BIT_FROM_LEVEL(x, level) ((x)|=((uint32_t)((level)&1)<<31)) + #define REMOVE_ODD_BIT(x) ((x)&=~INDEX_ODD_BIT) + + #define GET_INDEX(x) ((x)&~INDEX_ODD_BIT) diff --git a/external/icu/icu4c-use-pkgdata-single-ccode-file-mode.patch.1 b/external/icu/icu4c-use-pkgdata-single-ccode-file-mode.patch.1 new file mode 100644 index 0000000000..237e554b8a --- /dev/null +++ b/external/icu/icu4c-use-pkgdata-single-ccode-file-mode.patch.1 @@ -0,0 +1,12 @@ +--- icu/source/tools/toolutil/pkg_genc.h.orig 2022-01-11 06:02:29.694678787 +0100 ++++ icu/source/tools/toolutil/pkg_genc.h 2022-01-11 06:02:41.602640965 +0100 +@@ -48,9 +48,7 @@ + * the data to generate the final data library. This can + * increase the performance of the pkdata tool. + */ +-#if U_PLATFORM == U_PF_OS400 + #define USE_SINGLE_CCODE_FILE +-#endif + + /* Need to fix the file seperator character when using MinGW. */ + #if defined(WINDOWS_WITH_GNUC) || defined(USING_CYGWIN) diff --git a/external/icu/icu4c-warnings.patch.1 b/external/icu/icu4c-warnings.patch.1 new file mode 100644 index 0000000000..d8df0e14e9 --- /dev/null +++ b/external/icu/icu4c-warnings.patch.1 @@ -0,0 +1,11 @@ +diff -ur icu.org/source/common/unicode/utf16.h icu/source/common/unicode/utf16.h +--- icu.org/source/common/unicode/utf16.h 2020-10-28 22:21:12.000000000 +0100 ++++ icu/source/common/unicode/utf16.h 2020-11-16 19:31:03.356478154 +0100 +@@ -398,6 +398,7 @@ + (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ + } else /* c>0x10ffff or not enough space */ { \ + (isError)=true; \ ++ (void)(isError); \ + } \ + } UPRV_BLOCK_MACRO_END + diff --git a/external/icu/icu4c-windows-cygwin-cross.patch.1 b/external/icu/icu4c-windows-cygwin-cross.patch.1 new file mode 100644 index 0000000000..dd6b47c172 --- /dev/null +++ b/external/icu/icu4c-windows-cygwin-cross.patch.1 @@ -0,0 +1,131 @@ +diff -ur icu.org/source/acinclude.m4 icu/source/acinclude.m4 +--- icu.org/source/acinclude.m4 2020-04-10 16:22:16.000000000 +0200 ++++ icu/source/acinclude.m4 2020-04-21 22:14:09.940217733 +0200 +@@ -52,6 +52,12 @@ + else + icu_cv_host_frag=mh-cygwin-msvc + fi ;; ++aarch64-*-cygwin) ++ if test "$GCC" = yes; then ++ icu_cv_host_frag=mh-cygwin64 ++ else ++ icu_cv_host_frag=mh-cygwin-msvc ++ fi ;; + *-*-mingw*) + if test "$GCC" = yes; then + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ +--- icu/source/configure.ac.orig 2020-04-22 22:04:20.000000000 +0200 ++++ icu/source/configure.ac 2020-10-01 09:39:05.570900400 +0200 +@@ -213,23 +213,33 @@ + [cross_buildroot="${withval}"], + [cross_buildroot=""]) + ++cross_mixed_buildroot="$cross_buildroot" ++cross_unix_buildroot="$cross_buildroot" + if test "X$cross_buildroot" = "X"; then + if test "$cross_compiling" = "yes"; then + AC_MSG_ERROR([Error! Cross compiling but no --with-cross-build option specified - please supply the path to an executable ICU's build root]) + dnl ' + fi + else +- if test -f "${cross_buildroot}/config/icucross.mk"; then ++ case "${host}" in ++ *-*-cygwin*) ++ #M# -m isn't used because it doesn't work on Win98 ++ cross_mixed_buildroot=$(cygpath -ad "$cross_buildroot" | tr '\\' '/') ++ cross_unix_buildroot=$(cygpath -au "$cross_buildroot") ++ ;; ++ esac ++ if test -f "${cross_mixed_buildroot}/config/icucross.mk"; then + AC_MSG_RESULT([Using cross buildroot: $cross_buildroot]) + else +- if test -d "${cross_buildroot}"; then +- AC_MSG_ERROR([${cross_buildroot}/config/icucross.mk not found. Please build ICU in ${cross_buildroot} first.]) ++ if test -d "${cross_mixed_buildroot}"; then ++ AC_MSG_ERROR([${cross_mixed_buildroot}/config/icucross.mk not found. Please build ICU in ${cross_mixed_buildroot} first.]) + else +- AC_MSG_ERROR([No such directory ${cross_buildroot} supplied as the argument to --with-cross-build. Use an absolute path.]) ++ AC_MSG_ERROR([No such directory ${cross_mixed_buildroot} supplied as the argument to --with-cross-build. Use an absolute path.]) + fi + fi + fi +-AC_SUBST(cross_buildroot) ++AC_SUBST(cross_mixed_buildroot) ++AC_SUBST(cross_unix_buildroot) + + # Check for doxygen to generate documentation + AC_PATH_PROG(DOXYGEN,doxygen,,$PATH:/usr/local/bin:/usr/bin) +--- icu/source/test/testdata/Makefile.in.orig 2020-10-01 09:37:25.847888900 +0200 ++++ icu/source/test/testdata/Makefile.in 2020-10-01 09:36:41.859996500 +0200 +@@ -82,7 +82,7 @@ + # relative lib links from pkgdata are the same as for tmp + GENRBOPTS=-k + # use the cross root, in case we are cross compiling. Otherwise it is equal to top_builddir +-TOOLDIR=$(cross_buildroot)/tools ++TOOLDIR=$(cross_mixed_buildroot)/tools + SRCDATADIR=$(top_srcdir)/data + UNICODEDATADIR=$(SRCDATADIR)/unidata + OUTDIR=$(top_builddir)/data/out +--- icu/source/Makefile.in.orig 2020-04-22 22:04:20.000000000 +0200 ++++ icu/source/Makefile.in 2020-10-01 09:29:36.642364000 +0200 +@@ -255,16 +255,16 @@ + @(echo "CROSS_ICU_VERSION=$(VERSION)" ;\ + echo "TOOLEXEEXT=$(EXEEXT)" \ + ) > $@ +- @(echo 'TOOLBINDIR=$$(cross_buildroot)/bin' ;\ +- echo 'TOOLLIBDIR=$$(cross_buildroot)/lib' ;\ +- echo "INVOKE=$(LDLIBRARYPATH_ENVVAR)=$(LIBRARY_PATH_PREFIX)"'$$(TOOLLIBDIR):$$(cross_buildroot)/stubdata:$$(cross_buildroot)/tools/ctestfw:$$$$'"$(LDLIBRARYPATH_ENVVAR)" ;\ +- echo "PKGDATA_INVOKE=$(LDLIBRARYPATH_ENVVAR)=$(LIBRARY_PATH_PREFIX)"'$$(cross_buildroot)/stubdata:$$(cross_buildroot)/tools/ctestfw:$$(TOOLLIBDIR):$$$$'"$(LDLIBRARYPATH_ENVVAR) " ;\ ++ @(echo 'TOOLBINDIR=$$(cross_mixed_buildroot)/bin' ;\ ++ echo 'TOOLLIBDIR=$$(cross_mixed_buildroot)/lib' ;\ ++ echo "INVOKE=$(LDLIBRARYPATH_ENVVAR)=$(LIBRARY_PATH_PREFIX)"'$$(cross_unix_buildroot)/lib:$$(cross_unix_buildroot)/stubdata:$$(cross_unix_buildroot)/tools/ctestfw:$$$$'"$(LDLIBRARYPATH_ENVVAR)" ;\ ++ echo "PKGDATA_INVOKE=$(LDLIBRARYPATH_ENVVAR)=$(LIBRARY_PATH_PREFIX)"'$$(cross_unix_buildroot)/stubdata:$$(cross_unix_buildroot)/tools/ctestfw:$$(cross_unix_buildroot)/lib:$$$$'"$(LDLIBRARYPATH_ENVVAR) " ;\ + echo ) >> $@ + + config/icucross.inc: $(top_builddir)/icudefs.mk $(top_builddir)/Makefile @platform_make_fragment@ + @echo rebuilding $@ +- @(grep '^CURR_FULL_DIR' $(top_builddir)/icudefs.mk ; \ +- grep '^CURR_FULL_DIR' @platform_make_fragment@ || echo ""; \ ++ @(grep '^CURR_FULL_DIR' @platform_make_fragment@ || echo ""; \ ++ grep '^CURR_FULL_DIR' $(top_builddir)/icudefs.mk ; \ + ) > $@ + + config/icu.pc: $(srcdir)/config/icu.pc.in +--- icu/source/icudefs.mk.in.orig 2020-04-22 22:04:20.000000000 +0200 ++++ icu/source/icudefs.mk.in 2020-10-01 09:35:54.418128800 +0200 +@@ -35,7 +35,8 @@ + sysconfdir = @sysconfdir@ + # controls the include of $(top_builddir)/icucross.mk at bottom of file + cross_compiling = @cross_compiling@ +-cross_buildroot = @cross_buildroot@ ++cross_mixed_buildroot = @cross_mixed_buildroot@ ++cross_unix_buildroot = @cross_unix_buildroot@ + + # Package information + +@@ -303,8 +304,8 @@ + INSTALLED_INVOKE = $(LDLIBRARYPATH_ENVVAR)=$(libdir):$$$(LDLIBRARYPATH_ENVVAR) + + # Current full path directory for cross compilation +-ifneq ($(strip $(cross_buildroot)),) +-include $(cross_buildroot)/config/icucross.inc ++ifneq ($(strip $(cross_mixed_buildroot)),) ++include $(cross_mixed_buildroot)/config/icucross.inc + endif + + # Platform-specific setup +@@ -323,10 +324,11 @@ + + # some imported things from the cross env + TOOLEXEEXT = $(EXEEXT) +-ifneq ($(strip $(cross_buildroot)),) +-include $(cross_buildroot)/config/icucross.mk ++ifneq ($(strip $(cross_mixed_buildroot)),) ++include $(cross_mixed_buildroot)/config/icucross.mk + else +-cross_buildroot = $(top_builddir) ++cross_mixed_buildroot = $(top_builddir) ++cross_unix_buildroot = $(top_builddir) + endif + + # for tests diff --git a/external/icu/khmerdict.dict b/external/icu/khmerdict.dict Binary files differnew file mode 100644 index 0000000000..52605b6546 --- /dev/null +++ b/external/icu/khmerdict.dict |