diff options
Diffstat (limited to 'external/icu')
27 files changed, 1915 insertions, 0 deletions
diff --git a/external/icu/ExternalPackage_icu.mk b/external/icu/ExternalPackage_icu.mk new file mode 100644 index 000000000..dcd4da216 --- /dev/null +++ b/external/icu/ExternalPackage_icu.mk @@ -0,0 +1,42 @@ +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# + +icu_VERSION := $(ICU_MAJOR).$(ICU_MINOR)$(if $(ICU_MICRO),.$(ICU_MICRO)) + +$(eval $(call gb_ExternalPackage_ExternalPackage,icu,icu)) + +$(eval $(call gb_ExternalPackage_use_external_project,icu,icu)) + +ifneq ($(DISABLE_DYNLOADING),TRUE) +ifeq ($(OS),WNT) + +ifeq ($(COM),GCC) +$(eval $(call gb_ExternalPackage_add_files,icu,$(LIBO_LIB_FOLDER),\ + source/lib/icuin$(ICU_MAJOR).dll \ +)) +else +$(eval $(call gb_ExternalPackage_add_files,icu,$(LIBO_LIB_FOLDER),\ + source/lib/icuin$(if $(MSVC_USE_DEBUG_RUNTIME),d)$(ICU_MAJOR).dll \ +)) +endif # $(COM) + +else ifeq ($(OS),ANDROID) + +$(eval $(call gb_ExternalPackage_add_files,icu,$(LIBO_LIB_FOLDER),\ + source/lib/libicui18nlo.so \ +)) + +else # $(OS) != WNT/ANDROID + +$(eval $(call gb_ExternalPackage_add_file,icu,$(LIBO_LIB_FOLDER)/libicui18n$(gb_Library_DLLEXT).$(ICU_MAJOR),source/lib/libicui18n$(gb_Library_DLLEXT).$(icu_VERSION))) + +endif # $(OS) +endif # DISABLE_DYNLOADING + +# vim: set noet sw=4 ts=4: diff --git a/external/icu/ExternalPackage_icu_ure.mk b/external/icu/ExternalPackage_icu_ure.mk new file mode 100644 index 000000000..fefe71afd --- /dev/null +++ b/external/icu/ExternalPackage_icu_ure.mk @@ -0,0 +1,48 @@ +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# + +# libxml2 is in URE and depends on icuuc*.dll on Windows; the i18nlangtag lib is +# in URE and depends on the icuuc lib (which in turn depends on the icudata lib) +# on all platforms: + +$(eval $(call gb_ExternalPackage_ExternalPackage,icu_ure,icu)) + +$(eval $(call gb_ExternalPackage_use_external_project,icu_ure,icu)) + +ifneq ($(DISABLE_DYNLOADING),TRUE) +ifeq ($(OS),WNT) + +ifeq ($(COM),GCC) +$(eval $(call gb_ExternalPackage_add_files,icu_ure,$(LIBO_URE_LIB_FOLDER),\ + source/lib/icudt$(ICU_MAJOR).dll \ + source/lib/icuuc$(ICU_MAJOR).dll \ +)) +else +$(eval $(call gb_ExternalPackage_add_files,icu_ure,$(LIBO_URE_LIB_FOLDER),\ + source/lib/icudt$(if $(MSVC_USE_DEBUG_RUNTIME),d)$(ICU_MAJOR).dll \ + source/lib/icuuc$(if $(MSVC_USE_DEBUG_RUNTIME),d)$(ICU_MAJOR).dll \ +)) +endif # $(COM) + +else ifeq ($(OS),ANDROID) + +$(eval $(call gb_ExternalPackage_add_files,icu_ure,$(LIBO_URE_LIB_FOLDER),\ + source/lib/libicudatalo.so \ + source/lib/libicuuclo.so \ +)) + +else # $(OS) != WNT/ANDROID + +$(eval $(call gb_ExternalPackage_add_file,icu_ure,$(LIBO_URE_LIB_FOLDER)/libicudata$(gb_Library_DLLEXT).$(ICU_MAJOR),source/lib/libicudata$(gb_Library_DLLEXT).$(icu_VERSION))) +$(eval $(call gb_ExternalPackage_add_file,icu_ure,$(LIBO_URE_LIB_FOLDER)/libicuuc$(gb_Library_DLLEXT).$(ICU_MAJOR),source/lib/libicuuc$(gb_Library_DLLEXT).$(icu_VERSION))) + +endif # $(OS) +endif # DISABLE_DYNLOADING + +# vim: set noet sw=4 ts=4: diff --git a/external/icu/ExternalProject_icu.mk b/external/icu/ExternalProject_icu.mk new file mode 100644 index 000000000..f62d8528c --- /dev/null +++ b/external/icu/ExternalProject_icu.mk @@ -0,0 +1,94 @@ +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# + +$(eval $(call gb_ExternalProject_ExternalProject,icu)) + +$(eval $(call gb_ExternalProject_register_targets,icu,\ + build \ +)) + +icu_CPPFLAGS:="-DHAVE_GCC_ATOMICS=$(if $(filter TRUE,$(GCC_HAVE_BUILTIN_ATOMIC)),1,0)" + +ifeq ($(OS),WNT) + +# Note: runConfigureICU ignores everything following the platform name! +$(call gb_ExternalProject_get_state_target,icu,build) : + $(call gb_Trace_StartRange,icu,EXTERNAL) + $(call gb_ExternalProject_run,build,\ + export LIB="$(ILIB)" \ + && CFLAGS="-FS $(SOLARINC) $(gb_DEBUGINFO_FLAGS)" CPPFLAGS="$(SOLARINC)" CXXFLAGS="-FS $(SOLARINC) $(gb_DEBUGINFO_FLAGS)" \ + INSTALL=`cygpath -m /usr/bin/install` \ + ./runConfigureICU \ + $(if $(MSVC_USE_DEBUG_RUNTIME),--enable-debug --disable-release) \ + Cygwin/MSVC --disable-extras \ + && $(MAKE) \ + ,source) + $(call gb_Trace_EndRange,icu,EXTERNAL) + +else # $(OS) + +icu_CFLAGS:=" \ + $(if $(filter iOS,$(OS)),-DUCONFIG_NO_FILE_IO) \ + $(if $(SYSBASE),-I$(SYSBASE)/usr/include) \ + $(if $(ENABLE_OPTIMIZED),$(gb_COMPILEROPTFLAGS),$(gb_COMPILERNOOPTFLAGS)) \ + $(if $(ENABLE_LTO),$(gb_LTOFLAGS)) \ + $(if $(filter GCC,$(COM)),-fno-strict-aliasing) \ + $(if $(call gb_Module__symbols_enabled,icu),$(gb_DEBUGINFO_FLAGS)) \ + $(if $(filter FUZZERS,$(BUILD_TYPE)),-DU_USE_STRTOD_L=0) \ + $(if $(filter ANDROID,$(OS)),-fvisibility=hidden -fno-omit-frame-pointer)" +icu_CXXFLAGS:="$(CXXFLAGS) $(CXXFLAGS_CXX11) \ + $(if $(filter iOS,$(OS)),-DUCONFIG_NO_FILE_IO) \ + $(if $(ENABLE_OPTIMIZED),$(gb_COMPILEROPTFLAGS),$(gb_COMPILERNOOPTFLAGS)) \ + $(if $(ENABLE_LTO),$(gb_LTOFLAGS)) \ + $(if $(filter GCC,$(COM)),-fno-strict-aliasing) \ + $(if $(call gb_Module__symbols_enabled,icu),$(gb_DEBUGINFO_FLAGS)) \ + $(if $(filter FUZZERS,$(BUILD_TYPE)),-DU_USE_STRTOD_L=0) \ + $(if $(filter ANDROID,$(OS)),-fvisibility=hidden -fno-omit-frame-pointer $(SOLARINC))" +icu_LDFLAGS:=" \ + $(if $(ENABLE_LTO),$(gb_LTOFLAGS)) \ + $(if $(filter TRUE,$(HAVE_LD_HASH_STYLE)),-Wl$(COMMA)--hash-style=$(WITH_LINKER_HASH_STYLE)) \ + $(if $(SYSBASE),-L../lib -L../../lib -L../stubdata -L../../stubdata -L$(SYSBASE)/usr/lib) \ + $(if $(filter TRUE,$(HAVE_LD_BSYMBOLIC_FUNCTIONS)), -Wl$(COMMA)-Bsymbolic-functions) \ + $(if $(filter ANDROID,$(OS)),$(gb_STDLIBS))" + +# DATASUBDIR=data in cross-compiling case, because --disable-tools completely skips the +# data directory/doesn't build the requested library in that case (icu/source/Makefile.in) +# so we need to add it back to the list of subdirectories to build +$(call gb_ExternalProject_get_state_target,icu,build) : + $(call gb_Trace_StartRange,icu,EXTERNAL) + $(call gb_ExternalProject_run,build,\ + CPPFLAGS=$(icu_CPPFLAGS) CFLAGS=$(icu_CFLAGS) \ + CXXFLAGS=$(icu_CXXFLAGS) LDFLAGS=$(icu_LDFLAGS) \ + ./configure \ + --disable-layout --disable-samples \ + $(if $(filter FUZZERS,$(BUILD_TYPE)),--disable-release) \ + $(if $(CROSS_COMPILING),--disable-tools --disable-extras) \ + $(if $(filter iOS ANDROID,$(OS)),--disable-dyload) \ + $(if $(filter ANDROID,$(OS)),--disable-strict ac_cv_c_bigendian=no) \ + $(if $(filter SOLARIS AIX,$(OS)),--disable-64bit-libs) \ + $(if $(filter TRUE,$(DISABLE_DYNLOADING)),\ + --with-data-packaging=static --enable-static --disable-shared --disable-dyload,\ + --disable-static --enable-shared $(if $(filter ANDROID,$(OS)),--with-library-suffix=lo)) \ + $(if $(CROSS_COMPILING),--build=$(BUILD_PLATFORM) --host=$(HOST_PLATFORM)\ + --with-cross-build=$(WORKDIR_FOR_BUILD)/UnpackedTarball/icu/source) \ + && $(MAKE) $(if $(CROSS_COMPILING),DATASUBDIR=data) \ + $(if $(filter MACOSX,$(OS)), \ + && $(PERL) $(SRCDIR)/solenv/bin/macosx-change-install-names.pl shl \ + URELIB \ + $(EXTERNAL_WORKDIR)/source/lib/libicuuc$(gb_Library_DLLEXT).$(icu_VERSION) \ + $(EXTERNAL_WORKDIR)/source/lib/libicui18n$(gb_Library_DLLEXT).$(icu_VERSION) \ + && $(PERL) $(SRCDIR)/solenv/bin/macosx-change-install-names.pl shl \ + OOO \ + $(EXTERNAL_WORKDIR)/source/lib/libicudata$(gb_Library_DLLEXT).$(icu_VERSION)) \ + ,source) + $(call gb_Trace_EndRange,icu,EXTERNAL) + +endif + +# vim: set noet sw=4 ts=4: diff --git a/external/icu/Makefile b/external/icu/Makefile new file mode 100644 index 000000000..e4968cf85 --- /dev/null +++ b/external/icu/Makefile @@ -0,0 +1,7 @@ +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- + +module_directory:=$(dir $(realpath $(firstword $(MAKEFILE_LIST)))) + +include $(module_directory)/../../solenv/gbuild/partial_build.mk + +# vim: set noet sw=4 ts=4: diff --git a/external/icu/Module_icu.mk b/external/icu/Module_icu.mk new file mode 100644 index 000000000..5c99b930f --- /dev/null +++ b/external/icu/Module_icu.mk @@ -0,0 +1,19 @@ +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# + +$(eval $(call gb_Module_Module,icu)) + +$(eval $(call gb_Module_add_targets,icu,\ + UnpackedTarball_icu \ + ExternalPackage_icu \ + ExternalPackage_icu_ure \ + ExternalProject_icu \ +)) + +# vim: set noet sw=4 ts=4: diff --git a/external/icu/README b/external/icu/README new file mode 100644 index 000000000..484de1f41 --- /dev/null +++ b/external/icu/README @@ -0,0 +1 @@ +Library providing Unicode support, from [http://site.icu-project.org/]. diff --git a/external/icu/UnpackedTarball_icu.mk b/external/icu/UnpackedTarball_icu.mk new file mode 100644 index 000000000..72fae09b1 --- /dev/null +++ b/external/icu/UnpackedTarball_icu.mk @@ -0,0 +1,46 @@ +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*- +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# + +$(eval $(call gb_UnpackedTarball_UnpackedTarball,icu)) + +$(eval $(call gb_UnpackedTarball_set_tarball,icu,$(ICU_TARBALL))) + +$(eval $(call gb_UnpackedTarball_update_autoconf_configs,icu,source)) + +# Data zip contains data/... and needs to end up in icu/source/data/... +# Only data/misc/icudata.rc is needed for a Cygwin/MSVC build. +$(eval $(call gb_UnpackedTarball_set_pre_action,icu,\ + unzip -q -d source -o $(gb_UnpackedTarget_TARFILE_LOCATION)/$(ICU_DATA_TARBALL) data/misc/icudata.rc \ +)) + +$(eval $(call gb_UnpackedTarball_set_patchlevel,icu,0)) + +$(eval $(call gb_UnpackedTarball_add_patches,icu,\ + external/icu/icu4c-build.patch.1 \ + external/icu/icu4c-aix.patch.1 \ + external/icu/icu4c-warnings.patch.1 \ + external/icu/icu4c-macosx.patch.1 \ + external/icu/icu4c-solarisgcc.patch.1 \ + external/icu/icu4c-mkdir.patch.1 \ + external/icu/icu4c-$(if $(filter ANDROID,$(OS)),android,rpath).patch.1 \ + external/icu/icu4c-ubsan.patch.1 \ + external/icu/icu4c-scriptrun.patch.1 \ + external/icu/icu4c-rtti.patch.1 \ + external/icu/icu4c-clang-cl.patch.1 \ + $(if $(filter-out ANDROID,$(OS)),external/icu/icu4c-icudata-stdlibs.patch.1) \ + external/icu/gcc9.patch \ + external/icu/c++20-comparison.patch \ + external/icu/ubsan.patch \ + external/icu/Wdeprecated-copy-dtor.patch \ + external/icu/icu4c-khmerbreakengine.patch.1 \ +)) + +$(eval $(call gb_UnpackedTarball_add_file,icu,source/data/brkitr/khmerdict.dict,external/icu/khmerdict.dict)) + +# vim: set noet sw=4 ts=4: diff --git a/external/icu/Wdeprecated-copy-dtor.patch b/external/icu/Wdeprecated-copy-dtor.patch new file mode 100644 index 000000000..67078ef1b --- /dev/null +++ b/external/icu/Wdeprecated-copy-dtor.patch @@ -0,0 +1,25 @@ +--- source/common/unicode/uobject.h ++++ source/common/unicode/uobject.h +@@ -245,10 +245,10 @@ + // direct use of UObject itself + + // default constructor +- // inline UObject() {} ++ UObject() = default; + + // copy constructor +- // inline UObject(const UObject &other) {} ++ UObject(const UObject &other) = default; + + #if 0 + // TODO Sometime in the future. Implement operator==(). +@@ -280,8 +280,8 @@ + * Subclasses need this assignment operator if they use compiler-provided + * assignment operators of their own. An alternative to not declaring one + * here would be to declare and empty-implement a protected or public one. +- UObject &UObject::operator=(const UObject &); + */ ++ UObject &operator=(const UObject &) = default; + }; + + #ifndef U_HIDE_INTERNAL_API diff --git a/external/icu/c++20-comparison.patch b/external/icu/c++20-comparison.patch new file mode 100644 index 000000000..44053e671 --- /dev/null +++ b/external/icu/c++20-comparison.patch @@ -0,0 +1,171 @@ +--- source/common/uvector.cpp ++++ source/common/uvector.cpp +@@ -110,7 +110,7 @@ + } + + // This only does something sensible if this object has a non-null comparer +-UBool UVector::operator==(const UVector& other) { ++UBool UVector::operator==(const UVector& other) const { + int32_t i; + if (count != other.count) return FALSE; + if (comparer != NULL) { +--- source/common/uvector.h ++++ source/common/uvector.h +@@ -113,12 +113,12 @@ + * equal if they are of the same size and all elements are equal, + * as compared using this object's comparer. + */ +- UBool operator==(const UVector& other); ++ UBool operator==(const UVector& other) const; + + /** + * Equivalent to !operator==() + */ +- inline UBool operator!=(const UVector& other); ++ inline UBool operator!=(const UVector& other) const; + + //------------------------------------------------------------ + // java.util.Vector API +@@ -382,7 +382,7 @@ + return elementAt(index); + } + +-inline UBool UVector::operator!=(const UVector& other) { ++inline UBool UVector::operator!=(const UVector& other) const { + return !operator==(other); + } + +--- source/i18n/tzrule.cpp ++++ source/i18n/tzrule.cpp +@@ -53,7 +53,7 @@ + return *this; + } + +-UBool ++bool + TimeZoneRule::operator==(const TimeZoneRule& that) const { + return ((this == &that) || + (typeid(*this) == typeid(that) && +@@ -120,7 +120,7 @@ + return *this; + } + +-UBool ++bool + InitialTimeZoneRule::operator==(const TimeZoneRule& that) const { + return ((this == &that) || + (typeid(*this) == typeid(that) && +@@ -226,7 +226,7 @@ + return *this; + } + +-UBool ++bool + AnnualTimeZoneRule::operator==(const TimeZoneRule& that) const { + if (this == &that) { + return TRUE; +@@ -445,7 +445,7 @@ + return *this; + } + +-UBool ++bool + TimeArrayTimeZoneRule::operator==(const TimeZoneRule& that) const { + if (this == &that) { + return TRUE; +--- source/i18n/unicode/rbtz.h ++++ source/i18n/unicode/rbtz.h +@@ -85,6 +85,7 @@ + * @stable ICU 3.8 + */ + virtual UBool operator!=(const TimeZone& that) const; ++ UBool operator!=(const RuleBasedTimeZone& that) const {return !operator==(that);} + + /** + * Adds the <code>TimeZoneRule</code> which represents time transitions. +--- source/i18n/unicode/simpletz.h ++++ source/i18n/unicode/simpletz.h +@@ -110,6 +110,7 @@ + * @stable ICU 2.0 + */ + virtual UBool operator==(const TimeZone& that) const; ++ UBool operator!=(const SimpleTimeZone& that) const {return !operator==(that);} + + /** + * Constructs a SimpleTimeZone with the given raw GMT offset and time zone ID, +--- source/i18n/unicode/smpdtfmt.h ++++ source/i18n/unicode/smpdtfmt.h +@@ -874,6 +874,7 @@ + * @stable ICU 2.0 + */ + virtual UBool operator==(const Format& other) const; ++ UBool operator!=(const SimpleDateFormat& that) const {return !operator==(that);} + + + using DateFormat::format; +--- source/i18n/unicode/stsearch.h ++++ source/i18n/unicode/stsearch.h +@@ -297,6 +297,7 @@ + * @stable ICU 2.0 + */ + virtual UBool operator==(const SearchIterator &that) const; ++ UBool operator!=(const StringSearch &that) const {return !operator==(that);} + + // public get and set methods ---------------------------------------- + +--- source/i18n/unicode/tzrule.h ++++ source/i18n/unicode/tzrule.h +@@ -54,7 +54,7 @@ + * @return true if the given <code>TimeZoneRule</code> objects are semantically equal. + * @stable ICU 3.8 + */ +- virtual UBool operator==(const TimeZoneRule& that) const; ++ virtual bool operator==(const TimeZoneRule& that) const; + + /** + * Return true if the given <code>TimeZoneRule</code> objects are semantically unequal. Objects +@@ -245,7 +245,7 @@ + * @return true if the given <code>TimeZoneRule</code> objects are semantically equal. + * @stable ICU 3.8 + */ +- virtual UBool operator==(const TimeZoneRule& that) const; ++ virtual bool operator==(const TimeZoneRule& that) const; + + /** + * Return true if the given <code>TimeZoneRule</code> objects are semantically unequal. Objects +@@ -255,6 +255,7 @@ + * @stable ICU 3.8 + */ + virtual UBool operator!=(const TimeZoneRule& that) const; ++ UBool operator!=(const InitialTimeZoneRule& that) const {return !operator==(that);} + + /** + * Gets the time when this rule takes effect in the given year. +@@ -456,7 +457,7 @@ + * @return true if the given <code>TimeZoneRule</code> objects are semantically equal. + * @stable ICU 3.8 + */ +- virtual UBool operator==(const TimeZoneRule& that) const; ++ virtual bool operator==(const TimeZoneRule& that) const; + + /** + * Return true if the given <code>TimeZoneRule</code> objects are semantically unequal. Objects +@@ -672,7 +673,7 @@ + * @return true if the given <code>TimeZoneRule</code> objects are semantically equal. + * @stable ICU 3.8 + */ +- virtual UBool operator==(const TimeZoneRule& that) const; ++ virtual bool operator==(const TimeZoneRule& that) const; + + /** + * Return true if the given <code>TimeZoneRule</code> objects are semantically unequal. Objects +--- source/i18n/unicode/vtzone.h ++++ source/i18n/unicode/vtzone.h +@@ -81,6 +81,7 @@ + * @stable ICU 3.8 + */ + virtual UBool operator!=(const TimeZone& that) const; ++ UBool operator!=(const VTimeZone& that) const {return !operator==(that);} + + /** + * Create a <code>VTimeZone</code> instance by the time zone ID. diff --git a/external/icu/cross-bin/icu-config b/external/icu/cross-bin/icu-config new file mode 100755 index 000000000..8ccf94f9b --- /dev/null +++ b/external/icu/cross-bin/icu-config @@ -0,0 +1,12 @@ +#!/bin/sh + +case $1 in +--version) + echo whatever + ;; +--cppflags) + echo ${ICU_CFLAGS} + ;; +--ldflags-searchpath) + echo ${ICU_LIBS} +esac diff --git a/external/icu/gcc9.patch b/external/icu/gcc9.patch new file mode 100644 index 000000000..5c9808f8c --- /dev/null +++ b/external/icu/gcc9.patch @@ -0,0 +1,26 @@ +--- source/i18n/unicode/format.h ++++ source/i18n/unicode/format.h +@@ -22,6 +22,13 @@ + + #ifndef FORMAT_H + #define FORMAT_H ++ ++#ifdef __GNUC__ ++#pragma GCC diagnostic push ++#pragma GCC diagnostic ignored "-Wpragmas" // for old GCC ++#pragma GCC diagnostic ignored "-Wunknown-warning-option" // for Clang ++#pragma GCC diagnostic ignored "-Wdeprecated-copy" ++#endif + + + #include "unicode/utypes.h" +@@ -314,5 +314,9 @@ + + #endif /* U_SHOW_CPLUSPLUS_API */ + ++#ifdef __GNUC__ ++#pragma GCC diagnostic pop ++#endif ++ + #endif // _FORMAT + //eof diff --git a/external/icu/icu4c-aix.patch.1 b/external/icu/icu4c-aix.patch.1 new file mode 100644 index 000000000..77982163b --- /dev/null +++ b/external/icu/icu4c-aix.patch.1 @@ -0,0 +1,143 @@ +diff -ur icu.org/source/config/mh-aix-gcc icu/source/config/mh-aix-gcc +--- icu.org/source/config/mh-aix-gcc 2016-06-15 20:58:17.000000000 +0200 ++++ icu/source/config/mh-aix-gcc 2017-04-21 21:58:49.731432198 +0200 +@@ -18,84 +18,29 @@ + GEN_DEPS.c= $(CC) -E -MM $(DEFS) $(CPPFLAGS) + GEN_DEPS.cc= $(CXX) -E -MM $(DEFS) $(CPPFLAGS) + +-## Commands to link +-## We need to use the C++ linker, even when linking C programs, since +-## our libraries contain C++ code (C++ static init not called) +-LINK.c= $(AIX_PREDELETE) $(CXX) $(CXXFLAGS) $(LDFLAGS) +-LINK.cc= $(AIX_PREDELETE) $(CXX) $(CXXFLAGS) $(LDFLAGS) +- +-## Shared library options +-LD_SOOPTIONS= -Wl,-bsymbolic +- +-## Commands to make a shared library +-SHLIB.c= $(AIX_PREDELETE) $(CC) $(CFLAGS) $(LDFLAGS) -shared -Wl,-bexpall $(LD_SOOPTIONS) +-SHLIB.cc= $(AIX_PREDELETE) $(CXX) $(CXXFLAGS) $(LDFLAGS) -shared -Wl,-bexpall $(LD_SOOPTIONS) +- +-## Compiler switch to embed a runtime search path +-LD_RPATH= -I +-LD_RPATH_PRE= ++## Flags for position independent code ++SHAREDLIBCFLAGS = -fPIC ++SHAREDLIBCXXFLAGS = -fPIC ++SHAREDLIBCPPFLAGS = -DPIC ++ ++## Additional flags when building libraries and with threads ++THREADSCPPFLAGS = -D_REENTRANT -D_THREAD_SAFE ++LIBCPPFLAGS = + +-## enable the shared lib loader +-LDFLAGS += -Wl,-bbigtoc ++LD_RPATH= ++LD_RPATH_PRE= + + ## These are the library specific LDFLAGS + LDFLAGSICUDT=-nodefaultlibs -nostdlib + +-## We need to delete things prior to linking, or else we'll get +-## SEVERE ERROR: output file in use .. on AIX. +-## But, shell script version should NOT delete target as we don't +-## have $@ in that context. (SH = only shell script, icu-config) +-AIX_PREDELETE=rm -f $@ ; +-#SH# AIX_PREDELETE= +- + ## Environment variable to set a runtime search path + LDLIBRARYPATH_ENVVAR = LIBPATH + +-## Override Versioned target for a shared library. +-FINAL_SO_TARGET= $(basename $(SO_TARGET))$(SO_TARGET_VERSION).$(SO) +-MIDDLE_SO_TARGET= $(basename $(SO_TARGET))$(SO_TARGET_VERSION_MAJOR).$(SO) +-SHARED_OBJECT = $(notdir $(FINAL_SO_TARGET:.$(SO)=.$(SOBJ))) +-SHARED_OBJECT_NO_VERSION = $(basename $(SO_TARGET)).$(SOBJ) +- +-# The following is for Makefile.inc's use. +-ICULIBSUFFIX_VERSION = $(LIB_VERSION_MAJOR) +- +-# this one is for icudefs.mk's use +-ifeq ($(ENABLE_SHARED),YES) +-SO_TARGET_VERSION_SUFFIX = $(SO_TARGET_VERSION_MAJOR) +-endif +- +-## Compiler switch to embed a library name. Not present on AIX. +-LD_SONAME = +- +-## The type of assembly needed when pkgdata is used for generating shared libraries. +-GENCCODE_ASSEMBLY=-a xlc +- + ## Shared object suffix +-SOBJ= so +-# without the -brtl option, the library names use .a. AIX is funny that way. +-SO= a +-A= a ++SO= so + + ## Non-shared intermediate object suffix +-STATIC_O = o +- +-## Special AIX rules +- +-## Build archive from shared object +-%.a : %.so +- ln -f $< $(SHARED_OBJECT_NO_VERSION) +- $(AR) $(ARFLAGS) $@ $(SHARED_OBJECT_NO_VERSION) +- rm -f $(SHARED_OBJECT_NO_VERSION) +-$(LIBDIR)/%.a : %.so +- ln -f $< $(SHARED_OBJECT_NO_VERSION) +- $(AR) $(ARFLAGS) $@ $(SHARED_OBJECT_NO_VERSION) +- rm -f $(SHARED_OBJECT_NO_VERSION) +- +-## Build import list from export list +-%.e : %.exp +- @echo "Building an import list for $<" +- @$(SHELL) -ec "echo '#! $*.a($*.so)' | cat - $< > $@" ++STATIC_O = ao + + ## Compilation rules + %.$(STATIC_O): $(srcdir)/%.c +@@ -123,10 +68,10 @@ + [ -s $@ ] || rm -f $@' + + ## Versioned libraries rules +-%$(SO_TARGET_VERSION_MAJOR).$(SO): %$(SO_TARGET_VERSION).$(SO) +- $(RM) $@ && ln -s ${*F}$(SO_TARGET_VERSION).$(SO) $@ +-%.$(SO): %$(SO_TARGET_VERSION).$(SO) +- $(RM) $@ && ln -s ${*F}$(SO_TARGET_VERSION).$(SO) $@ ++%.$(SO).$(SO_TARGET_VERSION_MAJOR): %.$(SO).$(SO_TARGET_VERSION) ++ $(RM) $@ && ln -s ${<F} $@ ++%.$(SO): %.$(SO).$(SO_TARGET_VERSION_MAJOR) ++ $(RM) $@ && ln -s ${*F}.$(SO).$(SO_TARGET_VERSION) $@ + + + ## BIR - bind with internal references [so app data and icu data doesn't collide] +diff -ur icu.org/source/tools/pkgdata/pkgdata.cpp icu/source/tools/pkgdata/pkgdata.cpp +--- icu.org/source/tools/pkgdata/pkgdata.cpp 2017-03-21 02:03:49.000000000 +0100 ++++ icu/source/tools/pkgdata/pkgdata.cpp 2017-04-21 21:58:49.732432195 +0200 +@@ -934,7 +934,7 @@ + + uprv_strcat(pkgDataFlags[SO_EXT], "."); + uprv_strcat(pkgDataFlags[SO_EXT], pkgDataFlags[A_EXT]); +-#elif U_PLATFORM == U_PF_OS400 || defined(_AIX) ++#elif U_PLATFORM == U_PF_OS400 + sprintf(libFileNames[LIB_FILE_VERSION_TMP], "%s%s%s", + libFileNames[LIB_FILE], + FILE_EXTENSION_SEP, +@@ -1407,15 +1407,6 @@ + pkgDataFlags[LDICUDTFLAGS], + targetDir, + libFileNames[LIB_FILE_CYGWIN_VERSION], +-#elif U_PLATFORM == U_PF_AIX +- sprintf(cmd, "%s %s%s;%s %s -o %s%s %s %s%s %s %s", +- RM_CMD, +- targetDir, +- libFileNames[LIB_FILE_VERSION_TMP], +- pkgDataFlags[GENLIB], +- pkgDataFlags[LDICUDTFLAGS], +- targetDir, +- libFileNames[LIB_FILE_VERSION_TMP], + #else + sprintf(cmd, "%s %s -o %s%s %s %s%s %s %s", + pkgDataFlags[GENLIB], diff --git a/external/icu/icu4c-android.patch.1 b/external/icu/icu4c-android.patch.1 new file mode 100644 index 000000000..602d225d7 --- /dev/null +++ b/external/icu/icu4c-android.patch.1 @@ -0,0 +1,75 @@ +diff -ur icu.org/source/common/unicode/platform.h icu/source/common/unicode/platform.h +--- icu.org/source/common/unicode/platform.h 2019-10-03 13:16:41.000000000 +0200 ++++ icu/source/common/unicode/platform.h 2019-10-29 22:58:26.881221287 +0100 +@@ -818,7 +818,7 @@ + UPRV_HAS_DECLSPEC_ATTRIBUTE(dllimport)) + # define U_EXPORT __declspec(dllexport) + #elif defined(__GNUC__) +-# define U_EXPORT __attribute__((visibility("default"))) ++# define U_EXPORT + #elif (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x550) \ + || (defined(__SUNPRO_C) && __SUNPRO_C >= 0x550) + # define U_EXPORT __global +diff -ur icu.org/source/config/mh-linux icu/source/config/mh-linux +--- icu.org/source/config/mh-linux 2018-09-29 02:34:41.000000000 +0200 ++++ icu/source/config/mh-linux 2018-10-20 00:33:36.558130876 +0200 +@@ -27,7 +27,7 @@ + + ## Compiler switch to embed a library name + # The initial tab in the next line is to prevent icu-config from reading it. +- LD_SONAME = -Wl,-soname -Wl,$(notdir $(MIDDLE_SO_TARGET)) ++ #LD_SONAME = -Wl,-soname -Wl,$(notdir $(MIDDLE_SO_TARGET)) + #SH# # We can't depend on MIDDLE_SO_TARGET being set. + #SH# LD_SONAME= + +diff -ur icu.org/source/configure icu/source/configure +--- icu.org/source/configure 2018-10-02 00:39:56.000000000 +0200 ++++ icu/source/configure 2018-10-20 00:33:36.559130874 +0200 +@@ -5207,7 +5207,7 @@ + else + icu_cv_host_frag=mh-linux-va + fi ;; +-*-*-linux*|*-*-gnu|*-*-k*bsd*-gnu|*-*-kopensolaris*-gnu) icu_cv_host_frag=mh-linux ;; ++*-*-linux*|*-*-gnu|*-*-k*bsd*-gnu|*-*-kopensolaris*-gnu|*-*-*-androideabi*) icu_cv_host_frag=mh-linux ;; + i[34567]86-*-cygwin) + if test "$GCC" = yes; then + icu_cv_host_frag=mh-cygwin +@@ -6400,6 +6400,10 @@ + # Check to see if genccode can generate simple assembly. + GENCCODE_ASSEMBLY= + case "${host}" in ++arm-*-linux-androideabi) ++ if test "$GCC" = yes; then ++ GENCCODE_ASSEMBLY="-a gcc-android-arm" ++ fi ;; + *-linux*|*-kfreebsd*-gnu*|i*86-*-*bsd*|i*86-pc-gnu) + if test "$GCC" = yes; then + # We're using gcc, and the simple -a gcc command line works for genccode +@@ -7499,6 +7503,10 @@ + # wchar_t can be used + CHECK_UTF16_STRING_RESULT="available" + ;; ++*-*-*-androideabi|mips-unknown-linux-android) ++ # no UTF-16 strings thanks, I think, this is to avoid the -std=c++0x which causes trouble with uint64_t ++ CHECK_UTF16_STRING_RESULT="nope" ++ ;; + *) + ;; + esac +diff -ur icu.org/source/i18n/decimfmt.cpp icu/source/i18n/decimfmt.cpp +--- icu.org/source/i18n/decimfmt.cpp 2018-10-02 00:39:56.000000000 +0200 ++++ icu/source/i18n/decimfmt.cpp 2018-10-20 00:33:36.560130873 +0200 +@@ -9,6 +9,13 @@ + // Helpful in toString methods and elsewhere. + #define UNISTR_FROM_STRING_EXPLICIT + ++#ifdef __ANDROID__ ++#ifndef ARM ++#define ARM ++#endif ++#include <android/compatibility.hxx> ++#endif ++ + #include <cmath> + #include <cstdlib> + #include <stdlib.h> diff --git a/external/icu/icu4c-build.patch.1 b/external/icu/icu4c-build.patch.1 new file mode 100644 index 000000000..a878de732 --- /dev/null +++ b/external/icu/icu4c-build.patch.1 @@ -0,0 +1,91 @@ +diff -ur icu.org/source/config/mh-darwin icu/source/config/mh-darwin +--- icu.org/source/config/mh-darwin 2016-06-15 20:58:17.000000000 +0200 ++++ icu/source/config/mh-darwin 2017-04-21 21:30:23.584568210 +0200 +@@ -30,11 +30,7 @@ + SHLIB.cc= $(CXX) -dynamiclib -dynamic $(CXXFLAGS) $(LDFLAGS) $(LD_SOOPTIONS) + + ## Compiler switches to embed a library name and version information +-ifeq ($(ENABLE_RPATH),YES) +-LD_SONAME = -Wl,-compatibility_version -Wl,$(SO_TARGET_VERSION_MAJOR) -Wl,-current_version -Wl,$(SO_TARGET_VERSION) -install_name $(libdir)/$(notdir $(MIDDLE_SO_TARGET)) +-else +-LD_SONAME = -Wl,-compatibility_version -Wl,$(SO_TARGET_VERSION_MAJOR) -Wl,-current_version -Wl,$(SO_TARGET_VERSION) -install_name $(notdir $(MIDDLE_SO_TARGET)) $(PKGDATA_TRAILING_SPACE) +-endif ++LD_SONAME = -Wl,-compatibility_version -Wl,$(SO_TARGET_VERSION_MAJOR) -Wl,-current_version -Wl,$(SO_TARGET_VERSION) -install_name @__________________________________________________URELIB/$(notdir $(MIDDLE_SO_TARGET)) + + ## Compiler switch to embed a runtime search path + LD_RPATH= +@@ -50,10 +46,6 @@ + ## Non-shared intermediate object suffix + STATIC_O = ao + +-## Override Versioned target for a shared library. +-FINAL_SO_TARGET= $(basename $(SO_TARGET)).$(SO_TARGET_VERSION).$(SO) +-MIDDLE_SO_TARGET= $(basename $(SO_TARGET)).$(SO_TARGET_VERSION_MAJOR).$(SO) +- + ## Compilation and dependency rules + %.$(STATIC_O): $(srcdir)/%.c + $(call SILENT_COMPILE,$(strip $(COMPILE.c) $(STATICCPPFLAGS) $(STATICCFLAGS)) -MMD -MT "$*.d $*.o $*.$(STATIC_O)" -o $@ $<) +@@ -67,16 +59,10 @@ + + ## Versioned libraries rules + +-%.$(SO_TARGET_VERSION_MAJOR).$(SO): %.$(SO_TARGET_VERSION).$(SO) ++%.$(SO).$(SO_TARGET_VERSION_MAJOR): %.$(SO).$(SO_TARGET_VERSION) + $(RM) $@ && ln -s ${<F} $@ +-%.$(SO): %.$(SO_TARGET_VERSION_MAJOR).$(SO) +- $(RM) $@ && ln -s ${*F}.$(SO_TARGET_VERSION).$(SO) $@ +- +-# tzcode option +-TZORIG_EXTRA_CFLAGS=-DSTD_INSPIRED +- +-# genren opts +-GENREN_PL_OPTS=-x Mach-O -n '-g' -p '| c++filt' ++%.$(SO): %.$(SO).$(SO_TARGET_VERSION_MAJOR) ++ $(RM) $@ && ln -s ${*F}.$(SO).$(SO_TARGET_VERSION) $@ + + ## Remove shared library 's' + STATIC_PREFIX_WHEN_USED = +diff -ur icu.org/source/tools/toolutil/pkg_genc.cpp icu/source/tools/toolutil/pkg_genc.cpp +--- icu.org/source/tools/toolutil/pkg_genc.cpp 2017-04-13 11:46:02.000000000 +0200 ++++ icu/source/tools/toolutil/pkg_genc.cpp 2017-04-21 21:30:23.583568212 +0200 +@@ -160,6 +160,28 @@ + + ".long ","",HEX_0X + }, ++ {"gcc-android-arm", ++ "\t.arch armv5te\n" ++ "\t.fpu softvfp\n" ++ "\t.eabi_attribute 20, 1\n" ++ "\t.eabi_attribute 21, 1\n" ++ "\t.eabi_attribute 23, 3\n" ++ "\t.eabi_attribute 24, 1\n" ++ "\t.eabi_attribute 25, 1\n" ++ "\t.eabi_attribute 26, 2\n" ++ "\t.eabi_attribute 30, 6\n" ++ "\t.eabi_attribute 18, 4\n" ++ "\t.file \"%s.s\"\n" ++ "\t.global %s\n" ++ "\t.section .rodata\n" ++ "\t.align 2\n" ++ "\t.type %s, %%object\n" ++ "%s:\n", ++ ++ "\t.word ", ++ "\t.section .note.GNU-stack,\"\",%%progbits\n", ++ HEX_0X ++ }, + /* 16 bytes alignment. */ + /* http://docs.oracle.com/cd/E19641-01/802-1947/802-1947.pdf */ + {"sun", +diff -ur icu.org/source/tools/toolutil/pkg_genc.h icu/source/tools/toolutil/pkg_genc.h +--- icu.org/source/tools/toolutil/pkg_genc.h 2017-01-20 01:20:31.000000000 +0100 ++++ icu/source/tools/toolutil/pkg_genc.h 2017-04-21 21:30:23.582568215 +0200 +@@ -60,7 +60,7 @@ + #endif + + #define LARGE_BUFFER_MAX_SIZE 2048 +-#define SMALL_BUFFER_MAX_SIZE 512 ++#define SMALL_BUFFER_MAX_SIZE 2048 + #define SMALL_BUFFER_FLAG_NAMES 32 + #define BUFFER_PADDING_SIZE 20 + diff --git a/external/icu/icu4c-clang-cl.patch.1 b/external/icu/icu4c-clang-cl.patch.1 new file mode 100644 index 000000000..a111a0df9 --- /dev/null +++ b/external/icu/icu4c-clang-cl.patch.1 @@ -0,0 +1,28 @@ +diff -ur icu.org/source/config/mh-cygwin-msvc icu/source/config/mh-cygwin-msvc +--- icu.org/source/config/mh-cygwin-msvc 2017-01-23 01:38:28.000000000 +0100 ++++ icu/source/config/mh-cygwin-msvc 2017-04-21 23:07:28.482892025 +0200 +@@ -55,8 +55,8 @@ + LDFLAGS+=-nologo + + # Commands to compile +-COMPILE.c= $(CC) $(CPPFLAGS) $(DEFS) $(CFLAGS) -c +-COMPILE.cc= $(CXX) $(CPPFLAGS) $(DEFS) $(CXXFLAGS) -c ++COMPILE.c= true && $(CC) $(CPPFLAGS) $(DEFS) $(CFLAGS) -c ++COMPILE.cc= true && $(CXX) $(CPPFLAGS) $(DEFS) $(CXXFLAGS) -c + + # Commands to link + LINK.c= LINK.EXE -subsystem:console $(LDFLAGS) +diff -ur icu.org/source/runConfigureICU icu/source/runConfigureICU +--- icu.org/source/runConfigureICU 2017-01-23 01:38:28.000000000 +0100 ++++ icu/source/runConfigureICU 2017-04-21 23:07:28.482892025 +0200 +@@ -261,8 +261,8 @@ + Cygwin/MSVC) + THE_OS="Windows with Cygwin" + THE_COMP="Microsoft Visual C++" +- CC=cl; export CC +- CXX=cl; export CXX ++ CC=${CC-cl}; export CC ++ CXX=${CXX-cl}; export CXX + RELEASE_CFLAGS='-Gy -MD' + RELEASE_CXXFLAGS='-Gy -MD' + DEBUG_CFLAGS='-FS -Zi -MDd' diff --git a/external/icu/icu4c-icudata-stdlibs.patch.1 b/external/icu/icu4c-icudata-stdlibs.patch.1 new file mode 100644 index 000000000..c8d66c6ed --- /dev/null +++ b/external/icu/icu4c-icudata-stdlibs.patch.1 @@ -0,0 +1,14 @@ +diff -ur icu.org/source/config/mh-linux icu/source/config/mh-linux +--- icu.org/source/config/mh-linux 2017-04-21 23:09:57.588533707 +0200 ++++ icu/source/config/mh-linux 2017-04-21 23:11:38.075292226 +0200 +@@ -27,7 +27,9 @@ + RPATHLDFLAGS=${LD_RPATH_PRE}'$$ORIGIN' + + ## These are the library specific LDFLAGS +-LDFLAGSICUDT=-nodefaultlibs -nostdlib ++#LDFLAGSICUDT=-nodefaultlibs -nostdlib ++# Debian change: linking icudata as data only causes too many problems. ++LDFLAGSICUDT= + + ## Compiler switch to embed a library name + # The initial tab in the next line is to prevent icu-config from reading it. diff --git a/external/icu/icu4c-khmerbreakengine.patch.1 b/external/icu/icu4c-khmerbreakengine.patch.1 new file mode 100644 index 000000000..272d0b8ab --- /dev/null +++ b/external/icu/icu4c-khmerbreakengine.patch.1 @@ -0,0 +1,845 @@ +diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp +--- icu.org/source/common/dictbe.cpp 2020-04-22 22:04:20.000000000 +0200 ++++ icu/source/common/dictbe.cpp 2020-05-11 18:55:07.702282061 +0200 +@@ -32,7 +32,19 @@ + ****************************************************************** + */ + +-DictionaryBreakEngine::DictionaryBreakEngine() { ++DictionaryBreakEngine::DictionaryBreakEngine() ++ : fTypes(0), clusterLimit(0) { ++} ++ ++DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) ++ : fTypes(breakTypes), clusterLimit(3) { ++ UErrorCode status = U_ZERO_ERROR; ++ fViramaSet.applyPattern(UNICODE_STRING_SIMPLE("[[:ccc=VR:]]"), status); ++ ++ // note Skip Sets contain fIgnoreSet characters too. ++ fSkipStartSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=OP:][:lb=QU:]\\u200C\\u200D\\u2060]"), status); ++ fSkipEndSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CP:][:lb=QU:][:lb=EX:][:lb=CL:]\\u200C\\u200D\\u2060]"), status); ++ fNBeforeSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CR:][:lb=LF:][:lb=NL:][:lb=SP:][:lb=ZW:][:lb=IS:][:lb=BA:][:lb=NS:]]"), status); + } + + DictionaryBreakEngine::~DictionaryBreakEngine() { +@@ -79,6 +91,169 @@ + fSet.compact(); + } + ++bool ++DictionaryBreakEngine::scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const { ++ UErrorCode status = U_ZERO_ERROR; ++ UText* ut = utext_clone(NULL, text, false, true, &status); ++ utext_setNativeIndex(ut, start); ++ UChar32 c = utext_current32(ut); ++ bool res = false; ++ doBreak = true; ++ while (start >= 0) { ++ if (!fSkipStartSet.contains(c)) { ++ res = (c == ZWSP); ++ break; ++ } ++ --start; ++ c = utext_previous32(ut); ++ doBreak = false; ++ } ++ utext_close(ut); ++ return res; ++} ++ ++bool ++DictionaryBreakEngine::scanAfterEnd(UText *text, int32_t textEnd, int32_t& end, bool &doBreak) const { ++ UErrorCode status = U_ZERO_ERROR; ++ UText* ut = utext_clone(NULL, text, false, true, &status); ++ utext_setNativeIndex(ut, end); ++ UChar32 c = utext_current32(ut); ++ bool res = false; ++ doBreak = !fNBeforeSet.contains(c); ++ while (end < textEnd) { ++ if (!fSkipEndSet.contains(c)) { ++ res = (c == ZWSP); ++ break; ++ } ++ ++end; ++ c = utext_next32(ut); ++ doBreak = false; ++ } ++ utext_close(ut); ++ return res; ++} ++ ++void ++DictionaryBreakEngine::scanBackClusters(UText *text, int32_t textStart, int32_t& start) const { ++ UChar32 c = 0; ++ start = utext_getNativeIndex(text); ++ while (start > textStart) { ++ c = utext_previous32(text); ++ --start; ++ if (!fSkipEndSet.contains(c)) ++ break; ++ } ++ for (int i = 0; i < clusterLimit; ++i) { // scan backwards clusterLimit clusters ++ while (start > textStart) { ++ while (fIgnoreSet.contains(c)) ++ c = utext_previous32(text); ++ if (!fMarkSet.contains(c)) { ++ if (fBaseSet.contains(c)) { ++ c = utext_previous32(text); ++ if (!fViramaSet.contains(c)) { // Virama (e.g. coeng) preceding base. Treat sequence as a mark ++ utext_next32(text); ++ c = utext_current32(text); ++ break; ++ } else { ++ --start; ++ } ++ } else { ++ break; ++ } ++ } ++ c = utext_previous32(text); ++ --start; ++ } ++ if (!fBaseSet.contains(c) || start < textStart) { // not a cluster start so finish ++ break; ++ } ++ c = utext_previous32(text); ++ --start; // go round again ++ } // ignore hitting previous inhibitor since scanning for it should have found us! ++ ++start; // counteract --before ++} ++ ++void ++DictionaryBreakEngine::scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const { ++ UChar32 c = utext_current32(text); ++ end = utext_getNativeIndex(text); ++ while (end < textEnd) { ++ if (!fSkipStartSet.contains(c)) ++ break; ++ utext_next32(text); ++ c = utext_current32(text); ++ ++end; ++ } ++ for (int i = 0; i < clusterLimit; ++i) { // scan forwards clusterLimit clusters ++ while (fIgnoreSet.contains(c)) { ++ utext_next32(text); ++ c = utext_current32(text); ++ } ++ if (fBaseSet.contains(c)) { ++ while (end < textEnd) { ++ utext_next32(text); ++ c = utext_current32(text); ++ ++end; ++ if (!fMarkSet.contains(c)) ++ break; ++ else if (fViramaSet.contains(c)) { // handle coeng + base as mark ++ utext_next32(text); ++ c = utext_current32(text); ++ ++end; ++ if (!fBaseSet.contains(c)) ++ break; ++ } ++ } ++ } else { ++ --end; // bad char so break after char before it ++ break; ++ } ++ } ++} ++ ++bool ++DictionaryBreakEngine::scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const { ++ UErrorCode status = U_ZERO_ERROR; ++ UText* ut = utext_clone(NULL, text, false, true, &status); ++ int32_t nat = start; ++ utext_setNativeIndex(ut, nat); ++ bool foundFirst = true; ++ int32_t curr = start; ++ while (nat < end) { ++ UChar32 c = utext_current32(ut); ++ if (c == ZWSP || c == WJ) { ++ curr = nat + 1; ++ if (foundFirst) // only scan backwards for first inhibitor ++ scanBackClusters(ut, start, before); ++ foundFirst = false; // don't scan backwards if we go around again. Also marks found something ++ ++ utext_next32(ut); ++ scanFwdClusters(ut, end, after); ++ nat = after + 1; ++ ++ if (c == ZWSP || c == WJ) { // did we hit another one? ++ continue; ++ } else { ++ break; ++ } ++ } ++ ++ ++nat; // keep hunting ++ utext_next32(ut); ++ } ++ ++ utext_close(ut); ++ ++ if (nat >= end && foundFirst) { ++ start = before = after = nat; ++ return false; // failed to find anything ++ } ++ else { ++ start = curr; ++ } ++ return true; // yup hit one ++} ++ + /* + ****************************************************************** + * PossibleWord +@@ -108,7 +283,7 @@ + ~PossibleWord() {} + + // Fill the list of candidates if needed, select the longest, and return the number found +- int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ); ++ int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet = NULL, int32_t minLength = 0 ); + + // Select the currently marked candidate, point after it in the text, and invalidate self + int32_t acceptMarked( UText *text ); +@@ -129,12 +304,12 @@ + }; + + +-int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) { ++int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet, int32_t minLength) { + // TODO: If getIndex is too slow, use offset < 0 and add discardAll() + int32_t start = (int32_t)utext_getNativeIndex(text); + if (start != offset) { + offset = start; +- count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix); ++ count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix, ignoreSet, minLength); + // Dictionary leaves text after longest prefix, not longest word. Back up. + if (count <= 0) { + utext_setNativeIndex(text, start); +@@ -815,53 +990,30 @@ + * KhmerBreakEngine + */ + +-// How many words in a row are "good enough"? +-static const int32_t KHMER_LOOKAHEAD = 3; +- +-// Will not combine a non-word with a preceding dictionary word longer than this +-static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3; +- +-// Will not combine a non-word that shares at least this much prefix with a +-// dictionary word, with a preceding word +-static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3; +- +-// Minimum word size +-static const int32_t KHMER_MIN_WORD = 2; +- +-// Minimum number of characters for two words +-static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2; +- + KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) +- : DictionaryBreakEngine(), ++ : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)), + fDictionary(adoptDictionary) + { + UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); + UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr"); +- fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status); ++ ++ clusterLimit = 3; ++ ++ fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]\\u2060\\u200C\\u200D]"), status); + if (U_SUCCESS(status)) { + setCharacters(fKhmerWordSet); + } + fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); +- fMarkSet.add(0x0020); +- fEndWordSet = fKhmerWordSet; +- fBeginWordSet.add(0x1780, 0x17B3); +- //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels +- //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word +- //fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word +- fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters +- //fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels +-// fEndWordSet.remove(0x0E31); // MAI HAN-AKAT +-// fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI +-// fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK +-// fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI +-// fSuffixSet.add(THAI_PAIYANNOI); +-// fSuffixSet.add(THAI_MAIYAMOK); ++ fIgnoreSet.add(0x2060); // WJ ++ fIgnoreSet.add(0x200C, 0x200D); // ZWJ, ZWNJ ++ fBaseSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:lb=SA:]&[:^M:]]"), status); ++ fPuncSet.applyPattern(UNICODE_STRING_SIMPLE("[\\u17D4\\u17D5\\u17D6\\u17D7\\u17D9:]"), status); + + // Compact for caching. + fMarkSet.compact(); +- fEndWordSet.compact(); +- fBeginWordSet.compact(); +-// fSuffixSet.compact(); ++ fIgnoreSet.compact(); ++ fBaseSet.compact(); ++ fPuncSet.compact(); + UTRACE_EXIT_STATUS(status); + } + +@@ -874,180 +1026,204 @@ + int32_t rangeStart, + int32_t rangeEnd, + UVector32 &foundBreaks ) const { +- if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { +- return 0; // Not enough characters for two words +- } +- +- uint32_t wordsFound = 0; +- int32_t cpWordLength = 0; +- int32_t cuWordLength = 0; +- int32_t current; ++ uint32_t wordsFound = foundBreaks.size(); + UErrorCode status = U_ZERO_ERROR; +- PossibleWord words[KHMER_LOOKAHEAD]; +- ++ int32_t before = 0; ++ int32_t after = 0; ++ int32_t finalBefore = 0; ++ int32_t initAfter = 0; ++ int32_t scanStart = rangeStart; ++ int32_t scanEnd = rangeEnd; ++ ++ bool startZwsp = false; ++ bool breakStart = false; ++ bool breakEnd = false; ++ ++ if (rangeStart > 0) { ++ --scanStart; ++ startZwsp = scanBeforeStart(text, scanStart, breakStart); ++ } + utext_setNativeIndex(text, rangeStart); ++ scanFwdClusters(text, rangeEnd, initAfter); ++ bool endZwsp = scanAfterEnd(text, utext_nativeLength(text), scanEnd, breakEnd); ++ utext_setNativeIndex(text, rangeEnd - 1); ++ scanBackClusters(text, rangeStart, finalBefore); ++ if (finalBefore < initAfter) { // the whole run is tented so no breaks ++ if (breakStart || fTypes < UBRK_LINE) ++ foundBreaks.push(rangeStart, status); ++ if (breakEnd || fTypes < UBRK_LINE) ++ foundBreaks.push(rangeEnd, status); ++ return foundBreaks.size() - wordsFound; ++ } + +- while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { +- cuWordLength = 0; +- cpWordLength = 0; +- +- // Look for candidate words at the current position +- int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); +- +- // If we found exactly one, use that +- if (candidates == 1) { +- cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text); +- cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength(); +- wordsFound += 1; +- } ++ scanStart = rangeStart; ++ scanWJ(text, scanStart, rangeEnd, before, after); ++ if (startZwsp || initAfter >= before) { ++ after = initAfter; ++ before = 0; ++ } ++ if (!endZwsp && after > finalBefore && after < rangeEnd) ++ endZwsp = true; ++ if (endZwsp && before > finalBefore) ++ before = finalBefore; + +- // If there was more than one, see which one can take us forward the most words +- else if (candidates > 1) { +- // If we're already at the end of the range, we're done +- if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { +- goto foundBest; +- } +- do { +- int32_t wordsMatched = 1; +- if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { +- if (wordsMatched < 2) { +- // Followed by another dictionary word; mark first word as a good candidate +- words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); +- wordsMatched = 2; +- } ++ utext_setNativeIndex(text, rangeStart); ++ int32_t numCodePts = rangeEnd - rangeStart; ++ // bestSnlp[i] is the snlp of the best segmentation of the first i ++ // code points in the range to be matched. ++ UVector32 bestSnlp(numCodePts + 1, status); ++ bestSnlp.addElement(0, status); ++ for(int32_t i = 1; i <= numCodePts; i++) { ++ bestSnlp.addElement(kuint32max, status); ++ } + +- // If we're already at the end of the range, we're done +- if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { +- goto foundBest; +- } ++ // prev[i] is the index of the last code point in the previous word in ++ // the best segmentation of the first i characters. Note negative implies ++ // that the code point is part of an unknown word. ++ UVector32 prev(numCodePts + 1, status); ++ for(int32_t i = 0; i <= numCodePts; i++) { ++ prev.addElement(kuint32max, status); ++ } + +- // See if any of the possible second words is followed by a third word +- do { +- // If we find a third word, stop right away +- if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { +- words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); +- goto foundBest; +- } +- } +- while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text)); +- } ++ const int32_t maxWordSize = 20; ++ UVector32 values(maxWordSize, status); ++ values.setSize(maxWordSize); ++ UVector32 lengths(maxWordSize, status); ++ lengths.setSize(maxWordSize); ++ ++ // Dynamic programming to find the best segmentation. ++ ++ // In outer loop, i is the code point index, ++ // ix is the corresponding string (code unit) index. ++ // They differ when the string contains supplementary characters. ++ int32_t ix = rangeStart; ++ for (int32_t i = 0; i < numCodePts; ++i, utext_setNativeIndex(text, ++ix)) { ++ if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) { ++ continue; ++ } ++ ++ int32_t count; ++ count = fDictionary->matches(text, numCodePts - i, maxWordSize, ++ NULL, lengths.getBuffer(), values.getBuffer(), NULL, &fIgnoreSet, 2); ++ // Note: lengths is filled with code point lengths ++ // The NULL parameter is the ignored code unit lengths. ++ ++ for (int32_t j = 0; j < count; j++) { ++ int32_t ln = lengths.elementAti(j); ++ if (ln + i >= numCodePts) ++ continue; ++ utext_setNativeIndex(text, ln+ix); ++ int32_t c = utext_current32(text); ++ if (fMarkSet.contains(c) || c == 0x17D2) { // Coeng ++ lengths.removeElementAt(j); ++ values.removeElementAt(j); ++ --j; ++ --count; + } +- while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text)); +-foundBest: +- cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text); +- cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength(); +- wordsFound += 1; + } +- +- // We come here after having either found a word or not. We look ahead to the +- // next word. If it's not a dictionary word, we will combine it with the word we +- // just found (if there is one), but only if the preceding word does not exceed +- // the threshold. +- // The text iterator should now be positioned at the end of the word we found. +- if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) { +- // if it is a dictionary word, do nothing. If it isn't, then if there is +- // no preceding word, or the non-word shares less than the minimum threshold +- // of characters with a dictionary word, then scan to resynchronize +- if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 +- && (cuWordLength == 0 +- || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) { +- // Look for a plausible word boundary +- int32_t remaining = rangeEnd - (current+cuWordLength); +- UChar32 pc; +- UChar32 uc; +- int32_t chars = 0; +- for (;;) { +- int32_t pcIndex = (int32_t)utext_getNativeIndex(text); +- pc = utext_next32(text); +- int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex; +- chars += pcSize; +- remaining -= pcSize; +- if (remaining <= 0) { ++ if (count == 0) { ++ utext_setNativeIndex(text, ix); ++ int32_t c = utext_current32(text); ++ if (fPuncSet.contains(c) || fIgnoreSet.contains(c) || c == ZWSP) { ++ values.setElementAt(0, count); ++ lengths.setElementAt(1, count++); ++ } else if (fBaseSet.contains(c)) { ++ int32_t currix = utext_getNativeIndex(text); ++ do { ++ utext_next32(text); ++ c = utext_current32(text); ++ if (utext_getNativeIndex(text) >= rangeEnd) + break; +- } +- uc = utext_current32(text); +- if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { +- // Maybe. See if it's in the dictionary. +- int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); +- utext_setNativeIndex(text, current+cuWordLength+chars); +- if (num_candidates > 0) { ++ if (c == 0x17D2) { // Coeng ++ utext_next32(text); ++ c = utext_current32(text); ++ if (!fBaseSet.contains(c) || utext_getNativeIndex(text) >= rangeEnd) { + break; ++ } else { ++ utext_next32(text); ++ c = utext_current32(text); ++ if (utext_getNativeIndex(text) >= rangeEnd) ++ break; + } + } +- } +- +- // Bump the word count if there wasn't already one +- if (cuWordLength <= 0) { +- wordsFound += 1; +- } ++ } while (fMarkSet.contains(c) || fIgnoreSet.contains(c)); ++ values.setElementAt(BADSNLP, count); ++ lengths.setElementAt(utext_getNativeIndex(text) - currix, count++); ++ } else { ++ values.setElementAt(BADSNLP, count); ++ lengths.setElementAt(1, count++); ++ } ++ } + +- // Update the length with the passed-over characters +- cuWordLength += chars; ++ for (int32_t j = 0; j < count; j++) { ++ uint32_t v = values.elementAti(j); ++ int32_t newSnlp = bestSnlp.elementAti(i) + v; ++ int32_t ln = lengths.elementAti(j); ++ utext_setNativeIndex(text, ln+ix); ++ int32_t c = utext_current32(text); ++ while ((fPuncSet.contains(c) || fIgnoreSet.contains(c)) && ln + i < numCodePts) { ++ ++ln; ++ utext_next32(text); ++ c = utext_current32(text); + } +- else { +- // Back up to where we were for next iteration +- utext_setNativeIndex(text, current+cuWordLength); ++ int32_t ln_j_i = ln + i; // yes really i! ++ if (newSnlp < bestSnlp.elementAti(ln_j_i)) { ++ if (v == BADSNLP) { ++ int32_t p = prev.elementAti(i); ++ if (p < 0) ++ prev.setElementAt(p, ln_j_i); ++ else ++ prev.setElementAt(-i, ln_j_i); ++ } ++ else ++ prev.setElementAt(i, ln_j_i); ++ bestSnlp.setElementAt(newSnlp, ln_j_i); + } + } +- +- // Never stop before a combining mark. +- int32_t currPos; +- while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { +- utext_next32(text); +- cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos; ++ } ++ // Start pushing the optimal offset index into t_boundary (t for tentative). ++ // prev[numCodePts] is guaranteed to be meaningful. ++ // We'll first push in the reverse order, i.e., ++ // t_boundary[0] = numCodePts, and afterwards do a swap. ++ UVector32 t_boundary(numCodePts+1, status); ++ ++ int32_t numBreaks = 0; ++ // No segmentation found, set boundary to end of range ++ while (numCodePts >= 0 && (uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) { ++ --numCodePts; ++ } ++ if (numCodePts < 0) { ++ t_boundary.addElement(numCodePts, status); ++ numBreaks++; ++ } else { ++ for (int32_t i = numCodePts; (uint32_t)i != kuint32max; i = prev.elementAti(i)) { ++ if (i < 0) i = -i; ++ t_boundary.addElement(i, status); ++ numBreaks++; + } ++ U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0); ++ } + +- // Look ahead for possible suffixes if a dictionary word does not follow. +- // We do this in code rather than using a rule so that the heuristic +- // resynch continues to function. For example, one of the suffix characters +- // could be a typo in the middle of a word. +-// if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { +-// if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 +-// && fSuffixSet.contains(uc = utext_current32(text))) { +-// if (uc == KHMER_PAIYANNOI) { +-// if (!fSuffixSet.contains(utext_previous32(text))) { +-// // Skip over previous end and PAIYANNOI +-// utext_next32(text); +-// utext_next32(text); +-// wordLength += 1; // Add PAIYANNOI to word +-// uc = utext_current32(text); // Fetch next character +-// } +-// else { +-// // Restore prior position +-// utext_next32(text); +-// } +-// } +-// if (uc == KHMER_MAIYAMOK) { +-// if (utext_previous32(text) != KHMER_MAIYAMOK) { +-// // Skip over previous end and MAIYAMOK +-// utext_next32(text); +-// utext_next32(text); +-// wordLength += 1; // Add MAIYAMOK to word +-// } +-// else { +-// // Restore prior position +-// utext_next32(text); +-// } +-// } +-// } +-// else { +-// utext_setNativeIndex(text, current+wordLength); +-// } +-// } +- +- // Did we find a word on this iteration? If so, push it on the break stack +- if (cuWordLength > 0) { +- foundBreaks.push((current+cuWordLength), status); ++ // Now that we're done, convert positions in t_boundary[] (indices in ++ // the normalized input string) back to indices in the original input UText ++ // while reversing t_boundary and pushing values to foundBreaks. ++ for (int32_t i = numBreaks-1; i >= 0; i--) { ++ int32_t cpPos = t_boundary.elementAti(i); ++ if (cpPos == 0 && !breakStart && fTypes >= UBRK_LINE) continue; ++ int32_t utextPos = cpPos + rangeStart; ++ while (utextPos > after && scanWJ(text, utextPos, scanEnd, before, after)); ++ if (utextPos < before) { ++ // Boundaries are added to foundBreaks output in ascending order. ++ U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos); ++ foundBreaks.push(utextPos, status); + } + } +- ++ + // Don't return a break for the end of the dictionary range if there is one there. +- if (foundBreaks.peeki() >= rangeEnd) { ++ if (!breakEnd && fTypes >= UBRK_LINE && foundBreaks.peeki() >= rangeEnd) { + (void) foundBreaks.popi(); +- wordsFound -= 1; + } +- +- return wordsFound; ++ return foundBreaks.size() - wordsFound; + } + + #if !UCONFIG_NO_NORMALIZATION +diff -ur icu.org/source/common/dictbe.h icu/source/common/dictbe.h +--- icu.org/source/common/dictbe.h 2020-04-22 22:04:20.000000000 +0200 ++++ icu/source/common/dictbe.h 2020-05-11 19:08:24.754634732 +0200 +@@ -34,7 +34,8 @@ + * threads without synchronization.</p> + */ + class DictionaryBreakEngine : public LanguageBreakEngine { +- private: ++ protected: ++ + /** + * The set of characters handled by this engine + * @internal +@@ -42,14 +43,84 @@ + + UnicodeSet fSet; + ++ const int32_t WJ = 0x2060; ++ const int32_t ZWSP = 0x200B; ++ ++ /** ++ * The break types it was constructed with ++ * @internal ++ */ ++ uint32_t fTypes; ++ ++ /** ++ * A Unicode set of all viramas ++ * @internal ++ */ ++ UnicodeSet fViramaSet; ++ ++ /** ++ * A Unicode set of all base characters ++ * @internal ++ */ ++ UnicodeSet fBaseSet; ++ ++ /** ++ * A Unicode set of all marks ++ * @internal ++ */ ++ UnicodeSet fMarkSet; ++ ++ /** ++ * A Unicode set of all characters ignored ignored in dictionary matching ++ * @internal ++ */ ++ UnicodeSet fIgnoreSet; ++ ++ /** ++ * A Unicode set of all characters ignored ignored in dictionary matching ++ * @internal ++ */ ++ UnicodeSet fSkipStartSet; ++ ++ /** ++ * A Unicode set of all characters ignored ignored in dictionary matching ++ * @internal ++ */ ++ UnicodeSet fSkipEndSet; ++ ++ /** ++ * A Unicode set of all characters that should not be broken before ++ * @internal ++ */ ++ UnicodeSet fNBeforeSet; ++ ++ /** ++ * The number of clusters within which breaks are inhibited ++ * @internal ++ */ ++ int32_t clusterLimit; ++ ++ bool scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const; ++ ++ bool scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const; ++ bool scanAfterEnd(UText *text, int32_t rangeEnd, int32_t& end, bool &doBreak) const; ++ void scanBackClusters(UText *text, int32_t textStart, int32_t& start) const; ++ void scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const; ++ + public: + + /** +- * <p>Constructor </p> ++ * <p>Default constructor.</p> ++ * + */ + DictionaryBreakEngine(); + + /** ++ * <p>Constructor with break types.</p> ++ */ ++ explicit DictionaryBreakEngine(uint32_t breakTypes); ++ ++ /** + * <p>Virtual destructor.</p> + */ + virtual ~DictionaryBreakEngine(); +@@ -293,11 +364,13 @@ + */ + + UnicodeSet fKhmerWordSet; +- UnicodeSet fEndWordSet; +- UnicodeSet fBeginWordSet; +- UnicodeSet fMarkSet; +- DictionaryMatcher *fDictionary; +- ++ UnicodeSet fBeginWordSet; ++ UnicodeSet fPuncSet; ++ DictionaryMatcher *fDictionary; ++ ++ const uint32_t BADSNLP = 256 * 20; ++ const uint32_t kuint32max = 0x7FFFFFFF; ++ + public: + + /** +diff -ur icu.org/source/common/dictionarydata.cpp icu/source/common/dictionarydata.cpp +--- icu.org/source/common/dictionarydata.cpp 2020-04-22 22:04:20.000000000 +0200 ++++ icu/source/common/dictionarydata.cpp 2020-05-11 18:50:43.703113749 +0200 +@@ -44,7 +44,7 @@ + + int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, + int32_t *lengths, int32_t *cpLengths, int32_t *values, +- int32_t *prefix) const { ++ int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const { + + UCharsTrie uct(characters); + int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text); +@@ -55,7 +55,13 @@ + UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c); + int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex; + codePointsMatched += 1; ++ if (ignoreSet != NULL && ignoreSet->contains(c)) { ++ continue; ++ } + if (USTRINGTRIE_HAS_VALUE(result)) { ++ if (codePointsMatched < minLength) { ++ continue; ++ } + if (wordCount < limit) { + if (values != NULL) { + values[wordCount] = uct.getValue(); +@@ -112,7 +118,7 @@ + + int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, + int32_t *lengths, int32_t *cpLengths, int32_t *values, +- int32_t *prefix) const { ++ int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const { + BytesTrie bt(characters); + int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text); + int32_t wordCount = 0; +@@ -122,7 +128,13 @@ + UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c)); + int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex; + codePointsMatched += 1; ++ if (ignoreSet != NULL && ignoreSet->contains(c)) { ++ continue; ++ } + if (USTRINGTRIE_HAS_VALUE(result)) { ++ if (codePointsMatched < minLength) { ++ continue; ++ } + if (wordCount < limit) { + if (values != NULL) { + values[wordCount] = bt.getValue(); +diff -ur icu.org/source/common/dictionarydata.h icu/source/common/dictionarydata.h +--- icu.org/source/common/dictionarydata.h 2020-04-22 22:04:20.000000000 +0200 ++++ icu/source/common/dictionarydata.h 2020-05-11 18:50:43.704113746 +0200 +@@ -21,6 +21,7 @@ + #include "unicode/utext.h" + #include "unicode/udata.h" + #include "udataswp.h" ++#include "unicode/uniset.h" + #include "unicode/uobject.h" + #include "unicode/ustringtrie.h" + +@@ -92,7 +93,7 @@ + */ + virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit, + int32_t *lengths, int32_t *cpLengths, int32_t *values, +- int32_t *prefix) const = 0; ++ int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const = 0; + + /** @return DictionaryData::TRIE_TYPE_XYZ */ + virtual int32_t getType() const = 0; +@@ -107,7 +108,7 @@ + virtual ~UCharsDictionaryMatcher(); + virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit, + int32_t *lengths, int32_t *cpLengths, int32_t *values, +- int32_t *prefix) const; ++ int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const; + virtual int32_t getType() const; + private: + const UChar *characters; +@@ -125,7 +126,7 @@ + virtual ~BytesDictionaryMatcher(); + virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit, + int32_t *lengths, int32_t *cpLengths, int32_t *values, +- int32_t *prefix) const; ++ int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const; + virtual int32_t getType() const; + private: + UChar32 transform(UChar32 c) const; diff --git a/external/icu/icu4c-macosx.patch.1 b/external/icu/icu4c-macosx.patch.1 new file mode 100644 index 000000000..fee08eb05 --- /dev/null +++ b/external/icu/icu4c-macosx.patch.1 @@ -0,0 +1,20 @@ +diff -ur icu.org/source/common/putil.cpp icu/source/common/putil.cpp +--- icu.org/source/common/putil.cpp 2017-04-10 16:22:16.000000000 +0200 ++++ icu/source/common/putil.cpp 2017-04-21 22:14:09.940217733 +0200 +@@ -1198,8 +1198,16 @@ + static const time_t decemberSolstice=1198332540; /*2007-12-22 06:09 UT*/ + + /* This probing will tell us when daylight savings occurs. */ ++#if U_PLATFORM_IS_DARWIN_BASED ++ struct tm *tmp; ++ tmp = localtime(&juneSolstice); ++ juneSol = *tmp; ++ tmp = localtime(&decemberSolstice); ++ decemberSol = *tmp; ++#else + localtime_r(&juneSolstice, &juneSol); + localtime_r(&decemberSolstice, &decemberSol); ++#endif + if(decemberSol.tm_isdst > 0) { + daylightType = U_DAYLIGHT_DECEMBER; + } else if(juneSol.tm_isdst > 0) { diff --git a/external/icu/icu4c-mkdir.patch.1 b/external/icu/icu4c-mkdir.patch.1 new file mode 100644 index 000000000..112e57cc2 --- /dev/null +++ b/external/icu/icu4c-mkdir.patch.1 @@ -0,0 +1,11 @@ +diff -ur icu.org/source/data/Makefile.in icu/source/data/Makefile.in +--- icu.org/source/data/Makefile.in 2019-04-17 21:42:15.000000000 +0200 ++++ icu/source/data/Makefile.in 2019-10-28 12:57:15.033649494 +0100 +@@ -226,6 +226,7 @@ + ifeq ($(PKGDATA_MODE),dll) + SO_VERSION_DATA = $(OUTTMPDIR)/icudata.res + $(SO_VERSION_DATA) : $(MISCSRCDIR)/icudata.rc ++ mkdir -p $(OUTTMPDIR) + ifeq ($(MSYS_RC_MODE),1) + rc.exe -i$(srcdir)/../common -i$(top_builddir)/common -fo$@ $(CPPFLAGS) $< + else diff --git a/external/icu/icu4c-rpath.patch.1 b/external/icu/icu4c-rpath.patch.1 new file mode 100644 index 000000000..35a545778 --- /dev/null +++ b/external/icu/icu4c-rpath.patch.1 @@ -0,0 +1,36 @@ +diff -ur icu.org/source/config/mh-linux icu/source/config/mh-linux +--- icu.org/source/config/mh-linux 2016-06-15 20:58:17.000000000 +0200 ++++ icu/source/config/mh-linux 2017-04-21 22:38:18.893927819 +0200 +@@ -22,6 +22,10 @@ + LD_RPATH= -Wl,-zorigin,-rpath,'$$'ORIGIN + LD_RPATH_PRE = -Wl,-rpath, + ++## Force RPATH=$ORIGIN to locate own dependencies w/o need for LD_LIBRARY_PATH: ++ENABLE_RPATH=YES ++RPATHLDFLAGS=${LD_RPATH_PRE}'$$ORIGIN' ++ + ## These are the library specific LDFLAGS + LDFLAGSICUDT=-nodefaultlibs -nostdlib + +diff -ur icu.org/source/data/pkgdataMakefile.in icu/source/data/pkgdataMakefile.in +--- icu.org/source/data/pkgdataMakefile.in 2016-06-15 20:58:17.000000000 +0200 ++++ icu/source/data/pkgdataMakefile.in 2017-04-21 22:38:18.892927822 +0200 +@@ -18,6 +18,9 @@ + MIDDLE_SO_TARGET= + PKGDATA_TRAILING_SPACE=" " + ++# escape $ with \ when passing to echo; needed to preserve $ORIGIN ++SHLIB.c.shell := $(subst $$,\$$,$(SHLIB.c)) ++ + all : clean + @echo GENCCODE_ASSEMBLY_TYPE=$(GENCCODE_ASSEMBLY) >> $(OUTPUTFILE) + @echo SO=$(SO) >> $(OUTPUTFILE) +@@ -26,7 +29,7 @@ + @echo LIB_EXT_ORDER=$(FINAL_SO_TARGET) >> $(OUTPUTFILE) + @echo COMPILE="$(COMPILE.c)" >> $(OUTPUTFILE) + @echo LIBFLAGS="-I$(top_srcdir)/common -I$(top_builddir)/common $(SHAREDLIBCPPFLAGS) $(SHAREDLIBCFLAGS)" >> $(OUTPUTFILE) +- @echo GENLIB="$(SHLIB.c)" >> $(OUTPUTFILE) ++ @echo GENLIB="$(SHLIB.c.shell)" >> $(OUTPUTFILE) + @echo LDICUDTFLAGS=$(LDFLAGSICUDT) >> $(OUTPUTFILE) + @echo LD_SONAME=$(LD_SONAME) >> $(OUTPUTFILE) + @echo RPATH_FLAGS=$(RPATH_FLAGS) >> $(OUTPUTFILE) diff --git a/external/icu/icu4c-rtti.patch.1 b/external/icu/icu4c-rtti.patch.1 new file mode 100644 index 000000000..c058c7f3c --- /dev/null +++ b/external/icu/icu4c-rtti.patch.1 @@ -0,0 +1,12 @@ +diff -ur icu.org/source/config/mh-linux icu/source/config/mh-linux +--- icu.org/source/config/mh-linux 2017-04-21 23:01:23.257769703 +0200 ++++ icu/source/config/mh-linux 2017-04-21 23:03:23.166481552 +0200 +@@ -36,7 +36,7 @@ + #SH# LD_SONAME= + + ## Shared library options +-LD_SOOPTIONS= -Wl,-Bsymbolic ++LD_SOOPTIONS= -Wl,-Bsymbolic-functions + + ## Shared object suffix + SO = so diff --git a/external/icu/icu4c-scriptrun.patch.1 b/external/icu/icu4c-scriptrun.patch.1 new file mode 100644 index 000000000..f2f2cf9f3 --- /dev/null +++ b/external/icu/icu4c-scriptrun.patch.1 @@ -0,0 +1,60 @@ +diff -ur icu.org/source/extra/scrptrun/scrptrun.cpp icu/source/extra/scrptrun/scrptrun.cpp +--- icu.org/source/extra/scrptrun/scrptrun.cpp 2017-01-20 01:20:31.000000000 +0100 ++++ icu/source/extra/scrptrun/scrptrun.cpp 2017-04-21 22:59:31.708037770 +0200 +@@ -151,7 +151,11 @@ + // characters above it on the stack will be poped. + if (pairIndex >= 0) { + if ((pairIndex & 1) == 0) { +- parenStack[++parenSP].pairIndex = pairIndex; ++ ++parenSP; ++ int32_t nVecSize = parenStack.size(); ++ if (parenSP == nVecSize) ++ parenStack.resize(nVecSize + 128); ++ parenStack[parenSP].pairIndex = pairIndex; + parenStack[parenSP].scriptCode = scriptCode; + } else if (parenSP >= 0) { + int32_t pi = pairIndex & ~1; +@@ -185,7 +189,14 @@ + // pop it from the stack + if (pairIndex >= 0 && (pairIndex & 1) != 0 && parenSP >= 0) { + parenSP -= 1; +- startSP -= 1; ++ /* decrement startSP only if it is >= 0, ++ decrementing it unnecessarily will lead to memory corruption ++ while processing the above while block. ++ e.g. startSP = -4 , parenSP = -1 ++ */ ++ if (startSP >= 0) { ++ startSP -= 1; ++ } + } + } else { + // if the run broke on a surrogate pair, +diff -ur icu.org/source/extra/scrptrun/scrptrun.h icu/source/extra/scrptrun/scrptrun.h +--- icu.org/source/extra/scrptrun/scrptrun.h 2017-01-20 01:20:31.000000000 +0100 ++++ icu/source/extra/scrptrun/scrptrun.h 2017-04-21 22:59:31.708037770 +0200 +@@ -19,6 +19,7 @@ + #include "unicode/utypes.h" + #include "unicode/uobject.h" + #include "unicode/uscript.h" ++#include <vector> + + U_NAMESPACE_BEGIN + +@@ -81,7 +82,7 @@ + int32_t scriptEnd; + UScriptCode scriptCode; + +- ParenStackEntry parenStack[128]; ++ std::vector<ParenStackEntry> parenStack; + int32_t parenSP; + + static int8_t highBit(int32_t value); +@@ -135,6 +136,7 @@ + scriptEnd = charStart; + scriptCode = USCRIPT_INVALID_CODE; + parenSP = -1; ++ parenStack.resize(128); + } + + inline void ScriptRun::reset(int32_t start, int32_t length) diff --git a/external/icu/icu4c-solarisgcc.patch.1 b/external/icu/icu4c-solarisgcc.patch.1 new file mode 100644 index 000000000..6000ed0cb --- /dev/null +++ b/external/icu/icu4c-solarisgcc.patch.1 @@ -0,0 +1,12 @@ +diff -ur icu.org/source/common/uposixdefs.h icu/source/common/uposixdefs.h +--- icu.org/source/common/uposixdefs.h 2017-03-09 03:12:45.000000000 +0100 ++++ icu/source/common/uposixdefs.h 2017-04-21 22:23:11.857926971 +0200 +@@ -54,7 +54,7 @@ + * + * z/OS needs this definition for timeval and to get usleep. + */ +-#if !defined(_XOPEN_SOURCE_EXTENDED) && defined(__TOS_MVS__) ++#if !defined(_XOPEN_SOURCE_EXTENDED) && (defined(__TOS_MVS__) || defined(__IBMC__) || defined(__IBMCPP__)) + # define _XOPEN_SOURCE_EXTENDED 1 + #endif + diff --git a/external/icu/icu4c-ubsan.patch.1 b/external/icu/icu4c-ubsan.patch.1 new file mode 100644 index 000000000..7b0c2efc9 --- /dev/null +++ b/external/icu/icu4c-ubsan.patch.1 @@ -0,0 +1,14 @@ +diff -ur icu.org/source/common/ubidiimp.h icu/source/common/ubidiimp.h +--- icu.org/source/common/ubidiimp.h 2019-10-03 13:16:41.000000000 +0200 ++++ icu/source/common/ubidiimp.h 2019-10-28 19:08:13.533284618 +0100 +@@ -198,8 +198,8 @@ + /* in a Run, logicalStart will get this bit set if the run level is odd */ + #define INDEX_ODD_BIT (1UL<<31) + +-#define MAKE_INDEX_ODD_PAIR(index, level) ((index)|((int32_t)((level)&1)<<31)) +-#define ADD_ODD_BIT_FROM_LEVEL(x, level) ((x)|=((int32_t)((level)&1)<<31)) ++#define MAKE_INDEX_ODD_PAIR(index, level) ((index)|((uint32_t)((level)&1)<<31)) ++#define ADD_ODD_BIT_FROM_LEVEL(x, level) ((x)|=((uint32_t)((level)&1)<<31)) + #define REMOVE_ODD_BIT(x) ((x)&=~INDEX_ODD_BIT) + + #define GET_INDEX(x) ((x)&~INDEX_ODD_BIT) diff --git a/external/icu/icu4c-warnings.patch.1 b/external/icu/icu4c-warnings.patch.1 new file mode 100644 index 000000000..76f8b7298 --- /dev/null +++ b/external/icu/icu4c-warnings.patch.1 @@ -0,0 +1,11 @@ +diff -ur icu.org/source/common/unicode/utf16.h icu/source/common/unicode/utf16.h +--- icu.org/source/common/unicode/utf16.h 2019-10-03 13:16:41.000000000 +0200 ++++ icu/source/common/unicode/utf16.h 2019-10-28 18:03:07.967208272 +0100 +@@ -397,6 +397,7 @@ + (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ + } else /* c>0x10ffff or not enough space */ { \ + (isError)=TRUE; \ ++ (void)(isError); \ + } \ + } UPRV_BLOCK_MACRO_END + diff --git a/external/icu/khmerdict.dict b/external/icu/khmerdict.dict Binary files differnew file mode 100644 index 000000000..52605b654 --- /dev/null +++ b/external/icu/khmerdict.dict diff --git a/external/icu/ubsan.patch b/external/icu/ubsan.patch new file mode 100644 index 000000000..762bd6e5a --- /dev/null +++ b/external/icu/ubsan.patch @@ -0,0 +1,52 @@ +--- source/common/uloc.cpp ++++ source/common/uloc.cpp +@@ -1203,7 +1203,8 @@ + return 0; + } + int32_t reslen = result.length(); +- uprv_memcpy(language, result.data(), std::min(reslen, languageCapacity)); ++ auto const n = std::min(reslen, languageCapacity); ++ if (n != 0) uprv_memcpy(language, result.data(), n); + return reslen; + } + +@@ -1251,7 +1252,8 @@ + return 0; + } + int32_t reslen = result.length(); +- uprv_memcpy(script, result.data(), std::min(reslen, scriptCapacity)); ++ auto const n = std::min(reslen, scriptCapacity); ++ if (n != 0) uprv_memcpy(script, result.data(), n); + return reslen; + } + +--- source/tools/genrb/rbutil.c ++++ source/tools/genrb/rbutil.c +@@ -30,7 +30,12 @@ + get_dirname(char *dirname, + const char *filename) + { +- const char *lastSlash = uprv_strrchr(filename, U_FILE_SEP_CHAR) + 1; ++ const char *lastSlash = uprv_strrchr(filename, U_FILE_SEP_CHAR); ++ if(lastSlash == NULL) { ++ lastSlash = filename; ++ } else { ++ ++lastSlash; ++ } + + if(lastSlash>filename) { + uprv_strncpy(dirname, filename, (lastSlash - filename)); +@@ -46,7 +51,12 @@ + const char *filename) + { + /* strip off any leading directory portions */ +- const char *lastSlash = uprv_strrchr(filename, U_FILE_SEP_CHAR) + 1; ++ const char *lastSlash = uprv_strrchr(filename, U_FILE_SEP_CHAR); ++ if(lastSlash == NULL) { ++ lastSlash = filename; ++ } else { ++ ++lastSlash; ++ } + char *lastDot; + + if(lastSlash>filename) { |