From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 21:33:14 +0200 Subject: Adding upstream version 115.7.0esr. Signed-off-by: Daniel Baumann --- intl/icu/source/common/normalizer2impl.h | 987 +++++++++++++++++++++++++++++++ 1 file changed, 987 insertions(+) create mode 100644 intl/icu/source/common/normalizer2impl.h (limited to 'intl/icu/source/common/normalizer2impl.h') diff --git a/intl/icu/source/common/normalizer2impl.h b/intl/icu/source/common/normalizer2impl.h new file mode 100644 index 0000000000..2cca33d349 --- /dev/null +++ b/intl/icu/source/common/normalizer2impl.h @@ -0,0 +1,987 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2009-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: normalizer2impl.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2009nov22 +* created by: Markus W. Scherer +*/ + +#ifndef __NORMALIZER2IMPL_H__ +#define __NORMALIZER2IMPL_H__ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_NORMALIZATION + +#include "unicode/normalizer2.h" +#include "unicode/ucptrie.h" +#include "unicode/unistr.h" +#include "unicode/unorm.h" +#include "unicode/utf.h" +#include "unicode/utf16.h" +#include "mutex.h" +#include "udataswp.h" +#include "uset_imp.h" + +// When the nfc.nrm data is *not* hardcoded into the common library +// (with this constant set to 0), +// then it needs to be built into the data package: +// Add nfc.nrm to icu4c/source/data/Makefile.in DAT_FILES_SHORT +#define NORM2_HARDCODE_NFC_DATA 1 + +U_NAMESPACE_BEGIN + +struct CanonIterData; + +class ByteSink; +class Edits; +class InitCanonIterData; +class LcccContext; + +class U_COMMON_API Hangul { +public: + /* Korean Hangul and Jamo constants */ + enum { + JAMO_L_BASE=0x1100, /* "lead" jamo */ + JAMO_L_END=0x1112, + JAMO_V_BASE=0x1161, /* "vowel" jamo */ + JAMO_V_END=0x1175, + JAMO_T_BASE=0x11a7, /* "trail" jamo */ + JAMO_T_END=0x11c2, + + HANGUL_BASE=0xac00, + HANGUL_END=0xd7a3, + + JAMO_L_COUNT=19, + JAMO_V_COUNT=21, + JAMO_T_COUNT=28, + + JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT, + + HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT, + HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT + }; + + static inline UBool isHangul(UChar32 c) { + return HANGUL_BASE<=c && c(INERT) : + UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c); + } + uint16_t getRawNorm16(UChar32 c) const { return UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c); } + + UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const { + if(norm16=MIN_NORMAL_MAYBE_YES) { + return getCCFromNormalYesOrMaybe(norm16); + } + if(norm16> OFFSET_SHIFT); + } + static uint8_t getCCFromYesOrMaybe(uint16_t norm16) { + return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0; + } + uint8_t getCCFromYesOrMaybeCP(UChar32 c) const { + if (c < minCompNoMaybeCP) { return 0; } + return getCCFromYesOrMaybe(getNorm16(c)); + } + + /** + * Returns the FCD data for code point c. + * @param c A Unicode code point. + * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. + */ + uint16_t getFCD16(UChar32 c) const { + if(c>8]; + if(bits==0) { return false; } + return (UBool)((bits>>((lead>>5)&7))&1); + } + /** Returns the FCD value from the regular normalization data. */ + uint16_t getFCD16FromNormData(UChar32 c) const; + + /** + * Gets the decomposition for one code point. + * @param c code point + * @param buffer out-only buffer for algorithmic decompositions + * @param length out-only, takes the length of the decomposition, if any + * @return pointer to the decomposition, or NULL if none + */ + const char16_t *getDecomposition(UChar32 c, char16_t buffer[4], int32_t &length) const; + + /** + * Gets the raw decomposition for one code point. + * @param c code point + * @param buffer out-only buffer for algorithmic decompositions + * @param length out-only, takes the length of the decomposition, if any + * @return pointer to the decomposition, or NULL if none + */ + const char16_t *getRawDecomposition(UChar32 c, char16_t buffer[30], int32_t &length) const; + + UChar32 composePair(UChar32 a, UChar32 b) const; + + UBool isCanonSegmentStarter(UChar32 c) const; + UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const; + + enum { + // Fixed norm16 values. + MIN_YES_YES_WITH_CC=0xfe02, + JAMO_VT=0xfe00, + MIN_NORMAL_MAYBE_YES=0xfc00, + JAMO_L=2, // offset=1 hasCompBoundaryAfter=false + INERT=1, // offset=0 hasCompBoundaryAfter=true + + // norm16 bit 0 is comp-boundary-after. + HAS_COMP_BOUNDARY_AFTER=1, + OFFSET_SHIFT=1, + + // For algorithmic one-way mappings, norm16 bits 2..1 indicate the + // tccc (0, 1, >1) for quick FCC boundary-after tests. + DELTA_TCCC_0=0, + DELTA_TCCC_1=2, + DELTA_TCCC_GT_1=4, + DELTA_TCCC_MASK=6, + DELTA_SHIFT=3, + + MAX_DELTA=0x40 + }; + + enum { + // Byte offsets from the start of the data, after the generic header. + IX_NORM_TRIE_OFFSET, + IX_EXTRA_DATA_OFFSET, + IX_SMALL_FCD_OFFSET, + IX_RESERVED3_OFFSET, + IX_RESERVED4_OFFSET, + IX_RESERVED5_OFFSET, + IX_RESERVED6_OFFSET, + IX_TOTAL_SIZE, + + // Code point thresholds for quick check codes. + IX_MIN_DECOMP_NO_CP, + IX_MIN_COMP_NO_MAYBE_CP, + + // Norm16 value thresholds for quick check combinations and types of extra data. + + /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */ + IX_MIN_YES_NO, + /** Mappings are comp-normalized. */ + IX_MIN_NO_NO, + IX_LIMIT_NO_NO, + IX_MIN_MAYBE_YES, + + /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */ + IX_MIN_YES_NO_MAPPINGS_ONLY, + /** Mappings are not comp-normalized but have a comp boundary before. */ + IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE, + /** Mappings do not have a comp boundary before. */ + IX_MIN_NO_NO_COMP_NO_MAYBE_CC, + /** Mappings to the empty string. */ + IX_MIN_NO_NO_EMPTY, + + IX_MIN_LCCC_CP, + IX_RESERVED19, + IX_COUNT + }; + + enum { + MAPPING_HAS_CCC_LCCC_WORD=0x80, + MAPPING_HAS_RAW_MAPPING=0x40, + // unused bit 0x20, + MAPPING_LENGTH_MASK=0x1f + }; + + enum { + COMP_1_LAST_TUPLE=0x8000, + COMP_1_TRIPLE=1, + COMP_1_TRAIL_LIMIT=0x3400, + COMP_1_TRAIL_MASK=0x7ffe, + COMP_1_TRAIL_SHIFT=9, // 10-1 for the "triple" bit + COMP_2_TRAIL_SHIFT=6, + COMP_2_TRAIL_MASK=0xffc0 + }; + + // higher-level functionality ------------------------------------------ *** + + // NFD without an NFD Normalizer2 instance. + UnicodeString &decompose(const UnicodeString &src, UnicodeString &dest, + UErrorCode &errorCode) const; + /** + * Decomposes [src, limit[ and writes the result to dest. + * limit can be NULL if src is NUL-terminated. + * destLengthEstimate is the initial dest buffer capacity and can be -1. + */ + void decompose(const char16_t *src, const char16_t *limit, + UnicodeString &dest, int32_t destLengthEstimate, + UErrorCode &errorCode) const; + + const char16_t *decompose(const char16_t *src, const char16_t *limit, + ReorderingBuffer *buffer, UErrorCode &errorCode) const; + void decomposeAndAppend(const char16_t *src, const char16_t *limit, + UBool doDecompose, + UnicodeString &safeMiddle, + ReorderingBuffer &buffer, + UErrorCode &errorCode) const; + + /** sink==nullptr: isNormalized()/spanQuickCheckYes() */ + const uint8_t *decomposeUTF8(uint32_t options, + const uint8_t *src, const uint8_t *limit, + ByteSink *sink, Edits *edits, UErrorCode &errorCode) const; + + UBool compose(const char16_t *src, const char16_t *limit, + UBool onlyContiguous, + UBool doCompose, + ReorderingBuffer &buffer, + UErrorCode &errorCode) const; + const char16_t *composeQuickCheck(const char16_t *src, const char16_t *limit, + UBool onlyContiguous, + UNormalizationCheckResult *pQCResult) const; + void composeAndAppend(const char16_t *src, const char16_t *limit, + UBool doCompose, + UBool onlyContiguous, + UnicodeString &safeMiddle, + ReorderingBuffer &buffer, + UErrorCode &errorCode) const; + + /** sink==nullptr: isNormalized() */ + UBool composeUTF8(uint32_t options, UBool onlyContiguous, + const uint8_t *src, const uint8_t *limit, + ByteSink *sink, icu::Edits *edits, UErrorCode &errorCode) const; + + const char16_t *makeFCD(const char16_t *src, const char16_t *limit, + ReorderingBuffer *buffer, UErrorCode &errorCode) const; + void makeFCDAndAppend(const char16_t *src, const char16_t *limit, + UBool doMakeFCD, + UnicodeString &safeMiddle, + ReorderingBuffer &buffer, + UErrorCode &errorCode) const; + + UBool hasDecompBoundaryBefore(UChar32 c) const; + UBool norm16HasDecompBoundaryBefore(uint16_t norm16) const; + UBool hasDecompBoundaryAfter(UChar32 c) const; + UBool norm16HasDecompBoundaryAfter(uint16_t norm16) const; + UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); } + + UBool hasCompBoundaryBefore(UChar32 c) const { + return c=minMaybeYes; } + static UBool isInert(uint16_t norm16) { return norm16==INERT; } + static UBool isJamoL(uint16_t norm16) { return norm16==JAMO_L; } + static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; } + uint16_t hangulLVT() const { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; } + UBool isHangulLV(uint16_t norm16) const { return norm16==minYesNo; } + UBool isHangulLVT(uint16_t norm16) const { + return norm16==hangulLVT(); + } + UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16=MIN_YES_YES_WITH_CC || norm16=limitNoNo; } + + // For use with isCompYes(). + // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. + // static uint8_t getCCFromYes(uint16_t norm16) { + // return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0; + // } + uint8_t getCCFromNoNo(uint16_t norm16) const { + const uint16_t *mapping=getMapping(norm16); + if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) { + return (uint8_t)*(mapping-1); + } else { + return 0; + } + } + // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() + uint8_t getTrailCCFromCompYesAndZeroCC(uint16_t norm16) const { + if(norm16<=minYesNo) { + return 0; // yesYes and Hangul LV have ccc=tccc=0 + } else { + // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here. + return (uint8_t)(*getMapping(norm16)>>8); // tccc from yesNo + } + } + uint8_t getPreviousTrailCC(const char16_t *start, const char16_t *p) const; + uint8_t getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const; + + // Requires algorithmic-NoNo. + UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const { + return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta; + } + UChar32 getAlgorithmicDelta(uint16_t norm16) const { + return (norm16>>DELTA_SHIFT)-centerNoNoDelta; + } + + // Requires minYesNo>OFFSET_SHIFT); } + const uint16_t *getCompositionsListForDecompYes(uint16_t norm16) const { + if(norm16>OFFSET_SHIFT); + } + /** + * @param c code point must have compositions + * @return compositions list pointer + */ + const uint16_t *getCompositionsList(uint16_t norm16) const { + return isDecompYes(norm16) ? + getCompositionsListForDecompYes(norm16) : + getCompositionsListForComposite(norm16); + } + + const char16_t *copyLowPrefixFromNulTerminated(const char16_t *src, + UChar32 minNeedDataCP, + ReorderingBuffer *buffer, + UErrorCode &errorCode) const; + + enum StopAt { STOP_AT_LIMIT, STOP_AT_DECOMP_BOUNDARY, STOP_AT_COMP_BOUNDARY }; + + const char16_t *decomposeShort(const char16_t *src, const char16_t *limit, + UBool stopAtCompBoundary, UBool onlyContiguous, + ReorderingBuffer &buffer, UErrorCode &errorCode) const; + UBool decompose(UChar32 c, uint16_t norm16, + ReorderingBuffer &buffer, UErrorCode &errorCode) const; + + const uint8_t *decomposeShort(const uint8_t *src, const uint8_t *limit, + StopAt stopAt, UBool onlyContiguous, + ReorderingBuffer &buffer, UErrorCode &errorCode) const; + + static int32_t combine(const uint16_t *list, UChar32 trail); + void addComposites(const uint16_t *list, UnicodeSet &set) const; + void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex, + UBool onlyContiguous) const; + + UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const { + return c