diff options
Diffstat (limited to 'intl/icu/source/i18n/usrchimp.h')
-rw-r--r-- | intl/icu/source/i18n/usrchimp.h | 243 |
1 files changed, 243 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/usrchimp.h b/intl/icu/source/i18n/usrchimp.h new file mode 100644 index 0000000000..13d825f73b --- /dev/null +++ b/intl/icu/source/i18n/usrchimp.h @@ -0,0 +1,243 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +********************************************************************** +* Copyright (C) 2001-2015 IBM and others. All rights reserved. +********************************************************************** +* Date Name Description +* 08/13/2001 synwee Creation. +********************************************************************** +*/ +#ifndef USRCHIMP_H +#define USRCHIMP_H + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_COLLATION + +#include "unicode/normalizer2.h" +#include "unicode/ucol.h" +#include "unicode/ucoleitr.h" +#include "unicode/ubrk.h" + +/* mask off anything but primary order */ +#define UCOL_PRIMARYORDERMASK 0xffff0000 +/* mask off anything but secondary order */ +#define UCOL_SECONDARYORDERMASK 0x0000ff00 +/* mask off anything but tertiary order */ +#define UCOL_TERTIARYORDERMASK 0x000000ff +/* primary order shift */ +#define UCOL_PRIMARYORDERSHIFT 16 +/* secondary order shift */ +#define UCOL_SECONDARYORDERSHIFT 8 + +#define UCOL_IGNORABLE 0 + +/* get weights from a CE */ +#define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff) +#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT) +#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK) + +#define UCOL_CONTINUATION_MARKER 0xC0 + +#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER) + +/** + * This indicates an error has occurred during processing or there are no more CEs + * to be returned. + */ +#define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX) + +U_NAMESPACE_BEGIN + +class CollationElementIterator; +class Collator; + +struct PCEI +{ + uint64_t ce; + int32_t low; + int32_t high; +}; + +struct PCEBuffer +{ + PCEI defaultBuffer[16]; + PCEI *buffer; + int32_t bufferIndex; + int32_t bufferSize; + + PCEBuffer(); + ~PCEBuffer(); + + void reset(); + UBool isEmpty() const; + void put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode); + const PCEI *get(); +}; + +class UCollationPCE : public UMemory { +private: + PCEBuffer pceBuffer; + CollationElementIterator *cei; + UCollationStrength strength; + UBool toShift; + UBool isShifted; + uint32_t variableTop; + +public: + UCollationPCE(UCollationElements *elems); + UCollationPCE(CollationElementIterator *iter); + ~UCollationPCE(); + + void init(UCollationElements *elems); + void init(CollationElementIterator *iter); + + /** + * Get the processed ordering priority of the next collation element in the text. + * A single character may contain more than one collation element. + * + * @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE. + * @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE. + * @param status A pointer to an UErrorCode to receive any errors. + * @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER + * if an error has occurred or if the end of string has been reached + */ + int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status); + /** + * Get the processed ordering priority of the previous collation element in the text. + * A single character may contain more than one collation element. + * + * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE + * @param ixHigh A pointer to an int32_t to receiver the iterator index before fetching the CE + * @param status A pointer to an UErrorCode to receive any errors. Notably + * a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack + * buffer has been exhausted. + * @return The previous collation elements ordering, otherwise returns + * UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of + * string has been reached. + */ + int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status); + +private: + void init(const Collator &coll); + uint64_t processCE(uint32_t ce); +}; + +U_NAMESPACE_END + +#define INITIAL_ARRAY_SIZE_ 256 + +struct USearch { + // required since collation element iterator does not have a getText API + const UChar *text; + int32_t textLength; // exact length + UBool isOverlap; + UBool isCanonicalMatch; + int16_t elementComparisonType; + UBreakIterator *internalBreakIter; // internal character breakiterator, lazily created. + UBreakIterator *breakIter; // caller provided character breakiterator + // value USEARCH_DONE is the default value + // if we are not at the start of the text or the end of the text, + // depending on the iteration direction and matchedIndex is USEARCH_DONE + // it means that we can't find any more matches in that particular direction + int32_t matchedIndex; + int32_t matchedLength; + UBool isForwardSearching; + UBool reset; +}; + +struct UPattern { + const UChar *text; + int32_t textLength; // exact length + // length required for backwards ce comparison + int32_t cesLength; + int32_t *ces; + int32_t cesBuffer[INITIAL_ARRAY_SIZE_]; + int32_t pcesLength; + int64_t *pces; + int64_t pcesBuffer[INITIAL_ARRAY_SIZE_]; + UBool hasPrefixAccents; + UBool hasSuffixAccents; +}; + +struct UStringSearch { + struct USearch *search; + struct UPattern pattern; + const UCollator *collator; + const icu::Normalizer2 *nfd; + // positions within the collation element iterator is used to determine + // if we are at the start of the text. + UCollationElements *textIter; + icu::UCollationPCE *textProcessedIter; + // utility collation element, used throughout program for temporary + // iteration. + UCollationElements *utilIter; + UBool ownCollator; + UCollationStrength strength; + uint32_t ceMask; + uint32_t variableTop; + UBool toShift; +}; + +/** +* Exact matches without checking for the ends for extra accents. +* The match after the position within the collation element iterator is to be +* found. +* After a match is found the offset in the collation element iterator will be +* shifted to the start of the match. +* Implementation note: +* For tertiary we can't use the collator->tertiaryMask, that is a +* preprocessed mask that takes into account case options. since we are only +* concerned with exact matches, we don't need that. +* Alternate handling - since only the 16 most significant digits is only used, +* we can safely do a compare without masking if the ce is a variable, we mask +* and get only the primary values no shifting to quartenary is required since +* all primary values less than variabletop will need to be masked off anyway. +* If the end character is composite and the pattern ce does not match the text +* ce, we skip it until we find a match in the end composite character or when +* it has passed the character. This is so that we can match pattern "a" with +* the text "\u00e6" +* @param strsrch string search data +* @param status error status if any +* @return true if an exact match is found, false otherwise +*/ +U_CFUNC +UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status); + +/** +* Canonical matches. +* According to the definition, matches found here will include the whole span +* of beginning and ending accents if it overlaps that region. +* @param strsrch string search data +* @param status error status if any +* @return true if a canonical match is found, false otherwise +*/ +U_CFUNC +UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status); + +/** +* Gets the previous match. +* Comments follows from handleNextExact +* @param strsrch string search data +* @param status error status if any +* @return True if a exact math is found, false otherwise. +*/ +U_CFUNC +UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status); + +/** +* Canonical matches. +* According to the definition, matches found here will include the whole span +* of beginning and ending accents if it overlaps that region. +* @param strsrch string search data +* @param status error status if any +* @return true if a canonical match is found, false otherwise +*/ +U_CFUNC +UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, + UErrorCode *status); + +#endif /* #if !UCONFIG_NO_COLLATION */ + +#endif |