summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/i18n/usrchimp.h
diff options
context:
space:
mode:
Diffstat (limited to 'intl/icu/source/i18n/usrchimp.h')
-rw-r--r--intl/icu/source/i18n/usrchimp.h243
1 files changed, 243 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/usrchimp.h b/intl/icu/source/i18n/usrchimp.h
new file mode 100644
index 0000000000..13d825f73b
--- /dev/null
+++ b/intl/icu/source/i18n/usrchimp.h
@@ -0,0 +1,243 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+**********************************************************************
+* Copyright (C) 2001-2015 IBM and others. All rights reserved.
+**********************************************************************
+* Date Name Description
+* 08/13/2001 synwee Creation.
+**********************************************************************
+*/
+#ifndef USRCHIMP_H
+#define USRCHIMP_H
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_COLLATION
+
+#include "unicode/normalizer2.h"
+#include "unicode/ucol.h"
+#include "unicode/ucoleitr.h"
+#include "unicode/ubrk.h"
+
+/* mask off anything but primary order (the upper 16 bits) */
+#define UCOL_PRIMARYORDERMASK 0xffff0000
+/* mask off anything but secondary order (bits 8-15) */
+#define UCOL_SECONDARYORDERMASK 0x0000ff00
+/* mask off anything but tertiary order (the low 8 bits) */
+#define UCOL_TERTIARYORDERMASK 0x000000ff
+/* primary order shift: right-aligns the primary weight at bit 0 */
+#define UCOL_PRIMARYORDERSHIFT 16
+/* secondary order shift: right-aligns the masked secondary weight at bit 0 */
+#define UCOL_SECONDARYORDERSHIFT 8
+
+#define UCOL_IGNORABLE 0 /* NOTE(review): presumably the value of an ignorable CE/weight; not used in this header - confirm at call sites */
+
+/* get weights from a CE; each macro yields its weight right-aligned at bit 0 */
+#define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff)
+#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)
+#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)
+
+#define UCOL_CONTINUATION_MARKER 0xC0 /* bits 6 and 7 */
+
+#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER) /* true only when both marker bits are set */
+
+/**
+ * This indicates that an error has occurred during processing or that there are no
+ * more CEs to be returned (see UCollationPCE::nextProcessed / previousProcessed).
+ */
+#define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX)
+
+U_NAMESPACE_BEGIN
+
+class CollationElementIterator; // forward declaration: only used through pointers in this header
+class Collator; // forward declaration: only used through a const reference in this header
+
+struct PCEI // element type stored by PCEBuffer and returned by PCEBuffer::get()
+{
+ uint64_t ce; // NOTE(review): presumably the processed CE value given to PCEBuffer::put() - confirm in the .cpp
+ int32_t low; // NOTE(review): presumably the ixLow index given to PCEBuffer::put() - confirm in the .cpp
+ int32_t high; // NOTE(review): presumably the ixHigh index given to PCEBuffer::put() - confirm in the .cpp
+};
+
+struct PCEBuffer // holds PCEI entries; comes with 16 entries of inline storage
+{
+ PCEI defaultBuffer[16]; // inline storage, part of the object itself
+ PCEI *buffer; // NOTE(review): presumably points at defaultBuffer until more room is needed - confirm in the .cpp
+ int32_t bufferIndex; // NOTE(review): presumably the number of entries currently held - confirm in the .cpp
+ int32_t bufferSize; // NOTE(review): presumably the capacity of `buffer`, in entries - confirm in the .cpp
+
+ PCEBuffer();
+ ~PCEBuffer();
+
+ void reset();
+ UBool isEmpty() const;
+ void put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode); // errors are reported through errorCode
+ const PCEI *get(); // NOTE(review): return value when the buffer is empty is not visible here - check before dereferencing
+};
+
+class UCollationPCE : public UMemory { // produces "processed" 64-bit collation elements from a collation element iterator
+private:
+ PCEBuffer pceBuffer; // NOTE(review): presumably the internal buffer mentioned in the previousProcessed() docs - confirm
+ CollationElementIterator *cei; // the iterator supplied through the constructor / init()
+ UCollationStrength strength;
+ UBool toShift;
+ UBool isShifted;
+ uint32_t variableTop;
+
+public:
+ UCollationPCE(UCollationElements *elems); // C-API handle flavour of the constructor
+ UCollationPCE(CollationElementIterator *iter); // C++ iterator flavour of the constructor
+ ~UCollationPCE();
+
+ void init(UCollationElements *elems);
+ void init(CollationElementIterator *iter);
+
+ /**
+ * Get the processed ordering priority of the next collation element in the text.
+ * A single character may contain more than one collation element.
+ *
+ * @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE.
+ * @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE.
+ * @param status A pointer to a UErrorCode to receive any errors.
+ * @return The next collation element's ordering, otherwise returns UCOL_PROCESSED_NULLORDER
+ * if an error has occurred or if the end of string has been reached
+ */
+ int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
+ /**
+ * Get the processed ordering priority of the previous collation element in the text.
+ * A single character may contain more than one collation element.
+ *
+ * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
+ * @param ixHigh A pointer to an int32_t to receive the iterator index before fetching the CE
+ * @param status A pointer to a UErrorCode to receive any errors. Notably
+ * a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
+ * buffer has been exhausted.
+ * @return The previous collation element's ordering, otherwise returns
+ * UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of
+ * string has been reached.
+ */
+ int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
+
+private:
+ void init(const Collator &coll); // NOTE(review): presumably shared setup behind the two public init() overloads - confirm
+ uint64_t processCE(uint32_t ce); // maps a 32-bit CE to a 64-bit value; details live in the .cpp
+};
+
+U_NAMESPACE_END
+
+#define INITIAL_ARRAY_SIZE_ 256 /* element count of UPattern::cesBuffer and UPattern::pcesBuffer */
+
+struct USearch {
+ // required since the collation element iterator does not have a getText API
+ const UChar *text;
+ int32_t textLength; // exact length
+ UBool isOverlap;
+ UBool isCanonicalMatch;
+ int16_t elementComparisonType;
+ UBreakIterator *internalBreakIter; // internal character breakiterator, lazily created.
+ UBreakIterator *breakIter; // caller provided character breakiterator
+ // USEARCH_DONE is the default value of matchedIndex.
+ // If we are not at the start or the end of the text (depending on the
+ // iteration direction) and matchedIndex is USEARCH_DONE, it means that
+ // no more matches can be found in that particular direction.
+ int32_t matchedIndex;
+ int32_t matchedLength;
+ UBool isForwardSearching;
+ UBool reset;
+};
+
+struct UPattern {
+ const UChar *text;
+ int32_t textLength; // exact length
+ // length required for backwards ce comparison
+ int32_t cesLength;
+ int32_t *ces; // NOTE(review): presumably points at cesBuffer unless more room was needed - confirm in the .cpp
+ int32_t cesBuffer[INITIAL_ARRAY_SIZE_]; // inline storage for 32-bit CEs
+ int32_t pcesLength;
+ int64_t *pces; // NOTE(review): presumably points at pcesBuffer unless more room was needed - confirm in the .cpp
+ int64_t pcesBuffer[INITIAL_ARRAY_SIZE_]; // inline storage for 64-bit processed CEs
+ UBool hasPrefixAccents;
+ UBool hasSuffixAccents;
+};
+
+struct UStringSearch {
+ struct USearch *search;
+ struct UPattern pattern; // stored by value inside this struct
+ const UCollator *collator;
+ const icu::Normalizer2 *nfd;
+ // positions within the collation element iterator are used to determine
+ // if we are at the start of the text.
+ UCollationElements *textIter;
+ icu::UCollationPCE *textProcessedIter;
+ // utility collation element iterator, used throughout the program for
+ // temporary iteration.
+ UCollationElements *utilIter;
+ UBool ownCollator; // NOTE(review): presumably true when this object must close `collator` - confirm in the .cpp
+ UCollationStrength strength;
+ uint32_t ceMask;
+ uint32_t variableTop;
+ UBool toShift;
+};
+
+/**
+* Exact matches without checking for the ends for extra accents.
+* The match after the position within the collation element iterator is to be
+* found.
+* After a match is found the offset in the collation element iterator will be
+* shifted to the start of the match.
+* Implementation note:
+* For tertiary we can't use the collator->tertiaryMask, that is a
+* preprocessed mask that takes into account case options. Since we are only
+* concerned with exact matches, we don't need that.
+* Alternate handling - since only the 16 most significant bits are used,
+* we can safely do a compare without masking. If the ce is a variable, we mask
+* and get only the primary values; no shifting to quaternary is required since
+* all primary values less than variabletop will need to be masked off anyway.
+* If the end character is composite and the pattern ce does not match the text
+* ce, we skip it until we find a match in the end composite character or when
+* it has passed the character. This is so that we can match pattern "a" with
+* the text "\u00e6"
+* @param strsrch string search data
+* @param status error status if any
+* @return true if an exact match is found, false otherwise
+*/
+U_CFUNC
+UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
+
+/**
+* Canonical matches; the "Next" counterpart of usearch_handlePreviousCanonical.
+* According to the definition, matches found here will include the whole span
+* of beginning and ending accents if it overlaps that region.
+* @param strsrch string search data
+* @param status error status if any
+* @return true if a canonical match is found, false otherwise
+*/
+U_CFUNC
+UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
+
+/**
+* Gets the previous match.
+* Comments follow from usearch_handleNextExact.
+* @param strsrch string search data
+* @param status error status if any
+* @return true if an exact match is found, false otherwise.
+*/
+U_CFUNC
+UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
+
+/**
+* Canonical matches; the "Previous" counterpart of usearch_handleNextCanonical.
+* According to the definition, matches found here will include the whole span
+* of beginning and ending accents if it overlaps that region.
+* @param strsrch string search data
+* @param status error status if any
+* @return true if a canonical match is found, false otherwise
+*/
+U_CFUNC
+UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
+ UErrorCode *status);
+
+#endif /* #if !UCONFIG_NO_COLLATION */
+
+#endif