1 files changed, 174 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/utf8collationiterator.h b/intl/icu/source/i18n/utf8collationiterator.h
new file mode 100644
index 0000000000..13ca87846b
--- /dev/null
+++ b/intl/icu/source/i18n/utf8collationiterator.h
@@ -0,0 +1,174 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+* Copyright (C) 2012-2016, International Business Machines
+* Corporation and others.  All Rights Reserved.
+*******************************************************************************
+* utf8collationiterator.h
+*
+* created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
+* created by: Markus W. Scherer
+*/
+
+#ifndef __UTF8COLLATIONITERATOR_H__
+#define __UTF8COLLATIONITERATOR_H__
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_COLLATION
+
+#include "cmemory.h"
+#include "collation.h"
+#include "collationdata.h"
+#include "collationiterator.h"
+#include "normalizer2impl.h"
+
+U_NAMESPACE_BEGIN
+
+/**
+ * UTF-8 collation element and character iterator.
+ * Handles normalized UTF-8 text inline, with length or NUL-terminated.
+ * Unnormalized text is handled by a subclass.
+ */
+class U_I18N_API UTF8CollationIterator : public CollationIterator {
+public:
+    UTF8CollationIterator(const CollationData *d, UBool numeric,
+                          const uint8_t *s, int32_t p, int32_t len)
+            : CollationIterator(d, numeric),
+              u8(s), pos(p), length(len) {}
+
+    virtual ~UTF8CollationIterator();
+
+    virtual void resetToOffset(int32_t newOffset) override;
+
+    virtual int32_t getOffset() const override;
+
+    virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;
+
+    virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;
+
+protected:
+    /**
+     * For byte sequences that are illegal in UTF-8, an error value may be returned
+     * together with a bogus code point. The caller will ignore that code point.
+     *
+     * Special values may be returned for surrogate code points, which are also illegal in UTF-8,
+     * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns true.
+     *
+     * Valid lead surrogates are returned from inside a normalized text segment,
+     * where handleGetTrailSurrogate() will return the matching trail surrogate.
+     */
+    virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override;
+
+    virtual UBool foundNULTerminator() override;
+
+    virtual UBool forbidSurrogateCodePoints() const override;
+
+    virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
+
+    virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
+
+    const uint8_t *u8;
+    int32_t pos;
+    int32_t length;  // <0 for NUL-terminated strings
+};
+
+/**
+ * Incrementally checks the input text for FCD and normalizes where necessary.
+ */
+class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator {
+public:
+    FCDUTF8CollationIterator(const CollationData *data, UBool numeric,
+                             const uint8_t *s, int32_t p, int32_t len)
+            : UTF8CollationIterator(data, numeric, s, p, len),
+              state(CHECK_FWD), start(p),
+              nfcImpl(data->nfcImpl) {}
+
+    virtual ~FCDUTF8CollationIterator();
+
+    virtual void resetToOffset(int32_t newOffset) override;
+
+    virtual int32_t getOffset() const override;
+
+    virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;
+
+    virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;
+
+protected:
+    virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override;
+
+    virtual char16_t handleGetTrailSurrogate() override;
+
+    virtual UBool foundNULTerminator() override;
+
+    virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
+
+    virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
+
+private:
+    UBool nextHasLccc() const;
+    UBool previousHasTccc() const;
+
+    /**
+     * Switches to forward checking if possible.
+     */
+    void switchToForward();
+
+    /**
+     * Extends the FCD text segment forward or normalizes around pos.
+     * @return true if success
+     */
+    UBool nextSegment(UErrorCode &errorCode);
+
+    /**
+     * Switches to backward checking.
+     */
+    void switchToBackward();
+
+    /**
+     * Extends the FCD text segment backward or normalizes around pos.
+     * @return true if success
+     */
+    UBool previousSegment(UErrorCode &errorCode);
+
+    UBool normalize(const UnicodeString &s, UErrorCode &errorCode);
+
+    enum State {
+        /**
+         * The input text [start..pos[ passes the FCD check.
+         * Moving forward checks incrementally.
+         * limit is undefined.
+         */
+        CHECK_FWD,
+        /**
+         * The input text [pos..limit[ passes the FCD check.
+         * Moving backward checks incrementally.
+         * start is undefined.
+         */
+        CHECK_BWD,
+        /**
+         * The input text [start..limit[ passes the FCD check.
+         * pos tracks the current text index.
+         */
+        IN_FCD_SEGMENT,
+        /**
+         * The input text [start..limit[ failed the FCD check and was normalized.
+         * pos tracks the current index in the normalized string.
+         */
+        IN_NORMALIZED
+    };
+
+    State state;
+
+    int32_t start;
+    int32_t limit;
+
+    const Normalizer2Impl &nfcImpl;
+    UnicodeString normalized;
+};
+
+U_NAMESPACE_END
+
+#endif  // !UCONFIG_NO_COLLATION
+#endif  // __UTF8COLLATIONITERATOR_H__