summary refs log tree commit diff stats
path: root/intl/icu/source/i18n/collationruleparser.h
diff options
context:
space:
mode:
Diffstat (limited to 'intl/icu/source/i18n/collationruleparser.h')
-rw-r--r-- intl/icu/source/i18n/collationruleparser.h 197
1 files changed, 197 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/collationruleparser.h b/intl/icu/source/i18n/collationruleparser.h
new file mode 100644
index 0000000000..e124881fcf
--- /dev/null
+++ b/intl/icu/source/i18n/collationruleparser.h
@@ -0,0 +1,197 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+* Copyright (C) 2013-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* collationruleparser.h
+*
+* created on: 2013apr10
+* created by: Markus W. Scherer
+*/
+
+#ifndef __COLLATIONRULEPARSER_H__
+#define __COLLATIONRULEPARSER_H__
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_COLLATION
+
+#include "unicode/ucol.h"
+#include "unicode/uniset.h"
+#include "unicode/unistr.h"
+
+struct UParseError;
+
+U_NAMESPACE_BEGIN
+
+struct CollationData;
+struct CollationTailoring;
+
+class Locale;
+class Normalizer2;
+
+struct CollationSettings;
+
+class U_I18N_API CollationRuleParser : public UMemory {  // Parser for collation rule strings; see parse(), Sink and Importer below.
+public:
+ /** Special reset positions. */
+ enum Position {  // Enumerator values are added to POS_BASE to encode a position (see POS_LEAD/POS_BASE).
+ FIRST_TERTIARY_IGNORABLE,
+ LAST_TERTIARY_IGNORABLE,
+ FIRST_SECONDARY_IGNORABLE,
+ LAST_SECONDARY_IGNORABLE,
+ FIRST_PRIMARY_IGNORABLE,
+ LAST_PRIMARY_IGNORABLE,
+ FIRST_VARIABLE,
+ LAST_VARIABLE,
+ FIRST_REGULAR,
+ LAST_REGULAR,
+ FIRST_IMPLICIT,
+ LAST_IMPLICIT,
+ FIRST_TRAILING,
+ LAST_TRAILING
+ };
+
+ /**
+ * First character of contractions that encode special reset positions.
+ * U+FFFE cannot be tailored via rule syntax.
+ *
+ * The second contraction character is POS_BASE + Position.
+ */
+ static const UChar POS_LEAD = 0xfffe;
+ /**
+ * Base for the second character of contractions that encode special reset positions.
+ * Braille characters U+28xx are printable and normalization-inert.
+ * @see POS_LEAD
+ */
+ static const UChar POS_BASE = 0x2800;
+
+ class U_I18N_API Sink : public UObject {  // Abstract interface supplied via setSink(); addReset() and addRelation() are pure virtual.
+ public:
+ virtual ~Sink();
+ /**
+ * Adds a reset.
+ * strength=UCOL_IDENTICAL for &str.
+ * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3.
+ */
+ virtual void addReset(int32_t strength, const UnicodeString &str,
+ const char *&errorReason, UErrorCode &errorCode) = 0;
+ /**
+ * Adds a relation with strength and prefix | str / extension.
+ */
+ virtual void addRelation(int32_t strength, const UnicodeString &prefix,
+ const UnicodeString &str, const UnicodeString &extension,
+ const char *&errorReason, UErrorCode &errorCode) = 0;
+
+ virtual void suppressContractions(const UnicodeSet &set, const char *&errorReason,
+ UErrorCode &errorCode);  // Not pure virtual: overriding is optional.
+
+ virtual void optimize(const UnicodeSet &set, const char *&errorReason,
+ UErrorCode &errorCode);  // Not pure virtual: overriding is optional.
+ };
+
+ class U_I18N_API Importer : public UObject {  // Abstract interface supplied via setImporter(); needed for [import locale] syntax (see constructor comment).
+ public:
+ virtual ~Importer();
+ virtual void getRules(
+ const char *localeID, const char *collationType,
+ UnicodeString &rules,
+ const char *&errorReason, UErrorCode &errorCode) = 0;  // NOTE(review): presumably fills `rules` for localeID/collationType - confirm in implementers.
+ };
+
+ /**
+ * Constructor.
+ * The Sink must be set before parsing.
+ * The Importer can be set, otherwise [import locale] syntax is not supported.
+ */
+ CollationRuleParser(const CollationData *base, UErrorCode &errorCode);
+ ~CollationRuleParser();
+
+ /**
+ * Sets the pointer to a Sink object.
+ * The pointer is aliased: Pointer copy without cloning or taking ownership.
+ */
+ void setSink(Sink *sinkAlias) {
+ sink = sinkAlias;
+ }
+
+ /**
+ * Sets the pointer to an Importer object.
+ * The pointer is aliased: Pointer copy without cloning or taking ownership.
+ */
+ void setImporter(Importer *importerAlias) {
+ importer = importerAlias;
+ }
+
+ void parse(const UnicodeString &ruleString,  // Public parse entry point; a private two-argument overload is declared further down.
+ CollationSettings &outSettings,
+ UParseError *outParseError,  // NOTE(review): passed by pointer, presumably optional (may be NULL) - confirm in the .cpp.
+ UErrorCode &errorCode);
+
+ const char *getErrorReason() const { return errorReason; }  // Accessor for the errorReason member.
+
+ /**
+ * Gets a script or reorder code from its string representation.
+ * @return the script/reorder code, or
+ * -1 if not recognized
+ */
+ static int32_t getReorderCode(const char *word);
+
+private:
+ /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */
+ static const int32_t STRENGTH_MASK = 0xf;
+ static const int32_t STARRED_FLAG = 0x10;  // First bit above STRENGTH_MASK.
+ static const int32_t OFFSET_SHIFT = 8;  // NOTE(review): these three look like the bit layout of parseRelationOperator()'s result - confirm in the .cpp.
+
+ void parse(const UnicodeString &ruleString, UErrorCode &errorCode);  // Private overload without the settings/parse-error outputs.
+ void parseRuleChain(UErrorCode &errorCode);
+ int32_t parseResetAndPosition(UErrorCode &errorCode);
+ int32_t parseRelationOperator(UErrorCode &errorCode);
+ void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode);
+ void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode);
+ int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
+ int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
+
+ /**
+ * Sets str to a contraction of U+FFFE and (U+2800 + Position).
+ * @return rule index after the special reset position
+ */
+ int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode);
+ void parseSetting(UErrorCode &errorCode);
+ void parseReordering(const UnicodeString &raw, UErrorCode &errorCode);
+ static UColAttributeValue getOnOffValue(const UnicodeString &s);
+
+ int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode);
+ int32_t readWords(int32_t i, UnicodeString &raw) const;
+ int32_t skipComment(int32_t i) const;
+
+ void setParseError(const char *reason, UErrorCode &errorCode);
+ void setErrorContext();
+
+ /**
+ * ASCII [:P:] and [:S:]:
+ * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
+ */
+ static UBool isSyntaxChar(UChar32 c);
+ int32_t skipWhiteSpace(int32_t i) const;
+
+ const Normalizer2 &nfd, &nfc;  // Reference members: must be bound in the constructor's initializer list.
+
+ const UnicodeString *rules;  // NOTE(review): presumably the rule string currently being parsed - confirm in the .cpp.
+ const CollationData *const baseData;  // Const pointer: cannot be reseated after construction.
+ CollationSettings *settings;
+ UParseError *parseError;
+ const char *errorReason;  // Exposed read-only through getErrorReason().
+
+ Sink *sink;  // Aliased via setSink(); not owned.
+ Importer *importer;  // Aliased via setImporter(); not owned.
+
+ int32_t ruleIndex;  // NOTE(review): presumably the current parse offset into *rules - confirm in the .cpp.
+};
+
+U_NAMESPACE_END
+
+#endif // !UCONFIG_NO_COLLATION
+#endif // __COLLATIONRULEPARSER_H__