From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 21:33:14 +0200 Subject: Adding upstream version 115.7.0esr. Signed-off-by: Daniel Baumann --- intl/icu/source/i18n/collationruleparser.cpp | 881 +++++++++++++++++++++++++++ 1 file changed, 881 insertions(+) create mode 100644 intl/icu/source/i18n/collationruleparser.cpp (limited to 'intl/icu/source/i18n/collationruleparser.cpp') diff --git a/intl/icu/source/i18n/collationruleparser.cpp b/intl/icu/source/i18n/collationruleparser.cpp new file mode 100644 index 0000000000..4cc25a1f5c --- /dev/null +++ b/intl/icu/source/i18n/collationruleparser.cpp @@ -0,0 +1,881 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2013-2015, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* collationruleparser.cpp +* +* (replaced the former ucol_tok.cpp) +* +* created on: 2013apr10 +* created by: Markus W. Scherer +*/ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_COLLATION + +#include "unicode/normalizer2.h" +#include "unicode/parseerr.h" +#include "unicode/uchar.h" +#include "unicode/ucol.h" +#include "unicode/uloc.h" +#include "unicode/unistr.h" +#include "unicode/utf16.h" +#include "charstr.h" +#include "cmemory.h" +#include "collation.h" +#include "collationdata.h" +#include "collationruleparser.h" +#include "collationsettings.h" +#include "collationtailoring.h" +#include "cstring.h" +#include "patternprops.h" +#include "uassert.h" +#include "uvectr32.h" + +U_NAMESPACE_BEGIN + +namespace { + +static const char16_t BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 }; // "[before" +const int32_t BEFORE_LENGTH = 7; + +} // namespace + +CollationRuleParser::Sink::~Sink() {} + +void +CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {} + +void +CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {} + +CollationRuleParser::Importer::~Importer() {} + +CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode) + : nfd(*Normalizer2::getNFDInstance(errorCode)), + nfc(*Normalizer2::getNFCInstance(errorCode)), + rules(nullptr), baseData(base), settings(nullptr), + parseError(nullptr), errorReason(nullptr), + sink(nullptr), importer(nullptr), + ruleIndex(0) { +} + +CollationRuleParser::~CollationRuleParser() { +} + +void +CollationRuleParser::parse(const UnicodeString &ruleString, + CollationSettings &outSettings, + UParseError *outParseError, + UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return; } + settings = &outSettings; + parseError = outParseError; + if(parseError != nullptr) { + parseError->line = 0; + parseError->offset = -1; + parseError->preContext[0] = 0; + parseError->postContext[0] = 0; + } + errorReason = nullptr; + parse(ruleString, errorCode); +} + +void +CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return; } + rules = &ruleString; + ruleIndex = 0; + + while(ruleIndex < rules->length()) { + char16_t c = rules->charAt(ruleIndex); + if(PatternProps::isWhiteSpace(c)) { + ++ruleIndex; + continue; + } + switch(c) { + case 0x26: // '&' + parseRuleChain(errorCode); + break; + case 0x5b: // '[' + parseSetting(errorCode); + break; + case 0x23: // '#' starts a comment, until the end of the line + ruleIndex = skipComment(ruleIndex + 1); + break; + case 0x40: // '@' is equivalent to [backwards 2] + settings->setFlag(CollationSettings::BACKWARD_SECONDARY, + UCOL_ON, 0, errorCode); + ++ruleIndex; + break; + case 0x21: // '!' used to turn on Thai/Lao character reversal + // Accept but ignore. The root collator has contractions + // that are equivalent to the character reversal, where appropriate. + ++ruleIndex; + break; + default: + setParseError("expected a reset or setting or comment", errorCode); + break; + } + if(U_FAILURE(errorCode)) { return; } + } +} + +void +CollationRuleParser::parseRuleChain(UErrorCode &errorCode) { + int32_t resetStrength = parseResetAndPosition(errorCode); + UBool isFirstRelation = true; + for(;;) { + int32_t result = parseRelationOperator(errorCode); + if(U_FAILURE(errorCode)) { return; } + if(result < 0) { + if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) { + // '#' starts a comment, until the end of the line + ruleIndex = skipComment(ruleIndex + 1); + continue; + } + if(isFirstRelation) { + setParseError("reset not followed by a relation", errorCode); + } + return; + } + int32_t strength = result & STRENGTH_MASK; + if(resetStrength < UCOL_IDENTICAL) { + // reset-before rule chain + if(isFirstRelation) { + if(strength != resetStrength) { + setParseError("reset-before strength differs from its first relation", errorCode); + return; + } + } else { + if(strength < resetStrength) { + setParseError("reset-before strength followed by a stronger relation", errorCode); + return; + } + } + } + int32_t i = ruleIndex + (result >> OFFSET_SHIFT); // skip over the relation operator + if((result & STARRED_FLAG) == 0) { + parseRelationStrings(strength, i, errorCode); + } else { + parseStarredCharacters(strength, i, errorCode); + } + if(U_FAILURE(errorCode)) { return; } + isFirstRelation = false; + } +} + +int32_t +CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; } + int32_t i = skipWhiteSpace(ruleIndex + 1); + int32_t j; + char16_t c; + int32_t resetStrength; + if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 && + (j = i + BEFORE_LENGTH) < rules->length() && + PatternProps::isWhiteSpace(rules->charAt(j)) && + ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() && + 0x31 <= (c = rules->charAt(j)) && c <= 0x33 && + rules->charAt(j + 1) == 0x5d) { + // &[before n] with n=1 or 2 or 3 + resetStrength = UCOL_PRIMARY + (c - 0x31); + i = skipWhiteSpace(j + 2); + } else { + resetStrength = UCOL_IDENTICAL; + } + if(i >= rules->length()) { + setParseError("reset without position", errorCode); + return UCOL_DEFAULT; + } + UnicodeString str; + if(rules->charAt(i) == 0x5b) { // '[' + i = parseSpecialPosition(i, str, errorCode); + } else { + i = parseTailoringString(i, str, errorCode); + } + sink->addReset(resetStrength, str, errorReason, errorCode); + if(U_FAILURE(errorCode)) { setErrorContext(); } + ruleIndex = i; + return resetStrength; +} + +int32_t +CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; } + ruleIndex = skipWhiteSpace(ruleIndex); + if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; } + int32_t strength; + int32_t i = ruleIndex; + char16_t c = rules->charAt(i++); + switch(c) { + case 0x3c: // '<' + if(i < rules->length() && rules->charAt(i) == 0x3c) { // << + ++i; + if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<< + ++i; + if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<< + ++i; + strength = UCOL_QUATERNARY; + } else { + strength = UCOL_TERTIARY; + } + } else { + strength = UCOL_SECONDARY; + } + } else { + strength = UCOL_PRIMARY; + } + if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*' + ++i; + strength |= STARRED_FLAG; + } + break; + case 0x3b: // ';' same as << + strength = UCOL_SECONDARY; + break; + case 0x2c: // ',' same as <<< + strength = UCOL_TERTIARY; + break; + case 0x3d: // '=' + strength = UCOL_IDENTICAL; + if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*' + ++i; + strength |= STARRED_FLAG; + } + break; + default: + return UCOL_DEFAULT; + } + return ((i - ruleIndex) << OFFSET_SHIFT) | strength; +} + +void +CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) { + // Parse + // prefix | str / extension + // where prefix and extension are optional. + UnicodeString prefix, str, extension; + i = parseTailoringString(i, str, errorCode); + if(U_FAILURE(errorCode)) { return; } + char16_t next = (i < rules->length()) ? rules->charAt(i) : 0; + if(next == 0x7c) { // '|' separates the context prefix from the string. + prefix = str; + i = parseTailoringString(i + 1, str, errorCode); + if(U_FAILURE(errorCode)) { return; } + next = (i < rules->length()) ? rules->charAt(i) : 0; + } + if(next == 0x2f) { // '/' separates the string from the extension. + i = parseTailoringString(i + 1, extension, errorCode); + } + if(!prefix.isEmpty()) { + UChar32 prefix0 = prefix.char32At(0); + UChar32 c = str.char32At(0); + if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) { + setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary", + errorCode); + return; + } + } + sink->addRelation(strength, prefix, str, extension, errorReason, errorCode); + if(U_FAILURE(errorCode)) { setErrorContext(); } + ruleIndex = i; +} + +void +CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) { + UnicodeString empty, raw; + i = parseString(skipWhiteSpace(i), raw, errorCode); + if(U_FAILURE(errorCode)) { return; } + if(raw.isEmpty()) { + setParseError("missing starred-relation string", errorCode); + return; + } + UChar32 prev = -1; + int32_t j = 0; + for(;;) { + while(j < raw.length()) { + UChar32 c = raw.char32At(j); + if(!nfd.isInert(c)) { + setParseError("starred-relation string is not all NFD-inert", errorCode); + return; + } + sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode); + if(U_FAILURE(errorCode)) { + setErrorContext(); + return; + } + j += U16_LENGTH(c); + prev = c; + } + if(i >= rules->length() || rules->charAt(i) != 0x2d) { // '-' + break; + } + if(prev < 0) { + setParseError("range without start in starred-relation string", errorCode); + return; + } + i = parseString(i + 1, raw, errorCode); + if(U_FAILURE(errorCode)) { return; } + if(raw.isEmpty()) { + setParseError("range without end in starred-relation string", errorCode); + return; + } + UChar32 c = raw.char32At(0); + if(c < prev) { + setParseError("range start greater than end in starred-relation string", errorCode); + return; + } + // range prev-c + UnicodeString s; + while(++prev <= c) { + if(!nfd.isInert(prev)) { + setParseError("starred-relation string range is not all NFD-inert", errorCode); + return; + } + if(U_IS_SURROGATE(prev)) { + setParseError("starred-relation string range contains a surrogate", errorCode); + return; + } + if(0xfffd <= prev && prev <= 0xffff) { + setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode); + return; + } + s.setTo(prev); + sink->addRelation(strength, empty, s, empty, errorReason, errorCode); + if(U_FAILURE(errorCode)) { + setErrorContext(); + return; + } + } + prev = -1; + j = U16_LENGTH(c); + } + ruleIndex = skipWhiteSpace(i); +} + +int32_t +CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) { + i = parseString(skipWhiteSpace(i), raw, errorCode); + if(U_SUCCESS(errorCode) && raw.isEmpty()) { + setParseError("missing relation string", errorCode); + } + return skipWhiteSpace(i); +} + +int32_t +CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return i; } + raw.remove(); + while(i < rules->length()) { + UChar32 c = rules->charAt(i++); + if(isSyntaxChar(c)) { + if(c == 0x27) { // apostrophe + if(i < rules->length() && rules->charAt(i) == 0x27) { + // Double apostrophe, encodes a single one. + raw.append((char16_t)0x27); + ++i; + continue; + } + // Quote literal text until the next single apostrophe. + for(;;) { + if(i == rules->length()) { + setParseError("quoted literal text missing terminating apostrophe", errorCode); + return i; + } + c = rules->charAt(i++); + if(c == 0x27) { + if(i < rules->length() && rules->charAt(i) == 0x27) { + // Double apostrophe inside quoted literal text, + // still encodes a single apostrophe. + ++i; + } else { + break; + } + } + raw.append((char16_t)c); + } + } else if(c == 0x5c) { // backslash + if(i == rules->length()) { + setParseError("backslash escape at the end of the rule string", errorCode); + return i; + } + c = rules->char32At(i); + raw.append(c); + i += U16_LENGTH(c); + } else { + // Any other syntax character terminates a string. + --i; + break; + } + } else if(PatternProps::isWhiteSpace(c)) { + // Unquoted white space terminates a string. + --i; + break; + } else { + raw.append((char16_t)c); + } + } + for(int32_t j = 0; j < raw.length();) { + UChar32 c = raw.char32At(j); + if(U_IS_SURROGATE(c)) { + setParseError("string contains an unpaired surrogate", errorCode); + return i; + } + if(0xfffd <= c && c <= 0xffff) { + setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode); + return i; + } + j += U16_LENGTH(c); + } + return i; +} + +namespace { + +static const char *const positions[] = { + "first tertiary ignorable", + "last tertiary ignorable", + "first secondary ignorable", + "last secondary ignorable", + "first primary ignorable", + "last primary ignorable", + "first variable", + "last variable", + "first regular", + "last regular", + "first implicit", + "last implicit", + "first trailing", + "last trailing" +}; + +} // namespace + +int32_t +CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return 0; } + UnicodeString raw; + int32_t j = readWords(i + 1, raw); + if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) { // words end with ] + ++j; + for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) { + if(raw == UnicodeString(positions[pos], -1, US_INV)) { + str.setTo((char16_t)POS_LEAD).append((char16_t)(POS_BASE + pos)); + return j; + } + } + if(raw == UNICODE_STRING_SIMPLE("top")) { + str.setTo((char16_t)POS_LEAD).append((char16_t)(POS_BASE + LAST_REGULAR)); + return j; + } + if(raw == UNICODE_STRING_SIMPLE("variable top")) { + str.setTo((char16_t)POS_LEAD).append((char16_t)(POS_BASE + LAST_VARIABLE)); + return j; + } + } + setParseError("not a valid special reset position", errorCode); + return i; +} + +void +CollationRuleParser::parseSetting(UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return; } + UnicodeString raw; + int32_t i = ruleIndex + 1; + int32_t j = readWords(i, raw); + if(j <= i || raw.isEmpty()) { + setParseError("expected a setting/option at '['", errorCode); + } + if(rules->charAt(j) == 0x5d) { // words end with ] + ++j; + if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) && + (raw.length() == 7 || raw.charAt(7) == 0x20)) { + parseReordering(raw, errorCode); + ruleIndex = j; + return; + } + if(raw == UNICODE_STRING_SIMPLE("backwards 2")) { + settings->setFlag(CollationSettings::BACKWARD_SECONDARY, + UCOL_ON, 0, errorCode); + ruleIndex = j; + return; + } + UnicodeString v; + int32_t valueIndex = raw.lastIndexOf((char16_t)0x20); + if(valueIndex >= 0) { + v.setTo(raw, valueIndex + 1); + raw.truncate(valueIndex); + } + if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) { + int32_t value = UCOL_DEFAULT; + char16_t c = v.charAt(0); + if(0x31 <= c && c <= 0x34) { // 1..4 + value = UCOL_PRIMARY + (c - 0x31); + } else if(c == 0x49) { // 'I' + value = UCOL_IDENTICAL; + } + if(value != UCOL_DEFAULT) { + settings->setStrength(value, 0, errorCode); + ruleIndex = j; + return; + } + } else if(raw == UNICODE_STRING_SIMPLE("alternate")) { + UColAttributeValue value = UCOL_DEFAULT; + if(v == UNICODE_STRING_SIMPLE("non-ignorable")) { + value = UCOL_NON_IGNORABLE; + } else if(v == UNICODE_STRING_SIMPLE("shifted")) { + value = UCOL_SHIFTED; + } + if(value != UCOL_DEFAULT) { + settings->setAlternateHandling(value, 0, errorCode); + ruleIndex = j; + return; + } + } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) { + int32_t value = UCOL_DEFAULT; + if(v == UNICODE_STRING_SIMPLE("space")) { + value = CollationSettings::MAX_VAR_SPACE; + } else if(v == UNICODE_STRING_SIMPLE("punct")) { + value = CollationSettings::MAX_VAR_PUNCT; + } else if(v == UNICODE_STRING_SIMPLE("symbol")) { + value = CollationSettings::MAX_VAR_SYMBOL; + } else if(v == UNICODE_STRING_SIMPLE("currency")) { + value = CollationSettings::MAX_VAR_CURRENCY; + } + if(value != UCOL_DEFAULT) { + settings->setMaxVariable(value, 0, errorCode); + settings->variableTop = baseData->getLastPrimaryForGroup( + UCOL_REORDER_CODE_FIRST + value); + U_ASSERT(settings->variableTop != 0); + ruleIndex = j; + return; + } + } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) { + UColAttributeValue value = UCOL_DEFAULT; + if(v == UNICODE_STRING_SIMPLE("off")) { + value = UCOL_OFF; + } else if(v == UNICODE_STRING_SIMPLE("lower")) { + value = UCOL_LOWER_FIRST; + } else if(v == UNICODE_STRING_SIMPLE("upper")) { + value = UCOL_UPPER_FIRST; + } + if(value != UCOL_DEFAULT) { + settings->setCaseFirst(value, 0, errorCode); + ruleIndex = j; + return; + } + } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) { + UColAttributeValue value = getOnOffValue(v); + if(value != UCOL_DEFAULT) { + settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode); + ruleIndex = j; + return; + } + } else if(raw == UNICODE_STRING_SIMPLE("normalization")) { + UColAttributeValue value = getOnOffValue(v); + if(value != UCOL_DEFAULT) { + settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode); + ruleIndex = j; + return; + } + } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) { + UColAttributeValue value = getOnOffValue(v); + if(value != UCOL_DEFAULT) { + settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode); + ruleIndex = j; + return; + } + } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) { + UColAttributeValue value = getOnOffValue(v); + if(value != UCOL_DEFAULT) { + if(value == UCOL_ON) { + setParseError("[hiraganaQ on] is not supported", errorCode); + } + ruleIndex = j; + return; + } + } else if(raw == UNICODE_STRING_SIMPLE("import")) { + CharString lang; + lang.appendInvariantChars(v, errorCode); + if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; } + // BCP 47 language tag -> ICU locale ID + char localeID[ULOC_FULLNAME_CAPACITY]; + int32_t parsedLength; + int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY, + &parsedLength, &errorCode); + if(U_FAILURE(errorCode) || + parsedLength != lang.length() || length >= ULOC_FULLNAME_CAPACITY) { + errorCode = U_ZERO_ERROR; + setParseError("expected language tag in [import langTag]", errorCode); + return; + } + // localeID minus all keywords + char baseID[ULOC_FULLNAME_CAPACITY]; + length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode); + if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) { + errorCode = U_ZERO_ERROR; + setParseError("expected language tag in [import langTag]", errorCode); + return; + } + if(length == 0) { + uprv_strcpy(baseID, "root"); + } else if(*baseID == '_') { + uprv_memmove(baseID + 3, baseID, length + 1); + uprv_memcpy(baseID, "und", 3); + } + // @collation=type, or length=0 if not specified + char collationType[ULOC_KEYWORDS_CAPACITY]; + length = uloc_getKeywordValue(localeID, "collation", + collationType, ULOC_KEYWORDS_CAPACITY, + &errorCode); + if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) { + errorCode = U_ZERO_ERROR; + setParseError("expected language tag in [import langTag]", errorCode); + return; + } + if(importer == nullptr) { + setParseError("[import langTag] is not supported", errorCode); + } else { + UnicodeString importedRules; + importer->getRules(baseID, length > 0 ? collationType : "standard", + importedRules, errorReason, errorCode); + if(U_FAILURE(errorCode)) { + if(errorReason == nullptr) { + errorReason = "[import langTag] failed"; + } + setErrorContext(); + return; + } + const UnicodeString *outerRules = rules; + int32_t outerRuleIndex = ruleIndex; + parse(importedRules, errorCode); + if(U_FAILURE(errorCode)) { + if(parseError != nullptr) { + parseError->offset = outerRuleIndex; + } + } + rules = outerRules; + ruleIndex = j; + } + return; + } + } else if(rules->charAt(j) == 0x5b) { // words end with [ + UnicodeSet set; + j = parseUnicodeSet(j, set, errorCode); + if(U_FAILURE(errorCode)) { return; } + if(raw == UNICODE_STRING_SIMPLE("optimize")) { + sink->optimize(set, errorReason, errorCode); + if(U_FAILURE(errorCode)) { setErrorContext(); } + ruleIndex = j; + return; + } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) { + sink->suppressContractions(set, errorReason, errorCode); + if(U_FAILURE(errorCode)) { setErrorContext(); } + ruleIndex = j; + return; + } + } + setParseError("not a valid setting/option", errorCode); +} + +void +CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return; } + int32_t i = 7; // after "reorder" + if(i == raw.length()) { + // empty [reorder] with no codes + settings->resetReordering(); + return; + } + // Parse the codes in [reorder aa bb cc]. + UVector32 reorderCodes(errorCode); + if(U_FAILURE(errorCode)) { return; } + CharString word; + while(i < raw.length()) { + ++i; // skip the word-separating space + int32_t limit = raw.indexOf((char16_t)0x20, i); + if(limit < 0) { limit = raw.length(); } + word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode); + if(U_FAILURE(errorCode)) { return; } + int32_t code = getReorderCode(word.data()); + if(code < 0) { + setParseError("unknown script or reorder code", errorCode); + return; + } + reorderCodes.addElement(code, errorCode); + if(U_FAILURE(errorCode)) { return; } + i = limit; + } + settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode); +} + +static const char *const gSpecialReorderCodes[] = { + "space", "punct", "symbol", "currency", "digit" +}; + +int32_t +CollationRuleParser::getReorderCode(const char *word) { + for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) { + if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) { + return UCOL_REORDER_CODE_FIRST + i; + } + } + int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word); + if(script >= 0) { + return script; + } + if(uprv_stricmp(word, "others") == 0) { + return UCOL_REORDER_CODE_OTHERS; // same as Zzzz = USCRIPT_UNKNOWN + } + return -1; +} + +UColAttributeValue +CollationRuleParser::getOnOffValue(const UnicodeString &s) { + if(s == UNICODE_STRING_SIMPLE("on")) { + return UCOL_ON; + } else if(s == UNICODE_STRING_SIMPLE("off")) { + return UCOL_OFF; + } else { + return UCOL_DEFAULT; + } +} + +int32_t +CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) { + // Collect a UnicodeSet pattern between a balanced pair of [brackets]. + int32_t level = 0; + int32_t j = i; + for(;;) { + if(j == rules->length()) { + setParseError("unbalanced UnicodeSet pattern brackets", errorCode); + return j; + } + char16_t c = rules->charAt(j++); + if(c == 0x5b) { // '[' + ++level; + } else if(c == 0x5d) { // ']' + if(--level == 0) { break; } + } + } + set.applyPattern(rules->tempSubStringBetween(i, j), errorCode); + if(U_FAILURE(errorCode)) { + errorCode = U_ZERO_ERROR; + setParseError("not a valid UnicodeSet pattern", errorCode); + return j; + } + j = skipWhiteSpace(j); + if(j == rules->length() || rules->charAt(j) != 0x5d) { + setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode); + return j; + } + return ++j; +} + +int32_t +CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const { + static const char16_t sp = 0x20; + raw.remove(); + i = skipWhiteSpace(i); + for(;;) { + if(i >= rules->length()) { return 0; } + char16_t c = rules->charAt(i); + if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) { // syntax except -_ + if(raw.isEmpty()) { return i; } + if(raw.endsWith(&sp, 1)) { // remove trailing space + raw.truncate(raw.length() - 1); + } + return i; + } + if(PatternProps::isWhiteSpace(c)) { + raw.append(sp); + i = skipWhiteSpace(i + 1); + } else { + raw.append(c); + ++i; + } + } +} + +int32_t +CollationRuleParser::skipComment(int32_t i) const { + // skip to past the newline + while(i < rules->length()) { + char16_t c = rules->charAt(i++); + // LF or FF or CR or NEL or LS or PS + if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) { + // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS." + // NLF (new line function) = CR or LF or CR+LF or NEL. + // No need to collect all of CR+LF because a following LF will be ignored anyway. + break; + } + } + return i; +} + +void +CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return; } + // Error code consistent with the old parser (from ca. 2001), + // rather than U_PARSE_ERROR; + errorCode = U_INVALID_FORMAT_ERROR; + errorReason = reason; + if(parseError != nullptr) { setErrorContext(); } +} + +void +CollationRuleParser::setErrorContext() { + if(parseError == nullptr) { return; } + + // Note: This relies on the calling code maintaining the ruleIndex + // at a position that is useful for debugging. + // For example, at the beginning of a reset or relation etc. + parseError->offset = ruleIndex; + parseError->line = 0; // We are not counting line numbers. + + // before ruleIndex + int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1); + if(start < 0) { + start = 0; + } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) { + ++start; + } + int32_t length = ruleIndex - start; + rules->extract(start, length, parseError->preContext); + parseError->preContext[length] = 0; + + // starting from ruleIndex + length = rules->length() - ruleIndex; + if(length >= U_PARSE_CONTEXT_LEN) { + length = U_PARSE_CONTEXT_LEN - 1; + if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) { + --length; + } + } + rules->extract(ruleIndex, length, parseError->postContext); + parseError->postContext[length] = 0; +} + +UBool +CollationRuleParser::isSyntaxChar(UChar32 c) { + return 0x21 <= c && c <= 0x7e && + (c <= 0x2f || (0x3a <= c && c <= 0x40) || + (0x5b <= c && c <= 0x60) || (0x7b <= c)); +} + +int32_t +CollationRuleParser::skipWhiteSpace(int32_t i) const { + while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) { + ++i; + } + return i; +} + +U_NAMESPACE_END + +#endif // !UCONFIG_NO_COLLATION -- cgit v1.2.3