diff options
Diffstat (limited to 'intl/icu/source/i18n/rbt.cpp')
-rw-r--r-- | intl/icu/source/i18n/rbt.cpp | 307 |
1 files changed, 307 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/rbt.cpp b/intl/icu/source/i18n/rbt.cpp new file mode 100644 index 0000000000..2176e89fdd --- /dev/null +++ b/intl/icu/source/i18n/rbt.cpp @@ -0,0 +1,307 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +********************************************************************** +* Copyright (C) 1999-2015, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* Date Name Description +* 11/17/99 aliu Creation. +********************************************************************** +*/ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_TRANSLITERATION + +#include "unicode/rep.h" +#include "unicode/uniset.h" +#include "rbt_pars.h" +#include "rbt_data.h" +#include "rbt_rule.h" +#include "rbt.h" +#include "mutex.h" +#include "umutex.h" + +U_NAMESPACE_BEGIN + +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator) + +static Replaceable *gLockedText = nullptr; + +void RuleBasedTransliterator::_construct(const UnicodeString& rules, + UTransDirection direction, + UParseError& parseError, + UErrorCode& status) { + fData = 0; + isDataOwned = true; + if (U_FAILURE(status)) { + return; + } + + TransliteratorParser parser(status); + parser.parse(rules, direction, parseError, status); + if (U_FAILURE(status)) { + return; + } + + if (parser.idBlockVector.size() != 0 || + parser.compoundFilter != nullptr || + parser.dataVector.size() == 0) { + status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT + return; + } + + fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0); + setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); +} + +/** + * Constructs a new transliterator from the given rules. + * @param id the id for the transliterator. + * @param rules rules, separated by ';' + * @param direction either FORWARD or REVERSE. + * @param adoptedFilter the filter for this transliterator. + * @param parseError Struct to receive information on position + * of error if an error is encountered + * @param status Output param set to success/failure code. + * @exception IllegalArgumentException if rules are malformed + * or direction is invalid. + */ +RuleBasedTransliterator::RuleBasedTransliterator( + const UnicodeString& id, + const UnicodeString& rules, + UTransDirection direction, + UnicodeFilter* adoptedFilter, + UParseError& parseError, + UErrorCode& status) : + Transliterator(id, adoptedFilter) { + _construct(rules, direction,parseError,status); +} + +/** + * Constructs a new transliterator from the given rules. + * @param id the id for the transliterator. + * @param rules rules, separated by ';' + * @param direction either FORWARD or REVERSE. + * @param adoptedFilter the filter for this transliterator. + * @param status Output param set to success/failure code. + * @exception IllegalArgumentException if rules are malformed + * or direction is invalid. + */ +/*RuleBasedTransliterator::RuleBasedTransliterator( + const UnicodeString& id, + const UnicodeString& rules, + UTransDirection direction, + UnicodeFilter* adoptedFilter, + UErrorCode& status) : + Transliterator(id, adoptedFilter) { + UParseError parseError; + _construct(rules, direction,parseError, status); +}*/ + +/** + * Convenience constructor with no filter. + */ +/*RuleBasedTransliterator::RuleBasedTransliterator( + const UnicodeString& id, + const UnicodeString& rules, + UTransDirection direction, + UErrorCode& status) : + Transliterator(id, 0) { + UParseError parseError; + _construct(rules, direction,parseError, status); +}*/ + +/** + * Convenience constructor with no filter and FORWARD direction. + */ +/*RuleBasedTransliterator::RuleBasedTransliterator( + const UnicodeString& id, + const UnicodeString& rules, + UErrorCode& status) : + Transliterator(id, 0) { + UParseError parseError; + _construct(rules, UTRANS_FORWARD, parseError, status); +}*/ + +/** + * Convenience constructor with FORWARD direction. + */ +/*RuleBasedTransliterator::RuleBasedTransliterator( + const UnicodeString& id, + const UnicodeString& rules, + UnicodeFilter* adoptedFilter, + UErrorCode& status) : + Transliterator(id, adoptedFilter) { + UParseError parseError; + _construct(rules, UTRANS_FORWARD,parseError, status); +}*/ + +RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, + const TransliterationRuleData* theData, + UnicodeFilter* adoptedFilter) : + Transliterator(id, adoptedFilter), + fData((TransliterationRuleData*)theData), // cast away const + isDataOwned(false) { + setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); +} + +/** + * Internal constructor. + */ +RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, + TransliterationRuleData* theData, + UBool isDataAdopted) : + Transliterator(id, 0), + fData(theData), + isDataOwned(isDataAdopted) { + setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); +} + +/** + * Copy constructor. + */ +RuleBasedTransliterator::RuleBasedTransliterator( + const RuleBasedTransliterator& other) : + Transliterator(other), fData(other.fData), + isDataOwned(other.isDataOwned) { + + // The data object may or may not be owned. If it is not owned we + // share it; it is invariant. If it is owned, it's still + // invariant, but we need to copy it to prevent double-deletion. + // If this becomes a performance issue (if people do a lot of RBT + // copying -- unlikely) we can reference count the data object. + + // Only do a deep copy if this is owned data, that is, data that + // will be later deleted. System transliterators contain + // non-owned data. + if (isDataOwned) { + fData = new TransliterationRuleData(*other.fData); + } +} + +/** + * Destructor. + */ +RuleBasedTransliterator::~RuleBasedTransliterator() { + // Delete the data object only if we own it. + if (isDataOwned) { + delete fData; + } +} + +RuleBasedTransliterator* +RuleBasedTransliterator::clone() const { + return new RuleBasedTransliterator(*this); +} + +/** + * Implements {@link Transliterator#handleTransliterate}. + */ +void +RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index, + UBool isIncremental) const { + /* We keep contextStart and contextLimit fixed the entire time, + * relative to the text -- contextLimit may move numerically if + * text is inserted or removed. The start offset moves toward + * limit, with replacements happening under it. + * + * Example: rules 1. ab>x|y + * 2. yc>z + * + * |eabcd begin - no match, advance start + * e|abcd match rule 1 - change text & adjust start + * ex|ycd match rule 2 - change text & adjust start + * exz|d no match, advance start + * exzd| done + */ + + /* A rule like + * a>b|a + * creates an infinite loop. To prevent that, we put an arbitrary + * limit on the number of iterations that we take, one that is + * high enough that any reasonable rules are ok, but low enough to + * prevent a server from hanging. The limit is 16 times the + * number of characters n, unless n is so large that 16n exceeds a + * uint32_t. + */ + uint32_t loopCount = 0; + uint32_t loopLimit = index.limit - index.start; + if (loopLimit >= 0x10000000) { + loopLimit = 0xFFFFFFFF; + } else { + loopLimit <<= 4; + } + + // Transliterator locking. Rule-based Transliterators are not thread safe; concurrent + // operations must be prevented. + // A Complication: compound transliterators can result in recursive entries to this + // function, sometimes with different "This" objects, always with the same text. + // Double-locking must be prevented in these cases. + // + + UBool lockedMutexAtThisLevel = false; + + // Test whether this request is operating on the same text string as + // some other transliteration that is still in progress and holding the + // transliteration mutex. If so, do not lock the transliteration + // mutex again. + // + // gLockedText variable is protected by the global ICU mutex. + // Shared RBT data protected by transliteratorDataMutex. + // + // TODO(andy): Need a better scheme for handling this. + + static UMutex transliteratorDataMutex; + UBool needToLock; + { + Mutex m; + needToLock = (&text != gLockedText); + } + if (needToLock) { + umtx_lock(&transliteratorDataMutex); // Contention, longish waits possible here. + Mutex m; + gLockedText = &text; + lockedMutexAtThisLevel = true; + } + + // Check to make sure we don't dereference a null pointer. + if (fData != nullptr) { + while (index.start < index.limit && + loopCount <= loopLimit && + fData->ruleSet.transliterate(text, index, isIncremental)) { + ++loopCount; + } + } + if (lockedMutexAtThisLevel) { + { + Mutex m; + gLockedText = nullptr; + } + umtx_unlock(&transliteratorDataMutex); + } +} + +UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource, + UBool escapeUnprintable) const { + return fData->ruleSet.toRules(rulesSource, escapeUnprintable); +} + +/** + * Implement Transliterator framework + */ +void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const { + fData->ruleSet.getSourceTargetSet(result, false); +} + +/** + * Override Transliterator framework + */ +UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const { + return fData->ruleSet.getSourceTargetSet(result, true); +} + +U_NAMESPACE_END + +#endif /* #if !UCONFIG_NO_TRANSLITERATION */ |