diff options
Diffstat (limited to 'intl/icu/source/i18n/strrepl.cpp')
-rw-r--r-- | intl/icu/source/i18n/strrepl.cpp | 329 |
1 files changed, 329 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/strrepl.cpp b/intl/icu/source/i18n/strrepl.cpp new file mode 100644 index 0000000000..9fafeb2659 --- /dev/null +++ b/intl/icu/source/i18n/strrepl.cpp @@ -0,0 +1,329 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +********************************************************************** +* Copyright (c) 2002-2012, International Business Machines Corporation +* and others. All Rights Reserved. +********************************************************************** +* Date Name Description +* 01/21/2002 aliu Creation. +********************************************************************** +*/ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_TRANSLITERATION + +#include "unicode/uniset.h" +#include "unicode/utf16.h" +#include "strrepl.h" +#include "rbt_data.h" +#include "util.h" + +U_NAMESPACE_BEGIN + +UnicodeReplacer::~UnicodeReplacer() {} +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer) + +/** + * Construct a StringReplacer that sets the emits the given output + * text and sets the cursor to the given position. + * @param theOutput text that will replace input text when the + * replace() method is called. May contain stand-in characters + * that represent nested replacers. + * @param theCursorPos cursor position that will be returned by + * the replace() method + * @param theData transliterator context object that translates + * stand-in characters to UnicodeReplacer objects + */ +StringReplacer::StringReplacer(const UnicodeString& theOutput, + int32_t theCursorPos, + const TransliterationRuleData* theData) { + output = theOutput; + cursorPos = theCursorPos; + hasCursor = TRUE; + data = theData; + isComplex = TRUE; +} + +/** + * Construct a StringReplacer that sets the emits the given output + * text and does not modify the cursor. + * @param theOutput text that will replace input text when the + * replace() method is called. May contain stand-in characters + * that represent nested replacers. + * @param theData transliterator context object that translates + * stand-in characters to UnicodeReplacer objects + */ +StringReplacer::StringReplacer(const UnicodeString& theOutput, + const TransliterationRuleData* theData) { + output = theOutput; + cursorPos = 0; + hasCursor = FALSE; + data = theData; + isComplex = TRUE; +} + +/** + * Copy constructor. + */ +StringReplacer::StringReplacer(const StringReplacer& other) : + UnicodeFunctor(other), + UnicodeReplacer(other) +{ + output = other.output; + cursorPos = other.cursorPos; + hasCursor = other.hasCursor; + data = other.data; + isComplex = other.isComplex; +} + +/** + * Destructor + */ +StringReplacer::~StringReplacer() { +} + +/** + * Implement UnicodeFunctor + */ +StringReplacer* StringReplacer::clone() const { + return new StringReplacer(*this); +} + +/** + * Implement UnicodeFunctor + */ +UnicodeReplacer* StringReplacer::toReplacer() const { + return const_cast<StringReplacer *>(this); +} + +/** + * UnicodeReplacer API + */ +int32_t StringReplacer::replace(Replaceable& text, + int32_t start, + int32_t limit, + int32_t& cursor) { + int32_t outLen; + int32_t newStart = 0; + + // NOTE: It should be possible to _always_ run the complex + // processing code; just slower. If not, then there is a bug + // in the complex processing code. + + // Simple (no nested replacers) Processing Code : + if (!isComplex) { + text.handleReplaceBetween(start, limit, output); + outLen = output.length(); + + // Setup default cursor position (for cursorPos within output) + newStart = cursorPos; + } + + // Complex (nested replacers) Processing Code : + else { + /* When there are segments to be copied, use the Replaceable.copy() + * API in order to retain out-of-band data. Copy everything to the + * end of the string, then copy them back over the key. This preserves + * the integrity of indices into the key and surrounding context while + * generating the output text. + */ + UnicodeString buf; + int32_t oOutput; // offset into 'output' + isComplex = FALSE; + + // The temporary buffer starts at tempStart, and extends + // to destLimit. The start of the buffer has a single + // character from before the key. This provides style + // data when addition characters are filled into the + // temporary buffer. If there is nothing to the left, use + // the non-character U+FFFF, which Replaceable subclasses + // should treat specially as a "no-style character." + // destStart points to the point after the style context + // character, so it is tempStart+1 or tempStart+2. + int32_t tempStart = text.length(); // start of temp buffer + int32_t destStart = tempStart; // copy new text to here + if (start > 0) { + int32_t len = U16_LENGTH(text.char32At(start-1)); + text.copy(start-len, start, tempStart); + destStart += len; + } else { + UnicodeString str((UChar) 0xFFFF); + text.handleReplaceBetween(tempStart, tempStart, str); + destStart++; + } + int32_t destLimit = destStart; + + for (oOutput=0; oOutput<output.length(); ) { + if (oOutput == cursorPos) { + // Record the position of the cursor + newStart = destLimit - destStart; // relative to start + } + UChar32 c = output.char32At(oOutput); + UnicodeReplacer* r = data->lookupReplacer(c); + if (r == NULL) { + // Accumulate straight (non-segment) text. + buf.append(c); + } else { + isComplex = TRUE; + + // Insert any accumulated straight text. + if (buf.length() > 0) { + text.handleReplaceBetween(destLimit, destLimit, buf); + destLimit += buf.length(); + buf.truncate(0); + } + + // Delegate output generation to replacer object + int32_t len = r->replace(text, destLimit, destLimit, cursor); + destLimit += len; + } + oOutput += U16_LENGTH(c); + } + // Insert any accumulated straight text. + if (buf.length() > 0) { + text.handleReplaceBetween(destLimit, destLimit, buf); + destLimit += buf.length(); + } + if (oOutput == cursorPos) { + // Record the position of the cursor + newStart = destLimit - destStart; // relative to start + } + + outLen = destLimit - destStart; + + // Copy new text to start, and delete it + text.copy(destStart, destLimit, start); + text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString()); + + // Delete the old text (the key) + text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString()); + } + + if (hasCursor) { + // Adjust the cursor for positions outside the key. These + // refer to code points rather than code units. If cursorPos + // is within the output string, then use newStart, which has + // already been set above. + if (cursorPos < 0) { + newStart = start; + int32_t n = cursorPos; + // Outside the output string, cursorPos counts code points + while (n < 0 && newStart > 0) { + newStart -= U16_LENGTH(text.char32At(newStart-1)); + ++n; + } + newStart += n; + } else if (cursorPos > output.length()) { + newStart = start + outLen; + int32_t n = cursorPos - output.length(); + // Outside the output string, cursorPos counts code points + while (n > 0 && newStart < text.length()) { + newStart += U16_LENGTH(text.char32At(newStart)); + --n; + } + newStart += n; + } else { + // Cursor is within output string. It has been set up above + // to be relative to start. + newStart += start; + } + + cursor = newStart; + } + + return outLen; +} + +/** + * UnicodeReplacer API + */ +UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule, + UBool escapeUnprintable) const { + rule.truncate(0); + UnicodeString quoteBuf; + + int32_t cursor = cursorPos; + + // Handle a cursor preceding the output + if (hasCursor && cursor < 0) { + while (cursor++ < 0) { + ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); + } + // Fall through and append '|' below + } + + for (int32_t i=0; i<output.length(); ++i) { + if (hasCursor && i == cursor) { + ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); + } + UChar c = output.charAt(i); // Ok to use 16-bits here + + UnicodeReplacer* r = data->lookupReplacer(c); + if (r == NULL) { + ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf); + } else { + UnicodeString buf; + r->toReplacerPattern(buf, escapeUnprintable); + buf.insert(0, (UChar)0x20); + buf.append((UChar)0x20); + ICU_Utility::appendToRule(rule, buf, + TRUE, escapeUnprintable, quoteBuf); + } + } + + // Handle a cursor after the output. Use > rather than >= because + // if cursor == output.length() it is at the end of the output, + // which is the default position, so we need not emit it. + if (hasCursor && cursor > output.length()) { + cursor -= output.length(); + while (cursor-- > 0) { + ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); + } + ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); + } + // Flush quoteBuf out to result + ICU_Utility::appendToRule(rule, -1, + TRUE, escapeUnprintable, quoteBuf); + + return rule; +} + +/** + * Implement UnicodeReplacer + */ +void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const { + UChar32 ch; + for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) { + ch = output.char32At(i); + UnicodeReplacer* r = data->lookupReplacer(ch); + if (r == NULL) { + toUnionTo.add(ch); + } else { + r->addReplacementSetTo(toUnionTo); + } + } +} + +/** + * UnicodeFunctor API + */ +void StringReplacer::setData(const TransliterationRuleData* d) { + data = d; + int32_t i = 0; + while (i<output.length()) { + UChar32 c = output.char32At(i); + UnicodeFunctor* f = data->lookup(c); + if (f != NULL) { + f->setData(data); + } + i += U16_LENGTH(c); + } +} + +U_NAMESPACE_END + +#endif /* #if !UCONFIG_NO_TRANSLITERATION */ + +//eof |