summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/i18n/strrepl.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'intl/icu/source/i18n/strrepl.cpp')
-rw-r--r--intl/icu/source/i18n/strrepl.cpp329
1 files changed, 329 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/strrepl.cpp b/intl/icu/source/i18n/strrepl.cpp
new file mode 100644
index 0000000000..2981553869
--- /dev/null
+++ b/intl/icu/source/i18n/strrepl.cpp
@@ -0,0 +1,329 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+**********************************************************************
+* Copyright (c) 2002-2012, International Business Machines Corporation
+* and others. All Rights Reserved.
+**********************************************************************
+* Date Name Description
+* 01/21/2002 aliu Creation.
+**********************************************************************
+*/
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_TRANSLITERATION
+
+#include "unicode/uniset.h"
+#include "unicode/utf16.h"
+#include "strrepl.h"
+#include "rbt_data.h"
+#include "util.h"
+
+U_NAMESPACE_BEGIN
+
+// Out-of-line definition of the UnicodeReplacer destructor; it performs no work of its own.
+UnicodeReplacer::~UnicodeReplacer() = default;
+// NOTE(review): presumably expands to ICU's run-time type identification members
+// for StringReplacer -- confirm against the macro's definition.
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
+
+/**
+ * Build a StringReplacer that emits the given output text and also
+ * positions the cursor.
+ * @param theOutput text substituted for the input text whenever
+ * replace() is called. It may include stand-in characters, each of
+ * which denotes a nested replacer.
+ * @param theCursorPos cursor position that replace() uses when it
+ * computes the cursor value it reports
+ * @param theData transliterator context object used to map
+ * stand-in characters to UnicodeReplacer objects
+ */
+StringReplacer::StringReplacer(const UnicodeString& theOutput,
+                               int32_t theCursorPos,
+                               const TransliterationRuleData* theData)
+    : output(theOutput),
+      cursorPos(theCursorPos),
+      hasCursor(true),
+      isComplex(true),  // assumed complex until replace() finds no nested replacer
+      data(theData) {
+}
+
+/**
+ * Build a StringReplacer that emits the given output text and leaves
+ * the cursor alone.
+ * @param theOutput text substituted for the input text whenever
+ * replace() is called. It may include stand-in characters, each of
+ * which denotes a nested replacer.
+ * @param theData transliterator context object used to map
+ * stand-in characters to UnicodeReplacer objects
+ */
+StringReplacer::StringReplacer(const UnicodeString& theOutput,
+                               const TransliterationRuleData* theData)
+    : output(theOutput),
+      cursorPos(0),      // unused while hasCursor is false
+      hasCursor(false),
+      isComplex(true),   // assumed complex until replace() finds no nested replacer
+      data(theData) {
+}
+
+/**
+ * Copy constructor. Every field is copied member by member; the
+ * 'data' pointer is copied as a pointer, so the new object refers
+ * to the same TransliterationRuleData as 'other'.
+ */
+StringReplacer::StringReplacer(const StringReplacer& other)
+    : UnicodeFunctor(other),
+      UnicodeReplacer(other),
+      output(other.output),
+      cursorPos(other.cursorPos),
+      hasCursor(other.hasCursor),
+      isComplex(other.isComplex),
+      data(other.data) {
+}
+
+/**
+ * Destructor. There is no explicit cleanup; 'data' is not deleted
+ * here.
+ */
+StringReplacer::~StringReplacer() = default;
+
+/**
+ * Implement UnicodeFunctor: produce a heap-allocated copy of this
+ * object via the copy constructor.
+ * @return the newly allocated copy
+ */
+StringReplacer* StringReplacer::clone() const {
+    StringReplacer* duplicate = new StringReplacer(*this);
+    return duplicate;
+}
+
+/**
+ * Implement UnicodeFunctor: expose this object through its
+ * UnicodeReplacer interface.
+ * @return this object, with constness cast away
+ */
+UnicodeReplacer* StringReplacer::toReplacer() const {
+    StringReplacer* self = const_cast<StringReplacer*>(this);
+    return self;
+}
+
+/**
+ * UnicodeReplacer API
+ *
+ * Replace the text between start and limit with this object's output
+ * and, if this object has a cursor, report the new cursor position.
+ *
+ * Two paths exist. When isComplex is false the output string is
+ * written over [start, limit) directly. Otherwise the output is
+ * scanned code point by code point: characters for which
+ * data->lookupReplacer() returns null are emitted literally, and the
+ * others are delegated to the nested replacer that was returned. The
+ * complex path also recomputes isComplex, so once a pass finds no
+ * nested replacer, later calls take the simple path.
+ *
+ * @param text the text being modified in place
+ * @param start index of the first position to replace
+ * @param limit index just past the last position to replace
+ * @param cursor receives the new cursor position. This object only
+ * writes it when hasCursor is true, but the same reference is also
+ * handed to every nested replacer. A cursorPos inside
+ * [0, output.length()] is mapped to the matching position in the
+ * generated text; a cursorPos outside that range is counted in code
+ * points away from the replacement, and whatever part of the count
+ * cannot be consumed before reaching an end of the text is added
+ * unchanged.
+ * @return the length of the text that replaced [start, limit)
+ */
+int32_t StringReplacer::replace(Replaceable& text,
+ int32_t start,
+ int32_t limit,
+ int32_t& cursor) {
+ int32_t outLen; // length of the replacement text; this is the return value
+ int32_t newStart = 0; // cursor candidate; relative to start until adjusted below
+
+ // NOTE: It should be possible to _always_ run the complex
+ // processing code; just slower. If not, then there is a bug
+ // in the complex processing code.
+
+ // Simple (no nested replacers) Processing Code :
+ if (!isComplex) {
+ text.handleReplaceBetween(start, limit, output);
+ outLen = output.length();
+
+ // Setup default cursor position (for cursorPos within output)
+ newStart = cursorPos;
+ }
+
+ // Complex (nested replacers) Processing Code :
+ else {
+ /* When there are segments to be copied, use the Replaceable.copy()
+ * API in order to retain out-of-band data. Copy everything to the
+ * end of the string, then copy them back over the key. This preserves
+ * the integrity of indices into the key and surrounding context while
+ * generating the output text.
+ *
+ * The steps below are: (1) seed a temporary buffer at the end of
+ * the text with one style-context character, (2) append the
+ * generated output after it, (3) copy the generated output to
+ * 'start', (4) delete the temporary buffer, (5) delete the key.
+ */
+ UnicodeString buf;
+ int32_t oOutput; // offset into 'output'
+ isComplex = false; // set back to true below if any nested replacer is found
+
+ // The temporary buffer starts at tempStart, and extends
+ // to destLimit. The start of the buffer has a single
+ // character from before the key. This provides style
+ // data when additional characters are filled into the
+ // temporary buffer. If there is nothing to the left, use
+ // the non-character U+FFFF, which Replaceable subclasses
+ // should treat specially as a "no-style character."
+ // destStart points to the point after the style context
+ // character, so it is tempStart+1 or tempStart+2.
+ int32_t tempStart = text.length(); // start of temp buffer
+ int32_t destStart = tempStart; // copy new text to here
+ if (start > 0) {
+ int32_t len = U16_LENGTH(text.char32At(start-1));
+ text.copy(start-len, start, tempStart);
+ destStart += len;
+ } else {
+ UnicodeString str((char16_t) 0xFFFF);
+ text.handleReplaceBetween(tempStart, tempStart, str);
+ destStart++;
+ }
+ int32_t destLimit = destStart;
+
+ for (oOutput=0; oOutput<output.length(); ) {
+ if (oOutput == cursorPos) {
+ // Record the position of the cursor
+ newStart = destLimit - destStart; // relative to start
+ }
+ UChar32 c = output.char32At(oOutput);
+ UnicodeReplacer* r = data->lookupReplacer(c);
+ if (r == nullptr) {
+ // Accumulate straight (non-segment) text.
+ buf.append(c);
+ } else {
+ isComplex = true;
+
+ // Insert any accumulated straight text.
+ if (buf.length() > 0) {
+ text.handleReplaceBetween(destLimit, destLimit, buf);
+ destLimit += buf.length();
+ buf.truncate(0);
+ }
+
+ // Delegate output generation to replacer object
+ int32_t len = r->replace(text, destLimit, destLimit, cursor);
+ destLimit += len;
+ }
+ oOutput += U16_LENGTH(c);
+ }
+ // Insert any accumulated straight text.
+ if (buf.length() > 0) {
+ text.handleReplaceBetween(destLimit, destLimit, buf);
+ destLimit += buf.length();
+ }
+ if (oOutput == cursorPos) {
+ // Record the position of the cursor
+ newStart = destLimit - destStart; // relative to start
+ }
+
+ outLen = destLimit - destStart;
+
+ // Copy new text to start, and delete it
+ text.copy(destStart, destLimit, start);
+ text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString()); // indices shifted by outLen because of the copy above
+
+ // Delete the old text (the key)
+ text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString());
+ }
+
+ if (hasCursor) {
+ // Adjust the cursor for positions outside the key. These
+ // refer to code points rather than code units. If cursorPos
+ // is within the output string, then use newStart, which has
+ // already been set above.
+ if (cursorPos < 0) {
+ newStart = start;
+ int32_t n = cursorPos;
+ // Outside the output string, cursorPos counts code points
+ while (n < 0 && newStart > 0) {
+ newStart -= U16_LENGTH(text.char32At(newStart-1));
+ ++n;
+ }
+ newStart += n; // n is still negative if the start of the text was reached first
+ } else if (cursorPos > output.length()) {
+ newStart = start + outLen;
+ int32_t n = cursorPos - output.length();
+ // Outside the output string, cursorPos counts code points
+ while (n > 0 && newStart < text.length()) {
+ newStart += U16_LENGTH(text.char32At(newStart));
+ --n;
+ }
+ newStart += n; // n is still positive if the end of the text was reached first
+ } else {
+ // Cursor is within output string. It has been set up above
+ // to be relative to start.
+ newStart += start;
+ }
+
+ cursor = newStart;
+ }
+
+ return outLen;
+}
+
+/**
+ * UnicodeReplacer API
+ *
+ * Rebuild the rule-source form of this replacer in 'rule'.
+ *
+ * Output code units for which data->lookupReplacer() returns null are
+ * appended as literal text; the others are expanded by asking the
+ * nested replacer for its own pattern, which is then surrounded by
+ * spaces. When this object has a cursor, a '|' marks its position and
+ * one '@' filler is written for each position the cursor lies outside
+ * the output:
+ *   - cursorPos < 0:                 "|" followed by -cursorPos '@'s,
+ *                                    then the output
+ *   - 0 <= cursorPos < length:       '|' in front of that code unit
+ *   - cursorPos == length:           nothing (the default position)
+ *   - cursorPos > length:            the output, then
+ *                                    (cursorPos - length) '@'s and "|"
+ *
+ * @param rule receives the pattern; any previous content is discarded
+ * @param escapeUnprintable passed through to ICU_Utility::appendToRule
+ * and to nested replacers
+ * @return a reference to 'rule'
+ */
+UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
+                                                 UBool escapeUnprintable) const {
+    const char16_t kCursorMark   = 0x007C; /*|*/
+    const char16_t kCursorFiller = 0x0040; /*@*/
+    const char16_t kSpace        = 0x0020;
+
+    rule.truncate(0);
+    UnicodeString quoteBuf;
+
+    // Handle a cursor preceding the output. The '|' has to be written
+    // here, ahead of the fillers: the loop over the output below can
+    // never match a negative cursorPos. (The earlier code counted a
+    // local copy of the cursor up with "cursor++ < 0", which left it
+    // at 1 rather than 0, so the '|' ended up after the first output
+    // code unit or was dropped entirely.)
+    if (hasCursor && cursorPos < 0) {
+        ICU_Utility::appendToRule(rule, kCursorMark, true, escapeUnprintable, quoteBuf);
+        for (int32_t n = cursorPos; n < 0; ++n) {
+            ICU_Utility::appendToRule(rule, kCursorFiller, true, escapeUnprintable, quoteBuf);
+        }
+    }
+
+    for (int32_t i = 0; i < output.length(); ++i) {
+        // Cursor inside the output: mark it in front of this code unit.
+        if (hasCursor && i == cursorPos) {
+            ICU_Utility::appendToRule(rule, kCursorMark, true, escapeUnprintable, quoteBuf);
+        }
+        char16_t c = output.charAt(i); // Ok to use 16-bits here
+
+        UnicodeReplacer* r = data->lookupReplacer(c);
+        if (r == nullptr) {
+            ICU_Utility::appendToRule(rule, c, false, escapeUnprintable, quoteBuf);
+        } else {
+            // Stand-in character: splice in the nested replacer's own
+            // pattern, padded with a space on each side.
+            UnicodeString buf;
+            r->toReplacerPattern(buf, escapeUnprintable);
+            buf.insert(0, kSpace);
+            buf.append(kSpace);
+            ICU_Utility::appendToRule(rule, buf,
+                                      true, escapeUnprintable, quoteBuf);
+        }
+    }
+
+    // Handle a cursor after the output. Use > rather than >= because
+    // if cursorPos == output.length() it is at the end of the output,
+    // which is the default position, so we need not emit it.
+    if (hasCursor && cursorPos > output.length()) {
+        for (int32_t n = cursorPos - output.length(); n > 0; --n) {
+            ICU_Utility::appendToRule(rule, kCursorFiller, true, escapeUnprintable, quoteBuf);
+        }
+        ICU_Utility::appendToRule(rule, kCursorMark, true, escapeUnprintable, quoteBuf);
+    }
+    // Flush quoteBuf out to result
+    ICU_Utility::appendToRule(rule, -1,
+                              true, escapeUnprintable, quoteBuf);
+
+    return rule;
+}
+
+/**
+ * Implement UnicodeReplacer: add to 'toUnionTo' every code point this
+ * replacer can emit. Code points of the output for which
+ * data->lookupReplacer() returns null are added directly; for the
+ * others the request is forwarded to the nested replacer.
+ * @param toUnionTo the set that receives the code points
+ */
+void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
+    int32_t pos = 0;
+    while (pos < output.length()) {
+        const UChar32 codePoint = output.char32At(pos);
+        UnicodeReplacer* nested = data->lookupReplacer(codePoint);
+        if (nested != nullptr) {
+            nested->addReplacementSetTo(toUnionTo);
+        } else {
+            toUnionTo.add(codePoint);
+        }
+        // Step by one or two code units, depending on the code point.
+        pos += U16_LENGTH(codePoint);
+    }
+}
+
+/**
+ * UnicodeFunctor API: install a new data object, then pass it on to
+ * every functor that data->lookup() returns for a code point of the
+ * output.
+ * @param d the data object to store and propagate
+ */
+void StringReplacer::setData(const TransliterationRuleData* d) {
+    data = d;
+    for (int32_t pos = 0; pos < output.length(); ) {
+        const UChar32 codePoint = output.char32At(pos);
+        pos += U16_LENGTH(codePoint);
+
+        UnicodeFunctor* functor = data->lookup(codePoint);
+        if (functor == nullptr) {
+            continue;  // not a stand-in: nothing to update
+        }
+        functor->setData(data);
+    }
+}
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_TRANSLITERATION */
+
+//eof