summary refs log tree commit diff stats
path: root/intl/icu/source/i18n/strmatch.cpp
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- intl/icu/source/i18n/strmatch.cpp 296
1 files changed, 296 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/strmatch.cpp b/intl/icu/source/i18n/strmatch.cpp
new file mode 100644
index 0000000000..ff52eeacdc
--- /dev/null
+++ b/intl/icu/source/i18n/strmatch.cpp
@@ -0,0 +1,296 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+**********************************************************************
+* Copyright (c) 2001-2012, International Business Machines Corporation
+* and others. All Rights Reserved.
+**********************************************************************
+* Date Name Description
+* 07/23/01 aliu Creation.
+**********************************************************************
+*/
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_TRANSLITERATION
+
+#include "strmatch.h"
+#include "rbt_data.h"
+#include "util.h"
+#include "unicode/uniset.h"
+#include "unicode/utf16.h"
+
+U_NAMESPACE_BEGIN
+
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
+
+/**
+ * Construct a matcher whose pattern is the portion of theString
+ * selected by extractBetween(start, limit).  No match is recorded
+ * initially: matchStart and matchLimit both begin at -1.
+ *
+ * @param theString  text from which the pattern is extracted
+ * @param start      start bound handed to extractBetween()
+ * @param limit      limit bound handed to extractBetween()
+ * @param segmentNum value stored as this matcher's segment number
+ * @param theData    rule data; only its address is retained, the
+ *                   object itself is not copied
+ */
+StringMatcher::StringMatcher(const UnicodeString& theString,
+                             int32_t start,
+                             int32_t limit,
+                             int32_t segmentNum,
+                             const TransliterationRuleData& theData)
+    : data(&theData),
+      segmentNumber(segmentNum),
+      matchStart(-1),
+      matchLimit(-1)
+{
+    // pattern is default-constructed and then filled in place.
+    theString.extractBetween(start, limit, pattern);
+}
+
+/**
+ * Copy constructor.  Copies the pattern, the segment number and the
+ * recorded match position of the source object, and stores the same
+ * rule-data pointer as the source (the rule data is not duplicated).
+ *
+ * @param o the matcher to copy
+ */
+StringMatcher::StringMatcher(const StringMatcher& o)
+    : UnicodeFunctor(o),
+      UnicodeMatcher(o),
+      UnicodeReplacer(o),
+      pattern(o.pattern),
+      data(o.data),
+      segmentNumber(o.segmentNumber),
+      matchStart(o.matchStart),
+      matchLimit(o.matchLimit)
+{
+}
+
+/**
+ * Destructor.  Nothing to release by hand: the members clean up
+ * after themselves and the rule-data pointer is never deleted here.
+ */
+StringMatcher::~StringMatcher() = default;
+
+/**
+ * Implement UnicodeFunctor.
+ *
+ * @return a StringMatcher allocated with new and copy-constructed
+ *         from this object
+ */
+StringMatcher* StringMatcher::clone() const {
+    StringMatcher* const duplicate = new StringMatcher(*this);
+    return duplicate;
+}
+
+/**
+ * UnicodeFunctor API.  View this object through its UnicodeMatcher
+ * base class.
+ *
+ * @return 'this' as a non-const UnicodeMatcher pointer
+ */
+UnicodeMatcher* StringMatcher::toMatcher() const {
+    // Strip constness first; the derived-to-base conversion then
+    // selects the UnicodeMatcher sub-object.
+    UnicodeMatcher* const asMatcher = const_cast<StringMatcher*>(this);
+    return asMatcher;
+}
+
+/**
+ * UnicodeFunctor API.  View this object through its UnicodeReplacer
+ * base class.
+ *
+ * @return 'this' as a non-const UnicodeReplacer pointer
+ */
+UnicodeReplacer* StringMatcher::toReplacer() const {
+    // Strip constness first; the derived-to-base conversion then
+    // selects the UnicodeReplacer sub-object.
+    UnicodeReplacer* const asReplacer = const_cast<StringMatcher*>(this);
+    return asReplacer;
+}
+
+/**
+ * Implement UnicodeMatcher.
+ *
+ * Walks the pattern one UTF-16 code unit at a time.  A code unit for
+ * which data->lookupMatcher() yields a matcher is delegated to that
+ * matcher; any other code unit must equal the text character at the
+ * cursor.  When limit < offset the pattern is walked from its last
+ * code unit to its first and the cursor moves backward; otherwise
+ * both move forward.
+ *
+ * On U_MATCH the matched range is remembered in matchStart and
+ * matchLimit.  A forward match always overwrites the remembered
+ * range; a reverse match only records it when none is stored yet.
+ *
+ * @param text        the text being matched
+ * @param offset      in: position at which matching begins; out: on
+ *                    U_MATCH, the cursor position after the match.
+ *                    Left untouched for every other result.
+ * @param limit       exclusive bound on the positions compared:
+ *                    literals are only compared at positions > limit
+ *                    in reverse and < limit going forward
+ * @param incremental forwarded to sub-matchers; going forward it also
+ *                    yields U_PARTIAL_MATCH when the cursor reaches
+ *                    limit before the pattern is used up
+ * @return U_MATCH on a complete match, U_MISMATCH on a literal
+ *         mismatch, U_PARTIAL_MATCH as described above, or whatever
+ *         non-U_MATCH value a sub-matcher returned
+ */
+UMatchDegree StringMatcher::matches(const Replaceable& text,
+                                    int32_t& offset,
+                                    int32_t limit,
+                                    UBool incremental) {
+    int32_t cursor = offset;
+    if (limit < cursor) {
+        // Match in the reverse direction
+        for (int32_t i = pattern.length() - 1; i >= 0; --i) {
+            const char16_t keyChar = pattern.charAt(i);
+            UnicodeMatcher* subm = data->lookupMatcher(keyChar);
+            if (subm == nullptr) {
+                // Plain literal: compare against the text directly.
+                if (cursor > limit &&
+                    keyChar == text.charAt(cursor)) {
+                    --cursor;
+                } else {
+                    return U_MISMATCH;
+                }
+            } else {
+                // The sub-matcher advances cursor itself on success.
+                const UMatchDegree m =
+                    subm->matches(text, cursor, limit, incremental);
+                if (m != U_MATCH) {
+                    return m;
+                }
+            }
+        }
+        // Record the match position, but adjust for a normal
+        // forward start, limit, and only if a prior match does not
+        // exist -- we want the rightmost match.
+        if (matchStart < 0) {
+            matchStart = cursor + 1;
+            matchLimit = offset + 1;
+        }
+    } else {
+        for (int32_t i = 0; i < pattern.length(); ++i) {
+            if (incremental && cursor == limit) {
+                // We've reached the context limit without a mismatch and
+                // without completing our match.
+                return U_PARTIAL_MATCH;
+            }
+            const char16_t keyChar = pattern.charAt(i);
+            UnicodeMatcher* subm = data->lookupMatcher(keyChar);
+            if (subm == nullptr) {
+                // Don't need the cursor < limit check if
+                // incremental is true (because it's done above); do need
+                // it otherwise.
+                if (cursor < limit &&
+                    keyChar == text.charAt(cursor)) {
+                    ++cursor;
+                } else {
+                    return U_MISMATCH;
+                }
+            } else {
+                // The sub-matcher advances cursor itself on success.
+                const UMatchDegree m =
+                    subm->matches(text, cursor, limit, incremental);
+                if (m != U_MATCH) {
+                    return m;
+                }
+            }
+        }
+        // Record the match position
+        matchStart = offset;
+        matchLimit = cursor;
+    }
+
+    // Only a complete match publishes the new cursor to the caller.
+    offset = cursor;
+    return U_MATCH;
+}
+
+/**
+ * Implement UnicodeMatcher.
+ *
+ * Rebuilds a rule-pattern string for this matcher into result,
+ * discarding whatever result held before.  Each pattern code unit
+ * is emitted either as a literal or, when data->lookupMatcher()
+ * yields a matcher for it, as that matcher's own pattern.  The whole
+ * pattern is wrapped in parentheses when segmentNumber > 0.
+ *
+ * @param result            receives the pattern; truncated first
+ * @param escapeUnprintable passed through to ICU_Utility::appendToRule
+ *                          and to the sub-matchers' toPattern()
+ * @return result
+ */
+UnicodeString& StringMatcher::toPattern(UnicodeString& result,
+                                        UBool escapeUnprintable) const
+{
+    result.truncate(0);
+    UnicodeString str, quoteBuf;
+    if (segmentNumber > 0) {
+        result.append(static_cast<char16_t>(40)); /*(*/
+    }
+    for (int32_t i = 0; i < pattern.length(); ++i) {
+        const char16_t keyChar = pattern.charAt(i);
+        const UnicodeMatcher* m = data->lookupMatcher(keyChar);
+        if (m == nullptr) {
+            ICU_Utility::appendToRule(result, keyChar, false, escapeUnprintable, quoteBuf);
+        } else {
+            ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
+                                      true, escapeUnprintable, quoteBuf);
+        }
+    }
+    if (segmentNumber > 0) {
+        result.append(static_cast<char16_t>(41)); /*)*/
+    }
+    // Flush quoteBuf out to result
+    ICU_Utility::appendToRule(result, -1,
+                              true, escapeUnprintable, quoteBuf);
+    return result;
+}
+
+/**
+ * Implement UnicodeMatcher.
+ *
+ * Only the first code point of the pattern is examined.
+ *
+ * @param v the index value to test
+ * @return true when the pattern is empty; otherwise, if
+ *         data->lookupMatcher() yields a matcher for the first code
+ *         point, that matcher's answer for v; otherwise whether the
+ *         low eight bits of the first code point equal v
+ */
+UBool StringMatcher::matchesIndexValue(uint8_t v) const {
+    if (pattern.length() == 0) {
+        return true;
+    }
+    const UChar32 c = pattern.char32At(0);
+    const UnicodeMatcher* m = data->lookupMatcher(c);
+    if (m == nullptr) {
+        // Literal first code point: compare its low byte.
+        return (c & 0xFF) == v;
+    }
+    return m->matchesIndexValue(v);
+}
+
+/**
+ * Implement UnicodeMatcher.
+ *
+ * Visits the pattern code point by code point.  A code point for
+ * which data->lookupMatcher() yields a matcher contributes that
+ * matcher's match set; any other code point is added itself.
+ *
+ * @param toUnionTo the set that receives the additions
+ */
+void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
+    int32_t pos = 0;
+    while (pos < pattern.length()) {
+        const UChar32 cp = pattern.char32At(pos);
+        const UnicodeMatcher* sub = data->lookupMatcher(cp);
+        if (sub != nullptr) {
+            sub->addMatchSetTo(toUnionTo);
+        } else {
+            toUnionTo.add(cp);
+        }
+        // Step over one or two code units, depending on cp.
+        pos += U16_LENGTH(cp);
+    }
+}
+
+/**
+ * UnicodeReplacer API.
+ *
+ * Replaces text[start, limit) with the text most recently recorded
+ * by matches(), if any.
+ *
+ * @param text   the text to modify
+ * @param start  start of the range being replaced
+ * @param limit  end of the range being replaced; also the position
+ *               to which the recorded match is copied
+ * @param cursor unused
+ * @return the length of the recorded match that was copied, or 0
+ *         when no match is recorded or the recorded match is empty
+ */
+int32_t StringMatcher::replace(Replaceable& text,
+                               int32_t start,
+                               int32_t limit,
+                               int32_t& /*cursor*/) {
+    int32_t copiedLen = 0;
+
+    // A negative matchStart means there was no match, i.e. a
+    // quantifier matched zero-length.  E.g., x (a)* y matched "xy".
+    // An empty recorded range likewise leaves nothing to copy.
+    const bool haveSegmentText =
+        (matchStart >= 0) && (matchStart != matchLimit);
+    if (haveSegmentText) {
+        // Copy segment with out-of-band data
+        text.copy(matchStart, matchLimit, limit);
+        copiedLen = matchLimit - matchStart;
+    }
+
+    // delete original text
+    text.handleReplaceBetween(start, limit, UnicodeString());
+
+    return copiedLen;
+}
+
+/**
+ * UnicodeReplacer API.
+ *
+ * Writes a segment reference -- a '$' followed by segmentNumber --
+ * into rule, discarding whatever rule held before.
+ *
+ * @param rule              receives the reference; truncated first
+ * @param escapeUnprintable unused
+ * @return rule
+ */
+UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
+                                                UBool /*escapeUnprintable*/) const {
+    // assert(segmentNumber > 0);
+    rule.truncate(0);
+    rule.append(static_cast<char16_t>(0x0024)); /*$*/
+    // NOTE(review): 10 and 1 are presumably the radix and the minimum
+    // digit count -- confirm against ICU_Utility::appendNumber.
+    ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
+    return rule;
+}
+
+/**
+ * Forget any recorded match by setting both matchStart and
+ * matchLimit back to -1.  This must be called before performing a
+ * set of matches with this segment.
+ */
+void StringMatcher::resetMatch() {
+    matchLimit = -1;
+    matchStart = -1;
+}
+
+/**
+ * Union the set of all characters that may be output by this object
+ * into the given set.  This implementation intentionally adds nothing.
+ * @param toUnionTo the set into which to union the output characters
+ */
+void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
+ // The output of this replacer varies; it is the source text between
+ // matchStart and matchLimit. Since this varies depending on the
+ // input text, we can't compute it here. We can either do nothing
+ // or we can add ALL characters to the set. It's probably more useful
+ // to do nothing.
+}
+
+/**
+ * Implement UnicodeFunctor.
+ *
+ * Stores d as this matcher's rule data, then hands the same pointer
+ * to every functor that d->lookup() yields for a code point of the
+ * pattern.
+ *
+ * @param d the rule data to use from now on; it is dereferenced
+ *          here whenever the pattern is non-empty
+ */
+void StringMatcher::setData(const TransliterationRuleData* d) {
+    data = d;
+    for (int32_t pos = 0; pos < pattern.length(); ) {
+        const UChar32 cp = pattern.char32At(pos);
+        UnicodeFunctor* const functor = data->lookup(cp);
+        if (functor != nullptr) {
+            functor->setData(data);
+        }
+        // Step over one or two code units, depending on cp.
+        pos += U16_LENGTH(cp);
+    }
+}
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_TRANSLITERATION */
+
+//eof