// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 1999-2014, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: unistr_case.cpp * encoding: UTF-8 * tab size: 8 (not used) * indentation:2 * * created on: 2004aug19 * created by: Markus W. Scherer * * Case-mapping functions moved here from unistr.cpp */ #include "unicode/utypes.h" #include "unicode/brkiter.h" #include "unicode/casemap.h" #include "unicode/edits.h" #include "unicode/putil.h" #include "cstring.h" #include "cmemory.h" #include "unicode/ustring.h" #include "unicode/unistr.h" #include "unicode/uchar.h" #include "uassert.h" #include "ucasemap_imp.h" #include "uelement.h" U_NAMESPACE_BEGIN //======================================== // Read-only implementation //======================================== int8_t UnicodeString::doCaseCompare(int32_t start, int32_t length, const char16_t *srcChars, int32_t srcStart, int32_t srcLength, uint32_t options) const { // compare illegal string values // treat const char16_t *srcChars==nullptr as an empty string if(isBogus()) { return -1; } // pin indices to legal values pinIndices(start, length); if(srcChars == nullptr) { srcStart = srcLength = 0; } // get the correct pointer const char16_t *chars = getArrayStart(); chars += start; if(srcStart!=0) { srcChars += srcStart; } if(chars != srcChars) { UErrorCode errorCode=U_ZERO_ERROR; int32_t result=u_strcmpFold(chars, length, srcChars, srcLength, options|U_COMPARE_IGNORE_CASE, &errorCode); if(result!=0) { return (int8_t)(result >> 24 | 1); } } else { // get the srcLength if necessary if(srcLength < 0) { srcLength = u_strlen(srcChars + srcStart); } if(length != srcLength) { return (int8_t)((length - srcLength) >> 24 | 1); } } return 0; } //======================================== // Write implementation //======================================== UnicodeString & UnicodeString::caseMap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM UStringCaseMapper *stringCaseMapper) { if(isEmpty() || !isWritable()) { // nothing to do return *this; } char16_t oldBuffer[2 * US_STACKBUF_SIZE]; char16_t *oldArray; int32_t oldLength = length(); int32_t newLength; UBool writable = isBufferWritable(); UErrorCode errorCode = U_ZERO_ERROR; #if !UCONFIG_NO_BREAK_ITERATION // Read-only alias to the original string contents for the titlecasing BreakIterator. // We cannot set the iterator simply to *this because *this is being modified. UnicodeString oldString; #endif // Try to avoid heap-allocating a new character array for this string. if (writable ? oldLength <= UPRV_LENGTHOF(oldBuffer) : oldLength < US_STACKBUF_SIZE) { // Short string: Copy the contents into a temporary buffer and // case-map back into the current array, or into the stack buffer. char16_t *buffer = getArrayStart(); int32_t capacity; oldArray = oldBuffer; u_memcpy(oldBuffer, buffer, oldLength); if (writable) { capacity = getCapacity(); } else { // Switch from the read-only alias or shared heap buffer to the stack buffer. if (!cloneArrayIfNeeded(US_STACKBUF_SIZE, US_STACKBUF_SIZE, /* doCopyArray= */ false)) { return *this; } U_ASSERT(fUnion.fFields.fLengthAndFlags & kUsingStackBuffer); buffer = fUnion.fStackFields.fBuffer; capacity = US_STACKBUF_SIZE; } #if !UCONFIG_NO_BREAK_ITERATION if (iter != nullptr) { oldString.setTo(false, oldArray, oldLength); iter->setText(oldString); } #endif newLength = stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR buffer, capacity, oldArray, oldLength, nullptr, errorCode); if (U_SUCCESS(errorCode)) { setLength(newLength); return *this; } else if (errorCode == U_BUFFER_OVERFLOW_ERROR) { // common overflow handling below } else { setToBogus(); return *this; } } else { // Longer string or read-only buffer: // Collect only changes and then apply them to this string. // Case mapping often changes only small parts of a string, // and often does not change its length. oldArray = getArrayStart(); Edits edits; char16_t replacementChars[200]; #if !UCONFIG_NO_BREAK_ITERATION if (iter != nullptr) { oldString.setTo(false, oldArray, oldLength); iter->setText(oldString); } #endif stringCaseMapper(caseLocale, options | U_OMIT_UNCHANGED_TEXT, UCASEMAP_BREAK_ITERATOR replacementChars, UPRV_LENGTHOF(replacementChars), oldArray, oldLength, &edits, errorCode); if (U_SUCCESS(errorCode)) { // Grow the buffer at most once, not for multiple doReplace() calls. newLength = oldLength + edits.lengthDelta(); if (newLength > oldLength && !cloneArrayIfNeeded(newLength, newLength)) { return *this; } for (Edits::Iterator ei = edits.getCoarseChangesIterator(); ei.next(errorCode);) { doReplace(ei.destinationIndex(), ei.oldLength(), replacementChars, ei.replacementIndex(), ei.newLength()); } if (U_FAILURE(errorCode)) { setToBogus(); } return *this; } else if (errorCode == U_BUFFER_OVERFLOW_ERROR) { // common overflow handling below newLength = oldLength + edits.lengthDelta(); } else { setToBogus(); return *this; } } // Handle buffer overflow, newLength is known. // We need to allocate a new buffer for the internal string case mapping function. // This is very similar to how doReplace() keeps the old array pointer // and deletes the old array itself after it is done. // In addition, we are forcing cloneArrayIfNeeded() to always allocate a new array. int32_t *bufferToDelete = 0; if (!cloneArrayIfNeeded(newLength, newLength, false, &bufferToDelete, true)) { return *this; } errorCode = U_ZERO_ERROR; // No need to iter->setText() again: The case mapper restarts via iter->first(). newLength = stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR getArrayStart(), getCapacity(), oldArray, oldLength, nullptr, errorCode); if (bufferToDelete) { uprv_free(bufferToDelete); } if (U_SUCCESS(errorCode)) { setLength(newLength); } else { setToBogus(); } return *this; } UnicodeString & UnicodeString::foldCase(uint32_t options) { return caseMap(UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL ustrcase_internalFold); } U_NAMESPACE_END // Defined here to reduce dependencies on break iterator U_CAPI int32_t U_EXPORT2 uhash_hashCaselessUnicodeString(const UElement key) { U_NAMESPACE_USE const UnicodeString *str = (const UnicodeString*) key.pointer; if (str == nullptr) { return 0; } // Inefficient; a better way would be to have a hash function in // UnicodeString that does case folding on the fly. UnicodeString copy(*str); return copy.foldCase().hashCode(); } // Defined here to reduce dependencies on break iterator U_CAPI UBool U_EXPORT2 uhash_compareCaselessUnicodeString(const UElement key1, const UElement key2) { U_NAMESPACE_USE const UnicodeString *str1 = (const UnicodeString*) key1.pointer; const UnicodeString *str2 = (const UnicodeString*) key2.pointer; if (str1 == str2) { return true; } if (str1 == nullptr || str2 == nullptr) { return false; } return str1->caseCompare(*str2, U_FOLD_CASE_DEFAULT) == 0; }