1 files changed, 417 insertions, 0 deletions
diff --git a/intl/icu/source/common/unistr_cnv.cpp b/intl/icu/source/common/unistr_cnv.cpp
new file mode 100644
index 0000000000..2d649b2d51
--- /dev/null
+++ b/intl/icu/source/common/unistr_cnv.cpp
@@ -0,0 +1,417 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+*   Copyright (C) 1999-2014, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+*
+*******************************************************************************
+*   file name:  unistr_cnv.cpp
+*   encoding:   UTF-8
+*   tab size:   8 (not used)
+*   indentation:2
+*
+*   created on: 2004aug19
+*   created by: Markus W. Scherer
+*
+*   Character conversion functions moved here from unistr.cpp
+*/
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_CONVERSION
+
+#include "unicode/putil.h"
+#include "cstring.h"
+#include "cmemory.h"
+#include "unicode/ustring.h"
+#include "unicode/unistr.h"
+#include "unicode/ucnv.h"
+#include "ucnv_imp.h"
+#include "putilimp.h"
+#include "ustr_cnv.h"
+#include "ustr_imp.h"
+
+U_NAMESPACE_BEGIN
+
+//========================================
+// Constructors
+//========================================
+
+#if !U_CHARSET_IS_UTF8
+
+// Starts empty; non-null NUL-terminated input goes to doCodepageCreate() with a null codepage.
+UnicodeString::UnicodeString(const char *codepageData) {
+    fUnion.fFields.fLengthAndFlags = kShortString;
+    if(codepageData == nullptr) { return; }
+    doCodepageCreate(codepageData, static_cast<int32_t>(uprv_strlen(codepageData)), nullptr);
+}
+
+// Starts empty; non-null input of dataLength bytes goes to doCodepageCreate()
+// with a null codepage, which selects the default codepage.
+UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
+    fUnion.fFields.fLengthAndFlags = kShortString;
+    if(codepageData == nullptr) { return; }
+    doCodepageCreate(codepageData, dataLength, nullptr);
+}
+
+// else see unistr.cpp
+#endif
+
+// Starts empty; non-null NUL-terminated input is converted from the named
+// codepage (null selects the default, "" the invariant-character conversion).
+UnicodeString::UnicodeString(const char *codepageData, const char *codepage) {
+    fUnion.fFields.fLengthAndFlags = kShortString;
+    if(codepageData == nullptr) { return; }
+    doCodepageCreate(codepageData, static_cast<int32_t>(uprv_strlen(codepageData)), codepage);
+}
+
+// Starts empty; non-null input of dataLength bytes is converted from the named
+// codepage (null selects the default, "" the invariant-character conversion).
+// A dataLength of -1 makes doCodepageCreate() measure the input with strlen.
+UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength, const char *codepage) {
+    fUnion.fFields.fLengthAndFlags = kShortString;
+    if(codepageData == nullptr) { return; }
+    doCodepageCreate(codepageData, dataLength, codepage);
+}
+
+// Converts srcLength bytes at src (-1: NUL-terminated) with cnv, or with the
+// default converter when cnv is null. Does nothing more if errorCode already
+// holds a failure; a failure raised here leaves the string bogus.
+UnicodeString::UnicodeString(const char *src, int32_t srcLength,
+                             UConverter *cnv, UErrorCode &errorCode) {
+    fUnion.fFields.fLengthAndFlags = kShortString;
+    if(U_FAILURE(errorCode)) {
+        return;
+    }
+
+    if(src==nullptr) {
+        return;  // a null source is treated as an empty string
+    }
+    if(srcLength<-1) {
+        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+    } else {
+        const int32_t byteCount= srcLength==-1 ? static_cast<int32_t>(uprv_strlen(src)) : srcLength;
+        if(byteCount>0) {
+            if(cnv!=nullptr) {
+                // use the provided converter
+                ucnv_resetToUnicode(cnv);
+                doCodepageCreate(src, byteCount, cnv, errorCode);
+            } else {
+                // borrow the default converter and hand it back afterwards
+                UConverter *defaultCnv=u_getDefaultConverter(&errorCode);
+                doCodepageCreate(src, byteCount, defaultCnv, errorCode);
+                u_releaseDefaultConverter(defaultCnv);
+            }
+        }
+    }
+    if(U_FAILURE(errorCode)) {
+        setToBogus();
+    }
+}
+
+//========================================
+// Codeset conversion
+//========================================
+
+#if !U_CHARSET_IS_UTF8
+
+// Default-codepage extract: forwards to the five-argument overload with a null
+// codepage and returns its result unchanged.
+int32_t
+UnicodeString::extract(int32_t start, int32_t length,
+                       char *target, uint32_t dstSize) const {
+    return extract(start, length, target, dstSize, static_cast<const char *>(nullptr));
+}
+
+// else see unistr.cpp
+#endif
+
+// Converts the substring [start, start+length) to bytes in the named codepage.
+// start and length are pinned to legal values first.
+// target/dstSize describe the output buffer; a dstSize of 0x7fffffff or more is
+// pinned so that a limit pointer derived from it cannot wrap around.
+// codepage: null selects the default codepage, "" the invariant-character
+// conversion, any other name is opened with ucnv_open().
+// Returns what toUTF8(), u_terminateChars() or doExtract() report, or 0 when
+// target is null although dstSize is nonzero.
+int32_t
+UnicodeString::extract(int32_t start, int32_t length,
+                       char *target, uint32_t dstSize,
+                       const char *codepage) const
+{
+    // illegal arguments: do nothing (dstSize is unsigned and cannot be negative)
+    if(dstSize > 0 && target == nullptr) {
+        return 0;
+    }
+
+    // pin the indices to legal values
+    pinIndices(start, length);
+
+    // All subsequent code needs the capacity as an int32_t even though the API
+    // takes a uint32_t. dstSize==0xffffffff means "unlimited", but target+dstSize
+    // used as a limit could wrap around and compare less-than target.
+    int32_t capacity;
+    if(dstSize >= 0x7fffffff) {
+        // Pin the capacity: U_MAX_PTR(target) is at most 0x7fffffff greater than
+        // target and does not wrap around the top of the address space.
+        char *targetLimit = (char *)U_MAX_PTR(target);
+        capacity = (int32_t)(targetLimit - target);
+    } else {
+        // Assume that the capacity is real and a limit pointer won't wrap around.
+        capacity = (int32_t)dstSize;
+    }
+
+    UErrorCode status = U_ZERO_ERROR;
+
+    // just write the NUL if the string length is 0
+    if(length == 0) {
+        return u_terminateChars(target, capacity, 0, &status);
+    }
+
+    if(codepage != nullptr && *codepage == 0) {
+        // "" requests the "invariant characters" conversion; copy what fits
+        const int32_t destLength = length <= capacity ? length : capacity;
+        u_UCharsToChars(getArrayStart() + start, target, destLength);
+        return u_terminateChars(target, capacity, length, &status);
+    }
+
+    // pick the converter: the cached default one, or one opened by name
+    const UBool useDefault = (codepage == nullptr);
+    UConverter *converter;
+    if(useDefault) {
+        // a UTF-8 default codepage is handled by toUTF8() without a converter
+        const char *defaultName = ucnv_getDefaultName();
+        if(UCNV_FAST_IS_UTF8(defaultName)) {
+            return toUTF8(start, length, target, capacity);
+        }
+        converter = u_getDefaultConverter(&status);
+    } else {
+        converter = ucnv_open(codepage, &status);
+    }
+
+    const int32_t result = doExtract(start, length, target, capacity, converter, status);
+
+    // hand the converter back the same way it was obtained
+    if(useDefault) {
+        u_releaseDefaultConverter(converter);
+    } else {
+        ucnv_close(converter);
+    }
+
+    return result;
+}
+
+// Converts the whole string into dest (capacity destCapacity) with cnv, or with
+// the default converter when cnv is null. Returns 0 if errorCode is already a
+// failure, or (setting U_ILLEGAL_ARGUMENT_ERROR) for a bogus string, a negative
+// capacity, or a null dest with a positive capacity.
+int32_t
+UnicodeString::extract(char *dest, int32_t destCapacity,
+                       UConverter *cnv,
+                       UErrorCode &errorCode) const
+{
+    if(U_FAILURE(errorCode)) {
+        return 0;
+    }
+
+    if(isBogus() || destCapacity<0 || (dest==nullptr && destCapacity>0)) {
+        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    // nothing to convert: let u_terminateChars() finish with length 0
+    if(isEmpty()) {
+        return u_terminateChars(dest, destCapacity, 0, &errorCode);
+    }
+
+    // choose the converter
+    const UBool borrowedDefault=(cnv==nullptr);
+    if(borrowedDefault) {
+        cnv=u_getDefaultConverter(&errorCode);
+        if(U_FAILURE(errorCode)) {
+            return 0;
+        }
+    } else {
+        ucnv_resetFromUnicode(cnv);
+    }
+
+    const int32_t resultLength=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
+
+    // only a borrowed default converter is handed back
+    if(borrowedDefault) {
+        u_releaseDefaultConverter(cnv);
+    }
+    return resultLength;
+}
+
+// Converts `length` UChars starting at index `start` into dest with cnv and
+// returns the value of the final u_terminateChars() call. If errorCode is
+// already a failure, writes a NUL (when destCapacity is nonzero) and returns 0.
+int32_t
+UnicodeString::doExtract(int32_t start, int32_t length, char *dest, int32_t destCapacity,
+                         UConverter *cnv, UErrorCode &errorCode) const
+{
+    if(U_FAILURE(errorCode)) {
+        if(destCapacity!=0) { *dest=0; }
+        return 0;
+    }
+
+    const char16_t *src=getArrayStart()+start, *srcLimit=src+length;
+    char *originalDest=dest;
+    const char *destLimit;
+
+    if(destCapacity==0) {
+        destLimit=dest=nullptr;
+    } else if(destCapacity==-1) {
+        // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
+        destLimit=(char*)U_MAX_PTR(dest);
+        // for NUL-termination, translate into highest int32_t
+        destCapacity=0x7fffffff;
+    } else {
+        destLimit=dest+destCapacity;
+    }
+
+    // Measure output from the pointer actually handed to the converter: with
+    // destCapacity==0 dest was just nulled, so dest-originalDest could be garbage.
+    char *const convertStart=dest;
+    ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, nullptr, true, &errorCode);
+    length=(int32_t)(dest-convertStart);
+
+    // if an overflow occurs, then get the preflighting length
+    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
+        char buffer[1024];
+        destLimit=buffer+sizeof(buffer);
+        do {
+            dest=buffer;
+            errorCode=U_ZERO_ERROR;
+            ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, nullptr, true, &errorCode);
+            length+=(int32_t)(dest-buffer);
+        } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
+    }
+
+    return u_terminateChars(originalDest, destCapacity, length, &errorCode);
+}
+
+// Converts dataLength bytes at codepageData (-1: NUL-terminated) from codepage
+// (null: default codepage, "": invariant characters) into this string. Null or
+// empty input, or a length below -1, changes nothing; failures leave it bogus.
+void
+UnicodeString::doCodepageCreate(const char *codepageData, int32_t dataLength,
+                                const char *codepage)
+{
+    // if there's nothing to convert, do nothing
+    if(codepageData == nullptr || dataLength == 0 || dataLength < -1) {
+        return;
+    }
+    if(dataLength == -1) {
+        dataLength = static_cast<int32_t>(uprv_strlen(codepageData));
+    }
+
+    if(codepage != nullptr && *codepage == 0) {
+        // "" requests the "invariant characters" conversion
+        if(!cloneArrayIfNeeded(dataLength, dataLength, false)) {
+            setToBogus();
+            return;
+        }
+        u_charsToUChars(codepageData, getArrayStart(), dataLength);
+        setLength(dataLength);
+        return;
+    }
+
+    // pick the converter: the cached default one, or one opened by name
+    UErrorCode status = U_ZERO_ERROR;
+    const UBool useDefault = (codepage == nullptr);
+    UConverter *converter;
+    if(useDefault) {
+        // a UTF-8 default codepage is handled by setToUTF8() without a converter
+        const char *defaultName = ucnv_getDefaultName();
+        if(UCNV_FAST_IS_UTF8(defaultName)) {
+            setToUTF8(StringPiece(codepageData, dataLength));
+            return;
+        }
+        converter = u_getDefaultConverter(&status);
+    } else {
+        converter = ucnv_open(codepage, &status);
+    }
+    if(U_FAILURE(status)) {
+        setToBogus();
+        return;
+    }
+
+    doCodepageCreate(codepageData, dataLength, converter, status);
+    if(U_FAILURE(status)) {
+        setToBogus();
+    }
+
+    // hand the converter back the same way it was obtained
+    if(useDefault) {
+        u_releaseDefaultConverter(converter);
+    } else {
+        ucnv_close(converter);
+    }
+}
+
+// Converts dataLength bytes at codepageData with converter into this string,
+// enlarging the buffer and retrying while U_BUFFER_OVERFLOW_ERROR is reported.
+// No-op if status is already a failure. If a buffer cannot be obtained or would
+// exceed int32_t, the string becomes bogus and status U_MEMORY_ALLOCATION_ERROR.
+void
+UnicodeString::doCodepageCreate(const char *codepageData, int32_t dataLength,
+                                UConverter *converter, UErrorCode &status)
+{
+    if(U_FAILURE(status)) {
+        return;
+    }
+
+    const char *mySource     = codepageData;
+    const char *mySourceEnd  = mySource + dataLength;
+    char16_t *array, *myTarget;
+
+    int32_t arraySize;
+    if(dataLength <= US_STACKBUF_SIZE) {
+        arraySize = US_STACKBUF_SIZE;  // try to use the stack buffer
+    } else {
+        // estimate 1.25 char16_t's per source byte; use 64 bits, the sum can exceed int32_t
+        const int64_t estimate = (int64_t)dataLength + (dataLength >> 2);
+        arraySize = estimate <= 0x7fffffff ? (int32_t)estimate : dataLength;
+    }
+
+    // we do not care about the current contents
+    UBool doCopyArray = false;
+    for(;;) {
+        if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
+            // report the failure through status too, not only via the bogus state
+            setToBogus();
+            status = U_MEMORY_ALLOCATION_ERROR;
+            break;
+        }
+        // perform the conversion and record how much has been produced
+        array = getArrayStart();
+        myTarget = array + length();
+        ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),
+            &mySource, mySourceEnd, nullptr, true, &status);
+        setLength((int32_t)(myTarget - array));
+        if(status != U_BUFFER_OVERFLOW_ERROR) {
+            break;
+        }
+
+        // overflow: clear the error, keep the results so far, and request 2 char16_t's per
+        // remaining byte -- but always more than the current capacity, so a retry has new room
+        status = U_ZERO_ERROR;
+        doCopyArray = true;
+        int64_t needed = (int64_t)length() + 2 * (int64_t)(mySourceEnd - mySource);
+        if(needed <= getCapacity()) { needed = (int64_t)getCapacity() + 1; }
+        if(needed > 0x7fffffff) {
+            setToBogus();
+            status = U_MEMORY_ALLOCATION_ERROR;
+            break;
+        }
+        arraySize = (int32_t)needed;
+    }
+}
+
+U_NAMESPACE_END
+
+#endif