summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/common/ucnvbocu.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'intl/icu/source/common/ucnvbocu.cpp')
-rw-r--r--intl/icu/source/common/ucnvbocu.cpp1413
1 files changed, 1413 insertions, 0 deletions
diff --git a/intl/icu/source/common/ucnvbocu.cpp b/intl/icu/source/common/ucnvbocu.cpp
new file mode 100644
index 0000000000..007722e474
--- /dev/null
+++ b/intl/icu/source/common/ucnvbocu.cpp
@@ -0,0 +1,1413 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+******************************************************************************
+*
+* Copyright (C) 2002-2016, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+******************************************************************************
+* file name: ucnvbocu.cpp
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2002mar27
+* created by: Markus W. Scherer
+*
+* This is an implementation of the Binary Ordered Compression for Unicode,
+* in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
+*/
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
+
+#include "unicode/ucnv.h"
+#include "unicode/ucnv_cb.h"
+#include "unicode/utf16.h"
+#include "putilimp.h"
+#include "ucnv_bld.h"
+#include "ucnv_cnv.h"
+#include "uassert.h"
+
+/* BOCU-1 constants and macros ---------------------------------------------- */
+
+/*
+ * BOCU-1 encodes the code points of a Unicode string as
+ * a sequence of byte-encoded differences (slope detection),
+ * preserving lexical order.
+ *
+ * Optimize the difference-taking for runs of Unicode text within
+ * small scripts:
+ *
+ * Most small scripts are allocated within aligned 128-blocks of Unicode
+ * code points. Lexical order is preserved if the "previous code point" state
+ * is always moved into the middle of such a block.
+ *
+ * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
+ * areas into the middle of those areas.
+ *
+ * C0 control codes and space are encoded with their US-ASCII bytes.
+ * "prev" is reset for C0 controls but not for space.
+ */
+
+/* initial value for "prev": middle of the ASCII range */
+#define BOCU1_ASCII_PREV 0x40
+
+/* bounding byte values for differences */
+#define BOCU1_MIN 0x21
+#define BOCU1_MIDDLE 0x90
+#define BOCU1_MAX_LEAD 0xfe
+#define BOCU1_MAX_TRAIL 0xff
+#define BOCU1_RESET 0xff
+
+/* number of lead bytes */
+#define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1)
+
+/* adjust trail byte counts for the use of some C0 control byte values */
+#define BOCU1_TRAIL_CONTROLS_COUNT 20
+#define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)
+
+/* number of trail bytes */
+#define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)
+
+/*
+ * number of positive and negative single-byte codes
+ * (counting 0==BOCU1_MIDDLE among the positive ones)
+ */
+#define BOCU1_SINGLE 64
+
+/* number of lead bytes for positive and negative 2/3/4-byte sequences */
+#define BOCU1_LEAD_2 43
+#define BOCU1_LEAD_3 3
+#define BOCU1_LEAD_4 1
+
+/* The difference value range for single-byters. */
+#define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1)
+#define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE)
+
+/* The difference value range for double-byters. */
+#define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
+#define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
+
+/* The difference value range for 3-byters. */
+#define BOCU1_REACH_POS_3 \
+ (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
+
+#define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
+
+/* The lead byte start values. */
+#define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
+#define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2)
+#define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3)
+ /* ==BOCU1_MAX_LEAD */
+
+#define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
+#define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2)
+#define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3)
+ /* ==BOCU1_MIN+1 */
+
+/* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
+#define BOCU1_LENGTH_FROM_LEAD(lead) \
+ ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
+ (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
+ (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)
+
+/* The length of a byte sequence, according to its packed form. */
+#define BOCU1_LENGTH_FROM_PACKED(packed) \
+ ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
+
+/*
+ * 12 commonly used C0 control codes (and space) are only used to encode
+ * themselves directly,
+ * which makes BOCU-1 MIME-usable and reasonably safe for
+ * ASCII-oriented software.
+ *
+ * These controls are
+ * 0 NUL
+ *
+ * 7 BEL
+ * 8 BS
+ *
+ * 9 TAB
+ * a LF
+ * b VT
+ * c FF
+ * d CR
+ *
+ * e SO
+ * f SI
+ *
+ * 1a SUB
+ * 1b ESC
+ *
+ * The other 20 C0 controls are also encoded directly (to preserve order)
+ * but are also used as trail bytes in difference encoding
+ * (for better compression).
+ */
+#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
+
+/*
+ * Byte value map for control codes,
+ * from external byte values 0x00..0x20
+ * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
+ * External byte values that are illegal as trail bytes are mapped to -1.
+ */
+static const int8_t
+bocu1ByteToTrail[BOCU1_MIN]={
+/* 0 1 2 3 4 5 6 7 */
+ -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
+
+/* 8 9 a b c d e f */
+ -1, -1, -1, -1, -1, -1, -1, -1,
+
+/* 10 11 12 13 14 15 16 17 */
+ 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
+
+/* 18 19 1a 1b 1c 1d 1e 1f */
+ 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
+
+/* 20 */
+ -1
+};
+
+/*
+ * Byte value map for control codes,
+ * from trail byte values 0..19 (0..0x13) as used in the difference calculation
+ * to external byte values 0x00..0x20.
+ */
+static const int8_t
+bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
+/* 0 1 2 3 4 5 6 7 */
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
+
+/* 8 9 a b c d e f */
+ 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
+
+/* 10 11 12 13 */
+ 0x1c, 0x1d, 0x1e, 0x1f
+};
+
+/**
+ * Integer division and modulo with negative numerators
+ * yields negative modulo results and quotients that are one more than
+ * what we need here.
+ * This macro adjust the results so that the modulo-value m is always >=0.
+ *
+ * For positive n, the if() condition is always false.
+ *
+ * @param n Number to be split into quotient and rest.
+ * Will be modified to contain the quotient.
+ * @param d Divisor.
+ * @param m Output variable for the rest (modulo result).
+ */
+#define NEGDIVMOD(n, d, m) UPRV_BLOCK_MACRO_BEGIN { \
+ (m)=(n)%(d); \
+ (n)/=(d); \
+ if((m)<0) { \
+ --(n); \
+ (m)+=(d); \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/* Faster versions of packDiff() for single-byte-encoded diff values. */
+
+/** Is a diff value encodable in a single byte? */
+#define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)
+
+/** Encode a diff value in a single byte. */
+#define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))
+
+/** Is a diff value encodable in two bytes? */
+#define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
+
+/* BOCU-1 implementation functions ------------------------------------------ */
+
+#define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
+
+/**
+ * Compute the next "previous" value for differencing
+ * from the current code point.
+ *
+ * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
+ * @return "previous code point" state value
+ */
+static inline int32_t
+bocu1Prev(int32_t c) {
+ /* compute new prev */
+ if(/* 0x3040<=c && */ c<=0x309f) {
+ /* Hiragana is not 128-aligned */
+ return 0x3070;
+ } else if(0x4e00<=c && c<=0x9fa5) {
+ /* CJK Unihan */
+ return 0x4e00-BOCU1_REACH_NEG_2;
+ } else if(0xac00<=c /* && c<=0xd7a3 */) {
+ /* Korean Hangul */
+ return (0xd7a3+0xac00)/2;
+ } else {
+ /* mostly small scripts */
+ return BOCU1_SIMPLE_PREV(c);
+ }
+}
+
+/** Fast version of bocu1Prev() for most scripts. */
+#define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
+
+/*
+ * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
+ * The UConverter fields are used as follows:
+ *
+ * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
+ *
+ * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
+ * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0)
+ */
+
+/* BOCU-1-from-Unicode conversion functions --------------------------------- */
+
+/**
+ * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
+ * and return a packed integer with them.
+ *
+ * The encoding favors small absolute differences with short encodings
+ * to compress runs of same-script characters.
+ *
+ * Optimized version with unrolled loops and fewer floating-point operations
+ * than the standard packDiff().
+ *
+ * @param diff difference value -0x10ffff..0x10ffff
+ * @return
+ * 0x010000zz for 1-byte sequence zz
+ * 0x0200yyzz for 2-byte sequence yy zz
+ * 0x03xxyyzz for 3-byte sequence xx yy zz
+ * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
+ */
+static int32_t
+packDiff(int32_t diff) {
+ int32_t result, m;
+
+ U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
+ if(diff>=BOCU1_REACH_NEG_1) {
+ /* mostly positive differences, and single-byte negative ones */
+#if 0 /* single-byte case handled in macros, see below */
+ if(diff<=BOCU1_REACH_POS_1) {
+ /* single byte */
+ return 0x01000000|(BOCU1_MIDDLE+diff);
+ } else
+#endif
+ if(diff<=BOCU1_REACH_POS_2) {
+ /* two bytes */
+ diff-=BOCU1_REACH_POS_1+1;
+ result=0x02000000;
+
+ m=diff%BOCU1_TRAIL_COUNT;
+ diff/=BOCU1_TRAIL_COUNT;
+ result|=BOCU1_TRAIL_TO_BYTE(m);
+
+ result|=(BOCU1_START_POS_2+diff)<<8;
+ } else if(diff<=BOCU1_REACH_POS_3) {
+ /* three bytes */
+ diff-=BOCU1_REACH_POS_2+1;
+ result=0x03000000;
+
+ m=diff%BOCU1_TRAIL_COUNT;
+ diff/=BOCU1_TRAIL_COUNT;
+ result|=BOCU1_TRAIL_TO_BYTE(m);
+
+ m=diff%BOCU1_TRAIL_COUNT;
+ diff/=BOCU1_TRAIL_COUNT;
+ result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
+
+ result|=(BOCU1_START_POS_3+diff)<<16;
+ } else {
+ /* four bytes */
+ diff-=BOCU1_REACH_POS_3+1;
+
+ m=diff%BOCU1_TRAIL_COUNT;
+ diff/=BOCU1_TRAIL_COUNT;
+ result=BOCU1_TRAIL_TO_BYTE(m);
+
+ m=diff%BOCU1_TRAIL_COUNT;
+ diff/=BOCU1_TRAIL_COUNT;
+ result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
+
+ /*
+ * We know that / and % would deliver quotient 0 and rest=diff.
+ * Avoid division and modulo for performance.
+ */
+ result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
+
+ result|=((uint32_t)BOCU1_START_POS_4)<<24;
+ }
+ } else {
+ /* two- to four-byte negative differences */
+ if(diff>=BOCU1_REACH_NEG_2) {
+ /* two bytes */
+ diff-=BOCU1_REACH_NEG_1;
+ result=0x02000000;
+
+ NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
+ result|=BOCU1_TRAIL_TO_BYTE(m);
+
+ result|=(BOCU1_START_NEG_2+diff)<<8;
+ } else if(diff>=BOCU1_REACH_NEG_3) {
+ /* three bytes */
+ diff-=BOCU1_REACH_NEG_2;
+ result=0x03000000;
+
+ NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
+ result|=BOCU1_TRAIL_TO_BYTE(m);
+
+ NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
+ result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
+
+ result|=(BOCU1_START_NEG_3+diff)<<16;
+ } else {
+ /* four bytes */
+ diff-=BOCU1_REACH_NEG_3;
+
+ NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
+ result=BOCU1_TRAIL_TO_BYTE(m);
+
+ NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
+ result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
+
+ /*
+ * We know that NEGDIVMOD would deliver
+ * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
+ * Avoid division and modulo for performance.
+ */
+ m=diff+BOCU1_TRAIL_COUNT;
+ result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
+
+ result|=BOCU1_MIN<<24;
+ }
+ }
+ return result;
+}
+
+
+static void U_CALLCONV
+_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
+ UErrorCode *pErrorCode) {
+ UConverter *cnv;
+ const char16_t *source, *sourceLimit;
+ uint8_t *target;
+ int32_t targetCapacity;
+ int32_t *offsets;
+
+ int32_t prev, c, diff;
+
+ int32_t sourceIndex, nextSourceIndex;
+
+ /* set up the local pointers */
+ cnv=pArgs->converter;
+ source=pArgs->source;
+ sourceLimit=pArgs->sourceLimit;
+ target=(uint8_t *)pArgs->target;
+ targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
+ offsets=pArgs->offsets;
+
+ /* get the converter state from UConverter */
+ c=cnv->fromUChar32;
+ prev=(int32_t)cnv->fromUnicodeStatus;
+ if(prev==0) {
+ prev=BOCU1_ASCII_PREV;
+ }
+
+ /* sourceIndex=-1 if the current character began in the previous buffer */
+ sourceIndex= c==0 ? 0 : -1;
+ nextSourceIndex=0;
+
+ /* conversion loop */
+ if(c!=0 && targetCapacity>0) {
+ goto getTrail;
+ }
+
+fastSingle:
+ /* fast loop for single-byte differences */
+ /* use only one loop counter variable, targetCapacity, not also source */
+ diff=(int32_t)(sourceLimit-source);
+ if(targetCapacity>diff) {
+ targetCapacity=diff;
+ }
+ while(targetCapacity>0 && (c=*source)<0x3000) {
+ if(c<=0x20) {
+ if(c!=0x20) {
+ prev=BOCU1_ASCII_PREV;
+ }
+ *target++=(uint8_t)c;
+ *offsets++=nextSourceIndex++;
+ ++source;
+ --targetCapacity;
+ } else {
+ diff=c-prev;
+ if(DIFF_IS_SINGLE(diff)) {
+ prev=BOCU1_SIMPLE_PREV(c);
+ *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
+ *offsets++=nextSourceIndex++;
+ ++source;
+ --targetCapacity;
+ } else {
+ break;
+ }
+ }
+ }
+ /* restore real values */
+ targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
+ sourceIndex=nextSourceIndex; /* wrong if offsets==nullptr but does not matter */
+
+ /* regular loop for all cases */
+ while(source<sourceLimit) {
+ if(targetCapacity>0) {
+ c=*source++;
+ ++nextSourceIndex;
+
+ if(c<=0x20) {
+ /*
+ * ISO C0 control & space:
+ * Encode directly for MIME compatibility,
+ * and reset state except for space, to not disrupt compression.
+ */
+ if(c!=0x20) {
+ prev=BOCU1_ASCII_PREV;
+ }
+ *target++=(uint8_t)c;
+ *offsets++=sourceIndex;
+ --targetCapacity;
+
+ sourceIndex=nextSourceIndex;
+ continue;
+ }
+
+ if(U16_IS_LEAD(c)) {
+getTrail:
+ if(source<sourceLimit) {
+ /* test the following code unit */
+ char16_t trail=*source;
+ if(U16_IS_TRAIL(trail)) {
+ ++source;
+ ++nextSourceIndex;
+ c=U16_GET_SUPPLEMENTARY(c, trail);
+ }
+ } else {
+ /* no more input */
+ c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
+ break;
+ }
+ }
+
+ /*
+ * all other Unicode code points c==U+0021..U+10ffff
+ * are encoded with the difference c-prev
+ *
+ * a new prev is computed from c,
+ * placed in the middle of a 0x80-block (for most small scripts) or
+ * in the middle of the Unihan and Hangul blocks
+ * to statistically minimize the following difference
+ */
+ diff=c-prev;
+ prev=BOCU1_PREV(c);
+ if(DIFF_IS_SINGLE(diff)) {
+ *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
+ *offsets++=sourceIndex;
+ --targetCapacity;
+ sourceIndex=nextSourceIndex;
+ if(c<0x3000) {
+ goto fastSingle;
+ }
+ } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
+ /* optimize 2-byte case */
+ int32_t m;
+
+ if(diff>=0) {
+ diff-=BOCU1_REACH_POS_1+1;
+ m=diff%BOCU1_TRAIL_COUNT;
+ diff/=BOCU1_TRAIL_COUNT;
+ diff+=BOCU1_START_POS_2;
+ } else {
+ diff-=BOCU1_REACH_NEG_1;
+ NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
+ diff+=BOCU1_START_NEG_2;
+ }
+ *target++=(uint8_t)diff;
+ *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
+ *offsets++=sourceIndex;
+ *offsets++=sourceIndex;
+ targetCapacity-=2;
+ sourceIndex=nextSourceIndex;
+ } else {
+ int32_t length; /* will be 2..4 */
+
+ diff=packDiff(diff);
+ length=BOCU1_LENGTH_FROM_PACKED(diff);
+
+ /* write the output character bytes from diff and length */
+ /* from the first if in the loop we know that targetCapacity>0 */
+ if(length<=targetCapacity) {
+ switch(length) {
+ /* each branch falls through to the next one */
+ case 4:
+ *target++=(uint8_t)(diff>>24);
+ *offsets++=sourceIndex;
+ U_FALLTHROUGH;
+ case 3:
+ *target++=(uint8_t)(diff>>16);
+ *offsets++=sourceIndex;
+ U_FALLTHROUGH;
+ case 2:
+ *target++=(uint8_t)(diff>>8);
+ *offsets++=sourceIndex;
+ /* case 1: handled above */
+ *target++=(uint8_t)diff;
+ *offsets++=sourceIndex;
+ U_FALLTHROUGH;
+ default:
+ /* will never occur */
+ break;
+ }
+ targetCapacity-=length;
+ sourceIndex=nextSourceIndex;
+ } else {
+ uint8_t *charErrorBuffer;
+
+ /*
+ * We actually do this backwards here:
+ * In order to save an intermediate variable, we output
+ * first to the overflow buffer what does not fit into the
+ * regular target.
+ */
+ /* we know that 1<=targetCapacity<length<=4 */
+ length-=targetCapacity;
+ charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
+ switch(length) {
+ /* each branch falls through to the next one */
+ case 3:
+ *charErrorBuffer++=(uint8_t)(diff>>16);
+ U_FALLTHROUGH;
+ case 2:
+ *charErrorBuffer++=(uint8_t)(diff>>8);
+ U_FALLTHROUGH;
+ case 1:
+ *charErrorBuffer=(uint8_t)diff;
+ U_FALLTHROUGH;
+ default:
+ /* will never occur */
+ break;
+ }
+ cnv->charErrorBufferLength=(int8_t)length;
+
+ /* now output what fits into the regular target */
+ diff>>=8*length; /* length was reduced by targetCapacity */
+ switch(targetCapacity) {
+ /* each branch falls through to the next one */
+ case 3:
+ *target++=(uint8_t)(diff>>16);
+ *offsets++=sourceIndex;
+ U_FALLTHROUGH;
+ case 2:
+ *target++=(uint8_t)(diff>>8);
+ *offsets++=sourceIndex;
+ U_FALLTHROUGH;
+ case 1:
+ *target++=(uint8_t)diff;
+ *offsets++=sourceIndex;
+ U_FALLTHROUGH;
+ default:
+ /* will never occur */
+ break;
+ }
+
+ /* target overflow */
+ targetCapacity=0;
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ break;
+ }
+ }
+ } else {
+ /* target is full */
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ break;
+ }
+ }
+
+ /* set the converter state back into UConverter */
+ cnv->fromUChar32= c<0 ? -c : 0;
+ cnv->fromUnicodeStatus=(uint32_t)prev;
+
+ /* write back the updated pointers */
+ pArgs->source=source;
+ pArgs->target=(char *)target;
+ pArgs->offsets=offsets;
+}
+
+/*
+ * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
+ * If a change is made in the original function, then either
+ * change this function the same way or
+ * re-copy the original function and remove the variables
+ * offsets, sourceIndex, and nextSourceIndex.
+ */
+static void U_CALLCONV
+_Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
+ UErrorCode *pErrorCode) {
+ UConverter *cnv;
+ const char16_t *source, *sourceLimit;
+ uint8_t *target;
+ int32_t targetCapacity;
+
+ int32_t prev, c, diff;
+
+ /* set up the local pointers */
+ cnv=pArgs->converter;
+ source=pArgs->source;
+ sourceLimit=pArgs->sourceLimit;
+ target=(uint8_t *)pArgs->target;
+ targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
+
+ /* get the converter state from UConverter */
+ c=cnv->fromUChar32;
+ prev=(int32_t)cnv->fromUnicodeStatus;
+ if(prev==0) {
+ prev=BOCU1_ASCII_PREV;
+ }
+
+ /* conversion loop */
+ if(c!=0 && targetCapacity>0) {
+ goto getTrail;
+ }
+
+fastSingle:
+ /* fast loop for single-byte differences */
+ /* use only one loop counter variable, targetCapacity, not also source */
+ diff=(int32_t)(sourceLimit-source);
+ if(targetCapacity>diff) {
+ targetCapacity=diff;
+ }
+ while(targetCapacity>0 && (c=*source)<0x3000) {
+ if(c<=0x20) {
+ if(c!=0x20) {
+ prev=BOCU1_ASCII_PREV;
+ }
+ *target++=(uint8_t)c;
+ } else {
+ diff=c-prev;
+ if(DIFF_IS_SINGLE(diff)) {
+ prev=BOCU1_SIMPLE_PREV(c);
+ *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
+ } else {
+ break;
+ }
+ }
+ ++source;
+ --targetCapacity;
+ }
+ /* restore real values */
+ targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
+
+ /* regular loop for all cases */
+ while(source<sourceLimit) {
+ if(targetCapacity>0) {
+ c=*source++;
+
+ if(c<=0x20) {
+ /*
+ * ISO C0 control & space:
+ * Encode directly for MIME compatibility,
+ * and reset state except for space, to not disrupt compression.
+ */
+ if(c!=0x20) {
+ prev=BOCU1_ASCII_PREV;
+ }
+ *target++=(uint8_t)c;
+ --targetCapacity;
+ continue;
+ }
+
+ if(U16_IS_LEAD(c)) {
+getTrail:
+ if(source<sourceLimit) {
+ /* test the following code unit */
+ char16_t trail=*source;
+ if(U16_IS_TRAIL(trail)) {
+ ++source;
+ c=U16_GET_SUPPLEMENTARY(c, trail);
+ }
+ } else {
+ /* no more input */
+ c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
+ break;
+ }
+ }
+
+ /*
+ * all other Unicode code points c==U+0021..U+10ffff
+ * are encoded with the difference c-prev
+ *
+ * a new prev is computed from c,
+ * placed in the middle of a 0x80-block (for most small scripts) or
+ * in the middle of the Unihan and Hangul blocks
+ * to statistically minimize the following difference
+ */
+ diff=c-prev;
+ prev=BOCU1_PREV(c);
+ if(DIFF_IS_SINGLE(diff)) {
+ *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
+ --targetCapacity;
+ if(c<0x3000) {
+ goto fastSingle;
+ }
+ } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
+ /* optimize 2-byte case */
+ int32_t m;
+
+ if(diff>=0) {
+ diff-=BOCU1_REACH_POS_1+1;
+ m=diff%BOCU1_TRAIL_COUNT;
+ diff/=BOCU1_TRAIL_COUNT;
+ diff+=BOCU1_START_POS_2;
+ } else {
+ diff-=BOCU1_REACH_NEG_1;
+ NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
+ diff+=BOCU1_START_NEG_2;
+ }
+ *target++=(uint8_t)diff;
+ *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
+ targetCapacity-=2;
+ } else {
+ int32_t length; /* will be 2..4 */
+
+ diff=packDiff(diff);
+ length=BOCU1_LENGTH_FROM_PACKED(diff);
+
+ /* write the output character bytes from diff and length */
+ /* from the first if in the loop we know that targetCapacity>0 */
+ if(length<=targetCapacity) {
+ switch(length) {
+ /* each branch falls through to the next one */
+ case 4:
+ *target++=(uint8_t)(diff>>24);
+ U_FALLTHROUGH;
+ case 3:
+ *target++=(uint8_t)(diff>>16);
+ /* case 2: handled above */
+ *target++=(uint8_t)(diff>>8);
+ /* case 1: handled above */
+ *target++=(uint8_t)diff;
+ U_FALLTHROUGH;
+ default:
+ /* will never occur */
+ break;
+ }
+ targetCapacity-=length;
+ } else {
+ uint8_t *charErrorBuffer;
+
+ /*
+ * We actually do this backwards here:
+ * In order to save an intermediate variable, we output
+ * first to the overflow buffer what does not fit into the
+ * regular target.
+ */
+ /* we know that 1<=targetCapacity<length<=4 */
+ length-=targetCapacity;
+ charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
+ switch(length) {
+ /* each branch falls through to the next one */
+ case 3:
+ *charErrorBuffer++=(uint8_t)(diff>>16);
+ U_FALLTHROUGH;
+ case 2:
+ *charErrorBuffer++=(uint8_t)(diff>>8);
+ U_FALLTHROUGH;
+ case 1:
+ *charErrorBuffer=(uint8_t)diff;
+ U_FALLTHROUGH;
+ default:
+ /* will never occur */
+ break;
+ }
+ cnv->charErrorBufferLength=(int8_t)length;
+
+ /* now output what fits into the regular target */
+ diff>>=8*length; /* length was reduced by targetCapacity */
+ switch(targetCapacity) {
+ /* each branch falls through to the next one */
+ case 3:
+ *target++=(uint8_t)(diff>>16);
+ U_FALLTHROUGH;
+ case 2:
+ *target++=(uint8_t)(diff>>8);
+ U_FALLTHROUGH;
+ case 1:
+ *target++=(uint8_t)diff;
+ U_FALLTHROUGH;
+ default:
+ /* will never occur */
+ break;
+ }
+
+ /* target overflow */
+ targetCapacity=0;
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ break;
+ }
+ }
+ } else {
+ /* target is full */
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ break;
+ }
+ }
+
+ /* set the converter state back into UConverter */
+ cnv->fromUChar32= c<0 ? -c : 0;
+ cnv->fromUnicodeStatus=(uint32_t)prev;
+
+ /* write back the updated pointers */
+ pArgs->source=source;
+ pArgs->target=(char *)target;
+}
+
+/* BOCU-1-to-Unicode conversion functions ----------------------------------- */
+
+/**
+ * Function for BOCU-1 decoder; handles multi-byte lead bytes.
+ *
+ * @param b lead byte;
+ * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
+ * @return (diff<<2)|count
+ */
+static inline int32_t
+decodeBocu1LeadByte(int32_t b) {
+ int32_t diff, count;
+
+ if(b>=BOCU1_START_NEG_2) {
+ /* positive difference */
+ if(b<BOCU1_START_POS_3) {
+ /* two bytes */
+ diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
+ count=1;
+ } else if(b<BOCU1_START_POS_4) {
+ /* three bytes */
+ diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
+ count=2;
+ } else {
+ /* four bytes */
+ diff=BOCU1_REACH_POS_3+1;
+ count=3;
+ }
+ } else {
+ /* negative difference */
+ if(b>=BOCU1_START_NEG_3) {
+ /* two bytes */
+ diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
+ count=1;
+ } else if(b>BOCU1_MIN) {
+ /* three bytes */
+ diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
+ count=2;
+ } else {
+ /* four bytes */
+ diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
+ count=3;
+ }
+ }
+
+ /* return the state for decoding the trail byte(s) */
+ return ((uint32_t)diff<<2)|count;
+}
+
+/**
+ * Function for BOCU-1 decoder; handles multi-byte trail bytes.
+ *
+ * @param count number of remaining trail bytes including this one
+ * @param b trail byte
+ * @return new delta for diff including b - <0 indicates an error
+ *
+ * @see decodeBocu1
+ */
+static inline int32_t
+decodeBocu1TrailByte(int32_t count, int32_t b) {
+ if(b<=0x20) {
+ /* skip some C0 controls and make the trail byte range contiguous */
+ b=bocu1ByteToTrail[b];
+ /* b<0 for an illegal trail byte value will result in return<0 below */
+#if BOCU1_MAX_TRAIL<0xff
+ } else if(b>BOCU1_MAX_TRAIL) {
+ return -99;
+#endif
+ } else {
+ b-=BOCU1_TRAIL_BYTE_OFFSET;
+ }
+
+ /* add trail byte into difference and decrement count */
+ if(count==1) {
+ return b;
+ } else if(count==2) {
+ return b*BOCU1_TRAIL_COUNT;
+ } else /* count==3 */ {
+ return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
+ }
+}
+
+static void U_CALLCONV
+_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
+ UErrorCode *pErrorCode) {
+ UConverter *cnv;
+ const uint8_t *source, *sourceLimit;
+ char16_t *target;
+ const char16_t *targetLimit;
+ int32_t *offsets;
+
+ int32_t prev, count, diff, c;
+
+ int8_t byteIndex;
+ uint8_t *bytes;
+
+ int32_t sourceIndex, nextSourceIndex;
+
+ /* set up the local pointers */
+ cnv=pArgs->converter;
+ source=(const uint8_t *)pArgs->source;
+ sourceLimit=(const uint8_t *)pArgs->sourceLimit;
+ target=pArgs->target;
+ targetLimit=pArgs->targetLimit;
+ offsets=pArgs->offsets;
+
+ /* get the converter state from UConverter */
+ prev=(int32_t)cnv->toUnicodeStatus;
+ if(prev==0) {
+ prev=BOCU1_ASCII_PREV;
+ }
+ diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
+ count=diff&3;
+ diff>>=2;
+
+ byteIndex=cnv->toULength;
+ bytes=cnv->toUBytes;
+
+ /* sourceIndex=-1 if the current character began in the previous buffer */
+ sourceIndex=byteIndex==0 ? 0 : -1;
+ nextSourceIndex=0;
+
+ /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
+ if(count>0 && byteIndex>0 && target<targetLimit) {
+ goto getTrail;
+ }
+
+fastSingle:
+ /* fast loop for single-byte differences */
+ /* use count as the only loop counter variable */
+ diff=(int32_t)(sourceLimit-source);
+ count=(int32_t)(pArgs->targetLimit-target);
+ if(count>diff) {
+ count=diff;
+ }
+ while(count>0) {
+ if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
+ c=prev+(c-BOCU1_MIDDLE);
+ if(c<0x3000) {
+ *target++=(char16_t)c;
+ *offsets++=nextSourceIndex++;
+ prev=BOCU1_SIMPLE_PREV(c);
+ } else {
+ break;
+ }
+ } else if(c<=0x20) {
+ if(c!=0x20) {
+ prev=BOCU1_ASCII_PREV;
+ }
+ *target++=(char16_t)c;
+ *offsets++=nextSourceIndex++;
+ } else {
+ break;
+ }
+ ++source;
+ --count;
+ }
+ sourceIndex=nextSourceIndex; /* wrong if offsets==nullptr but does not matter */
+
+ /* decode a sequence of single and lead bytes */
+ while(source<sourceLimit) {
+ if(target>=targetLimit) {
+ /* target is full */
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ break;
+ }
+
+ ++nextSourceIndex;
+ c=*source++;
+ if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
+ /* Write a code point directly from a single-byte difference. */
+ c=prev+(c-BOCU1_MIDDLE);
+ if(c<0x3000) {
+ *target++=(char16_t)c;
+ *offsets++=sourceIndex;
+ prev=BOCU1_SIMPLE_PREV(c);
+ sourceIndex=nextSourceIndex;
+ goto fastSingle;
+ }
+ } else if(c<=0x20) {
+ /*
+ * Direct-encoded C0 control code or space.
+ * Reset prev for C0 control codes but not for space.
+ */
+ if(c!=0x20) {
+ prev=BOCU1_ASCII_PREV;
+ }
+ *target++=(char16_t)c;
+ *offsets++=sourceIndex;
+ sourceIndex=nextSourceIndex;
+ continue;
+ } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
+ /* Optimize two-byte case. */
+ if(c>=BOCU1_MIDDLE) {
+ diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
+ } else {
+ diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
+ }
+
+ /* trail byte */
+ ++nextSourceIndex;
+ c=decodeBocu1TrailByte(1, *source++);
+ if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
+ bytes[0]=source[-2];
+ bytes[1]=source[-1];
+ byteIndex=2;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
+ }
+ } else if(c==BOCU1_RESET) {
+ /* only reset the state, no code point */
+ prev=BOCU1_ASCII_PREV;
+ sourceIndex=nextSourceIndex;
+ continue;
+ } else {
+ /*
+ * For multi-byte difference lead bytes, set the decoder state
+ * with the partial difference value from the lead byte and
+ * with the number of trail bytes.
+ */
+ bytes[0]=(uint8_t)c;
+ byteIndex=1;
+
+ diff=decodeBocu1LeadByte(c);
+ count=diff&3;
+ diff>>=2;
+getTrail:
+ for(;;) {
+ if(source>=sourceLimit) {
+ goto endloop;
+ }
+ ++nextSourceIndex;
+ c=bytes[byteIndex++]=*source++;
+
+ /* trail byte in any position */
+ c=decodeBocu1TrailByte(count, c);
+ if(c<0) {
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ goto endloop;
+ }
+
+ diff+=c;
+ if(--count==0) {
+ /* final trail byte, deliver a code point */
+ byteIndex=0;
+ c=prev+diff;
+ if((uint32_t)c>0x10ffff) {
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ goto endloop;
+ }
+ break;
+ }
+ }
+ }
+
+ /* calculate the next prev and output c */
+ prev=BOCU1_PREV(c);
+ if(c<=0xffff) {
+ *target++=(char16_t)c;
+ *offsets++=sourceIndex;
+ } else {
+ /* output surrogate pair */
+ *target++=U16_LEAD(c);
+ if(target<targetLimit) {
+ *target++=U16_TRAIL(c);
+ *offsets++=sourceIndex;
+ *offsets++=sourceIndex;
+ } else {
+ /* target overflow */
+ *offsets++=sourceIndex;
+ cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
+ cnv->UCharErrorBufferLength=1;
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ break;
+ }
+ }
+ sourceIndex=nextSourceIndex;
+ }
+endloop:
+
+ if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
+ /* set the converter state in UConverter to deal with the next character */
+ cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
+ cnv->mode=0;
+ } else {
+ /* set the converter state back into UConverter */
+ cnv->toUnicodeStatus=(uint32_t)prev;
+ cnv->mode=(int32_t)((uint32_t)diff<<2)|count;
+ }
+ cnv->toULength=byteIndex;
+
+ /* write back the updated pointers */
+ pArgs->source=(const char *)source;
+ pArgs->target=target;
+ pArgs->offsets=offsets;
+ return;
+}
+
+/*
+ * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
+ * If a change is made in the original function, then either
+ * change this function the same way or
+ * re-copy the original function and remove the variables
+ * offsets, sourceIndex, and nextSourceIndex.
+ */
+static void U_CALLCONV
+_Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
+ UErrorCode *pErrorCode) {
+ UConverter *cnv;
+ const uint8_t *source, *sourceLimit;
+ char16_t *target;
+ const char16_t *targetLimit;
+
+ int32_t prev, count, diff, c;
+
+ int8_t byteIndex;
+ uint8_t *bytes;
+
+ /* set up the local pointers */
+ cnv=pArgs->converter;
+ source=(const uint8_t *)pArgs->source;
+ sourceLimit=(const uint8_t *)pArgs->sourceLimit;
+ target=pArgs->target;
+ targetLimit=pArgs->targetLimit;
+
+ /* get the converter state from UConverter */
+ prev=(int32_t)cnv->toUnicodeStatus;
+ if(prev==0) {
+ prev=BOCU1_ASCII_PREV;
+ }
+ diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
+ count=diff&3;
+ diff>>=2;
+
+ byteIndex=cnv->toULength;
+ bytes=cnv->toUBytes;
+
+ /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
+ if(count>0 && byteIndex>0 && target<targetLimit) {
+ goto getTrail;
+ }
+
+fastSingle:
+ /* fast loop for single-byte differences */
+ /* use count as the only loop counter variable */
+ diff=(int32_t)(sourceLimit-source);
+ count=(int32_t)(pArgs->targetLimit-target);
+ if(count>diff) {
+ count=diff;
+ }
+ while(count>0) {
+ if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
+ c=prev+(c-BOCU1_MIDDLE);
+ if(c<0x3000) {
+ *target++=(char16_t)c;
+ prev=BOCU1_SIMPLE_PREV(c);
+ } else {
+ break;
+ }
+ } else if(c<=0x20) {
+ if(c!=0x20) {
+ prev=BOCU1_ASCII_PREV;
+ }
+ *target++=(char16_t)c;
+ } else {
+ break;
+ }
+ ++source;
+ --count;
+ }
+
+ /* decode a sequence of single and lead bytes */
+ while(source<sourceLimit) {
+ if(target>=targetLimit) {
+ /* target is full */
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ break;
+ }
+
+ c=*source++;
+ if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
+ /* Write a code point directly from a single-byte difference. */
+ c=prev+(c-BOCU1_MIDDLE);
+ if(c<0x3000) {
+ *target++=(char16_t)c;
+ prev=BOCU1_SIMPLE_PREV(c);
+ goto fastSingle;
+ }
+ } else if(c<=0x20) {
+ /*
+ * Direct-encoded C0 control code or space.
+ * Reset prev for C0 control codes but not for space.
+ */
+ if(c!=0x20) {
+ prev=BOCU1_ASCII_PREV;
+ }
+ *target++=(char16_t)c;
+ continue;
+ } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
+ /* Optimize two-byte case. */
+ if(c>=BOCU1_MIDDLE) {
+ diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
+ } else {
+ diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
+ }
+
+ /* trail byte */
+ c=decodeBocu1TrailByte(1, *source++);
+ if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
+ bytes[0]=source[-2];
+ bytes[1]=source[-1];
+ byteIndex=2;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
+ }
+ } else if(c==BOCU1_RESET) {
+ /* only reset the state, no code point */
+ prev=BOCU1_ASCII_PREV;
+ continue;
+ } else {
+ /*
+ * For multi-byte difference lead bytes, set the decoder state
+ * with the partial difference value from the lead byte and
+ * with the number of trail bytes.
+ */
+ bytes[0]=(uint8_t)c;
+ byteIndex=1;
+
+ diff=decodeBocu1LeadByte(c);
+ count=diff&3;
+ diff>>=2;
+getTrail:
+ for(;;) {
+ if(source>=sourceLimit) {
+ goto endloop;
+ }
+ c=bytes[byteIndex++]=*source++;
+
+ /* trail byte in any position */
+ c=decodeBocu1TrailByte(count, c);
+ if(c<0) {
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ goto endloop;
+ }
+
+ diff+=c;
+ if(--count==0) {
+ /* final trail byte, deliver a code point */
+ byteIndex=0;
+ c=prev+diff;
+ if((uint32_t)c>0x10ffff) {
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ goto endloop;
+ }
+ break;
+ }
+ }
+ }
+
+ /* calculate the next prev and output c */
+ prev=BOCU1_PREV(c);
+ if(c<=0xffff) {
+ *target++=(char16_t)c;
+ } else {
+ /* output surrogate pair */
+ *target++=U16_LEAD(c);
+ if(target<targetLimit) {
+ *target++=U16_TRAIL(c);
+ } else {
+ /* target overflow */
+ cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
+ cnv->UCharErrorBufferLength=1;
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ break;
+ }
+ }
+ }
+endloop:
+
+ if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
+ /* set the converter state in UConverter to deal with the next character */
+ cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
+ cnv->mode=0;
+ } else {
+ /* set the converter state back into UConverter */
+ cnv->toUnicodeStatus=(uint32_t)prev;
+ cnv->mode=((uint32_t)diff<<2)|count;
+ }
+ cnv->toULength=byteIndex;
+
+ /* write back the updated pointers */
+ pArgs->source=(const char *)source;
+ pArgs->target=target;
+ return;
+}
+
+/* miscellaneous ------------------------------------------------------------ */
+
+static const UConverterImpl _Bocu1Impl={
+ UCNV_BOCU1,
+
+ nullptr,
+ nullptr,
+
+ nullptr,
+ nullptr,
+ nullptr,
+
+ _Bocu1ToUnicode,
+ _Bocu1ToUnicodeWithOffsets,
+ _Bocu1FromUnicode,
+ _Bocu1FromUnicodeWithOffsets,
+ nullptr,
+
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ ucnv_getCompleteUnicodeSet,
+
+ nullptr,
+ nullptr
+};
+
+static const UConverterStaticData _Bocu1StaticData={
+ sizeof(UConverterStaticData),
+ "BOCU-1",
+ 1214, /* CCSID for BOCU-1 */
+ UCNV_IBM, UCNV_BOCU1,
+ 1, 4, /* one char16_t generates at least 1 byte and at most 4 bytes */
+ { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
+ false, false,
+ 0,
+ 0,
+ { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
+};
+
+const UConverterSharedData _Bocu1Data=
+ UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl);
+
+#endif