diff options
Diffstat (limited to 'intl/icu/source/common/ucnvhz.cpp')
-rw-r--r-- | intl/icu/source/common/ucnvhz.cpp | 625 |
1 files changed, 625 insertions, 0 deletions
diff --git a/intl/icu/source/common/ucnvhz.cpp b/intl/icu/source/common/ucnvhz.cpp new file mode 100644 index 0000000000..fa0f2b40ea --- /dev/null +++ b/intl/icu/source/common/ucnvhz.cpp @@ -0,0 +1,625 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +********************************************************************** +* Copyright (C) 2000-2015, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* file name: ucnvhz.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000oct16 +* created by: Ram Viswanadha +* 10/31/2000 Ram Implemented offsets logic function +* +*/ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION + +#include "cmemory.h" +#include "unicode/ucnv.h" +#include "unicode/ucnv_cb.h" +#include "unicode/uset.h" +#include "unicode/utf16.h" +#include "ucnv_bld.h" +#include "ucnv_cnv.h" +#include "ucnv_imp.h" + +#define UCNV_TILDE 0x7E /* ~ */ +#define UCNV_OPEN_BRACE 0x7B /* { */ +#define UCNV_CLOSE_BRACE 0x7D /* } */ +#define SB_ESCAPE "\x7E\x7D" +#define DB_ESCAPE "\x7E\x7B" +#define TILDE_ESCAPE "\x7E\x7E" +#define ESC_LEN 2 + + +#define CONCAT_ESCAPE_MACRO(args, targetIndex,targetLength,strToAppend, err, len,sourceIndex) UPRV_BLOCK_MACRO_BEGIN { \ + while(len-->0){ \ + if(targetIndex < targetLength){ \ + args->target[targetIndex] = (unsigned char) *strToAppend; \ + if(args->offsets!=nullptr){ \ + *(offsets++) = sourceIndex-1; \ + } \ + targetIndex++; \ + } \ + else{ \ + args->converter->charErrorBuffer[(int)args->converter->charErrorBufferLength++] = (unsigned char) *strToAppend; \ + *err =U_BUFFER_OVERFLOW_ERROR; \ + } \ + strToAppend++; \ + } \ +} UPRV_BLOCK_MACRO_END + + +typedef struct{ + UConverter* gbConverter; + int32_t targetIndex; + int32_t sourceIndex; + UBool isEscapeAppended; + UBool isStateDBCS; + UBool isTargetUCharDBCS; + UBool isEmptySegment; +}UConverterDataHZ; + + +U_CDECL_BEGIN +static void U_CALLCONV +_HZOpen(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ + UConverter *gbConverter; + if(pArgs->onlyTestIsLoadable) { + ucnv_canCreateConverter("GBK", errorCode); /* errorCode carries result */ + return; + } + gbConverter = ucnv_open("GBK", errorCode); + if(U_FAILURE(*errorCode)) { + return; + } + cnv->toUnicodeStatus = 0; + cnv->fromUnicodeStatus= 0; + cnv->mode=0; + cnv->fromUChar32=0x0000; + cnv->extraInfo = uprv_calloc(1, sizeof(UConverterDataHZ)); + if(cnv->extraInfo != nullptr){ + ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = gbConverter; + } + else { + ucnv_close(gbConverter); + *errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } +} + +static void U_CALLCONV +_HZClose(UConverter *cnv){ + if(cnv->extraInfo != nullptr) { + ucnv_close (((UConverterDataHZ *) (cnv->extraInfo))->gbConverter); + if(!cnv->isExtraLocal) { + uprv_free(cnv->extraInfo); + } + cnv->extraInfo = nullptr; + } +} + +static void U_CALLCONV +_HZReset(UConverter *cnv, UConverterResetChoice choice){ + if(choice<=UCNV_RESET_TO_UNICODE) { + cnv->toUnicodeStatus = 0; + cnv->mode=0; + if(cnv->extraInfo != nullptr){ + ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = false; + ((UConverterDataHZ*)cnv->extraInfo)->isEmptySegment = false; + } + } + if(choice!=UCNV_RESET_TO_UNICODE) { + cnv->fromUnicodeStatus= 0; + cnv->fromUChar32=0x0000; + if(cnv->extraInfo != nullptr){ + ((UConverterDataHZ*)cnv->extraInfo)->isEscapeAppended = false; + ((UConverterDataHZ*)cnv->extraInfo)->targetIndex = 0; + ((UConverterDataHZ*)cnv->extraInfo)->sourceIndex = 0; + ((UConverterDataHZ*)cnv->extraInfo)->isTargetUCharDBCS = false; + } + } +} + +/**************************************HZ Encoding************************************************* +* Rules for HZ encoding +* +* In ASCII mode, a byte is interpreted as an ASCII character, unless a +* '~' is encountered. The character '~' is an escape character. By +* convention, it must be immediately followed ONLY by '~', '{' or '\n' +* (<LF>), with the following special meaning. + +* 1. The escape sequence '~~' is interpreted as a '~'. +* 2. The escape-to-GB sequence '~{' switches the mode from ASCII to GB. +* 3. The escape sequence '~\n' is a line-continuation marker to be +* consumed with no output produced. +* In GB mode, characters are interpreted two bytes at a time as (pure) +* GB codes until the escape-from-GB code '~}' is read. This code +* switches the mode from GB back to ASCII. (Note that the escape- +* from-GB code '~}' ($7E7D) is outside the defined GB range.) +* +* Source: RFC 1842 +* +* Note that the formal syntax in RFC 1842 is invalid. I assume that the +* intended definition of single-byte-segment is as follows (pedberg): +* single-byte-segment = single-byte-seq 1*single-byte-char +*/ + + +static void U_CALLCONV +UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, + UErrorCode* err){ + char tempBuf[2]; + const char *mySource = ( char *) args->source; + char16_t *myTarget = args->target; + const char *mySourceLimit = args->sourceLimit; + UChar32 targetUniChar = 0x0000; + int32_t mySourceChar = 0x0000; + UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo); + tempBuf[0]=0; + tempBuf[1]=0; + + /* Calling code already handles this situation. */ + /*if ((args->converter == nullptr) || (args->targetLimit < args->target) || (mySourceLimit < args->source)){ + *err = U_ILLEGAL_ARGUMENT_ERROR; + return; + }*/ + + while(mySource< mySourceLimit){ + + if(myTarget < args->targetLimit){ + + mySourceChar= (unsigned char) *mySource++; + + if(args->converter->mode == UCNV_TILDE) { + /* second byte after ~ */ + args->converter->mode=0; + switch(mySourceChar) { + case 0x0A: + /* no output for ~\n (line-continuation marker) */ + continue; + case UCNV_TILDE: + if(args->offsets) { + args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2); + } + *(myTarget++)=(char16_t)mySourceChar; + myData->isEmptySegment = false; + continue; + case UCNV_OPEN_BRACE: + case UCNV_CLOSE_BRACE: + myData->isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE); + if (myData->isEmptySegment) { + myData->isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */ + *err = U_ILLEGAL_ESCAPE_SEQUENCE; + args->converter->toUCallbackReason = UCNV_IRREGULAR; + args->converter->toUBytes[0] = UCNV_TILDE; + args->converter->toUBytes[1] = static_cast<uint8_t>(mySourceChar); + args->converter->toULength = 2; + args->target = myTarget; + args->source = mySource; + return; + } + myData->isEmptySegment = true; + continue; + default: + /* if the first byte is equal to TILDE and the trail byte + * is not a valid byte then it is an error condition + */ + /* + * Ticket 5691: consistent illegal sequences: + * - We include at least the first byte in the illegal sequence. + * - If any of the non-initial bytes could be the start of a character, + * we stop the illegal sequence before the first one of those. + */ + myData->isEmptySegment = false; /* different error here, reset this to avoid spurious future error */ + *err = U_ILLEGAL_ESCAPE_SEQUENCE; + args->converter->toUBytes[0] = UCNV_TILDE; + if( myData->isStateDBCS ? + (0x21 <= mySourceChar && mySourceChar <= 0x7e) : + mySourceChar <= 0x7f + ) { + /* The current byte could be the start of a character: Back it out. */ + args->converter->toULength = 1; + --mySource; + } else { + /* Include the current byte in the illegal sequence. */ + args->converter->toUBytes[1] = static_cast<uint8_t>(mySourceChar); + args->converter->toULength = 2; + } + args->target = myTarget; + args->source = mySource; + return; + } + } else if(myData->isStateDBCS) { + if(args->converter->toUnicodeStatus == 0x00){ + /* lead byte */ + if(mySourceChar == UCNV_TILDE) { + args->converter->mode = UCNV_TILDE; + } else { + /* add another bit to distinguish a 0 byte from not having seen a lead byte */ + args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100); + myData->isEmptySegment = false; /* the segment has something, either valid or will produce a different error, so reset this */ + } + continue; + } + else{ + /* trail byte */ + int leadIsOk, trailIsOk; + uint32_t leadByte = args->converter->toUnicodeStatus & 0xff; + targetUniChar = 0xffff; + /* + * Ticket 5691: consistent illegal sequences: + * - We include at least the first byte in the illegal sequence. + * - If any of the non-initial bytes could be the start of a character, + * we stop the illegal sequence before the first one of those. + * + * In HZ DBCS, if the second byte is in the 21..7e range, + * we report only the first byte as the illegal sequence. + * Otherwise we convert or report the pair of bytes. + */ + leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21); + trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); + if (leadIsOk && trailIsOk) { + tempBuf[0] = (char) (leadByte+0x80) ; + tempBuf[1] = (char) (mySourceChar+0x80); + targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData, + tempBuf, 2, args->converter->useFallback); + mySourceChar= (leadByte << 8) | mySourceChar; + } else if (trailIsOk) { + /* report a single illegal byte and continue with the following DBCS starter byte */ + --mySource; + mySourceChar = (int32_t)leadByte; + } else { + /* report a pair of illegal bytes if the second byte is not a DBCS starter */ + /* add another bit so that the code below writes 2 bytes in case of error */ + mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar; + } + args->converter->toUnicodeStatus =0x00; + } + } + else{ + if(mySourceChar == UCNV_TILDE) { + args->converter->mode = UCNV_TILDE; + continue; + } else if(mySourceChar <= 0x7f) { + targetUniChar = (char16_t)mySourceChar; /* ASCII */ + myData->isEmptySegment = false; /* the segment has something valid */ + } else { + targetUniChar = 0xffff; + myData->isEmptySegment = false; /* different error here, reset this to avoid spurious future error */ + } + } + if(targetUniChar < 0xfffe){ + if(args->offsets) { + args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 1-(myData->isStateDBCS)); + } + + *(myTarget++)=(char16_t)targetUniChar; + } + else /* targetUniChar>=0xfffe */ { + if(targetUniChar == 0xfffe){ + *err = U_INVALID_CHAR_FOUND; + } + else{ + *err = U_ILLEGAL_CHAR_FOUND; + } + if(mySourceChar > 0xff){ + args->converter->toUBytes[0] = (uint8_t)(mySourceChar >> 8); + args->converter->toUBytes[1] = (uint8_t)mySourceChar; + args->converter->toULength=2; + } + else{ + args->converter->toUBytes[0] = (uint8_t)mySourceChar; + args->converter->toULength=1; + } + break; + } + } + else{ + *err =U_BUFFER_OVERFLOW_ERROR; + break; + } + } + + args->target = myTarget; + args->source = mySource; +} + + +static void U_CALLCONV +UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args, + UErrorCode * err){ + const char16_t *mySource = args->source; + char *myTarget = args->target; + int32_t* offsets = args->offsets; + int32_t mySourceIndex = 0; + int32_t myTargetIndex = 0; + int32_t targetLength = (int32_t)(args->targetLimit - myTarget); + int32_t mySourceLength = (int32_t)(args->sourceLimit - args->source); + uint32_t targetUniChar = 0x0000; + UChar32 mySourceChar = 0x0000; + UConverterDataHZ *myConverterData=(UConverterDataHZ*)args->converter->extraInfo; + UBool isTargetUCharDBCS = (UBool) myConverterData->isTargetUCharDBCS; + UBool oldIsTargetUCharDBCS; + int len =0; + const char* escSeq=nullptr; + + /* Calling code already handles this situation. */ + /*if ((args->converter == nullptr) || (args->targetLimit < myTarget) || (args->sourceLimit < args->source)){ + *err = U_ILLEGAL_ARGUMENT_ERROR; + return; + }*/ + if(args->converter->fromUChar32!=0 && myTargetIndex < targetLength) { + goto getTrail; + } + /*writing the char to the output stream */ + while (mySourceIndex < mySourceLength){ + targetUniChar = missingCharMarker; + if (myTargetIndex < targetLength){ + + mySourceChar = (char16_t) mySource[mySourceIndex++]; + + + oldIsTargetUCharDBCS = isTargetUCharDBCS; + if(mySourceChar ==UCNV_TILDE){ + /*concatEscape(args, &myTargetIndex, &targetLength,"\x7E\x7E",err,2,&mySourceIndex);*/ + len = ESC_LEN; + escSeq = TILDE_ESCAPE; + CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex); + continue; + } else if(mySourceChar <= 0x7f) { + targetUniChar = mySourceChar; + } else { + int32_t length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData, + mySourceChar,&targetUniChar,args->converter->useFallback); + /* we can only use lead bytes 21..7D and trail bytes 21..7E */ + if( length == 2 && + (uint16_t)(targetUniChar - 0xa1a1) <= (0xfdfe - 0xa1a1) && + (uint8_t)(targetUniChar - 0xa1) <= (0xfe - 0xa1) + ) { + targetUniChar -= 0x8080; + } else { + targetUniChar = missingCharMarker; + } + } + if (targetUniChar != missingCharMarker){ + myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF); + if(oldIsTargetUCharDBCS != isTargetUCharDBCS || !myConverterData->isEscapeAppended ){ + /*Shifting from a double byte to single byte mode*/ + if(!isTargetUCharDBCS){ + len =ESC_LEN; + escSeq = SB_ESCAPE; + CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex); + myConverterData->isEscapeAppended = true; + } + else{ /* Shifting from a single byte to double byte mode*/ + len =ESC_LEN; + escSeq = DB_ESCAPE; + CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex); + myConverterData->isEscapeAppended = true; + + } + } + + if(isTargetUCharDBCS){ + if( myTargetIndex <targetLength){ + myTarget[myTargetIndex++] =(char) (targetUniChar >> 8); + if(offsets){ + *(offsets++) = mySourceIndex-1; + } + if(myTargetIndex < targetLength){ + myTarget[myTargetIndex++] =(char) targetUniChar; + if(offsets){ + *(offsets++) = mySourceIndex-1; + } + }else{ + args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar; + *err = U_BUFFER_OVERFLOW_ERROR; + } + }else{ + args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) (targetUniChar >> 8); + args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar; + *err = U_BUFFER_OVERFLOW_ERROR; + } + + }else{ + if( myTargetIndex <targetLength){ + myTarget[myTargetIndex++] = (char) (targetUniChar ); + if(offsets){ + *(offsets++) = mySourceIndex-1; + } + + }else{ + args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar; + *err = U_BUFFER_OVERFLOW_ERROR; + } + } + + } + else{ + /* oops.. the code point is unassigned */ + /*Handle surrogates */ + /*check if the char is a First surrogate*/ + if(U16_IS_SURROGATE(mySourceChar)) { + if(U16_IS_SURROGATE_LEAD(mySourceChar)) { + args->converter->fromUChar32=mySourceChar; +getTrail: + /*look ahead to find the trail surrogate*/ + if(mySourceIndex < mySourceLength) { + /* test the following code unit */ + char16_t trail=(char16_t) args->source[mySourceIndex]; + if(U16_IS_TRAIL(trail)) { + ++mySourceIndex; + mySourceChar=U16_GET_SUPPLEMENTARY(args->converter->fromUChar32, trail); + args->converter->fromUChar32=0x00; + /* there are no surrogates in GB2312*/ + *err = U_INVALID_CHAR_FOUND; + /* exit this condition tree */ + } else { + /* this is an unmatched lead code unit (1st surrogate) */ + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; + } + } else { + /* no more input */ + *err = U_ZERO_ERROR; + } + } else { + /* this is an unmatched trail code unit (2nd surrogate) */ + /* callback(illegal) */ + *err=U_ILLEGAL_CHAR_FOUND; + } + } else { + /* callback(unassigned) for a BMP code point */ + *err = U_INVALID_CHAR_FOUND; + } + + args->converter->fromUChar32=mySourceChar; + break; + } + } + else{ + *err = U_BUFFER_OVERFLOW_ERROR; + break; + } + targetUniChar=missingCharMarker; + } + + args->target += myTargetIndex; + args->source += mySourceIndex; + myConverterData->isTargetUCharDBCS = isTargetUCharDBCS; +} + +static void U_CALLCONV +_HZ_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) { + UConverter *cnv = args->converter; + UConverterDataHZ *convData=(UConverterDataHZ *) cnv->extraInfo; + char *p; + char buffer[4]; + p = buffer; + + if( convData->isTargetUCharDBCS){ + *p++= UCNV_TILDE; + *p++= UCNV_CLOSE_BRACE; + convData->isTargetUCharDBCS=false; + } + *p++= (char)cnv->subChars[0]; + + ucnv_cbFromUWriteBytes(args, + buffer, (int32_t)(p - buffer), + offsetIndex, err); +} + +/* + * Structure for cloning an HZ converter into a single memory block. + */ +struct cloneHZStruct +{ + UConverter cnv; + UConverter subCnv; + UConverterDataHZ mydata; +}; + + +static UConverter * U_CALLCONV +_HZ_SafeClone(const UConverter *cnv, + void *stackBuffer, + int32_t *pBufferSize, + UErrorCode *status) +{ + struct cloneHZStruct * localClone; + int32_t size, bufferSizeNeeded = sizeof(struct cloneHZStruct); + + if (U_FAILURE(*status)){ + return nullptr; + } + + if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */ + *pBufferSize = bufferSizeNeeded; + return nullptr; + } + + localClone = (struct cloneHZStruct *)stackBuffer; + /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ + + uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataHZ)); + localClone->cnv.extraInfo = &localClone->mydata; + localClone->cnv.isExtraLocal = true; + + /* deep-clone the sub-converter */ + size = (int32_t)sizeof(UConverter); + ((UConverterDataHZ*)localClone->cnv.extraInfo)->gbConverter = + ucnv_safeClone(((UConverterDataHZ*)cnv->extraInfo)->gbConverter, &localClone->subCnv, &size, status); + + return &localClone->cnv; +} + +static void U_CALLCONV +_HZ_GetUnicodeSet(const UConverter *cnv, + const USetAdder *sa, + UConverterUnicodeSet which, + UErrorCode *pErrorCode) { + /* HZ converts all of ASCII */ + sa->addRange(sa->set, 0, 0x7f); + + /* add all of the code points that the sub-converter handles */ + ucnv_MBCSGetFilteredUnicodeSetForUnicode( + ((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, + sa, which, UCNV_SET_FILTER_HZ, + pErrorCode); +} +U_CDECL_END +static const UConverterImpl _HZImpl={ + + UCNV_HZ, + + nullptr, + nullptr, + + _HZOpen, + _HZClose, + _HZReset, + + UConverter_toUnicode_HZ_OFFSETS_LOGIC, + UConverter_toUnicode_HZ_OFFSETS_LOGIC, + UConverter_fromUnicode_HZ_OFFSETS_LOGIC, + UConverter_fromUnicode_HZ_OFFSETS_LOGIC, + nullptr, + + nullptr, + nullptr, + _HZ_WriteSub, + _HZ_SafeClone, + _HZ_GetUnicodeSet, + nullptr, + nullptr +}; + +static const UConverterStaticData _HZStaticData={ + sizeof(UConverterStaticData), + "HZ", + 0, + UCNV_IBM, + UCNV_HZ, + 1, + 4, + { 0x1a, 0, 0, 0 }, + 1, + false, + false, + 0, + 0, + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */ + +}; + +const UConverterSharedData _HZData= + UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_HZStaticData, &_HZImpl); + +#endif /* #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION */ |