diff options
Diffstat (limited to 'sal/textenc/tcvtutf7.cxx')
-rw-r--r-- | sal/textenc/tcvtutf7.cxx | 602 |
1 files changed, 602 insertions, 0 deletions
diff --git a/sal/textenc/tcvtutf7.cxx b/sal/textenc/tcvtutf7.cxx new file mode 100644 index 000000000..0f3ea4e7c --- /dev/null +++ b/sal/textenc/tcvtutf7.cxx @@ -0,0 +1,602 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#include <sal/config.h> + +#include <rtl/textcvt.h> + +#include "tenchelp.hxx" +#include "unichars.hxx" + +/* ======================================================================= */ + +unsigned char const aImplBase64Tab[64] = +{ + /* A-Z */ + 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, + /* a-z */ + 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, + /* 0-9,+,/ */ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x2B, 0x2F +}; + +/* Index in Base64Tab or 0xFF, when is an invalid character */ +unsigned char const aImplBase64IndexTab[128] = +{ + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x00-0x07 */ + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x08-0x0F */ + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x10-0x17 */ + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x18-0x1F */ + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x20-0x27 !"#$%&' */ + 0xFF, 0xFF, 0xFF, 62, 0xFF, 0xFF, 0xFF, 63, /* 0x28-0x2F ()*+,-./ */ + 52, 53, 54, 55, 56, 57, 58, 59, /* 0x30-0x37 01234567 */ + 60, 61, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x38-0x3F 89:;<=>? */ + 0xFF, 0, 1, 2, 3, 4, 5, 6, /* 0x40-0x47 @ABCDEFG */ + 7, 8, 9, 10, 11, 12, 13, 14, /* 0x48-0x4F HIJKLMNO */ + 15, 16, 17, 18, 19, 20, 21, 22, /* 0x50-0x57 PQRSTUVW */ + 23, 24, 25, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x58-0x5F XYZ[\]^_ */ + 0xFF, 26, 27, 28, 29, 30, 31, 32, /* 0x60-0x67 `abcdefg */ + 33, 34, 35, 36, 37, 38, 39, 40, /* 0x68-0x6F hijklmno */ + 41, 42, 43, 44, 45, 46, 47, 48, /* 0x70-0x77 pqrstuvw */ + 49, 50, 51, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF /* 0x78-0x7F xyz{|}~ */ +}; + +unsigned char const aImplMustShiftTab[128] = +{ + 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x07 */ + 1, 0, 0, 1, 0, 1, 1, 1, /* 0x08-0x0F 0x09 == HTAB, 0x0A == LF 0x0C == CR */ + 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x17 */ + 1, 1, 1, 1, 1, 1, 1, 1, /* 0x18-0x1F */ + 0, 1, 1, 1, 1, 1, 1, 0, /* 0x20-0x27 !"#$%&' */ + 0, 0, 1, 1, 0, 1, 0, 0, /* 0x28-0x2F ()*+,-./ */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30-0x37 01234567 */ + 0, 0, 0, 1, 1, 1, 1, 0, /* 0x38-0x3F 89:;<=>? */ + 1, 0, 0, 0, 0, 0, 0, 0, /* 0x40-0x47 @ABCDEFG */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x48-0x4F HIJKLMNO */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x50-0x57 PQRSTUVW */ + 0, 0, 0, 1, 1, 1, 1, 1, /* 0x58-0x5F XYZ[\]^_ */ + 1, 0, 0, 0, 0, 0, 0, 0, /* 0x60-0x67 `abcdefg */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x68-0x6F hijklmno */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x70-0x77 pqrstuvw */ + 0, 0, 0, 1, 1, 1, 1, 1 /* 0x78-0x7F xyz{|}~ */ +}; + +/* + */ +#define IMPL_SHIFT_IN_CHAR 0x2B +/* - */ +#define IMPL_SHIFT_OUT_CHAR 0x2D + +/* ----------------------------------------------------------------------- */ + +namespace { + +struct ImplUTF7ToUCContextData +{ + bool mbShifted; + bool mbFirst; + bool mbWroteOne; + sal_uInt32 mnBitBuffer; + sal_uInt32 mnBufferBits; +}; + +} + +/* ----------------------------------------------------------------------- */ + +void* ImplUTF7CreateUTF7TextToUnicodeContext() +{ + ImplUTF7ToUCContextData* pContextData = new ImplUTF7ToUCContextData; + pContextData->mbShifted = false; + pContextData->mbFirst = false; + pContextData->mbWroteOne = false; + pContextData->mnBitBuffer = 0; + pContextData->mnBufferBits = 0; + return pContextData; +} + +/* ----------------------------------------------------------------------- */ + +void ImplUTF7DestroyTextToUnicodeContext( void* pContext ) +{ + delete static_cast< ImplUTF7ToUCContextData * >(pContext); +} + +/* ----------------------------------------------------------------------- */ + +void ImplUTF7ResetTextToUnicodeContext( void* pContext ) +{ + ImplUTF7ToUCContextData* pContextData = static_cast<ImplUTF7ToUCContextData*>(pContext); + pContextData->mbShifted = false; + pContextData->mbFirst = false; + pContextData->mbWroteOne = false; + pContextData->mnBitBuffer = 0; + pContextData->mnBufferBits = 0; +} + +/* ----------------------------------------------------------------------- */ + +sal_Size ImplUTF7ToUnicode( SAL_UNUSED_PARAMETER const void*, void* pContext, + const char* pSrcBuf, sal_Size nSrcBytes, + sal_Unicode* pDestBuf, sal_Size nDestChars, + sal_uInt32 nFlags, sal_uInt32* pInfo, + sal_Size* pSrcCvtBytes ) +{ + ImplUTF7ToUCContextData* pContextData = static_cast<ImplUTF7ToUCContextData*>(pContext); + unsigned char c ='\0'; + unsigned char nBase64Value = 0; + bool bEnd = false; + bool bShifted; + bool bFirst; + bool bWroteOne; + bool bBase64End; + sal_uInt32 nBitBuffer; + sal_uInt32 nBitBufferTemp; + sal_uInt32 nBufferBits; + sal_Unicode* pEndDestBuf; + const char* pEndSrcBuf; + +/* !!! Implementation not finished !!! + if ( pContextData ) + { + bShifted = pContextData->mbShifted; + bFirst = pContextData->mbFirst; + bWroteOne = pContextData->mbWroteOne; + nBitBuffer = pContextData->mnBitBuffer; + nBufferBits = pContextData->mnBufferBits; + } + else +*/ + { + bShifted = false; + bFirst = false; + bWroteOne = false; + nBitBuffer = 0; + nBufferBits = 0; + } + + *pInfo = 0; + pEndDestBuf = pDestBuf+nDestChars; + pEndSrcBuf = pSrcBuf+nSrcBytes; + do + { + if ( pSrcBuf < pEndSrcBuf ) + { + c = static_cast<unsigned char>(*pSrcBuf); + + /* End, when not a base64 character */ + bBase64End = false; + if ( c <= 0x7F ) + { + nBase64Value = aImplBase64IndexTab[c]; + if ( nBase64Value == 0xFF ) + bBase64End = true; + } + } + else + { + bEnd = true; + bBase64End = true; + } + + if ( bShifted ) + { + if ( bBase64End ) + { + bShifted = false; + + /* If the character causing us to drop out was SHIFT_IN */ + /* or SHIFT_OUT, it may be a special escape for SHIFT_IN. */ + /* The test for SHIFT_IN is not necessary, but allows */ + /* an alternate form of UTF-7 where SHIFT_IN is escaped */ + /* by SHIFT_IN. This only works for some values of */ + /* SHIFT_IN. It is so implemented, because this comes */ + /* from the official unicode book (The Unicode Standard, */ + /* Version 2.0) and so I think, that someone of the */ + /* world has used this feature. */ + if ( !bEnd ) + { + if ( (c == IMPL_SHIFT_IN_CHAR) || (c == IMPL_SHIFT_OUT_CHAR) ) + { + /* If no base64 character, and the terminating */ + /* character of the shift sequence was the */ + /* SHIFT_OUT_CHAR, then it't a special escape */ + /* for SHIFT_IN_CHAR. */ + if ( bFirst && (c == IMPL_SHIFT_OUT_CHAR) ) + { + if ( pDestBuf >= pEndDestBuf ) + { + *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL; + break; + } + *pDestBuf = IMPL_SHIFT_IN_CHAR; + pDestBuf++; + bWroteOne = true; + } + + /* Skip character */ + pSrcBuf++; + if ( pSrcBuf < pEndSrcBuf ) + c = static_cast<unsigned char>(*pSrcBuf); + else + bEnd = true; + } + } + + /* Empty sequence not allowed, so when we don't write one */ + /* valid char, then the sequence is corrupt */ + if ( !bWroteOne ) + { + /* When no more bytes in the source buffer, then */ + /* this buffer may be too small */ + if ( bEnd ) + *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR | RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL; + else + { + *pInfo |= RTL_TEXTTOUNICODE_INFO_INVALID; + if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR ) + { + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) { + if (!bEnd) { + ++pSrcBuf; + } + } else { + //TODO: move pSrcBuf back to a reasonable starting place + } + *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR; + break; + } + /* We insert here no default char, because I think */ + /* this is better to ignore this */ + } + } + } + else + { + /* Add 6 Bits from character to the bit buffer */ + nBufferBits += 6; + nBitBuffer |= static_cast<sal_uInt32>(nBase64Value & 0x3F) << (32-nBufferBits); + bFirst = false; + } + + /* Extract as many full 16 bit characters as possible from the */ + /* bit buffer. */ + while ( (pDestBuf < pEndDestBuf) && (nBufferBits >= 16) ) + { + nBitBufferTemp = nBitBuffer >> (32-16); + *pDestBuf = static_cast<sal_Unicode>(nBitBufferTemp & 0xFFFF); + pDestBuf++; + nBitBuffer <<= 16; + nBufferBits -= 16; + bWroteOne = true; + } + + if ( nBufferBits >= 16 ) + { + *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL; + break; + } + + if ( bBase64End ) + { + /* Sequence ended and we have some bits, then the */ + /* sequence is corrupted */ + if ( nBufferBits && nBitBuffer ) + { + /* When no more bytes in the source buffer, then */ + /* this buffer may be too small */ + if ( bEnd ) + *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR | RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL; + else + { + *pInfo |= RTL_TEXTTOUNICODE_INFO_INVALID; + if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR ) + { + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) { + if (!bEnd) { + ++pSrcBuf; + } + } else { + //TODO: move pSrcBuf back to a reasonable starting place + } + *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR; + break; + } + if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) != RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE ) + { + if ( pDestBuf >= pEndDestBuf ) + { + *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL; + break; + } + *pDestBuf++ + = RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER; + } + } + + } + + nBitBuffer = 0; + nBufferBits = 0; + } + } + + if ( !bEnd ) + { + if ( !bShifted ) + { + if ( c == IMPL_SHIFT_IN_CHAR ) + { + bShifted = true; + bFirst = true; + bWroteOne = false; + } + else + { + /* No direct encoded character, then the buffer is */ + /* corrupt */ + if ( c > 0x7F ) + { + *pInfo |= RTL_TEXTTOUNICODE_INFO_INVALID; + if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR ) + { + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) { + ++pSrcBuf; + } + *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR; + break; + } + if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) != RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE ) + { + if ( pDestBuf >= pEndDestBuf ) + { + *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL; + break; + } + *pDestBuf++ + = RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER; + } + } + else + { + /* Write char to unicode buffer */ + if ( pDestBuf >= pEndDestBuf ) + { + *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL; + break; + } + *pDestBuf = c; + pDestBuf++; + + } + } + } + + pSrcBuf++; + } + } + while ( !bEnd ); + + if ( pContextData ) + { + pContextData->mbShifted = bShifted; + pContextData->mbFirst = bFirst; + pContextData->mbWroteOne = bWroteOne; + pContextData->mnBitBuffer = nBitBuffer; + pContextData->mnBufferBits = nBufferBits; + } + + *pSrcCvtBytes = nSrcBytes - (pEndSrcBuf-pSrcBuf); + return (nDestChars - (pEndDestBuf-pDestBuf)); +} + +/* ======================================================================= */ + +namespace { + +struct ImplUTF7FromUCContextData +{ + bool mbShifted; + sal_uInt32 mnBitBuffer; + sal_uInt32 mnBufferBits; +}; + +} + +/* ----------------------------------------------------------------------- */ + +void* ImplUTF7CreateUnicodeToTextContext() +{ + ImplUTF7FromUCContextData* pContextData = new ImplUTF7FromUCContextData; + pContextData->mbShifted = false; + pContextData->mnBitBuffer = 0; + pContextData->mnBufferBits = 0; + return pContextData; +} + +/* ----------------------------------------------------------------------- */ + +void ImplUTF7DestroyUnicodeToTextContext( void* pContext ) +{ + delete static_cast< ImplUTF7FromUCContextData * >(pContext); +} + +/* ----------------------------------------------------------------------- */ + +void ImplUTF7ResetUnicodeToTextContext( void* pContext ) +{ + ImplUTF7FromUCContextData* pContextData = static_cast<ImplUTF7FromUCContextData*>(pContext); + pContextData->mbShifted = false; + pContextData->mnBitBuffer = 0; + pContextData->mnBufferBits = 0; +} + +/* ----------------------------------------------------------------------- */ + +sal_Size ImplUnicodeToUTF7( SAL_UNUSED_PARAMETER const void*, void* pContext, + const sal_Unicode* pSrcBuf, sal_Size nSrcChars, + char* pDestBuf, sal_Size nDestBytes, + SAL_UNUSED_PARAMETER sal_uInt32, sal_uInt32* pInfo, + sal_Size* pSrcCvtChars ) +{ + ImplUTF7FromUCContextData* pContextData = static_cast<ImplUTF7FromUCContextData*>(pContext); + sal_Unicode c = '\0'; + bool bEnd = false; + bool bShifted; + bool bNeedShift; + sal_uInt32 nBitBuffer; + sal_uInt32 nBitBufferTemp; + sal_uInt32 nBufferBits; + char* pEndDestBuf; + const sal_Unicode* pEndSrcBuf; + +/* !!! Implementation not finished !!! + if ( pContextData ) + { + bShifted = pContextData->mbShifted; + nBitBuffer = pContextData->mnBitBuffer; + nBufferBits = pContextData->mnBufferBits; + } + else +*/ + { + bShifted = false; + nBitBuffer = 0; + nBufferBits = 0; + } + + *pInfo = 0; + pEndDestBuf = pDestBuf+nDestBytes; + pEndSrcBuf = pSrcBuf+nSrcChars; + do + { + if ( pSrcBuf < pEndSrcBuf ) + { + c = *pSrcBuf; + + bNeedShift = (c > 0x7F) || aImplMustShiftTab[c]; + if ( bNeedShift && !bShifted ) + { + if ( pDestBuf >= pEndDestBuf ) + { + *pInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; + break; + } + *pDestBuf = IMPL_SHIFT_IN_CHAR; + pDestBuf++; + /* Special case handling for SHIFT_IN_CHAR */ + if ( c == IMPL_SHIFT_IN_CHAR ) + { + if ( pDestBuf >= pEndDestBuf ) + { + *pInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; + break; + } + *pDestBuf = IMPL_SHIFT_OUT_CHAR; + pDestBuf++; + } + else + bShifted = true; + } + } + else + { + bEnd = true; + bNeedShift = false; + } + + if ( bShifted ) + { + /* Write the character to the bit buffer, or pad the bit */ + /* buffer out to a full base64 character */ + if ( bNeedShift ) + { + nBufferBits += 16; + nBitBuffer |= static_cast<sal_uInt32>(c) << (32-nBufferBits); + } + else + nBufferBits += (6-(nBufferBits%6))%6; + + /* Flush out as many full base64 characters as possible */ + while ( (pDestBuf < pEndDestBuf) && (nBufferBits >= 6) ) + { + nBitBufferTemp = nBitBuffer >> (32-6); + *pDestBuf = aImplBase64Tab[nBitBufferTemp]; + pDestBuf++; + nBitBuffer <<= 6; + nBufferBits -= 6; + } + + if ( nBufferBits >= 6 ) + { + *pInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; + break; + } + + /* Write SHIFT_OUT_CHAR, when needed */ + if ( !bNeedShift ) + { + if ( pDestBuf >= pEndDestBuf ) + { + *pInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; + break; + } + *pDestBuf = IMPL_SHIFT_OUT_CHAR; + pDestBuf++; + bShifted = false; + } + } + + if ( !bEnd ) + { + /* Character can be directly encoded */ + if ( !bNeedShift ) + { + if ( pDestBuf >= pEndDestBuf ) + { + *pInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; + break; + } + *pDestBuf = static_cast< char >(static_cast< unsigned char >(c)); + pDestBuf++; + } + + pSrcBuf++; + } + } + while ( !bEnd ); + + if ( pContextData ) + { + pContextData->mbShifted = bShifted; + pContextData->mnBitBuffer = nBitBuffer; + pContextData->mnBufferBits = nBufferBits; + } + + *pSrcCvtChars = nSrcChars - (pEndSrcBuf-pSrcBuf); + return (nDestBytes - (pEndDestBuf-pDestBuf)); +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |