From 26a029d407be480d791972afb5975cf62c9360a6 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 02:47:55 +0200 Subject: Adding upstream version 124.0.1. Signed-off-by: Daniel Baumann --- .../7zstub/src/CPP/Common/UTFConvert.cpp | 288 +++++++++++++++++++++ 1 file changed, 288 insertions(+) create mode 100644 other-licenses/7zstub/src/CPP/Common/UTFConvert.cpp (limited to 'other-licenses/7zstub/src/CPP/Common/UTFConvert.cpp') diff --git a/other-licenses/7zstub/src/CPP/Common/UTFConvert.cpp b/other-licenses/7zstub/src/CPP/Common/UTFConvert.cpp new file mode 100644 index 0000000000..b09bbcdb37 --- /dev/null +++ b/other-licenses/7zstub/src/CPP/Common/UTFConvert.cpp @@ -0,0 +1,288 @@ +// UTFConvert.cpp + +#include "StdAfx.h" + +#include "MyTypes.h" +#include "UTFConvert.h" + +#ifdef _WIN32 +#define _WCHART_IS_16BIT 1 +#endif + +/* + _UTF8_START(n) - is a base value for start byte (head), if there are (n) additional bytes after start byte + + n : _UTF8_START(n) : Bits of code point + + 0 : 0x80 : : unused + 1 : 0xC0 : 11 : + 2 : 0xE0 : 16 : Basic Multilingual Plane + 3 : 0xF0 : 21 : Unicode space + 3 : 0xF8 : 26 : + 5 : 0xFC : 31 : UCS-4 + 6 : 0xFE : 36 : We can use it, if we want to encode any 32-bit value + 7 : 0xFF : +*/ + +#define _UTF8_START(n) (0x100 - (1 << (7 - (n)))) + +#define _UTF8_HEAD_PARSE2(n) if (c < _UTF8_START((n) + 1)) { numBytes = (n); c -= _UTF8_START(n); } + +#define _UTF8_HEAD_PARSE \ + _UTF8_HEAD_PARSE2(1) \ + else _UTF8_HEAD_PARSE2(2) \ + else _UTF8_HEAD_PARSE2(3) \ + else _UTF8_HEAD_PARSE2(4) \ + else _UTF8_HEAD_PARSE2(5) \ + + // else _UTF8_HEAD_PARSE2(6) + +bool CheckUTF8(const char *src, bool allowReduced) throw() +{ + for (;;) + { + Byte c = *src++; + if (c == 0) + return true; + + if (c < 0x80) + continue; + if (c < 0xC0) // (c < 0xC0 + 2) // if we support only optimal encoding chars + return false; + + unsigned numBytes; + _UTF8_HEAD_PARSE + else + return false; + + UInt32 val = c; + + do + { + Byte c2 = *src++; + if (c2 < 0x80 || c2 >= 0xC0) + return allowReduced && c2 == 0; + val <<= 6; + val |= (c2 - 0x80); + } + while (--numBytes); + + if (val >= 0x110000) + return false; + } +} + + +#define _ERROR_UTF8 \ + { if (dest) dest[destPos] = (wchar_t)0xFFFD; destPos++; ok = false; continue; } + +static bool Utf8_To_Utf16(wchar_t *dest, size_t *destLen, const char *src, const char *srcLim) throw() +{ + size_t destPos = 0; + bool ok = true; + + for (;;) + { + Byte c; + if (src == srcLim) + { + *destLen = destPos; + return ok; + } + c = *src++; + + if (c < 0x80) + { + if (dest) + dest[destPos] = (wchar_t)c; + destPos++; + continue; + } + if (c < 0xC0) + _ERROR_UTF8 + + unsigned numBytes; + _UTF8_HEAD_PARSE + else + _ERROR_UTF8 + + UInt32 val = c; + + do + { + Byte c2; + if (src == srcLim) + break; + c2 = *src; + if (c2 < 0x80 || c2 >= 0xC0) + break; + src++; + val <<= 6; + val |= (c2 - 0x80); + } + while (--numBytes); + + if (numBytes != 0) + _ERROR_UTF8 + + if (val < 0x10000) + { + if (dest) + dest[destPos] = (wchar_t)val; + destPos++; + } + else + { + val -= 0x10000; + if (val >= 0x100000) + _ERROR_UTF8 + if (dest) + { + dest[destPos + 0] = (wchar_t)(0xD800 + (val >> 10)); + dest[destPos + 1] = (wchar_t)(0xDC00 + (val & 0x3FF)); + } + destPos += 2; + } + } +} + +#define _UTF8_RANGE(n) (((UInt32)1) << ((n) * 5 + 6)) + +#define _UTF8_HEAD(n, val) ((char)(_UTF8_START(n) + (val >> (6 * (n))))) +#define _UTF8_CHAR(n, val) ((char)(0x80 + (((val) >> (6 * (n))) & 0x3F))) + +static size_t Utf16_To_Utf8_Calc(const wchar_t *src, const wchar_t *srcLim) +{ + size_t size = srcLim - src; + for (;;) + { + if (src == srcLim) + return size; + + UInt32 val = *src++; + + if (val < 0x80) + continue; + + if (val < _UTF8_RANGE(1)) + { + size++; + continue; + } + + if (val >= 0xD800 && val < 0xDC00 && src != srcLim) + { + UInt32 c2 = *src; + if (c2 >= 0xDC00 && c2 < 0xE000) + { + src++; + size += 2; + continue; + } + } + + #ifdef _WCHART_IS_16BIT + + size += 2; + + #else + + if (val < _UTF8_RANGE(2)) size += 2; + else if (val < _UTF8_RANGE(3)) size += 3; + else if (val < _UTF8_RANGE(4)) size += 4; + else if (val < _UTF8_RANGE(5)) size += 5; + else size += 6; + + #endif + } +} + +static char *Utf16_To_Utf8(char *dest, const wchar_t *src, const wchar_t *srcLim) +{ + for (;;) + { + if (src == srcLim) + return dest; + + UInt32 val = *src++; + + if (val < 0x80) + { + *dest++ = (char)val; + continue; + } + + if (val < _UTF8_RANGE(1)) + { + dest[0] = _UTF8_HEAD(1, val); + dest[1] = _UTF8_CHAR(0, val); + dest += 2; + continue; + } + + if (val >= 0xD800 && val < 0xDC00 && src != srcLim) + { + UInt32 c2 = *src; + if (c2 >= 0xDC00 && c2 < 0xE000) + { + src++; + val = (((val - 0xD800) << 10) | (c2 - 0xDC00)) + 0x10000; + dest[0] = _UTF8_HEAD(3, val); + dest[1] = _UTF8_CHAR(2, val); + dest[2] = _UTF8_CHAR(1, val); + dest[3] = _UTF8_CHAR(0, val); + dest += 4; + continue; + } + } + + #ifndef _WCHART_IS_16BIT + if (val < _UTF8_RANGE(2)) + #endif + { + dest[0] = _UTF8_HEAD(2, val); + dest[1] = _UTF8_CHAR(1, val); + dest[2] = _UTF8_CHAR(0, val); + dest += 3; + continue; + } + + #ifndef _WCHART_IS_16BIT + + UInt32 b; + unsigned numBits; + if (val < _UTF8_RANGE(3)) { numBits = 6 * 3; b = _UTF8_HEAD(3, val); } + else if (val < _UTF8_RANGE(4)) { numBits = 6 * 4; b = _UTF8_HEAD(4, val); } + else if (val < _UTF8_RANGE(5)) { numBits = 6 * 5; b = _UTF8_HEAD(5, val); } + else { numBits = 6 * 6; b = _UTF8_START(6); } + + *dest++ = (Byte)b; + + do + { + numBits -= 6; + *dest++ = (char)(0x80 + ((val >> numBits) & 0x3F)); + } + while (numBits != 0); + + #endif + } +} + +bool ConvertUTF8ToUnicode(const AString &src, UString &dest) +{ + dest.Empty(); + size_t destLen = 0; + Utf8_To_Utf16(NULL, &destLen, src, src.Ptr(src.Len())); + bool res = Utf8_To_Utf16(dest.GetBuf((unsigned)destLen), &destLen, src, src.Ptr(src.Len())); + dest.ReleaseBuf_SetEnd((unsigned)destLen); + return res; +} + +void ConvertUnicodeToUTF8(const UString &src, AString &dest) +{ + dest.Empty(); + size_t destLen = Utf16_To_Utf8_Calc(src, src.Ptr(src.Len())); + Utf16_To_Utf8(dest.GetBuf((unsigned)destLen), src, src.Ptr(src.Len())); + dest.ReleaseBuf_SetEnd((unsigned)destLen); +} -- cgit v1.2.3