diff options
Diffstat (limited to 'comm/third_party/botan/src/lib/utils/charset.cpp')
-rw-r--r-- | comm/third_party/botan/src/lib/utils/charset.cpp | 283 |
1 files changed, 283 insertions, 0 deletions
diff --git a/comm/third_party/botan/src/lib/utils/charset.cpp b/comm/third_party/botan/src/lib/utils/charset.cpp new file mode 100644 index 0000000000..ca32c652db --- /dev/null +++ b/comm/third_party/botan/src/lib/utils/charset.cpp @@ -0,0 +1,283 @@ +/* +* Character Set Handling +* (C) 1999-2007 Jack Lloyd +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#include <botan/charset.h> +#include <botan/exceptn.h> +#include <botan/loadstor.h> +#include <cctype> + +namespace Botan { + +namespace { + +void append_utf8_for(std::string& s, uint32_t c) + { + if(c >= 0xD800 && c < 0xE000) + throw Decoding_Error("Invalid Unicode character"); + + if(c <= 0x7F) + { + const uint8_t b0 = static_cast<uint8_t>(c); + s.push_back(static_cast<char>(b0)); + } + else if(c <= 0x7FF) + { + const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6); + const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F); + s.push_back(static_cast<char>(b0)); + s.push_back(static_cast<char>(b1)); + } + else if(c <= 0xFFFF) + { + const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12); + const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F); + const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F); + s.push_back(static_cast<char>(b0)); + s.push_back(static_cast<char>(b1)); + s.push_back(static_cast<char>(b2)); + } + else if(c <= 0x10FFFF) + { + const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18); + const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F); + const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F); + const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F); + s.push_back(static_cast<char>(b0)); + s.push_back(static_cast<char>(b1)); + s.push_back(static_cast<char>(b2)); + s.push_back(static_cast<char>(b3)); + } + else + throw Decoding_Error("Invalid Unicode character"); + + } + +} + +std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len) + { + if(len % 2 != 0) + throw Decoding_Error("Invalid length for UCS-2 string"); + + const size_t chars = len / 2; + + std::string s; + for(size_t i = 0; i != chars; ++i) + { + const uint16_t c = load_be<uint16_t>(ucs2, i); + append_utf8_for(s, c); + } + + return s; + } + +std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len) + { + if(len % 4 != 0) + throw Decoding_Error("Invalid length for UCS-4 string"); + + const size_t chars = len / 4; + + std::string s; + for(size_t i = 0; i != chars; ++i) + { + const uint32_t c = load_be<uint32_t>(ucs4, i); + append_utf8_for(s, c); + } + + return s; + } + +/* +* Convert from UTF-8 to ISO 8859-1 +*/ +std::string utf8_to_latin1(const std::string& utf8) + { + std::string iso8859; + + size_t position = 0; + while(position != utf8.size()) + { + const uint8_t c1 = static_cast<uint8_t>(utf8[position++]); + + if(c1 <= 0x7F) + { + iso8859 += static_cast<char>(c1); + } + else if(c1 >= 0xC0 && c1 <= 0xC7) + { + if(position == utf8.size()) + throw Decoding_Error("UTF-8: sequence truncated"); + + const uint8_t c2 = static_cast<uint8_t>(utf8[position++]); + const uint8_t iso_char = ((c1 & 0x07) << 6) | (c2 & 0x3F); + + if(iso_char <= 0x7F) + throw Decoding_Error("UTF-8: sequence longer than needed"); + + iso8859 += static_cast<char>(iso_char); + } + else + throw Decoding_Error("UTF-8: Unicode chars not in Latin1 used"); + } + + return iso8859; + } + +namespace Charset { + +namespace { + +/* +* Convert from UCS-2 to ISO 8859-1 +*/ +std::string ucs2_to_latin1(const std::string& ucs2) + { + if(ucs2.size() % 2 == 1) + throw Decoding_Error("UCS-2 string has an odd number of bytes"); + + std::string latin1; + + for(size_t i = 0; i != ucs2.size(); i += 2) + { + const uint8_t c1 = ucs2[i]; + const uint8_t c2 = ucs2[i+1]; + + if(c1 != 0) + throw Decoding_Error("UCS-2 has non-Latin1 characters"); + + latin1 += static_cast<char>(c2); + } + + return latin1; + } + +/* +* Convert from ISO 8859-1 to UTF-8 +*/ +std::string latin1_to_utf8(const std::string& iso8859) + { + std::string utf8; + for(size_t i = 0; i != iso8859.size(); ++i) + { + const uint8_t c = static_cast<uint8_t>(iso8859[i]); + + if(c <= 0x7F) + utf8 += static_cast<char>(c); + else + { + utf8 += static_cast<char>((0xC0 | (c >> 6))); + utf8 += static_cast<char>((0x80 | (c & 0x3F))); + } + } + return utf8; + } + +} + +/* +* Perform character set transcoding +*/ +std::string transcode(const std::string& str, + Character_Set to, Character_Set from) + { + if(to == LOCAL_CHARSET) + to = LATIN1_CHARSET; + if(from == LOCAL_CHARSET) + from = LATIN1_CHARSET; + + if(to == from) + return str; + + if(from == LATIN1_CHARSET && to == UTF8_CHARSET) + return latin1_to_utf8(str); + if(from == UTF8_CHARSET && to == LATIN1_CHARSET) + return utf8_to_latin1(str); + if(from == UCS2_CHARSET && to == LATIN1_CHARSET) + return ucs2_to_latin1(str); + + throw Invalid_Argument("Unknown transcoding operation from " + + std::to_string(from) + " to " + std::to_string(to)); + } + +/* +* Check if a character represents a digit +*/ +bool is_digit(char c) + { + if(c == '0' || c == '1' || c == '2' || c == '3' || c == '4' || + c == '5' || c == '6' || c == '7' || c == '8' || c == '9') + return true; + return false; + } + +/* +* Check if a character represents whitespace +*/ +bool is_space(char c) + { + if(c == ' ' || c == '\t' || c == '\n' || c == '\r') + return true; + return false; + } + +/* +* Convert a character to a digit +*/ +uint8_t char2digit(char c) + { + switch(c) + { + case '0': return 0; + case '1': return 1; + case '2': return 2; + case '3': return 3; + case '4': return 4; + case '5': return 5; + case '6': return 6; + case '7': return 7; + case '8': return 8; + case '9': return 9; + } + + throw Invalid_Argument("char2digit: Input is not a digit character"); + } + +/* +* Convert a digit to a character +*/ +char digit2char(uint8_t b) + { + switch(b) + { + case 0: return '0'; + case 1: return '1'; + case 2: return '2'; + case 3: return '3'; + case 4: return '4'; + case 5: return '5'; + case 6: return '6'; + case 7: return '7'; + case 8: return '8'; + case 9: return '9'; + } + + throw Invalid_Argument("digit2char: Input is not a digit"); + } + +/* +* Case-insensitive character comparison +*/ +bool caseless_cmp(char a, char b) + { + return (std::tolower(static_cast<unsigned char>(a)) == + std::tolower(static_cast<unsigned char>(b))); + } + +} + +} |