diff options
Diffstat (limited to 'src/libs/xpcom18a4/xpcom/string/public/nsUTF8Utils.h')
-rw-r--r-- | src/libs/xpcom18a4/xpcom/string/public/nsUTF8Utils.h | 462 |
1 files changed, 462 insertions, 0 deletions
diff --git a/src/libs/xpcom18a4/xpcom/string/public/nsUTF8Utils.h b/src/libs/xpcom18a4/xpcom/string/public/nsUTF8Utils.h new file mode 100644 index 00000000..c91079c2 --- /dev/null +++ b/src/libs/xpcom18a4/xpcom/string/public/nsUTF8Utils.h @@ -0,0 +1,462 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 2001 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Peter Annema <jaggernaut@netscape.com> (original author) + * + * Alternatively, the contents of this file may be used under the terms of + * either of the GNU General Public License Version 2 or later (the "GPL"), + * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#ifndef nsUTF8Utils_h_ +#define nsUTF8Utils_h_ + +class UTF8traits + { + public: + static PRBool isASCII(char c) { return (c & 0x80) == 0x00; } + static PRBool isInSeq(char c) { return (c & 0xC0) == 0x80; } + static PRBool is2byte(char c) { return (c & 0xE0) == 0xC0; } + static PRBool is3byte(char c) { return (c & 0xF0) == 0xE0; } + static PRBool is4byte(char c) { return (c & 0xF8) == 0xF0; } + static PRBool is5byte(char c) { return (c & 0xFC) == 0xF8; } + static PRBool is6byte(char c) { return (c & 0xFE) == 0xFC; } + }; + +#define PLANE1_BASE 0x00010000 +#define UCS2_REPLACEMENT_CHAR 0xfffd + +#ifdef __GNUC__ +#define NS_ALWAYS_INLINE __attribute__((always_inline)) +#else +#define NS_ALWAYS_INLINE +#endif + +/** + * A character sink (see |copy_string| in nsAlgorithm.h) for converting + * UTF-8 to UTF-16 + */ +class ConvertUTF8toUTF16 + { + public: + typedef nsACString::char_type value_type; + typedef nsAString::char_type buffer_type; + + ConvertUTF8toUTF16( buffer_type* aBuffer ) + : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(PR_FALSE) {} + + size_t Length() const { return mBuffer - mStart; } + + PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N ) + { + if ( mErrorEncountered ) + return N; + + // algorithm assumes utf8 units won't + // be spread across fragments + const value_type* p = start; + const value_type* end = start + N; + buffer_type* out = mBuffer; + for ( ; p != end /* && *p */; ) + { + char c = *p++; + + if ( UTF8traits::isASCII(c) ) + { + *out++ = buffer_type(c); + continue; + } + + PRUint32 ucs4; + PRUint32 minUcs4; + PRInt32 state = 0; + + if ( UTF8traits::is2byte(c) ) + { + ucs4 = (PRUint32(c) << 6) & 0x000007C0L; + state = 1; + minUcs4 = 0x00000080; + } + else if ( UTF8traits::is3byte(c) ) + { + ucs4 = (PRUint32(c) << 12) & 0x0000F000L; + state = 2; + minUcs4 = 0x00000800; + } + else if ( UTF8traits::is4byte(c) ) + { + ucs4 = (PRUint32(c) << 18) & 0x001F0000L; + state = 3; + minUcs4 = 0x00010000; + } + else if ( UTF8traits::is5byte(c) ) + { + ucs4 = (PRUint32(c) << 24) & 0x03000000L; + state = 4; + minUcs4 = 0x00200000; + } + else if ( UTF8traits::is6byte(c) ) + { + ucs4 = (PRUint32(c) << 30) & 0x40000000L; + state = 5; + minUcs4 = 0x04000000; + } + else + { + NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings."); + mErrorEncountered = PR_TRUE; + mBuffer = out; + return N; + } + + while ( state-- ) + { + c = *p++; + + if ( UTF8traits::isInSeq(c) ) + { + PRInt32 shift = state * 6; + ucs4 |= (PRUint32(c) & 0x3F) << shift; + } + else + { + NS_ERROR("not a UTF8 string"); + mErrorEncountered = PR_TRUE; + mBuffer = out; + return N; + } + } + + if ( ucs4 < minUcs4 ) + { + // Overlong sequence + *out++ = UCS2_REPLACEMENT_CHAR; + } + else if ( ucs4 <= 0xD7FF ) + { + *out++ = ucs4; + } + else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF ) + { + // Surrogates + *out++ = UCS2_REPLACEMENT_CHAR; + } + else if ( ucs4 == 0xFFFE || ucs4 == 0xFFFF ) + { + // Prohibited characters + *out++ = UCS2_REPLACEMENT_CHAR; + } + else if ( ucs4 >= PLANE1_BASE ) + { + if ( ucs4 >= 0x00110000 ) + *out++ = UCS2_REPLACEMENT_CHAR; + else { + // surrogate, see unicode specification 3.7 for following math. + ucs4 -= PLANE1_BASE; + *out++ = (PRUnichar)(ucs4 >> 10) | 0xd800u; + *out++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u; + } + } + else + { + *out++ = ucs4; + } + } + mBuffer = out; + return p - start; + } + + void write_terminator() + { + *mBuffer = buffer_type(0); + } + + private: + buffer_type* const mStart; + buffer_type* mBuffer; + PRBool mErrorEncountered; + }; + +/** + * A character sink (see |copy_string| in nsAlgorithm.h) for computing + * the length of the UTF-16 string equivalent to a UTF-8 string. + */ +class CalculateUTF8Length + { + public: + typedef nsACString::char_type value_type; + + CalculateUTF8Length() : mLength(0), mErrorEncountered(PR_FALSE) { } + + size_t Length() const { return mLength; } + + PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N ) + { + // ignore any further requests + if ( mErrorEncountered ) + return N; + + // algorithm assumes utf8 units won't + // be spread across fragments + const value_type* p = start; + const value_type* end = start + N; + for ( ; p < end /* && *p */; ++mLength ) + { + if ( UTF8traits::isASCII(*p) ) + p += 1; + else if ( UTF8traits::is2byte(*p) ) + p += 2; + else if ( UTF8traits::is3byte(*p) ) + p += 3; + else if ( UTF8traits::is4byte(*p) ) { + p += 4; + // Because a UTF-8 sequence of 4 bytes represents a codepoint + // greater than 0xFFFF, it will become a surrogate pair in the + // UTF-16 string, so add 1 more to mLength. + // This doesn't happen with is5byte and is6byte because they + // are illegal UTF-8 sequences (greater than 0x10FFFF) so get + // converted to a single replacement character. + // + // XXX: if the 4-byte sequence is an illegal non-shortest form, + // it also gets converted to a replacement character, so + // mLength will be off by one in this case. + ++mLength; + } + else if ( UTF8traits::is5byte(*p) ) + p += 5; + else if ( UTF8traits::is6byte(*p) ) + p += 6; + else + { + break; + } + } + if ( p != end ) + { + NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings."); + mErrorEncountered = PR_TRUE; + mLength = 0; + return N; + } + return p - start; + } + + private: + size_t mLength; + PRBool mErrorEncountered; + }; + +/** + * A character sink (see |copy_string| in nsAlgorithm.h) for converting + * UTF-16 to UTF-8. + */ +class ConvertUTF16toUTF8 + { + public: + typedef nsAString::char_type value_type; + typedef nsACString::char_type buffer_type; + + // The error handling here is more lenient than that in + // |ConvertUTF8toUTF16|, but it's that way for backwards + // compatibility. + + ConvertUTF16toUTF8( buffer_type* aBuffer ) + : mStart(aBuffer), mBuffer(aBuffer) {} + + size_t Size() const { return mBuffer - mStart; } + + PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N ) + { + buffer_type *out = mBuffer; // gcc isn't smart enough to do this! + + for (const value_type *p = start, *end = start + N; p < end; ++p ) + { + value_type c = *p; + if (! (c & 0xFF80)) // U+0000 - U+007F + { + *out++ = (char)c; + } + else if (! (c & 0xF800)) // U+0100 - U+07FF + { + *out++ = 0xC0 | (char)(c >> 6); + *out++ = 0x80 | (char)(0x003F & c); + } + else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF + { + *out++ = 0xE0 | (char)(c >> 12); + *out++ = 0x80 | (char)(0x003F & (c >> 6)); + *out++ = 0x80 | (char)(0x003F & c ); + } + else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF + { + // D800- DBFF - High Surrogate + // N = (H- D800) *400 + 10000 + ... + PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10); + + ++p; + if (p == end) + { + NS_ERROR("Surrogate pair split between fragments"); + mBuffer = out; + return N; + } + c = *p; + + if (0xDC00 == (0xFC00 & c)) + { + // DC00- DFFF - Low Surrogate + // N += ( L - DC00 ) + ucs4 |= (0x03FF & c); + + // 0001 0000-001F FFFF + *out++ = 0xF0 | (char)(ucs4 >> 18); + *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12)); + *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6)); + *out++ = 0x80 | (char)(0x003F & ucs4); + } + else + { + NS_ERROR("got a High Surrogate but no low surrogate"); + // output nothing. + } + } + else // U+DC00 - U+DFFF + { + // DC00- DFFF - Low Surrogate + NS_ERROR("got a low Surrogate but no high surrogate"); + // output nothing. + } + } + + mBuffer = out; + return N; + } + + void write_terminator() + { + *mBuffer = buffer_type(0); + } + + private: + buffer_type* const mStart; + buffer_type* mBuffer; + }; + +/** + * A character sink (see |copy_string| in nsAlgorithm.h) for computing + * the number of bytes a UTF-16 would occupy in UTF-8. + */ +class CalculateUTF8Size + { + public: + typedef nsAString::char_type value_type; + + CalculateUTF8Size() + : mSize(0) { } + + size_t Size() const { return mSize; } + + PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N ) + { + // Assume UCS2 surrogate pairs won't be spread across fragments. + for (const value_type *p = start, *end = start + N; p < end; ++p ) + { + value_type c = *p; + if (! (c & 0xFF80)) // U+0000 - U+007F + mSize += 1; + else if (! (c & 0xF800)) // U+0100 - U+07FF + mSize += 2; + else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF + mSize += 3; + else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF + { + ++p; + if (p == end) + { + NS_ERROR("Surrogate pair split between fragments"); + return N; + } + c = *p; + + if (0xDC00 == (0xFC00 & c)) + mSize += 4; + else + NS_ERROR("got a high Surrogate but no low surrogate"); + } + else // U+DC00 - U+DFFF + NS_ERROR("got a low Surrogate but no high surrogate"); + } + + return N; + } + + private: + size_t mSize; + }; + +/** + * A character sink that performs a |reinterpret_cast| style conversion + * between character types. + */ +template <class FromCharT, class ToCharT> +class LossyConvertEncoding + { + public: + typedef FromCharT value_type; + + typedef FromCharT input_type; + typedef ToCharT output_type; + + typedef typename nsCharTraits<FromCharT>::unsigned_char_type unsigned_input_type; + + public: + LossyConvertEncoding( output_type* aDestination ) : mDestination(aDestination) { } + + PRUint32 + write( const input_type* aSource, PRUint32 aSourceLength ) + { + const input_type* done_writing = aSource + aSourceLength; + while ( aSource < done_writing ) + *mDestination++ = (output_type)(unsigned_input_type)(*aSource++); // use old-style cast to mimic old |ns[C]String| behavior + return aSourceLength; + } + + void + write_terminator() + { + *mDestination = output_type(0); + } + + private: + output_type* mDestination; + }; + +#endif /* !defined(nsUTF8Utils_h_) */ |