summaryrefslogtreecommitdiffstats
path: root/src/libs/xpcom18a4/xpcom/string/public/nsUTF8Utils.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/libs/xpcom18a4/xpcom/string/public/nsUTF8Utils.h')
-rw-r--r--src/libs/xpcom18a4/xpcom/string/public/nsUTF8Utils.h462
1 files changed, 462 insertions, 0 deletions
diff --git a/src/libs/xpcom18a4/xpcom/string/public/nsUTF8Utils.h b/src/libs/xpcom18a4/xpcom/string/public/nsUTF8Utils.h
new file mode 100644
index 00000000..c91079c2
--- /dev/null
+++ b/src/libs/xpcom18a4/xpcom/string/public/nsUTF8Utils.h
@@ -0,0 +1,462 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is mozilla.org code.
+ *
+ * The Initial Developer of the Original Code is
+ * Netscape Communications Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 2001
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ * Peter Annema <jaggernaut@netscape.com> (original author)
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either of the GNU General Public License Version 2 or later (the "GPL"),
+ * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#ifndef nsUTF8Utils_h_
+#define nsUTF8Utils_h_
+
+class UTF8traits
+ {
+ public:
+ static PRBool isASCII(char c) { return (c & 0x80) == 0x00; }
+ static PRBool isInSeq(char c) { return (c & 0xC0) == 0x80; }
+ static PRBool is2byte(char c) { return (c & 0xE0) == 0xC0; }
+ static PRBool is3byte(char c) { return (c & 0xF0) == 0xE0; }
+ static PRBool is4byte(char c) { return (c & 0xF8) == 0xF0; }
+ static PRBool is5byte(char c) { return (c & 0xFC) == 0xF8; }
+ static PRBool is6byte(char c) { return (c & 0xFE) == 0xFC; }
+ };
+
+#define PLANE1_BASE 0x00010000
+#define UCS2_REPLACEMENT_CHAR 0xfffd
+
+#ifdef __GNUC__
+#define NS_ALWAYS_INLINE __attribute__((always_inline))
+#else
+#define NS_ALWAYS_INLINE
+#endif
+
+/**
+ * A character sink (see |copy_string| in nsAlgorithm.h) for converting
+ * UTF-8 to UTF-16
+ */
+class ConvertUTF8toUTF16
+ {
+ public:
+ typedef nsACString::char_type value_type;
+ typedef nsAString::char_type buffer_type;
+
+ ConvertUTF8toUTF16( buffer_type* aBuffer )
+ : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(PR_FALSE) {}
+
+ size_t Length() const { return mBuffer - mStart; }
+
+ PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
+ {
+ if ( mErrorEncountered )
+ return N;
+
+ // algorithm assumes utf8 units won't
+ // be spread across fragments
+ const value_type* p = start;
+ const value_type* end = start + N;
+ buffer_type* out = mBuffer;
+ for ( ; p != end /* && *p */; )
+ {
+ char c = *p++;
+
+ if ( UTF8traits::isASCII(c) )
+ {
+ *out++ = buffer_type(c);
+ continue;
+ }
+
+ PRUint32 ucs4;
+ PRUint32 minUcs4;
+ PRInt32 state = 0;
+
+ if ( UTF8traits::is2byte(c) )
+ {
+ ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
+ state = 1;
+ minUcs4 = 0x00000080;
+ }
+ else if ( UTF8traits::is3byte(c) )
+ {
+ ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
+ state = 2;
+ minUcs4 = 0x00000800;
+ }
+ else if ( UTF8traits::is4byte(c) )
+ {
+ ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
+ state = 3;
+ minUcs4 = 0x00010000;
+ }
+ else if ( UTF8traits::is5byte(c) )
+ {
+ ucs4 = (PRUint32(c) << 24) & 0x03000000L;
+ state = 4;
+ minUcs4 = 0x00200000;
+ }
+ else if ( UTF8traits::is6byte(c) )
+ {
+ ucs4 = (PRUint32(c) << 30) & 0x40000000L;
+ state = 5;
+ minUcs4 = 0x04000000;
+ }
+ else
+ {
+ NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
+ mErrorEncountered = PR_TRUE;
+ mBuffer = out;
+ return N;
+ }
+
+ while ( state-- )
+ {
+ c = *p++;
+
+ if ( UTF8traits::isInSeq(c) )
+ {
+ PRInt32 shift = state * 6;
+ ucs4 |= (PRUint32(c) & 0x3F) << shift;
+ }
+ else
+ {
+ NS_ERROR("not a UTF8 string");
+ mErrorEncountered = PR_TRUE;
+ mBuffer = out;
+ return N;
+ }
+ }
+
+ if ( ucs4 < minUcs4 )
+ {
+ // Overlong sequence
+ *out++ = UCS2_REPLACEMENT_CHAR;
+ }
+ else if ( ucs4 <= 0xD7FF )
+ {
+ *out++ = ucs4;
+ }
+ else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF )
+ {
+ // Surrogates
+ *out++ = UCS2_REPLACEMENT_CHAR;
+ }
+ else if ( ucs4 == 0xFFFE || ucs4 == 0xFFFF )
+ {
+ // Prohibited characters
+ *out++ = UCS2_REPLACEMENT_CHAR;
+ }
+ else if ( ucs4 >= PLANE1_BASE )
+ {
+ if ( ucs4 >= 0x00110000 )
+ *out++ = UCS2_REPLACEMENT_CHAR;
+ else {
+ // surrogate, see unicode specification 3.7 for following math.
+ ucs4 -= PLANE1_BASE;
+ *out++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
+ *out++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
+ }
+ }
+ else
+ {
+ *out++ = ucs4;
+ }
+ }
+ mBuffer = out;
+ return p - start;
+ }
+
+ void write_terminator()
+ {
+ *mBuffer = buffer_type(0);
+ }
+
+ private:
+ buffer_type* const mStart;
+ buffer_type* mBuffer;
+ PRBool mErrorEncountered;
+ };
+
+/**
+ * A character sink (see |copy_string| in nsAlgorithm.h) for computing
+ * the length of the UTF-16 string equivalent to a UTF-8 string.
+ */
+class CalculateUTF8Length
+ {
+ public:
+ typedef nsACString::char_type value_type;
+
+ CalculateUTF8Length() : mLength(0), mErrorEncountered(PR_FALSE) { }
+
+ size_t Length() const { return mLength; }
+
+ PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
+ {
+ // ignore any further requests
+ if ( mErrorEncountered )
+ return N;
+
+ // algorithm assumes utf8 units won't
+ // be spread across fragments
+ const value_type* p = start;
+ const value_type* end = start + N;
+ for ( ; p < end /* && *p */; ++mLength )
+ {
+ if ( UTF8traits::isASCII(*p) )
+ p += 1;
+ else if ( UTF8traits::is2byte(*p) )
+ p += 2;
+ else if ( UTF8traits::is3byte(*p) )
+ p += 3;
+ else if ( UTF8traits::is4byte(*p) ) {
+ p += 4;
+ // Because a UTF-8 sequence of 4 bytes represents a codepoint
+ // greater than 0xFFFF, it will become a surrogate pair in the
+ // UTF-16 string, so add 1 more to mLength.
+ // This doesn't happen with is5byte and is6byte because they
+ // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
+ // converted to a single replacement character.
+ //
+ // XXX: if the 4-byte sequence is an illegal non-shortest form,
+ // it also gets converted to a replacement character, so
+ // mLength will be off by one in this case.
+ ++mLength;
+ }
+ else if ( UTF8traits::is5byte(*p) )
+ p += 5;
+ else if ( UTF8traits::is6byte(*p) )
+ p += 6;
+ else
+ {
+ break;
+ }
+ }
+ if ( p != end )
+ {
+ NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
+ mErrorEncountered = PR_TRUE;
+ mLength = 0;
+ return N;
+ }
+ return p - start;
+ }
+
+ private:
+ size_t mLength;
+ PRBool mErrorEncountered;
+ };
+
+/**
+ * A character sink (see |copy_string| in nsAlgorithm.h) for converting
+ * UTF-16 to UTF-8.
+ */
+class ConvertUTF16toUTF8
+ {
+ public:
+ typedef nsAString::char_type value_type;
+ typedef nsACString::char_type buffer_type;
+
+ // The error handling here is more lenient than that in
+ // |ConvertUTF8toUTF16|, but it's that way for backwards
+ // compatibility.
+
+ ConvertUTF16toUTF8( buffer_type* aBuffer )
+ : mStart(aBuffer), mBuffer(aBuffer) {}
+
+ size_t Size() const { return mBuffer - mStart; }
+
+ PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
+ {
+ buffer_type *out = mBuffer; // gcc isn't smart enough to do this!
+
+ for (const value_type *p = start, *end = start + N; p < end; ++p )
+ {
+ value_type c = *p;
+ if (! (c & 0xFF80)) // U+0000 - U+007F
+ {
+ *out++ = (char)c;
+ }
+ else if (! (c & 0xF800)) // U+0100 - U+07FF
+ {
+ *out++ = 0xC0 | (char)(c >> 6);
+ *out++ = 0x80 | (char)(0x003F & c);
+ }
+ else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
+ {
+ *out++ = 0xE0 | (char)(c >> 12);
+ *out++ = 0x80 | (char)(0x003F & (c >> 6));
+ *out++ = 0x80 | (char)(0x003F & c );
+ }
+ else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
+ {
+ // D800- DBFF - High Surrogate
+ // N = (H- D800) *400 + 10000 + ...
+ PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10);
+
+ ++p;
+ if (p == end)
+ {
+ NS_ERROR("Surrogate pair split between fragments");
+ mBuffer = out;
+ return N;
+ }
+ c = *p;
+
+ if (0xDC00 == (0xFC00 & c))
+ {
+ // DC00- DFFF - Low Surrogate
+ // N += ( L - DC00 )
+ ucs4 |= (0x03FF & c);
+
+ // 0001 0000-001F FFFF
+ *out++ = 0xF0 | (char)(ucs4 >> 18);
+ *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
+ *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
+ *out++ = 0x80 | (char)(0x003F & ucs4);
+ }
+ else
+ {
+ NS_ERROR("got a High Surrogate but no low surrogate");
+ // output nothing.
+ }
+ }
+ else // U+DC00 - U+DFFF
+ {
+ // DC00- DFFF - Low Surrogate
+ NS_ERROR("got a low Surrogate but no high surrogate");
+ // output nothing.
+ }
+ }
+
+ mBuffer = out;
+ return N;
+ }
+
+ void write_terminator()
+ {
+ *mBuffer = buffer_type(0);
+ }
+
+ private:
+ buffer_type* const mStart;
+ buffer_type* mBuffer;
+ };
+
+/**
+ * A character sink (see |copy_string| in nsAlgorithm.h) for computing
+ * the number of bytes a UTF-16 would occupy in UTF-8.
+ */
+class CalculateUTF8Size
+ {
+ public:
+ typedef nsAString::char_type value_type;
+
+ CalculateUTF8Size()
+ : mSize(0) { }
+
+ size_t Size() const { return mSize; }
+
+ PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
+ {
+ // Assume UCS2 surrogate pairs won't be spread across fragments.
+ for (const value_type *p = start, *end = start + N; p < end; ++p )
+ {
+ value_type c = *p;
+ if (! (c & 0xFF80)) // U+0000 - U+007F
+ mSize += 1;
+ else if (! (c & 0xF800)) // U+0100 - U+07FF
+ mSize += 2;
+ else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
+ mSize += 3;
+ else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
+ {
+ ++p;
+ if (p == end)
+ {
+ NS_ERROR("Surrogate pair split between fragments");
+ return N;
+ }
+ c = *p;
+
+ if (0xDC00 == (0xFC00 & c))
+ mSize += 4;
+ else
+ NS_ERROR("got a high Surrogate but no low surrogate");
+ }
+ else // U+DC00 - U+DFFF
+ NS_ERROR("got a low Surrogate but no high surrogate");
+ }
+
+ return N;
+ }
+
+ private:
+ size_t mSize;
+ };
+
+/**
+ * A character sink that performs a |reinterpret_cast| style conversion
+ * between character types.
+ */
+template <class FromCharT, class ToCharT>
+class LossyConvertEncoding
+ {
+ public:
+ typedef FromCharT value_type;
+
+ typedef FromCharT input_type;
+ typedef ToCharT output_type;
+
+ typedef typename nsCharTraits<FromCharT>::unsigned_char_type unsigned_input_type;
+
+ public:
+ LossyConvertEncoding( output_type* aDestination ) : mDestination(aDestination) { }
+
+ PRUint32
+ write( const input_type* aSource, PRUint32 aSourceLength )
+ {
+ const input_type* done_writing = aSource + aSourceLength;
+ while ( aSource < done_writing )
+ *mDestination++ = (output_type)(unsigned_input_type)(*aSource++); // use old-style cast to mimic old |ns[C]String| behavior
+ return aSourceLength;
+ }
+
+ void
+ write_terminator()
+ {
+ *mDestination = output_type(0);
+ }
+
+ private:
+ output_type* mDestination;
+ };
+
+#endif /* !defined(nsUTF8Utils_h_) */