1 files changed, 323 insertions, 0 deletions
diff --git a/include/grub/charset.h b/include/grub/charset.h
new file mode 100644
index 0000000..d14faea
--- /dev/null
+++ b/include/grub/charset.h
@@ -0,0 +1,323 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009  Free Software Foundation, Inc.
+ *
+ *  GRUB is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  GRUB is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GRUB.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GRUB_CHARSET_HEADER
+#define GRUB_CHARSET_HEADER	1
+
+#include <grub/types.h>
+
+#define GRUB_UINT8_1_LEADINGBIT 0x80	/* 1000 0000 */
+#define GRUB_UINT8_2_LEADINGBITS 0xc0	/* 1100 0000 */
+#define GRUB_UINT8_3_LEADINGBITS 0xe0	/* 1110 0000 */
+#define GRUB_UINT8_4_LEADINGBITS 0xf0	/* 1111 0000 */
+#define GRUB_UINT8_5_LEADINGBITS 0xf8	/* 1111 1000 */
+#define GRUB_UINT8_6_LEADINGBITS 0xfc	/* 1111 1100 */
+#define GRUB_UINT8_7_LEADINGBITS 0xfe	/* 1111 1110 */
+
+#define GRUB_UINT8_1_TRAILINGBIT 0x01	/* 0000 0001 */
+#define GRUB_UINT8_2_TRAILINGBITS 0x03	/* 0000 0011 */
+#define GRUB_UINT8_3_TRAILINGBITS 0x07	/* 0000 0111 */
+#define GRUB_UINT8_4_TRAILINGBITS 0x0f	/* 0000 1111 */
+#define GRUB_UINT8_5_TRAILINGBITS 0x1f	/* 0001 1111 */
+#define GRUB_UINT8_6_TRAILINGBITS 0x3f	/* 0011 1111 */
+
+#define GRUB_MAX_UTF8_PER_UTF16 4
+/* Every UTF-16 word produced consumes at least one UTF-8 byte; the two
+   UTF-16 words of a surrogate pair come from a four-byte UTF-8 sequence.
+ */
+#define GRUB_MAX_UTF16_PER_UTF8 1
+#define GRUB_MAX_UTF8_PER_CODEPOINT 4
+
+#define GRUB_UCS2_LIMIT 0x10000	/* First code point above U+FFFF.  */
+#define GRUB_UTF16_UPPER_SURROGATE(code) \
+  (0xD800 | ((((code) - GRUB_UCS2_LIMIT) >> 10) & 0x3ff))
+#define GRUB_UTF16_LOWER_SURROGATE(code) \
+  (0xDC00 | (((code) - GRUB_UCS2_LIMIT) & 0x3ff))
+
+/* Process one byte C of a UTF-8 sequence.  At beginning set *code = 0,
+   *count = 0.  Returns 0 on failure and 1 on success.  *count holds the
+   number of trailing bytes still expected.  Overlong forms, surrogates
+   (U+D800..U+DFFF) and values above U+10FFFF are rejected (RFC 3629).  */
+static inline int
+grub_utf8_process (grub_uint8_t c, grub_uint32_t *code, int *count)
+{
+  if (*count)
+    {
+      if ((c & GRUB_UINT8_2_LEADINGBITS) != GRUB_UINT8_1_LEADINGBIT)
+	{
+	  *count = 0;
+	  return 0;
+	}
+      *code <<= 6;
+      *code |= (c & GRUB_UINT8_6_TRAILINGBITS);
+      (*count)--;
+      /* One byte left: <= 0x1f is an overlong three-byte form, and
+	 0x360..0x37f a surrogate.  Two bytes left: <= 0xf is an overlong
+	 four-byte form, and > 0x10f lies beyond U+10FFFF.  */
+      if ((*count == 1
+	   && (*code <= 0x1f || (*code >= 0x360 && *code <= 0x37f)))
+	  || (*count == 2 && (*code <= 0xf || *code > 0x10f)))
+	{
+	  *code = 0;
+	  *count = 0;
+	  return 0;
+	}
+      return 1;
+    }
+
+  if ((c & GRUB_UINT8_1_LEADINGBIT) == 0)
+    {
+      *code = c;
+      return 1;
+    }
+  if ((c & GRUB_UINT8_3_LEADINGBITS) == GRUB_UINT8_2_LEADINGBITS)
+    {
+      *count = 1;
+      *code = c & GRUB_UINT8_5_TRAILINGBITS;
+      /* Overlong: 0xc0 and 0xc1 can only encode values below 0x80.  */
+      if (*code <= 1)
+	{
+	  *count = 0;
+	  *code = 0;
+	  return 0;
+	}
+      return 1;
+    }
+  if ((c & GRUB_UINT8_4_LEADINGBITS) == GRUB_UINT8_3_LEADINGBITS)
+    {
+      *count = 2;
+      *code = c & GRUB_UINT8_4_TRAILINGBITS;
+      return 1;
+    }
+  if ((c & GRUB_UINT8_5_LEADINGBITS) == GRUB_UINT8_4_LEADINGBITS)
+    {
+      *count = 3;
+      *code = c & GRUB_UINT8_3_TRAILINGBITS;
+      return 1;
+    }
+  return 0;
+}
+
+
+/* Convert a (possibly null-terminated) UTF-8 string of at most SRCSIZE
+   bytes (if SRCSIZE is -1, it is ignored) in length to a UTF-16 string.
+   Return the number of UTF-16 words stored.  DEST must be able to hold
+   at least DESTSIZE words.  Conversion stops at a NUL character, which
+   is not stored; an invalid sequence is replaced by '?'.  If SRCEND is
+   not NULL, then *SRCEND is set to the next byte after the last byte
+   used in SRC (a character that did not fit in DEST counts as used).  */
+static inline grub_size_t
+grub_utf8_to_utf16 (grub_uint16_t *dest, grub_size_t destsize,
+		    const grub_uint8_t *src, grub_size_t srcsize,
+		    const grub_uint8_t **srcend)
+{
+  grub_uint16_t *p = dest;
+  int count = 0;
+  grub_uint32_t code = 0;
+
+  while (srcsize && destsize)
+    {
+      int was_count = count;
+      if (srcsize != (grub_size_t)-1)
+	srcsize--;
+      if (!grub_utf8_process (*src++, &code, &count))
+	{
+	  code = '?';
+	  count = 0;
+	  /* Character c may be valid, don't eat it nor count it.  */
+	  if (was_count)
+	    {
+	      src--;
+	      if (srcsize != (grub_size_t)-1)
+		srcsize++;
+	    }
+	}
+      if (count != 0)
+	continue;
+      if (code == 0 || (destsize < 2 && code >= GRUB_UCS2_LIMIT))
+	break;
+      if (code >= GRUB_UCS2_LIMIT)
+	{
+	  *p++ = GRUB_UTF16_UPPER_SURROGATE (code);
+	  *p++ = GRUB_UTF16_LOWER_SURROGATE (code);
+	  destsize -= 2;
+	}
+      else
+	{
+	  *p++ = code;
+	  destsize--;
+	}
+    }
+
+  if (srcend)
+    *srcend = src;
+  return p - dest;
+}
+
+/* Return, as an offset from BEG, the last position where the UTF-8 string
+   [beg, end) can be safely cut, i.e. not inside a multi-byte character.  */
+static inline grub_size_t
+grub_getend (const char *beg, const char *end)
+{
+  const char *lead = end - 1;
+  grub_size_t seqlen = 0;
+  while (lead >= beg
+	 && (*lead & GRUB_UINT8_2_LEADINGBITS) == GRUB_UINT8_1_LEADINGBIT)
+    lead--;
+  if (lead < beg)
+    return 0;
+  if ((*lead & GRUB_UINT8_1_LEADINGBIT) == 0)
+    seqlen = 1;
+  else if ((*lead & GRUB_UINT8_3_LEADINGBITS) == GRUB_UINT8_2_LEADINGBITS)
+    seqlen = 2;
+  else if ((*lead & GRUB_UINT8_4_LEADINGBITS) == GRUB_UINT8_3_LEADINGBITS)
+    seqlen = 3;
+  else if ((*lead & GRUB_UINT8_5_LEADINGBITS) == GRUB_UINT8_4_LEADINGBITS)
+    seqlen = 4;
+  /* Invalid character or incomplete.  Cut before it.  */
+  if (lead + seqlen > end)
+    seqlen = 0;
+  return lead + seqlen - beg;
+}
+
+/* Convert SIZE UTF-16 words from SRC to UTF-8 at DEST, without adding a
+   terminator.  An unpaired surrogate is converted to '?'.  Returns the
+   position just after the last byte written to DEST.  */
+static inline grub_uint8_t *
+grub_utf16_to_utf8 (grub_uint8_t *dest, const grub_uint16_t *src,
+		    grub_size_t size)
+{
+  grub_uint32_t code_high = 0;
+
+  while (size--)
+    {
+      grub_uint32_t code = *src++;
+
+      if (code_high)
+	{
+	  if (code >= 0xDC00 && code <= 0xDFFF)
+	    {
+	      /* Surrogate pair.  */
+	      code = ((code_high - 0xD800) << 10) + (code - 0xDC00) + 0x10000;
+
+	      *dest++ = (code >> 18) | 0xF0;
+	      *dest++ = ((code >> 12) & 0x3F) | 0x80;
+	      *dest++ = ((code >> 6) & 0x3F) | 0x80;
+	      *dest++ = (code & 0x3F) | 0x80;
+	    }
+	  else
+	    {
+	      /* Error...  */
+	      *dest++ = '?';
+	      /* *src may be valid.  Don't eat it; give it back to SIZE.  */
+	      src--;
+	      size++;
+	    }
+
+	  code_high = 0;
+	}
+      else
+	{
+	  if (code <= 0x007F)
+	    *dest++ = code;
+	  else if (code <= 0x07FF)
+	    {
+	      *dest++ = (code >> 6) | 0xC0;
+	      *dest++ = (code & 0x3F) | 0x80;
+	    }
+	  else if (code >= 0xD800 && code <= 0xDBFF)
+	    {
+	      code_high = code;
+	      continue;
+	    }
+	  else if (code >= 0xDC00 && code <= 0xDFFF)
+	    {
+	      /* Error... */
+	      *dest++ = '?';
+	    }
+	  else
+	    {
+	      *dest++ = (code >> 12) | 0xE0;
+	      *dest++ = ((code >> 6) & 0x3F) | 0x80;
+	      *dest++ = (code & 0x3F) | 0x80;
+	    }
+	}
+    }
+
+  /* A high surrogate in the very last word has no partner.  */
+  if (code_high)
+    *dest++ = '?';
+
+  return dest;
+}
+
+#define GRUB_MAX_UTF8_PER_LATIN1 2
+
+/* Convert SIZE Latin1 bytes from SRC to UTF-8 at DEST, one or two bytes
+   each.  Returns the position just after the last byte written.  */
+static inline grub_uint8_t *
+grub_latin1_to_utf8 (grub_uint8_t *dest, const grub_uint8_t *src,
+		     grub_size_t size)
+{
+  grub_size_t i;
+
+  for (i = 0; i < size; i++)
+    if (src[i] & 0x80)
+      {
+	*dest++ = 0xC0 | (src[i] >> 6);
+	*dest++ = 0x80 | (src[i] & 0x3F);
+      }
+    else
+      *dest++ = src[i];
+
+  return dest;
+}
+
+/* Convert UCS-4 to UTF-8.  */
+char *grub_ucs4_to_utf8_alloc (const grub_uint32_t *src, grub_size_t size);
+
+int
+grub_is_valid_utf8 (const grub_uint8_t *src, grub_size_t srcsize);
+
+grub_ssize_t grub_utf8_to_ucs4_alloc (const char *msg,
+				      grub_uint32_t **unicode_msg,
+				      grub_uint32_t **last_position);
+
+/* Returns the number of bytes the string SRC would occupy if converted
+   to UTF-8, excluding the trailing \0.  */
+grub_size_t
+grub_get_num_of_utf8_bytes (const grub_uint32_t *src, grub_size_t size);
+
+/* Converts UCS-4 to UTF-8.  Returns the number of bytes effectively
+   written, excluding the trailing \0.  */
+grub_size_t
+grub_ucs4_to_utf8 (const grub_uint32_t *src, grub_size_t size,
+		   grub_uint8_t *dest, grub_size_t destsize);
+grub_size_t grub_utf8_to_ucs4 (grub_uint32_t *dest, grub_size_t destsize,
+			       const grub_uint8_t *src, grub_size_t srcsize,
+			       const grub_uint8_t **srcend);
+/* Returns -2 if there is not enough space, -1 on an invalid character.  */
+grub_ssize_t
+grub_encode_utf8_character (grub_uint8_t *dest, grub_uint8_t *destend,
+			    grub_uint32_t code);
+
+const grub_uint32_t *
+grub_unicode_get_comb_start (const grub_uint32_t *str, 
+			     const grub_uint32_t *cur);
+
+#endif