diff options
Diffstat (limited to 'include/grub/charset.h')
-rw-r--r-- | include/grub/charset.h | 323 |
1 files changed, 323 insertions, 0 deletions
diff --git a/include/grub/charset.h b/include/grub/charset.h new file mode 100644 index 0000000..d14faea --- /dev/null +++ b/include/grub/charset.h @@ -0,0 +1,323 @@ +/* + * GRUB -- GRand Unified Bootloader + * Copyright (C) 1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009 Free Software Foundation, Inc. + * + * GRUB is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GRUB is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GRUB. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef GRUB_CHARSET_HEADER +#define GRUB_CHARSET_HEADER 1 + +#include <grub/types.h> + +#define GRUB_UINT8_1_LEADINGBIT 0x80 +#define GRUB_UINT8_2_LEADINGBITS 0xc0 +#define GRUB_UINT8_3_LEADINGBITS 0xe0 +#define GRUB_UINT8_4_LEADINGBITS 0xf0 +#define GRUB_UINT8_5_LEADINGBITS 0xf8 +#define GRUB_UINT8_6_LEADINGBITS 0xfc +#define GRUB_UINT8_7_LEADINGBITS 0xfe + +#define GRUB_UINT8_1_TRAILINGBIT 0x01 +#define GRUB_UINT8_2_TRAILINGBITS 0x03 +#define GRUB_UINT8_3_TRAILINGBITS 0x07 +#define GRUB_UINT8_4_TRAILINGBITS 0x0f +#define GRUB_UINT8_5_TRAILINGBITS 0x1f +#define GRUB_UINT8_6_TRAILINGBITS 0x3f + +#define GRUB_MAX_UTF8_PER_UTF16 4 +/* You need at least one UTF-8 byte to have one UTF-16 word. + You need at least three UTF-8 bytes to have 2 UTF-16 words (surrogate pairs). + */ +#define GRUB_MAX_UTF16_PER_UTF8 1 +#define GRUB_MAX_UTF8_PER_CODEPOINT 4 + +#define GRUB_UCS2_LIMIT 0x10000 +#define GRUB_UTF16_UPPER_SURROGATE(code) \ + (0xD800 | ((((code) - GRUB_UCS2_LIMIT) >> 10) & 0x3ff)) +#define GRUB_UTF16_LOWER_SURROGATE(code) \ + (0xDC00 | (((code) - GRUB_UCS2_LIMIT) & 0x3ff)) + +/* Process one character from UTF8 sequence. + At beginning set *code = 0, *count = 0. Returns 0 on failure and + 1 on success. *count holds the number of trailing bytes. */ +static inline int +grub_utf8_process (grub_uint8_t c, grub_uint32_t *code, int *count) +{ + if (*count) + { + if ((c & GRUB_UINT8_2_LEADINGBITS) != GRUB_UINT8_1_LEADINGBIT) + { + *count = 0; + /* invalid */ + return 0; + } + else + { + *code <<= 6; + *code |= (c & GRUB_UINT8_6_TRAILINGBITS); + (*count)--; + /* Overlong. */ + if ((*count == 1 && *code <= 0x1f) + || (*count == 2 && *code <= 0xf)) + { + *code = 0; + *count = 0; + return 0; + } + return 1; + } + } + + if ((c & GRUB_UINT8_1_LEADINGBIT) == 0) + { + *code = c; + return 1; + } + if ((c & GRUB_UINT8_3_LEADINGBITS) == GRUB_UINT8_2_LEADINGBITS) + { + *count = 1; + *code = c & GRUB_UINT8_5_TRAILINGBITS; + /* Overlong */ + if (*code <= 1) + { + *count = 0; + *code = 0; + return 0; + } + return 1; + } + if ((c & GRUB_UINT8_4_LEADINGBITS) == GRUB_UINT8_3_LEADINGBITS) + { + *count = 2; + *code = c & GRUB_UINT8_4_TRAILINGBITS; + return 1; + } + if ((c & GRUB_UINT8_5_LEADINGBITS) == GRUB_UINT8_4_LEADINGBITS) + { + *count = 3; + *code = c & GRUB_UINT8_3_TRAILINGBITS; + return 1; + } + return 0; +} + + +/* Convert a (possibly null-terminated) UTF-8 string of at most SRCSIZE + bytes (if SRCSIZE is -1, it is ignored) in length to a UTF-16 string. + Return the number of characters converted. DEST must be able to hold + at least DESTSIZE characters. If an invalid sequence is found, return -1. + If SRCEND is not NULL, then *SRCEND is set to the next byte after the + last byte used in SRC. */ +static inline grub_size_t +grub_utf8_to_utf16 (grub_uint16_t *dest, grub_size_t destsize, + const grub_uint8_t *src, grub_size_t srcsize, + const grub_uint8_t **srcend) +{ + grub_uint16_t *p = dest; + int count = 0; + grub_uint32_t code = 0; + + if (srcend) + *srcend = src; + + while (srcsize && destsize) + { + int was_count = count; + if (srcsize != (grub_size_t)-1) + srcsize--; + if (!grub_utf8_process (*src++, &code, &count)) + { + code = '?'; + count = 0; + /* Character c may be valid, don't eat it. */ + if (was_count) + src--; + } + if (count != 0) + continue; + if (code == 0) + break; + if (destsize < 2 && code >= GRUB_UCS2_LIMIT) + break; + if (code >= GRUB_UCS2_LIMIT) + { + *p++ = GRUB_UTF16_UPPER_SURROGATE (code); + *p++ = GRUB_UTF16_LOWER_SURROGATE (code); + destsize -= 2; + } + else + { + *p++ = code; + destsize--; + } + } + + if (srcend) + *srcend = src; + return p - dest; +} + +/* Determine the last position where the UTF-8 string [beg, end) can + be safely cut. */ +static inline grub_size_t +grub_getend (const char *beg, const char *end) +{ + const char *ptr; + for (ptr = end - 1; ptr >= beg; ptr--) + if ((*ptr & GRUB_UINT8_2_LEADINGBITS) != GRUB_UINT8_1_LEADINGBIT) + break; + if (ptr < beg) + return 0; + if ((*ptr & GRUB_UINT8_1_LEADINGBIT) == 0) + return ptr + 1 - beg; + if ((*ptr & GRUB_UINT8_3_LEADINGBITS) == GRUB_UINT8_2_LEADINGBITS + && ptr + 2 <= end) + return ptr + 2 - beg; + if ((*ptr & GRUB_UINT8_4_LEADINGBITS) == GRUB_UINT8_3_LEADINGBITS + && ptr + 3 <= end) + return ptr + 3 - beg; + if ((*ptr & GRUB_UINT8_5_LEADINGBITS) == GRUB_UINT8_4_LEADINGBITS + && ptr + 4 <= end) + return ptr + 4 - beg; + /* Invalid character or incomplete. Cut before it. */ + return ptr - beg; +} + +/* Convert UTF-16 to UTF-8. */ +static inline grub_uint8_t * +grub_utf16_to_utf8 (grub_uint8_t *dest, const grub_uint16_t *src, + grub_size_t size) +{ + grub_uint32_t code_high = 0; + + while (size--) + { + grub_uint32_t code = *src++; + + if (code_high) + { + if (code >= 0xDC00 && code <= 0xDFFF) + { + /* Surrogate pair. */ + code = ((code_high - 0xD800) << 10) + (code - 0xDC00) + 0x10000; + + *dest++ = (code >> 18) | 0xF0; + *dest++ = ((code >> 12) & 0x3F) | 0x80; + *dest++ = ((code >> 6) & 0x3F) | 0x80; + *dest++ = (code & 0x3F) | 0x80; + } + else + { + /* Error... */ + *dest++ = '?'; + /* *src may be valid. Don't eat it. */ + src--; + } + + code_high = 0; + } + else + { + if (code <= 0x007F) + *dest++ = code; + else if (code <= 0x07FF) + { + *dest++ = (code >> 6) | 0xC0; + *dest++ = (code & 0x3F) | 0x80; + } + else if (code >= 0xD800 && code <= 0xDBFF) + { + code_high = code; + continue; + } + else if (code >= 0xDC00 && code <= 0xDFFF) + { + /* Error... */ + *dest++ = '?'; + } + else if (code < 0x10000) + { + *dest++ = (code >> 12) | 0xE0; + *dest++ = ((code >> 6) & 0x3F) | 0x80; + *dest++ = (code & 0x3F) | 0x80; + } + else + { + *dest++ = (code >> 18) | 0xF0; + *dest++ = ((code >> 12) & 0x3F) | 0x80; + *dest++ = ((code >> 6) & 0x3F) | 0x80; + *dest++ = (code & 0x3F) | 0x80; + } + } + } + + return dest; +} + +#define GRUB_MAX_UTF8_PER_LATIN1 2 + +/* Convert Latin1 to UTF-8. */ +static inline grub_uint8_t * +grub_latin1_to_utf8 (grub_uint8_t *dest, const grub_uint8_t *src, + grub_size_t size) +{ + while (size--) + { + if (!(*src & 0x80)) + *dest++ = *src; + else + { + *dest++ = (*src >> 6) | 0xC0; + *dest++ = (*src & 0x3F) | 0x80; + } + src++; + } + + return dest; +} + +/* Convert UCS-4 to UTF-8. */ +char *grub_ucs4_to_utf8_alloc (const grub_uint32_t *src, grub_size_t size); + +int +grub_is_valid_utf8 (const grub_uint8_t *src, grub_size_t srcsize); + +grub_ssize_t grub_utf8_to_ucs4_alloc (const char *msg, + grub_uint32_t **unicode_msg, + grub_uint32_t **last_position); + +/* Returns the number of bytes the string src would occupy is converted + to UTF-8, excluding \0. */ +grub_size_t +grub_get_num_of_utf8_bytes (const grub_uint32_t *src, grub_size_t size); + +/* Converts UCS-4 to UTF-8. Returns the number of bytes effectively written + excluding the trailing \0. */ +grub_size_t +grub_ucs4_to_utf8 (const grub_uint32_t *src, grub_size_t size, + grub_uint8_t *dest, grub_size_t destsize); +grub_size_t grub_utf8_to_ucs4 (grub_uint32_t *dest, grub_size_t destsize, + const grub_uint8_t *src, grub_size_t srcsize, + const grub_uint8_t **srcend); +/* Returns -2 if not enough space, -1 on invalid character. */ +grub_ssize_t +grub_encode_utf8_character (grub_uint8_t *dest, grub_uint8_t *destend, + grub_uint32_t code); + +const grub_uint32_t * +grub_unicode_get_comb_start (const grub_uint32_t *str, + const grub_uint32_t *cur); + +#endif |