diff options
Diffstat (limited to '')
-rw-r--r-- | src/st/croco/cr-utils.c | 1330 |
1 files changed, 1330 insertions, 0 deletions
diff --git a/src/st/croco/cr-utils.c b/src/st/croco/cr-utils.c new file mode 100644 index 0000000..5fafade --- /dev/null +++ b/src/st/croco/cr-utils.c @@ -0,0 +1,1330 @@ +/* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 8 -*- */ + +/* + * This file is part of The Croco Library + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + * Author: Dodji Seketeli + * See COPYRIGHTS file for copyright information. + */ + +#include "cr-utils.h" +#include "cr-string.h" + +/** + *@file: + *Some misc utility functions used + *in the libcroco. + *Note that troughout this file I will + *refer to the CSS SPECIFICATIONS DOCUMENTATION + *written by the w3c guys. You can find that document + *at http://www.w3.org/TR/REC-CSS2/ . + */ + +/**************************** + *Encoding transformations and + *encoding helpers + ****************************/ + +/* + *Here is the correspondence between the ucs-4 charactere codes + *and there matching utf-8 encoding pattern as described by RFC 2279: + * + *UCS-4 range (hex.) UTF-8 octet sequence (binary) + *------------------ ----------------------------- + *0000 0000-0000 007F 0xxxxxxx + *0000 0080-0000 07FF 110xxxxx 10xxxxxx + *0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx + *0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + *0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + *0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx + */ + +/** + *Given an utf8 string buffer, calculates + *the length of this string if it was encoded + *in ucs4. + *@param a_in_start a pointer to the beginning of + *the input utf8 string. + *@param a_in_end a pointre to the end of the input + *utf8 string (points to the last byte of the buffer) + *@param a_len out parameter the calculated length. + *@return CR_OK upon successful completion, an error code + *otherwise. + */ +enum CRStatus +cr_utils_utf8_str_len_as_ucs4 (const guchar * a_in_start, + const guchar * a_in_end, gulong * a_len) +{ + guchar *byte_ptr = NULL; + gint len = 0; + + /* + *to store the final decoded + *unicode char + */ + guint c = 0; + + g_return_val_if_fail (a_in_start && a_in_end && a_len, + CR_BAD_PARAM_ERROR); + *a_len = 0; + + for (byte_ptr = (guchar *) a_in_start; + byte_ptr <= a_in_end; byte_ptr++) { + gint nb_bytes_2_decode = 0; + + if (*byte_ptr <= 0x7F) { + /* + *7 bits long char + *encoded over 1 byte: + * 0xxx xxxx + */ + c = *byte_ptr; + nb_bytes_2_decode = 1; + + } else if ((*byte_ptr & 0xE0) == 0xC0) { + /* + *up to 11 bits long char. + *encoded over 2 bytes: + *110x xxxx 10xx xxxx + */ + c = *byte_ptr & 0x1F; + nb_bytes_2_decode = 2; + + } else if ((*byte_ptr & 0xF0) == 0xE0) { + /* + *up to 16 bit long char + *encoded over 3 bytes: + *1110 xxxx 10xx xxxx 10xx xxxx + */ + c = *byte_ptr & 0x0F; + nb_bytes_2_decode = 3; + + } else if ((*byte_ptr & 0xF8) == 0xF0) { + /* + *up to 21 bits long char + *encoded over 4 bytes: + *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx + */ + c = *byte_ptr & 0x7; + nb_bytes_2_decode = 4; + + } else if ((*byte_ptr & 0xFC) == 0xF8) { + /* + *up to 26 bits long char + *encoded over 5 bytes. + *1111 10xx 10xx xxxx 10xx xxxx + *10xx xxxx 10xx xxxx + */ + c = *byte_ptr & 3; + nb_bytes_2_decode = 5; + + } else if ((*byte_ptr & 0xFE) == 0xFC) { + /* + *up to 31 bits long char + *encoded over 6 bytes: + *1111 110x 10xx xxxx 10xx xxxx + *10xx xxxx 10xx xxxx 10xx xxxx + */ + c = *byte_ptr & 1; + nb_bytes_2_decode = 6; + + } else { + /* + *BAD ENCODING + */ + return CR_ENCODING_ERROR; + } + + /* + *Go and decode the remaining byte(s) + *(if any) to get the current character. + */ + for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) { + /*decode the next byte */ + byte_ptr++; + + /*byte pattern must be: 10xx xxxx */ + if ((*byte_ptr & 0xC0) != 0x80) { + return CR_ENCODING_ERROR; + } + + c = (c << 6) | (*byte_ptr & 0x3F); + } + + len++; + } + + *a_len = len; + + return CR_OK; +} + +/** + *Given an ucs4 string, this function + *returns the size (in bytes) this string + *would have occupied if it was encoded in utf-8. + *@param a_in_start a pointer to the beginning of the input + *buffer. + *@param a_in_end a pointer to the end of the input buffer. + *@param a_len out parameter. The computed length. + *@return CR_OK upon successful completion, an error code otherwise. + */ +enum CRStatus +cr_utils_ucs4_str_len_as_utf8 (const guint32 * a_in_start, + const guint32 * a_in_end, gulong * a_len) +{ + gint len = 0; + guint32 *char_ptr = NULL; + + g_return_val_if_fail (a_in_start && a_in_end && a_len, + CR_BAD_PARAM_ERROR); + + for (char_ptr = (guint32 *) a_in_start; + char_ptr <= a_in_end; char_ptr++) { + if (*char_ptr <= 0x7F) { + /*the utf-8 char would take 1 byte */ + len += 1; + } else if (*char_ptr <= 0x7FF) { + /*the utf-8 char would take 2 bytes */ + len += 2; + } else if (*char_ptr <= 0xFFFF) { + len += 3; + } else if (*char_ptr <= 0x1FFFFF) { + len += 4; + } else if (*char_ptr <= 0x3FFFFFF) { + len += 5; + } else if (*char_ptr <= 0x7FFFFFFF) { + len += 6; + } + } + + *a_len = len; + return CR_OK; +} + +/** + *Given an ucsA string, this function + *returns the size (in bytes) this string + *would have occupied if it was encoded in utf-8. + *@param a_in_start a pointer to the beginning of the input + *buffer. + *@param a_in_end a pointer to the end of the input buffer. + *@param a_len out parameter. The computed length. + *@return CR_OK upon successful completion, an error code otherwise. + */ +enum CRStatus +cr_utils_ucs1_str_len_as_utf8 (const guchar * a_in_start, + const guchar * a_in_end, gulong * a_len) +{ + gint len = 0; + guchar *char_ptr = NULL; + + g_return_val_if_fail (a_in_start && a_in_end && a_len, + CR_BAD_PARAM_ERROR); + + for (char_ptr = (guchar *) a_in_start; + char_ptr <= a_in_end; char_ptr++) { + if (*char_ptr <= 0x7F) { + /*the utf-8 char would take 1 byte */ + len += 1; + } else { + /*the utf-8 char would take 2 bytes */ + len += 2; + } + } + + *a_len = len; + return CR_OK; +} + +/** + *Converts an utf8 buffer into an ucs4 buffer. + * + *@param a_in the input utf8 buffer to convert. + *@param a_in_len in/out parameter. The size of the + *input buffer to convert. After return, this parameter contains + *the actual number of bytes consumed. + *@param a_out the output converted ucs4 buffer. Must be allocated by + *the caller. + *@param a_out_len in/out parameter. The size of the output buffer. + *If this size is actually smaller than the real needed size, the function + *just converts what it can and returns a success status. After return, + *this param points to the actual number of characters decoded. + *@return CR_OK upon successful completion, an error code otherwise. + */ +enum CRStatus +cr_utils_utf8_to_ucs4 (const guchar * a_in, + gulong * a_in_len, guint32 * a_out, gulong * a_out_len) +{ + gulong in_len = 0, + out_len = 0, + in_index = 0, + out_index = 0; + enum CRStatus status = CR_OK; + + /* + *to store the final decoded + *unicode char + */ + guint c = 0; + + g_return_val_if_fail (a_in && a_in_len + && a_out && a_out_len, CR_BAD_PARAM_ERROR); + + if (*a_in_len < 1) { + status = CR_OK; + goto end; + } + + in_len = *a_in_len; + out_len = *a_out_len; + + for (in_index = 0, out_index = 0; + (in_index < in_len) && (out_index < out_len); + in_index++, out_index++) { + gint nb_bytes_2_decode = 0; + + if (a_in[in_index] <= 0x7F) { + /* + *7 bits long char + *encoded over 1 byte: + * 0xxx xxxx + */ + c = a_in[in_index]; + nb_bytes_2_decode = 1; + + } else if ((a_in[in_index] & 0xE0) == 0xC0) { + /* + *up to 11 bits long char. + *encoded over 2 bytes: + *110x xxxx 10xx xxxx + */ + c = a_in[in_index] & 0x1F; + nb_bytes_2_decode = 2; + + } else if ((a_in[in_index] & 0xF0) == 0xE0) { + /* + *up to 16 bit long char + *encoded over 3 bytes: + *1110 xxxx 10xx xxxx 10xx xxxx + */ + c = a_in[in_index] & 0x0F; + nb_bytes_2_decode = 3; + + } else if ((a_in[in_index] & 0xF8) == 0xF0) { + /* + *up to 21 bits long char + *encoded over 4 bytes: + *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx + */ + c = a_in[in_index] & 0x7; + nb_bytes_2_decode = 4; + + } else if ((a_in[in_index] & 0xFC) == 0xF8) { + /* + *up to 26 bits long char + *encoded over 5 bytes. + *1111 10xx 10xx xxxx 10xx xxxx + *10xx xxxx 10xx xxxx + */ + c = a_in[in_index] & 3; + nb_bytes_2_decode = 5; + + } else if ((a_in[in_index] & 0xFE) == 0xFC) { + /* + *up to 31 bits long char + *encoded over 6 bytes: + *1111 110x 10xx xxxx 10xx xxxx + *10xx xxxx 10xx xxxx 10xx xxxx + */ + c = a_in[in_index] & 1; + nb_bytes_2_decode = 6; + + } else { + /*BAD ENCODING */ + goto end; + } + + /* + *Go and decode the remaining byte(s) + *(if any) to get the current character. + */ + for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) { + /*decode the next byte */ + in_index++; + + /*byte pattern must be: 10xx xxxx */ + if ((a_in[in_index] & 0xC0) != 0x80) { + goto end; + } + + c = (c << 6) | (a_in[in_index] & 0x3F); + } + + /* + *The decoded ucs4 char is now + *in c. + */ + + /************************ + *Some security tests + ***********************/ + + /*be sure c is a char */ + if (c == 0xFFFF || c == 0xFFFE) + goto end; + + /*be sure c is inferior to the max ucs4 char value */ + if (c > 0x10FFFF) + goto end; + + /* + *c must be less than UTF16 "lower surrogate begin" + *or higher than UTF16 "High surrogate end" + */ + if (c >= 0xD800 && c <= 0xDFFF) + goto end; + + /*Avoid characters that equals zero */ + if (c == 0) + goto end; + + a_out[out_index] = c; + } + + end: + *a_out_len = out_index + 1; + *a_in_len = in_index + 1; + + return status; +} + +/** + *Reads a character from an utf8 buffer. + *Actually decode the next character code (unicode character code) + *and returns it. + *@param a_in the starting address of the utf8 buffer. + *@param a_in_len the length of the utf8 buffer. + *@param a_out output parameter. The resulting read char. + *@param a_consumed the number of the bytes consumed to + *decode the returned character code. + *@return CR_OK upon successful completion, an error code otherwise. + */ +enum CRStatus +cr_utils_read_char_from_utf8_buf (const guchar * a_in, + gulong a_in_len, + guint32 * a_out, gulong * a_consumed) +{ + gulong in_index = 0, + nb_bytes_2_decode = 0; + enum CRStatus status = CR_OK; + + /* + *to store the final decoded + *unicode char + */ + guint32 c = 0; + + g_return_val_if_fail (a_in && a_out && a_out + && a_consumed, CR_BAD_PARAM_ERROR); + + if (a_in_len < 1) { + status = CR_OK; + goto end; + } + + if (*a_in <= 0x7F) { + /* + *7 bits long char + *encoded over 1 byte: + * 0xxx xxxx + */ + c = *a_in; + nb_bytes_2_decode = 1; + + } else if ((*a_in & 0xE0) == 0xC0) { + /* + *up to 11 bits long char. + *encoded over 2 bytes: + *110x xxxx 10xx xxxx + */ + c = *a_in & 0x1F; + nb_bytes_2_decode = 2; + + } else if ((*a_in & 0xF0) == 0xE0) { + /* + *up to 16 bit long char + *encoded over 3 bytes: + *1110 xxxx 10xx xxxx 10xx xxxx + */ + c = *a_in & 0x0F; + nb_bytes_2_decode = 3; + + } else if ((*a_in & 0xF8) == 0xF0) { + /* + *up to 21 bits long char + *encoded over 4 bytes: + *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx + */ + c = *a_in & 0x7; + nb_bytes_2_decode = 4; + + } else if ((*a_in & 0xFC) == 0xF8) { + /* + *up to 26 bits long char + *encoded over 5 bytes. + *1111 10xx 10xx xxxx 10xx xxxx + *10xx xxxx 10xx xxxx + */ + c = *a_in & 3; + nb_bytes_2_decode = 5; + + } else if ((*a_in & 0xFE) == 0xFC) { + /* + *up to 31 bits long char + *encoded over 6 bytes: + *1111 110x 10xx xxxx 10xx xxxx + *10xx xxxx 10xx xxxx 10xx xxxx + */ + c = *a_in & 1; + nb_bytes_2_decode = 6; + + } else { + /*BAD ENCODING */ + goto end; + } + + if (nb_bytes_2_decode > a_in_len) { + status = CR_END_OF_INPUT_ERROR; + goto end; + } + + /* + *Go and decode the remaining byte(s) + *(if any) to get the current character. + */ + for (in_index = 1; in_index < nb_bytes_2_decode; in_index++) { + /*byte pattern must be: 10xx xxxx */ + if ((a_in[in_index] & 0xC0) != 0x80) { + goto end; + } + + c = (c << 6) | (a_in[in_index] & 0x3F); + } + + /* + *The decoded ucs4 char is now + *in c. + */ + + /************************ + *Some security tests + ***********************/ + + /*be sure c is a char */ + if (c == 0xFFFF || c == 0xFFFE) + goto end; + + /*be sure c is inferior to the max ucs4 char value */ + if (c > 0x10FFFF) + goto end; + + /* + *c must be less than UTF16 "lower surrogate begin" + *or higher than UTF16 "High surrogate end" + */ + if (c >= 0xD800 && c <= 0xDFFF) + goto end; + + /*Avoid characters that equals zero */ + if (c == 0) + goto end; + + *a_out = c; + + end: + *a_consumed = nb_bytes_2_decode; + + return status; +} + +/** + * + */ +enum CRStatus +cr_utils_utf8_str_len_as_ucs1 (const guchar * a_in_start, + const guchar * a_in_end, gulong * a_len) +{ + /* + *Note: this function can be made shorter + *but it considers all the cases of the utf8 encoding + *to ease further extensions ... + */ + + guchar *byte_ptr = NULL; + gint len = 0; + + /* + *to store the final decoded + *unicode char + */ + guint c = 0; + + g_return_val_if_fail (a_in_start && a_in_end && a_len, + CR_BAD_PARAM_ERROR); + *a_len = 0; + + for (byte_ptr = (guchar *) a_in_start; + byte_ptr <= a_in_end; byte_ptr++) { + gint nb_bytes_2_decode = 0; + + if (*byte_ptr <= 0x7F) { + /* + *7 bits long char + *encoded over 1 byte: + * 0xxx xxxx + */ + c = *byte_ptr; + nb_bytes_2_decode = 1; + + } else if ((*byte_ptr & 0xE0) == 0xC0) { + /* + *up to 11 bits long char. + *encoded over 2 bytes: + *110x xxxx 10xx xxxx + */ + c = *byte_ptr & 0x1F; + nb_bytes_2_decode = 2; + + } else if ((*byte_ptr & 0xF0) == 0xE0) { + /* + *up to 16 bit long char + *encoded over 3 bytes: + *1110 xxxx 10xx xxxx 10xx xxxx + */ + c = *byte_ptr & 0x0F; + nb_bytes_2_decode = 3; + + } else if ((*byte_ptr & 0xF8) == 0xF0) { + /* + *up to 21 bits long char + *encoded over 4 bytes: + *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx + */ + c = *byte_ptr & 0x7; + nb_bytes_2_decode = 4; + + } else if ((*byte_ptr & 0xFC) == 0xF8) { + /* + *up to 26 bits long char + *encoded over 5 bytes. + *1111 10xx 10xx xxxx 10xx xxxx + *10xx xxxx 10xx xxxx + */ + c = *byte_ptr & 3; + nb_bytes_2_decode = 5; + + } else if ((*byte_ptr & 0xFE) == 0xFC) { + /* + *up to 31 bits long char + *encoded over 6 bytes: + *1111 110x 10xx xxxx 10xx xxxx + *10xx xxxx 10xx xxxx 10xx xxxx + */ + c = *byte_ptr & 1; + nb_bytes_2_decode = 6; + + } else { + /* + *BAD ENCODING + */ + return CR_ENCODING_ERROR; + } + + /* + *Go and decode the remaining byte(s) + *(if any) to get the current character. + */ + for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) { + /*decode the next byte */ + byte_ptr++; + + /*byte pattern must be: 10xx xxxx */ + if ((*byte_ptr & 0xC0) != 0x80) { + return CR_ENCODING_ERROR; + } + + c = (c << 6) | (*byte_ptr & 0x3F); + } + + /* + *The decoded ucs4 char is now + *in c. + */ + + if (c <= 0xFF) { /*Add other conditions to support + *other char sets (ucs2, ucs3, ucs4). + */ + len++; + } else { + /*the char is too long to fit + *into the supposed charset len. + */ + return CR_ENCODING_ERROR; + } + } + + *a_len = len; + + return CR_OK; +} + +/** + *Converts an utf8 string into an ucs4 string. + *@param a_in the input string to convert. + *@param a_in_len in/out parameter. The length of the input + *string. After return, points to the actual number of bytes + *consumed. This can be useful to debug the input stream in case + *of encoding error. + *@param a_out out parameter. Points to the output string. It is allocated + *by this function and must be freed by the caller. + *@param a_out_len out parameter. The length of the output string. + *@return CR_OK upon successful completion, an error code otherwise. + * + */ +enum CRStatus +cr_utils_utf8_str_to_ucs4 (const guchar * a_in, + gulong * a_in_len, + guint32 ** a_out, gulong * a_out_len) +{ + enum CRStatus status = CR_OK; + + g_return_val_if_fail (a_in && a_in_len + && a_out && a_out_len, CR_BAD_PARAM_ERROR); + + status = cr_utils_utf8_str_len_as_ucs4 (a_in, + &a_in[*a_in_len - 1], + a_out_len); + + g_return_val_if_fail (status == CR_OK, status); + + *a_out = g_malloc0 (*a_out_len * sizeof (guint32)); + + status = cr_utils_utf8_to_ucs4 (a_in, a_in_len, *a_out, a_out_len); + + return status; +} + +/** + *Converts an ucs4 buffer into an utf8 buffer. + * + *@param a_in the input ucs4 buffer to convert. + *@param a_in_len in/out parameter. The size of the + *input buffer to convert. After return, this parameter contains + *the actual number of characters consumed. + *@param a_out the output converted utf8 buffer. Must be allocated by + *the caller. + *@param a_out_len in/out parameter. The size of the output buffer. + *If this size is actually smaller than the real needed size, the function + *just converts what it can and returns a success status. After return, + *this param points to the actual number of bytes in the buffer. + *@return CR_OK upon successful completion, an error code otherwise. + */ +enum CRStatus +cr_utils_ucs4_to_utf8 (const guint32 * a_in, + gulong * a_in_len, guchar * a_out, gulong * a_out_len) +{ + gulong in_len = 0, + in_index = 0, + out_index = 0; + enum CRStatus status = CR_OK; + + g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len, + CR_BAD_PARAM_ERROR); + + if (*a_in_len < 1) { + status = CR_OK; + goto end; + } + + in_len = *a_in_len; + + for (in_index = 0; in_index < in_len; in_index++) { + /* + *FIXME: return whenever we encounter forbidden char values. + */ + + if (a_in[in_index] <= 0x7F) { + a_out[out_index] = a_in[in_index]; + out_index++; + } else if (a_in[in_index] <= 0x7FF) { + a_out[out_index] = (0xC0 | (a_in[in_index] >> 6)); + a_out[out_index + 1] = + (0x80 | (a_in[in_index] & 0x3F)); + out_index += 2; + } else if (a_in[in_index] <= 0xFFFF) { + a_out[out_index] = (0xE0 | (a_in[in_index] >> 12)); + a_out[out_index + 1] = + (0x80 | ((a_in[in_index] >> 6) & 0x3F)); + a_out[out_index + 2] = + (0x80 | (a_in[in_index] & 0x3F)); + out_index += 3; + } else if (a_in[in_index] <= 0x1FFFFF) { + a_out[out_index] = (0xF0 | (a_in[in_index] >> 18)); + a_out[out_index + 1] + = (0x80 | ((a_in[in_index] >> 12) & 0x3F)); + a_out[out_index + 2] + = (0x80 | ((a_in[in_index] >> 6) & 0x3F)); + a_out[out_index + 3] + = (0x80 | (a_in[in_index] & 0x3F)); + out_index += 4; + } else if (a_in[in_index] <= 0x3FFFFFF) { + a_out[out_index] = (0xF8 | (a_in[in_index] >> 24)); + a_out[out_index + 1] = + (0x80 | (a_in[in_index] >> 18)); + a_out[out_index + 2] + = (0x80 | ((a_in[in_index] >> 12) & 0x3F)); + a_out[out_index + 3] + = (0x80 | ((a_in[in_index] >> 6) & 0x3F)); + a_out[out_index + 4] + = (0x80 | (a_in[in_index] & 0x3F)); + out_index += 5; + } else if (a_in[in_index] <= 0x7FFFFFFF) { + a_out[out_index] = (0xFC | (a_in[in_index] >> 30)); + a_out[out_index + 1] = + (0x80 | (a_in[in_index] >> 24)); + a_out[out_index + 2] + = (0x80 | ((a_in[in_index] >> 18) & 0x3F)); + a_out[out_index + 3] + = (0x80 | ((a_in[in_index] >> 12) & 0x3F)); + a_out[out_index + 4] + = (0x80 | ((a_in[in_index] >> 6) & 0x3F)); + a_out[out_index + 4] + = (0x80 | (a_in[in_index] & 0x3F)); + out_index += 6; + } else { + status = CR_ENCODING_ERROR; + goto end; + } + } /*end for */ + + end: + *a_in_len = in_index + 1; + *a_out_len = out_index + 1; + + return status; +} + +/** + *Converts an ucs4 string into an utf8 string. + *@param a_in the input string to convert. + *@param a_in_len in/out parameter. The length of the input + *string. After return, points to the actual number of characters + *consumed. This can be useful to debug the input string in case + *of encoding error. + *@param a_out out parameter. Points to the output string. It is allocated + *by this function and must be freed by the caller. + *@param a_out_len out parameter. The length (in bytes) of the output string. + *@return CR_OK upon successful completion, an error code otherwise. + */ +enum CRStatus +cr_utils_ucs4_str_to_utf8 (const guint32 * a_in, + gulong * a_in_len, + guchar ** a_out, gulong * a_out_len) +{ + enum CRStatus status = CR_OK; + + g_return_val_if_fail (a_in && a_in_len && a_out + && a_out_len, CR_BAD_PARAM_ERROR); + + status = cr_utils_ucs4_str_len_as_utf8 (a_in, + &a_in[*a_out_len - 1], + a_out_len); + + g_return_val_if_fail (status == CR_OK, status); + + status = cr_utils_ucs4_to_utf8 (a_in, a_in_len, *a_out, a_out_len); + + return status; +} + +/** + *Converts an ucs1 buffer into an utf8 buffer. + *The caller must know the size of the resulting buffer and + *allocate it prior to calling this function. + * + *@param a_in the input ucs1 buffer. + * + *@param a_in_len in/out parameter. The length of the input buffer. + *After return, points to the number of bytes actually consumed even + *in case of encoding error. + * + *@param a_out out parameter. The output utf8 converted buffer. + * + *@param a_out_len in/out parameter. The size of the output buffer. + *If the output buffer size is shorter than the actual needed size, + *this function just convert what it can. + * + *@return CR_OK upon successful completion, an error code otherwise. + * + */ +enum CRStatus +cr_utils_ucs1_to_utf8 (const guchar * a_in, + gulong * a_in_len, guchar * a_out, gulong * a_out_len) +{ + gulong out_index = 0, + in_index = 0, + in_len = 0, + out_len = 0; + enum CRStatus status = CR_OK; + + g_return_val_if_fail (a_in && a_in_len + && a_out_len, + CR_BAD_PARAM_ERROR); + + if (*a_in_len == 0) { + *a_out_len = 0 ; + return status; + } + g_return_val_if_fail (a_out, CR_BAD_PARAM_ERROR) ; + + in_len = *a_in_len; + out_len = *a_out_len; + + for (in_index = 0, out_index = 0; + (in_index < in_len) && (out_index < out_len); in_index++) { + /* + *FIXME: return whenever we encounter forbidden char values. + */ + + if (a_in[in_index] <= 0x7F) { + a_out[out_index] = a_in[in_index]; + out_index++; + } else { + a_out[out_index] = (0xC0 | (a_in[in_index] >> 6)); + a_out[out_index + 1] = + (0x80 | (a_in[in_index] & 0x3F)); + out_index += 2; + } + } /*end for */ + + *a_in_len = in_index; + *a_out_len = out_index; + + return status; +} + +/** + *Converts an ucs1 string into an utf8 string. + *@param a_in_start the beginning of the input string to convert. + *@param a_in_end the end of the input string to convert. + *@param a_out out parameter. The converted string. + *@param a_out out parameter. The length of the converted string. + *@return CR_OK upon successful completion, an error code otherwise. + * + */ +enum CRStatus +cr_utils_ucs1_str_to_utf8 (const guchar * a_in, + gulong * a_in_len, + guchar ** a_out, gulong * a_out_len) +{ + gulong out_len = 0; + enum CRStatus status = CR_OK; + + g_return_val_if_fail (a_in && a_in_len && a_out + && a_out_len, CR_BAD_PARAM_ERROR); + + if (*a_in_len < 1) { + *a_out_len = 0; + *a_out = NULL; + return CR_OK; + } + + status = cr_utils_ucs1_str_len_as_utf8 (a_in, &a_in[*a_in_len - 1], + &out_len); + + g_return_val_if_fail (status == CR_OK, status); + + *a_out = g_malloc0 (out_len); + + status = cr_utils_ucs1_to_utf8 (a_in, a_in_len, *a_out, &out_len); + + *a_out_len = out_len; + + return status; +} + +/** + *Converts an utf8 buffer into an ucs1 buffer. + *The caller must know the size of the resulting + *converted buffer, and allocated it prior to calling this + *function. + * + *@param a_in the input utf8 buffer to convert. + * + *@param a_in_len in/out parameter. The size of the input utf8 buffer. + *After return, points to the number of bytes consumed + *by the function even in case of encoding error. + * + *@param a_out out parameter. Points to the resulting buffer. + *Must be allocated by the caller. If the size of a_out is shorter + *than its required size, this function converts what it can and return + *a successful status. + * + *@param a_out_len in/out parameter. The size of the output buffer. + *After return, points to the number of bytes consumed even in case of + *encoding error. + * + *@return CR_OK upon successful completion, an error code otherwise. + */ +enum CRStatus +cr_utils_utf8_to_ucs1 (const guchar * a_in, + gulong * a_in_len, guchar * a_out, gulong * a_out_len) +{ + gulong in_index = 0, + out_index = 0, + in_len = 0, + out_len = 0; + enum CRStatus status = CR_OK; + + /* + *to store the final decoded + *unicode char + */ + guint32 c = 0; + + g_return_val_if_fail (a_in && a_in_len + && a_out && a_out_len, CR_BAD_PARAM_ERROR); + + if (*a_in_len < 1) { + goto end; + } + + in_len = *a_in_len; + out_len = *a_out_len; + + for (in_index = 0, out_index = 0; + (in_index < in_len) && (out_index < out_len); + in_index++, out_index++) { + gint nb_bytes_2_decode = 0; + + if (a_in[in_index] <= 0x7F) { + /* + *7 bits long char + *encoded over 1 byte: + * 0xxx xxxx + */ + c = a_in[in_index]; + nb_bytes_2_decode = 1; + + } else if ((a_in[in_index] & 0xE0) == 0xC0) { + /* + *up to 11 bits long char. + *encoded over 2 bytes: + *110x xxxx 10xx xxxx + */ + c = a_in[in_index] & 0x1F; + nb_bytes_2_decode = 2; + + } else if ((a_in[in_index] & 0xF0) == 0xE0) { + /* + *up to 16 bit long char + *encoded over 3 bytes: + *1110 xxxx 10xx xxxx 10xx xxxx + */ + c = a_in[in_index] & 0x0F; + nb_bytes_2_decode = 3; + + } else if ((a_in[in_index] & 0xF8) == 0xF0) { + /* + *up to 21 bits long char + *encoded over 4 bytes: + *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx + */ + c = a_in[in_index] & 0x7; + nb_bytes_2_decode = 4; + + } else if ((a_in[in_index] & 0xFC) == 0xF8) { + /* + *up to 26 bits long char + *encoded over 5 bytes. + *1111 10xx 10xx xxxx 10xx xxxx + *10xx xxxx 10xx xxxx + */ + c = a_in[in_index] & 3; + nb_bytes_2_decode = 5; + + } else if ((a_in[in_index] & 0xFE) == 0xFC) { + /* + *up to 31 bits long char + *encoded over 6 bytes: + *1111 110x 10xx xxxx 10xx xxxx + *10xx xxxx 10xx xxxx 10xx xxxx + */ + c = a_in[in_index] & 1; + nb_bytes_2_decode = 6; + + } else { + /*BAD ENCODING */ + status = CR_ENCODING_ERROR; + goto end; + } + + /* + *Go and decode the remaining byte(s) + *(if any) to get the current character. + */ + if (in_index + nb_bytes_2_decode - 1 >= in_len) { + goto end; + } + + for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) { + /*decode the next byte */ + in_index++; + + /*byte pattern must be: 10xx xxxx */ + if ((a_in[in_index] & 0xC0) != 0x80) { + status = CR_ENCODING_ERROR; + goto end; + } + + c = (c << 6) | (a_in[in_index] & 0x3F); + } + + /* + *The decoded ucs4 char is now + *in c. + */ + + if (c > 0xFF) { + status = CR_ENCODING_ERROR; + goto end; + } + + a_out[out_index] = c; + } + + end: + *a_out_len = out_index; + *a_in_len = in_index; + + return status; +} + +/** + *Converts an utf8 buffer into an + *ucs1 buffer. + *@param a_in_start the start of the input buffer. + *@param a_in_end the end of the input buffer. + *@param a_out out parameter. The resulting converted ucs4 buffer. + *Must be freed by the caller. + *@param a_out_len out parameter. The length of the converted buffer. + *@return CR_OK upon successful completion, an error code otherwise. + *Note that out parameters are valid if and only if this function + *returns CR_OK. + */ +enum CRStatus +cr_utils_utf8_str_to_ucs1 (const guchar * a_in, + gulong * a_in_len, + guchar ** a_out, gulong * a_out_len) +{ + enum CRStatus status = CR_OK; + + g_return_val_if_fail (a_in && a_in_len + && a_out && a_out_len, CR_BAD_PARAM_ERROR); + + if (*a_in_len < 1) { + *a_out_len = 0; + *a_out = NULL; + return CR_OK; + } + + status = cr_utils_utf8_str_len_as_ucs4 (a_in, &a_in[*a_in_len - 1], + a_out_len); + + g_return_val_if_fail (status == CR_OK, status); + + *a_out = g_malloc0 (*a_out_len * sizeof (guint32)); + + status = cr_utils_utf8_to_ucs1 (a_in, a_in_len, *a_out, a_out_len); + return status; +} + +/***************************************** + *CSS basic types identification utilities + *****************************************/ + +/** + *Returns TRUE if a_char is a white space as + *defined in the css spec in chap 4.1.1. + * + *white-space ::= ' '| \t|\r|\n|\f + * + *@param a_char the character to test. + *return TRUE if is a white space, false otherwise. + */ +gboolean +cr_utils_is_white_space (guint32 a_char) +{ + switch (a_char) { + case ' ': + case '\t': + case '\r': + case '\n': + case '\f': + return TRUE; + break; + default: + return FALSE; + } +} + +/** + *Returns true if the character is a newline + *as defined in the css spec in the chap 4.1.1. + * + *nl ::= \n|\r\n|\r|\f + * + *@param a_char the character to test. + *@return TRUE if the character is a newline, FALSE otherwise. + */ +gboolean +cr_utils_is_newline (guint32 a_char) +{ + switch (a_char) { + case '\n': + case '\r': + case '\f': + return TRUE; + break; + default: + return FALSE; + } +} + +/** + *returns TRUE if the char is part of an hexa num char: + *i.e hexa_char ::= [0-9A-F] + */ +gboolean +cr_utils_is_hexa_char (guint32 a_char) +{ + if ((a_char >= '0' && a_char <= '9') + || (a_char >= 'A' && a_char <= 'F')) { + return TRUE; + } + return FALSE; +} + +/** + *Returns true if the character is a nonascii + *character (as defined in the css spec chap 4.1.1): + * + *nonascii ::= [^\0-\177] + * + *@param a_char the character to test. + *@return TRUE if the character is a nonascii char, + *FALSE otherwise. + */ +gboolean +cr_utils_is_nonascii (guint32 a_char) +{ + if (a_char <= 177) { + return FALSE; + } + + return TRUE; +} + +/** + *Dumps a character a_nb times on a file. + *@param a_char the char to dump + *@param a_fp the destination file pointer + *@param a_nb the number of times a_char is to be dumped. + */ +void +cr_utils_dump_n_chars (guchar a_char, FILE * a_fp, glong a_nb) +{ + glong i = 0; + + for (i = 0; i < a_nb; i++) { + fprintf (a_fp, "%c", a_char); + } +} + +void +cr_utils_dump_n_chars2 (guchar a_char, GString * a_string, glong a_nb) +{ + glong i = 0; + + g_return_if_fail (a_string); + + for (i = 0; i < a_nb; i++) { + g_string_append_printf (a_string, "%c", a_char); + } +} + +/** + *Duplicates a list of GString instances. + *@return the duplicated list of GString instances or NULL if + *something bad happened. + *@param a_list_of_strings the list of strings to be duplicated. + */ +GList * +cr_utils_dup_glist_of_string (GList const * a_list_of_strings) +{ + GList const *cur = NULL; + GList *result = NULL; + + g_return_val_if_fail (a_list_of_strings, NULL); + + for (cur = a_list_of_strings; cur; cur = cur->next) { + GString *str = NULL; + + str = g_string_new_len (((GString *) cur->data)->str, + ((GString *) cur->data)->len); + if (str) + result = g_list_append (result, str); + } + + return result; +} + +/** + *Duplicate a GList where the GList::data is a CRString. + *@param a_list_of_strings the list to duplicate + *@return the duplicated list, or NULL if something bad + *happened. + */ +GList * +cr_utils_dup_glist_of_cr_string (GList const * a_list_of_strings) +{ + GList const *cur = NULL; + GList *result = NULL; + + g_return_val_if_fail (a_list_of_strings, NULL); + + for (cur = a_list_of_strings; cur; cur = cur->next) { + CRString *str = NULL; + + str = cr_string_dup ((CRString const *) cur->data) ; + if (str) + result = g_list_append (result, str); + } + + return result; +} |