/* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 8 -*- */

/*
 * This file is part of The Croco Library
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2.1 of the GNU Lesser General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 *
 * Author: Dodji Seketeli
 * See COPYRIGHTS file for copyright information.
 */

#include "cr-utils.h"
#include "cr-string.h"

/**
 *@file:
 *Some misc utility functions used
 *in the libcroco.
 *Note that throughout this file I will
 *refer to the CSS SPECIFICATIONS DOCUMENTATION
 *written by the w3c guys. You can find that document
 *at http://www.w3.org/TR/REC-CSS2/ .
 */

/****************************
 *Encoding transformations and
 *encoding helpers
 ****************************/

/*
 *Here is the correspondence between the ucs-4 character codes
 *and their matching utf-8 encoding pattern as described by RFC 2279:
 *
 *UCS-4 range (hex.)    UTF-8 octet sequence (binary)
 *------------------    -----------------------------
 *0000 0000-0000 007F   0xxxxxxx
 *0000 0080-0000 07FF   110xxxxx 10xxxxxx
 *0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 *0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
 */

/**
 *Given an utf8 string buffer, calculates
 *the length of this string if it was encoded
 *in ucs4.
 *@param a_in_start a pointer to the beginning of
 *the input utf8 string.
*@param a_in_end a pointre to the end of the input *utf8 string (points to the last byte of the buffer) *@param a_len out parameter the calculated length. *@return CR_OK upon successful completion, an error code *otherwise. */ enum CRStatus cr_utils_utf8_str_len_as_ucs4 (const guchar * a_in_start, const guchar * a_in_end, gulong * a_len) { guchar *byte_ptr = NULL; gint len = 0; /* *to store the final decoded *unicode char */ guint c = 0; g_return_val_if_fail (a_in_start && a_in_end && a_len, CR_BAD_PARAM_ERROR); *a_len = 0; for (byte_ptr = (guchar *) a_in_start; byte_ptr <= a_in_end; byte_ptr++) { gint nb_bytes_2_decode = 0; if (*byte_ptr <= 0x7F) { /* *7 bits long char *encoded over 1 byte: * 0xxx xxxx */ c = *byte_ptr; nb_bytes_2_decode = 1; } else if ((*byte_ptr & 0xE0) == 0xC0) { /* *up to 11 bits long char. *encoded over 2 bytes: *110x xxxx 10xx xxxx */ c = *byte_ptr & 0x1F; nb_bytes_2_decode = 2; } else if ((*byte_ptr & 0xF0) == 0xE0) { /* *up to 16 bit long char *encoded over 3 bytes: *1110 xxxx 10xx xxxx 10xx xxxx */ c = *byte_ptr & 0x0F; nb_bytes_2_decode = 3; } else if ((*byte_ptr & 0xF8) == 0xF0) { /* *up to 21 bits long char *encoded over 4 bytes: *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx */ c = *byte_ptr & 0x7; nb_bytes_2_decode = 4; } else if ((*byte_ptr & 0xFC) == 0xF8) { /* *up to 26 bits long char *encoded over 5 bytes. *1111 10xx 10xx xxxx 10xx xxxx *10xx xxxx 10xx xxxx */ c = *byte_ptr & 3; nb_bytes_2_decode = 5; } else if ((*byte_ptr & 0xFE) == 0xFC) { /* *up to 31 bits long char *encoded over 6 bytes: *1111 110x 10xx xxxx 10xx xxxx *10xx xxxx 10xx xxxx 10xx xxxx */ c = *byte_ptr & 1; nb_bytes_2_decode = 6; } else { /* *BAD ENCODING */ return CR_ENCODING_ERROR; } /* *Go and decode the remaining byte(s) *(if any) to get the current character. 
*/ for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) { /*decode the next byte */ byte_ptr++; /*byte pattern must be: 10xx xxxx */ if ((*byte_ptr & 0xC0) != 0x80) { return CR_ENCODING_ERROR; } c = (c << 6) | (*byte_ptr & 0x3F); } len++; } *a_len = len; return CR_OK; } /** *Given an ucs4 string, this function *returns the size (in bytes) this string *would have occupied if it was encoded in utf-8. *@param a_in_start a pointer to the beginning of the input *buffer. *@param a_in_end a pointer to the end of the input buffer. *@param a_len out parameter. The computed length. *@return CR_OK upon successful completion, an error code otherwise. */ enum CRStatus cr_utils_ucs4_str_len_as_utf8 (const guint32 * a_in_start, const guint32 * a_in_end, gulong * a_len) { gint len = 0; guint32 *char_ptr = NULL; g_return_val_if_fail (a_in_start && a_in_end && a_len, CR_BAD_PARAM_ERROR); for (char_ptr = (guint32 *) a_in_start; char_ptr <= a_in_end; char_ptr++) { if (*char_ptr <= 0x7F) { /*the utf-8 char would take 1 byte */ len += 1; } else if (*char_ptr <= 0x7FF) { /*the utf-8 char would take 2 bytes */ len += 2; } else if (*char_ptr <= 0xFFFF) { len += 3; } else if (*char_ptr <= 0x1FFFFF) { len += 4; } else if (*char_ptr <= 0x3FFFFFF) { len += 5; } else if (*char_ptr <= 0x7FFFFFFF) { len += 6; } } *a_len = len; return CR_OK; } /** *Given an ucsA string, this function *returns the size (in bytes) this string *would have occupied if it was encoded in utf-8. *@param a_in_start a pointer to the beginning of the input *buffer. *@param a_in_end a pointer to the end of the input buffer. *@param a_len out parameter. The computed length. *@return CR_OK upon successful completion, an error code otherwise. 
*/ enum CRStatus cr_utils_ucs1_str_len_as_utf8 (const guchar * a_in_start, const guchar * a_in_end, gulong * a_len) { gint len = 0; guchar *char_ptr = NULL; g_return_val_if_fail (a_in_start && a_in_end && a_len, CR_BAD_PARAM_ERROR); for (char_ptr = (guchar *) a_in_start; char_ptr <= a_in_end; char_ptr++) { if (*char_ptr <= 0x7F) { /*the utf-8 char would take 1 byte */ len += 1; } else { /*the utf-8 char would take 2 bytes */ len += 2; } } *a_len = len; return CR_OK; } /** *Converts an utf8 buffer into an ucs4 buffer. * *@param a_in the input utf8 buffer to convert. *@param a_in_len in/out parameter. The size of the *input buffer to convert. After return, this parameter contains *the actual number of bytes consumed. *@param a_out the output converted ucs4 buffer. Must be allocated by *the caller. *@param a_out_len in/out parameter. The size of the output buffer. *If this size is actually smaller than the real needed size, the function *just converts what it can and returns a success status. After return, *this param points to the actual number of characters decoded. *@return CR_OK upon successful completion, an error code otherwise. */ enum CRStatus cr_utils_utf8_to_ucs4 (const guchar * a_in, gulong * a_in_len, guint32 * a_out, gulong * a_out_len) { gulong in_len = 0, out_len = 0, in_index = 0, out_index = 0; enum CRStatus status = CR_OK; /* *to store the final decoded *unicode char */ guint c = 0; g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len, CR_BAD_PARAM_ERROR); if (*a_in_len < 1) { status = CR_OK; goto end; } in_len = *a_in_len; out_len = *a_out_len; for (in_index = 0, out_index = 0; (in_index < in_len) && (out_index < out_len); in_index++, out_index++) { gint nb_bytes_2_decode = 0; if (a_in[in_index] <= 0x7F) { /* *7 bits long char *encoded over 1 byte: * 0xxx xxxx */ c = a_in[in_index]; nb_bytes_2_decode = 1; } else if ((a_in[in_index] & 0xE0) == 0xC0) { /* *up to 11 bits long char. 
*encoded over 2 bytes: *110x xxxx 10xx xxxx */ c = a_in[in_index] & 0x1F; nb_bytes_2_decode = 2; } else if ((a_in[in_index] & 0xF0) == 0xE0) { /* *up to 16 bit long char *encoded over 3 bytes: *1110 xxxx 10xx xxxx 10xx xxxx */ c = a_in[in_index] & 0x0F; nb_bytes_2_decode = 3; } else if ((a_in[in_index] & 0xF8) == 0xF0) { /* *up to 21 bits long char *encoded over 4 bytes: *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx */ c = a_in[in_index] & 0x7; nb_bytes_2_decode = 4; } else if ((a_in[in_index] & 0xFC) == 0xF8) { /* *up to 26 bits long char *encoded over 5 bytes. *1111 10xx 10xx xxxx 10xx xxxx *10xx xxxx 10xx xxxx */ c = a_in[in_index] & 3; nb_bytes_2_decode = 5; } else if ((a_in[in_index] & 0xFE) == 0xFC) { /* *up to 31 bits long char *encoded over 6 bytes: *1111 110x 10xx xxxx 10xx xxxx *10xx xxxx 10xx xxxx 10xx xxxx */ c = a_in[in_index] & 1; nb_bytes_2_decode = 6; } else { /*BAD ENCODING */ goto end; } /* *Go and decode the remaining byte(s) *(if any) to get the current character. */ for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) { /*decode the next byte */ in_index++; /*byte pattern must be: 10xx xxxx */ if ((a_in[in_index] & 0xC0) != 0x80) { goto end; } c = (c << 6) | (a_in[in_index] & 0x3F); } /* *The decoded ucs4 char is now *in c. */ /************************ *Some security tests ***********************/ /*be sure c is a char */ if (c == 0xFFFF || c == 0xFFFE) goto end; /*be sure c is inferior to the max ucs4 char value */ if (c > 0x10FFFF) goto end; /* *c must be less than UTF16 "lower surrogate begin" *or higher than UTF16 "High surrogate end" */ if (c >= 0xD800 && c <= 0xDFFF) goto end; /*Avoid characters that equals zero */ if (c == 0) goto end; a_out[out_index] = c; } end: *a_out_len = out_index + 1; *a_in_len = in_index + 1; return status; } /** *Reads a character from an utf8 buffer. *Actually decode the next character code (unicode character code) *and returns it. *@param a_in the starting address of the utf8 buffer. 
*@param a_in_len the length of the utf8 buffer. *@param a_out output parameter. The resulting read char. *@param a_consumed the number of the bytes consumed to *decode the returned character code. *@return CR_OK upon successful completion, an error code otherwise. */ enum CRStatus cr_utils_read_char_from_utf8_buf (const guchar * a_in, gulong a_in_len, guint32 * a_out, gulong * a_consumed) { gulong in_index = 0, nb_bytes_2_decode = 0; enum CRStatus status = CR_OK; /* *to store the final decoded *unicode char */ guint32 c = 0; g_return_val_if_fail (a_in && a_out && a_out && a_consumed, CR_BAD_PARAM_ERROR); if (a_in_len < 1) { status = CR_OK; goto end; } if (*a_in <= 0x7F) { /* *7 bits long char *encoded over 1 byte: * 0xxx xxxx */ c = *a_in; nb_bytes_2_decode = 1; } else if ((*a_in & 0xE0) == 0xC0) { /* *up to 11 bits long char. *encoded over 2 bytes: *110x xxxx 10xx xxxx */ c = *a_in & 0x1F; nb_bytes_2_decode = 2; } else if ((*a_in & 0xF0) == 0xE0) { /* *up to 16 bit long char *encoded over 3 bytes: *1110 xxxx 10xx xxxx 10xx xxxx */ c = *a_in & 0x0F; nb_bytes_2_decode = 3; } else if ((*a_in & 0xF8) == 0xF0) { /* *up to 21 bits long char *encoded over 4 bytes: *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx */ c = *a_in & 0x7; nb_bytes_2_decode = 4; } else if ((*a_in & 0xFC) == 0xF8) { /* *up to 26 bits long char *encoded over 5 bytes. *1111 10xx 10xx xxxx 10xx xxxx *10xx xxxx 10xx xxxx */ c = *a_in & 3; nb_bytes_2_decode = 5; } else if ((*a_in & 0xFE) == 0xFC) { /* *up to 31 bits long char *encoded over 6 bytes: *1111 110x 10xx xxxx 10xx xxxx *10xx xxxx 10xx xxxx 10xx xxxx */ c = *a_in & 1; nb_bytes_2_decode = 6; } else { /*BAD ENCODING */ goto end; } if (nb_bytes_2_decode > a_in_len) { status = CR_END_OF_INPUT_ERROR; goto end; } /* *Go and decode the remaining byte(s) *(if any) to get the current character. 
*/ for (in_index = 1; in_index < nb_bytes_2_decode; in_index++) { /*byte pattern must be: 10xx xxxx */ if ((a_in[in_index] & 0xC0) != 0x80) { goto end; } c = (c << 6) | (a_in[in_index] & 0x3F); } /* *The decoded ucs4 char is now *in c. */ /************************ *Some security tests ***********************/ /*be sure c is a char */ if (c == 0xFFFF || c == 0xFFFE) goto end; /*be sure c is inferior to the max ucs4 char value */ if (c > 0x10FFFF) goto end; /* *c must be less than UTF16 "lower surrogate begin" *or higher than UTF16 "High surrogate end" */ if (c >= 0xD800 && c <= 0xDFFF) goto end; /*Avoid characters that equals zero */ if (c == 0) goto end; *a_out = c; end: *a_consumed = nb_bytes_2_decode; return status; } /** * */ enum CRStatus cr_utils_utf8_str_len_as_ucs1 (const guchar * a_in_start, const guchar * a_in_end, gulong * a_len) { /* *Note: this function can be made shorter *but it considers all the cases of the utf8 encoding *to ease further extensions ... */ guchar *byte_ptr = NULL; gint len = 0; /* *to store the final decoded *unicode char */ guint c = 0; g_return_val_if_fail (a_in_start && a_in_end && a_len, CR_BAD_PARAM_ERROR); *a_len = 0; for (byte_ptr = (guchar *) a_in_start; byte_ptr <= a_in_end; byte_ptr++) { gint nb_bytes_2_decode = 0; if (*byte_ptr <= 0x7F) { /* *7 bits long char *encoded over 1 byte: * 0xxx xxxx */ c = *byte_ptr; nb_bytes_2_decode = 1; } else if ((*byte_ptr & 0xE0) == 0xC0) { /* *up to 11 bits long char. 
*encoded over 2 bytes: *110x xxxx 10xx xxxx */ c = *byte_ptr & 0x1F; nb_bytes_2_decode = 2; } else if ((*byte_ptr & 0xF0) == 0xE0) { /* *up to 16 bit long char *encoded over 3 bytes: *1110 xxxx 10xx xxxx 10xx xxxx */ c = *byte_ptr & 0x0F; nb_bytes_2_decode = 3; } else if ((*byte_ptr & 0xF8) == 0xF0) { /* *up to 21 bits long char *encoded over 4 bytes: *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx */ c = *byte_ptr & 0x7; nb_bytes_2_decode = 4; } else if ((*byte_ptr & 0xFC) == 0xF8) { /* *up to 26 bits long char *encoded over 5 bytes. *1111 10xx 10xx xxxx 10xx xxxx *10xx xxxx 10xx xxxx */ c = *byte_ptr & 3; nb_bytes_2_decode = 5; } else if ((*byte_ptr & 0xFE) == 0xFC) { /* *up to 31 bits long char *encoded over 6 bytes: *1111 110x 10xx xxxx 10xx xxxx *10xx xxxx 10xx xxxx 10xx xxxx */ c = *byte_ptr & 1; nb_bytes_2_decode = 6; } else { /* *BAD ENCODING */ return CR_ENCODING_ERROR; } /* *Go and decode the remaining byte(s) *(if any) to get the current character. */ for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) { /*decode the next byte */ byte_ptr++; /*byte pattern must be: 10xx xxxx */ if ((*byte_ptr & 0xC0) != 0x80) { return CR_ENCODING_ERROR; } c = (c << 6) | (*byte_ptr & 0x3F); } /* *The decoded ucs4 char is now *in c. */ if (c <= 0xFF) { /*Add other conditions to support *other char sets (ucs2, ucs3, ucs4). */ len++; } else { /*the char is too long to fit *into the supposed charset len. */ return CR_ENCODING_ERROR; } } *a_len = len; return CR_OK; } /** *Converts an utf8 string into an ucs4 string. *@param a_in the input string to convert. *@param a_in_len in/out parameter. The length of the input *string. After return, points to the actual number of bytes *consumed. This can be useful to debug the input stream in case *of encoding error. *@param a_out out parameter. Points to the output string. It is allocated *by this function and must be freed by the caller. *@param a_out_len out parameter. The length of the output string. 
*@return CR_OK upon successful completion, an error code otherwise. * */ enum CRStatus cr_utils_utf8_str_to_ucs4 (const guchar * a_in, gulong * a_in_len, guint32 ** a_out, gulong * a_out_len) { enum CRStatus status = CR_OK; g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len, CR_BAD_PARAM_ERROR); status = cr_utils_utf8_str_len_as_ucs4 (a_in, &a_in[*a_in_len - 1], a_out_len); g_return_val_if_fail (status == CR_OK, status); *a_out = g_malloc0 (*a_out_len * sizeof (guint32)); status = cr_utils_utf8_to_ucs4 (a_in, a_in_len, *a_out, a_out_len); return status; } /** *Converts an ucs4 buffer into an utf8 buffer. * *@param a_in the input ucs4 buffer to convert. *@param a_in_len in/out parameter. The size of the *input buffer to convert. After return, this parameter contains *the actual number of characters consumed. *@param a_out the output converted utf8 buffer. Must be allocated by *the caller. *@param a_out_len in/out parameter. The size of the output buffer. *If this size is actually smaller than the real needed size, the function *just converts what it can and returns a success status. After return, *this param points to the actual number of bytes in the buffer. *@return CR_OK upon successful completion, an error code otherwise. */ enum CRStatus cr_utils_ucs4_to_utf8 (const guint32 * a_in, gulong * a_in_len, guchar * a_out, gulong * a_out_len) { gulong in_len = 0, in_index = 0, out_index = 0; enum CRStatus status = CR_OK; g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len, CR_BAD_PARAM_ERROR); if (*a_in_len < 1) { status = CR_OK; goto end; } in_len = *a_in_len; for (in_index = 0; in_index < in_len; in_index++) { /* *FIXME: return whenever we encounter forbidden char values. 
*/ if (a_in[in_index] <= 0x7F) { a_out[out_index] = a_in[in_index]; out_index++; } else if (a_in[in_index] <= 0x7FF) { a_out[out_index] = (0xC0 | (a_in[in_index] >> 6)); a_out[out_index + 1] = (0x80 | (a_in[in_index] & 0x3F)); out_index += 2; } else if (a_in[in_index] <= 0xFFFF) { a_out[out_index] = (0xE0 | (a_in[in_index] >> 12)); a_out[out_index + 1] = (0x80 | ((a_in[in_index] >> 6) & 0x3F)); a_out[out_index + 2] = (0x80 | (a_in[in_index] & 0x3F)); out_index += 3; } else if (a_in[in_index] <= 0x1FFFFF) { a_out[out_index] = (0xF0 | (a_in[in_index] >> 18)); a_out[out_index + 1] = (0x80 | ((a_in[in_index] >> 12) & 0x3F)); a_out[out_index + 2] = (0x80 | ((a_in[in_index] >> 6) & 0x3F)); a_out[out_index + 3] = (0x80 | (a_in[in_index] & 0x3F)); out_index += 4; } else if (a_in[in_index] <= 0x3FFFFFF) { a_out[out_index] = (0xF8 | (a_in[in_index] >> 24)); a_out[out_index + 1] = (0x80 | (a_in[in_index] >> 18)); a_out[out_index + 2] = (0x80 | ((a_in[in_index] >> 12) & 0x3F)); a_out[out_index + 3] = (0x80 | ((a_in[in_index] >> 6) & 0x3F)); a_out[out_index + 4] = (0x80 | (a_in[in_index] & 0x3F)); out_index += 5; } else if (a_in[in_index] <= 0x7FFFFFFF) { a_out[out_index] = (0xFC | (a_in[in_index] >> 30)); a_out[out_index + 1] = (0x80 | (a_in[in_index] >> 24)); a_out[out_index + 2] = (0x80 | ((a_in[in_index] >> 18) & 0x3F)); a_out[out_index + 3] = (0x80 | ((a_in[in_index] >> 12) & 0x3F)); a_out[out_index + 4] = (0x80 | ((a_in[in_index] >> 6) & 0x3F)); a_out[out_index + 4] = (0x80 | (a_in[in_index] & 0x3F)); out_index += 6; } else { status = CR_ENCODING_ERROR; goto end; } } /*end for */ end: *a_in_len = in_index + 1; *a_out_len = out_index + 1; return status; } /** *Converts an ucs4 string into an utf8 string. *@param a_in the input string to convert. *@param a_in_len in/out parameter. The length of the input *string. After return, points to the actual number of characters *consumed. This can be useful to debug the input string in case *of encoding error. 
*@param a_out out parameter. Points to the output string. It is allocated *by this function and must be freed by the caller. *@param a_out_len out parameter. The length (in bytes) of the output string. *@return CR_OK upon successful completion, an error code otherwise. */ enum CRStatus cr_utils_ucs4_str_to_utf8 (const guint32 * a_in, gulong * a_in_len, guchar ** a_out, gulong * a_out_len) { enum CRStatus status = CR_OK; g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len, CR_BAD_PARAM_ERROR); status = cr_utils_ucs4_str_len_as_utf8 (a_in, &a_in[*a_out_len - 1], a_out_len); g_return_val_if_fail (status == CR_OK, status); status = cr_utils_ucs4_to_utf8 (a_in, a_in_len, *a_out, a_out_len); return status; } /** *Converts an ucs1 buffer into an utf8 buffer. *The caller must know the size of the resulting buffer and *allocate it prior to calling this function. * *@param a_in the input ucs1 buffer. * *@param a_in_len in/out parameter. The length of the input buffer. *After return, points to the number of bytes actually consumed even *in case of encoding error. * *@param a_out out parameter. The output utf8 converted buffer. * *@param a_out_len in/out parameter. The size of the output buffer. *If the output buffer size is shorter than the actual needed size, *this function just convert what it can. * *@return CR_OK upon successful completion, an error code otherwise. * */ enum CRStatus cr_utils_ucs1_to_utf8 (const guchar * a_in, gulong * a_in_len, guchar * a_out, gulong * a_out_len) { gulong out_index = 0, in_index = 0, in_len = 0, out_len = 0; enum CRStatus status = CR_OK; g_return_val_if_fail (a_in && a_in_len && a_out_len, CR_BAD_PARAM_ERROR); if (*a_in_len == 0) { *a_out_len = 0 ; return status; } g_return_val_if_fail (a_out, CR_BAD_PARAM_ERROR) ; in_len = *a_in_len; out_len = *a_out_len; for (in_index = 0, out_index = 0; (in_index < in_len) && (out_index < out_len); in_index++) { /* *FIXME: return whenever we encounter forbidden char values. 
*/ if (a_in[in_index] <= 0x7F) { a_out[out_index] = a_in[in_index]; out_index++; } else { a_out[out_index] = (0xC0 | (a_in[in_index] >> 6)); a_out[out_index + 1] = (0x80 | (a_in[in_index] & 0x3F)); out_index += 2; } } /*end for */ *a_in_len = in_index; *a_out_len = out_index; return status; } /** *Converts an ucs1 string into an utf8 string. *@param a_in_start the beginning of the input string to convert. *@param a_in_end the end of the input string to convert. *@param a_out out parameter. The converted string. *@param a_out out parameter. The length of the converted string. *@return CR_OK upon successful completion, an error code otherwise. * */ enum CRStatus cr_utils_ucs1_str_to_utf8 (const guchar * a_in, gulong * a_in_len, guchar ** a_out, gulong * a_out_len) { gulong out_len = 0; enum CRStatus status = CR_OK; g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len, CR_BAD_PARAM_ERROR); if (*a_in_len < 1) { *a_out_len = 0; *a_out = NULL; return CR_OK; } status = cr_utils_ucs1_str_len_as_utf8 (a_in, &a_in[*a_in_len - 1], &out_len); g_return_val_if_fail (status == CR_OK, status); *a_out = g_malloc0 (out_len); status = cr_utils_ucs1_to_utf8 (a_in, a_in_len, *a_out, &out_len); *a_out_len = out_len; return status; } /** *Converts an utf8 buffer into an ucs1 buffer. *The caller must know the size of the resulting *converted buffer, and allocated it prior to calling this *function. * *@param a_in the input utf8 buffer to convert. * *@param a_in_len in/out parameter. The size of the input utf8 buffer. *After return, points to the number of bytes consumed *by the function even in case of encoding error. * *@param a_out out parameter. Points to the resulting buffer. *Must be allocated by the caller. If the size of a_out is shorter *than its required size, this function converts what it can and return *a successful status. * *@param a_out_len in/out parameter. The size of the output buffer. 
*After return, points to the number of bytes consumed even in case of *encoding error. * *@return CR_OK upon successful completion, an error code otherwise. */ enum CRStatus cr_utils_utf8_to_ucs1 (const guchar * a_in, gulong * a_in_len, guchar * a_out, gulong * a_out_len) { gulong in_index = 0, out_index = 0, in_len = 0, out_len = 0; enum CRStatus status = CR_OK; /* *to store the final decoded *unicode char */ guint32 c = 0; g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len, CR_BAD_PARAM_ERROR); if (*a_in_len < 1) { goto end; } in_len = *a_in_len; out_len = *a_out_len; for (in_index = 0, out_index = 0; (in_index < in_len) && (out_index < out_len); in_index++, out_index++) { gint nb_bytes_2_decode = 0; if (a_in[in_index] <= 0x7F) { /* *7 bits long char *encoded over 1 byte: * 0xxx xxxx */ c = a_in[in_index]; nb_bytes_2_decode = 1; } else if ((a_in[in_index] & 0xE0) == 0xC0) { /* *up to 11 bits long char. *encoded over 2 bytes: *110x xxxx 10xx xxxx */ c = a_in[in_index] & 0x1F; nb_bytes_2_decode = 2; } else if ((a_in[in_index] & 0xF0) == 0xE0) { /* *up to 16 bit long char *encoded over 3 bytes: *1110 xxxx 10xx xxxx 10xx xxxx */ c = a_in[in_index] & 0x0F; nb_bytes_2_decode = 3; } else if ((a_in[in_index] & 0xF8) == 0xF0) { /* *up to 21 bits long char *encoded over 4 bytes: *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx */ c = a_in[in_index] & 0x7; nb_bytes_2_decode = 4; } else if ((a_in[in_index] & 0xFC) == 0xF8) { /* *up to 26 bits long char *encoded over 5 bytes. *1111 10xx 10xx xxxx 10xx xxxx *10xx xxxx 10xx xxxx */ c = a_in[in_index] & 3; nb_bytes_2_decode = 5; } else if ((a_in[in_index] & 0xFE) == 0xFC) { /* *up to 31 bits long char *encoded over 6 bytes: *1111 110x 10xx xxxx 10xx xxxx *10xx xxxx 10xx xxxx 10xx xxxx */ c = a_in[in_index] & 1; nb_bytes_2_decode = 6; } else { /*BAD ENCODING */ status = CR_ENCODING_ERROR; goto end; } /* *Go and decode the remaining byte(s) *(if any) to get the current character. 
*/ if (in_index + nb_bytes_2_decode - 1 >= in_len) { goto end; } for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) { /*decode the next byte */ in_index++; /*byte pattern must be: 10xx xxxx */ if ((a_in[in_index] & 0xC0) != 0x80) { status = CR_ENCODING_ERROR; goto end; } c = (c << 6) | (a_in[in_index] & 0x3F); } /* *The decoded ucs4 char is now *in c. */ if (c > 0xFF) { status = CR_ENCODING_ERROR; goto end; } a_out[out_index] = c; } end: *a_out_len = out_index; *a_in_len = in_index; return status; } /** *Converts an utf8 buffer into an *ucs1 buffer. *@param a_in_start the start of the input buffer. *@param a_in_end the end of the input buffer. *@param a_out out parameter. The resulting converted ucs4 buffer. *Must be freed by the caller. *@param a_out_len out parameter. The length of the converted buffer. *@return CR_OK upon successful completion, an error code otherwise. *Note that out parameters are valid if and only if this function *returns CR_OK. */ enum CRStatus cr_utils_utf8_str_to_ucs1 (const guchar * a_in, gulong * a_in_len, guchar ** a_out, gulong * a_out_len) { enum CRStatus status = CR_OK; g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len, CR_BAD_PARAM_ERROR); if (*a_in_len < 1) { *a_out_len = 0; *a_out = NULL; return CR_OK; } status = cr_utils_utf8_str_len_as_ucs4 (a_in, &a_in[*a_in_len - 1], a_out_len); g_return_val_if_fail (status == CR_OK, status); *a_out = g_malloc0 (*a_out_len * sizeof (guint32)); status = cr_utils_utf8_to_ucs1 (a_in, a_in_len, *a_out, a_out_len); return status; } /***************************************** *CSS basic types identification utilities *****************************************/ /** *Returns TRUE if a_char is a white space as *defined in the css spec in chap 4.1.1. * *white-space ::= ' '| \t|\r|\n|\f * *@param a_char the character to test. *return TRUE if is a white space, false otherwise. 
*/ gboolean cr_utils_is_white_space (guint32 a_char) { switch (a_char) { case ' ': case '\t': case '\r': case '\n': case '\f': return TRUE; break; default: return FALSE; } } /** *Returns true if the character is a newline *as defined in the css spec in the chap 4.1.1. * *nl ::= \n|\r\n|\r|\f * *@param a_char the character to test. *@return TRUE if the character is a newline, FALSE otherwise. */ gboolean cr_utils_is_newline (guint32 a_char) { switch (a_char) { case '\n': case '\r': case '\f': return TRUE; break; default: return FALSE; } } /** *returns TRUE if the char is part of an hexa num char: *i.e hexa_char ::= [0-9A-F] */ gboolean cr_utils_is_hexa_char (guint32 a_char) { if ((a_char >= '0' && a_char <= '9') || (a_char >= 'A' && a_char <= 'F')) { return TRUE; } return FALSE; } /** *Returns true if the character is a nonascii *character (as defined in the css spec chap 4.1.1): * *nonascii ::= [^\0-\177] * *@param a_char the character to test. *@return TRUE if the character is a nonascii char, *FALSE otherwise. */ gboolean cr_utils_is_nonascii (guint32 a_char) { if (a_char <= 177) { return FALSE; } return TRUE; } /** *Dumps a character a_nb times on a file. *@param a_char the char to dump *@param a_fp the destination file pointer *@param a_nb the number of times a_char is to be dumped. */ void cr_utils_dump_n_chars (guchar a_char, FILE * a_fp, glong a_nb) { glong i = 0; for (i = 0; i < a_nb; i++) { fprintf (a_fp, "%c", a_char); } } void cr_utils_dump_n_chars2 (guchar a_char, GString * a_string, glong a_nb) { glong i = 0; g_return_if_fail (a_string); for (i = 0; i < a_nb; i++) { g_string_append_printf (a_string, "%c", a_char); } } /** *Duplicates a list of GString instances. *@return the duplicated list of GString instances or NULL if *something bad happened. *@param a_list_of_strings the list of strings to be duplicated. 
*/
GList *
cr_utils_dup_glist_of_string (GList const * a_list_of_strings)
{
        GList const *cur = NULL;
        GList *result = NULL;

        g_return_val_if_fail (a_list_of_strings, NULL);

        for (cur = a_list_of_strings; cur; cur = cur->next) {
                GString const *src = (GString const *) cur->data;

                /*an element that carries no string is skipped */
                if (!src)
                        continue;

                /*
                 *Prepend (constant time) and reverse once at the end,
                 *rather than walking the whole result list on each
                 *append.
                 */
                result = g_list_prepend (result,
                                         g_string_new_len (src->str,
                                                           src->len));
        }

        return g_list_reverse (result);
}

/**
 *Duplicate a GList where the GList::data is a CRString.
 *Elements for which cr_string_dup() returns NULL are left out
 *of the result.
 *@param a_list_of_strings the list to duplicate
 *@return the duplicated list, or NULL if something bad
 *happened.
 */
GList *
cr_utils_dup_glist_of_cr_string (GList const * a_list_of_strings)
{
        GList const *cur = NULL;
        GList *result = NULL;

        g_return_val_if_fail (a_list_of_strings, NULL);

        for (cur = a_list_of_strings; cur; cur = cur->next) {
                CRString *str = cr_string_dup ((CRString const *) cur->data);

                /*same prepend-then-reverse scheme: keeps the order, linear time */
                if (str)
                        result = g_list_prepend (result, str);
        }

        return g_list_reverse (result);
}