1 files changed, 1330 insertions, 0 deletions
diff --git a/src/st/croco/cr-utils.c b/src/st/croco/cr-utils.c
new file mode 100644
index 0000000..5fafade
--- /dev/null
+++ b/src/st/croco/cr-utils.c
@@ -0,0 +1,1330 @@
+/* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 8 -*- */
+
+/*
+ * This file is part of The Croco Library
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ * Author: Dodji Seketeli
+ * See COPYRIGHTS file for copyright information.
+ */
+
+#include "cr-utils.h"
+#include "cr-string.h"
+
+/**
+ *@file:
+ *Some misc utility functions used
+ *in the libcroco.
+ *Note that throughout this file I will
+ *refer to the CSS SPECIFICATIONS DOCUMENTATION
+ *written by the w3c guys. You can find that document
+ *at http://www.w3.org/TR/REC-CSS2/ .
+ */
+
+/****************************
+ *Encoding transformations and
+ *encoding helpers
+ ****************************/
+
+/*
+ *Here is the correspondence between the ucs-4 character codes
+ *and their matching utf-8 encoding pattern as described by RFC 2279:
+ *
+ *UCS-4 range (hex.)    UTF-8 octet sequence (binary)
+ *------------------    -----------------------------
+ *0000 0000-0000 007F   0xxxxxxx
+ *0000 0080-0000 07FF   110xxxxx 10xxxxxx
+ *0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
+ *0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
+ */
+
+/**
+ *Given an utf8 string buffer, calculates
+ *the length of this string if it was encoded
+ *in ucs4.
+ *@param a_in_start a pointer to the beginning of
+ *the input utf8 string.
+ *@param a_in_end a pointer to the end of the input
+ *utf8 string (points to the last byte of the buffer)
+ *@param a_len out parameter the calculated length.
+ *@return CR_OK upon successful completion, CR_ENCODING_ERROR on a
+ *malformed sequence, CR_END_OF_INPUT_ERROR on a truncated one.
+ */
+enum CRStatus
+cr_utils_utf8_str_len_as_ucs4 (const guchar * a_in_start,
+                               const guchar * a_in_end, gulong * a_len)
+{
+        const guchar *byte_ptr = NULL;
+        gulong len = 0;
+
+        g_return_val_if_fail (a_in_start && a_in_end && a_len,
+                              CR_BAD_PARAM_ERROR);
+        *a_len = 0;
+
+        /*
+         *Only the length of each sequence matters here,
+         *so the character codes themselves are not decoded.
+         */
+        for (byte_ptr = a_in_start; byte_ptr <= a_in_end; byte_ptr++) {
+                gint nb_bytes_2_decode = 0;
+
+                if (*byte_ptr <= 0x7F) {
+                        /*
+                         *7 bits long char
+                         *encoded over 1 byte:
+                         * 0xxx xxxx
+                         */
+                        nb_bytes_2_decode = 1;
+
+                } else if ((*byte_ptr & 0xE0) == 0xC0) {
+                        /*
+                         *up to 11 bits long char.
+                         *encoded over 2 bytes:
+                         *110x xxxx  10xx xxxx
+                         */
+                        nb_bytes_2_decode = 2;
+
+                } else if ((*byte_ptr & 0xF0) == 0xE0) {
+                        /*
+                         *up to 16 bit long char
+                         *encoded over 3 bytes:
+                         *1110 xxxx  10xx xxxx  10xx xxxx
+                         */
+                        nb_bytes_2_decode = 3;
+
+                } else if ((*byte_ptr & 0xF8) == 0xF0) {
+                        /*
+                         *up to 21 bits long char
+                         *encoded over 4 bytes:
+                         *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
+                         */
+                        nb_bytes_2_decode = 4;
+
+                } else if ((*byte_ptr & 0xFC) == 0xF8) {
+                        /*
+                         *up to 26 bits long char
+                         *encoded over 5 bytes.
+                         *1111 10xx  10xx xxxx  10xx xxxx
+                         *10xx xxxx  10xx xxxx
+                         */
+                        nb_bytes_2_decode = 5;
+
+                } else if ((*byte_ptr & 0xFE) == 0xFC) {
+                        /*
+                         *up to 31 bits long char
+                         *encoded over 6 bytes:
+                         *1111 110x  10xx xxxx  10xx xxxx
+                         *10xx xxxx  10xx xxxx  10xx xxxx
+                         */
+                        nb_bytes_2_decode = 6;
+
+                } else {
+                        /*
+                         *BAD ENCODING: a stray continuation
+                         *byte (10xx xxxx) or 0xFE/0xFF.
+                         */
+                        return CR_ENCODING_ERROR;
+                }
+
+                /*
+                 *A multi-byte sequence cut short by the end
+                 *of the buffer is rejected here, so that the
+                 *loop below never reads past a_in_end.
+                 */
+                if ((gulong) (a_in_end - byte_ptr)
+                    < (gulong) (nb_bytes_2_decode - 1)) {
+                        return CR_END_OF_INPUT_ERROR;
+                }
+
+                /*
+                 *Go and check the remaining byte(s)
+                 *(if any) of the current character.
+                 */
+                for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
+                        /*move to the next byte */
+                        byte_ptr++;
+
+                        /*byte pattern must be: 10xx xxxx */
+                        if ((*byte_ptr & 0xC0) != 0x80) {
+                                return CR_ENCODING_ERROR;
+                        }
+                }
+
+                len++;
+        }
+
+        *a_len = len;
+
+        return CR_OK;
+}
+
+/**
+ *Given an ucs4 string, this function returns the size (in bytes)
+ *this string would occupy if it was encoded in utf-8.
+ *@param a_in_start a pointer to the beginning of the input buffer.
+ *@param a_in_end a pointer to the end (last character) of the input buffer.
+ *@param a_len out parameter. The computed length. Left untouched on error.
+ *@return CR_OK upon successful completion, CR_BAD_PARAM_ERROR on a NULL
+ *parameter, CR_ENCODING_ERROR if a character is greater than 0x7FFFFFFF.
+ */
+enum CRStatus
+cr_utils_ucs4_str_len_as_utf8 (const guint32 * a_in_start,
+                               const guint32 * a_in_end, gulong * a_len)
+{
+        gulong len = 0;
+        const guint32 *char_ptr = NULL;
+
+        g_return_val_if_fail (a_in_start && a_in_end && a_len,
+                              CR_BAD_PARAM_ERROR);
+
+        /*each branch adds the size of the matching utf-8 sequence */
+        for (char_ptr = a_in_start; char_ptr <= a_in_end; char_ptr++) {
+                if (*char_ptr <= 0x7F) {
+                        len += 1;
+                } else if (*char_ptr <= 0x7FF) {
+                        len += 2;
+                } else if (*char_ptr <= 0xFFFF) {
+                        len += 3;
+                } else if (*char_ptr <= 0x1FFFFF) {
+                        len += 4;
+                } else if (*char_ptr <= 0x3FFFFFF) {
+                        len += 5;
+                } else if (*char_ptr <= 0x7FFFFFFF) {
+                        len += 6;
+                } else {
+                        /*no utf-8 sequence can encode this */
+                        return CR_ENCODING_ERROR;
+                }
+        }
+
+        *a_len = len;
+        return CR_OK;
+}
+
+/**
+ *Computes the number of bytes an ucs1 string
+ *would take once encoded in utf-8.
+ *@param a_in_start a pointer to the first byte of the
+ *input buffer.
+ *@param a_in_end a pointer to the last byte of the input buffer.
+ *@param a_len out parameter. The computed length, in bytes.
+ *@return CR_OK upon successful completion, CR_BAD_PARAM_ERROR
+ *if one of the parameters is NULL.
+ */
+enum CRStatus
+cr_utils_ucs1_str_len_as_utf8 (const guchar * a_in_start,
+                               const guchar * a_in_end, gulong * a_len)
+{
+        gint nb_bytes = 0;
+        guchar *cur = (guchar *) a_in_start;
+
+        g_return_val_if_fail (a_in_start && a_in_end && a_len,
+                              CR_BAD_PARAM_ERROR);
+
+        /*
+         *A char up to 0x7F is encoded over
+         *one byte in utf-8, any other ucs1
+         *char is encoded over two bytes.
+         */
+        while (cur <= a_in_end) {
+                nb_bytes += (*cur > 0x7F) ? 2 : 1;
+                cur++;
+        }
+
+        *a_len = nb_bytes;
+
+        return CR_OK;
+}
+
+/**
+ *Converts an utf8 buffer into an ucs4 buffer.
+ *@param a_in the input utf8 buffer to convert.
+ *@param a_in_len in/out parameter. The size of the
+ *input buffer to convert. After return, this parameter contains
+ *the number of bytes consumed by the characters actually converted.
+ *@param a_out the output converted ucs4 buffer. Must be allocated by
+ *the caller.
+ *@param a_out_len in/out parameter. The size of the output buffer,
+ *in characters. If it is smaller than the real needed size, the function
+ *just converts what it can and returns a success status. After return,
+ *this param holds the actual number of characters decoded.
+ *@return CR_OK upon successful completion, CR_ENCODING_ERROR on a malformed
+ *sequence, CR_END_OF_INPUT_ERROR when the last sequence is truncated.
+ */
+enum CRStatus
+cr_utils_utf8_to_ucs4 (const guchar * a_in,
+                       gulong * a_in_len, guint32 * a_out, gulong * a_out_len)
+{
+        gulong in_len = 0,
+                out_len = 0,
+                in_index = 0,
+                out_index = 0;
+        enum CRStatus status = CR_OK;
+
+        g_return_val_if_fail (a_in && a_in_len
+                              && a_out && a_out_len, CR_BAD_PARAM_ERROR);
+
+        in_len = *a_in_len;
+        out_len = *a_out_len;
+
+        /*
+         *in_index always points to the first byte of a character:
+         *it only moves once that character is fully decoded,
+         *validated and stored. An empty input or output buffer
+         *simply skips the loop.
+         */
+        while ((in_index < in_len) && (out_index < out_len)) {
+                gulong nb_bytes_2_decode = 0,
+                        i = 0;
+
+                /*
+                 *to store the final decoded
+                 *unicode char
+                 */
+                guint32 c = 0;
+
+                if (a_in[in_index] <= 0x7F) {
+                        /*
+                         *7 bits long char
+                         *encoded over 1 byte:
+                         * 0xxx xxxx
+                         */
+                        c = a_in[in_index];
+                        nb_bytes_2_decode = 1;
+
+                } else if ((a_in[in_index] & 0xE0) == 0xC0) {
+                        /*
+                         *up to 11 bits long char.
+                         *encoded over 2 bytes:
+                         *110x xxxx  10xx xxxx
+                         */
+                        c = a_in[in_index] & 0x1F;
+                        nb_bytes_2_decode = 2;
+
+                } else if ((a_in[in_index] & 0xF0) == 0xE0) {
+                        /*
+                         *up to 16 bit long char
+                         *encoded over 3 bytes:
+                         *1110 xxxx  10xx xxxx  10xx xxxx
+                         */
+                        c = a_in[in_index] & 0x0F;
+                        nb_bytes_2_decode = 3;
+
+                } else if ((a_in[in_index] & 0xF8) == 0xF0) {
+                        /*
+                         *up to 21 bits long char
+                         *encoded over 4 bytes:
+                         *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
+                         */
+                        c = a_in[in_index] & 0x7;
+                        nb_bytes_2_decode = 4;
+
+                } else if ((a_in[in_index] & 0xFC) == 0xF8) {
+                        /*
+                         *up to 26 bits long char
+                         *encoded over 5 bytes.
+                         *1111 10xx  10xx xxxx  10xx xxxx
+                         *10xx xxxx  10xx xxxx
+                         */
+                        c = a_in[in_index] & 3;
+                        nb_bytes_2_decode = 5;
+
+                } else if ((a_in[in_index] & 0xFE) == 0xFC) {
+                        /*
+                         *up to 31 bits long char
+                         *encoded over 6 bytes:
+                         *1111 110x  10xx xxxx  10xx xxxx
+                         *10xx xxxx  10xx xxxx  10xx xxxx
+                         */
+                        c = a_in[in_index] & 1;
+                        nb_bytes_2_decode = 6;
+
+                } else {
+                        /*BAD ENCODING */
+                        status = CR_ENCODING_ERROR;
+                        goto end;
+                }
+
+                /*
+                 *The sequence is cut short by the end of the
+                 *input: stop here rather than read past it.
+                 */
+                if (nb_bytes_2_decode > in_len - in_index) {
+                        status = CR_END_OF_INPUT_ERROR;
+                        goto end;
+                }
+
+                /*
+                 *Go and decode the remaining byte(s)
+                 *(if any) to get the current character.
+                 */
+                for (i = 1; i < nb_bytes_2_decode; i++) {
+                        /*byte pattern must be: 10xx xxxx */
+                        if ((a_in[in_index + i] & 0xC0) != 0x80) {
+                                status = CR_ENCODING_ERROR;
+                                goto end;
+                        }
+
+                        c = (c << 6) | (a_in[in_index + i] & 0x3F);
+                }
+
+                /************************
+                 *Some security tests
+                 ***********************/
+
+                /*
+                 *c must be a char (neither 0xFFFE nor 0xFFFF),
+                 *must not be greater than the max ucs4 char value,
+                 *must be outside of the UTF16 surrogates range
+                 *and must not equal zero.
+                 */
+                if (c == 0xFFFF || c == 0xFFFE
+                    || c > 0x10FFFF
+                    || (c >= 0xD800 && c <= 0xDFFF)
+                    || c == 0) {
+                        status = CR_ENCODING_ERROR;
+                        goto end;
+                }
+
+                a_out[out_index] = c;
+                out_index++;
+                in_index += nb_bytes_2_decode;
+        }
+
+      end:
+        *a_out_len = out_index;
+        *a_in_len = in_index;
+
+        return status;
+}
+
+/**
+ *Reads a character from an utf8 buffer: decodes the next
+ *unicode character code and returns it.
+ *@param a_in the starting address of the utf8 buffer.
+ *@param a_in_len the length of the utf8 buffer. If it is zero,
+ *nothing is read and CR_OK is returned.
+ *@param a_out output parameter. The resulting read char.
+ *@param a_consumed the number of bytes consumed to decode the char.
+ *@return CR_OK upon success, CR_END_OF_INPUT_ERROR if the sequence is
+ *longer than the buffer, CR_ENCODING_ERROR if it is malformed.
+ */
+enum CRStatus
+cr_utils_read_char_from_utf8_buf (const guchar * a_in,
+                                  gulong a_in_len,
+                                  guint32 * a_out, gulong * a_consumed)
+{
+        gulong in_index = 0,
+               nb_bytes_2_decode = 0;
+        enum CRStatus status = CR_OK;
+
+        /*
+         *to store the final decoded
+         *unicode char
+         */
+        guint32 c = 0;
+
+        g_return_val_if_fail (a_in && a_out && a_consumed,
+                              CR_BAD_PARAM_ERROR);
+
+        if (a_in_len < 1) {
+                /*nothing to read: *a_out is left untouched */
+                goto end;
+        }
+
+        if (*a_in <= 0x7F) {
+                /*
+                 *7 bits long char
+                 *encoded over 1 byte:
+                 * 0xxx xxxx
+                 */
+                c = *a_in;
+                nb_bytes_2_decode = 1;
+
+        } else if ((*a_in & 0xE0) == 0xC0) {
+                /*
+                 *up to 11 bits long char.
+                 *encoded over 2 bytes:
+                 *110x xxxx  10xx xxxx
+                 */
+                c = *a_in & 0x1F;
+                nb_bytes_2_decode = 2;
+
+        } else if ((*a_in & 0xF0) == 0xE0) {
+                /*
+                 *up to 16 bit long char
+                 *encoded over 3 bytes:
+                 *1110 xxxx  10xx xxxx  10xx xxxx
+                 */
+                c = *a_in & 0x0F;
+                nb_bytes_2_decode = 3;
+
+        } else if ((*a_in & 0xF8) == 0xF0) {
+                /*
+                 *up to 21 bits long char
+                 *encoded over 4 bytes:
+                 *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
+                 */
+                c = *a_in & 0x7;
+                nb_bytes_2_decode = 4;
+
+        } else if ((*a_in & 0xFC) == 0xF8) {
+                /*
+                 *up to 26 bits long char
+                 *encoded over 5 bytes.
+                 *1111 10xx  10xx xxxx  10xx xxxx
+                 *10xx xxxx  10xx xxxx
+                 */
+                c = *a_in & 3;
+                nb_bytes_2_decode = 5;
+
+        } else if ((*a_in & 0xFE) == 0xFC) {
+                /*
+                 *up to 31 bits long char
+                 *encoded over 6 bytes:
+                 *1111 110x  10xx xxxx  10xx xxxx
+                 *10xx xxxx  10xx xxxx  10xx xxxx
+                 */
+                c = *a_in & 1;
+                nb_bytes_2_decode = 6;
+
+        } else {
+                /*BAD ENCODING */
+                status = CR_ENCODING_ERROR;
+                goto end;
+        }
+
+        if (nb_bytes_2_decode > a_in_len) {
+                status = CR_END_OF_INPUT_ERROR;
+                goto end;
+        }
+
+        /*
+         *Go and decode the remaining byte(s)
+         *(if any) to get the current character.
+         */
+        for (in_index = 1; in_index < nb_bytes_2_decode; in_index++) {
+                /*byte pattern must be: 10xx xxxx */
+                if ((a_in[in_index] & 0xC0) != 0x80) {
+                        status = CR_ENCODING_ERROR;
+                        goto end;
+                }
+
+                c = (c << 6) | (a_in[in_index] & 0x3F);
+        }
+
+        /*
+         *The decoded ucs4 char is now in c.
+         */
+
+        /************************
+         *Some security tests
+         ***********************/
+
+        /*
+         *c must be a char (neither 0xFFFE nor 0xFFFF),
+         *must not be greater than the max ucs4 char value,
+         *must be outside of the UTF16 surrogates range
+         *and must not equal zero.
+         */
+        if (c == 0xFFFF || c == 0xFFFE
+            || c > 0x10FFFF
+            || (c >= 0xD800 && c <= 0xDFFF)
+            || c == 0) {
+                status = CR_ENCODING_ERROR;
+                goto end;
+        }
+
+        *a_out = c;
+
+      end:
+        /*
+         *also set on error: 0 when the leading byte is bad,
+         *else the length announced by the leading byte.
+         */
+        *a_consumed = nb_bytes_2_decode;
+
+        return status;
+}
+
+/**
+ *Given an utf8 string buffer, calculates
+ *the length of this string if it was encoded
+ *in ucs1.
+ *@param a_in_start a pointer to the beginning of
+ *the input utf8 string.
+ *@param a_in_end a pointer to the end of the input
+ *utf8 string (points to the last byte of the buffer)
+ *@param a_len out parameter the calculated length.
+ *@return CR_OK upon successful completion, CR_ENCODING_ERROR on a
+ *malformed sequence or on a char greater than 0xFF,
+ *CR_END_OF_INPUT_ERROR on a sequence cut by the end of the buffer.
+ */
+enum CRStatus
+cr_utils_utf8_str_len_as_ucs1 (const guchar * a_in_start,
+                               const guchar * a_in_end, gulong * a_len)
+{
+        const guchar *byte_ptr = NULL;
+        gulong len = 0;
+
+        g_return_val_if_fail (a_in_start && a_in_end && a_len,
+                              CR_BAD_PARAM_ERROR);
+        *a_len = 0;
+
+        for (byte_ptr = a_in_start; byte_ptr <= a_in_end; byte_ptr++) {
+                gint nb_bytes_2_decode = 0;
+                /*to store the final decoded unicode char */
+                guint c = 0;
+
+                if (*byte_ptr <= 0x7F) {
+                        /*
+                         *7 bits long char
+                         *encoded over 1 byte:
+                         * 0xxx xxxx
+                         */
+                        c = *byte_ptr;
+                        nb_bytes_2_decode = 1;
+
+                } else if ((*byte_ptr & 0xE0) == 0xC0) {
+                        /*
+                         *up to 11 bits long char.
+                         *encoded over 2 bytes:
+                         *110x xxxx  10xx xxxx
+                         */
+                        c = *byte_ptr & 0x1F;
+                        nb_bytes_2_decode = 2;
+
+                } else if ((*byte_ptr & 0xF0) == 0xE0) {
+                        /*
+                         *up to 16 bit long char
+                         *encoded over 3 bytes:
+                         *1110 xxxx  10xx xxxx  10xx xxxx
+                         */
+                        c = *byte_ptr & 0x0F;
+                        nb_bytes_2_decode = 3;
+
+                } else if ((*byte_ptr & 0xF8) == 0xF0) {
+                        /*
+                         *up to 21 bits long char
+                         *encoded over 4 bytes:
+                         *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
+                         */
+                        c = *byte_ptr & 0x7;
+                        nb_bytes_2_decode = 4;
+
+                } else if ((*byte_ptr & 0xFC) == 0xF8) {
+                        /*
+                         *up to 26 bits long char
+                         *encoded over 5 bytes.
+                         *1111 10xx  10xx xxxx  10xx xxxx
+                         *10xx xxxx  10xx xxxx
+                         */
+                        c = *byte_ptr & 3;
+                        nb_bytes_2_decode = 5;
+
+                } else if ((*byte_ptr & 0xFE) == 0xFC) {
+                        /*
+                         *up to 31 bits long char
+                         *encoded over 6 bytes:
+                         *1111 110x  10xx xxxx  10xx xxxx
+                         *10xx xxxx  10xx xxxx  10xx xxxx
+                         */
+                        c = *byte_ptr & 1;
+                        nb_bytes_2_decode = 6;
+
+                } else {
+                        /*
+                         *BAD ENCODING
+                         */
+                        return CR_ENCODING_ERROR;
+                }
+
+                /*never read past a_in_end on a truncated sequence */
+                if ((gulong) (a_in_end - byte_ptr)
+                    < (gulong) (nb_bytes_2_decode - 1)) {
+                        return CR_END_OF_INPUT_ERROR;
+                }
+
+                /*
+                 *Go and decode the remaining byte(s)
+                 *(if any) to get the current character.
+                 */
+                for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
+                        /*decode the next byte */
+                        byte_ptr++;
+
+                        /*byte pattern must be: 10xx xxxx */
+                        if ((*byte_ptr & 0xC0) != 0x80) {
+                                return CR_ENCODING_ERROR;
+                        }
+
+                        c = (c << 6) | (*byte_ptr & 0x3F);
+                }
+
+                /*
+                 *The decoded ucs4 char is now in c. Add other conditions
+                 *to support other char sets (ucs2, ucs3, ucs4).
+                 */
+                if (c > 0xFF) {
+                        /*the char does not fit into an ucs1 char */
+                        return CR_ENCODING_ERROR;
+                }
+
+                len++;
+        }
+
+        *a_len = len;
+
+        return CR_OK;
+}
+
+/**
+ *Converts an utf8 string into an ucs4 string.
+ *@param a_in the input string to convert.
+ *@param a_in_len in/out. The input length, then the bytes consumed.
+ *@param a_out out parameter. The output string, allocated by this
+ *function and to be freed by the caller. NULL on an empty input.
+ *@param a_out_len out parameter. The length of the output string.
+ *@return CR_OK upon successful completion, an error code otherwise.
+ */
+enum CRStatus
+cr_utils_utf8_str_to_ucs4 (const guchar * a_in,
+                           gulong * a_in_len,
+                           guint32 ** a_out, gulong * a_out_len)
+{
+        enum CRStatus status = CR_OK;
+
+        g_return_val_if_fail (a_in && a_in_len
+                              && a_out && a_out_len, CR_BAD_PARAM_ERROR);
+
+        /*&a_in[*a_in_len - 1] is only valid on a non empty input */
+        if (*a_in_len < 1) {
+                *a_out_len = 0;
+                *a_out = NULL;
+                return CR_OK;
+        }
+
+        status = cr_utils_utf8_str_len_as_ucs4 (a_in,
+                                                &a_in[*a_in_len - 1],
+                                                a_out_len);
+        g_return_val_if_fail (status == CR_OK, status);
+
+        *a_out = g_malloc0 (*a_out_len * sizeof (guint32));
+
+        return cr_utils_utf8_to_ucs4 (a_in, a_in_len, *a_out, a_out_len);
+}
+
+/**
+ *Converts an ucs4 buffer into an utf8 buffer.
+ *
+ *@param a_in the input ucs4 buffer to convert.
+ *@param a_in_len in/out parameter. The size of the
+ *input buffer to convert. After return, this parameter contains
+ *the actual number of characters consumed.
+ *@param a_out the output converted utf8 buffer. Must be allocated by
+ *the caller.
+ *@param a_out_len in/out parameter. The size of the output buffer.
+ *If this size is actually smaller than the real needed size, the function
+ *just converts what it can and returns a success status. After return,
+ *this param points to the actual number of bytes in the buffer.
+ *@return CR_OK on success, CR_ENCODING_ERROR on a char above 0x7FFFFFFF.
+ */
+enum CRStatus
+cr_utils_ucs4_to_utf8 (const guint32 * a_in,
+                       gulong * a_in_len, guchar * a_out, gulong * a_out_len)
+{
+        gulong in_len = 0,
+                out_len = 0,
+                in_index = 0,
+                out_index = 0;
+        enum CRStatus status = CR_OK;
+
+        g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len,
+                              CR_BAD_PARAM_ERROR);
+
+        in_len = *a_in_len;
+        out_len = *a_out_len;
+
+        for (in_index = 0; in_index < in_len; in_index++) {
+                /*
+                 *FIXME: return whenever we encounter forbidden char values.
+                 */
+                guint32 c = a_in[in_index];
+                gulong nb_bytes = 0,
+                        i = 0;
+                guchar lead = 0;
+
+                /*
+                 *Pick the length of the utf-8 sequence and the marker
+                 *bits of its leading byte (see the RFC 2279 table above).
+                 */
+                if (c <= 0x7F) {
+                        nb_bytes = 1;
+                        lead = 0x00;
+                } else if (c <= 0x7FF) {
+                        nb_bytes = 2;
+                        lead = 0xC0;
+                } else if (c <= 0xFFFF) {
+                        nb_bytes = 3;
+                        lead = 0xE0;
+                } else if (c <= 0x1FFFFF) {
+                        nb_bytes = 4;
+                        lead = 0xF0;
+                } else if (c <= 0x3FFFFFF) {
+                        nb_bytes = 5;
+                        lead = 0xF8;
+                } else if (c <= 0x7FFFFFFF) {
+                        nb_bytes = 6;
+                        lead = 0xFC;
+                } else {
+                        status = CR_ENCODING_ERROR;
+                        goto end;
+                }
+
+                /*
+                 *The output buffer cannot hold this character:
+                 *just convert what fits and report a success.
+                 */
+                if (nb_bytes > out_len - out_index) {
+                        break;
+                }
+
+                /*
+                 *Fill the trailing bytes (10xx xxxx), last one
+                 *first, each one taking the 6 lowest bits left in c.
+                 */
+                for (i = nb_bytes - 1; i > 0; i--) {
+                        a_out[out_index + i] = (0x80 | (c & 0x3F));
+                        c >>= 6;
+                }
+
+                /*what is left of c fits in the leading byte */
+                a_out[out_index] = (lead | c);
+                out_index += nb_bytes;
+        }                       /*end for */
+
+      end:
+        /*
+         *in_index and out_index are the numbers of characters
+         *consumed and of bytes written, respectively.
+         */
+        *a_in_len = in_index;
+        *a_out_len = out_index;
+
+        return status;
+}
+
+/**
+ *Converts an ucs4 string into an utf8 string.
+ *@param a_in the input string to convert.
+ *@param a_in_len in/out. The input length, then the chars consumed.
+ *@param a_out out parameter. The output string, allocated by this
+ *function and to be freed by the caller. NULL on an empty input.
+ *@param a_out_len out parameter. The length (in bytes) of the output string.
+ *@return CR_OK upon successful completion, an error code otherwise.
+ */
+enum CRStatus
+cr_utils_ucs4_str_to_utf8 (const guint32 * a_in,
+                           gulong * a_in_len,
+                           guchar ** a_out, gulong * a_out_len)
+{
+        enum CRStatus status = CR_OK;
+
+        g_return_val_if_fail (a_in && a_in_len && a_out
+                              && a_out_len, CR_BAD_PARAM_ERROR);
+
+        if (*a_in_len < 1) {
+                *a_out_len = 0;
+                *a_out = NULL;
+                return CR_OK;
+        }
+
+        status = cr_utils_ucs4_str_len_as_utf8 (a_in, &a_in[*a_in_len - 1],
+                                                a_out_len);
+        g_return_val_if_fail (status == CR_OK, status);
+
+        *a_out = g_malloc0 (*a_out_len);
+        return cr_utils_ucs4_to_utf8 (a_in, a_in_len, *a_out, a_out_len);
+}
+
+/**
+ *Converts an ucs1 buffer into an utf8 buffer.
+ *The caller must know the size of the resulting buffer and
+ *allocate it prior to calling this function.
+ *
+ *@param a_in the input ucs1 buffer.
+ *
+ *@param a_in_len in/out parameter. The length of the input buffer.
+ *After return, points to the number of bytes actually consumed even
+ *in case of encoding error.
+ *
+ *@param a_out out parameter. The output utf8 converted buffer.
+ *
+ *@param a_out_len in/out parameter. The size of the output buffer.
+ *If the output buffer size is shorter than the actual needed size,
+ *this function just converts the characters that entirely fit in it.
+ *After return, points to the number of bytes actually written.
+ *
+ *@return CR_OK upon successful completion, an error code otherwise.
+ */
+enum CRStatus
+cr_utils_ucs1_to_utf8 (const guchar * a_in,
+                       gulong * a_in_len, guchar * a_out, gulong * a_out_len)
+{
+        gulong out_index = 0,
+                in_index = 0,
+                in_len = 0,
+                out_len = 0;
+        enum CRStatus status = CR_OK;
+
+        g_return_val_if_fail (a_in && a_in_len && a_out_len,
+                              CR_BAD_PARAM_ERROR);
+
+        if (*a_in_len == 0) {
+                *a_out_len = 0;
+                return status;
+        }
+        g_return_val_if_fail (a_out, CR_BAD_PARAM_ERROR);
+
+        in_len = *a_in_len;
+        out_len = *a_out_len;
+
+        for (in_index = 0, out_index = 0;
+             (in_index < in_len) && (out_index < out_len); in_index++) {
+                /*FIXME: return whenever we encounter forbidden char values. */
+                if (a_in[in_index] <= 0x7F) {
+                        a_out[out_index] = a_in[in_index];
+                        out_index++;
+                } else {
+                        /*both bytes of the sequence must fit in a_out */
+                        if (out_len - out_index < 2) {
+                                break;
+                        }
+                        a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
+                        a_out[out_index + 1] =
+                                (0x80 | (a_in[in_index] & 0x3F));
+                        out_index += 2;
+                }
+        }                       /*end for */
+
+        *a_in_len = in_index;
+        *a_out_len = out_index;
+
+        return status;
+}
+
+/**
+ *Converts an ucs1 string into a newly allocated utf8 buffer.
+ *
+ *@param a_in the input ucs1 string to convert.
+ *
+ *@param a_in_len in/out parameter. The length of a_in. It is passed
+ *on to cr_utils_ucs1_to_utf8(), which updates it.
+ *
+ *@param a_out out parameter. Receives the buffer allocated by this
+ *function with g_malloc0() to hold the converted string, or NULL
+ *if the input is empty.
+ *
+ *@param a_out_len out parameter. Receives the length, in bytes,
+ *of the converted string (0 if the input is empty).
+ *
+ *@return CR_OK upon successful completion, an error code otherwise.
+ *
+ */
+enum CRStatus
+cr_utils_ucs1_str_to_utf8 (const guchar * a_in,
+                           gulong * a_in_len,
+                           guchar ** a_out, gulong * a_out_len)
+{
+        enum CRStatus status = CR_OK;
+        const guchar *last_byte = NULL;
+        gulong needed = 0;
+
+        g_return_val_if_fail (a_in && a_in_len && a_out
+                              && a_out_len, CR_BAD_PARAM_ERROR);
+
+        /*An empty input yields an empty result, nothing is allocated. */
+        if (*a_in_len == 0) {
+                *a_out = NULL;
+                *a_out_len = 0;
+                return CR_OK;
+        }
+
+        /*First pass: size of the utf8 form of [a_in, last_byte]. */
+        last_byte = a_in + (*a_in_len - 1);
+        status = cr_utils_ucs1_str_len_as_utf8 (a_in, last_byte, &needed);
+
+        g_return_val_if_fail (status == CR_OK, status);
+
+        /*Second pass: the conversion itself, into a fresh buffer. */
+        *a_out = g_malloc0 (needed);
+        status = cr_utils_ucs1_to_utf8 (a_in, a_in_len, *a_out, &needed);
+        *a_out_len = needed;
+
+        return status;
+}
+
+/**
+ *Converts an utf8 buffer into an ucs1 buffer.
+ *The caller must know the size of the resulting
+ *converted buffer, and allocate it prior to calling this
+ *function.
+ *
+ *Every utf8 sequence is decoded into one character; characters
+ *above 0xFF do not fit in ucs1 and are reported as an
+ *encoding error.
+ *
+ *@param a_in the input utf8 buffer to convert.
+ *
+ *@param a_in_len in/out parameter. The size of the input utf8 buffer.
+ *After return, holds the input position reached by the function,
+ *even in case of encoding error.
+ *
+ *@param a_out out parameter. Points to the resulting buffer.
+ *Must be allocated by the caller. If a_out is shorter than
+ *required, this function converts what it can and returns
+ *a successful status.
+ *
+ *@param a_out_len in/out parameter. The size of the output buffer.
+ *After return, holds the number of bytes written to a_out, even
+ *in case of encoding error.
+ *
+ *@return CR_OK upon successful completion, CR_ENCODING_ERROR if
+ *the input is not valid utf8 or holds a character above 0xFF,
+ *CR_BAD_PARAM_ERROR if a parameter is NULL. An input that ends
+ *in the middle of a sequence is not an error: the conversion
+ *stops in front of that sequence.
+ */
+enum CRStatus
+cr_utils_utf8_to_ucs1 (const guchar * a_in,
+                       gulong * a_in_len, guchar * a_out, gulong * a_out_len)
+{
+        enum CRStatus status = CR_OK;
+        gulong in_index = 0;
+        gulong out_index = 0;
+        gulong in_len = 0;
+        gulong out_len = 0;
+
+        /*the character being decoded */
+        guint32 c = 0;
+
+        g_return_val_if_fail (a_in && a_in_len
+                              && a_out && a_out_len, CR_BAD_PARAM_ERROR);
+
+        /*With an empty input the loop below simply never runs. */
+        in_len = *a_in_len;
+        out_len = *a_out_len;
+
+        while ((in_index < in_len) && (out_index < out_len)) {
+                const guchar lead = a_in[in_index];
+                gint seq_len = 0;
+
+                /*
+                 *The leading byte gives the length of the sequence
+                 *and the most significant bits of the character.
+                 */
+                if (lead <= 0x7F) {
+                        /* 0xxx xxxx: 7 bits, 1 byte */
+                        seq_len = 1;
+                        c = lead;
+                } else if ((lead & 0xE0) == 0xC0) {
+                        /* 110x xxxx: up to 11 bits, 2 bytes */
+                        seq_len = 2;
+                        c = lead & 0x1F;
+                } else if ((lead & 0xF0) == 0xE0) {
+                        /* 1110 xxxx: up to 16 bits, 3 bytes */
+                        seq_len = 3;
+                        c = lead & 0x0F;
+                } else if ((lead & 0xF8) == 0xF0) {
+                        /* 1111 0xxx: up to 21 bits, 4 bytes */
+                        seq_len = 4;
+                        c = lead & 0x7;
+                } else if ((lead & 0xFC) == 0xF8) {
+                        /* 1111 10xx: up to 26 bits, 5 bytes */
+                        seq_len = 5;
+                        c = lead & 3;
+                } else if ((lead & 0xFE) == 0xFC) {
+                        /* 1111 110x: up to 31 bits, 6 bytes */
+                        seq_len = 6;
+                        c = lead & 1;
+                } else {
+                        /*not a valid leading byte */
+                        status = CR_ENCODING_ERROR;
+                        break;
+                }
+
+                /*
+                 *The sequence runs past the end of the input:
+                 *stop in front of it, without raising an error.
+                 */
+                if (in_index + seq_len - 1 >= in_len) {
+                        break;
+                }
+
+                /*
+                 *Fold the continuation bytes (10xx xxxx), six
+                 *bits at a time, into the character.
+                 */
+                while (seq_len > 1) {
+                        in_index++;
+
+                        if ((a_in[in_index] & 0xC0) != 0x80) {
+                                status = CR_ENCODING_ERROR;
+                                goto end;
+                        }
+
+                        c = (c << 6) | (a_in[in_index] & 0x3F);
+                        seq_len--;
+                }
+
+                /*ucs1 can only hold characters up to 0xFF. */
+                if (c > 0xFF) {
+                        status = CR_ENCODING_ERROR;
+                        break;
+                }
+
+                a_out[out_index] = c;
+
+                /*one character done: step over it on both sides */
+                in_index++;
+                out_index++;
+        }
+
+      end:
+        *a_out_len = out_index;
+        *a_in_len = in_index;
+
+        return status;
+}
+
+/**
+ *Converts an utf8 buffer into a newly allocated
+ *ucs1 buffer.
+ *
+ *@param a_in the input utf8 buffer.
+ *
+ *@param a_in_len in/out parameter. The size, in bytes, of a_in.
+ *It is passed on to cr_utils_utf8_to_ucs1(), which updates it.
+ *
+ *@param a_out out parameter. The resulting converted ucs1 buffer,
+ *or NULL if the input is empty.
+ *Must be freed by the caller using g_free().
+ *
+ *@param a_out_len out parameter. The length, in bytes, of the
+ *converted buffer.
+ *
+ *@return CR_OK upon successful completion, an error code otherwise.
+ *Note that out parameters are valid if and only if this function
+ *returns CR_OK. If the conversion fails, the buffer is released
+ *here, *a_out is set to NULL and *a_out_len to 0.
+ */
+enum CRStatus
+cr_utils_utf8_str_to_ucs1 (const guchar * a_in,
+                           gulong * a_in_len,
+                           guchar ** a_out, gulong * a_out_len)
+{
+        enum CRStatus status = CR_OK;
+
+        g_return_val_if_fail (a_in && a_in_len
+                              && a_out && a_out_len, CR_BAD_PARAM_ERROR);
+
+        if (*a_in_len < 1) {
+                *a_out_len = 0;
+                *a_out = NULL;
+                return CR_OK;
+        }
+
+        /*
+         *NOTE(review): cr_utils_utf8_str_len_as_ucs4() is presumably
+         *the number of characters encoded in the input; confirm
+         *against its definition.
+         */
+        status = cr_utils_utf8_str_len_as_ucs4 (a_in, &a_in[*a_in_len - 1],
+                                                a_out_len);
+
+        g_return_val_if_fail (status == CR_OK, status);
+
+        /*
+         *An ucs1 character takes exactly one byte, so the
+         *buffer needs one byte per character (not the four
+         *bytes an ucs4 character would take).
+         */
+        *a_out = g_malloc0 (*a_out_len);
+
+        status = cr_utils_utf8_to_ucs1 (a_in, a_in_len, *a_out, a_out_len);
+
+        /*
+         *The out parameters are meaningless on failure: do not
+         *leave a half converted buffer for the caller to leak.
+         */
+        if (status != CR_OK) {
+                g_free (*a_out);
+                *a_out = NULL;
+                *a_out_len = 0;
+        }
+
+        return status;
+}
+
+/*****************************************
+ *CSS basic types identification utilities
+ *****************************************/
+
+/**
+ *Tells whether a character is a white space as
+ *defined in the css spec in chap 4.1.1:
+ *
+ *white-space ::= ' '| \t|\r|\n|\f
+ *
+ *@param a_char the character to test.
+ *@return TRUE if a_char is a space, a tab, a carriage return,
+ *a line feed or a form feed, FALSE otherwise.
+ */
+gboolean
+cr_utils_is_white_space (guint32 a_char)
+{
+        if (a_char == ' '
+            || a_char == '\t'
+            || a_char == '\r'
+            || a_char == '\n'
+            || a_char == '\f') {
+                return TRUE;
+        }
+
+        return FALSE;
+}
+
+/**
+ *Tells whether a character is one of the characters
+ *that make up a newline, as defined in the css spec
+ *in the chap 4.1.1:
+ *
+ *nl ::= \n|\r\n|\r|\f
+ *
+ *Only single characters are tested here: the two
+ *characters sequence \r\n is not handled by this function.
+ *
+ *@param a_char the character to test.
+ *@return TRUE if a_char is a line feed, a carriage return
+ *or a form feed, FALSE otherwise.
+ */
+gboolean
+cr_utils_is_newline (guint32 a_char)
+{
+        if (a_char == '\n' || a_char == '\r' || a_char == '\f') {
+                return TRUE;
+        }
+
+        return FALSE;
+}
+
+/**
+ *Tells whether a character is an hexadecimal digit,
+ *i.e hexa_char ::= [0-9A-F]
+ *
+ *NOTE(review): lower case letters ('a' to 'f') are rejected.
+ *Confirm that callers never need them to be accepted.
+ *
+ *@param a_char the character to test.
+ *@return TRUE if a_char is a decimal digit or an upper case
+ *letter between 'A' and 'F', FALSE otherwise.
+ */
+gboolean
+cr_utils_is_hexa_char (guint32 a_char)
+{
+        gboolean is_decimal_digit = FALSE;
+        gboolean is_upper_hexa_letter = FALSE;
+
+        is_decimal_digit = (a_char >= '0' && a_char <= '9');
+        is_upper_hexa_letter = (a_char >= 'A' && a_char <= 'F');
+
+        return (is_decimal_digit || is_upper_hexa_letter) ? TRUE : FALSE;
+}
+
+/**
+ *Returns true if the character is a nonascii
+ *character (as defined in the css spec chap 4.1.1):
+ *
+ *nonascii ::= [^\0-\177]
+ *
+ *The bounds of that range are written in octal: \177 is
+ *127 in decimal, i.e. 0x7F, the last ascii character.
+ *
+ *@param a_char the character to test.
+ *@return TRUE if the character is a nonascii char
+ *(strictly greater than 0x7F), FALSE otherwise.
+ */
+gboolean
+cr_utils_is_nonascii (guint32 a_char)
+{
+        /*0x7F is octal 177; decimal 177 would let 128..177 pass as ascii. */
+        if (a_char <= 0x7F) {
+                return FALSE;
+        }
+
+        return TRUE;
+}
+
+/**
+ *Dumps a character a_nb times on a file.
+ *Nothing is written if a_nb is zero or negative.
+ *@param a_char the char to dump
+ *@param a_fp the destination file pointer. Must not be NULL.
+ *@param a_nb the number of times a_char is to be dumped.
+ */
+void
+cr_utils_dump_n_chars (guchar a_char, FILE * a_fp, glong a_nb)
+{
+        glong i = 0;
+
+        /*Same guard as cr_utils_dump_n_chars2(): refuse a NULL target. */
+        g_return_if_fail (a_fp);
+
+        for (i = 0; i < a_nb; i++) {
+                fputc (a_char, a_fp);
+        }
+}
+
+/**
+ *Appends a character a_nb times to a GString.
+ *Nothing is appended if a_nb is zero or negative.
+ *@param a_char the char to append
+ *@param a_string the destination string. Must not be NULL.
+ *@param a_nb the number of times a_char is to be appended.
+ */
+void
+cr_utils_dump_n_chars2 (guchar a_char, GString * a_string, glong a_nb)
+{
+        glong remaining = a_nb;
+
+        g_return_if_fail (a_string);
+
+        while (remaining > 0) {
+                g_string_append_printf (a_string, "%c", a_char);
+                remaining--;
+        }
+}
+
+/**
+ *Duplicates a list of GString instances.
+ *Each GString of the input list is copied into a new GString;
+ *the copies are stored, in the same order, in a new list.
+ *Elements whose data is NULL are skipped.
+ *@return the duplicated list of GString instances or NULL if
+ *something bad happened. The list and the GString instances it
+ *holds are newly allocated.
+ *@param a_list_of_strings the list of strings to be duplicated.
+ *Must not be NULL.
+ */
+GList *
+cr_utils_dup_glist_of_string (GList const * a_list_of_strings)
+{
+        GList const *cur = NULL;
+        GList *result = NULL;
+
+        g_return_val_if_fail (a_list_of_strings, NULL);
+
+        for (cur = a_list_of_strings; cur; cur = cur->next) {
+                GString const *src = cur->data;
+                GString *str = NULL;
+
+                /*nothing to copy from: do not dereference NULL */
+                if (!src)
+                        continue;
+
+                str = g_string_new_len (src->str, src->len);
+
+                /*
+                 *Prepending is constant time whereas appending
+                 *walks the whole list each time. The order is
+                 *restored once, after the loop.
+                 */
+                if (str)
+                        result = g_list_prepend (result, str);
+        }
+
+        return g_list_reverse (result);
+}
+
+/**
+ *Duplicate a GList where the GList::data is a CRString.
+ *Each CRString is copied with cr_string_dup(); the copies are
+ *stored, in the same order, in a new list. Elements for which
+ *cr_string_dup() returns NULL are left out of the result.
+ *@param a_list_of_strings the list to duplicate.
+ *Must not be NULL.
+ *@return the duplicated list, or NULL if something bad
+ *happened.
+ */
+GList *
+cr_utils_dup_glist_of_cr_string (GList const * a_list_of_strings)
+{
+        GList const *cur = NULL;
+        GList *result = NULL;
+
+        g_return_val_if_fail (a_list_of_strings, NULL);
+
+        for (cur = a_list_of_strings; cur; cur = cur->next) {
+                CRString *str = NULL;
+
+                str = cr_string_dup ((CRString const *) cur->data) ;
+
+                /*
+                 *Prepending is constant time whereas appending
+                 *walks the whole list each time. The order is
+                 *restored once, after the loop.
+                 */
+                if (str)
+                        result = g_list_prepend (result, str);
+        }
+
+        return g_list_reverse (result);
+}