diff options
Diffstat (limited to 'src/raptor_unicode.c')
-rw-r--r-- | src/raptor_unicode.c | 940 |
1 files changed, 940 insertions, 0 deletions
diff --git a/src/raptor_unicode.c b/src/raptor_unicode.c new file mode 100644 index 0000000..c50d253 --- /dev/null +++ b/src/raptor_unicode.c @@ -0,0 +1,940 @@ +/* -*- Mode: c; c-basic-offset: 2 -*- + * + * raptor_unicode.c - Raptor Unicode and UTF-8 support + * + * Copyright (C) 2002-2010, David Beckett http://www.dajobe.org/ + * Copyright (C) 2002-2004, University of Bristol, UK http://www.bristol.ac.uk/ + * + * This package is Free Software and part of Redland http://librdf.org/ + * + * It is licensed under the following three licenses as alternatives: + * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version + * 2. GNU General Public License (GPL) V2 or any newer version + * 3. Apache License, V2.0 or any newer version + * + * You may not use this file except in compliance with at least one of + * the above three licenses. + * + * See LICENSE.html or LICENSE.txt at the top of this package for the + * complete terms and further detail along with the license texts for + * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively. + * + * + */ + + +#ifdef HAVE_CONFIG_H +#include <raptor_config.h> +#endif + +#include <stdio.h> +#include <string.h> +#include <stdarg.h> +#ifdef HAVE_STDLIB_H +#include <stdlib.h> +#endif + +/* Raptor includes */ +#include "raptor2.h" +#include "raptor_internal.h" + + +/* Unicode defines only the range U+0000 to U+10FFFF */ +const raptor_unichar raptor_unicode_max_codepoint = 0x10FFFF; + + +/** + * raptor_unicode_utf8_string_put_char: + * @c: Unicode character + * @output: UTF-8 string buffer or NULL + * @length: length of output buffer + * + * Encode a Unicode character to a UTF-8 string + * + * If @output is NULL, then will calculate the length rather than + * perform the encoding. This can be used by the called to allocate + * space and then re-call this function with the new buffer. + * + * Return value: number of bytes encoded to output buffer or <0 on failure + **/ +int +raptor_unicode_utf8_string_put_char(raptor_unichar c, + unsigned char *output, size_t length) +{ + size_t size = 0; + + /* check for illegal code positions: + * [ U+D800 to U+DFFF (UTF-16 surrogates) - now allowed ] + * U+FFFE and U+FFFF + */ + if(c == 0xFFFE || c == 0xFFFF) + return -1; + + if (c < 0x00000080) + size = 1; + else if(c < 0x00000800) + size = 2; + else if(c < 0x00010000) + size = 3; + else if(c < 0x00200000) + size = 4; + else if(c < 0x04000000) + size = 5; + else if(c < 0x80000000) + size = 6; + else + return -1; + + /* when no buffer given, return size */ + if(!output) + return RAPTOR_GOOD_CAST(int, size); /* ok since size is in range 1..6 */ + + if(size > length) + return -1; + + switch(size) { + case 6: + output[5] = RAPTOR_GOOD_CAST(unsigned char, 0x80 | (unsigned char)(c & 0x3F)); + c= c >> 6; + /* set bit 2 (bits 7,6,5,4,3,2 less 7,6,5,4,3 set below) on last byte */ + c |= 0x4000000; /* 0x10000 = 0x04 << 24 */ + /* FALLTHROUGH */ + case 5: + output[4] = RAPTOR_GOOD_CAST(unsigned char, 0x80 | (unsigned char)(c & 0x3F)); + c= c >> 6; + /* set bit 3 (bits 7,6,5,4,3 less 7,6,5,4 set below) on last byte */ + c |= 0x200000; /* 0x10000 = 0x08 << 18 */ + /* FALLTHROUGH */ + case 4: + output[3] = RAPTOR_GOOD_CAST(unsigned char, 0x80 | (unsigned char)(c & 0x3F)); + c= c >> 6; + /* set bit 4 (bits 7,6,5,4 less 7,6,5 set below) on last byte */ + c |= 0x10000; /* 0x10000 = 0x10 << 12 */ + /* FALLTHROUGH */ + case 3: + output[2] = RAPTOR_GOOD_CAST(unsigned char, 0x80 | (unsigned char)(c & 0x3F)); + c= c >> 6; + /* set bit 5 (bits 7,6,5 less 7,6 set below) on last byte */ + c |= 0x800; /* 0x800 = 0x20 << 6 */ + /* FALLTHROUGH */ + case 2: + output[1] = RAPTOR_GOOD_CAST(unsigned char, 0x80 | (unsigned char)(c & 0x3F)); + c= c >> 6; + /* set bits 7,6 on last byte */ + c |= 0xc0; + /* FALLTHROUGH */ + case 1: + output[0] = (unsigned char)c; + } + + return RAPTOR_GOOD_CAST(int, size); /* ok since size is in range 1..6 */ +} + + +/** + * raptor_unicode_utf8_string_get_char: + * @input: UTF-8 string buffer + * @length: buffer size + * @output: Pointer to the Unicode character or NULL + * + * Decode a UTF-8 encoded string to get a Unicode character. + * + * If output is NULL, then will calculate the number of bytes that + * will be used from the input buffer and not perform the conversion. + * + * Return value: bytes used from input buffer or <0 on failure: -1 input buffer too short or length error, -2 overlong UTF-8 sequence, -3 illegal code positions, -4 code out of range U+0000 to U+10FFFF. In cases -2, -3 and -4 the coded character is stored in the output. + */ +int +raptor_unicode_utf8_string_get_char(const unsigned char *input, size_t length, + raptor_unichar *output) +{ + unsigned char in; + size_t size; + raptor_unichar c = 0; + + if(length < 1) + return -1; + + in=*input++; + if((in & 0x80) == 0) { + size = 1; + c= in & 0x7f; + } else if((in & 0xe0) == 0xc0) { + size = 2; + c= in & 0x1f; + } else if((in & 0xf0) == 0xe0) { + size = 3; + c= in & 0x0f; + } else if((in & 0xf8) == 0xf0) { + size = 4; + c = in & 0x07; + } else if((in & 0xfc) == 0xf8) { + size = 5; + c = in & 0x03; + } else if((in & 0xfe) == 0xfc) { + size = 6; + c = in & 0x01; + } else + return -1; + + + if(!output) + return RAPTOR_GOOD_CAST(int, size); /* ok since size is in range 1..6 */ + + if(length < size) + return -1; + + switch(size) { + case 6: + in=*input++ & 0x3f; + c= c << 6; + c |= in; + /* FALLTHROUGH */ + case 5: + in=*input++ & 0x3f; + c= c << 6; + c |= in; + /* FALLTHROUGH */ + case 4: + in=*input++ & 0x3f; + c= c << 6; + c |= in; + /* FALLTHROUGH */ + case 3: + in=*input++ & 0x3f; + c= c << 6; + c |= in; + /* FALLTHROUGH */ + case 2: + /* '*input' used here since we never need to use new value of input [CLANG] */ + in = *input & 0x3f; + c= c << 6; + c |= in; + /* FALLTHROUGH */ + default: + break; + } + + *output=c; + + /* check for overlong UTF-8 sequences */ + switch(size) { + case 2: + if(c < 0x00000080) + return -2; + break; + case 3: + if(c < 0x00000800) + return -2; + break; + case 4: + if(c < 0x00010000) + return -2; + break; + + default: /* 1 */ + break; + } + + + /* check for illegal code positions: + * [ U+D800 to U+DFFF (UTF-16 surrogates) - now allowed ] + * U+FFFE and U+FFFF + */ + if(c == 0xFFFE || c == 0xFFFF) + return -3; + + if(c > raptor_unicode_max_codepoint) + return -4; + + return RAPTOR_GOOD_CAST(int, size); /* ok since size is in range 1..6 */ +} + + +static int raptor_unicode_is_letter(long c); +static int raptor_unicode_is_basechar(long c); +static int raptor_unicode_is_ideographic(long c); +static int raptor_unicode_is_combiningchar(long c); +static int raptor_unicode_is_digit(long c); +static int raptor_unicode_is_extender(long c); + + +/** + * raptor_unicode_is_xml11_namestartchar: + * @c: Unicode character to check + * + * Check if Unicode character is legal to start an XML 1.1 Name + * + * See <ulink url="http://www.w3.org/TR/2004/REC-xml11-20040204/#NT-NameStartChar">Namespaces in XML 1.1 REC 2004-02-04 NameStartChar</ulink> + * updating + * <ulink url="http://www.w3.org/TR/2004/REC-xml11-20040204/">Extensible Markup Language (XML) 1.1 REC 2004-02-04</ulink> sec 2.3, [4a] + * excluding the ':' + * + * Return value: non-0 if legal + **/ +int +raptor_unicode_is_xml11_namestartchar(raptor_unichar c) +{ + return (((c >= 0x0041) && (c <= 0x005A)) || /* [A-Z] */ + (c == 0x005F) || /* '_' */ + ((c >= 0x0061) && (c <= 0x007A)) || /* [a-z] */ + ((c >= 0x00C0) && (c <= 0x00D6)) || + ((c >= 0x00D8) && (c <= 0x00F6)) || + ((c >= 0x00F8) && (c <= 0x02FF)) || + ((c >= 0x0370) && (c <= 0x037D)) || + ((c >= 0x037F) && (c <= 0x1FFF)) || + ((c >= 0x200C) && (c <= 0x200D)) || + ((c >= 0x2070) && (c <= 0x218F)) || + ((c >= 0x2C00) && (c <= 0x2FEF)) || + ((c >= 0x3001) && (c <= 0xD7FF)) || + ((c >= 0xF900) && (c <= 0xFDCF)) || + ((c >= 0xFDF0) && (c <= 0xFFFD)) || + ((c >= 0x10000) && (c <= 0xEFFFF))); +} + + +/** + * raptor_unicode_is_xml10_namestartchar: + * @c: Unicode character to check + * + * Check if Unicode character is legal to start an XML 1.0 Name + * + * See <ulink url="http://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName">Namespaces in XML REC 1999-01-14</ulink> + * updating + * <ulink url="http://www.w3.org/TR/2004/REC-xml-20040204/">Extensible Markup Language (XML) 1.0 (Third Edition) REC 2004-02-04</ulink> + * excluding the ':' + * + * Return value: non-0 if legal + **/ +int +raptor_unicode_is_xml10_namestartchar(raptor_unichar c) +{ + return (raptor_unicode_is_letter(c) || + (c == '_')); +} + + +/** + * raptor_unicode_is_namestartchar: + * @c: Unicode character to check + * + * Check if Unicode character is legal to start an XML Name + * + * Return value: non-0 if the character is legal + **/ +int +raptor_unicode_is_namestartchar(raptor_unichar c) { +#ifdef RAPTOR_XML_1_1 + return raptor_unicode_is_xml11_namestartchar(c); +#else + return raptor_unicode_is_xml10_namestartchar(c); +#endif +} + + +/** + * raptor_unicode_is_xml11_namechar: + * @c: Unicode character + * + * Check if a Unicode codepoint is a legal to continue an XML 1.1 Name + * + * See <ulink url="http://www.w3.org/TR/2004/REC-xml11-20040204/">Namespaces in XML 1.1 REC 2004-02-04</ulink> + * updating + * <ulink url="http://www.w3.org/TR/2004/REC-xml-20040204/">Extensible Markup Language (XML) 1.0 (Third Edition) REC 2004-02-04</ulink> sec 2.3, [4a] + * excluding the ':' + * + * Return value: non-0 if legal + **/ +int +raptor_unicode_is_xml11_namechar(raptor_unichar c) +{ + return (raptor_unicode_is_xml11_namestartchar(c) || + (c == 0x002D) || /* '-' */ + (c == 0x002E) || /* '.' */ + (c >= 0x0030 && c <= 0x0039) || /* 0-9 */ + (c == 0x00B7) || + (c >= 0x0300 && c <=0x036F) || + (c >= 0x203F && c <=0x2040)); +} + + +/** + * raptor_unicode_is_xml10_namechar: + * @c: Unicode character + * + * Check if a Unicode codepoint is a legal to continue an XML 1.0 Name + * + * See <ulink url="http://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCNameChar">Namespaces in XML REC 1999-01-14 NCNameChar</ulink> + * updating + * <ulink url="http://www.w3.org/TR/2004/REC-xml-20040204/">Extensible Markup Language (XML) 1.0 (Third Edition) REC 2004-02-04</ulink> + * excluding the ':' + * + * Return value: non-0 if legal + **/ +int +raptor_unicode_is_xml10_namechar(raptor_unichar c) +{ + return (raptor_unicode_is_letter(c) || + raptor_unicode_is_digit(c) || + (c == 0x002E) || /* '.' */ + (c == 0x002D) || /* '-' */ + (c == 0x005F) || /* '_' */ + raptor_unicode_is_combiningchar(c) || + raptor_unicode_is_extender(c)); +} + + +/** + * raptor_unicode_is_namechar: + * @c: Unicode character to check + * + * Check if Unicode character is legal to continue an XML Name . + * + * Return value: non-0 if the character is legal + **/ +int +raptor_unicode_is_namechar(raptor_unichar c) +{ +#ifdef RAPTOR_XML_1_1 + return raptor_unicode_is_xml11_namechar(c); +#else + return raptor_unicode_is_xml10_namechar(c); +#endif +} + + +/* + * All this below was derived by machine-transforming the classes in Appendix B + * of http://www.w3.org/TR/2000/REC-xml-20001006 + */ + +static int +raptor_unicode_is_letter(long c) +{ + return(raptor_unicode_is_basechar(c) || + raptor_unicode_is_ideographic(c)); +} + + +static int +raptor_unicode_is_basechar(long c) +{ + /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-BaseChar */ + return((c >= 0x0041 && c <= 0x005A ) || + (c >= 0x0061 && c <= 0x007A ) || + (c >= 0x00C0 && c <= 0x00D6 ) || + (c >= 0x00D8 && c <= 0x00F6 ) || + (c >= 0x00F8 && c <= 0x00FF ) || + (c >= 0x0100 && c <= 0x0131 ) || + (c >= 0x0134 && c <= 0x013E ) || + (c >= 0x0141 && c <= 0x0148 ) || + (c >= 0x014A && c <= 0x017E ) || + (c >= 0x0180 && c <= 0x01C3 ) || + (c >= 0x01CD && c <= 0x01F0 ) || + (c >= 0x01F4 && c <= 0x01F5 ) || + (c >= 0x01FA && c <= 0x0217 ) || + (c >= 0x0250 && c <= 0x02A8 ) || + (c >= 0x02BB && c <= 0x02C1 ) || + (c == 0x0386) || + (c >= 0x0388 && c <= 0x038A ) || + (c == 0x038C) || + (c >= 0x038E && c <= 0x03A1 ) || + (c >= 0x03A3 && c <= 0x03CE ) || + (c >= 0x03D0 && c <= 0x03D6 ) || + (c == 0x03DA) || + (c == 0x03DC) || + (c == 0x03DE) || + (c == 0x03E0) || + (c >= 0x03E2 && c <= 0x03F3 ) || + (c >= 0x0401 && c <= 0x040C ) || + (c >= 0x040E && c <= 0x044F ) || + (c >= 0x0451 && c <= 0x045C ) || + (c >= 0x045E && c <= 0x0481 ) || + (c >= 0x0490 && c <= 0x04C4 ) || + (c >= 0x04C7 && c <= 0x04C8 ) || + (c >= 0x04CB && c <= 0x04CC ) || + (c >= 0x04D0 && c <= 0x04EB ) || + (c >= 0x04EE && c <= 0x04F5 ) || + (c >= 0x04F8 && c <= 0x04F9 ) || + (c >= 0x0531 && c <= 0x0556 ) || + (c == 0x0559) || + (c >= 0x0561 && c <= 0x0586 ) || + (c >= 0x05D0 && c <= 0x05EA ) || + (c >= 0x05F0 && c <= 0x05F2 ) || + (c >= 0x0621 && c <= 0x063A ) || + (c >= 0x0641 && c <= 0x064A ) || + (c >= 0x0671 && c <= 0x06B7 ) || + (c >= 0x06BA && c <= 0x06BE ) || + (c >= 0x06C0 && c <= 0x06CE ) || + (c >= 0x06D0 && c <= 0x06D3 ) || + (c == 0x06D5) || + (c >= 0x06E5 && c <= 0x06E6 ) || + (c >= 0x0905 && c <= 0x0939 ) || + (c == 0x093D) || + (c >= 0x0958 && c <= 0x0961 ) || + (c >= 0x0985 && c <= 0x098C ) || + (c >= 0x098F && c <= 0x0990 ) || + (c >= 0x0993 && c <= 0x09A8 ) || + (c >= 0x09AA && c <= 0x09B0 ) || + (c == 0x09B2) || + (c >= 0x09B6 && c <= 0x09B9 ) || + (c >= 0x09DC && c <= 0x09DD ) || + (c >= 0x09DF && c <= 0x09E1 ) || + (c >= 0x09F0 && c <= 0x09F1 ) || + (c >= 0x0A05 && c <= 0x0A0A ) || + (c >= 0x0A0F && c <= 0x0A10 ) || + (c >= 0x0A13 && c <= 0x0A28 ) || + (c >= 0x0A2A && c <= 0x0A30 ) || + (c >= 0x0A32 && c <= 0x0A33 ) || + (c >= 0x0A35 && c <= 0x0A36 ) || + (c >= 0x0A38 && c <= 0x0A39 ) || + (c >= 0x0A59 && c <= 0x0A5C ) || + (c == 0x0A5E) || + (c >= 0x0A72 && c <= 0x0A74 ) || + (c >= 0x0A85 && c <= 0x0A8B ) || + (c == 0x0A8D) || + (c >= 0x0A8F && c <= 0x0A91 ) || + (c >= 0x0A93 && c <= 0x0AA8 ) || + (c >= 0x0AAA && c <= 0x0AB0 ) || + (c >= 0x0AB2 && c <= 0x0AB3 ) || + (c >= 0x0AB5 && c <= 0x0AB9 ) || + (c == 0x0ABD) || + (c == 0x0AE0) || + (c >= 0x0B05 && c <= 0x0B0C ) || + (c >= 0x0B0F && c <= 0x0B10 ) || + (c >= 0x0B13 && c <= 0x0B28 ) || + (c >= 0x0B2A && c <= 0x0B30 ) || + (c >= 0x0B32 && c <= 0x0B33 ) || + (c >= 0x0B36 && c <= 0x0B39 ) || + (c == 0x0B3D) || + (c >= 0x0B5C && c <= 0x0B5D ) || + (c >= 0x0B5F && c <= 0x0B61 ) || + (c >= 0x0B85 && c <= 0x0B8A ) || + (c >= 0x0B8E && c <= 0x0B90 ) || + (c >= 0x0B92 && c <= 0x0B95 ) || + (c >= 0x0B99 && c <= 0x0B9A ) || + (c == 0x0B9C) || + (c >= 0x0B9E && c <= 0x0B9F ) || + (c >= 0x0BA3 && c <= 0x0BA4 ) || + (c >= 0x0BA8 && c <= 0x0BAA ) || + (c >= 0x0BAE && c <= 0x0BB5 ) || + (c >= 0x0BB7 && c <= 0x0BB9 ) || + (c >= 0x0C05 && c <= 0x0C0C ) || + (c >= 0x0C0E && c <= 0x0C10 ) || + (c >= 0x0C12 && c <= 0x0C28 ) || + (c >= 0x0C2A && c <= 0x0C33 ) || + (c >= 0x0C35 && c <= 0x0C39 ) || + (c >= 0x0C60 && c <= 0x0C61 ) || + (c >= 0x0C85 && c <= 0x0C8C ) || + (c >= 0x0C8E && c <= 0x0C90 ) || + (c >= 0x0C92 && c <= 0x0CA8 ) || + (c >= 0x0CAA && c <= 0x0CB3 ) || + (c >= 0x0CB5 && c <= 0x0CB9 ) || + (c == 0x0CDE) || + (c >= 0x0CE0 && c <= 0x0CE1 ) || + (c >= 0x0D05 && c <= 0x0D0C ) || + (c >= 0x0D0E && c <= 0x0D10 ) || + (c >= 0x0D12 && c <= 0x0D28 ) || + (c >= 0x0D2A && c <= 0x0D39 ) || + (c >= 0x0D60 && c <= 0x0D61 ) || + (c >= 0x0E01 && c <= 0x0E2E ) || + (c == 0x0E30) || + (c >= 0x0E32 && c <= 0x0E33 ) || + (c >= 0x0E40 && c <= 0x0E45 ) || + (c >= 0x0E81 && c <= 0x0E82 ) || + (c == 0x0E84) || + (c >= 0x0E87 && c <= 0x0E88 ) || + (c == 0x0E8A) || + (c == 0x0E8D) || + (c >= 0x0E94 && c <= 0x0E97 ) || + (c >= 0x0E99 && c <= 0x0E9F ) || + (c >= 0x0EA1 && c <= 0x0EA3 ) || + (c == 0x0EA5) || + (c == 0x0EA7) || + (c >= 0x0EAA && c <= 0x0EAB ) || + (c >= 0x0EAD && c <= 0x0EAE ) || + (c == 0x0EB0) || + (c >= 0x0EB2 && c <= 0x0EB3 ) || + (c == 0x0EBD) || + (c >= 0x0EC0 && c <= 0x0EC4 ) || + (c >= 0x0F40 && c <= 0x0F47 ) || + (c >= 0x0F49 && c <= 0x0F69 ) || + (c >= 0x10A0 && c <= 0x10C5 ) || + (c >= 0x10D0 && c <= 0x10F6 ) || + (c == 0x1100) || + (c >= 0x1102 && c <= 0x1103 ) || + (c >= 0x1105 && c <= 0x1107 ) || + (c == 0x1109) || + (c >= 0x110B && c <= 0x110C ) || + (c >= 0x110E && c <= 0x1112 ) || + (c == 0x113C) || + (c == 0x113E) || + (c == 0x1140) || + (c == 0x114C) || + (c == 0x114E) || + (c == 0x1150) || + (c >= 0x1154 && c <= 0x1155 ) || + (c == 0x1159) || + (c >= 0x115F && c <= 0x1161 ) || + (c == 0x1163) || + (c == 0x1165) || + (c == 0x1167) || + (c == 0x1169) || + (c >= 0x116D && c <= 0x116E ) || + (c >= 0x1172 && c <= 0x1173 ) || + (c == 0x1175) || + (c == 0x119E) || + (c == 0x11A8) || + (c == 0x11AB) || + (c >= 0x11AE && c <= 0x11AF ) || + (c >= 0x11B7 && c <= 0x11B8 ) || + (c == 0x11BA) || + (c >= 0x11BC && c <= 0x11C2 ) || + (c == 0x11EB) || + (c == 0x11F0) || + (c == 0x11F9) || + (c >= 0x1E00 && c <= 0x1E9B ) || + (c >= 0x1EA0 && c <= 0x1EF9 ) || + (c >= 0x1F00 && c <= 0x1F15 ) || + (c >= 0x1F18 && c <= 0x1F1D ) || + (c >= 0x1F20 && c <= 0x1F45 ) || + (c >= 0x1F48 && c <= 0x1F4D ) || + (c >= 0x1F50 && c <= 0x1F57 ) || + (c == 0x1F59) || + (c == 0x1F5B) || + (c == 0x1F5D) || + (c >= 0x1F5F && c <= 0x1F7D ) || + (c >= 0x1F80 && c <= 0x1FB4 ) || + (c >= 0x1FB6 && c <= 0x1FBC ) || + (c == 0x1FBE) || + (c >= 0x1FC2 && c <= 0x1FC4 ) || + (c >= 0x1FC6 && c <= 0x1FCC ) || + (c >= 0x1FD0 && c <= 0x1FD3 ) || + (c >= 0x1FD6 && c <= 0x1FDB ) || + (c >= 0x1FE0 && c <= 0x1FEC ) || + (c >= 0x1FF2 && c <= 0x1FF4 ) || + (c >= 0x1FF6 && c <= 0x1FFC ) || + (c == 0x2126) || + (c >= 0x212A && c <= 0x212B ) || + (c == 0x212E) || + (c >= 0x2180 && c <= 0x2182 ) || + (c >= 0x3041 && c <= 0x3094 ) || + (c >= 0x30A1 && c <= 0x30FA ) || + (c >= 0x3105 && c <= 0x312C ) || + (c >= 0xAC00 && c <= 0xD7A3 ) + ); +} + + +static int +raptor_unicode_is_ideographic(long c) +{ + /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-Ideographic */ + return((c >= 0x4E00 && c <= 0x9FA5 ) || + (c == 0x3007) || + (c >= 0x3021 && c <= 0x3029 )); +} + + +static int +raptor_unicode_is_combiningchar(long c) +{ + /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-CombiningChar */ + return((c >= 0x0300 && c <= 0x0345 ) || + (c >= 0x0360 && c <= 0x0361 ) || + (c >= 0x0483 && c <= 0x0486 ) || + (c >= 0x0591 && c <= 0x05A1 ) || + (c >= 0x05A3 && c <= 0x05B9 ) || + (c >= 0x05BB && c <= 0x05BD ) || + (c == 0x05BF) || + (c >= 0x05C1 && c <= 0x05C2 ) || + (c == 0x05C4) || + (c >= 0x064B && c <= 0x0652 ) || + (c == 0x0670) || + (c >= 0x06D6 && c <= 0x06DC ) || + (c >= 0x06DD && c <= 0x06DF ) || + (c >= 0x06E0 && c <= 0x06E4 ) || + (c >= 0x06E7 && c <= 0x06E8 ) || + (c >= 0x06EA && c <= 0x06ED ) || + (c >= 0x0901 && c <= 0x0903 ) || + (c == 0x093C) || + (c >= 0x093E && c <= 0x094C ) || + (c == 0x094D) || + (c >= 0x0951 && c <= 0x0954 ) || + (c >= 0x0962 && c <= 0x0963 ) || + (c >= 0x0981 && c <= 0x0983 ) || + (c == 0x09BC) || + (c == 0x09BE) || + (c == 0x09BF) || + (c >= 0x09C0 && c <= 0x09C4 ) || + (c >= 0x09C7 && c <= 0x09C8 ) || + (c >= 0x09CB && c <= 0x09CD ) || + (c == 0x09D7) || + (c >= 0x09E2 && c <= 0x09E3 ) || + (c == 0x0A02) || + (c == 0x0A3C) || + (c == 0x0A3E) || + (c == 0x0A3F) || + (c >= 0x0A40 && c <= 0x0A42 ) || + (c >= 0x0A47 && c <= 0x0A48 ) || + (c >= 0x0A4B && c <= 0x0A4D ) || + (c >= 0x0A70 && c <= 0x0A71 ) || + (c >= 0x0A81 && c <= 0x0A83 ) || + (c == 0x0ABC) || + (c >= 0x0ABE && c <= 0x0AC5 ) || + (c >= 0x0AC7 && c <= 0x0AC9 ) || + (c >= 0x0ACB && c <= 0x0ACD ) || + (c >= 0x0B01 && c <= 0x0B03 ) || + (c == 0x0B3C) || + (c >= 0x0B3E && c <= 0x0B43 ) || + (c >= 0x0B47 && c <= 0x0B48 ) || + (c >= 0x0B4B && c <= 0x0B4D ) || + (c >= 0x0B56 && c <= 0x0B57 ) || + (c >= 0x0B82 && c <= 0x0B83 ) || + (c >= 0x0BBE && c <= 0x0BC2 ) || + (c >= 0x0BC6 && c <= 0x0BC8 ) || + (c >= 0x0BCA && c <= 0x0BCD ) || + (c == 0x0BD7) || + (c >= 0x0C01 && c <= 0x0C03 ) || + (c >= 0x0C3E && c <= 0x0C44 ) || + (c >= 0x0C46 && c <= 0x0C48 ) || + (c >= 0x0C4A && c <= 0x0C4D ) || + (c >= 0x0C55 && c <= 0x0C56 ) || + (c >= 0x0C82 && c <= 0x0C83 ) || + (c >= 0x0CBE && c <= 0x0CC4 ) || + (c >= 0x0CC6 && c <= 0x0CC8 ) || + (c >= 0x0CCA && c <= 0x0CCD ) || + (c >= 0x0CD5 && c <= 0x0CD6 ) || + (c >= 0x0D02 && c <= 0x0D03 ) || + (c >= 0x0D3E && c <= 0x0D43 ) || + (c >= 0x0D46 && c <= 0x0D48 ) || + (c >= 0x0D4A && c <= 0x0D4D ) || + (c == 0x0D57) || + (c == 0x0E31) || + (c >= 0x0E34 && c <= 0x0E3A ) || + (c >= 0x0E47 && c <= 0x0E4E ) || + (c == 0x0EB1) || + (c >= 0x0EB4 && c <= 0x0EB9 ) || + (c >= 0x0EBB && c <= 0x0EBC ) || + (c >= 0x0EC8 && c <= 0x0ECD ) || + (c >= 0x0F18 && c <= 0x0F19 ) || + (c == 0x0F35) || + (c == 0x0F37) || + (c == 0x0F39) || + (c == 0x0F3E) || + (c == 0x0F3F) || + (c >= 0x0F71 && c <= 0x0F84 ) || + (c >= 0x0F86 && c <= 0x0F8B ) || + (c >= 0x0F90 && c <= 0x0F95 ) || + (c == 0x0F97) || + (c >= 0x0F99 && c <= 0x0FAD ) || + (c >= 0x0FB1 && c <= 0x0FB7 ) || + (c == 0x0FB9) || + (c >= 0x20D0 && c <= 0x20DC ) || + (c == 0x20E1) || + (c >= 0x302A && c <= 0x302F ) || + (c == 0x3099) || + (c == 0x309A)); +} + + +static int +raptor_unicode_is_digit(long c) +{ + /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-Digit */ + return((c >= 0x0030 && c <= 0x0039 ) || + (c >= 0x0660 && c <= 0x0669 ) || + (c >= 0x06F0 && c <= 0x06F9 ) || + (c >= 0x0966 && c <= 0x096F ) || + (c >= 0x09E6 && c <= 0x09EF ) || + (c >= 0x0A66 && c <= 0x0A6F ) || + (c >= 0x0AE6 && c <= 0x0AEF ) || + (c >= 0x0B66 && c <= 0x0B6F ) || + (c >= 0x0BE7 && c <= 0x0BEF ) || + (c >= 0x0C66 && c <= 0x0C6F ) || + (c >= 0x0CE6 && c <= 0x0CEF ) || + (c >= 0x0D66 && c <= 0x0D6F ) || + (c >= 0x0E50 && c <= 0x0E59 ) || + (c >= 0x0ED0 && c <= 0x0ED9 ) || + (c >= 0x0F20 && c <= 0x0F29 )); +} + + +static int +raptor_unicode_is_extender(long c) +{ + /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-Extender */ + return((c == 0x00B7) || + (c == 0x02D0) || + (c == 0x02D1) || + (c == 0x0387) || + (c == 0x0640) || + (c == 0x0E46) || + (c == 0x0EC6) || + (c == 0x3005) || + (c >= 0x3031 && c <= 0x3035 ) || + (c >= 0x309D && c <= 0x309E ) || + (c >= 0x30FC && c <= 0x30FE )); +} + + +/* + * raptor_unicode_check_utf8_nfc_string: + * @input: UTF-8 string + * @length: length of string + * @error: pointer to error flag (or NULL) + * + * INTERNAL - Check if a Unicode UTF-8 encoded string is in Unicode Normal Form C. + * + * Return value: <0 on error, 0 if not NFC, >0 if is NFC + **/ +int +raptor_unicode_check_utf8_nfc_string(const unsigned char *input, size_t length) +{ + unsigned int i; + int plain = 1; + int rc; + + for(i = 0; i < length; i++) + if(input[i] > 0x7f) { + plain = 0; + break; + } + + if(plain) + return 1; + +#ifdef RAPTOR_NFC_ICU + rc = raptor_nfc_icu_check(input, length); +#else + rc = 1; +#endif + return rc; +} + + +/** + * raptor_unicode_check_utf8_string: + * @string: UTF-8 string + * @length: length of string + * + * Check a string is valid Unicode UTF-8. + * + * Return value: Non 0 if the string is UTF-8 + **/ +int +raptor_unicode_check_utf8_string(const unsigned char *string, size_t length) +{ + while(length > 0) { + raptor_unichar unichar = 0; + + int unichar_len; + unichar_len = raptor_unicode_utf8_string_get_char(string, length, &unichar); + if(unichar_len < 0 || RAPTOR_GOOD_CAST(size_t, unichar_len) > length) + return 0; + + if(unichar > raptor_unicode_max_codepoint) + return 0; + + string += unichar_len; + length -= unichar_len; + } + return 1; +} + + +/** + * raptor_unicode_utf8_strlen: + * @string: buffer + * @length: buffer length + * + * Calculate the number of Unicode characters in the given UTF-8 encoded buffer + * + * Return value: number of characters or <0 if sequence is invalid + */ +int +raptor_unicode_utf8_strlen(const unsigned char *string, size_t length) +{ + int unicode_length = 0; + + while(length > 0) { + int unichar_len; + unichar_len = raptor_unicode_utf8_string_get_char(string, length, NULL); + if(unichar_len < 0 || RAPTOR_GOOD_CAST(size_t, unichar_len) > length) { + unicode_length = -1; + break; + } + + string += unichar_len; + length -= unichar_len; + + unicode_length++; + } + + return unicode_length; +} + + +/** + * raptor_unicode_utf8_substr: + * @dest: destination string buffer to write to (or NULL) + * @dest_length_p: location to store actual destination length (or NULL) + * @src: source string + * @src_length: source length in bytes + * @startingLoc: starting location offset 0 for first Unicode character + * @length: number of Unicode characters to copy at offset @startingLoc (or < 0) + * + * Get a unicode (UTF-8) substring of an existing UTF-8 string + * + * If @dest is NULL, returns the number of bytes needed to write and + * does no work. + * + * Return value: number of bytes used in destination string or 0 on failure + */ +size_t +raptor_unicode_utf8_substr(unsigned char* dest, size_t* dest_length_p, + const unsigned char* src, size_t src_length, + int startingLoc, int length) +{ + size_t dest_length = 0; /* destination unicode characters count */ + size_t dest_bytes = 0; /* destination UTF-8 bytes count */ + int dest_offset = 0; /* destination string unicode characters index */ + unsigned char* p = dest; + + if(!src) + return 0; + + while(src_length > 0) { + int unichar_len; + + unichar_len = raptor_unicode_utf8_string_get_char(src, src_length, NULL); + if(unichar_len < 0 || RAPTOR_GOOD_CAST(size_t, unichar_len) > src_length) + break; + + if(dest_offset >= startingLoc) { + if(p) { + /* copy 1 Unicode character to dest */ + memcpy(p, src, RAPTOR_GOOD_CAST(size_t, unichar_len)); + p += unichar_len; + } + dest_bytes += unichar_len; + + dest_length++; + if(length >= 0 && dest_length == RAPTOR_GOOD_CAST(size_t, length)) + break; + } + + src += unichar_len; + src_length -= unichar_len; + + dest_offset++; + } + + if(p) + *p = '\0'; + + if(dest_length_p) + *dest_length_p = dest_length; + + return dest_bytes; +} |