diff options
Diffstat (limited to 'storage/mroonga/vendor/groonga/lib/icudump.c')
-rw-r--r-- | storage/mroonga/vendor/groonga/lib/icudump.c | 298 |
1 files changed, 298 insertions, 0 deletions
diff --git a/storage/mroonga/vendor/groonga/lib/icudump.c b/storage/mroonga/vendor/groonga/lib/icudump.c new file mode 100644 index 00000000..91751f94 --- /dev/null +++ b/storage/mroonga/vendor/groonga/lib/icudump.c @@ -0,0 +1,298 @@ +/* -*- c-basic-offset: 2 -*- */ +/* Copyright(C) 2010 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA +*/ +#include <stdio.h> +#include <getopt.h> +#include <unistd.h> +#include <string.h> +#include <unicode/utf.h> +#include <unicode/uchar.h> +#include <unicode/unorm.h> +#include <unicode/ustring.h> + +#define MAX_UNICODE 0x110000 +#define BUF_SIZE 0x100 + +static int +ucs2utf(unsigned int i, unsigned char *buf) +{ + unsigned char *p = buf; + if (i < 0x80) { + *p++ = i; + } else { + if (i < 0x800) { + *p++ = (i >> 6) | 0xc0; + } else { + if (i < 0x00010000) { + *p++ = (i >> 12) | 0xe0; + } else { + if (i < 0x00200000) { + *p++ = (i >> 18) | 0xf0; + } else { + if (i < 0x04000000) { + *p++ = (i >> 24) | 0xf8; + } else if (i < 0x80000000) { + *p++ = (i >> 30) | 0xfc; + *p++ = ((i >> 24) & 0x3f) | 0x80; + } + *p++ = ((i >> 18) & 0x3f) | 0x80; + } + *p++ = ((i >> 12) & 0x3f) | 0x80; + } + *p++ = ((i >> 6) & 0x3f) | 0x80; + } + *p++ = (0x3f & i) | 0x80; + } + *p = '\0'; + return (p - buf); +} + +void +blockcode(void) +{ + UChar32 ch; + unsigned char *p, src[7]; + UBlockCode code, lc = -1; + for (ch = 1; ch < MAX_UNICODE; ch++) { + if (!U_IS_UNICODE_CHAR(ch)) { continue; } + code = ublock_getCode(ch); + if (code != lc) { + ucs2utf(ch, src); + for (p = src; *p; p++) { + printf("%x:", *p); + } + printf("\t%04x\t%d\n", ch, code); + } + lc = code; + } +} + +int +normalize(const char *str, char *res, UNormalizationMode mode) +{ + UErrorCode rc; + int32_t ulen, nlen; + UChar ubuf[BUF_SIZE], nbuf[BUF_SIZE]; + rc = U_ZERO_ERROR; + u_strFromUTF8(ubuf, BUF_SIZE, &ulen, str, -1, &rc); + if (rc != U_ZERO_ERROR /*&& rc != U_STRING_NOT_TERMINATED_WARNING*/) { + return -1; + } + rc = U_ZERO_ERROR; + nlen = unorm_normalize(ubuf, ulen, mode, 0, nbuf, BUF_SIZE, &rc); + if (rc != U_ZERO_ERROR /*&& rc != U_STRING_NOT_TERMINATED_WARNING*/) { + return -1; + } + rc = U_ZERO_ERROR; + u_strToUTF8(res, BUF_SIZE, NULL, nbuf, nlen, &rc); + if (rc != U_ZERO_ERROR /*&& rc != U_BUFFER_OVERFLOW_ERROR*/) { + return -1; + } + return 0; +} + +void +dump(UNormalizationMode mode) +{ + UChar32 ch; + char str[7], norm[BUF_SIZE]; + for (ch = 1; ch < MAX_UNICODE; ch++) { + if (!U_IS_UNICODE_CHAR(ch)) { continue; } + ucs2utf(ch, (unsigned char *)str); + if (normalize(str, norm, mode)) { + printf("ch=%04x error occure\n", ch); + continue; + } + if (strcmp(norm, str)) { + printf("%04x\t%s\t%s\n", ch, str, norm); + } + } +} + +void +ccdump(void) +{ + UChar32 ch; + char str[7], nfd[BUF_SIZE], nfc[BUF_SIZE]; + for (ch = 1; ch < MAX_UNICODE; ch++) { + if (!U_IS_UNICODE_CHAR(ch)) { continue; } + ucs2utf(ch, (unsigned char *)str); + if (normalize(str, nfd, UNORM_NFD)) { + printf("ch=%04x error occure\n", ch); + continue; + } + if (normalize(str, nfc, UNORM_NFC)) { + printf("ch=%04x error occure\n", ch); + continue; + } + if (strcmp(nfd, nfc)) { + printf("%04x\t%s\t%s\n", ch, nfd, nfc); + } + } +} + +enum { + ctype_null = 0, + ctype_alpha, + ctype_digit, + ctype_symbol, + ctype_hiragana, + ctype_katakana, + ctype_kanji, + ctype_others +}; + +static const char *ctypes[] = { + "GRN_CHAR_NULL", + "GRN_CHAR_ALPHA", + "GRN_CHAR_DIGIT", + "GRN_CHAR_SYMBOL", + "GRN_CHAR_HIRAGANA", + "GRN_CHAR_KATAKANA", + "GRN_CHAR_KANJI", + "GRN_CHAR_OTHERS" +}; + +void +gcdump(void) +{ + UChar32 ch; + unsigned char *p, src[7]; + int ctype, lc = -1; + for (ch = 1; ch < MAX_UNICODE; ch++) { + UCharCategory cat; + UBlockCode code; + if (!U_IS_UNICODE_CHAR(ch)) { continue; } + code = ublock_getCode(ch); + switch (code) { + case UBLOCK_CJK_RADICALS_SUPPLEMENT: /* cjk radicals */ + case UBLOCK_KANGXI_RADICALS: /* kanji radicals */ + case UBLOCK_BOPOMOFO: /* bopomofo letter */ + case UBLOCK_HANGUL_COMPATIBILITY_JAMO: /* hangul letter */ + case UBLOCK_KANBUN: /* kaeri ten used in kanbun ex. re-ten */ + case UBLOCK_BOPOMOFO_EXTENDED: /* bopomofo extended letter */ + case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A: /* cjk letter */ + case UBLOCK_CJK_UNIFIED_IDEOGRAPHS: /* cjk letter */ + case UBLOCK_YI_SYLLABLES: /* Yi syllables */ + case UBLOCK_YI_RADICALS: /* Yi radicals */ + case UBLOCK_HANGUL_SYLLABLES: /* hangul syllables */ + case UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS: /* cjk letter */ + case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B: /* cjk letter */ + case UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT: /* cjk letter */ + case UBLOCK_CJK_STROKES: /* kakijun*/ + ctype = ctype_kanji; + break; + case UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION: /* symbols ex. JIS mark */ + case UBLOCK_ENCLOSED_CJK_LETTERS_AND_MONTHS: /* ex. (kabu) */ + case UBLOCK_CJK_COMPATIBILITY: /* symbols ex. ton doll */ + case UBLOCK_CJK_COMPATIBILITY_FORMS: /* symbols ex. tategaki kagi-kakko */ + ctype = ctype_symbol; + break; + case UBLOCK_HIRAGANA: + ctype = ctype_hiragana; + break; + case UBLOCK_KATAKANA: + case UBLOCK_KATAKANA_PHONETIC_EXTENSIONS: + ctype = ctype_katakana; + break; + default: + cat = u_charType(ch); + switch (cat) { + case U_UPPERCASE_LETTER: + case U_LOWERCASE_LETTER: + case U_TITLECASE_LETTER: + case U_MODIFIER_LETTER: + case U_OTHER_LETTER: + ctype = ctype_alpha; + break; + case U_DECIMAL_DIGIT_NUMBER: + case U_LETTER_NUMBER: + case U_OTHER_NUMBER: + ctype = ctype_digit; + break; + case U_DASH_PUNCTUATION: + case U_START_PUNCTUATION: + case U_END_PUNCTUATION: + case U_CONNECTOR_PUNCTUATION: + case U_OTHER_PUNCTUATION: + case U_MATH_SYMBOL: + case U_CURRENCY_SYMBOL: + case U_MODIFIER_SYMBOL: + case U_OTHER_SYMBOL: + ctype = ctype_symbol; + break; + default: + ctype = ctype_others; + break; + } + break; + } + if (ctype != lc) { + ucs2utf(ch, src); + for (p = src; *p; p++) { + printf("%x:", *p); + } + printf("\t%04x\t%s\n", ch, ctypes[ctype]); + } + lc = ctype; + } +} + +struct option options[] = { + {"bc", 0, NULL, 'b'}, + {"nfd", 0, NULL, 'd'}, + {"nfkd", 0, NULL, 'D'}, + {"nfc", 0, NULL, 'c'}, + {"nfkc", 0, NULL, 'C'}, + {"cc", 0, NULL, 'o'}, + {"gc", 0, NULL, 'g'}, + {"version", 0, NULL, 'v'}, +}; + +int +main(int argc, char **argv) +{ + switch (getopt_long(argc, argv, "bdDcCogv", options, NULL)) { + case 'b' : + blockcode(); + break; + case 'd' : + dump(UNORM_NFD); + break; + case 'D' : + dump(UNORM_NFKD); + break; + case 'c' : + dump(UNORM_NFC); + break; + case 'C' : + dump(UNORM_NFKC); + break; + case 'o' : + ccdump(); + break; + case 'g' : + gcdump(); + break; + case 'v' : + printf("%s\n", U_UNICODE_VERSION); + break; + default : + fputs("usage: icudump --[bc|nfd|nfkd|nfc|nfkc|cc|gc|version]\n", stderr); + break; + } + return 0; +} |