diff options
Diffstat (limited to 'strings/unidata-dump.c')
-rw-r--r-- | strings/unidata-dump.c | 1110 |
1 files changed, 1110 insertions, 0 deletions
/* strings/unidata-dump.c */

/*
  License header that main() prints at the top of the generated output.
  Written as adjacent string literals, one per output line.
*/
const char COPYING[]=
"/* Copyright (c) 2000, 2013, Oracle and/or its affiliates.\n"
"   Copyright (c) 2009, 2023, MariaDB Corporation.\n"
"\n"
"   This program is free software; you can redistribute it and/or modify\n"
"   it under the terms of the GNU General Public License as published by\n"
"   the Free Software Foundation; version 2 of the License.\n"
"\n"
"   This program is distributed in the hope that it will be useful,\n"
"   but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
"   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
"   GNU General Public License for more details.\n"
"\n"
"   You should have received a copy of the GNU General Public License\n"
"   along with this program; if not, write to the Free Software\n"
"   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA\n"
"*/\n";

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>    /* strncasecmp() */

/* The highest Unicode code point and the highest 256-character page */
#define MAX_UNI_CHAR 0x10FFFF
#define MAX_UNI_PAGE 0x10FF

/*
  Expand a string literal into two arguments:
  the literal itself and its length without the trailing '\0'.
*/
#define STRING_WITH_LEN(X) (X), ((size_t) (sizeof(X) - 1))


typedef unsigned int my_wchar_t;

/* Character types, as in m_ctype.h */
#define _MY_U   01      /* Upper case */
#define _MY_L   02      /* Lower case */
#define _MY_NMR 04      /* Numeral (digit) */
#define _MY_SPC 010     /* Spacing character */
#define _MY_PNT 020     /* Punctuation */
#define _MY_CTR 040     /* Control character */
#define _MY_B   0100    /* Blank */
#define _MY_X   0200    /* heXadecimal digit */

/*
  CT_MAX is the highest single ctype flag.
  The composite values are parenthesized, so that they expand safely
  inside any surrounding expression.
*/
#define CT_MAX    _MY_X
#define CT_CJK    (_MY_L | _MY_U)
#define CT_HANGUL (_MY_L | _MY_U)
#define CT_NONE   0


/* Decomposition types */
typedef enum
{
  DT_UNKNOWN,
  DT_FONT,
  DT_NOBREAK,
  DT_INITIAL,
  DT_MEDIAL,
  DT_FINAL,
  DT_ISOLATED,
  DT_CIRCLE,
  DT_SUPER,
  DT_SUB,
  DT_VERTICAL,
  DT_WIDE,
  DT_NARROW,
  DT_SMALL,
  DT_SQUARE,
  DT_FRACTION,
  DT_COMPAT
} decomposition_type_t;


+typedef enum +{ + PAGE_DATA_USELESS= 0, + PAGE_DATA_IMPORTANT= 1, + PAGE_DATA_DUMMY= 2 +} page_data_type_t; + + +typedef struct +{ + page_data_type_t page_tab; + int page_overridden; + int page_ctype; +} PAGE_STAT; + + +typedef struct +{ + const char *mode_name; + int print_ctype; + int print_toupper; + int print_tolower; + int print_noaccent; + int print_noaccent_tolower; + int print_noaccent_toupper; + int print_curly_brackets_in_items; + int print_curly_brackets_in_index; + int chars_per_line; + int single_array; + int pages_per_line_in_index; + int const_data; + const char *page_data_type_name; + const char *page_name; + const char *page_name_derived; + const char *index_data_type_name; + const char *index_name; +} UNIDATA_OPT_MODE; + + +typedef struct +{ + my_wchar_t max_char; + my_wchar_t dummy_pages_codepoint_max; + const char *filename; + UNIDATA_OPT_MODE mode; +} UNIDATA_OPT; + + +my_wchar_t npages_by_opt(const UNIDATA_OPT *opt) +{ + return (opt->max_char + 1) / 256; +} + + +typedef struct my_ctype_name_st +{ + const char *name; + int val; + int to_be_decomposed; +} MY_CTYPE_NAME_ST; + + +static MY_CTYPE_NAME_ST my_ctype_name[]= +{ + {"Lu", _MY_U, 1}, /* Letter, Uppercase */ + {"Ll", _MY_L, 1}, /* Letter, Lowercase */ + {"Lt", _MY_U, 1}, /* Letter, Titlecase */ + {"Lo", _MY_L, 1}, /* Letter, other */ + {"Lm", _MY_L, 0}, /* Letter, Modifier */ + + {"Nd", _MY_NMR, 0}, /* Number, Decimal Digit */ + {"Nl", _MY_NMR|_MY_U|_MY_L, 0}, /* Number, Letter */ + {"No", _MY_NMR|_MY_PNT, 0}, /* Number, Other */ + + {"Mn", _MY_L|_MY_PNT, 0}, /* Mark, Nonspacing */ + {"Mc", _MY_L|_MY_PNT, 1}, /* Mark, Spacing Combining */ + {"Me", _MY_L|_MY_PNT, 0}, /* Mark, Enclosing */ + + {"Pc", _MY_PNT, 0}, /* Punctuation, Connector */ + {"Pd", _MY_PNT, 0}, /* Punctuation, Dash */ + {"Ps", _MY_PNT, 0}, /* Punctuation, Open */ + {"Pe", _MY_PNT, 0}, /* Punctuation, Close */ + {"Pi", _MY_PNT, 0}, /* Punctuation, Initial quote */ + {"Pf", _MY_PNT, 0}, /* Punctuation, Final quote */ + 
{"Po", _MY_PNT, 0}, /* Punctuation, Other */ + + {"Sm", _MY_PNT, 0}, /* Symbol, Math */ + {"Sc", _MY_PNT, 0}, /* Symbol, Currency */ + {"Sk", _MY_PNT, 0}, /* Symbol, Modifier */ + {"So", _MY_PNT, 0}, /* Symbol, Other */ + + {"Zs", _MY_SPC, 0}, /* Separator, Space */ + {"Zl", _MY_SPC, 0}, /* Separator, Line */ + {"Zp", _MY_SPC, 0}, /* Separator, Paragraph */ + + {"Cc", _MY_CTR, 0}, /* Other, Control */ + {"Cf", _MY_CTR, 0}, /* Other, Format */ + {"Cs", _MY_CTR, 0}, /* Other, Surrogate */ + {"Co", _MY_CTR, 0}, /* Other, Private Use */ + {"Cn", _MY_CTR, 0}, /* Other, Not Assigned */ + {NULL, 0, 0} +}; + + +static const MY_CTYPE_NAME_ST * +ctype_name_st_find(my_wchar_t codepoint, const char *tok) +{ + MY_CTYPE_NAME_ST *p; + for (p= my_ctype_name; p->name; p++) + { + if (!strncasecmp(p->name, tok, 2)) + return p; + } + return NULL; +} + + +static int +ctype_name_st_to_num(const MY_CTYPE_NAME_ST *st, my_wchar_t codepoint) +{ + if ((codepoint >= 'a' && codepoint <= 'z') || + (codepoint >= 'A' && codepoint <= 'Z')) + return st->val | _MY_X; + return st->val; +} + + +static UNIDATA_OPT opt_caseinfo= +{ + 0x10FFFF, /* max_char */ + 0x7FF, /* dummy_pages_codepoint_max == utf8 mb2 range */ + NULL, /*filename*/ + { + "caseinfo", /* mode name */ + 0, /* print_ctype */ + 1, /* print_toupper */ + 1, /* print_tolower */ + 0, /* print_noaccent */ + 0, /* print_noaccent_tolower */ + 1, /* print_noaccent_toupper */ + 1, /* print_curly_brackets_in_items */ + 0, /* print_curly_brackets_in_index */ + 2, /* chars_per_line */ + 0, /* single_array */ + 8, /* pages_per_line_in_index */ + 0, /* const_data */ + "MY_UNICASE_CHARACTER", /* page_data_type_name */ + "plane", /* page_name */ + NULL, /* page_name_derived */ + "MY_UNICASE_CHARACTER *", /* index_data_type_name */ + "my_unicase_default_pages" /* index_name */ + } +}; + + +static UNIDATA_OPT opt_casefold= +{ + 0x10FFFF, /* max_char */ + 0x7FF, /* dummy_pages_codepoint_max == utf8 mb2 range */ + NULL, /*filename*/ + { + "casefold", /* 
mode name */ + 0, /* print_ctype */ + 1, /* print_toupper */ + 1, /* print_tolower */ + 0, /* print_noaccent */ + 0, /* print_noaccent_tolower */ + 0, /* print_noaccent_toupper */ + 1, /* print_curly_brackets_in_items */ + 0, /* print_curly_brackets_in_index */ + 2, /* chars_per_line */ + 0, /* single_array */ + 8, /* pages_per_line_in_index */ + 1, /* const_data */ + "MY_CASEFOLD_CHARACTER" , /* page_data_type_name */ + "page", /* page_name */ + NULL, /* page_name_derived */ + "MY_CASEFOLD_CHARACTER *", /* index_data_type_name */ + "my_casefold_default_pages" /* index_name */ + } +}; + + +static UNIDATA_OPT opt_casefold_tr= +{ + 0x10FFFF, /* max_char */ + 0x7FF, /* dummy_pages_codepoint_max == utf8 mb2 range */ + NULL, /*filename*/ + { + "casefold-tr", /* mode name */ + 0, /* print_ctype */ + 1, /* print_toupper */ + 1, /* print_tolower */ + 0, /* print_noaccent */ + 0, /* print_noaccent_tolower */ + 0, /* print_noaccent_toupper */ + 1, /* print_curly_brackets_in_items */ + 0, /* print_curly_brackets_in_index */ + 2, /* chars_per_line */ + 0, /* single_array */ + 8, /* pages_per_line_in_index */ + 1, /* const_data */ + "MY_CASEFOLD_CHARACTER" , /* page_data_type_name */ + "page_tr", /* page_name */ + "page", /* page_name_derived */ + "MY_CASEFOLD_CHARACTER *", /* index_data_type_name */ + "my_casefold_tr_pages" /* index_name */ + } +}; + + +static UNIDATA_OPT opt_weight_general_ci= +{ + 0xFFFF, /* max_char */ + 0x7FF, /* dummy_pages_codepoint_max == utf8 mb2 range */ + NULL, /*filename*/ + { + "weight_general_ci", /* mode name */ + 0, /* print_ctype */ + 0, /* print_toupper */ + 0, /* print_tolower */ + 0, /* print_noaccent */ + 0, /* print_noaccent_tolower */ + 1, /* print_noaccent_toupper */ + 0, /* print_curly_brackets_in_items */ + 0, /* print_curly_brackets_in_index */ + 8, /* chars_per_line */ + 0, /* single_array */ + 2, /* pages_per_line_in_index */ + 1, /* const_data */ + "uint16", /* page_data_type_name */ + "weight_general_ci_page", /* page_name */ + 
NULL, /* page_name_derived */ + "uint16 *", /* index_data_type_name */ + "weight_general_ci_index" /* index_name */ + } +}; + + +static UNIDATA_OPT opt_weight_general_mysql500_ci= +{ + 0xFFFF, /* max_char */ + 0x7FF, /* dummy_pages_codepoint_max == utf8 mb2 range */ + NULL, /*filename*/ + { + "weight_general_mysql500_ci", /* mode name */ + 0, /* print_ctype */ + 0, /* print_toupper */ + 0, /* print_tolower */ + 0, /* print_noaccent */ + 0, /* print_noaccent_tolower */ + 1, /* print_noaccent_toupper */ + 0, /* print_curly_brackets_in_items */ + 0, /* print_curly_brackets_in_index */ + 8, /* chars_per_line */ + 0, /* single_array */ + 2, /* pages_per_line_in_index */ + 1, /* const_data */ + "uint16", /* page_data_type_name */ + "weight_general_mysql500_ci_page", /* page_name */ + "weight_general_ci_page", /* page_name_derived */ + "uint16 *", /* index_data_type_name */ + "weight_general_mysql500_ci_index" /* index_name */ + } +}; + + +static UNIDATA_OPT opt_ctype= +{ + 0x10FFFF, /* max_char */ + 0x7FF, /* dummy_pages_codepoint_max == utf8 mb2 range */ + NULL, /*filename*/ + { + "ctype", /* mode name */ + 1, /* print_ctype */ + 0, /* print_toupper */ + 0, /* print_tolower */ + 0, /* print_noaccent */ + 0, /* print_noaccent_tolower */ + 0, /* print_noaccent_toupper */ + 0, /* print_curly_brackets_in_items */ + 1, /* print_curly_brackets_in_index */ + 16, /* chars_per_line */ + 0, /* single_array */ + 1, /* pages_per_line_in_index */ + 1, /* const_data */ + "unsigned char", /* page_data_type_name */ + "uctype_page", /* page_name */ + NULL, /* page_name_derived */ + "MY_UNI_CTYPE", /* index_data_type_name */ + "my_uni_ctype" /* index_name */ + } +}; + + +int opt_set_mode(UNIDATA_OPT *to, const char *name_and_value, const char *value) +{ + if (!strcmp(value, "casefold")) + { + to->mode= opt_casefold.mode; + return 0; + } + else if (!strcmp(value, "casefold-tr")) + { + to->mode= opt_casefold_tr.mode; + return 0; + } + else if (!strcmp(value, "caseinfo")) + { + to->mode= 
opt_caseinfo.mode; + return 0; + } + else if (!strcmp(value, "weight_general_ci")) + { + to->mode= opt_weight_general_ci.mode; + return 0; + } + else if (!strcmp(value, "weight_general_mysql500_ci")) + { + to->mode= opt_weight_general_mysql500_ci.mode; + return 0; + } + else if (!strcmp(value, "ctype")) + { + to->mode= opt_ctype.mode; + return 0; + } + fprintf(stderr, "Bad option: %s\n", name_and_value); + return 1; +} + + +static decomposition_type_t +get_decomposition_type(const char *str) +{ + if (!strcmp(str, "<font>")) return DT_FONT; + if (!strcmp(str, "<noBreak>")) return DT_NOBREAK; + if (!strcmp(str, "<initial>")) return DT_INITIAL; + if (!strcmp(str, "<medial>")) return DT_MEDIAL; + if (!strcmp(str, "<final>")) return DT_FINAL; + if (!strcmp(str, "<isolated>")) return DT_ISOLATED; + if (!strcmp(str, "<circle>")) return DT_CIRCLE; + if (!strcmp(str, "<super>")) return DT_SUPER; + if (!strcmp(str, "<sub>")) return DT_SUB; + if (!strcmp(str, "<vertical>")) return DT_VERTICAL; + if (!strcmp(str, "<wide>")) return DT_WIDE; + if (!strcmp(str, "<narrow>")) return DT_NARROW; + if (!strcmp(str, "<small>")) return DT_SMALL; + if (!strcmp(str, "<square>")) return DT_SQUARE; + if (!strcmp(str, "<fraction>")) return DT_FRACTION; + if (!strcmp(str, "<compat>")) return DT_COMPAT; + return DT_UNKNOWN; +} + + +#define MAX_DECOMP 20 + + +typedef struct +{ + int ctype; + int toupper; + int tolower; + int noaccent; + int noaccent_tolower; + int noaccent_toupper; + int decomp_type; + int decomp[MAX_DECOMP]; + int to_be_decomposed; +} UNIDATA_CHAR; + + + +/************* Initialization functions *********/ + + +static int +strip_accent(UNIDATA_CHAR *code, int i) +{ + if (code[i].decomp[0] && + code[i].decomp[1] >= 0x0300 && + code[i].decomp[1] <= 0x036F && + code[i].decomp[2] == 0) + return strip_accent(code, code[i].decomp[0]); + return i; +} + + +static void +set_noaccent(const UNIDATA_OPT *opt, UNIDATA_CHAR *code) +{ + my_wchar_t i; + for (i= 0; i <= opt->max_char; i++) + { 
+ code[i].noaccent= strip_accent(code, i); + } +} + + +static void +set_noaccent_tolower(const UNIDATA_OPT *opt, UNIDATA_CHAR *code) +{ + my_wchar_t i; + for (i= 0; i <= opt->max_char; i++) + { + code[i].noaccent_tolower= code[code[i].noaccent].tolower; + } +} + + +static void +set_noaccent_toupper(const UNIDATA_OPT *opt, UNIDATA_CHAR *code) +{ + my_wchar_t i; + for (i= 0; i <= opt->max_char; i++) + { + code[i].noaccent_toupper= code[code[i].noaccent].toupper; + } +} + + +static void +set_default_case_folding(const UNIDATA_OPT *opt, UNIDATA_CHAR *code) +{ + my_wchar_t i; + for (i= 0; i <= opt->max_char; i++) + { + code[i].tolower= i; + code[i].toupper= i; + } +} + + +/* + Fill ideographs +*/ + +static void +fill_cjk(UNIDATA_CHAR *code) +{ + size_t i; + /* CJK Ideographs Extension A (U+3400 - U+4DB5) */ + for(i=0x3400;i<=0x4DB5;i++) + { + code[i].tolower=i; + code[i].ctype= CT_CJK; + } + /* CJK Ideographs (U+4E00 - U+9FA5) */ + for(i=0x4E00;i<=0x9FA5;i++) + { + code[i].tolower=i; + code[i].ctype= CT_CJK; + } + /* Hangul Syllables (U+AC00 - U+D7A3) */ + for(i=0xAC00;i<=0xD7A3;i++) + { + code[i].tolower=i; + code[i].ctype= CT_HANGUL; + } +} + + +/************* Loading functions ***************/ + + +static void handle_general_category(const UNIDATA_OPT *opt, + UNIDATA_CHAR *ch, + const char *tok, + my_wchar_t codepoint) +{ + /* + TODO: check if ctype is set correctly. + A difference can break fulltext indexes. 
+ */ + + const MY_CTYPE_NAME_ST *ct= ctype_name_st_find( + (my_wchar_t) codepoint, tok); + if (ct) + { + ch->ctype|= ctype_name_st_to_num( + ct, + (my_wchar_t) codepoint); + ch->to_be_decomposed= ct->to_be_decomposed; + } +} + + +int handle_decomposition(UNIDATA_CHAR *ch, char *tok, const char *str) +{ + char *lt, *part; + size_t num; + + if (!ch->to_be_decomposed) + return 0; /* Decompose only letters */ + + for (part= strtok_r(tok, " ", <), num= 0; + part; + part= strtok_r(NULL, " ", <)) + { + char *end; + if (part[0] == '<') + { + if ((ch->decomp_type= get_decomposition_type(part)) == DT_UNKNOWN) + { + fprintf(stderr, "Unknown decomposition type:\n%s\n", str); + return 1; + } + continue; + } + + if (num + 1 >= MAX_DECOMP) + { + fprintf(stderr, "Too many decomposition parts:\n%s\n", str); + return 1; + } + ch->decomp[num]= strtol(part,&end,16); + ch->decomp[num+1]= 0; + num++; + } + return 0; +} + + +static int +parse_unidata_line(const UNIDATA_OPT *opt, char *str, UNIDATA_CHAR *unidata) +{ + unsigned long codepoint= 0; + int fieldno= 0; + char *s; + + for (s= str; *s; fieldno++) + { + char *tok= s, *e; + + if ((e= strchr(s,';'))) + { + *e= '\0'; + s= e + 1; + } + else + { + s+= strlen(s); + } + + switch (fieldno) + { + case 0: /* Code point */ + codepoint= strtoul(tok, NULL, 16); + if (codepoint > opt->max_char) + return 1; + break; + case 1: /* name */ + break; + case 2: /* general category */ + handle_general_category(opt, &unidata[codepoint], + tok, (my_wchar_t) codepoint); + break; + case 3: /* Canonical combining class */ + break; + case 4: /* BiDi class */ + break; + case 5: /* Decomposition type */ + if (tok[0] && handle_decomposition(&unidata[codepoint], tok, str)) + return -1; + break; + case 6: /* Numeric_Type, Numeric Value */ + break; + case 7: /* Numeric_Type, Numeric Value */ + break; + case 8: /* Numeric_Type, Numeric Value */ + break; + case 9: /* BiDi mirrored */ + break; + case 10: /* Unicode_1_Name */ + break; + case 11: /* ISO_Comment */ + 
break; + case 12: /*Simple_Uppercase_Mapping*/ + if (tok[0]) + unidata[codepoint].toupper= strtol(tok, NULL, 16); + break; + case 13: /*Simple_Lowercase_Mapping*/ + if (tok[0]) + unidata[codepoint].tolower= strtol(tok, NULL, 16); + break; + case 14: /* Simple_Titlecase_Mapping */ + break; + } + } + + return 0; +} + + +static int +load_unidata_file(const UNIDATA_OPT *opt, FILE *f, UNIDATA_CHAR *unidata) +{ + char str[1024]; + + while (fgets(str, sizeof(str), f)) + { + if (parse_unidata_line(opt, str, unidata) < 0) + return 1; + } + return 0; +} + + +static int +load_unidata(const UNIDATA_OPT *opt, UNIDATA_CHAR *unidata) +{ + FILE *f; + int rc; + if (!(f= fopen(opt->filename, "r"))) + { + fprintf(stderr, "Could not open file '%s'\n", opt->filename); + return 1; + } + rc= load_unidata_file(opt, f, unidata); + fclose(f); + return rc; +} + +/************** Printing functions ********************/ + +static void +print_one_char(const UNIDATA_OPT *opt, UNIDATA_CHAR *data, int code) +{ + UNIDATA_CHAR *ch= &data[code]; + const char *comma= ""; + + if (opt->mode.print_curly_brackets_in_items) + printf("{"); + + if (opt->mode.print_ctype) + { + printf("%s", comma); + printf("%3d", ch->ctype); + comma= ","; + } + + if (opt->mode.print_toupper) + { + printf("%s", comma); + printf("0x%04X", ch->toupper); + comma= ","; + } + + if (opt->mode.print_tolower) + { + printf("%s", comma); + printf("0x%04X", ch->tolower); + comma= ","; + } + + if (opt->mode.print_noaccent) + { + printf("%s", comma); + printf("0x%04X", ch->noaccent); + comma= ","; + } + + if (opt->mode.print_noaccent_tolower) + { + printf("%s", comma); + printf("0x%04X", ch->noaccent_tolower); + comma= ","; + } + + if (opt->mode.print_noaccent_toupper) + { + printf("%s", comma); + printf("0x%04X", ch->noaccent_toupper); + comma= ","; + } + + if (opt->mode.print_curly_brackets_in_items) + printf("}"); + + if (opt->mode.single_array || + (code & 0xFF) != 0xFF) /* Don't print comma for the last char in a page */ + 
printf(","); + else + printf(" "); +} + + +static void +print_one_page(const UNIDATA_OPT *opt, UNIDATA_CHAR *data, + my_wchar_t pageno, const PAGE_STAT *pstat) +{ + my_wchar_t charnum; + + if (!opt->mode.single_array || pageno == 0) + { + printf("%s%s%s %s%02X[256]={%s\n", + pageno == 0 ? "" : "static ", + opt->mode.const_data ? "const " : "", + opt->mode.page_data_type_name, opt->mode.page_name, + (unsigned int) pageno, + pstat[pageno].page_tab == PAGE_DATA_DUMMY ? + " /* This page is dummy */" : ""); + } + + for (charnum= 0; charnum < 256; charnum++) + { + my_wchar_t codepoint= (pageno << 8) + charnum; + my_wchar_t rem= charnum % opt->mode.chars_per_line; + if (!rem) + printf(" "); + print_one_char(opt, data, codepoint); + if (rem + 1 == opt->mode.chars_per_line) + { + printf(" /* %04X */", (codepoint + 1) - opt->mode.chars_per_line); + printf("\n"); + } + } + if (!opt->mode.single_array) + printf("};\n\n"); +} + + +static const char *page_name_in_index(const UNIDATA_OPT *opt, + const PAGE_STAT *pstat, + my_wchar_t pageno) +{ + if (!opt->mode.page_name_derived) + return opt->mode.page_name; + + return pstat[pageno].page_overridden ? + opt->mode.page_name : + opt->mode.page_name_derived; +} + + +static void print_page_index(const UNIDATA_OPT *opt, + const PAGE_STAT *pstat) +{ + my_wchar_t page; + my_wchar_t npages= npages_by_opt(opt); + int printing_ctype= !strcmp(opt->mode.index_data_type_name, "MY_UNI_CTYPE"); + + printf("%s%s %s[%d]={\n", + opt->mode.const_data ? 
"const " : "", + opt->mode.index_data_type_name, opt->mode.index_name, + (unsigned int) npages); + + for (page= 0; page < npages; page++) + { + my_wchar_t rem= page % opt->mode.pages_per_line_in_index; + if (!rem) + printf(" "); + if (opt->mode.print_curly_brackets_in_index) + printf("{"); + if (printing_ctype) + printf("%d,", pstat[page].page_ctype); + + if (pstat[page].page_tab) + printf("%s%02X", page_name_in_index(opt, pstat, page), page); + else + printf("NULL"); + + if (opt->mode.print_curly_brackets_in_index) + printf("}"); + + if (page + 1 < npages) + printf(","); + + if (rem + 1 == opt->mode.pages_per_line_in_index) + printf("\n"); + else + printf(" "); + } + printf("};\n"); +} + + +static void print(UNIDATA_OPT *opt, UNIDATA_CHAR *unidata, const PAGE_STAT *pstat) +{ + my_wchar_t npages= npages_by_opt(opt); + my_wchar_t page; + + /* Print all pages */ + for (page= 0; page < npages; page++) + { + if (opt->mode.page_name_derived && !pstat[page].page_overridden) + continue; + if (opt->mode.single_array || pstat[page].page_tab) + print_one_page(opt, unidata, page, pstat); + } + + /* Print index */ + if (!opt->mode.single_array) + print_page_index(opt, pstat); +} + + +void print_command_line_options(int ac, char **av) +{ + int i; + printf("/*\n"); + printf(" Generated by:\n"); + for (i= 0; i < ac; i++) + { + printf(" %s%s%s\n", i > 0 ? " " : "", av[i], i+1 < ac ? 
" \\" :""); + } + printf("\n"); + printf("*/\n"); +} + + +static void calc_page_parameters(const UNIDATA_OPT *opt, const UNIDATA_CHAR *code, + PAGE_STAT *pstat) +{ + my_wchar_t npages= npages_by_opt(opt); + my_wchar_t page; + for(page= 0; page < npages; page++) + { + int ntype[CT_MAX + 1], t; + int character, done=0; + + memset(ntype,0,sizeof(ntype)); + for(character= 0;character < 256; character++) + { + size_t cod= (page << 8) + character; + const UNIDATA_CHAR *ch= &code[cod]; + ntype[ch->ctype]++; + + if((ch->tolower != cod || + ch->toupper != cod || + ch->noaccent != cod || + ch->noaccent_toupper != cod) && + (opt->mode.print_tolower || + opt->mode.print_toupper || + opt->mode.print_noaccent || + opt->mode.print_noaccent_toupper)) + { + pstat[page].page_tab= PAGE_DATA_IMPORTANT; + } + } + + if (opt->mode.print_ctype) + { + for (t= 0; t <= CT_MAX; t++) + { + if(ntype[t]==256) + { + /* All ctypes are the same */ + pstat[page].page_ctype= t; + done=1; + break; + } + } + } + else + { + done= 1; /* Don't need ctype */ + } + + if(!done) + { + /* Mixed page, lets create the table */ + pstat[page].page_ctype= CT_NONE; + pstat[page].page_tab= PAGE_DATA_IMPORTANT; + } + if (!pstat[page].page_tab && + page <= (opt->dummy_pages_codepoint_max >> 8)) + pstat[page].page_tab= PAGE_DATA_DUMMY; + } +} + + +static UNIDATA_CHAR code[MAX_UNI_CHAR + 1]; +static PAGE_STAT pstat[MAX_UNI_PAGE + 1]; + + +int usage(int ac, char **av) +{ + fprintf(stderr, "Usage: %s filename\n", av[0]); + return 1; +} + + +const char *one_opt(const char *option, const char *name, size_t length) +{ + if (!strncmp(option, name, length)) + return option + length; + return 0; +} + + +int get_option_bool(int *to, const char *name_and_value, const char *value) +{ + if (!strcmp(value, "1")) + *to= 1; + else if (!strcmp(value, "0")) + *to= 0; + else + { + fprintf(stderr, "Bad option: %s\n", name_and_value); + return 1; + } + return 0; +} + + +int get_option_codepoint(my_wchar_t *to, const char *name_and_value, 
const char *value) +{ + unsigned long codepoint= value[0]=='0' && value[1]=='x' ? + strtoul(value + 2, NULL, 16) : + strtoul(value, NULL, 10); + if (codepoint > MAX_UNI_CHAR) + { + fprintf(stderr, "Too large --max-char: %s\n", name_and_value); + return 1; + } + *to= (my_wchar_t) codepoint; + return 0; +} + + +int process_param(UNIDATA_OPT *opt, int ac, char **av) +{ + int i; + if (ac < 2) + return usage(ac, av); + for (i= 1; i < ac; i++) + { + const char *op; + if ((op= one_opt(av[i], STRING_WITH_LEN("--mode=")))) + { + if (opt_set_mode(opt, av[i], op)) + return 1; + } + else if ((op= one_opt(av[i], STRING_WITH_LEN("--max-char=")))) + { + if (get_option_codepoint(&opt->max_char, av[i], op)) + return 1; + } + else if ((op= one_opt(av[i], STRING_WITH_LEN("--print-toupper=")))) + { + if (get_option_bool(&opt->mode.print_toupper, av[i], op)) + return 1; + } + else if ((op= one_opt(av[i], STRING_WITH_LEN("--print-tolower=")))) + { + if (get_option_bool(&opt->mode.print_tolower, av[i], op)) + return 1; + } + else if ((op= one_opt(av[i], STRING_WITH_LEN("--print-noaccent-toupper=")))) + { + if (get_option_bool(&opt->mode.print_noaccent_toupper, av[i], op)) + return 1; + } + else if ((op= one_opt(av[i], STRING_WITH_LEN("--page-name=")))) + { + opt->mode.page_name= op; + } + else if ((op= one_opt(av[i], STRING_WITH_LEN("--page-name-derived=")))) + { + opt->mode.page_name_derived= op; + } + else if ((op= one_opt(av[i], STRING_WITH_LEN("--index-name=")))) + { + opt->mode.index_name= op; + } + else + { + if (av[i][0] == '-' && av[i][1] == '-') + { + fprintf(stderr, "Unknown option: %s\n", av[i]); + return 1; + } + break; + } + } + if (i + 1 != ac) + return usage(ac, av); + opt->filename= av[i]; + return 0; +} + + +int main(int ac,char **av) +{ + UNIDATA_OPT opt= opt_caseinfo; + + if (process_param(&opt, ac, av)) + return 1; + + memset(code,0,sizeof(code)); + memset(pstat,0,sizeof(pstat)); + + set_default_case_folding(&opt, code); + + fill_cjk(code); + + if (load_unidata(&opt, 
code)) + return 1; + + set_noaccent(&opt, code); + set_noaccent_tolower(&opt, code); + set_noaccent_toupper(&opt, code); + + /* + Bug#8385: utf8_general_ci treats cyrillic letters I and SHORT I as the same + Because of decomposition applied, noaccent_toupper for the following letters: + U+0419 CYRILLIC CAPITAL LETTER SHORT I + U+0439 CYRILLIC SMALL LETTER SHORT I + was set to: + U+418 CYRILLIC CAPITAL LETTER I + Reset it back to U+0419. + */ + code[0x0419].noaccent_toupper= 0x0419; + code[0x0439].noaccent_toupper= 0x0419; + + /* + Bug#27877 incorrect german order in utf8_general_ci + */ + if (strcmp(opt.mode.mode_name, "weight_general_mysql500_ci")) + { + code[0x00DF].noaccent_toupper= code['s'].noaccent_toupper; + } + else + pstat[0].page_overridden= 1; + + if (!strcmp(opt.mode.mode_name, "casefold-tr")) + { + code[0x49].tolower= 0x0131; + code[0x69].toupper= 0x0130; + pstat[0].page_overridden= 1; + } + + calc_page_parameters(&opt, code, pstat); + + printf("%s\n", COPYING); + print_command_line_options(ac, av); + print(&opt, code, pstat); + + return 0; +} |