diff options
Diffstat (limited to 'strings/uctypedump.c')
-rw-r--r-- | strings/uctypedump.c | 235 |
1 files changed, 235 insertions, 0 deletions
diff --git a/strings/uctypedump.c b/strings/uctypedump.c new file mode 100644 index 00000000..397b6e58 --- /dev/null +++ b/strings/uctypedump.c @@ -0,0 +1,235 @@ +/* Copyright (c) 2006 MySQL AB + Use is subject to license terms. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "strings_def.h" +#include <m_ctype.h> + + +typedef struct my_ctype_name_st +{ + const char *name; + int val; +} MY_CTYPE_NAME_ST; + + +static MY_CTYPE_NAME_ST my_ctype_name[]= +{ + {"Lu", _MY_U}, /* Letter, Uppercase */ + {"Ll", _MY_L}, /* Letter, Lowercase */ + {"Lt", _MY_U}, /* Letter, Titlecase */ + {"Lm", _MY_L}, /* Letter, Modifier */ + {"Lo", _MY_L}, /* Letter, other */ + + {"Nd", _MY_NMR}, /* Number, Decimal Digit */ + {"Nl", _MY_NMR|_MY_U|_MY_L}, /* Number, Letter */ + {"No", _MY_NMR|_MY_PNT}, /* Number, Other */ + + {"Mn", _MY_L|_MY_PNT}, /* Mark, Nonspacing */ + {"Mc", _MY_L|_MY_PNT}, /* Mark, Spacing Combining */ + {"Me", _MY_L|_MY_PNT}, /* Mark, Enclosing */ + + {"Pc", _MY_PNT}, /* Punctuation, Connector */ + {"Pd", _MY_PNT}, /* Punctuation, Dash */ + {"Ps", _MY_PNT}, /* Punctuation, Open */ + {"Pe", _MY_PNT}, /* Punctuation, Close */ + {"Pi", _MY_PNT}, /* Punctuation, Initial quote */ + {"Pf", _MY_PNT}, /* Punctuation, Final quote */ + {"Po", _MY_PNT}, /* Punctuation, Other */ + + {"Sm", _MY_PNT}, /* Symbol, Math */ + {"Sc", _MY_PNT}, /* Symbol, Currency */ + {"Sk", _MY_PNT}, /* Symbol, Modifier */ + {"So", _MY_PNT}, /* Symbol, Other */ + + {"Zs", _MY_SPC}, /* Separator, Space */ + {"Zl", _MY_SPC}, /* Separator, Line */ + {"Zp", _MY_SPC}, /* Separator, Paragraph */ + + {"Cc", _MY_CTR}, /* Other, Control */ + {"Cf", _MY_CTR}, /* Other, Format */ + {"Cs", _MY_CTR}, /* Other, Surrogate */ + {"Co", _MY_CTR}, /* Other, Private Use */ + {"Cn", _MY_CTR}, /* Other, Not Assigned */ + {NULL, 0} +}; + + +static int +ctypestr2num(const char *tok) +{ + MY_CTYPE_NAME_ST *p; + for (p= my_ctype_name; p->name; p++) + { + if (!strncasecmp(p->name, tok, 2)) + return p->val; + } + return 0; +} + + +int main(int ac, char ** av) +{ + char str[1024]; + unsigned char ctypea[64*1024]; + size_t i; + size_t plane; + MY_UNI_CTYPE uctype[256]; + FILE *f= stdin; + + if (ac > 1 && av[1] && !(f= fopen(av[1],"r"))) + { + fprintf(stderr, "Can't open file %s\n", av[1]); + exit(1); + } + bzero(&ctypea,sizeof(ctypea)); + bzero(&uctype, sizeof(uctype)); + + printf("/*\n"); + printf(" Unicode ctype data\n"); + printf(" Generated from %s\n", av[1] ? av[1] : "stdin"); + printf("*/\n"); + + while(fgets(str, sizeof(str), f)) + { + size_t n= 0, code= 0; + char *s,*e; + int ctype= 0; + + for(s= str; s; ) + { + char *end; + char tok[1024]=""; + e=strchr(s,';'); + if(e) + { + strncpy(tok,s,(unsigned int)(e-s)); + tok[e-s]=0; + } + else + { + safe_strcpy(tok, sizeof(tok), s); + } + + end=tok+strlen(tok); + + switch(n) + { + case 0: code= strtol(tok,&end,16);break; + case 2: ctype= ctypestr2num(tok);break; + } + + n++; + if(e) s=e+1; + else s=e; + } + if(code<=0xFFFF) + { + ctypea[code]= ctype; + } + } + + /* Fill digits */ + for (i= '0'; i <= '9'; i++) + ctypea[i]= _MY_NMR; + + for (i= 'a'; i <= 'z'; i++) + ctypea[i]|= _MY_X; + for (i= 'A'; i <= 'Z'; i++) + ctypea[i]|= _MY_X; + + + /* Fill ideographs */ + + /* CJK Ideographs Extension A (U+3400 - U+4DB5) */ + for(i=0x3400;i<=0x4DB5;i++) + { + ctypea[i]= _MY_L | _MY_U; + } + + /* CJK Ideographs (U+4E00 - U+9FA5) */ + for(i=0x4E00;i<=0x9FA5;i++){ + ctypea[i]= _MY_L | _MY_U; + } + + /* Hangul Syllables (U+AC00 - U+D7A3) */ + for(i=0xAC00;i<=0xD7A3;i++) + { + ctypea[i]= _MY_L | _MY_U; + } + + + /* Calc plane parameters */ + for(plane=0;plane<256;plane++) + { + size_t character; + uctype[plane].ctype= ctypea+plane*256; + + uctype[plane].pctype= uctype[plane].ctype[0]; + for(character=1;character<256;character++) + { + if (uctype[plane].ctype[character] != uctype[plane].pctype) + { + uctype[plane].pctype= 0; /* Mixed plane */ + break; + } + } + if (character==256) /* All the same, no needs to dump whole plane */ + uctype[plane].ctype= NULL; + } + + /* Dump mixed planes */ + for(plane=0;plane<256;plane++) + { + if(uctype[plane].ctype) + { + int charnum=0; + int num=0; + + printf("static unsigned char uctype_page%02X[256]=\n{\n", (uint) plane); + + for(charnum=0;charnum<256;charnum++) + { + int cod; + + cod=(plane<<8)+charnum; + printf(" %2d%s",uctype[plane].ctype[charnum],charnum<255?",":""); + + num++; + if(num==16) + { + printf("\n"); + num=0; + } + } + printf("};\n\n"); + } + } + + + /* Dump plane index */ + printf("MY_UNI_CTYPE my_uni_ctype[256]={\n"); + for(plane=0;plane<256;plane++) + { + char plane_name[128]="NULL"; + if(uctype[plane].ctype){ + snprintf(plane_name,sizeof(plane_name),"uctype_page%02X",(uint) plane); + } + printf("\t{%d,%s}%s\n",uctype[plane].pctype,plane_name,plane<255?",":""); + } + printf("};\n"); + + return 0; +} |