diff options
Diffstat (limited to 'strings/uca-dump.c')
-rw-r--r-- | strings/uca-dump.c | 354 |
1 files changed, 354 insertions, 0 deletions
/* Copyright (c) 2004, 2006 MySQL AB
   Copyright (c) 2009-2011, Monty Program Ab
   Use is subject to license terms.
   Copyright (c) 2009-2011, Monty Program Ab

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335  USA */

/*
  uca-dump: read a collation key table from stdin and write C source
  with per-page weight arrays to stdout.

  Input lines are expected in the form

    CODE ; [.PPPP.SSSS.TTTT.QQQQ][...]... # comment

  (presumably the Unicode "allkeys.txt" file matching global_name_prefix;
  the file name itself is not referenced anywhere in this program).
  Lines without both a ';' and a '#' are ignored, and so are lines that
  list more than one code point (contractions).
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef unsigned char uchar;
typedef unsigned short uint16;

/* Number of weight levels stored per collation element */
#define UCA_MAX_LEVELS  4
/* Maximum number of collation elements stored per character */
#define UCA_MAX_WEIGHTS 9

struct uca_item_st
{
  uchar  num;                                       /* Elements in use   */
  uint16 weight[UCA_MAX_LEVELS][UCA_MAX_WEIGHTS];   /* [level][element]  */
};

#if 0
#define MY_UCA_NPAGES	1024
#define MY_UCA_NCHARS	64
#define MY_UCA_CMASK	63
#define MY_UCA_PSHIFT	6
#else
#define MY_UCA_NPAGES	4352 /* 0x110000 characters / 0x100 chars per page */
#define MY_UCA_NCHARS	256
#define MY_UCA_CMASK	255
#define MY_UCA_PSHIFT	8
#endif

#define MAX_ALLOWED_CODE 0x10FFFF

/* Name that goes into all array names */
static const char *global_name_prefix= "uca520";

/* Name prefix that goes into page weight array names after global_name_prefix */
static const char *pname_prefix[]= {"_p", "_p", "_p"};

/* Name suffix that goes into page weight array names after page number */
static const char *pname_suffix[]= {"", "_w2", "_w3"};


/*
  Entry point.

  ac, av  Command line arguments; not used.

  Returns 0 on success, 1 if reading stdin or writing stdout failed.
  Diagnostics go to stderr because stdout carries the generated source.
*/
int main(int ac, char **av)
{
  char str[256];
  char *weights[UCA_MAX_WEIGHTS];
  /* Static storage: too large for the stack and zero-initialized for free */
  static struct uca_item_st uca[MAX_ALLOWED_CODE+1];
  size_t code, w;
  int pageloaded[MY_UCA_NPAGES];

  (void) ac;
  (void) av;

  memset(pageloaded, 0, sizeof(pageloaded));

  while (fgets(str, sizeof(str), stdin))
  {
    char *comment;
    char *weight;
    char *s;
    size_t codenum;
    size_t nweights;

    /* A negative value wraps to a huge size_t and is rejected just below */
    code= (size_t) strtol(str, NULL, 16);

    if (str[0] == '#' || (code > MAX_ALLOWED_CODE))
      continue;

    /* Cut the trailing comment off; lines without one are not data lines */
    if (!(comment= strchr(str, '#')))
      continue;
    *comment= '\0';

    /* Split "code(s) ; weights" at the semicolon */
    if (!(weight= strchr(str, ';')))
      continue;
    *weight++= '\0';
    for ( ; *weight == ' ' ; weight++);

    /* Count the code points listed before the semicolon */
    codenum= 0;
    for (s= strtok(str, " \t"); s; s= strtok(NULL, " \t"))
      codenum++;

    if (codenum != 1)
    {
      /*
        More than one code point means a multi-character weight,
        i.e. contraction. Not supported yet.
        Zero code points means a malformed line, which must not be
        attributed to U+0000.
      */
      continue;
    }

    /* Split the weight string into collation elements */
    nweights= 0;
    for (s= strtok(weight, " []"); s; s= strtok(NULL, " []"))
    {
      if (nweights == UCA_MAX_WEIGHTS)
      {
        /* Storing more would overflow uca[code].weight[][] */
        fprintf(stderr,
                "U+%04lX: more than %d collation elements, "
                "extra elements ignored\n",
                (unsigned long) code, UCA_MAX_WEIGHTS);
        break;
      }
      weights[nweights++]= s;
    }
    uca[code].num= (uchar) nweights;

    for (w= 0; w < nweights; w++)
    {
      size_t partnum= 0;

      s= weights[w];
      /* Stop after UCA_MAX_LEVELS parts so that malformed input cannot overflow */
      while (*s && partnum < UCA_MAX_LEVELS)
      {
        char *endptr;
        /* Every part is preceded by a one-character marker, hence s+1 */
        uca[code].weight[partnum][w]= (uint16) strtol(s + 1, &endptr, 16);
        s= endptr;
        partnum++;
      }
    }
    /* Mark that a character from this page was loaded */
    pageloaded[code >> MY_UCA_PSHIFT]++;
  }

  if (ferror(stdin))
  {
    fprintf(stderr, "Error reading input\n");
    return 1;
  }


  /* Now set implicit weights for all characters that got no explicit ones */
  for (code= 0; code <= MAX_ALLOWED_CODE; code++)
  {
    size_t base, aaaa, bbbb;

    if (uca[code].num)
      continue;

    /*
      3400;<CJK Ideograph Extension A, First>
      4DB5;<CJK Ideograph Extension A, Last>
      4E00;<CJK Ideograph, First>
      9FA5;<CJK Ideograph, Last>
    */

    if (code >= 0x3400 && code <= 0x4DB5)
      base= 0xFB80;
    else if (code >= 0x4E00 && code <= 0x9FA5)
      base= 0xFB40;
    else
      base= 0xFBC0;

    aaaa= base + (code >> 15);
    bbbb= (code & 0x7FFF) | 0x8000;
    uca[code].weight[0][0]= (uint16) aaaa;
    uca[code].weight[0][1]= (uint16) bbbb;

    uca[code].weight[1][0]= 0x0020;
    uca[code].weight[1][1]= 0x0000;

    uca[code].weight[2][0]= 0x0002;
    uca[code].weight[2][1]= 0x0000;

    uca[code].weight[3][0]= 0x0001;
    uca[code].weight[3][1]= 0x0000;

    uca[code].num= 2;
  }

  printf("#include \"my_uca.h\"\n");

  printf("#define MY_UCA_NPAGES %d\n",MY_UCA_NPAGES);
  printf("#define MY_UCA_NCHARS %d\n",MY_UCA_NCHARS);
  printf("#define MY_UCA_CMASK %d\n",MY_UCA_CMASK);
  printf("#define MY_UCA_PSHIFT %d\n",MY_UCA_PSHIFT);

  /* Dump the first three weight levels, one set of tables per level */
  for (w= 0; w < 3; w++)
  {
    size_t page;
    int pagemaxlen[MY_UCA_NPAGES];

    for (page= 0; page < MY_UCA_NPAGES; page++)
    {
      size_t offs;
      size_t maxnum= 0;
      size_t nchars= 0;
      size_t mchars;
      size_t ndefs= 0;
      size_t code_line_start= page * MY_UCA_NCHARS;

      pagemaxlen[page]= 0;

      /*
        Skip this page if no weights were loaded
      */

      if (!pageloaded[page])
        continue;

      /*
        Calculate maximum weight
        length for this page
      */

      for (offs= 0; offs < MY_UCA_NCHARS; offs++)
      {
        size_t i, num;

        code= page * MY_UCA_NCHARS + offs;

        /* Calculate only non-zero weights */
        for (num= 0, i= 0; i < uca[code].num; i++)
          if (uca[code].weight[w][i])
            num++;

        maxnum= maxnum < num ? num : maxnum;

        /* Check if default weight */
        if (w == 1 && num == 1)
        {
          /* 0020 0000 ... */
          if (uca[code].weight[w][0] == 0x0020)
            ndefs++;
        }
        else if (w == 2 && num == 1)
        {
          /* 0002 0000 ... */
          if (uca[code].weight[w][0] == 0x0002)
            ndefs++;
        }
      }
      /* One extra slot per character for the terminating zero weight */
      maxnum++;

      /*
        If the page has only default weights
        then there is no need to dump it, skip.
      */
      if (ndefs == MY_UCA_NCHARS)
      {
        continue;
      }

      /*
        mchars is the number of values after which the output line is
        broken; it affects only the layout of the generated source.
        In the default case "code" is still the last character of
        this page, as left by the loop above.
      */
      switch (maxnum)
      {
        case 0: mchars= 8; break;
        case 1: mchars= 8; break;
        case 2: mchars= 8; break;
        case 3: mchars= 9; break;
        case 4: mchars= 8; break;
        default: mchars= uca[code].num;
      }

      pagemaxlen[page]= (int) maxnum;


      /*
        Now print this page
      */


      printf("static const uint16 %s%s%03X%s[]= { /* %04X (%d weights per char) */\n",
             global_name_prefix, pname_prefix[w], (unsigned int) page,
             pname_suffix[w],
             (unsigned int) (page * MY_UCA_NCHARS), (int) maxnum);

      for (offs= 0; offs < MY_UCA_NCHARS; offs++)
      {
        /* Room for UCA_MAX_WEIGHTS non-zero weights plus the terminator */
        uint16 weight[UCA_MAX_WEIGHTS + 1];
        size_t num, i;

        code= page * MY_UCA_NCHARS + offs;

        memset(weight, 0, sizeof(weight));

        /* Copy non-zero weights */
        for (num= 0, i= 0; i < uca[code].num; i++)
        {
          if (uca[code].weight[w][i])
          {
            weight[num]= uca[code].weight[w][i];
            num++;
          }
        }

        for (i= 0; i < maxnum; i++)
        {
          /*
            Invert weights for the third level (w == 2, the "_w3"
            arrays) to sort upper case letters before their
            lower case counter part.
          */
          int tmp= weight[i];
          if (w == 2 && tmp)
            tmp= (int)(0x20 - weight[i]);


          printf("0x%04X", (unsigned int) tmp);
          if ((offs + 1 != MY_UCA_NCHARS) || (i + 1 != maxnum))
            printf(",");
          else
            printf(" ");
          nchars++;
        }
        if (nchars >= mchars)
        {
          printf(" /* %04X */\n", (unsigned int) code_line_start);
          code_line_start= code + 1;
          nchars= 0;
        }
        else
        {
          printf(" ");
        }
      }
      printf("};\n\n");
    }

    /* Per-page element length table for this level (0 = page not dumped) */
    printf("const uchar %s_length%s[%d]={\n",
           global_name_prefix, pname_suffix[w], MY_UCA_NPAGES);
    for (page= 0; page < MY_UCA_NPAGES; page++)
    {
      printf("%d%s%s",pagemaxlen[page],page<MY_UCA_NPAGES-1?",":"",(page+1) % 16 ? "":"\n");
    }
    printf("};\n");


    /* Per-page pointer table for this level (NULL = page not dumped) */
    printf("static const uint16 *%s_weight%s[%d]={\n",
           global_name_prefix, pname_suffix[w], MY_UCA_NPAGES);
    for (page= 0; page < MY_UCA_NPAGES; page++)
    {
      const char *comma= page < MY_UCA_NPAGES-1 ? "," : "";
      const char *nline= (page+1) % 4 ? "" : "\n";
      if (!pagemaxlen[page])
        printf("NULL %s%s%s", w ? " ": "",  comma , nline);
      else
        printf("%s%s%03X%s%s%s",
               global_name_prefix, pname_prefix[w], (unsigned int) page,
               pname_suffix[w], comma, nline);
    }
    printf("};\n");
  }


  printf("int main(void){ return 0;};\n");

  /* A truncated generated file must not look like a success */
  if (fflush(stdout) != 0 || ferror(stdout))
  {
    fprintf(stderr, "Error writing output\n");
    return 1;
  }
  return 0;
}