summary refs log tree commit diff stats
path: root/strings/uca-dump.c
diff options
context:
space:
mode:
Diffstat (limited to 'strings/uca-dump.c')
-rw-r--r-- strings/uca-dump.c 354
1 files changed, 354 insertions, 0 deletions
diff --git a/strings/uca-dump.c b/strings/uca-dump.c
new file mode 100644
index 00000000..837dd7f3
--- /dev/null
+++ b/strings/uca-dump.c
@@ -0,0 +1,354 @@
+/* Copyright (c) 2004, 2006 MySQL AB
+ Copyright (c) 2009-2011, Monty Program Ab
+ Use is subject to license terms.
+ Copyright (c) 2009-2011, Monty Program Ab
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+typedef unsigned char uchar;
+typedef unsigned short uint16;
+
+struct uca_item_st /* weights of one character; main() keeps one item per code point */
+{
+ uchar num; /* number of weight elements stored for the character */
+ uint16 weight[4][9]; /* weight[part][element], filled in by main() */
+};
+
+#if 0 /* disabled alternative layout: 1024 pages of 64 characters */
+#define MY_UCA_NPAGES 1024
+#define MY_UCA_NCHARS 64
+#define MY_UCA_CMASK 63
+#define MY_UCA_PSHIFT 6
+#else
+#define MY_UCA_NPAGES 4352 /* 0x110000 characters / 0x100 chars per page */
+#define MY_UCA_NCHARS 256 /* characters per page */
+#define MY_UCA_CMASK 255 /* MY_UCA_NCHARS - 1; only echoed into the generated output here */
+#define MY_UCA_PSHIFT 8 /* code >> MY_UCA_PSHIFT is the page number */
+#endif
+
+#define MAX_ALLOWED_CODE 0x10FFFF /* highest code point accepted from the input */
+
+/* Name that goes into all array names */
+static const char *global_name_prefix= "uca520";
+
+/* Name prefix that goes into page weight array names after global_name_prefix; indexed by weight level */
+static const char *const pname_prefix[]= {"_p", "_p", "_p"};
+
+/* Name suffix that goes into page weight array names after page number; indexed by weight level */
+static const char *const pname_suffix[]= {"", "_w2", "_w3"};
+/* Read UCA weight lines from stdin, fill in implicit weights for code points
+   that were not listed, and print the per-page weight tables as C source. */
+int main(int ac, char **av)
+{
+  char str[256];
+  char *weights[64];
+  static struct uca_item_st uca[MAX_ALLOWED_CODE+1];
+  size_t code, w;
+  int pageloaded[MY_UCA_NPAGES];
+  /* Capacity of uca_item_st.weight: parts per element, elements per character */
+  const size_t max_parts= sizeof(uca[0].weight) / sizeof(uca[0].weight[0]);
+  const size_t max_elems= sizeof(uca[0].weight[0]) / sizeof(uca[0].weight[0][0]);
+
+  /* memset() is declared in <string.h>, which this file already includes */
+  memset(uca, 0, sizeof(uca));
+  memset(pageloaded, 0, sizeof(pageloaded));
+
+  /*
+    NOTE: fgets() returns a line longer than str in pieces; each piece is
+    then parsed as if it were a line of its own.
+  */
+  while (fgets(str,sizeof(str),stdin))
+  {
+    char *comment;
+    char *weight;
+    char *s;
+    size_t codenum;
+
+    /* Leading hex number is the code point; negative values wrap and fail the range check */
+    code= strtol(str,NULL,16);
+
+    if (str[0]=='#' || (code > MAX_ALLOWED_CODE))
+      continue;
+
+    /* Cut off the trailing '#' comment; lines without one are skipped */
+    if (!(comment= strchr(str,'#')))
+      continue;
+    *comment= '\0';
+
+    /* Text before ';' lists the code points, text after it holds the weights */
+    if (!(weight= strchr(str,';')))
+      continue;
+    *weight++= '\0';
+    while (*weight == ' ')
+      weight++;
+
+    /* Count the code points on the left-hand side */
+    codenum= 0;
+    for (s= strtok(str, " \t"); s; s= strtok(NULL, " \t"))
+      codenum++;
+
+    if (codenum>1)
+    {
+      /* Multi-character weight,
+         i.e. contraction.
+         Not supported yet.
+      */
+      continue;
+    }
+
+    /*
+      Split the weight string on spaces and brackets, one token per element.
+      Elements that do not fit into uca_item_st.weight are dropped.
+    */
+    uca[code].num= 0;
+    for (s= strtok(weight, " []"); s && uca[code].num < max_elems; s= strtok(NULL, " []"))
+      weights[uca[code].num++]= s;
+
+    for (w=0; w < uca[code].num; w++)
+    {
+      size_t partnum= 0;
+
+      /* Each part is one separator character followed by a hex number */
+      for (s= weights[w]; *s && partnum < max_parts; partnum++)
+      {
+        char *endptr;
+        uca[code].weight[partnum][w]= (uint16) strtol(s+1,&endptr,16);
+        s= endptr;
+      }
+    }
+    /* Mark that a character from this page was loaded */
+    pageloaded[code >> MY_UCA_PSHIFT]++;
+  }
+
+  /* Now set implicit weights */
+  for (code=0; code <= MAX_ALLOWED_CODE; code++)
+  {
+    size_t base, aaaa, bbbb;
+
+    if (uca[code].num)
+      continue;
+
+    /*
+      3400;<CJK Ideograph Extension A, First>
+      4DB5;<CJK Ideograph Extension A, Last>
+      4E00;<CJK Ideograph, First>
+      9FA5;<CJK Ideograph, Last>
+    */
+
+    if (code >= 0x3400 && code <= 0x4DB5)
+      base= 0xFB80;
+    else if (code >= 0x4E00 && code <= 0x9FA5)
+      base= 0xFB40;
+    else
+      base= 0xFBC0;
+
+    /* Primary level: base plus the bits above bit 14, then the low 15 bits with bit 15 set */
+    aaaa= base + (code >> 15);
+    bbbb= (code & 0x7FFF) | 0x8000;
+    uca[code].weight[0][0]= aaaa;
+    uca[code].weight[0][1]= bbbb;
+
+    uca[code].weight[1][0]= 0x0020;
+    uca[code].weight[1][1]= 0x0000;
+
+    uca[code].weight[2][0]= 0x0002;
+    uca[code].weight[2][1]= 0x0000;
+
+    /* Second element of the last level is index [3][1], matching the levels above */
+    uca[code].weight[3][0]= 0x0001;
+    uca[code].weight[3][1]= 0x0000;
+
+    uca[code].num= 2;
+  }
+
+  printf("#include \"my_uca.h\"\n");
+
+  printf("#define MY_UCA_NPAGES %d\n",MY_UCA_NPAGES);
+  printf("#define MY_UCA_NCHARS %d\n",MY_UCA_NCHARS);
+  printf("#define MY_UCA_CMASK %d\n",MY_UCA_CMASK);
+  printf("#define MY_UCA_PSHIFT %d\n",MY_UCA_PSHIFT);
+
+  /* One pass per dumped weight level; weight[3] is filled in above but not printed */
+  for (w=0; w<3; w++)
+  {
+    size_t page;
+    int pagemaxlen[MY_UCA_NPAGES];
+
+    for (page=0; page < MY_UCA_NPAGES; page++)
+    {
+      size_t offs;
+      size_t maxnum= 0;
+      size_t nchars= 0;
+      size_t mchars;
+      size_t ndefs= 0;
+      size_t code_line_start= page * MY_UCA_NCHARS;
+
+      pagemaxlen[page]= 0;
+
+      /*
+        Skip this page if no weights were loaded
+      */
+      if (!pageloaded[page])
+        continue;
+
+      /*
+        Calculate maximum weight
+        length for this page
+      */
+      for (offs=0; offs < MY_UCA_NCHARS; offs++)
+      {
+        size_t i, num;
+
+        code= page*MY_UCA_NCHARS+offs;
+
+        /* Calculate only non-zero weights */
+        for (num=0, i=0; i < uca[code].num; i++)
+          if (uca[code].weight[w][i])
+            num++;
+
+        maxnum= maxnum < num ? num : maxnum;
+
+        /* Check if default weight */
+        if (w == 1 && num == 1)
+        {
+          /* 0020 0000 ... */
+          if (uca[code].weight[w][0] == 0x0020)
+            ndefs++;
+        }
+        else if (w == 2 && num == 1)
+        {
+          /* 0002 0000 ... */
+          if (uca[code].weight[w][0] == 0x0002)
+            ndefs++;
+        }
+      }
+      /* One extra slot so that every character ends with a zero weight */
+      maxnum++;
+
+      /*
+        If the page has only default weights
+        then there is no need to dump it, skip.
+      */
+      if (ndefs == MY_UCA_NCHARS)
+      {
+        continue;
+      }
+
+      /*
+        mchars is the number of values printed before the output line is
+        broken. In the default case "code" is the last character of the page.
+      */
+      switch (maxnum)
+      {
+      case 0: mchars= 8; break;
+      case 1: mchars= 8; break;
+      case 2: mchars= 8; break;
+      case 3: mchars= 9; break;
+      case 4: mchars= 8; break;
+      default: mchars= uca[code].num;
+      }
+
+      pagemaxlen[page]= maxnum;
+
+      /*
+        Now print this page
+      */
+      printf("static const uint16 %s%s%03X%s[]= { /* %04X (%d weights per char) */\n",
+             global_name_prefix, pname_prefix[w], (int) page, pname_suffix[w],
+             (int) page*MY_UCA_NCHARS, (int) maxnum);
+
+      for (offs=0; offs < MY_UCA_NCHARS; offs++)
+      {
+        /* Room for every stored element plus the terminating zero */
+        uint16 weight[sizeof(uca[0].weight[0]) / sizeof(uca[0].weight[0][0]) + 1];
+        size_t num, i;
+
+        code= page*MY_UCA_NCHARS+offs;
+
+        memset(weight, 0, sizeof(weight));
+
+        /* Copy non-zero weights */
+        for (num=0, i=0; i < uca[code].num; i++)
+        {
+          if (uca[code].weight[w][i])
+          {
+            weight[num]= uca[code].weight[w][i];
+            num++;
+          }
+        }
+
+        for (i=0; i < maxnum; i++)
+        {
+          /*
+            Invert weights for the third level (w == 2) to
+            sort upper case letters before their
+            lower case counterpart.
+          */
+          int tmp= weight[i];
+          if (w == 2 && tmp)
+            tmp= (int)(0x20 - weight[i]);
+
+          printf("0x%04X", tmp);
+          if ((offs+1 != MY_UCA_NCHARS) || (i+1!=maxnum))
+            printf(",");
+          else
+            printf(" ");
+          nchars++;
+        }
+        if (nchars >=mchars)
+        {
+          printf(" /* %04X */\n", (int) code_line_start);
+          code_line_start= code + 1;
+          nchars=0;
+        }
+        else
+        {
+          printf(" ");
+        }
+      }
+      printf("};\n\n");
+    }
+
+    printf("const uchar %s_length%s[%d]={\n",
+           global_name_prefix, pname_suffix[w], MY_UCA_NPAGES);
+    for (page=0; page < MY_UCA_NPAGES; page++)
+    {
+      printf("%d%s%s",pagemaxlen[page],page<MY_UCA_NPAGES-1?",":"",(page+1) % 16 ? "":"\n");
+    }
+    printf("};\n");
+
+    printf("static const uint16 *%s_weight%s[%d]={\n",
+           global_name_prefix, pname_suffix[w], MY_UCA_NPAGES);
+    for (page=0; page < MY_UCA_NPAGES; page++)
+    {
+      const char *comma= page < MY_UCA_NPAGES-1 ? "," : "";
+      const char *nline= (page+1) % 4 ? "" : "\n";
+      if (!pagemaxlen[page])
+        printf("NULL %s%s%s", w ? " ": "", comma , nline);
+      else
+        printf("%s%s%03X%s%s%s",
+               global_name_prefix, pname_prefix[w], (int) page, pname_suffix[w],
+               comma, nline);
+    }
+    printf("};\n");
+  }
+
+
+  printf("int main(void){ return 0;};\n");
+  return 0;
+}