diff options
Diffstat (limited to 'ext/fts3/unicode/mkunicode.tcl')
-rw-r--r-- | ext/fts3/unicode/mkunicode.tcl | 984 |
1 files changed, 984 insertions, 0 deletions
diff --git a/ext/fts3/unicode/mkunicode.tcl b/ext/fts3/unicode/mkunicode.tcl new file mode 100644 index 0000000..58d90c6 --- /dev/null +++ b/ext/fts3/unicode/mkunicode.tcl @@ -0,0 +1,984 @@ + +source [file join [file dirname [info script]] parseunicode.tcl] + +proc print_rd {map} { + global tl_lookup_table + set aChar [list] + set lRange [list] + + set nRange 1 + set iFirst [lindex $map 0 0] + set cPrev [lindex $map 0 1] + set fPrev [lindex $map 0 2] + + foreach m [lrange $map 1 end] { + foreach {i c f} $m {} + + if {$cPrev == $c && $fPrev==$f} { + for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} { + if {[info exists tl_lookup_table($j)]==0} break + } + + if {$j==$i} { + set nNew [expr {(1 + $i - $iFirst)}] + if {$nNew<=8} { + set nRange $nNew + continue + } + } + } + + lappend lRange [list $iFirst $nRange] + lappend aChar $cPrev + lappend aFlag $fPrev + + set iFirst $i + set cPrev $c + set fPrev $f + set nRange 1 + } + lappend lRange [list $iFirst $nRange] + lappend aChar $cPrev + lappend aFlag $fPrev + + puts "/*" + puts "** If the argument is a codepoint corresponding to a lowercase letter" + puts "** in the ASCII range with a diacritic added, return the codepoint" + puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN" + puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER" + puts "** E\"). The resuls of passing a codepoint that corresponds to an" + puts "** uppercase letter are undefined." + puts "*/" + puts "static int ${::remove_diacritic}(int c, int bComplex)\{" + puts " unsigned short aDia\[\] = \{" + puts -nonewline " 0, " + set i 1 + foreach r $lRange { + foreach {iCode nRange} $r {} + if {($i % 8)==0} {puts "" ; puts -nonewline " " } + incr i + + puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]] + puts -nonewline ", " + } + puts "" + puts " \};" + puts "#define HIBIT ((unsigned char)0x80)" + puts " unsigned char aChar\[\] = \{" + puts -nonewline " '\\0', " + set i 1 + foreach c $aChar f $aFlag { + if { $f } { + set str "'$c'|HIBIT, " + } else { + set str "'$c', " + } + if {$c == ""} { set str "'\\0', " } + + if {($i % 6)==0} {puts "" ; puts -nonewline " " } + incr i + puts -nonewline "$str" + } + puts "" + puts " \};" + puts { + unsigned int key = (((unsigned int)c)<<3) | 0x00000007; + int iRes = 0; + int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1; + int iLo = 0; + while( iHi>=iLo ){ + int iTest = (iHi + iLo) / 2; + if( key >= aDia[iTest] ){ + iRes = iTest; + iLo = iTest+1; + }else{ + iHi = iTest-1; + } + } + assert( key>=aDia[iRes] ); + if( bComplex==0 && (aChar[iRes] & 0x80) ) return c; + return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);} + puts "\}" +} + +proc print_isdiacritic {zFunc map} { + + set lCode [list] + foreach m $map { + foreach {code char flag} $m {} + if {$flag} continue + if {$code && $char == ""} { lappend lCode $code } + } + set lCode [lsort -integer $lCode] + set iFirst [lindex $lCode 0] + set iLast [lindex $lCode end] + + set i1 0 + set i2 0 + + foreach c $lCode { + set i [expr $c - $iFirst] + if {$i < 32} { + set i1 [expr {$i1 | (1<<$i)}] + } else { + set i2 [expr {$i2 | (1<<($i-32))}] + } + } + + puts "/*" + puts "** Return true if the argument interpreted as a unicode codepoint" + puts "** is a diacritical modifier character." + puts "*/" + puts "int ${zFunc}\(int c)\{" + puts " unsigned int mask0 = [format "0x%08X" $i1];" + puts " unsigned int mask1 = [format "0x%08X" $i2];" + + puts " if( c<$iFirst || c>$iLast ) return 0;" + puts " return (c < $iFirst+32) ?" + puts " (mask0 & ((unsigned int)1 << (c-$iFirst))) :" + puts " (mask1 & ((unsigned int)1 << (c-$iFirst-32)));" + puts "\}" +} + + +#------------------------------------------------------------------------- + +proc an_load_separator_ranges {} { + global unicodedata.txt + set lSep [an_load_unicodedata_text ${unicodedata.txt}] + unset -nocomplain iFirst + unset -nocomplain nRange + set lRange [list] + foreach sep $lSep { + if {0==[info exists iFirst]} { + set iFirst $sep + set nRange 1 + } elseif { $sep == ($iFirst+$nRange) } { + incr nRange + } else { + lappend lRange [list $iFirst $nRange] + set iFirst $sep + set nRange 1 + } + } + lappend lRange [list $iFirst $nRange] + set lRange +} + +proc an_print_range_array {lRange} { + set iFirstMax 0 + set nRangeMax 0 + foreach range $lRange { + foreach {iFirst nRange} $range {} + if {$iFirst > $iFirstMax} {set iFirstMax $iFirst} + if {$nRange > $nRangeMax} {set nRangeMax $nRange} + } + if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"} + if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"} + + puts -nonewline " " + puts [string trim { + /* Each unsigned integer in the following array corresponds to a contiguous + ** range of unicode codepoints that are not either letters or numbers (i.e. + ** codepoints for which this function should return 0). + ** + ** The most significant 22 bits in each 32-bit value contain the first + ** codepoint in the range. The least significant 10 bits are used to store + ** the size of the range (always at least 1). In other words, the value + ** ((C<<22) + N) represents a range of N codepoints starting with codepoint + ** C. It is not possible to represent a range larger than 1023 codepoints + ** using this format. + */ + }] + puts -nonewline " static const unsigned int aEntry\[\] = \{" + set i 0 + foreach range $lRange { + foreach {iFirst nRange} $range {} + set u32 [format "0x%08X" [expr ($iFirst<<10) + $nRange]] + + if {($i % 5)==0} {puts "" ; puts -nonewline " "} + puts -nonewline " $u32," + incr i + } + puts "" + puts " \};" +} + +proc an_print_ascii_bitmap {lRange} { + foreach range $lRange { + foreach {iFirst nRange} $range {} + for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} { + if {$i<=127} { set a($i) 1 } + } + } + + set aAscii [list 0 0 0 0] + foreach key [array names a] { + set idx [expr $key >> 5] + lset aAscii $idx [expr [lindex $aAscii $idx] | (1 << ($key&0x001F))] + } + + puts " static const unsigned int aAscii\[4\] = \{" + puts -nonewline " " + foreach v $aAscii { puts -nonewline [format " 0x%08X," $v] } + puts "" + puts " \};" +} + +proc print_isalnum {zFunc lRange} { + puts "/*" + puts "** Return true if the argument corresponds to a unicode codepoint" + puts "** classified as either a letter or a number. Otherwise false." + puts "**" + puts "** The results are undefined if the value passed to this function" + puts "** is less than zero." + puts "*/" + puts "int ${zFunc}\(int c)\{" + an_print_range_array $lRange + an_print_ascii_bitmap $lRange + puts { + if( (unsigned int)c<128 ){ + return ( (aAscii[c >> 5] & ((unsigned int)1 << (c & 0x001F)))==0 ); + }else if( (unsigned int)c<(1<<22) ){ + unsigned int key = (((unsigned int)c)<<10) | 0x000003FF; + int iRes = 0; + int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; + int iLo = 0; + while( iHi>=iLo ){ + int iTest = (iHi + iLo) / 2; + if( key >= aEntry[iTest] ){ + iRes = iTest; + iLo = iTest+1; + }else{ + iHi = iTest-1; + } + } + assert( aEntry[0]<key ); + assert( key>=aEntry[iRes] ); + return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF))); + } + return 1;} + puts "\}" +} + +proc print_test_isalnum {zFunc lRange} { + foreach range $lRange { + foreach {iFirst nRange} $range {} + for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} { set a($i) 1 } + } + + puts "static int isalnum_test(int *piCode)\{" + puts -nonewline " unsigned char aAlnum\[\] = \{" + for {set i 0} {$i < 70000} {incr i} { + if {($i % 32)==0} { puts "" ; puts -nonewline " " } + set bFlag [expr ![info exists a($i)]] + puts -nonewline "${bFlag}," + } + puts "" + puts " \};" + + puts -nonewline " int aLargeSep\[\] = \{" + set i 0 + foreach iSep [lsort -integer [array names a]] { + if {$iSep<70000} continue + if {($i % 8)==0} { puts "" ; puts -nonewline " " } + puts -nonewline " $iSep," + incr i + } + puts "" + puts " \};" + puts -nonewline " int aLargeOther\[\] = \{" + set i 0 + foreach iSep [lsort -integer [array names a]] { + if {$iSep<70000} continue + if {[info exists a([expr $iSep-1])]==0} { + if {($i % 8)==0} { puts "" ; puts -nonewline " " } + puts -nonewline " [expr $iSep-1]," + incr i + } + if {[info exists a([expr $iSep+1])]==0} { + if {($i % 8)==0} { puts "" ; puts -nonewline " " } + puts -nonewline " [expr $iSep+1]," + incr i + } + } + puts "" + puts " \};" + + puts [subst -nocommands { + int i; + for(i=0; i<sizeof(aAlnum)/sizeof(aAlnum[0]); i++){ + if( ${zFunc}(i)!=aAlnum[i] ){ + *piCode = i; + return 1; + } + } + for(i=0; i<sizeof(aLargeSep)/sizeof(aLargeSep[0]); i++){ + if( ${zFunc}(aLargeSep[i])!=0 ){ + *piCode = aLargeSep[i]; + return 1; + } + } + for(i=0; i<sizeof(aLargeOther)/sizeof(aLargeOther[0]); i++){ + if( ${zFunc}(aLargeOther[i])!=1 ){ + *piCode = aLargeOther[i]; + return 1; + } + } + }] + puts " return 0;" + puts "\}" +} + +#------------------------------------------------------------------------- + +proc tl_create_records {} { + global tl_lookup_table + + set iFirst "" + set nOff 0 + set nRange 0 + set nIncr 0 + + set lRecord [list] + foreach code [lsort -integer [array names tl_lookup_table]] { + set mapping $tl_lookup_table($code) + if {$iFirst == ""} { + set iFirst $code + set nOff [expr $mapping - $code] + set nRange 1 + set nIncr 1 + } else { + set diff [expr $code - ($iFirst + ($nIncr * ($nRange - 1)))] + if { $nRange==1 && ($diff==1 || $diff==2) } { + set nIncr $diff + } + + if {$diff != $nIncr || ($mapping - $code)!=$nOff} { + if { $nRange==1 } {set nIncr 1} + lappend lRecord [list $iFirst $nIncr $nRange $nOff] + set iFirst $code + set nOff [expr $mapping - $code] + set nRange 1 + set nIncr 1 + } else { + incr nRange + } + } + } + + lappend lRecord [list $iFirst $nIncr $nRange $nOff] + + set lRecord +} + +proc tl_print_table_header {} { + puts -nonewline " " + puts [string trim { + /* Each entry in the following array defines a rule for folding a range + ** of codepoints to lower case. The rule applies to a range of nRange + ** codepoints starting at codepoint iCode. + ** + ** If the least significant bit in flags is clear, then the rule applies + ** to all nRange codepoints (i.e. all nRange codepoints are upper case and + ** need to be folded). Or, if it is set, then the rule only applies to + ** every second codepoint in the range, starting with codepoint C. + ** + ** The 7 most significant bits in flags are an index into the aiOff[] + ** array. If a specific codepoint C does require folding, then its lower + ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF). + ** + ** The contents of this array are generated by parsing the CaseFolding.txt + ** file distributed as part of the "Unicode Character Database". See + ** http://www.unicode.org for details. + */ + }] + puts " static const struct TableEntry \{" + puts " unsigned short iCode;" + puts " unsigned char flags;" + puts " unsigned char nRange;" + puts " \} aEntry\[\] = \{" +} + +proc tl_print_table_entry {togglevar entry liOff} { + upvar $togglevar t + foreach {iFirst nIncr nRange nOff} $entry {} + + if {$iFirst > (1<<16)} { return 1 } + + if {[info exists t]==0} {set t 0} + if {$t==0} { puts -nonewline " " } + + set flags 0 + if {$nIncr==2} { set flags 1 ; set nRange [expr $nRange * 2]} + if {$nOff<0} { incr nOff [expr (1<<16)] } + + set idx [lsearch $liOff $nOff] + if {$idx<0} {error "malfunction generating aiOff"} + set flags [expr $flags + $idx*2] + + set txt "{$iFirst, $flags, $nRange}," + if {$t==2} { + puts $txt + } else { + puts -nonewline [format "% -23s" $txt] + } + set t [expr ($t+1)%3] + + return 0 +} + +proc tl_print_table_footer {togglevar} { + upvar $togglevar t + if {$t!=0} {puts ""} + puts " \};" +} + +proc tl_print_if_entry {entry} { + foreach {iFirst nIncr nRange nOff} $entry {} + if {$nIncr==2} {error "tl_print_if_entry needs improvement!"} + + puts " else if( c>=$iFirst && c<[expr $iFirst+$nRange] )\{" + puts " ret = c + $nOff;" + puts " \}" +} + +proc tl_generate_ioff_table {lRecord} { + foreach entry $lRecord { + foreach {iFirst nIncr nRange iOff} $entry {} + if {$iOff<0} { incr iOff [expr (1<<16)] } + if {[info exists a($iOff)]} continue + set a($iOff) 1 + } + + set liOff [lsort -integer [array names a]] + if {[llength $liOff]>128} { error "Too many distinct ioffs" } + return $liOff +} + +proc tl_print_ioff_table {liOff} { + puts -nonewline " static const unsigned short aiOff\[\] = \{" + set i 0 + foreach off $liOff { + if {($i % 8)==0} {puts "" ; puts -nonewline " "} + puts -nonewline [format "% -7s" "$off,"] + incr i + } + puts "" + puts " \};" + +} + +proc print_fold {zFunc} { + + set lRecord [tl_create_records] + + set lHigh [list] + puts "/*" + puts "** Interpret the argument as a unicode codepoint. If the codepoint" + puts "** is an upper case character that has a lower case equivalent," + puts "** return the codepoint corresponding to the lower case version." + puts "** Otherwise, return a copy of the argument." + puts "**" + puts "** The results are undefined if the value passed to this function" + puts "** is less than zero." + puts "*/" + puts "int ${zFunc}\(int c, int eRemoveDiacritic)\{" + + set liOff [tl_generate_ioff_table $lRecord] + tl_print_table_header + foreach entry $lRecord { + if {[tl_print_table_entry toggle $entry $liOff]} { + lappend lHigh $entry + } + } + tl_print_table_footer toggle + tl_print_ioff_table $liOff + + puts [subst -nocommands { + int ret = c; + + assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 ); + + if( c<128 ){ + if( c>='A' && c<='Z' ) ret = c + ('a' - 'A'); + }else if( c<65536 ){ + const struct TableEntry *p; + int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; + int iLo = 0; + int iRes = -1; + + assert( c>aEntry[0].iCode ); + while( iHi>=iLo ){ + int iTest = (iHi + iLo) / 2; + int cmp = (c - aEntry[iTest].iCode); + if( cmp>=0 ){ + iRes = iTest; + iLo = iTest+1; + }else{ + iHi = iTest-1; + } + } + + assert( iRes>=0 && c>=aEntry[iRes].iCode ); + p = &aEntry[iRes]; + if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ + ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; + assert( ret>0 ); + } + + if( eRemoveDiacritic ){ + ret = ${::remove_diacritic}(ret, eRemoveDiacritic==2); + } + } + }] + + foreach entry $lHigh { + tl_print_if_entry $entry + } + + puts "" + puts " return ret;" + puts "\}" +} + +proc code {txt} { + set txt [string trimright $txt] + set txt [string trimleft $txt "\n"] + set n [expr {[string length $txt] - [string length [string trim $txt]]}] + set ret "" + foreach L [split $txt "\n"] { + append ret "[string range $L $n end]\n" + } + return [uplevel "subst -nocommands {$ret}"] +} + +proc intarray {lInt} { + set ret "" + set n [llength $lInt] + for {set i 0} {$i < $n} {incr i 10} { + append ret "\n " + foreach int [lrange $lInt $i [expr $i+9]] { + append ret [format "%-7s" "$int, "] + } + } + append ret "\n " + set ret +} + +proc categories_switch {Cvar first lSecond} { + upvar $Cvar C + set ret "" + append ret "case '$first':\n" + append ret " switch( zCat\[1\] ){\n" + foreach s $lSecond { + append ret " case '$s': aArray\[$C($first$s)\] = 1; break;\n" + } + append ret " case '*': \n" + foreach s $lSecond { + append ret " aArray\[$C($first$s)\] = 1;\n" + } + append ret " break;\n" + append ret " default: return 1;" + append ret " }\n" + append ret " break;\n" +} + +# Argument is a list. Each element of which is itself a list of two elements: +# +# * the codepoint +# * the category +# +# List elements are sorted in order of codepoint. +# +proc print_categories {lMap} { + set categories { + Cc Cf Cn Cs + Ll Lm Lo Lt Lu + Mc Me Mn + Nd Nl No + Pc Pd Pe Pf Pi Po Ps + Sc Sk Sm So + Zl Zp Zs + + LC Co + } + + for {set i 0} {$i < [llength $categories]} {incr i} { + set C([lindex $categories $i]) [expr 1+$i] + } + + set caseC [categories_switch C C {c f n s o}] + set caseL [categories_switch C L {l m o t u C}] + set caseM [categories_switch C M {c e n}] + set caseN [categories_switch C N {d l o}] + set caseP [categories_switch C P {c d e f i o s}] + set caseS [categories_switch C S {c k m o}] + set caseZ [categories_switch C Z {l p s}] + + set nCat [expr [llength [array names C]] + 1] + puts [code { + int sqlite3Fts5UnicodeCatParse(const char *zCat, u8 *aArray){ + aArray[0] = 1; + switch( zCat[0] ){ + $caseC + $caseL + $caseM + $caseN + $caseP + $caseS + $caseZ + } + return 0; + } + }] + + set nRepeat 0 + set first [lindex $lMap 0 0] + set class [lindex $lMap 0 1] + set prev -1 + + set CASE(0) "Lu" + set CASE(1) "Ll" + + foreach m $lMap { + foreach {codepoint cl} $m {} + set codepoint [expr "0x$codepoint"] + if {$codepoint>=(1<<20)} continue + + set bNew 0 + if {$codepoint!=($prev+1)} { + set bNew 1 + } elseif { + $cl==$class || ($class=="LC" && $cl==$CASE([expr $nRepeat & 0x01])) + } { + incr nRepeat + } elseif {$class=="Lu" && $nRepeat==1 && $cl=="Ll"} { + set class LC + incr nRepeat + } else { + set bNew 1 + } + if {$bNew} { + lappend lEntries [list $first $class $nRepeat] + set nRepeat 1 + set first $codepoint + set class $cl + } + set prev $codepoint + } + if {$nRepeat>0} { + lappend lEntries [list $first $class $nRepeat] + } + + set aBlock [list 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + set aMap [list] + foreach e $lEntries { + foreach {cp class nRepeat} $e {} + set block [expr ($cp>>16)] + if {$block>0 && [lindex $aBlock $block]==0} { + for {set i 1} {$i<=$block} {incr i} { + if {[lindex $aBlock $i]==0} { + lset aBlock $i [llength $aMap] + } + } + } + lappend aMap [expr {$cp & 0xFFFF}] + lappend aData [expr {($nRepeat << 5) + $C($class)}] + } + for {set i 1} {$i<[llength $aBlock]} {incr i} { + if {[lindex $aBlock $i]==0} { + lset aBlock $i [llength $aMap] + } + } + + set aBlockArray [intarray $aBlock] + set aMapArray [intarray $aMap] + set aDataArray [intarray $aData] + puts [code { + static u16 aFts5UnicodeBlock[] = {$aBlockArray}; + static u16 aFts5UnicodeMap[] = {$aMapArray}; + static u16 aFts5UnicodeData[] = {$aDataArray}; + + int sqlite3Fts5UnicodeCategory(u32 iCode) { + int iRes = -1; + int iHi; + int iLo; + int ret; + u16 iKey; + + if( iCode>=(1<<20) ){ + return 0; + } + iLo = aFts5UnicodeBlock[(iCode>>16)]; + iHi = aFts5UnicodeBlock[1+(iCode>>16)]; + iKey = (iCode & 0xFFFF); + while( iHi>iLo ){ + int iTest = (iHi + iLo) / 2; + assert( iTest>=iLo && iTest<iHi ); + if( iKey>=aFts5UnicodeMap[iTest] ){ + iRes = iTest; + iLo = iTest+1; + }else{ + iHi = iTest; + } + } + + if( iRes<0 ) return 0; + if( iKey>=(aFts5UnicodeMap[iRes]+(aFts5UnicodeData[iRes]>>5)) ) return 0; + ret = aFts5UnicodeData[iRes] & 0x1F; + if( ret!=$C(LC) ) return ret; + return ((iKey - aFts5UnicodeMap[iRes]) & 0x01) ? $C(Ll) : $C(Lu); + } + + void sqlite3Fts5UnicodeAscii(u8 *aArray, u8 *aAscii){ + int i = 0; + int iTbl = 0; + while( i<128 ){ + int bToken = aArray[ aFts5UnicodeData[iTbl] & 0x1F ]; + int n = (aFts5UnicodeData[iTbl] >> 5) + i; + for(; i<128 && i<n; i++){ + aAscii[i] = (u8)bToken; + } + iTbl++; + } + aAscii[0] = 0; /* 0x00 is never a token character */ + } + }] +} + +proc print_test_categories {lMap} { + + set lCP [list] + foreach e $lMap { + foreach {cp cat} $e {} + if {[expr 0x$cp] < (1<<20)} { + lappend lCP "{0x$cp, \"$cat\"}, " + } + } + + set aCP "\n" + for {set i 0} {$i < [llength $lCP]} {incr i 4} { + append aCP " [join [lrange $lCP $i $i+3]]\n" + } + + + puts [code { + static int categories_test (int *piCode){ + struct Codepoint { + int iCode; + const char *zCat; + } aCP[] = {$aCP}; + int i; + int iCP = 0; + + for(i=0; i<1000000; i++){ + u8 aArray[40]; + int cat = 0; + int c = 0; + memset(aArray, 0, sizeof(aArray)); + if( aCP[iCP].iCode==i ){ + sqlite3Fts5UnicodeCatParse(aCP[iCP].zCat, aArray); + iCP++; + }else{ + aArray[0] = 1; + } + + c = sqlite3Fts5UnicodeCategory((u32)i); + if( aArray[c]==0 ){ + *piCode = i; + return 1; + } + } + + return 0; + } + }] +} + +proc print_fold_test {zFunc mappings} { + global tl_lookup_table + + foreach m $mappings { + set c [lindex $m 1] + if {$c == ""} { + set extra([lindex $m 0]) 0 + } else { + scan $c %c i + set extra([lindex $m 0]) $i + } + } + + puts "static int fold_test(int *piCode)\{" + puts -nonewline " static int aLookup\[\] = \{" + for {set i 0} {$i < 70000} {incr i} { + + set expected $i + catch { set expected $tl_lookup_table($i) } + set expected2 $expected + catch { set expected2 $extra($expected2) } + + if {($i % 4)==0} { puts "" ; puts -nonewline " " } + puts -nonewline "$expected, $expected2, " + } + puts " \};" + puts " int i;" + puts " for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{" + puts " int iCode = (i/2);" + puts " int bFlag = i & 0x0001;" + puts " if( ${zFunc}\(iCode, bFlag)!=aLookup\[i\] )\{" + puts " *piCode = iCode;" + puts " return 1;" + puts " \}" + puts " \}" + puts " return 0;" + puts "\}" +} + + +proc print_fileheader {} { + puts [string trim { +/* +** 2012-05-25 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +****************************************************************************** +*/ + +/* +** DO NOT EDIT THIS MACHINE GENERATED FILE. +*/ + }] + puts "" + if {$::generate_fts5_code} { + # no-op + } else { + puts "#ifndef SQLITE_DISABLE_FTS3_UNICODE" + puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)" + } + puts "" + puts "#include <assert.h>" + puts "" +} + +proc print_test_main {} { + puts "" + puts "#include <stdio.h>" + puts "" + puts "int main(int argc, char **argv)\{" + puts " int r1, r2, r3;" + puts " int code;" + puts " r3 = 0;" + puts " r1 = isalnum_test(&code);" + puts " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);" + puts " else printf(\"isalnum(): test passed\\n\");" + puts " r2 = fold_test(&code);" + puts " if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);" + puts " else printf(\"fold(): test passed\\n\");" + if {$::generate_fts5_code} { + puts " r3 = categories_test(&code);" + puts " if( r3 ) printf(\"categories(): Problem with code %d\\n\",code);" + puts " else printf(\"categories(): test passed\\n\");" + } + puts " return (r1 || r2 || r3);" + puts "\}" +} + +# Proces the command line arguments. Exit early if they are not to +# our liking. +# +proc usage {} { + puts -nonewline stderr "Usage: $::argv0 ?-test? ?-fts5? " + puts stderr "<CaseFolding.txt file> <UnicodeData.txt file>" + exit 1 +} +if {[llength $argv]<2} usage +set unicodedata.txt [lindex $argv end] +set casefolding.txt [lindex $argv end-1] + +set remove_diacritic remove_diacritic +set generate_test_code 0 +set generate_fts5_code 0 +set function_prefix "sqlite3Fts" +for {set i 0} {$i < [llength $argv]-2} {incr i} { + switch -- [lindex $argv $i] { + -test { + set generate_test_code 1 + } + -fts5 { + set function_prefix sqlite3Fts5 + set generate_fts5_code 1 + set remove_diacritic fts5_remove_diacritic + } + default { + usage + } + } +} + +print_fileheader + +if {$::generate_test_code} { + puts "typedef unsigned short int u16;" + puts "typedef unsigned char u8;" + puts "#include <string.h>" +} + +# Print the isalnum() function to stdout. +# +set lRange [an_load_separator_ranges] +if {$generate_fts5_code==0} { + print_isalnum ${function_prefix}UnicodeIsalnum $lRange +} + +# Leave a gap between the two generated C functions. +# +puts "" +puts "" + +# Load the fold data. This is used by the [rd_XXX] commands +# as well as [print_fold]. +tl_load_casefolding_txt ${casefolding.txt} + +set mappings [rd_load_unicodedata_text ${unicodedata.txt}] +print_rd $mappings +puts "" +puts "" +print_isdiacritic ${function_prefix}UnicodeIsdiacritic $mappings +puts "" +puts "" + +# Print the fold() function to stdout. +# +print_fold ${function_prefix}UnicodeFold + +if {$generate_fts5_code} { + puts "" + puts "" + print_categories [cc_load_unicodedata_text ${unicodedata.txt}] +} + +# Print the test routines and main() function to stdout, if -test +# was specified. +# +if {$::generate_test_code} { + if {$generate_fts5_code==0} { + print_test_isalnum ${function_prefix}UnicodeIsalnum $lRange + } + print_fold_test ${function_prefix}UnicodeFold $mappings + print_test_categories [cc_load_unicodedata_text ${unicodedata.txt}] + print_test_main +} + +if {$generate_fts5_code} { + # no-op +} else { + puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */" + puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */" +} |