source [file join [file dirname [info script]] parseunicode.tcl] proc print_rd {map} { global tl_lookup_table set aChar [list] set lRange [list] set nRange 1 set iFirst [lindex $map 0 0] set cPrev [lindex $map 0 1] set fPrev [lindex $map 0 2] foreach m [lrange $map 1 end] { foreach {i c f} $m {} if {$cPrev == $c && $fPrev==$f} { for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} { if {[info exists tl_lookup_table($j)]==0} break } if {$j==$i} { set nNew [expr {(1 + $i - $iFirst)}] if {$nNew<=8} { set nRange $nNew continue } } } lappend lRange [list $iFirst $nRange] lappend aChar $cPrev lappend aFlag $fPrev set iFirst $i set cPrev $c set fPrev $f set nRange 1 } lappend lRange [list $iFirst $nRange] lappend aChar $cPrev lappend aFlag $fPrev puts "/*" puts "** If the argument is a codepoint corresponding to a lowercase letter" puts "** in the ASCII range with a diacritic added, return the codepoint" puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN" puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER" puts "** E\"). The resuls of passing a codepoint that corresponds to an" puts "** uppercase letter are undefined." puts "*/" puts "static int ${::remove_diacritic}(int c, int bComplex)\{" puts " unsigned short aDia\[\] = \{" puts -nonewline " 0, " set i 1 foreach r $lRange { foreach {iCode nRange} $r {} if {($i % 8)==0} {puts "" ; puts -nonewline " " } incr i puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]] puts -nonewline ", " } puts "" puts " \};" puts "#define HIBIT ((unsigned char)0x80)" puts " unsigned char aChar\[\] = \{" puts -nonewline " '\\0', " set i 1 foreach c $aChar f $aFlag { if { $f } { set str "'$c'|HIBIT, " } else { set str "'$c', " } if {$c == ""} { set str "'\\0', " } if {($i % 6)==0} {puts "" ; puts -nonewline " " } incr i puts -nonewline "$str" } puts "" puts " \};" puts { unsigned int key = (((unsigned int)c)<<3) | 0x00000007; int iRes = 0; int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1; int iLo = 0; while( iHi>=iLo ){ int iTest = (iHi + iLo) / 2; if( key >= aDia[iTest] ){ iRes = iTest; iLo = iTest+1; }else{ iHi = iTest-1; } } assert( key>=aDia[iRes] ); if( bComplex==0 && (aChar[iRes] & 0x80) ) return c; return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);} puts "\}" } proc print_isdiacritic {zFunc map} { set lCode [list] foreach m $map { foreach {code char flag} $m {} if {$flag} continue if {$code && $char == ""} { lappend lCode $code } } set lCode [lsort -integer $lCode] set iFirst [lindex $lCode 0] set iLast [lindex $lCode end] set i1 0 set i2 0 foreach c $lCode { set i [expr $c - $iFirst] if {$i < 32} { set i1 [expr {$i1 | (1<<$i)}] } else { set i2 [expr {$i2 | (1<<($i-32))}] } } puts "/*" puts "** Return true if the argument interpreted as a unicode codepoint" puts "** is a diacritical modifier character." puts "*/" puts "int ${zFunc}\(int c)\{" puts " unsigned int mask0 = [format "0x%08X" $i1];" puts " unsigned int mask1 = [format "0x%08X" $i2];" puts " if( c<$iFirst || c>$iLast ) return 0;" puts " return (c < $iFirst+32) ?" puts " (mask0 & ((unsigned int)1 << (c-$iFirst))) :" puts " (mask1 & ((unsigned int)1 << (c-$iFirst-32)));" puts "\}" } #------------------------------------------------------------------------- proc an_load_separator_ranges {} { global unicodedata.txt set lSep [an_load_unicodedata_text ${unicodedata.txt}] unset -nocomplain iFirst unset -nocomplain nRange set lRange [list] foreach sep $lSep { if {0==[info exists iFirst]} { set iFirst $sep set nRange 1 } elseif { $sep == ($iFirst+$nRange) } { incr nRange } else { lappend lRange [list $iFirst $nRange] set iFirst $sep set nRange 1 } } lappend lRange [list $iFirst $nRange] set lRange } proc an_print_range_array {lRange} { set iFirstMax 0 set nRangeMax 0 foreach range $lRange { foreach {iFirst nRange} $range {} if {$iFirst > $iFirstMax} {set iFirstMax $iFirst} if {$nRange > $nRangeMax} {set nRangeMax $nRange} } if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"} if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"} puts -nonewline " " puts [string trim { /* Each unsigned integer in the following array corresponds to a contiguous ** range of unicode codepoints that are not either letters or numbers (i.e. ** codepoints for which this function should return 0). ** ** The most significant 22 bits in each 32-bit value contain the first ** codepoint in the range. The least significant 10 bits are used to store ** the size of the range (always at least 1). In other words, the value ** ((C<<22) + N) represents a range of N codepoints starting with codepoint ** C. It is not possible to represent a range larger than 1023 codepoints ** using this format. */ }] puts -nonewline " static const unsigned int aEntry\[\] = \{" set i 0 foreach range $lRange { foreach {iFirst nRange} $range {} set u32 [format "0x%08X" [expr ($iFirst<<10) + $nRange]] if {($i % 5)==0} {puts "" ; puts -nonewline " "} puts -nonewline " $u32," incr i } puts "" puts " \};" } proc an_print_ascii_bitmap {lRange} { foreach range $lRange { foreach {iFirst nRange} $range {} for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} { if {$i<=127} { set a($i) 1 } } } set aAscii [list 0 0 0 0] foreach key [array names a] { set idx [expr $key >> 5] lset aAscii $idx [expr [lindex $aAscii $idx] | (1 << ($key&0x001F))] } puts " static const unsigned int aAscii\[4\] = \{" puts -nonewline " " foreach v $aAscii { puts -nonewline [format " 0x%08X," $v] } puts "" puts " \};" } proc print_isalnum {zFunc lRange} { puts "/*" puts "** Return true if the argument corresponds to a unicode codepoint" puts "** classified as either a letter or a number. Otherwise false." puts "**" puts "** The results are undefined if the value passed to this function" puts "** is less than zero." puts "*/" puts "int ${zFunc}\(int c)\{" an_print_range_array $lRange an_print_ascii_bitmap $lRange puts { if( (unsigned int)c<128 ){ return ( (aAscii[c >> 5] & ((unsigned int)1 << (c & 0x001F)))==0 ); }else if( (unsigned int)c<(1<<22) ){ unsigned int key = (((unsigned int)c)<<10) | 0x000003FF; int iRes = 0; int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; int iLo = 0; while( iHi>=iLo ){ int iTest = (iHi + iLo) / 2; if( key >= aEntry[iTest] ){ iRes = iTest; iLo = iTest+1; }else{ iHi = iTest-1; } } assert( aEntry[0]=aEntry[iRes] ); return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF))); } return 1;} puts "\}" } proc print_test_isalnum {zFunc lRange} { foreach range $lRange { foreach {iFirst nRange} $range {} for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} { set a($i) 1 } } puts "static int isalnum_test(int *piCode)\{" puts -nonewline " unsigned char aAlnum\[\] = \{" for {set i 0} {$i < 70000} {incr i} { if {($i % 32)==0} { puts "" ; puts -nonewline " " } set bFlag [expr ![info exists a($i)]] puts -nonewline "${bFlag}," } puts "" puts " \};" puts -nonewline " int aLargeSep\[\] = \{" set i 0 foreach iSep [lsort -integer [array names a]] { if {$iSep<70000} continue if {($i % 8)==0} { puts "" ; puts -nonewline " " } puts -nonewline " $iSep," incr i } puts "" puts " \};" puts -nonewline " int aLargeOther\[\] = \{" set i 0 foreach iSep [lsort -integer [array names a]] { if {$iSep<70000} continue if {[info exists a([expr $iSep-1])]==0} { if {($i % 8)==0} { puts "" ; puts -nonewline " " } puts -nonewline " [expr $iSep-1]," incr i } if {[info exists a([expr $iSep+1])]==0} { if {($i % 8)==0} { puts "" ; puts -nonewline " " } puts -nonewline " [expr $iSep+1]," incr i } } puts "" puts " \};" puts [subst -nocommands { int i; for(i=0; i>1]) & 0xFFFF). ** ** The contents of this array are generated by parsing the CaseFolding.txt ** file distributed as part of the "Unicode Character Database". See ** http://www.unicode.org for details. */ }] puts " static const struct TableEntry \{" puts " unsigned short iCode;" puts " unsigned char flags;" puts " unsigned char nRange;" puts " \} aEntry\[\] = \{" } proc tl_print_table_entry {togglevar entry liOff} { upvar $togglevar t foreach {iFirst nIncr nRange nOff} $entry {} if {$iFirst > (1<<16)} { return 1 } if {[info exists t]==0} {set t 0} if {$t==0} { puts -nonewline " " } set flags 0 if {$nIncr==2} { set flags 1 ; set nRange [expr $nRange * 2]} if {$nOff<0} { incr nOff [expr (1<<16)] } set idx [lsearch $liOff $nOff] if {$idx<0} {error "malfunction generating aiOff"} set flags [expr $flags + $idx*2] set txt "{$iFirst, $flags, $nRange}," if {$t==2} { puts $txt } else { puts -nonewline [format "% -23s" $txt] } set t [expr ($t+1)%3] return 0 } proc tl_print_table_footer {togglevar} { upvar $togglevar t if {$t!=0} {puts ""} puts " \};" } proc tl_print_if_entry {entry} { foreach {iFirst nIncr nRange nOff} $entry {} if {$nIncr==2} {error "tl_print_if_entry needs improvement!"} puts " else if( c>=$iFirst && c<[expr $iFirst+$nRange] )\{" puts " ret = c + $nOff;" puts " \}" } proc tl_generate_ioff_table {lRecord} { foreach entry $lRecord { foreach {iFirst nIncr nRange iOff} $entry {} if {$iOff<0} { incr iOff [expr (1<<16)] } if {[info exists a($iOff)]} continue set a($iOff) 1 } set liOff [lsort -integer [array names a]] if {[llength $liOff]>128} { error "Too many distinct ioffs" } return $liOff } proc tl_print_ioff_table {liOff} { puts -nonewline " static const unsigned short aiOff\[\] = \{" set i 0 foreach off $liOff { if {($i % 8)==0} {puts "" ; puts -nonewline " "} puts -nonewline [format "% -7s" "$off,"] incr i } puts "" puts " \};" } proc print_fold {zFunc} { set lRecord [tl_create_records] set lHigh [list] puts "/*" puts "** Interpret the argument as a unicode codepoint. If the codepoint" puts "** is an upper case character that has a lower case equivalent," puts "** return the codepoint corresponding to the lower case version." puts "** Otherwise, return a copy of the argument." puts "**" puts "** The results are undefined if the value passed to this function" puts "** is less than zero." puts "*/" puts "int ${zFunc}\(int c, int eRemoveDiacritic)\{" set liOff [tl_generate_ioff_table $lRecord] tl_print_table_header foreach entry $lRecord { if {[tl_print_table_entry toggle $entry $liOff]} { lappend lHigh $entry } } tl_print_table_footer toggle tl_print_ioff_table $liOff puts [subst -nocommands { int ret = c; assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 ); if( c<128 ){ if( c>='A' && c<='Z' ) ret = c + ('a' - 'A'); }else if( c<65536 ){ const struct TableEntry *p; int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; int iLo = 0; int iRes = -1; assert( c>aEntry[0].iCode ); while( iHi>=iLo ){ int iTest = (iHi + iLo) / 2; int cmp = (c - aEntry[iTest].iCode); if( cmp>=0 ){ iRes = iTest; iLo = iTest+1; }else{ iHi = iTest-1; } } assert( iRes>=0 && c>=aEntry[iRes].iCode ); p = &aEntry[iRes]; if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; assert( ret>0 ); } if( eRemoveDiacritic ){ ret = ${::remove_diacritic}(ret, eRemoveDiacritic==2); } } }] foreach entry $lHigh { tl_print_if_entry $entry } puts "" puts " return ret;" puts "\}" } proc code {txt} { set txt [string trimright $txt] set txt [string trimleft $txt "\n"] set n [expr {[string length $txt] - [string length [string trim $txt]]}] set ret "" foreach L [split $txt "\n"] { append ret "[string range $L $n end]\n" } return [uplevel "subst -nocommands {$ret}"] } proc intarray {lInt} { set ret "" set n [llength $lInt] for {set i 0} {$i < $n} {incr i 10} { append ret "\n " foreach int [lrange $lInt $i [expr $i+9]] { append ret [format "%-7s" "$int, "] } } append ret "\n " set ret } proc categories_switch {Cvar first lSecond} { upvar $Cvar C set ret "" append ret "case '$first':\n" append ret " switch( zCat\[1\] ){\n" foreach s $lSecond { append ret " case '$s': aArray\[$C($first$s)\] = 1; break;\n" } append ret " case '*': \n" foreach s $lSecond { append ret " aArray\[$C($first$s)\] = 1;\n" } append ret " break;\n" append ret " default: return 1;" append ret " }\n" append ret " break;\n" } # Argument is a list. Each element of which is itself a list of two elements: # # * the codepoint # * the category # # List elements are sorted in order of codepoint. # proc print_categories {lMap} { set categories { Cc Cf Cn Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs LC Co } for {set i 0} {$i < [llength $categories]} {incr i} { set C([lindex $categories $i]) [expr 1+$i] } set caseC [categories_switch C C {c f n s o}] set caseL [categories_switch C L {l m o t u C}] set caseM [categories_switch C M {c e n}] set caseN [categories_switch C N {d l o}] set caseP [categories_switch C P {c d e f i o s}] set caseS [categories_switch C S {c k m o}] set caseZ [categories_switch C Z {l p s}] set nCat [expr [llength [array names C]] + 1] puts [code { int sqlite3Fts5UnicodeCatParse(const char *zCat, u8 *aArray){ aArray[0] = 1; switch( zCat[0] ){ $caseC $caseL $caseM $caseN $caseP $caseS $caseZ } return 0; } }] set nRepeat 0 set first [lindex $lMap 0 0] set class [lindex $lMap 0 1] set prev -1 set CASE(0) "Lu" set CASE(1) "Ll" foreach m $lMap { foreach {codepoint cl} $m {} set codepoint [expr "0x$codepoint"] if {$codepoint>=(1<<20)} continue set bNew 0 if {$codepoint!=($prev+1)} { set bNew 1 } elseif { $cl==$class || ($class=="LC" && $cl==$CASE([expr $nRepeat & 0x01])) } { incr nRepeat } elseif {$class=="Lu" && $nRepeat==1 && $cl=="Ll"} { set class LC incr nRepeat } else { set bNew 1 } if {$bNew} { lappend lEntries [list $first $class $nRepeat] set nRepeat 1 set first $codepoint set class $cl } set prev $codepoint } if {$nRepeat>0} { lappend lEntries [list $first $class $nRepeat] } set aBlock [list 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] set aMap [list] foreach e $lEntries { foreach {cp class nRepeat} $e {} set block [expr ($cp>>16)] if {$block>0 && [lindex $aBlock $block]==0} { for {set i 1} {$i<=$block} {incr i} { if {[lindex $aBlock $i]==0} { lset aBlock $i [llength $aMap] } } } lappend aMap [expr {$cp & 0xFFFF}] lappend aData [expr {($nRepeat << 5) + $C($class)}] } for {set i 1} {$i<[llength $aBlock]} {incr i} { if {[lindex $aBlock $i]==0} { lset aBlock $i [llength $aMap] } } set aBlockArray [intarray $aBlock] set aMapArray [intarray $aMap] set aDataArray [intarray $aData] puts [code { static u16 aFts5UnicodeBlock[] = {$aBlockArray}; static u16 aFts5UnicodeMap[] = {$aMapArray}; static u16 aFts5UnicodeData[] = {$aDataArray}; int sqlite3Fts5UnicodeCategory(u32 iCode) { int iRes = -1; int iHi; int iLo; int ret; u16 iKey; if( iCode>=(1<<20) ){ return 0; } iLo = aFts5UnicodeBlock[(iCode>>16)]; iHi = aFts5UnicodeBlock[1+(iCode>>16)]; iKey = (iCode & 0xFFFF); while( iHi>iLo ){ int iTest = (iHi + iLo) / 2; assert( iTest>=iLo && iTest=aFts5UnicodeMap[iTest] ){ iRes = iTest; iLo = iTest+1; }else{ iHi = iTest; } } if( iRes<0 ) return 0; if( iKey>=(aFts5UnicodeMap[iRes]+(aFts5UnicodeData[iRes]>>5)) ) return 0; ret = aFts5UnicodeData[iRes] & 0x1F; if( ret!=$C(LC) ) return ret; return ((iKey - aFts5UnicodeMap[iRes]) & 0x01) ? $C(Ll) : $C(Lu); } void sqlite3Fts5UnicodeAscii(u8 *aArray, u8 *aAscii){ int i = 0; int iTbl = 0; while( i<128 ){ int bToken = aArray[ aFts5UnicodeData[iTbl] & 0x1F ]; int n = (aFts5UnicodeData[iTbl] >> 5) + i; for(; i<128 && i" puts "" } proc print_test_main {} { puts "" puts "#include " puts "" puts "int main(int argc, char **argv)\{" puts " int r1, r2, r3;" puts " int code;" puts " r3 = 0;" puts " r1 = isalnum_test(&code);" puts " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);" puts " else printf(\"isalnum(): test passed\\n\");" puts " r2 = fold_test(&code);" puts " if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);" puts " else printf(\"fold(): test passed\\n\");" if {$::generate_fts5_code} { puts " r3 = categories_test(&code);" puts " if( r3 ) printf(\"categories(): Problem with code %d\\n\",code);" puts " else printf(\"categories(): test passed\\n\");" } puts " return (r1 || r2 || r3);" puts "\}" } # Proces the command line arguments. Exit early if they are not to # our liking. # proc usage {} { puts -nonewline stderr "Usage: $::argv0 ?-test? ?-fts5? " puts stderr " " exit 1 } if {[llength $argv]<2} usage set unicodedata.txt [lindex $argv end] set casefolding.txt [lindex $argv end-1] set remove_diacritic remove_diacritic set generate_test_code 0 set generate_fts5_code 0 set function_prefix "sqlite3Fts" for {set i 0} {$i < [llength $argv]-2} {incr i} { switch -- [lindex $argv $i] { -test { set generate_test_code 1 } -fts5 { set function_prefix sqlite3Fts5 set generate_fts5_code 1 set remove_diacritic fts5_remove_diacritic } default { usage } } } print_fileheader if {$::generate_test_code} { puts "typedef unsigned short int u16;" puts "typedef unsigned char u8;" puts "#include " } # Print the isalnum() function to stdout. # set lRange [an_load_separator_ranges] if {$generate_fts5_code==0} { print_isalnum ${function_prefix}UnicodeIsalnum $lRange } # Leave a gap between the two generated C functions. # puts "" puts "" # Load the fold data. This is used by the [rd_XXX] commands # as well as [print_fold]. tl_load_casefolding_txt ${casefolding.txt} set mappings [rd_load_unicodedata_text ${unicodedata.txt}] print_rd $mappings puts "" puts "" print_isdiacritic ${function_prefix}UnicodeIsdiacritic $mappings puts "" puts "" # Print the fold() function to stdout. # print_fold ${function_prefix}UnicodeFold if {$generate_fts5_code} { puts "" puts "" print_categories [cc_load_unicodedata_text ${unicodedata.txt}] } # Print the test routines and main() function to stdout, if -test # was specified. # if {$::generate_test_code} { if {$generate_fts5_code==0} { print_test_isalnum ${function_prefix}UnicodeIsalnum $lRange } print_fold_test ${function_prefix}UnicodeFold $mappings print_test_categories [cc_load_unicodedata_text ${unicodedata.txt}] print_test_main } if {$generate_fts5_code} { # no-op } else { puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */" puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */" }