# 2014 Dec 19 # # The author disclaims copyright to this source code. In place of # a legal notice, here is a blessing: # # May you do good and not evil. # May you find forgiveness for yourself and forgive others. # May you share freely, never taking more than you give. # #*********************************************************************** # if {![info exists testdir]} { set testdir [file join [file dirname [info script]] .. .. .. test] } source $testdir/tester.tcl ifcapable !fts5 { proc return_if_no_fts5 {} { finish_test return -code return } return } else { proc return_if_no_fts5 {} {} } catch { sqlite3_fts5_may_be_corrupt 0 reset_db } proc fts5_test_poslist {cmd} { set res [list] for {set i 0} {$i < [$cmd xInstCount]} {incr i} { lappend res [string map {{ } .} [$cmd xInst $i]] } set res } proc fts5_test_poslist2 {cmd} { set res [list] for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} { $cmd xPhraseForeach $i c o { lappend res $i.$c.$o } } #set res sort_poslist $res } proc fts5_test_collist {cmd} { set res [list] for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} { $cmd xPhraseColumnForeach $i c { lappend res $i.$c } } set res } proc fts5_collist {cmd iPhrase} { set res [list] $cmd xPhraseColumnForeach $iPhrase c { lappend res $c } set res } proc fts5_test_columnsize {cmd} { set res [list] for {set i 0} {$i < [$cmd xColumnCount]} {incr i} { lappend res [$cmd xColumnSize $i] } set res } proc fts5_columntext {cmd iCol} { $cmd xColumnText $iCol } proc fts5_test_columntext {cmd} { set res [list] for {set i 0} {$i < [$cmd xColumnCount]} {incr i} { lappend res [$cmd xColumnText $i] } set res } proc fts5_test_columntotalsize {cmd} { set res [list] for {set i 0} {$i < [$cmd xColumnCount]} {incr i} { lappend res [$cmd xColumnTotalSize $i] } set res } proc test_append_token {varname token iStart iEnd} { upvar $varname var lappend var $token return "SQLITE_OK" } proc fts5_test_tokenize {cmd} { set res [list] for {set i 0} {$i < [$cmd xColumnCount]} {incr i} { set tokens [list] $cmd xTokenize [$cmd xColumnText $i] [list test_append_token tokens] lappend res $tokens } set res } proc fts5_test_rowcount {cmd} { $cmd xRowCount } proc test_queryphrase_cb {cnt cmd} { upvar $cnt L for {set i 0} {$i < [$cmd xInstCount]} {incr i} { foreach {ip ic io} [$cmd xInst $i] break set A($ic) 1 } foreach ic [array names A] { lset L $ic [expr {[lindex $L $ic] + 1}] } } proc fts5_test_queryphrase {cmd} { set res [list] for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} { set cnt [list] for {set j 0} {$j < [$cmd xColumnCount]} {incr j} { lappend cnt 0 } $cmd xQueryPhrase $i [list test_queryphrase_cb cnt] lappend res $cnt } set res } proc fts5_queryphrase {cmd iPhrase} { set cnt [list] for {set j 0} {$j < [$cmd xColumnCount]} {incr j} { lappend cnt 0 } $cmd xQueryPhrase $iPhrase [list test_queryphrase_cb cnt] set cnt } proc fts5_test_phrasecount {cmd} { $cmd xPhraseCount } proc fts5_test_all {cmd} { set res [list] lappend res columnsize [fts5_test_columnsize $cmd] lappend res columntext [fts5_test_columntext $cmd] lappend res columntotalsize [fts5_test_columntotalsize $cmd] lappend res poslist [fts5_test_poslist $cmd] lappend res tokenize [fts5_test_tokenize $cmd] lappend res rowcount [fts5_test_rowcount $cmd] set res } proc fts5_aux_test_functions {db} { foreach f { fts5_test_columnsize fts5_test_columntext fts5_test_columntotalsize fts5_test_poslist fts5_test_poslist2 fts5_test_collist fts5_test_tokenize fts5_test_rowcount fts5_test_all fts5_test_queryphrase fts5_test_phrasecount fts5_columntext fts5_queryphrase fts5_collist } { sqlite3_fts5_create_function $db $f $f } } proc fts5_segcount {tbl} { set N 0 foreach n [fts5_level_segs $tbl] { incr N $n } set N } proc fts5_level_segs {tbl} { set sql "SELECT fts5_decode(rowid,block) aS r FROM ${tbl}_data WHERE rowid=10" set ret [list] foreach L [lrange [db one $sql] 1 end] { lappend ret [expr [llength $L] - 3] } set ret } proc fts5_level_segids {tbl} { set sql "SELECT fts5_decode(rowid,block) aS r FROM ${tbl}_data WHERE rowid=10" set ret [list] foreach L [lrange [db one $sql] 1 end] { set lvl [list] foreach S [lrange $L 3 end] { regexp {id=([1234567890]*)} $S -> segid lappend lvl $segid } lappend ret $lvl } set ret } proc fts5_rnddoc {n} { set map [list 0 a 1 b 2 c 3 d 4 e 5 f 6 g 7 h 8 i 9 j] set doc [list] for {set i 0} {$i < $n} {incr i} { lappend doc "x[string map $map [format %.3d [expr int(rand()*1000)]]]" } set doc } #------------------------------------------------------------------------- # Usage: # # nearset aCol ?-pc VARNAME? ?-near N? ?-col C? -- phrase1 phrase2... # # This command is used to test if a document (set of column values) matches # the logical equivalent of a single FTS5 NEAR() clump and, if so, return # the equivalent of an FTS5 position list. # # Parameter $aCol is passed a list of the column values for the document # to test. Parameters $phrase1 and so on are the phrases. # # The result is a list of phrase hits. Each phrase hit is formatted as # three integers separated by "." characters, in the following format: # # . . # # Options: # # -near N (NEAR distance. Default 10) # -col C (List of column indexes to match against) # -pc VARNAME (variable in caller frame to use for phrase numbering) # -dict VARNAME (array in caller frame to use for synonyms) # proc nearset {aCol args} { # Process the command line options. # set O(-near) 10 set O(-col) {} set O(-pc) "" set O(-dict) "" set nOpt [lsearch -exact $args --] if {$nOpt<0} { error "no -- option" } # Set $lPhrase to be a list of phrases. $nPhrase its length. set lPhrase [lrange $args [expr $nOpt+1] end] set nPhrase [llength $lPhrase] foreach {k v} [lrange $args 0 [expr $nOpt-1]] { if {[info exists O($k)]==0} { error "unrecognized option $k" } set O($k) $v } if {$O(-pc) == ""} { set counter 0 } else { upvar $O(-pc) counter } if {$O(-dict)!=""} { upvar $O(-dict) aDict } for {set j 0} {$j < [llength $aCol]} {incr j} { for {set i 0} {$i < $nPhrase} {incr i} { set A($j,$i) [list] } } # Loop through each column of the current row. for {set iCol 0} {$iCol < [llength $aCol]} {incr iCol} { # If there is a column filter, test whether this column is excluded. If # so, skip to the next iteration of this loop. Otherwise, set zCol to the # column value and nToken to the number of tokens that comprise it. if {$O(-col)!="" && [lsearch $O(-col) $iCol]<0} continue set zCol [lindex $aCol $iCol] set nToken [llength $zCol] # Each iteration of the following loop searches a substring of the # column value for phrase matches. The last token of the substring # is token $iLast of the column value. The first token is: # # iFirst = ($iLast - $O(-near) - 1) # # where $sz is the length of the phrase being searched for. A phrase # counts as matching the substring if its first token lies on or before # $iLast and its last token on or after $iFirst. # # For example, if the query is "NEAR(a+b c, 2)" and the column value: # # "x x x x A B x x C x" # 0 1 2 3 4 5 6 7 8 9" # # when (iLast==8 && iFirst=5) the range will contain both phrases and # so both instances can be added to the output poslists. # set iLast [expr $O(-near) >= $nToken ? $nToken - 1 : $O(-near)] for { } {$iLast < $nToken} {incr iLast} { catch { array unset B } for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} { set p [lindex $lPhrase $iPhrase] set nPm1 [expr {[llength $p] - 1}] set iFirst [expr $iLast - $O(-near) - [llength $p]] for {set i $iFirst} {$i <= $iLast} {incr i} { set lCand [lrange $zCol $i [expr $i+$nPm1]] set bMatch 1 foreach tok $p term $lCand { if {[nearset_match aDict $tok $term]==0} { set bMatch 0 ; break } } if {$bMatch} { lappend B($iPhrase) $i } } if {![info exists B($iPhrase)]} break } if {$iPhrase==$nPhrase} { for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} { set A($iCol,$iPhrase) [concat $A($iCol,$iPhrase) $B($iPhrase)] set A($iCol,$iPhrase) [lsort -integer -uniq $A($iCol,$iPhrase)] } } } } set res [list] #puts [array names A] for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} { for {set iCol 0} {$iCol < [llength $aCol]} {incr iCol} { foreach a $A($iCol,$iPhrase) { lappend res "$counter.$iCol.$a" } } incr counter } #puts "$aCol -> $res" sort_poslist $res } proc nearset_match {aDictVar tok term} { if {[string match $tok $term]} { return 1 } upvar $aDictVar aDict if {[info exists aDict($tok)]} { foreach s $aDict($tok) { if {[string match $s $term]} { return 1 } } } return 0; } #------------------------------------------------------------------------- # Usage: # # sort_poslist LIST # # Sort a position list of the type returned by command [nearset] # proc sort_poslist {L} { lsort -command instcompare $L } proc instcompare {lhs rhs} { foreach {p1 c1 o1} [split $lhs .] {} foreach {p2 c2 o2} [split $rhs .] {} set res [expr $c1 - $c2] if {$res==0} { set res [expr $o1 - $o2] } if {$res==0} { set res [expr $p1 - $p2] } return $res } #------------------------------------------------------------------------- # Logical operators used by the commands returned by fts5_tcl_expr(). # proc AND {args} { foreach a $args { if {[llength $a]==0} { return [list] } } sort_poslist [concat {*}$args] } proc OR {args} { sort_poslist [concat {*}$args] } proc NOT {a b} { if {[llength $b]>0} { return [list] } return $a } #------------------------------------------------------------------------- # This command is similar to [split], except that it also provides the # start and end offsets of each token. For example: # # [fts5_tokenize_split "abc d ef"] -> {abc 0 3 d 4 5 ef 6 8} # proc gobble_whitespace {textvar} { upvar $textvar t regexp {([ ]*)(.*)} $t -> space t return [string length $space] } proc gobble_text {textvar wordvar} { upvar $textvar t upvar $wordvar w regexp {([^ ]*)(.*)} $t -> w t return [string length $w] } proc fts5_tokenize_split {text} { set token "" set ret [list] set iOff [gobble_whitespace text] while {[set nToken [gobble_text text word]]} { lappend ret $word $iOff [expr $iOff+$nToken] incr iOff $nToken incr iOff [gobble_whitespace text] } set ret } #------------------------------------------------------------------------- # proc foreach_detail_mode {prefix script} { set saved $::testprefix foreach d [list full col none] { set s [string map [list %DETAIL% $d] $script] set ::detail $d set ::testprefix "$prefix-$d" reset_db uplevel $s unset ::detail } set ::testprefix $saved } proc detail_check {} { if {$::detail != "none" && $::detail!="full" && $::detail!="col"} { error "not in foreach_detail_mode {...} block" } } proc detail_is_none {} { detail_check ; expr {$::detail == "none"} } proc detail_is_col {} { detail_check ; expr {$::detail == "col" } } proc detail_is_full {} { detail_check ; expr {$::detail == "full"} } proc foreach_tokenizer_mode {prefix script} { set saved $::testprefix foreach {d mapping} { "" {} "-origintext" {, tokenize="origintext unicode61", tokendata=1} } { set s [string map [list %TOKENIZER% $mapping] $script] set ::testprefix "$prefix$d" reset_db sqlite3_fts5_register_origintext db uplevel $s } set ::testprefix $saved } #------------------------------------------------------------------------- # Convert a poslist of the type returned by fts5_test_poslist() to a # collist as returned by fts5_test_collist(). # proc fts5_poslist2collist {poslist} { set res [list] foreach h $poslist { regexp {(.*)\.[1234567890]+} $h -> cand lappend res $cand } set res [lsort -command fts5_collist_elem_compare -unique $res] return $res } # Comparison function used by fts5_poslist2collist to sort collist entries. proc fts5_collist_elem_compare {a b} { foreach {a1 a2} [split $a .] {} foreach {b1 b2} [split $b .] {} if {$a1==$b1} { return [expr $a2 - $b2] } return [expr $a1 - $b1] } #-------------------------------------------------------------------------- # Construct and return a tcl list equivalent to that returned by the SQL # query executed against database handle [db]: # # SELECT # rowid, # fts5_test_poslist($tbl), # fts5_test_collist($tbl) # FROM $tbl('$expr') # ORDER BY rowid $order; # proc fts5_query_data {expr tbl {order ASC} {aDictVar ""}} { # Figure out the set of columns in the FTS5 table. This routine does # not handle tables with UNINDEXED columns, but if it did, it would # have to be here. db eval "PRAGMA table_info = $tbl" x { lappend lCols $x(name) } set d "" if {$aDictVar != ""} { upvar $aDictVar aDict set d aDict } set cols "" foreach e $lCols { append cols ", '$e'" } set tclexpr [db one [subst -novar { SELECT fts5_expr_tcl( $expr, 'nearset $cols -dict $d -pc ::pc' [set cols] ) }]] set res [list] db eval "SELECT rowid, * FROM $tbl ORDER BY rowid $order" x { set cols [list] foreach col $lCols { lappend cols $x($col) } set ::pc 0 set rowdata [eval $tclexpr] if {$rowdata != ""} { lappend res $x(rowid) $rowdata [fts5_poslist2collist $rowdata] } } set res } #------------------------------------------------------------------------- # Similar to [fts5_query_data], but omit the collist field. # proc fts5_poslist_data {expr tbl {order ASC} {aDictVar ""}} { set res [list] if {$aDictVar!=""} { upvar $aDictVar aDict set dict aDict } else { set dict "" } foreach {rowid poslist collist} [fts5_query_data $expr $tbl $order $dict] { lappend res $rowid $poslist } set res } proc fts5_collist_data {expr tbl {order ASC} {aDictVar ""}} { set res [list] if {$aDictVar!=""} { upvar $aDictVar aDict set dict aDict } else { set dict "" } foreach {rowid poslist collist} [fts5_query_data $expr $tbl $order $dict] { lappend res $rowid $collist } set res } #------------------------------------------------------------------------- # # This command will only work inside a [foreach_detail_mode] block. It tests # whether or not expression $expr run on FTS5 table $tbl is supported by # the current mode. If so, 1 is returned. If not, 0. # # detail=full (all queries supported) # detail=col (all but phrase queries and NEAR queries) # detail=none (all but phrase queries, NEAR queries, and column filters) # proc fts5_expr_ok {expr tbl} { if {![detail_is_full]} { set nearset "nearset_rc" if {[detail_is_col]} { set nearset "nearset_rf" } set ::expr_not_ok 0 db eval "PRAGMA table_info = $tbl" x { lappend lCols $x(name) } set cols "" foreach e $lCols { append cols ", '$e'" } set ::pc 0 set tclexpr [db one [subst -novar { SELECT fts5_expr_tcl( $expr, '[set nearset] $cols -pc ::pc' [set cols] ) }]] eval $tclexpr if {$::expr_not_ok} { return 0 } } return 1 } # Helper for [fts5_expr_ok] proc nearset_rf {aCol args} { set idx [lsearch -exact $args --] if {$idx != [llength $args]-2 || [llength [lindex $args end]]!=1} { set ::expr_not_ok 1 } list } # Helper for [fts5_expr_ok] proc nearset_rc {aCol args} { nearset_rf $aCol {*}$args if {[lsearch $args -col]>=0} { set ::expr_not_ok 1 } list } proc dump {tname} { execsql_pp "SELECT * FROM ${tname}_idx" execsql_pp "SELECT id, quote(block), fts5_decode(id,block) FROM ${tname}_data" } #------------------------------------------------------------------------- # Code for a simple Tcl tokenizer that supports synonyms at query time. # proc tclnum_tokenize {mode tflags text} { foreach {w iStart iEnd} [fts5_tokenize_split $text] { sqlite3_fts5_token $w $iStart $iEnd if {$tflags == $mode && [info exists ::tclnum_syn($w)]} { foreach s $::tclnum_syn($w) { sqlite3_fts5_token -colo $s $iStart $iEnd } } } } proc tclnum_create {args} { set mode query if {[llength $args]} { set mode [lindex $args 0] } if {$mode != "query" && $mode != "document"} { error "bad mode: $mode" } return [list tclnum_tokenize $mode] } proc fts5_tclnum_register {db} { foreach SYNDICT { {zero 0} {one 1 i} {two 2 ii} {three 3 iii} {four 4 iv} {five 5 v} {six 6 vi} {seven 7 vii} {eight 8 viii} {nine 9 ix} {a1 a2 a3 a4 a5 a6 a7 a8 a9} {b1 b2 b3 b4 b5 b6 b7 b8 b9} {c1 c2 c3 c4 c5 c6 c7 c8 c9} } { foreach s $SYNDICT { set o [list] foreach x $SYNDICT {if {$x!=$s} {lappend o $x}} set ::tclnum_syn($s) $o } } sqlite3_fts5_create_tokenizer db tclnum tclnum_create } # # End of tokenizer code. #-------------------------------------------------------------------------