diff options
Diffstat (limited to 'ext/fts5/test/fts5_common.tcl')
-rw-r--r-- | ext/fts5/test/fts5_common.tcl | 647 |
1 files changed, 647 insertions, 0 deletions
diff --git a/ext/fts5/test/fts5_common.tcl b/ext/fts5/test/fts5_common.tcl new file mode 100644 index 0000000..0f371dc --- /dev/null +++ b/ext/fts5/test/fts5_common.tcl @@ -0,0 +1,647 @@ +# 2014 Dec 19 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# + +if {![info exists testdir]} { + set testdir [file join [file dirname [info script]] .. .. .. test] +} +source $testdir/tester.tcl + +ifcapable !fts5 { + proc return_if_no_fts5 {} { + finish_test + return -code return + } + return +} else { + proc return_if_no_fts5 {} {} +} + +catch { + sqlite3_fts5_may_be_corrupt 0 + reset_db +} + +proc fts5_test_poslist {cmd} { + set res [list] + for {set i 0} {$i < [$cmd xInstCount]} {incr i} { + lappend res [string map {{ } .} [$cmd xInst $i]] + } + set res +} + +proc fts5_test_poslist2 {cmd} { + set res [list] + + for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} { + $cmd xPhraseForeach $i c o { + lappend res $i.$c.$o + } + } + + #set res + sort_poslist $res +} + +proc fts5_test_collist {cmd} { + set res [list] + + for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} { + $cmd xPhraseColumnForeach $i c { lappend res $i.$c } + } + + set res +} + +proc fts5_test_columnsize {cmd} { + set res [list] + for {set i 0} {$i < [$cmd xColumnCount]} {incr i} { + lappend res [$cmd xColumnSize $i] + } + set res +} + +proc fts5_test_columntext {cmd} { + set res [list] + for {set i 0} {$i < [$cmd xColumnCount]} {incr i} { + lappend res [$cmd xColumnText $i] + } + set res +} + +proc fts5_test_columntotalsize {cmd} { + set res [list] + for {set i 0} {$i < [$cmd xColumnCount]} {incr i} { + lappend res [$cmd xColumnTotalSize $i] + } + set res +} + +proc test_append_token {varname token iStart iEnd} { + upvar $varname var + lappend var $token + return "SQLITE_OK" +} +proc fts5_test_tokenize {cmd} { + set res [list] + for {set i 0} {$i < [$cmd xColumnCount]} {incr i} { + set tokens [list] + $cmd xTokenize [$cmd xColumnText $i] [list test_append_token tokens] + lappend res $tokens + } + set res +} + +proc fts5_test_rowcount {cmd} { + $cmd xRowCount +} + +proc test_queryphrase_cb {cnt cmd} { + upvar $cnt L + for {set i 0} {$i < [$cmd xInstCount]} {incr i} { + foreach {ip ic io} [$cmd xInst $i] break + set A($ic) 1 + } + foreach ic [array names A] { + lset L $ic [expr {[lindex $L $ic] + 1}] + } +} +proc fts5_test_queryphrase {cmd} { + set res [list] + for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} { + set cnt [list] + for {set j 0} {$j < [$cmd xColumnCount]} {incr j} { lappend cnt 0 } + $cmd xQueryPhrase $i [list test_queryphrase_cb cnt] + lappend res $cnt + } + set res +} + +proc fts5_test_phrasecount {cmd} { + $cmd xPhraseCount +} + +proc fts5_test_all {cmd} { + set res [list] + lappend res columnsize [fts5_test_columnsize $cmd] + lappend res columntext [fts5_test_columntext $cmd] + lappend res columntotalsize [fts5_test_columntotalsize $cmd] + lappend res poslist [fts5_test_poslist $cmd] + lappend res tokenize [fts5_test_tokenize $cmd] + lappend res rowcount [fts5_test_rowcount $cmd] + set res +} + +proc fts5_aux_test_functions {db} { + foreach f { + fts5_test_columnsize + fts5_test_columntext + fts5_test_columntotalsize + fts5_test_poslist + fts5_test_poslist2 + fts5_test_collist + fts5_test_tokenize + fts5_test_rowcount + fts5_test_all + + fts5_test_queryphrase + fts5_test_phrasecount + } { + sqlite3_fts5_create_function $db $f $f + } +} + +proc fts5_segcount {tbl} { + set N 0 + foreach n [fts5_level_segs $tbl] { incr N $n } + set N +} + +proc fts5_level_segs {tbl} { + set sql "SELECT fts5_decode(rowid,block) aS r FROM ${tbl}_data WHERE rowid=10" + set ret [list] + foreach L [lrange [db one $sql] 1 end] { + lappend ret [expr [llength $L] - 3] + } + set ret +} + +proc fts5_level_segids {tbl} { + set sql "SELECT fts5_decode(rowid,block) aS r FROM ${tbl}_data WHERE rowid=10" + set ret [list] + foreach L [lrange [db one $sql] 1 end] { + set lvl [list] + foreach S [lrange $L 3 end] { + regexp {id=([1234567890]*)} $S -> segid + lappend lvl $segid + } + lappend ret $lvl + } + set ret +} + +proc fts5_rnddoc {n} { + set map [list 0 a 1 b 2 c 3 d 4 e 5 f 6 g 7 h 8 i 9 j] + set doc [list] + for {set i 0} {$i < $n} {incr i} { + lappend doc "x[string map $map [format %.3d [expr int(rand()*1000)]]]" + } + set doc +} + +#------------------------------------------------------------------------- +# Usage: +# +# nearset aCol ?-pc VARNAME? ?-near N? ?-col C? -- phrase1 phrase2... +# +# This command is used to test if a document (set of column values) matches +# the logical equivalent of a single FTS5 NEAR() clump and, if so, return +# the equivalent of an FTS5 position list. +# +# Parameter $aCol is passed a list of the column values for the document +# to test. Parameters $phrase1 and so on are the phrases. +# +# The result is a list of phrase hits. Each phrase hit is formatted as +# three integers separated by "." characters, in the following format: +# +# <phrase number> . <column number> . <token offset> +# +# Options: +# +# -near N (NEAR distance. Default 10) +# -col C (List of column indexes to match against) +# -pc VARNAME (variable in caller frame to use for phrase numbering) +# -dict VARNAME (array in caller frame to use for synonyms) +# +proc nearset {aCol args} { + + # Process the command line options. + # + set O(-near) 10 + set O(-col) {} + set O(-pc) "" + set O(-dict) "" + + set nOpt [lsearch -exact $args --] + if {$nOpt<0} { error "no -- option" } + + # Set $lPhrase to be a list of phrases. $nPhrase its length. + set lPhrase [lrange $args [expr $nOpt+1] end] + set nPhrase [llength $lPhrase] + + foreach {k v} [lrange $args 0 [expr $nOpt-1]] { + if {[info exists O($k)]==0} { error "unrecognized option $k" } + set O($k) $v + } + + if {$O(-pc) == ""} { + set counter 0 + } else { + upvar $O(-pc) counter + } + + if {$O(-dict)!=""} { upvar $O(-dict) aDict } + + for {set j 0} {$j < [llength $aCol]} {incr j} { + for {set i 0} {$i < $nPhrase} {incr i} { + set A($j,$i) [list] + } + } + + # Loop through each column of the current row. + for {set iCol 0} {$iCol < [llength $aCol]} {incr iCol} { + + # If there is a column filter, test whether this column is excluded. If + # so, skip to the next iteration of this loop. Otherwise, set zCol to the + # column value and nToken to the number of tokens that comprise it. + if {$O(-col)!="" && [lsearch $O(-col) $iCol]<0} continue + set zCol [lindex $aCol $iCol] + set nToken [llength $zCol] + + # Each iteration of the following loop searches a substring of the + # column value for phrase matches. The last token of the substring + # is token $iLast of the column value. The first token is: + # + # iFirst = ($iLast - $O(-near) - 1) + # + # where $sz is the length of the phrase being searched for. A phrase + # counts as matching the substring if its first token lies on or before + # $iLast and its last token on or after $iFirst. + # + # For example, if the query is "NEAR(a+b c, 2)" and the column value: + # + # "x x x x A B x x C x" + # 0 1 2 3 4 5 6 7 8 9" + # + # when (iLast==8 && iFirst=5) the range will contain both phrases and + # so both instances can be added to the output poslists. + # + set iLast [expr $O(-near) >= $nToken ? $nToken - 1 : $O(-near)] + for { } {$iLast < $nToken} {incr iLast} { + + catch { array unset B } + + for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} { + set p [lindex $lPhrase $iPhrase] + set nPm1 [expr {[llength $p] - 1}] + set iFirst [expr $iLast - $O(-near) - [llength $p]] + + for {set i $iFirst} {$i <= $iLast} {incr i} { + set lCand [lrange $zCol $i [expr $i+$nPm1]] + set bMatch 1 + foreach tok $p term $lCand { + if {[nearset_match aDict $tok $term]==0} { set bMatch 0 ; break } + } + if {$bMatch} { lappend B($iPhrase) $i } + } + + if {![info exists B($iPhrase)]} break + } + + if {$iPhrase==$nPhrase} { + for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} { + set A($iCol,$iPhrase) [concat $A($iCol,$iPhrase) $B($iPhrase)] + set A($iCol,$iPhrase) [lsort -integer -uniq $A($iCol,$iPhrase)] + } + } + } + } + + set res [list] + #puts [array names A] + + for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} { + for {set iCol 0} {$iCol < [llength $aCol]} {incr iCol} { + foreach a $A($iCol,$iPhrase) { + lappend res "$counter.$iCol.$a" + } + } + incr counter + } + + #puts "$aCol -> $res" + sort_poslist $res +} + +proc nearset_match {aDictVar tok term} { + if {[string match $tok $term]} { return 1 } + + upvar $aDictVar aDict + if {[info exists aDict($tok)]} { + foreach s $aDict($tok) { + if {[string match $s $term]} { return 1 } + } + } + return 0; +} + +#------------------------------------------------------------------------- +# Usage: +# +# sort_poslist LIST +# +# Sort a position list of the type returned by command [nearset] +# +proc sort_poslist {L} { + lsort -command instcompare $L +} +proc instcompare {lhs rhs} { + foreach {p1 c1 o1} [split $lhs .] {} + foreach {p2 c2 o2} [split $rhs .] {} + + set res [expr $c1 - $c2] + if {$res==0} { set res [expr $o1 - $o2] } + if {$res==0} { set res [expr $p1 - $p2] } + + return $res +} + +#------------------------------------------------------------------------- +# Logical operators used by the commands returned by fts5_tcl_expr(). +# +proc AND {args} { + foreach a $args { + if {[llength $a]==0} { return [list] } + } + sort_poslist [concat {*}$args] +} +proc OR {args} { + sort_poslist [concat {*}$args] +} +proc NOT {a b} { + if {[llength $b]>0} { return [list] } + return $a +} + +#------------------------------------------------------------------------- +# This command is similar to [split], except that it also provides the +# start and end offsets of each token. For example: +# +# [fts5_tokenize_split "abc d ef"] -> {abc 0 3 d 4 5 ef 6 8} +# + +proc gobble_whitespace {textvar} { + upvar $textvar t + regexp {([ ]*)(.*)} $t -> space t + return [string length $space] +} + +proc gobble_text {textvar wordvar} { + upvar $textvar t + upvar $wordvar w + regexp {([^ ]*)(.*)} $t -> w t + return [string length $w] +} + +proc fts5_tokenize_split {text} { + set token "" + set ret [list] + set iOff [gobble_whitespace text] + while {[set nToken [gobble_text text word]]} { + lappend ret $word $iOff [expr $iOff+$nToken] + incr iOff $nToken + incr iOff [gobble_whitespace text] + } + + set ret +} + +#------------------------------------------------------------------------- +# +proc foreach_detail_mode {prefix script} { + set saved $::testprefix + foreach d [list full col none] { + set s [string map [list %DETAIL% $d] $script] + set ::detail $d + set ::testprefix "$prefix-$d" + reset_db + uplevel $s + unset ::detail + } + set ::testprefix $saved +} + +proc detail_check {} { + if {$::detail != "none" && $::detail!="full" && $::detail!="col"} { + error "not in foreach_detail_mode {...} block" + } +} +proc detail_is_none {} { detail_check ; expr {$::detail == "none"} } +proc detail_is_col {} { detail_check ; expr {$::detail == "col" } } +proc detail_is_full {} { detail_check ; expr {$::detail == "full"} } + + +#------------------------------------------------------------------------- +# Convert a poslist of the type returned by fts5_test_poslist() to a +# collist as returned by fts5_test_collist(). +# +proc fts5_poslist2collist {poslist} { + set res [list] + foreach h $poslist { + regexp {(.*)\.[1234567890]+} $h -> cand + lappend res $cand + } + set res [lsort -command fts5_collist_elem_compare -unique $res] + return $res +} + +# Comparison function used by fts5_poslist2collist to sort collist entries. +proc fts5_collist_elem_compare {a b} { + foreach {a1 a2} [split $a .] {} + foreach {b1 b2} [split $b .] {} + + if {$a1==$b1} { return [expr $a2 - $b2] } + return [expr $a1 - $b1] +} + + +#-------------------------------------------------------------------------- +# Construct and return a tcl list equivalent to that returned by the SQL +# query executed against database handle [db]: +# +# SELECT +# rowid, +# fts5_test_poslist($tbl), +# fts5_test_collist($tbl) +# FROM $tbl('$expr') +# ORDER BY rowid $order; +# +proc fts5_query_data {expr tbl {order ASC} {aDictVar ""}} { + + # Figure out the set of columns in the FTS5 table. This routine does + # not handle tables with UNINDEXED columns, but if it did, it would + # have to be here. + db eval "PRAGMA table_info = $tbl" x { lappend lCols $x(name) } + + set d "" + if {$aDictVar != ""} { + upvar $aDictVar aDict + set d aDict + } + + set cols "" + foreach e $lCols { append cols ", '$e'" } + set tclexpr [db one [subst -novar { + SELECT fts5_expr_tcl( $expr, 'nearset $cols -dict $d -pc ::pc' [set cols] ) + }]] + + set res [list] + db eval "SELECT rowid, * FROM $tbl ORDER BY rowid $order" x { + set cols [list] + foreach col $lCols { lappend cols $x($col) } + + set ::pc 0 + set rowdata [eval $tclexpr] + if {$rowdata != ""} { + lappend res $x(rowid) $rowdata [fts5_poslist2collist $rowdata] + } + } + + set res +} + +#------------------------------------------------------------------------- +# Similar to [fts5_query_data], but omit the collist field. +# +proc fts5_poslist_data {expr tbl {order ASC} {aDictVar ""}} { + set res [list] + + if {$aDictVar!=""} { + upvar $aDictVar aDict + set dict aDict + } else { + set dict "" + } + + foreach {rowid poslist collist} [fts5_query_data $expr $tbl $order $dict] { + lappend res $rowid $poslist + } + set res +} + +proc fts5_collist_data {expr tbl {order ASC} {aDictVar ""}} { + set res [list] + + if {$aDictVar!=""} { + upvar $aDictVar aDict + set dict aDict + } else { + set dict "" + } + + foreach {rowid poslist collist} [fts5_query_data $expr $tbl $order $dict] { + lappend res $rowid $collist + } + set res +} + +#------------------------------------------------------------------------- +# + +# This command will only work inside a [foreach_detail_mode] block. It tests +# whether or not expression $expr run on FTS5 table $tbl is supported by +# the current mode. If so, 1 is returned. If not, 0. +# +# detail=full (all queries supported) +# detail=col (all but phrase queries and NEAR queries) +# detail=none (all but phrase queries, NEAR queries, and column filters) +# +proc fts5_expr_ok {expr tbl} { + + if {![detail_is_full]} { + set nearset "nearset_rc" + if {[detail_is_col]} { set nearset "nearset_rf" } + + set ::expr_not_ok 0 + db eval "PRAGMA table_info = $tbl" x { lappend lCols $x(name) } + + set cols "" + foreach e $lCols { append cols ", '$e'" } + set ::pc 0 + set tclexpr [db one [subst -novar { + SELECT fts5_expr_tcl( $expr, '[set nearset] $cols -pc ::pc' [set cols] ) + }]] + eval $tclexpr + if {$::expr_not_ok} { return 0 } + } + + return 1 +} + +# Helper for [fts5_expr_ok] +proc nearset_rf {aCol args} { + set idx [lsearch -exact $args --] + if {$idx != [llength $args]-2 || [llength [lindex $args end]]!=1} { + set ::expr_not_ok 1 + } + list +} + +# Helper for [fts5_expr_ok] +proc nearset_rc {aCol args} { + nearset_rf $aCol {*}$args + if {[lsearch $args -col]>=0} { + set ::expr_not_ok 1 + } + list +} + + +#------------------------------------------------------------------------- +# Code for a simple Tcl tokenizer that supports synonyms at query time. +# +proc tclnum_tokenize {mode tflags text} { + foreach {w iStart iEnd} [fts5_tokenize_split $text] { + sqlite3_fts5_token $w $iStart $iEnd + if {$tflags == $mode && [info exists ::tclnum_syn($w)]} { + foreach s $::tclnum_syn($w) { sqlite3_fts5_token -colo $s $iStart $iEnd } + } + } +} + +proc tclnum_create {args} { + set mode query + if {[llength $args]} { + set mode [lindex $args 0] + } + if {$mode != "query" && $mode != "document"} { error "bad mode: $mode" } + return [list tclnum_tokenize $mode] +} + +proc fts5_tclnum_register {db} { + foreach SYNDICT { + {zero 0} + {one 1 i} + {two 2 ii} + {three 3 iii} + {four 4 iv} + {five 5 v} + {six 6 vi} + {seven 7 vii} + {eight 8 viii} + {nine 9 ix} + + {a1 a2 a3 a4 a5 a6 a7 a8 a9} + {b1 b2 b3 b4 b5 b6 b7 b8 b9} + {c1 c2 c3 c4 c5 c6 c7 c8 c9} + } { + foreach s $SYNDICT { + set o [list] + foreach x $SYNDICT {if {$x!=$s} {lappend o $x}} + set ::tclnum_syn($s) $o + } + } + sqlite3_fts5_create_tokenizer db tclnum tclnum_create +} +# +# End of tokenizer code. +#------------------------------------------------------------------------- + |