diff options
Diffstat (limited to '')
-rw-r--r-- | test/fts3_common.tcl | 420 |
1 files changed, 420 insertions, 0 deletions
diff --git a/test/fts3_common.tcl b/test/fts3_common.tcl new file mode 100644 index 0000000..fcd3ca3 --- /dev/null +++ b/test/fts3_common.tcl @@ -0,0 +1,420 @@ +# 2009 November 04 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# +# This file contains common code used the fts3 tests. At one point +# equivalent functionality was implemented in C code. But it is easier +# to use Tcl. +# + +#------------------------------------------------------------------------- +# INSTRUCTIONS +# +# The following commands are available: +# +# fts3_build_db_1 N +# Using database handle [db] create an FTS4 table named t1 and populate +# it with N rows of data. N must be less than 10,000. Refer to the +# header comments above the proc implementation below for details. +# +# fts3_build_db_2 N +# Using database handle [db] create an FTS4 table named t2 and populate +# it with N rows of data. N must be less than 100,000. Refer to the +# header comments above the proc implementation below for details. +# +# fts3_integrity_check TBL +# TBL must be an FTS table in the database currently opened by handle +# [db]. This proc loads and tokenizes all documents within the table, +# then checks that the current contents of the FTS index matches the +# results. +# +# fts3_terms TBL WHERE +# Todo. +# +# fts3_doclist TBL TERM WHERE +# Todo. +# +# +# + +ifcapable fts3 { + sqlite3_fts3_may_be_corrupt 0 +} + +#------------------------------------------------------------------------- +# USAGE: fts3_build_db_1 SWITCHES N +# +# Build a sample FTS table in the database opened by database connection +# [db]. The name of the new table is "t1". +# +proc fts3_build_db_1 {args} { + + set default(-module) fts4 + + set nArg [llength $args] + if {($nArg%2)==0} { + error "wrong # args: should be \"fts3_build_db_1 ?switches? n\"" + } + + set n [lindex $args [expr $nArg-1]] + array set opts [array get default] + array set opts [lrange $args 0 [expr $nArg-2]] + foreach k [array names opts] { + if {0==[info exists default($k)]} { error "unknown option: $k" } + } + + if {$n > 10000} {error "n must be <= 10000"} + db eval "CREATE VIRTUAL TABLE t1 USING $opts(-module) (x, y)" + + set xwords [list zero one two three four five six seven eight nine ten] + set ywords [list alpha beta gamma delta epsilon zeta eta theta iota kappa] + + for {set i 0} {$i < $n} {incr i} { + set x "" + set y "" + + set x [list] + lappend x [lindex $xwords [expr ($i / 1000) % 10]] + lappend x [lindex $xwords [expr ($i / 100) % 10]] + lappend x [lindex $xwords [expr ($i / 10) % 10]] + lappend x [lindex $xwords [expr ($i / 1) % 10]] + + set y [list] + lappend y [lindex $ywords [expr ($i / 1000) % 10]] + lappend y [lindex $ywords [expr ($i / 100) % 10]] + lappend y [lindex $ywords [expr ($i / 10) % 10]] + lappend y [lindex $ywords [expr ($i / 1) % 10]] + + db eval { INSERT INTO t1(docid, x, y) VALUES($i, $x, $y) } + } +} + +#------------------------------------------------------------------------- +# USAGE: fts3_build_db_2 N ARGS +# +# Build a sample FTS table in the database opened by database connection +# [db]. The name of the new table is "t2". +# +proc fts3_build_db_2 {args} { + + set default(-module) fts4 + set default(-extra) "" + + set nArg [llength $args] + if {($nArg%2)==0} { + error "wrong # args: should be \"fts3_build_db_1 ?switches? n\"" + } + + set n [lindex $args [expr $nArg-1]] + array set opts [array get default] + array set opts [lrange $args 0 [expr $nArg-2]] + foreach k [array names opts] { + if {0==[info exists default($k)]} { error "unknown option: $k" } + } + + if {$n > 100000} {error "n must be <= 100000"} + + set sql "CREATE VIRTUAL TABLE t2 USING $opts(-module) (content" + if {$opts(-extra) != ""} { + append sql ", " $opts(-extra) + } + append sql ")" + db eval $sql + + set chars [list a b c d e f g h i j k l m n o p q r s t u v w x y z ""] + + for {set i 0} {$i < $n} {incr i} { + set word "" + set nChar [llength $chars] + append word [lindex $chars [expr {($i / 1) % $nChar}]] + append word [lindex $chars [expr {($i / $nChar) % $nChar}]] + append word [lindex $chars [expr {($i / ($nChar*$nChar)) % $nChar}]] + + db eval { INSERT INTO t2(docid, content) VALUES($i, $word) } + } +} + +#------------------------------------------------------------------------- +# USAGE: fts3_integrity_check TBL +# +# This proc is used to verify that the full-text index is consistent with +# the contents of the fts3 table. In other words, it checks that the +# data in the %_contents table matches that in the %_segdir and %_segments +# tables. +# +# This is not an efficient procedure. It uses a lot of memory and a lot +# of CPU. But it is better than not checking at all. +# +# The procedure is: +# +# 1) Read the entire full-text index from the %_segdir and %_segments +# tables into memory. For each entry in the index, the following is +# done: +# +# set C($iDocid,$iCol,$iPosition) $zTerm +# +# 2) Iterate through each column of each row of the %_content table. +# Tokenize all documents, and check that for each token there is +# a corresponding entry in the $C array. After checking a token, +# [unset] the $C array entry. +# +# 3) Check that array $C is now empty. +# +# +proc fts3_integrity_check {tbl} { + + fts3_read2 $tbl 1 A + + foreach zTerm [array names A] { + #puts $zTerm + foreach doclist $A($zTerm) { + set docid 0 + while {[string length $doclist]>0} { + set iCol 0 + set iPos 0 + set lPos [list] + set lCol [list] + + # First varint of a doclist-entry is the docid. Delta-compressed + # with respect to the docid of the previous entry. + # + incr docid [gobble_varint doclist] + if {[info exists D($zTerm,$docid)]} { + while {[set iDelta [gobble_varint doclist]] != 0} {} + continue + } + set D($zTerm,$docid) 1 + + # Gobble varints until the 0x00 that terminates the doclist-entry + # is found. + while {[set iDelta [gobble_varint doclist]] > 0} { + if {$iDelta == 1} { + set iCol [gobble_varint doclist] + set iPos 0 + } else { + incr iPos $iDelta + incr iPos -2 + set C($docid,$iCol,$iPos) $zTerm + } + } + } + } + } + + foreach key [array names C] { + #puts "$key -> $C($key)" + } + + + db eval "SELECT * FROM ${tbl}_content" E { + set iCol 0 + set iDoc $E(docid) + foreach col [lrange $E(*) 1 end] { + set c $E($col) + set sql {SELECT fts3_tokenizer_test('simple', $c)} + + foreach {pos term dummy} [db one $sql] { + if {![info exists C($iDoc,$iCol,$pos)]} { + set es "Error at docid=$iDoc col=$iCol pos=$pos. Index is missing" + lappend errors $es + } else { + if {[string compare $C($iDoc,$iCol,$pos) $term]} { + set es "Error at docid=$iDoc col=$iCol pos=$pos. Index " + append es "has \"$C($iDoc,$iCol,$pos)\", document has \"$term\"" + lappend errors $es + } + unset C($iDoc,$iCol,$pos) + } + } + incr iCol + } + } + + foreach c [array names C] { + lappend errors "Bad index entry: $c -> $C($c)" + } + + if {[info exists errors]} { return [join $errors "\n"] } + return "ok" +} + +# USAGE: fts3_terms TBL WHERE +# +# Argument TBL must be the name of an FTS3 table. Argument WHERE is an +# SQL expression that will be used as the WHERE clause when scanning +# the %_segdir table. As in the following query: +# +# "SELECT * FROM ${TBL}_segdir WHERE ${WHERE}" +# +# This function returns a list of all terms present in the segments +# selected by the statement above. +# +proc fts3_terms {tbl where} { + fts3_read $tbl $where a + return [lsort [array names a]] +} + + +# USAGE: fts3_doclist TBL TERM WHERE +# +# Argument TBL must be the name of an FTS3 table. TERM is a term that may +# or may not be present in the table. Argument WHERE is used to select a +# subset of the b-tree segments in the associated full-text index as +# described above for [fts3_terms]. +# +# This function returns the results of merging the doclists associated +# with TERM in the selected segments. Each doclist is an element of the +# returned list. Each doclist is formatted as follows: +# +# [$docid ?$col[$off1 $off2...]?...] +# +# The formatting is odd for a Tcl command in order to be compatible with +# the original C-language implementation. If argument WHERE is "1", then +# any empty doclists are omitted from the returned list. +# +proc fts3_doclist {tbl term where} { + fts3_read $tbl $where a + + + foreach doclist $a($term) { + set docid 0 + + while {[string length $doclist]>0} { + set iCol 0 + set iPos 0 + set lPos [list] + set lCol [list] + incr docid [gobble_varint doclist] + + while {[set iDelta [gobble_varint doclist]] > 0} { + if {$iDelta == 1} { + lappend lCol [list $iCol $lPos] + set iPos 0 + set lPos [list] + set iCol [gobble_varint doclist] + } else { + incr iPos $iDelta + incr iPos -2 + lappend lPos $iPos + } + } + + if {[llength $lPos]>0} { + lappend lCol [list $iCol $lPos] + } + + if {$where != "1" || [llength $lCol]>0} { + set ret($docid) $lCol + } else { + unset -nocomplain ret($docid) + } + } + } + + set lDoc [list] + foreach docid [lsort -integer [array names ret]] { + set lCol [list] + set cols "" + foreach col $ret($docid) { + foreach {iCol lPos} $col {} + append cols " $iCol\[[join $lPos { }]\]" + } + lappend lDoc "\[${docid}${cols}\]" + } + + join $lDoc " " +} + +########################################################################### + +proc gobble_varint {varname} { + upvar $varname blob + set n [read_fts3varint $blob ret] + set blob [string range $blob $n end] + return $ret +} +proc gobble_string {varname nLength} { + upvar $varname blob + set ret [string range $blob 0 [expr $nLength-1]] + set blob [string range $blob $nLength end] + return $ret +} + +# The argument is a blob of data representing an FTS3 segment leaf. +# Return a list consisting of alternating terms (strings) and doclists +# (blobs of data). +# +proc fts3_readleaf {blob} { + set zPrev "" + set terms [list] + + while {[string length $blob] > 0} { + set nPrefix [gobble_varint blob] + set nSuffix [gobble_varint blob] + + set zTerm [string range $zPrev 0 [expr $nPrefix-1]] + append zTerm [gobble_string blob $nSuffix] + set nDoclist [gobble_varint blob] + set doclist [gobble_string blob $nDoclist] + + lappend terms $zTerm $doclist + set zPrev $zTerm + } + + return $terms +} + +proc fts3_read2 {tbl where varname} { + upvar $varname a + array unset a + db eval " SELECT start_block, leaves_end_block, root + FROM ${tbl}_segdir WHERE $where + ORDER BY level ASC, idx DESC + " { + set c 0 + binary scan $root c c + if {$c==0} { + foreach {t d} [fts3_readleaf $root] { lappend a($t) $d } + } else { + db eval " SELECT block + FROM ${tbl}_segments + WHERE blockid>=$start_block AND blockid<=$leaves_end_block + ORDER BY blockid + " { + foreach {t d} [fts3_readleaf $block] { lappend a($t) $d } + } + } + } +} + +proc fts3_read {tbl where varname} { + upvar $varname a + array unset a + db eval " SELECT start_block, leaves_end_block, root + FROM ${tbl}_segdir WHERE $where + ORDER BY level DESC, idx ASC + " { + if {$start_block == 0} { + foreach {t d} [fts3_readleaf $root] { lappend a($t) $d } + } else { + db eval " SELECT block + FROM ${tbl}_segments + WHERE blockid>=$start_block AND blockid<$leaves_end_block + ORDER BY blockid + " { + foreach {t d} [fts3_readleaf $block] { lappend a($t) $d } + + } + } + } +} + +########################################################################## + |