From 18657a960e125336f704ea058e25c27bd3900dcb Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sun, 5 May 2024 19:28:19 +0200
Subject: Adding upstream version 3.40.1.

Signed-off-by: Daniel Baumann
---
 test/fts3rnd.test | 460 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 460 insertions(+)
 create mode 100644 test/fts3rnd.test

diff --git a/test/fts3rnd.test b/test/fts3rnd.test
new file mode 100644
index 0000000..97af549
--- /dev/null
+++ b/test/fts3rnd.test
@@ -0,0 +1,460 @@
# 2009 December 03
#
# May you do good and not evil.
# May you find forgiveness for yourself and forgive others.
# May you share freely, never taking more than you give.
#
#***********************************************************************
#
# Brute force (random data) tests for FTS3.
#

#-------------------------------------------------------------------------
#
# The FTS3 tests implemented in this file focus on testing that FTS3
# returns the correct set of documents for various types of full-text
# query. This is done using pseudo-randomly generated data and queries.
# The expected result of each query is calculated using Tcl code.
#
#   1. The database is initialized to contain a single table with three
#      columns. 100 rows are inserted into the table. Each of the three
#      values in each row is a document consisting of between 0 and 100
#      terms. Terms are selected from a vocabulary of $G(nVocab) terms.
#
#   2. The following is performed 100 times:
#
#      a. A row is inserted into the database. The row contents are
#         generated as in step 1. The docid is a pseudo-randomly selected
#         value between 0 and 1000000.
#
#      b. A pseudo-randomly selected row is updated. One of its columns is
#         set to contain a new document generated in the same way as the
#         documents in step 1.
#
#      c. A pseudo-randomly selected row is deleted.
#
#      d. For each of several types of fts3 queries, 10 SELECT queries
#         of the form:
#
#            SELECT docid FROM <tbl> WHERE <tbl> MATCH '<query>'
#
#         are evaluated. The results are compared to those calculated by
#         Tcl code in this file. The patterns used for the different query
#         types are:
#
#            1. query = <term>
#            2. query = <prefix>
#            3. query = "<term> <term>"
#            4. query = "<term> <term> <term>"
#            5. query = "<prefix> <prefix> <prefix>"
#            6. query = <term> NEAR <term>
#            7. query = <term> NEAR/11 <term> NEAR/11 <term>
#            8. query = <term> OR <term>
#            9. query = <term> NOT <term>
#           10. query = <term> AND <term>
#           11. query = <term> NEAR <term> OR <term> NEAR <term>
#           12. query = <term> NEAR <term> NOT <term> NEAR <term>
#           13. query = <term> NEAR <term> AND <term> NEAR <term>
#
#         where <term> is a term pseudo-randomly selected from the
#         vocabulary and <prefix> is such a term with its final character
#         removed, followed by a "*" character.
#
#      Every second iteration, steps (a) through (d) above are performed
#      within a single transaction. This forces the queries in (d) to
#      read data from both the database and the in-memory hash table
#      that caches the full-text index entries created by steps (a), (b)
#      and (c) until the transaction is committed.
#
# The procedure above is run 5 times, using advisory fts3 node sizes of 50,
# 500, 1000 and 2000 bytes.
#
# After the test using an advisory node-size of 50, an OOM test is run using
# the database. This test is similar to step (d) above, except that it tests
# the effects of transient and persistent OOM conditions encountered while
# executing each query.
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl
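
# To make the query patterns of step (d) above concrete: with the
# hypothetical terms "abc", "def" and "ghi" (real terms are drawn from the
# random vocabulary generated below), the test evaluates SELECT statements
# along the following lines. This block is purely illustrative and is never
# evaluated.
#
if {0} {
  execsql { SELECT docid FROM t1 WHERE t1 MATCH 'abc' }                  ;# 1
  execsql { SELECT docid FROM t1 WHERE t1 MATCH 'ab*' }                  ;# 2
  execsql { SELECT docid FROM t1 WHERE t1 MATCH '"abc def"' }            ;# 3
  execsql { SELECT docid FROM t1 WHERE t1 MATCH 'abc NEAR def' }         ;# 6
  execsql { SELECT docid FROM t1 WHERE t1 MATCH 'abc NEAR/11 def NEAR/11 ghi' } ;# 7
  execsql { SELECT docid FROM t1 WHERE t1 MATCH 'abc OR def' }           ;# 8
  execsql { SELECT docid FROM t1 WHERE t1 MATCH 'abc NOT def' }          ;# 9
  execsql { SELECT docid FROM t1 WHERE t1 MATCH 'abc AND def' }          ;# 10
}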

# If this build does not include FTS3, skip the tests in this file.
#
ifcapable !fts3 { finish_test ; return }
source $testdir/fts3_common.tcl
source $testdir/malloc_common.tcl

set G(nVocab) 100

set nVocab 100
set lVocab [list]

expr srand(0)

# Generate a vocabulary of nVocab words. Each word is between 2 and 4
# characters long.
#
set lChar {a b c d e f g h i j k l m n o p q r s t u v w x y z}
for {set i 0} {$i < $nVocab} {incr i} {
  set len [expr int(rand()*3)+2]
  set word [lindex $lChar [expr int(rand()*26)]]
  append word [lindex $lChar [expr int(rand()*26)]]
  if {$len>2} { append word [lindex $lChar [expr int(rand()*26)]] }
  if {$len>3} { append word [lindex $lChar [expr int(rand()*26)]] }
  lappend lVocab $word
}

proc random_term {} {
  lindex $::lVocab [expr {int(rand()*$::nVocab)}]
}

# Return a document consisting of $nWord arbitrarily selected terms
# from the $::lVocab list.
#
proc generate_doc {nWord} {
  set doc [list]
  for {set i 0} {$i < $nWord} {incr i} {
    lappend doc [random_term]
  }
  return $doc
}

# Primitives to update the table.
#
unset -nocomplain t1
proc insert_row {rowid} {
  set a [generate_doc [expr int((rand()*100))]]
  set b [generate_doc [expr int((rand()*100))]]
  set c [generate_doc [expr int((rand()*100))]]
  execsql { INSERT INTO t1(docid, a, b, c) VALUES($rowid, $a, $b, $c) }
  set ::t1($rowid) [list $a $b $c]
}
proc delete_row {rowid} {
  execsql { DELETE FROM t1 WHERE rowid = $rowid }
  catch {unset ::t1($rowid)}
}
proc update_row {rowid} {
  set cols {a b c}
  set iCol [expr int(rand()*3)]
  set doc [generate_doc [expr int((rand()*100))]]
  lset ::t1($rowid) $iCol $doc
  execsql "UPDATE t1 SET [lindex $cols $iCol] = \$doc WHERE rowid = \$rowid"
}

proc simple_phrase {zPrefix} {
  set ret [list]

  set reg [string map {* {[^ ]*}} $zPrefix]
  set reg " $reg "

  foreach key [lsort -integer [array names ::t1]] {
    set value $::t1($key)
    set cnt [list]
    foreach col $value {
      if {[regexp $reg " $col "]} { lappend ret $key ; break }
    }
  }

  #lsort -uniq -integer $ret
  set ret
}
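
# A small worked example of [simple_phrase] (hypothetical shadow-array
# contents, never evaluated): a prefix pattern such as "ab*" is mapped to
# the regular expression " ab[^ ]* " (note the surrounding spaces) and a
# row is reported if any of its three columns contains a matching term.
#
if {0} {
  array set ::t1 {
    5 {{ab cd} {} {xy}}
    9 {{cd cd} {ab} {}}
  }
  simple_phrase ab*     ;# returns {5 9}, both rows have a term starting "ab"
  simple_phrase cd      ;# returns {5 9}
  simple_phrase xy      ;# returns {5}
}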

# This [proc] is used to test the FTS3 matchinfo() function.
#
proc simple_token_matchinfo {zToken bDesc} {

  set nDoc(0) 0
  set nDoc(1) 0
  set nDoc(2) 0
  set nHit(0) 0
  set nHit(1) 0
  set nHit(2) 0

  set dir -inc
  if {$bDesc} { set dir -dec }

  foreach key [array names ::t1] {
    set value $::t1($key)
    set a($key) [list]
    foreach i {0 1 2} col $value {
      set hit [llength [lsearch -all $col $zToken]]
      lappend a($key) $hit
      incr nHit($i) $hit
      if {$hit>0} { incr nDoc($i) }
    }
  }

  set ret [list]
  foreach docid [lsort -integer $dir [array names a]] {
    if { [lindex [lsort -integer $a($docid)] end] } {
      set matchinfo [list 1 3]
      foreach i {0 1 2} hit $a($docid) {
        lappend matchinfo $hit $nHit($i) $nDoc($i)
      }
      lappend ret $docid $matchinfo
    }
  }

  set ret
}

proc simple_near {termlist nNear} {
  set ret [list]

  foreach {key value} [array get ::t1] {
    foreach v $value {

      set l [lsearch -exact -all $v [lindex $termlist 0]]
      foreach T [lrange $termlist 1 end] {
        set l2 [list]
        foreach i $l {
          set iStart [expr $i - $nNear - 1]
          set iEnd [expr $i + $nNear + 1]
          if {$iStart < 0} {set iStart 0}
          foreach i2 [lsearch -exact -all [lrange $v $iStart $iEnd] $T] {
            incr i2 $iStart
            if {$i2 != $i} { lappend l2 $i2 }
          }
        }
        set l [lsort -uniq -integer $l2]
      }

      if {[llength $l]} {
#puts "MATCH($key): $v"
        lappend ret $key
      }
    }
  }

  lsort -unique -integer $ret
}

# The following three procs:
#
#   setop_not A B
#   setop_or  A B
#   setop_and A B
#
# each take two arguments. Both arguments must be lists of integer values
# sorted by value. The return value is the list produced by evaluating
# the equivalent of "A op B", where op is the FTS3 operator NOT, OR or
# AND.
#
proc setop_not {A B} {
  foreach b $B { set n($b) {} }
  set ret [list]
  foreach a $A { if {![info exists n($a)]} {lappend ret $a} }
  return $ret
}
proc setop_or {A B} {
  lsort -integer -uniq [concat $A $B]
}
proc setop_and {A B} {
  foreach b $B { set n($b) {} }
  set ret [list]
  foreach a $A { if {[info exists n($a)]} {lappend ret $a} }
  return $ret
}

proc mit {blob} {
  set scan(littleEndian) i*
  set scan(bigEndian) I*
  binary scan $blob $scan($::tcl_platform(byteOrder)) r
  return $r
}
db func mit mit
set sqlite_fts3_enable_parentheses 1

proc do_orderbydocid_test {tn sql res} {
  uplevel [list do_select_test $tn.asc "$sql ORDER BY docid ASC" $res]
  uplevel [list do_select_test $tn.desc "$sql ORDER BY docid DESC" \
    [lsort -int -dec $res]
  ]
}

set NUM_TRIALS 100

foreach {nodesize order} {
  50 DESC
  50 ASC
  500 ASC
  1000 DESC
  2000 ASC
} {
  catch { array unset ::t1 }
  set testname "$nodesize/$order"

  # Create the FTS3 table. Populate it (and the Tcl array) with 100 rows.
  #
  db transaction {
    catchsql { DROP TABLE t1 }
    execsql "CREATE VIRTUAL TABLE t1 USING fts4(a, b, c, order=$order)"
    execsql "INSERT INTO t1(t1) VALUES('nodesize=$nodesize')"
    for {set i 0} {$i < 100} {incr i} { insert_row $i }
  }

  for {set iTest 1} {$iTest <= $NUM_TRIALS} {incr iTest} {
    catchsql COMMIT

    set DO_MALLOC_TEST 0
    set nRep 10
    if {$iTest==100 && $nodesize==50} {
      set DO_MALLOC_TEST 1
      set nRep 2
    }

    set ::testprefix fts3rnd-1.$testname.$iTest
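
    # A note on the checks performed in each iteration below: the matchinfo()
    # blobs returned by the first group of queries are decoded by [mit] into
    # lists of integers. For a single-term query against the three-column
    # table the expected layout, as built by [simple_token_matchinfo], is:
    #
    #   1 3  <hits in col0> <total hits in col0> <docs with a hit in col0>
    #        <hits in col1> <total hits in col1> <docs with a hit in col1>
    #        <hits in col2> <total hits in col2> <docs with a hit in col2>
    #
    # e.g. {1 3 2 10 7 0 4 3 1 6 5} for a row whose first column contains the
    # term twice (the numbers here are hypothetical).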

    # Delete one row, update one row and insert one row.
    #
    set rows [array names ::t1]
    set nRow [llength $rows]
    set iUpdate [lindex $rows [expr {int(rand()*$nRow)}]]
    set iDelete $iUpdate
    while {$iDelete == $iUpdate} {
      set iDelete [lindex $rows [expr {int(rand()*$nRow)}]]
    }
    set iInsert $iUpdate
    while {[info exists ::t1($iInsert)]} {
      set iInsert [expr {int(rand()*1000000)}]
    }
    execsql BEGIN
    insert_row $iInsert
    update_row $iUpdate
    delete_row $iDelete
    if {0==($iTest%2)} { execsql COMMIT }

    if {0==($iTest%2)} {
      #do_test 0 { fts3_integrity_check t1 } ok
    }

    # Pick 10 terms from the vocabulary. Check that the results of querying
    # the database for the set of documents containing each of these terms
    # is the same as the result obtained by scanning the contents of the Tcl
    # array for each term.
    #
    for {set i 0} {$i < 10} {incr i} {
      set term [random_term]
      do_select_test 1.$i.asc {
        SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
        ORDER BY docid ASC
      } [simple_token_matchinfo $term 0]
      do_select_test 1.$i.desc {
        SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
        ORDER BY docid DESC
      } [simple_token_matchinfo $term 1]
    }

    # This time, use each term with its final character removed as a term
    # prefix to query for. Test that querying the Tcl array produces the same
    # results as querying the FTS3 table for the prefix.
    #
    for {set i 0} {$i < $nRep} {incr i} {
      set prefix [string range [random_term] 0 end-1]
      set match "${prefix}*"
      do_orderbydocid_test 2.$i {
        SELECT docid FROM t1 WHERE t1 MATCH $match
      } [simple_phrase $match]
    }

    # Similar to the above, except for phrase queries.
    #
    for {set i 0} {$i < $nRep} {incr i} {
      set term [list [random_term] [random_term]]
      set match "\"$term\""
      do_orderbydocid_test 3.$i {
        SELECT docid FROM t1 WHERE t1 MATCH $match
      } [simple_phrase $term]
    }

    # Three word phrases.
    #
    for {set i 0} {$i < $nRep} {incr i} {
      set term [list [random_term] [random_term] [random_term]]
      set match "\"$term\""
      do_orderbydocid_test 4.$i {
        SELECT docid FROM t1 WHERE t1 MATCH $match
      } [simple_phrase $term]
    }

    # Three word phrases made up of term-prefixes.
    #
    for {set i 0} {$i < $nRep} {incr i} {
      set query "[string range [random_term] 0 end-1]* "
      append query "[string range [random_term] 0 end-1]* "
      append query "[string range [random_term] 0 end-1]*"

      set match "\"$query\""
      do_orderbydocid_test 5.$i {
        SELECT docid FROM t1 WHERE t1 MATCH $match
      } [simple_phrase $query]
    }

    # A NEAR query with terms as the arguments:
    #
    #   ... MATCH '$term1 NEAR $term2' ...
    #
    for {set i 0} {$i < $nRep} {incr i} {
      set terms [list [random_term] [random_term]]
      set match [join $terms " NEAR "]
      do_orderbydocid_test 6.$i {
        SELECT docid FROM t1 WHERE t1 MATCH $match
      } [simple_near $terms 10]
    }

    # A 3-way NEAR query with terms as the arguments.
    #
    for {set i 0} {$i < $nRep} {incr i} {
      set terms [list [random_term] [random_term] [random_term]]
      set nNear 11
      set match [join $terms " NEAR/$nNear "]
      do_orderbydocid_test 7.$i {
        SELECT docid FROM t1 WHERE t1 MATCH $match
      } [simple_near $terms $nNear]
    }
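
    # The expected results for the set-operation tests that follow are
    # computed by combining two independently calculated docid lists with
    # the setop_* helpers defined above. For instance (hypothetical docid
    # lists, never evaluated):
    #
    if {0} {
      setop_or  {1 3 5} {2 3 8}     ;# -> 1 2 3 5 8
      setop_and {1 3 5} {2 3 8}     ;# -> 3
      setop_not {1 3 5} {2 3 8}     ;# -> 1 5
    }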

    # Set operations on simple term queries.
    #
    foreach {tn op proc} {
      8  OR  setop_or
      9  NOT setop_not
      10 AND setop_and
    } {
      for {set i 0} {$i < $nRep} {incr i} {
        set term1 [random_term]
        set term2 [random_term]
        set match "$term1 $op $term2"
        do_orderbydocid_test $tn.$i {
          SELECT docid FROM t1 WHERE t1 MATCH $match
        } [$proc [simple_phrase $term1] [simple_phrase $term2]]
      }
    }

    # Set operations on NEAR queries.
    #
    foreach {tn op proc} {
      11 OR  setop_or
      12 NOT setop_not
      13 AND setop_and
    } {
      for {set i 0} {$i < $nRep} {incr i} {
        set term1 [random_term]
        set term2 [random_term]
        set term3 [random_term]
        set term4 [random_term]
        set match "$term1 NEAR $term2 $op $term3 NEAR $term4"
        do_orderbydocid_test $tn.$i {
          SELECT docid FROM t1 WHERE t1 MATCH $match
        } [$proc \
          [simple_near [list $term1 $term2] 10] \
          [simple_near [list $term3 $term4] 10]
        ]
      }
    }

    catchsql COMMIT
  }
}

finish_test