1 files changed, 647 insertions, 0 deletions
diff --git a/ext/fts5/test/fts5_common.tcl b/ext/fts5/test/fts5_common.tcl
new file mode 100644
index 0000000..0f371dc
--- /dev/null
+++ b/ext/fts5/test/fts5_common.tcl
@@ -0,0 +1,647 @@
+# 2014 Dec 19
+#
+# The author disclaims copyright to this source code.  In place of
+# a legal notice, here is a blessing:
+#
+#    May you do good and not evil.
+#    May you find forgiveness for yourself and forgive others.
+#    May you share freely, never taking more than you give.
+#
+#***********************************************************************
+#
+
+if {![info exists testdir]} {
+  set testdir [file join [file dirname [info script]] .. .. .. test]
+}
+source $testdir/tester.tcl
+
+ifcapable !fts5 {
+  proc return_if_no_fts5 {} {
+    finish_test
+    return -code return
+  }
+  return
+} else {
+  proc return_if_no_fts5 {} {}
+}
+
+catch { 
+  sqlite3_fts5_may_be_corrupt 0 
+  reset_db
+}
+
+proc fts5_test_poslist {cmd} {
+  set res [list]
+  for {set i 0} {$i < [$cmd xInstCount]} {incr i} {
+    lappend res [string map {{ } .} [$cmd xInst $i]]
+  }
+  set res
+}
+
+proc fts5_test_poslist2 {cmd} {
+  set res [list]
+
+  for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} {
+    $cmd xPhraseForeach $i c o {
+      lappend res $i.$c.$o
+    }
+  }
+
+  #set res
+  sort_poslist $res
+}
+
+proc fts5_test_collist {cmd} {
+  set res [list]
+
+  for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} {
+    $cmd xPhraseColumnForeach $i c { lappend res $i.$c }
+  }
+
+  set res
+}
+
+proc fts5_test_columnsize {cmd} {
+  set res [list]
+  for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
+    lappend res [$cmd xColumnSize $i]
+  }
+  set res
+}
+
+proc fts5_test_columntext {cmd} {
+  set res [list]
+  for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
+    lappend res [$cmd xColumnText $i]
+  }
+  set res
+}
+
+proc fts5_test_columntotalsize {cmd} {
+  set res [list]
+  for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
+    lappend res [$cmd xColumnTotalSize $i]
+  }
+  set res
+}
+
+proc test_append_token {varname token iStart iEnd} {
+  upvar $varname var
+  lappend var $token
+  return "SQLITE_OK"
+}
+proc fts5_test_tokenize {cmd} {
+  set res [list]
+  for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
+    set tokens [list]
+    $cmd xTokenize [$cmd xColumnText $i] [list test_append_token tokens]
+    lappend res $tokens
+  }
+  set res
+}
+
+proc fts5_test_rowcount {cmd} {
+  $cmd xRowCount
+}
+
+proc test_queryphrase_cb {cnt cmd} {
+  upvar $cnt L 
+  for {set i 0} {$i < [$cmd xInstCount]} {incr i} {
+    foreach {ip ic io} [$cmd xInst $i] break
+    set A($ic) 1
+  }
+  foreach ic [array names A] {
+    lset L $ic [expr {[lindex $L $ic] + 1}]
+  }
+}
+proc fts5_test_queryphrase {cmd} {
+  set res [list]
+  for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} {
+    set cnt [list]
+    for {set j 0} {$j < [$cmd xColumnCount]} {incr j} { lappend cnt 0 }
+    $cmd xQueryPhrase $i [list test_queryphrase_cb cnt]
+    lappend res $cnt
+  }
+  set res
+}
+
+proc fts5_test_phrasecount {cmd} {
+  $cmd xPhraseCount
+}
+
+proc fts5_test_all {cmd} {
+  set res [list]
+  lappend res columnsize      [fts5_test_columnsize $cmd]
+  lappend res columntext      [fts5_test_columntext $cmd]
+  lappend res columntotalsize [fts5_test_columntotalsize $cmd]
+  lappend res poslist         [fts5_test_poslist $cmd]
+  lappend res tokenize        [fts5_test_tokenize $cmd]
+  lappend res rowcount        [fts5_test_rowcount $cmd]
+  set res
+}
+
+proc fts5_aux_test_functions {db} {
+  foreach f {
+    fts5_test_columnsize
+    fts5_test_columntext
+    fts5_test_columntotalsize
+    fts5_test_poslist
+    fts5_test_poslist2
+    fts5_test_collist
+    fts5_test_tokenize
+    fts5_test_rowcount
+    fts5_test_all
+
+    fts5_test_queryphrase
+    fts5_test_phrasecount
+  } {
+    sqlite3_fts5_create_function $db $f $f
+  }
+}
+
+proc fts5_segcount {tbl} {
+  set N 0
+  foreach n [fts5_level_segs $tbl] { incr N $n }
+  set N
+}
+
+proc fts5_level_segs {tbl} {
+  set sql "SELECT fts5_decode(rowid,block) aS r FROM ${tbl}_data WHERE rowid=10"
+  set ret [list]
+  foreach L [lrange [db one $sql] 1 end] {
+    lappend ret [expr [llength $L] - 3]
+  }
+  set ret
+} 
+
+proc fts5_level_segids {tbl} {
+  set sql "SELECT fts5_decode(rowid,block) aS r FROM ${tbl}_data WHERE rowid=10"
+  set ret [list]
+  foreach L [lrange [db one $sql] 1 end] {
+    set lvl [list]
+    foreach S [lrange $L 3 end] {
+      regexp {id=([1234567890]*)} $S -> segid
+      lappend lvl $segid
+    }
+    lappend ret $lvl
+  }
+  set ret
+}
+
+proc fts5_rnddoc {n} {
+  set map [list 0 a  1 b  2 c  3 d  4 e  5 f  6 g  7 h  8 i  9 j]
+  set doc [list]
+  for {set i 0} {$i < $n} {incr i} {
+    lappend doc "x[string map $map [format %.3d [expr int(rand()*1000)]]]"
+  }
+  set doc
+}
+
+#-------------------------------------------------------------------------
+# Usage:
+#
+#   nearset aCol ?-pc VARNAME? ?-near N? ?-col C? -- phrase1 phrase2...
+#
+# This command is used to test if a document (set of column values) matches
+# the logical equivalent of a single FTS5 NEAR() clump and, if so, return
+# the equivalent of an FTS5 position list.
+#
+# Parameter $aCol is passed a list of the column values for the document
+# to test. Parameters $phrase1 and so on are the phrases.
+#
+# The result is a list of phrase hits. Each phrase hit is formatted as
+# three integers separated by "." characters, in the following format:
+#
+#   <phrase number> . <column number> . <token offset>
+#
+# Options:
+#
+#   -near N        (NEAR distance. Default 10)
+#   -col  C        (List of column indexes to match against)
+#   -pc   VARNAME  (variable in caller frame to use for phrase numbering)
+#   -dict VARNAME  (array in caller frame to use for synonyms)
+#
+proc nearset {aCol args} {
+
+  # Process the command line options.
+  #
+  set O(-near) 10
+  set O(-col)  {}
+  set O(-pc)   ""
+  set O(-dict) ""
+
+  set nOpt [lsearch -exact $args --]
+  if {$nOpt<0} { error "no -- option" }
+
+  # Set $lPhrase to be a list of phrases. $nPhrase its length.
+  set lPhrase [lrange $args [expr $nOpt+1] end]
+  set nPhrase [llength $lPhrase]
+
+  foreach {k v} [lrange $args 0 [expr $nOpt-1]] {
+    if {[info exists O($k)]==0} { error "unrecognized option $k" }
+    set O($k) $v
+  }
+
+  if {$O(-pc) == ""} {
+    set counter 0
+  } else {
+    upvar $O(-pc) counter
+  }
+
+  if {$O(-dict)!=""} { upvar $O(-dict) aDict }
+
+  for {set j 0} {$j < [llength $aCol]} {incr j} {
+    for {set i 0} {$i < $nPhrase} {incr i} { 
+      set A($j,$i) [list]
+    }
+  }
+
+  # Loop through each column of the current row.
+  for {set iCol 0} {$iCol < [llength $aCol]} {incr iCol} {
+
+    # If there is a column filter, test whether this column is excluded. If
+    # so, skip to the next iteration of this loop. Otherwise, set zCol to the
+    # column value and nToken to the number of tokens that comprise it.
+    if {$O(-col)!="" && [lsearch $O(-col) $iCol]<0} continue
+    set zCol [lindex $aCol $iCol]
+    set nToken [llength $zCol]
+
+    # Each iteration of the following loop searches a substring of the 
+    # column value for phrase matches. The last token of the substring
+    # is token $iLast of the column value. The first token is:
+    #
+    #   iFirst = ($iLast - $O(-near) - 1)
+    #
+    # where $sz is the length of the phrase being searched for. A phrase 
+    # counts as matching the substring if its first token lies on or before
+    # $iLast and its last token on or after $iFirst.
+    #
+    # For example, if the query is "NEAR(a+b c, 2)" and the column value:
+    #
+    #   "x x x x A B x x C x"
+    #    0 1 2 3 4 5 6 7 8 9"
+    #
+    # when (iLast==8 && iFirst=5) the range will contain both phrases and
+    # so both instances can be added to the output poslists.
+    #
+    set iLast [expr $O(-near) >= $nToken ? $nToken - 1 : $O(-near)]
+    for { } {$iLast < $nToken} {incr iLast} {
+
+      catch { array unset B }
+      
+      for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} {
+        set p [lindex $lPhrase $iPhrase]
+        set nPm1 [expr {[llength $p] - 1}]
+        set iFirst [expr $iLast - $O(-near) - [llength $p]]
+
+        for {set i $iFirst} {$i <= $iLast} {incr i} {
+          set lCand [lrange $zCol $i [expr $i+$nPm1]]
+          set bMatch 1
+          foreach tok $p term $lCand {
+            if {[nearset_match aDict $tok $term]==0} { set bMatch 0 ; break }
+          }
+          if {$bMatch} { lappend B($iPhrase) $i }
+        }
+
+        if {![info exists B($iPhrase)]} break
+      }
+
+      if {$iPhrase==$nPhrase} {
+        for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} {
+          set A($iCol,$iPhrase) [concat $A($iCol,$iPhrase) $B($iPhrase)]
+          set A($iCol,$iPhrase) [lsort -integer -uniq $A($iCol,$iPhrase)]
+        }
+      }
+    }
+  }
+
+  set res [list]
+  #puts [array names A]
+
+  for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} {
+    for {set iCol 0} {$iCol < [llength $aCol]} {incr iCol} {
+      foreach a $A($iCol,$iPhrase) {
+        lappend res "$counter.$iCol.$a"
+      }
+    }
+    incr counter
+  }
+
+  #puts "$aCol -> $res"
+  sort_poslist $res
+}
+
+proc nearset_match {aDictVar tok term} {
+  if {[string match $tok $term]} { return 1 }
+
+  upvar $aDictVar aDict
+  if {[info exists aDict($tok)]} {
+    foreach s $aDict($tok) {
+      if {[string match $s $term]} { return 1 }
+    }
+  }
+  return 0;
+}
+
+#-------------------------------------------------------------------------
+# Usage:
+#
+#   sort_poslist LIST
+#
+# Sort a position list of the type returned by command [nearset]
+#
+proc sort_poslist {L} {
+  lsort -command instcompare $L
+}
+proc instcompare {lhs rhs} {
+  foreach {p1 c1 o1} [split $lhs .] {}
+  foreach {p2 c2 o2} [split $rhs .] {}
+
+  set res [expr $c1 - $c2]
+  if {$res==0} { set res [expr $o1 - $o2] }
+  if {$res==0} { set res [expr $p1 - $p2] }
+
+  return $res
+}
+
+#-------------------------------------------------------------------------
+# Logical operators used by the commands returned by fts5_tcl_expr().
+#
+proc AND {args} {
+  foreach a $args {
+    if {[llength $a]==0} { return [list] }
+  }
+  sort_poslist [concat {*}$args]
+}
+proc OR {args} {
+  sort_poslist [concat {*}$args]
+}
+proc NOT {a b} {
+  if {[llength $b]>0} { return [list] }
+  return $a
+}
+
+#-------------------------------------------------------------------------
+# This command is similar to [split], except that it also provides the
+# start and end offsets of each token. For example:
+#
+#   [fts5_tokenize_split "abc d ef"] -> {abc 0 3 d 4 5 ef 6 8}
+#
+
+proc gobble_whitespace {textvar} {
+  upvar $textvar t
+  regexp {([ ]*)(.*)} $t -> space t
+  return [string length $space]
+}
+
+proc gobble_text {textvar wordvar} {
+  upvar $textvar t
+  upvar $wordvar w
+  regexp {([^ ]*)(.*)} $t -> w t
+  return [string length $w]
+}
+
+proc fts5_tokenize_split {text} {
+  set token ""
+  set ret [list]
+  set iOff [gobble_whitespace text]
+  while {[set nToken [gobble_text text word]]} {
+    lappend ret $word $iOff [expr $iOff+$nToken]
+    incr iOff $nToken
+    incr iOff [gobble_whitespace text]
+  }
+
+  set ret
+}
+
+#-------------------------------------------------------------------------
+#
+proc foreach_detail_mode {prefix script} {
+  set saved $::testprefix
+  foreach d [list full col none] {
+    set s [string map [list %DETAIL% $d] $script]
+    set ::detail $d
+    set ::testprefix "$prefix-$d"
+    reset_db
+    uplevel $s
+    unset ::detail
+  }
+  set ::testprefix $saved
+}
+
+proc detail_check {} {
+  if {$::detail != "none" && $::detail!="full" && $::detail!="col"} {
+    error "not in foreach_detail_mode {...} block"
+  }
+}
+proc detail_is_none {} { detail_check ; expr {$::detail == "none"} }
+proc detail_is_col {}  { detail_check ; expr {$::detail == "col" } }
+proc detail_is_full {} { detail_check ; expr {$::detail == "full"} }
+
+
+#-------------------------------------------------------------------------
+# Convert a poslist of the type returned by fts5_test_poslist() to a 
+# collist as returned by fts5_test_collist().
+#
+proc fts5_poslist2collist {poslist} {
+  set res [list]
+  foreach h $poslist {
+    regexp {(.*)\.[1234567890]+} $h -> cand
+    lappend res $cand
+  }
+  set res [lsort -command fts5_collist_elem_compare -unique $res]
+  return $res
+}
+
+# Comparison function used by fts5_poslist2collist to sort collist entries.
+proc fts5_collist_elem_compare {a b} {
+  foreach {a1 a2} [split $a .] {}
+  foreach {b1 b2} [split $b .] {}
+
+  if {$a1==$b1} { return [expr $a2 - $b2] }
+  return [expr $a1 - $b1]
+}
+
+
+#--------------------------------------------------------------------------
+# Construct and return a tcl list equivalent to that returned by the SQL
+# query executed against database handle [db]:
+#
+#   SELECT 
+#     rowid, 
+#     fts5_test_poslist($tbl),
+#     fts5_test_collist($tbl) 
+#   FROM $tbl('$expr')
+#   ORDER BY rowid $order;
+#
+proc fts5_query_data {expr tbl {order ASC} {aDictVar ""}} {
+
+  # Figure out the set of columns in the FTS5 table. This routine does
+  # not handle tables with UNINDEXED columns, but if it did, it would
+  # have to be here.
+  db eval "PRAGMA table_info = $tbl" x { lappend lCols $x(name) }
+
+  set d ""
+  if {$aDictVar != ""} {
+    upvar $aDictVar aDict
+    set d aDict
+  }
+
+  set cols ""
+  foreach e $lCols { append cols ", '$e'" }
+  set tclexpr [db one [subst -novar {
+    SELECT fts5_expr_tcl( $expr, 'nearset $cols -dict $d -pc ::pc' [set cols] )
+  }]]
+
+  set res [list]
+  db eval "SELECT rowid, * FROM $tbl ORDER BY rowid $order" x {
+    set cols [list]
+    foreach col $lCols { lappend cols $x($col) }
+    
+    set ::pc 0
+    set rowdata [eval $tclexpr]
+    if {$rowdata != ""} { 
+      lappend res $x(rowid) $rowdata [fts5_poslist2collist $rowdata]
+    }
+  }
+
+  set res
+}
+
+#-------------------------------------------------------------------------
+# Similar to [fts5_query_data], but omit the collist field.
+#
+proc fts5_poslist_data {expr tbl {order ASC} {aDictVar ""}} {
+  set res [list]
+
+  if {$aDictVar!=""} {
+    upvar $aDictVar aDict
+    set dict aDict
+  } else {
+    set dict ""
+  }
+
+  foreach {rowid poslist collist} [fts5_query_data $expr $tbl $order $dict] {
+    lappend res $rowid $poslist
+  }
+  set res
+}
+
+proc fts5_collist_data {expr tbl {order ASC} {aDictVar ""}} {
+  set res [list]
+
+  if {$aDictVar!=""} {
+    upvar $aDictVar aDict
+    set dict aDict
+  } else {
+    set dict ""
+  }
+
+  foreach {rowid poslist collist} [fts5_query_data $expr $tbl $order $dict] {
+    lappend res $rowid $collist
+  }
+  set res
+}
+
+#-------------------------------------------------------------------------
+#
+
+# This command will only work inside a [foreach_detail_mode] block. It tests
+# whether or not expression $expr run on FTS5 table $tbl is supported by
+# the current mode. If so, 1 is returned. If not, 0.
+#
+#   detail=full    (all queries supported)
+#   detail=col     (all but phrase queries and NEAR queries)
+#   detail=none    (all but phrase queries, NEAR queries, and column filters)
+#
+proc fts5_expr_ok {expr tbl} {
+
+  if {![detail_is_full]} {
+    set nearset "nearset_rc"
+    if {[detail_is_col]} { set nearset "nearset_rf" }
+
+    set ::expr_not_ok 0
+    db eval "PRAGMA table_info = $tbl" x { lappend lCols $x(name) }
+
+    set cols ""
+    foreach e $lCols { append cols ", '$e'" }
+    set ::pc 0
+    set tclexpr [db one [subst -novar {
+      SELECT fts5_expr_tcl( $expr, '[set nearset] $cols -pc ::pc' [set cols] )
+    }]]
+    eval $tclexpr
+    if {$::expr_not_ok} { return 0 }
+  }
+
+  return 1
+}
+
+# Helper for [fts5_expr_ok]
+proc nearset_rf {aCol args} {
+  set idx [lsearch -exact $args --]
+  if {$idx != [llength $args]-2 || [llength [lindex $args end]]!=1} {
+    set ::expr_not_ok 1
+  }
+  list
+}
+
+# Helper for [fts5_expr_ok]
+proc nearset_rc {aCol args} {
+  nearset_rf $aCol {*}$args
+  if {[lsearch $args -col]>=0} { 
+    set ::expr_not_ok 1
+  }
+  list
+}
+
+
+#-------------------------------------------------------------------------
+# Code for a simple Tcl tokenizer that supports synonyms at query time.
+#
+proc tclnum_tokenize {mode tflags text} {
+  foreach {w iStart iEnd} [fts5_tokenize_split $text] {
+    sqlite3_fts5_token $w $iStart $iEnd
+    if {$tflags == $mode && [info exists ::tclnum_syn($w)]} {
+      foreach s $::tclnum_syn($w)  { sqlite3_fts5_token -colo $s $iStart $iEnd }
+    }
+  }
+}
+
+proc tclnum_create {args} {
+  set mode query
+  if {[llength $args]} {
+    set mode [lindex $args 0]
+  }
+  if {$mode != "query" && $mode != "document"} { error "bad mode: $mode" }
+  return [list tclnum_tokenize $mode]
+}
+
+proc fts5_tclnum_register {db} {
+  foreach SYNDICT {
+    {zero  0}
+    {one   1 i}
+    {two   2 ii}
+    {three 3 iii}
+    {four  4 iv}
+    {five  5 v}
+    {six   6 vi}
+    {seven 7 vii}
+    {eight 8 viii}
+    {nine  9 ix}
+
+    {a1 a2 a3 a4 a5 a6 a7 a8 a9}
+    {b1 b2 b3 b4 b5 b6 b7 b8 b9}
+    {c1 c2 c3 c4 c5 c6 c7 c8 c9}
+  } {
+    foreach s $SYNDICT {
+      set o [list]
+      foreach x $SYNDICT {if {$x!=$s} {lappend o $x}}
+      set ::tclnum_syn($s) $o
+    }
+  }
+  sqlite3_fts5_create_tokenizer db tclnum tclnum_create
+}
+#
+# End of tokenizer code.
+#-------------------------------------------------------------------------
+