diff options
Diffstat (limited to 'ext/fts5/tool')
-rw-r--r-- | ext/fts5/tool/fts5speed.tcl | 64 | ||||
-rw-r--r-- | ext/fts5/tool/fts5txt2db.tcl | 231 | ||||
-rw-r--r-- | ext/fts5/tool/loadfts5.tcl | 172 | ||||
-rw-r--r-- | ext/fts5/tool/mkfts5c.tcl | 114 | ||||
-rw-r--r-- | ext/fts5/tool/showfts5.tcl | 97 |
5 files changed, 678 insertions, 0 deletions
# diff --git a/ext/fts5/tool/fts5speed.tcl b/ext/fts5/tool/fts5speed.tcl new file mode 100644 index 0000000..0f38638 --- /dev/null +++ b/ext/fts5/tool/fts5speed.tcl @@ -0,0 +1,64 @@

# Table of benchmark queries. Each entry is a two-element list:
#
#     {DEFAULT-REPEAT-COUNT SQL}
#
# The QUERY command line argument is a 1-based index into this list.
#
set Q {
  {1 "SELECT count(*) FROM t1 WHERE t1 MATCH 'enron'"}
  {25 "SELECT count(*) FROM t1 WHERE t1 MATCH 'hours'"}
  {300 "SELECT count(*) FROM t1 WHERE t1 MATCH 'acid'"}
  {100 "SELECT count(*) FROM t1 WHERE t1 MATCH 'loaned OR mobility OR popcore OR sunk'"}
  {100 "SELECT count(*) FROM t1 WHERE t1 MATCH 'enron AND myapps'"}
  {1 "SELECT count(*) FROM t1 WHERE t1 MATCH 'en* AND my*'"}

  {1 "SELECT count(*) FROM t1 WHERE t1 MATCH 'c:t*'"}
  {1 "SELECT count(*) FROM t1 WHERE t1 MATCH 'a:t* OR b:t* OR c:t* OR d:t* OR e:t* OR f:t* OR g:t*'"}
  {1 "SELECT count(*) FROM t1 WHERE t1 MATCH 'a:t*'"}
  {2 "SELECT count(*) FROM t1 WHERE t1 MATCH 'c:the'"}

  {2 "SELECT count(*) FROM t1 WHERE t1 MATCH 'd:holmes OR e:holmes OR f:holmes OR g:holmes'" }
  {2 "SELECT count(*) FROM t1 WHERE t1 MATCH 'd:holmes AND e:holmes AND f:holmes AND g:holmes'" }
  {4 "SELECT count(*) FROM t1 WHERE t1 MATCH 'd:holmes NOT e:holmes'" }
}

# Print a usage message, followed by the numbered list of available
# queries, to stderr. Then exit with a non-zero status.
#
proc usage {} {
  global Q
  puts stderr "Usage: $::argv0 DATABASE QUERY ?NREPEAT?"
  puts stderr ""
  for {set i 1} {$i <= [llength $Q]} {incr i} {
    puts stderr " $i. [lindex $Q [expr {$i-1}]]"
  }
  puts stderr ""
  exit -1
}


# Parse the command line: DATABASE QUERY ?NREPEAT?
#
set nArg [llength $argv]
if {$nArg!=2 && $nArg!=3} usage
set database [lindex $argv 0]
set iquery [lindex $argv 1]

# Reject a non-integer QUERY here. Otherwise the range test below falls
# back to string comparison and the [incr] further down throws a raw
# Tcl error instead of printing the usage message.
if {![string is integer -strict $iquery]} usage
if {$iquery<1 || $iquery>[llength $Q]} usage

# nRepeat==0 means "use the default repeat count from the query table".
set nRepeat 0
if {$nArg==3} {
  set nRepeat [lindex $argv 2]
  # A non-numeric NREPEAT would turn the "$i < $nRepeat" test of the
  # loop at the end of this script into a string comparison, which may
  # never become false.
  if {![string is integer -strict $nRepeat]} usage
}


sqlite3 db $database
catch { load_static_extension db fts5 }

# Convert the 1-based query number to a 0-based list index.
incr iquery -1
set sql [lindex $Q $iquery 1]
if {$nRepeat==0} {
  set nRepeat [lindex $Q $iquery 0]
}

puts "sql: $sql"
puts "nRepeat: $nRepeat"

# First execution of the query. Its result is printed, unless the SQL
# text mentions "matchinfo", in which case the matchinfo function is
# registered first and the result is discarded.
if {[regexp matchinfo $sql]} {
  sqlite3_fts5_register_matchinfo db
  db eval $sql
} else {
  puts "result: [db eval $sql]"
}

# Remaining (nRepeat-1) executions. The loop starts at 1 because the
# query has already been run once above.
for {set i 1} {$i < $nRepeat} {incr i} {
  db eval $sql
}


# diff --git a/ext/fts5/tool/fts5txt2db.tcl b/ext/fts5/tool/fts5txt2db.tcl new file mode 100644 index 0000000..1996b2c --- /dev/null +++ b/ext/fts5/tool/fts5txt2db.tcl @@ -0,0 +1,231 @@
##########################################################################
# 2016 Jan 27
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#

# Parse $::argv according to the specification below, storing the results
# in the global array ::A. The array is indexed by switch or positional
# parameter name ("fts4", "colsize", "database", "file", ...).
#
proc process_cmdline {} {
  cmdline::process ::A $::argv {
    {fts5 "use fts5 (this is the default)"}
    {fts4 "use fts4"}
    {trigram "Use tokenize=trigram"}
    {colsize "10 10 10" "list of column sizes"}
    {tblname "t1" "table name to create"}
    {detail "full" "Fts5 detail mode to use"}
    {repeat 1 "Load each file this many times"}
    {prefix "" "Fts prefix= option"}
    {trans 1 "True to use a transaction"}
    database
    file...
  } {
    This script is designed to create fts4/5 tables with more than one column.
    The -colsize option should be set to a Tcl list of integer values, one for
    each column in the table. Each value is the number of tokens that will be
    inserted into the column value for each row. For example, setting the -colsize
    option to "5 10" creates an FTS table with 2 columns, with roughly 5 and 10
    tokens per row in each, respectively.

    Each "FILE" argument should be a text file. The contents of these text files
    is split on whitespace characters to form a list of tokens. The first N1
    tokens are used for the first column of the first row, where N1 is the first
    element of the -colsize list. The next N2 are used for the second column of
    the first row, and so on. Rows are added to the table until the entire list
    of tokens is exhausted.
  }
}

###########################################################################
###########################################################################
# Command line options processor. This is generic code that can be copied
# between scripts.
#
namespace eval cmdline {

  # Print an optional error message, then the usage summary generated
  # from option specification $O, then the free-form help text $E. All
  # output goes to stderr. Exits with a non-zero status.
  #
  proc cmdline_error {O E {msg ""}} {
    if {$msg != ""} {
      puts stderr "Error: $msg"
      puts stderr ""
    }

    # Single-element entries of $O are positional parameters. They are
    # shown in upper case on the "Usage:" line.
    set L [list]
    foreach o $O {
      if {[llength $o]==1} {
        lappend L [string toupper $o]
      }
    }

    puts stderr "Usage: $::argv0 ?SWITCHES? $L"
    puts stderr ""
    puts stderr "Switches are:"
    foreach o $O {
      if {[llength $o]==3} {
        # {name default description}: a switch that takes a value.
        foreach {a b c} $o {}
        puts stderr [format " -%-15s %s (default \"%s\")" "$a VAL" $c $b]
      } elseif {[llength $o]==2} {
        # {name description}: a boolean switch.
        foreach {a b} $o {}
        puts stderr [format " -%-15s %s" $a $b]
      }
    }
    puts stderr ""
    puts stderr $E
    exit -1
  }

  # Parse argument list $lArgs according to specification $O and store
  # the results in the caller's array named by $avar.
  #
  #   avar   Name of the array variable to populate.
  #   lArgs  List of command line arguments.
  #   O      Option specification. Each element is one of:
  #            {name description}          boolean switch (0, or 1 if present)
  #            {name default description}  switch that takes a value
  #            name                        required positional parameter
  #            name...                     trailing list of positional values
  #   E      Help text printed by cmdline_error.
  #
  # Switches may be abbreviated to any unique prefix. The argument "--"
  # terminates switch processing and is itself discarded.
  #
  proc process {avar lArgs O E} {
    upvar $avar A
    set zTrailing ""     ;# Name of the "NAME..." parameter, or "" if none
    set lPosargs [list]

    # Populate A() with default values. Also, for each switch in the command
    # line spec, set an entry in the idx() array as follows:
    #
    #     {tblname t1 "table name to use"}
    #       -> [set idx(-tblname) {tblname t1 "table name to use"}]
    #
    # For each position parameter, append its name to $lPosargs. If the ...
    # specifier is present, set $zTrailing to the name of the prefix.
    #
    foreach o $O {
      set nm [lindex $o 0]
      set nArg [llength $o]
      switch -- $nArg {
        1 {
          if {[string range $nm end-2 end]=="..."} {
            set zTrailing [string range $nm 0 end-3]
          } else {
            lappend lPosargs $nm
          }
        }
        2 {
          set A($nm) 0
          set idx(-$nm) $o
        }
        3 {
          set A($nm) [lindex $o 1]
          set idx(-$nm) $o
        }
        default {
          error "Error in command line specification"
        }
      }
    }

    # Set explicitly specified option values
    #
    set nArg [llength $lArgs]
    for {set i 0} {$i < $nArg} {incr i} {
      set opt [lindex $lArgs $i]

      # "--" marks the end of the switches. Step over it so that it is not
      # mistaken for the first positional argument.
      if {$opt=="--"} {
        incr i
        break
      }
      if {[string range $opt 0 0]!="-"} break

      # An exact match always wins. Otherwise accept any unambiguous
      # prefix of a switch name.
      if {[info exists idx($opt)]} {
        set c [list $opt]
      } else {
        set c [array names idx "${opt}*"]
      }
      if {[llength $c]==0} { cmdline_error $O $E "Unrecognized option: $opt"}
      if {[llength $c]>1}  { cmdline_error $O $E "Ambiguous option: $opt"}
      set c [lindex $c 0]

      if {[llength $idx($c)]==3} {
        # Switch takes a value: the next argument.
        if {$i==[llength $lArgs]-1} {
          cmdline_error $O $E "Option requires argument: $c"
        }
        incr i
        set A([lindex $idx($c) 0]) [lindex $lArgs $i]
      } else {
        set A([lindex $idx($c) 0]) 1
      }
    }

    # Deal with position arguments. At this point $i is the index of the
    # first argument that is not a switch.
    #
    set nPosarg [llength $lPosargs]
    set nRem [expr {$nArg - $i}]
    if {$nRem < $nPosarg || ($zTrailing=="" && $nRem > $nPosarg)} {
      cmdline_error $O $E
    }
    for {set j 0} {$j < $nPosarg} {incr j} {
      set A([lindex $lPosargs $j]) [lindex $lArgs [expr {$j+$i}]]
    }
    if {$zTrailing!=""} {
      set A($zTrailing) [lrange $lArgs [expr {$j+$i}] end]
    }
  }
} ;# namespace eval cmdline
# End of command line options processor.
###########################################################################
###########################################################################

process_cmdline

# If -fts4 was specified, use fts4. Otherwise, fts5.
if {$A(fts4)} {
  set A(fts) fts4
} else {
  set A(fts) fts5
}

sqlite3 db $A(database)

# Create the FTS table in the db. Return a list of the table columns.
# The column names are the first N letters of the alphabet, where N is the
# number of elements in the -colsize list.
#
proc create_table {} {
  global A
  set cols [list a b c d e f g h i j k l m n o p q r s t u v w x y z]

  set nCol [llength $A(colsize)]
  set cols [lrange $cols 0 [expr {$nCol-1}]]

  set sql    "CREATE VIRTUAL TABLE IF NOT EXISTS $A(tblname) USING $A(fts) ("
  append sql [join $cols ,]
  if {$A(fts)=="fts5"} { append sql ",detail=$A(detail)" }
  if {$A(trigram)} { append sql ",tokenize=trigram" }
  append sql ", prefix='$A(prefix)');"

  db eval $sql
  return $cols
}

# Return a list of tokens from the named file.
#
proc readfile {file} {
  set fd [open $file]
  set data [read $fd]
  close $fd
  split $data
}

# Return a list consisting of $n copies of list $L concatenated together.
#
proc repeat {L n} {
  set res [list]
  for {set i 0} {$i < $n} {incr i} {
    set res [concat $res $L]
  }
  set res
}

# Each row inserted by the loop at the end of this script consumes the sum
# of the -colsize values in tokens. If that sum is not positive the token
# index never advances and the loop never terminates, so refuse to start.
#
set nTokenPerRow 0
foreach s $A(colsize) { incr nTokenPerRow $s }
if {$nTokenPerRow <= 0} {
  puts stderr "Error: -colsize values must sum to a positive number of tokens"
  exit -1
}

# Load all the data into a big list of tokens.
#
set tokens [list]
foreach f $A(file) {
  set tokens [concat $tokens [repeat [readfile $f] $A(repeat)]]
}

# Build an INSERT statement that reads each column value from the
# corresponding element of array R, e.g. "... VALUES($R(a), $R(b))". The
# "$" characters are escaped so that they are seen by SQLite, which binds
# the Tcl variables, instead of being substituted here.
#
set N [llength $tokens]
set i 0
set cols [create_table]
set sql "INSERT INTO $A(tblname) VALUES(\$R([lindex $cols 0])"
foreach c [lrange $cols 1 end] {
  append sql ", \$R($c)"
}
append sql ")"

# Insert rows until the token list is exhausted. $i is the index of the
# next unused token.
#
if {$A(trans)} { db eval BEGIN }
  while {$i < $N} {
    foreach c $cols s $A(colsize) {
      set R($c) [lrange $tokens $i [expr {$i+$s-1}]]
      incr i $s
    }
    db eval $sql
  }
if {$A(trans)} { db eval COMMIT }



# diff --git a/ext/fts5/tool/loadfts5.tcl b/ext/fts5/tool/loadfts5.tcl new file mode 100644 index 0000000..96fd692 --- /dev/null +++ b/ext/fts5/tool/loadfts5.tcl @@ -0,0 +1,172 @@

# Return the entire contents of file $f. This proc is also registered as
# the SQL function loadfile() further down.
#
proc loadfile {f} {
  set fd [open $f]
  set data [read $fd]
  close $fd
  return $data
}

set ::nRow 0               ;# Number of rows inserted so far
set ::nRowPerDot 1000      ;# Print one progress dot per this many rows

# Recursively insert one row into table t1 for each file found beneath
# directory $dir. Stops early once $::O(limit) rows have been inserted
# (0 means no limit). If $::O(trans) is non-zero, the current transaction
# is committed, an integrity-check is run and a new transaction opened
# after every $::O(trans) rows.
#
proc load_hierachy {dir} {
  foreach f [glob -nocomplain -dir $dir *] {
    if {$::O(limit) && $::nRow>=$::O(limit)} break
    if {[file isdir $f]} {
      load_hierachy $f
    } else {
      db eval { INSERT INTO t1 VALUES($f, loadfile($f)) }
      incr ::nRow

      if {$::O(trans) && ($::nRow % $::O(trans))==0} {
        db eval { COMMIT }
        db eval { INSERT INTO t1(t1) VALUES('integrity-check') }
        db eval { BEGIN }
      }

      # Progress indicator: a dot every nRowPerDot rows, and a newline
      # after every 65 dots.
      if {($::nRow % $::nRowPerDot)==0} {
        puts -nonewline .
        if {($::nRow % (65*$::nRowPerDot))==0} { puts "" }
        flush stdout
      }

    }
  }
}

# Print the usage message to stderr and exit with status 1.
#
proc usage {} {
  puts stderr "Usage: $::argv0 ?SWITCHES? DATABASE PATH"
  puts stderr ""
  puts stderr "Switches are:"
  puts stderr " -fts4 (use fts4 instead of fts5)"
  puts stderr " -fts5 (use fts5)"
  puts stderr " -porter (use porter tokenizer)"
  puts stderr " -delete (delete the database file before starting)"
  puts stderr " -limit N (load no more than N documents)"
  puts stderr " -automerge N (set the automerge parameter to N)"
  puts stderr " -crisismerge N (set the crisismerge parameter to N)"
  puts stderr " -prefix PREFIX (comma separated prefix= argument)"
  puts stderr " -trans N (commit after N inserts - 0 == never)"
  puts stderr " -hashsize N (set the fts5 hashsize parameter to N)"
  puts stderr " -detail MODE (detail mode for fts5 tables)"
  exit 1
}

# Default option values. A value of -1 means "leave the parameter alone".
#
set O(vtab)       fts5
set O(tok)        ""
set O(limit)      0
set O(delete)     0
set O(automerge)  -1
set O(crisismerge) -1
set O(prefix)     ""
set O(trans)      0
set O(hashsize)   -1
set O(detail)     full

# Process the switches. The final two arguments are always DATABASE and
# PATH, so only the first ([llength $argv]-2) arguments are examined. A
# switch that takes a value must find that value within the same range.
#
if {[llength $argv]<2} usage
set nOpt [expr {[llength $argv]-2}]
for {set i 0} {$i < $nOpt} {incr i} {
  set arg [lindex $argv $i]
  switch -- $arg {
    -fts4 {
      set O(vtab) fts4
    }

    -fts5 {
      set O(vtab) fts5
    }

    -porter {
      set O(tok) ", tokenize=porter"
    }

    -delete {
      set O(delete) 1
    }

    -limit {
      if { [incr i]>=$nOpt } usage
      set O(limit) [lindex $argv $i]
    }

    -trans {
      if { [incr i]>=$nOpt } usage
      set O(trans) [lindex $argv $i]
    }

    -automerge {
      if { [incr i]>=$nOpt } usage
      set O(automerge) [lindex $argv $i]
    }

    -crisismerge {
      if { [incr i]>=$nOpt } usage
      set O(crisismerge) [lindex $argv $i]
    }

    -prefix {
      if { [incr i]>=$nOpt } usage
      set O(prefix) [lindex $argv $i]
    }

    -hashsize {
      if { [incr i]>=$nOpt } usage
      set O(hashsize) [lindex $argv $i]
    }

    -detail {
      if { [incr i]>=$nOpt } usage
      set O(detail) [lindex $argv $i]
    }

    default {
      usage
    }
  }
}

set dbfile [lindex $argv end-1]
if {$O(delete)} { file delete -force $dbfile }
sqlite3 db $dbfile
catch { load_static_extension db fts5 }
db func loadfile loadfile
db eval "PRAGMA page_size=4096"

db eval BEGIN
  # Assemble the trailing options of the CREATE VIRTUAL TABLE statement.
  set pref ""
  if {$O(prefix)!=""} { set pref ", prefix='$O(prefix)'" }
  if {$O(vtab)=="fts5"} {
    append pref ", detail=$O(detail)"
  }

  # Errors here are deliberately ignored (for example, the table may
  # already exist in the database).
  catch {
    db eval "CREATE VIRTUAL TABLE t1 USING $O(vtab) (path, content$O(tok)$pref)"
    db eval "INSERT INTO t1(t1, rank) VALUES('pgsz', 4050);"
  }

  if {$O(hashsize)>=0} {
    catch {
      db eval "INSERT INTO t1(t1, rank) VALUES('hashsize', $O(hashsize));"
    }
  }


  if {$O(automerge)>=0} {
    if {$O(vtab) == "fts5"} {
      db eval { INSERT INTO t1(t1, rank) VALUES('automerge', $O(automerge)) }
    } else {
      db eval { INSERT INTO t1(t1) VALUES('automerge=' || $O(automerge)) }
    }
  }
  if {$O(crisismerge)>=0} {
    if {$O(vtab) == "fts5"} {
      db eval {INSERT INTO t1(t1, rank) VALUES('crisismerge', $O(crisismerge))}
    } else {
      # -crisismerge is silently ignored for fts4 tables.
    }
  }
  load_hierachy [lindex $argv end]
db eval COMMIT
puts ""



# diff --git a/ext/fts5/tool/mkfts5c.tcl b/ext/fts5/tool/mkfts5c.tcl new file mode 100644 index 0000000..b1a55fa --- /dev/null +++ b/ext/fts5/tool/mkfts5c.tcl @@ -0,0 +1,114 @@
#!/bin/sh
# restart with tclsh \
exec tclsh "$0" "$@"

# $srcdir is the parent of the directory containing this script. G(src) is
# the list of files concatenated, in order, to form the output file.
# Entries without the %dir% prefix are opened relative to the current
# directory.
#
set srcdir [file dirname [file dirname [info script]]]
set G(src) [string map [list %dir% $srcdir] {
  %dir%/fts5.h
  %dir%/fts5Int.h
  fts5parse.h
  fts5parse.c
  %dir%/fts5_aux.c
  %dir%/fts5_buffer.c
  %dir%/fts5_config.c
  %dir%/fts5_expr.c
  %dir%/fts5_hash.c
  %dir%/fts5_index.c
  %dir%/fts5_main.c
  %dir%/fts5_storage.c
  %dir%/fts5_tokenize.c
  %dir%/fts5_unicode2.c
  %dir%/fts5_varint.c
  %dir%/fts5_vocab.c
}]

# Text written at the very start of the generated file.
#
set G(hdr) {

#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS5)

#if !defined(NDEBUG) && !defined(SQLITE_DEBUG)
# define NDEBUG 1
#endif
#if defined(NDEBUG) && defined(SQLITE_DEBUG)
# undef NDEBUG
#endif

}

# Text written at the very end of the generated file. It closes the #if
# opened by G(hdr).
#
set G(footer) {

#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS5) */
}

#-------------------------------------------------------------------------
# Read and return the entire contents of text file $zFile from disk.
#
proc readfile {zFile} {
  set fd [open $zFile]
  set data [read $fd]
  close $fd
  return $data
}

#-------------------------------------------------------------------------
# This command returns a string identifying the current sqlite version -
# the equivalent of the SQLITE_SOURCE_ID string.
#
# The "manifest" and "manifest.uuid" files are read from the directory two
# levels above $zDir. The date is the token following the first "D" token
# in the manifest, with everything from its final "." onwards removed and
# each "T" replaced by a space.
#
proc fts5_source_id {zDir} {
  set top [file dirname [file dirname $zDir]]
  set uuid [string trim [readfile [file join $top manifest.uuid]]]

  set L [split [readfile [file join $top manifest]]]
  set date [lindex $L [expr {[lsearch -exact $L D]+1}]]
  set idx [expr {[string last . $date]-1}]
  set date [string range $date 0 $idx]
  set date [string map {T { }} $date]

  return "fts5: $date $uuid"
}

# Open output file $zOut for writing, store the channel in G(fd) and write
# the file header to it.
#
proc fts5c_init {zOut} {
  global G
  set G(fd) [open $zOut w]

  puts -nonewline $G(fd) $G(hdr)
}

# Append the contents of source file $zIn to the output file, preceded by
# a #line directive. Each line is transformed as follows:
#
#   * #include lines that mention "fts5" are commented out.
#   * Lines that look like the start of a definition of an sqlite3Fts5*
#     function, other than sqlite3Fts5Init(), are prefixed with "static".
#   * --FTS5-SOURCE-ID-- is replaced by the source-id string and, within
#     fts5parse.c only, "yy", "YY" and "TOKEN" are replaced by their
#     fts5-specific equivalents.
#
proc fts5c_printfile {zIn} {
  global G
  set data [readfile $zIn]
  set zTail [file tail $zIn]
  puts $G(fd) "#line 1 \"$zTail\""

  set sub_map [list --FTS5-SOURCE-ID-- [fts5_source_id $::srcdir]]
  if {$zTail=="fts5parse.c"} {
    lappend sub_map yy fts5yy YY fts5YY TOKEN FTS5TOKEN
  }

  foreach line [split $data "\n"] {
    if {[regexp {^#include.*fts5} $line]} {
      set line "/* $line */"
    } elseif {
         ![regexp { sqlite3Fts5Init\(} $line]
       && [regexp {^(const )?[a-zA-Z][a-zA-Z0-9]* [*]?sqlite3Fts5} $line]
    } {
      set line "static $line"
    }
    set line [string map $sub_map $line]
    puts $G(fd) $line
  }
}

# Write the file footer and close the output channel.
#
proc fts5c_close {} {
  global G
  puts -nonewline $G(fd) $G(footer)
  if {$G(fd)!="stdout"} {
    close $G(fd)
  }
}


fts5c_init fts5.c
foreach f $G(src) { fts5c_printfile $f }
fts5c_close
# diff --git
# a/ext/fts5/tool/showfts5.tcl b/ext/fts5/tool/showfts5.tcl new file mode 100644 index 0000000..75ac0f1 --- /dev/null +++ b/ext/fts5/tool/showfts5.tcl @@ -0,0 +1,97 @@



#-------------------------------------------------------------------------
# Process command line arguments.
#
proc usage {} {
  puts stderr "usage: $::argv0 ?OPTIONS? database table"
  puts stderr ""
  puts stderr " -nterm (count number of terms in each segment)"
  puts stderr " -segments (output segment contents)"
  puts stderr ""
  exit 1
}

set O(nterm) 0
set O(segments) 0

# The final two arguments are always the database and table names. Every
# argument before them must be one of the recognized switches.
if {[llength $argv]<2} usage
foreach a [lrange $argv 0 end-2] {
  switch -- $a {
    -nterm {
      set O(nterm) 1
    }

    -segments {
      set O(segments) 1
    }

    default {
      usage
    }
  }
}

set database [lindex $argv end-1]
set tbl [lindex $argv end]


#-------------------------------------------------------------------------
# Count the number of terms in each segment of fts5 table $tbl. Store the
# counts in the array variable in the parent context named by parameter
# $arrayname, indexed by segment-id. Example:
#
#   count_terms fts_tbl A
#   foreach {k v} [array get A] { puts "segid=$k nTerm=$v" }
#
proc count_terms {tbl arrayname} {
  # [upvar] takes the caller's variable name first and the local alias
  # second. Link local array A to the caller's array named $arrayname.
  upvar $arrayname A
  array unset A
  db eval "SELECT fts5_decode(rowid, block) AS d FROM ${tbl}_data" {
    # The first element of each decoded record identifies the segment.
    # Every remaining element of the form "term=..." counts as one term.
    set desc [lindex $d 0]
    if {[regexp {^segid=([0-9]*)} $desc -> id]} {
      foreach i [lrange $d 1 end] {
        if {[string match {term=*} $i]} { incr A($id) }
      }
    }
  }
}


#-------------------------------------------------------------------------
# Start of main program.
#
sqlite3 db $database
catch { load_static_extension db fts5 }

# With -nterm, populate array A with the number of terms in each segment,
# indexed by segment id.
if {$O(nterm)} { count_terms $tbl A }

# Decode the record with id=10 and print it. For each element after the
# first, the leading three fields are printed on one line, followed by one
# indented line for each remaining field (one per segment).
db eval "SELECT fts5_decode(rowid, block) AS d FROM ${tbl}_data WHERE id=10" {
  foreach lvl [lrange $d 1 end] {
    puts [lrange $lvl 0 2]

    foreach seg [lrange $lvl 3 end] {
      if {$::O(nterm)} {
        # Look up the term count for this segment. Report 0 if the
        # segment id cannot be parsed or no terms were counted for it. A
        # failed match must not reuse the id captured for the previous
        # segment.
        set nTerm 0
        if {[regexp {^id=([0-9]*)} $seg -> id] && [info exists A($id)]} {
          set nTerm $A($id)
        }
        puts [format " % -28s nTerm=%d" $seg $nTerm]
      } else {
        puts [format " % -28s" $seg]
      }
    }
  }
}

# With -segments, also dump every decoded record with id greater than 10.
if {$O(segments)} {
  puts ""
  db eval "SELECT fts5_decode(rowid, block) AS d FROM ${tbl}_data WHERE id>10" {
    puts $d
  }
}