summaryrefslogtreecommitdiffstats
path: root/ext/fts5/tool
diff options
context:
space:
mode:
Diffstat (limited to 'ext/fts5/tool')
-rw-r--r--ext/fts5/tool/fts5speed.tcl64
-rw-r--r--ext/fts5/tool/fts5txt2db.tcl231
-rw-r--r--ext/fts5/tool/loadfts5.tcl172
-rw-r--r--ext/fts5/tool/mkfts5c.tcl114
-rw-r--r--ext/fts5/tool/showfts5.tcl97
5 files changed, 678 insertions, 0 deletions
diff --git a/ext/fts5/tool/fts5speed.tcl b/ext/fts5/tool/fts5speed.tcl
new file mode 100644
index 0000000..0f38638
--- /dev/null
+++ b/ext/fts5/tool/fts5speed.tcl
@@ -0,0 +1,64 @@
+
+
+# Table of benchmark queries. Each entry is a two-element list:
+#
+#   {NREPEAT SQL}
+#
+# SQL is the statement to run. NREPEAT is the number of times it is run
+# when no repeat count is given on the command line. An entry is selected
+# on the command line by its 1-based position in this list.
+#
+set Q {
+ {1 "SELECT count(*) FROM t1 WHERE t1 MATCH 'enron'"}
+ {25 "SELECT count(*) FROM t1 WHERE t1 MATCH 'hours'"}
+ {300 "SELECT count(*) FROM t1 WHERE t1 MATCH 'acid'"}
+ {100 "SELECT count(*) FROM t1 WHERE t1 MATCH 'loaned OR mobility OR popcore OR sunk'"}
+ {100 "SELECT count(*) FROM t1 WHERE t1 MATCH 'enron AND myapps'"}
+ {1 "SELECT count(*) FROM t1 WHERE t1 MATCH 'en* AND my*'"}
+
+ {1 "SELECT count(*) FROM t1 WHERE t1 MATCH 'c:t*'"}
+ {1 "SELECT count(*) FROM t1 WHERE t1 MATCH 'a:t* OR b:t* OR c:t* OR d:t* OR e:t* OR f:t* OR g:t*'"}
+ {1 "SELECT count(*) FROM t1 WHERE t1 MATCH 'a:t*'"}
+ {2 "SELECT count(*) FROM t1 WHERE t1 MATCH 'c:the'"}
+
+ {2 "SELECT count(*) FROM t1 WHERE t1 MATCH 'd:holmes OR e:holmes OR f:holmes OR g:holmes'" }
+ {2 "SELECT count(*) FROM t1 WHERE t1 MATCH 'd:holmes AND e:holmes AND f:holmes AND g:holmes'" }
+ {4 "SELECT count(*) FROM t1 WHERE t1 MATCH 'd:holmes NOT e:holmes'" }
+}
+
+# Print a usage message to stderr, followed by the numbered list of the
+# queries in the global list $Q, then exit with status -1.
+#
+# The script accepts an optional third argument (a repeat count), so it is
+# shown in the synopsis.
+#
+proc usage {} {
+  global Q
+  puts stderr "Usage: $::argv0 DATABASE QUERY ?NREPEAT?"
+  puts stderr ""
+  set i 0
+  foreach q $Q {
+    puts stderr " [incr i]. $q"
+  }
+  puts stderr ""
+  exit -1
+}
+
+
+# Command line: DATABASE QUERY ?NREPEAT?
+#
+set nArg [llength $argv]
+if {$nArg!=2 && $nArg!=3} usage
+set database [lindex $argv 0]
+set iquery [lindex $argv 1]
+
+# QUERY must be an integer index into $Q. Without the integer check a
+# value such as "10z" passes the (string) range comparison and then makes
+# [incr] fail with a raw Tcl error instead of the usage message.
+if {![string is integer -strict $iquery]} usage
+if {$iquery<1 || $iquery>[llength $Q]} usage
+
+# NREPEAT of 0 (the default) means "use the repeat count from $Q". A
+# non-integer value would turn the loop condition below into a string
+# comparison that never becomes false, so reject it here.
+set nRepeat 0
+if {$nArg==3} {
+  set nRepeat [lindex $argv 2]
+  if {![string is integer -strict $nRepeat]} usage
+}
+
+
+sqlite3 db $database
+catch { load_static_extension db fts5 }
+
+# Convert the 1-based query number into a list index.
+incr iquery -1
+set sql [lindex $Q $iquery 1]
+if {$nRepeat==0} {
+  set nRepeat [lindex $Q $iquery 0]
+}
+
+puts "sql: $sql"
+puts "nRepeat: $nRepeat"
+
+# First execution. The result is printed unless the query uses matchinfo.
+if {[regexp matchinfo $sql]} {
+  sqlite3_fts5_register_matchinfo db
+  db eval $sql
+} else {
+  puts "result: [db eval $sql]"
+}
+
+# The query has already run once above, so run it nRepeat-1 more times.
+for {set i 1} {$i < $nRepeat} {incr i} {
+  db eval $sql
+}
+
+
diff --git a/ext/fts5/tool/fts5txt2db.tcl b/ext/fts5/tool/fts5txt2db.tcl
new file mode 100644
index 0000000..1996b2c
--- /dev/null
+++ b/ext/fts5/tool/fts5txt2db.tcl
@@ -0,0 +1,231 @@
+##########################################################################
+# 2016 Jan 27
+#
+# The author disclaims copyright to this source code. In place of
+# a legal notice, here is a blessing:
+#
+# May you do good and not evil.
+# May you find forgiveness for yourself and forgive others.
+# May you share freely, never taking more than you give.
+#
+
+# Parse the script's command line ($::argv) into the global array ::A by
+# calling cmdline::process.
+#
+# The first braced argument is the command line specification:
+#
+#   {NAME HELP}           boolean switch -NAME (A(NAME) is 0, or 1 if given)
+#   {NAME DEFAULT HELP}   switch -NAME VALUE (A(NAME) defaults to DEFAULT)
+#   NAME                  mandatory positional parameter
+#   NAME...               list of all remaining arguments, stored in A(NAME)
+#
+# The second braced argument is help text that is printed after the list of
+# switches if the command line cannot be parsed.
+#
+proc process_cmdline {} {
+ cmdline::process ::A $::argv {
+ {fts5 "use fts5 (this is the default)"}
+ {fts4 "use fts4"}
+ {trigram "Use tokenize=trigram"}
+ {colsize "10 10 10" "list of column sizes"}
+ {tblname "t1" "table name to create"}
+ {detail "full" "Fts5 detail mode to use"}
+ {repeat 1 "Load each file this many times"}
+ {prefix "" "Fts prefix= option"}
+ {trans 1 "True to use a transaction"}
+ database
+ file...
+ } {
+ This script is designed to create fts4/5 tables with more than one column.
+ The -colsize option should be set to a Tcl list of integer values, one for
+ each column in the table. Each value is the number of tokens that will be
+ inserted into the column value for each row. For example, setting the -colsize
+ option to "5 10" creates an FTS table with 2 columns, with roughly 5 and 10
+ tokens per row in each, respectively.
+
+ Each "FILE" argument should be a text file. The contents of these text files
+ is split on whitespace characters to form a list of tokens. The first N1
+ tokens are used for the first column of the first row, where N1 is the first
+ element of the -colsize list. The next N2 are used for the second column of
+ the first row, and so on. Rows are added to the table until the entire list
+ of tokens is exhausted.
+ }
+}
+
+###########################################################################
+###########################################################################
+# Command line options processor. This is generic code that can be copied
+# between scripts.
+#
+namespace eval cmdline {
+
+  # Print a usage message to stderr and exit with status -1.
+  #
+  #   O    - command line specification (see [process])
+  #   E    - free-form help text, printed after the list of switches
+  #   msg  - optional error message, printed before the usage text
+  #
+  proc cmdline_error {O E {msg ""}} {
+    if {$msg ne ""} {
+      puts stderr "Error: $msg"
+      puts stderr ""
+    }
+
+    # Positional parameters are the single-element entries of the spec.
+    set L [list]
+    foreach o $O {
+      if {[llength $o]==1} {
+        lappend L [string toupper $o]
+      }
+    }
+
+    puts stderr "Usage: $::argv0 ?SWITCHES? $L"
+    puts stderr ""
+    puts stderr "Switches are:"
+    foreach o $O {
+      if {[llength $o]==3} {
+        foreach {a b c} $o {}
+        puts stderr [format " -%-15s %s (default \"%s\")" "$a VAL" $c $b]
+      } elseif {[llength $o]==2} {
+        foreach {a b} $o {}
+        puts stderr [format " -%-15s %s" $a $b]
+      }
+    }
+    puts stderr ""
+    puts stderr $E
+    exit -1
+  }
+
+  # Parse argument list $lArgs according to specification $O, storing the
+  # results in the array named $avar in the caller's context.
+  #
+  #   avar   - name of the array variable to populate
+  #   lArgs  - list of command line arguments
+  #   O      - specification: a list of {NAME HELP} (boolean switch),
+  #            {NAME DEFAULT HELP} (switch taking a value), NAME
+  #            (positional parameter) and NAME... (remaining arguments)
+  #   E      - help text passed to [cmdline_error] on failure
+  #
+  # Switches may be abbreviated to any unambiguous prefix. An exact switch
+  # name always wins, even if it is also a prefix of another switch. The
+  # argument "--" ends switch processing and is itself discarded.
+  #
+  proc process {avar lArgs O E} {
+    upvar $avar A
+    set zTrailing ""      ;# Name of the "NAME..." parameter, "" if none
+    set lPosargs [list]
+
+    # Populate A() with default values. Also, for each switch in the command
+    # line spec, set an entry in the idx() array as follows:
+    #
+    #   {tblname t1 "table name to use"}
+    #     -> [set idx(-tblname) {tblname t1 "table name to use"}]
+    #
+    # For each position parameter, append its name to $lPosargs. If the ...
+    # specifier is present, set $zTrailing to the name of the prefix.
+    #
+    foreach o $O {
+      set nm [lindex $o 0]
+      set nArg [llength $o]
+      switch -- $nArg {
+        1 {
+          if {[string range $nm end-2 end] eq "..."} {
+            set zTrailing [string range $nm 0 end-3]
+          } else {
+            lappend lPosargs $nm
+          }
+        }
+        2 {
+          set A($nm) 0
+          set idx(-$nm) $o
+        }
+        3 {
+          set A($nm) [lindex $o 1]
+          set idx(-$nm) $o
+        }
+        default {
+          error "Error in command line specification"
+        }
+      }
+    }
+
+    # Set explicitly specified option values
+    #
+    set nArg [llength $lArgs]
+    for {set i 0} {$i < $nArg} {incr i} {
+      set opt [lindex $lArgs $i]
+      if {[string index $opt 0] ne "-"} break
+      if {$opt eq "--"} {
+        # "--" terminates the switches. Step over it so that it is not
+        # mistaken for the first positional argument.
+        incr i
+        break
+      }
+
+      # Find the switch. Compare literally (no glob matching), so that
+      # characters such as "*" or "?" in $opt have no special meaning.
+      if {[info exists idx($opt)]} {
+        set c [list $opt]
+      } else {
+        set c [list]
+        set nOpt [string length $opt]
+        foreach k [array names idx] {
+          if {[string equal -length $nOpt $opt $k]} { lappend c $k }
+        }
+      }
+      if {[llength $c]==0} { cmdline_error $O $E "Unrecognized option: $opt" }
+      if {[llength $c]>1} { cmdline_error $O $E "Ambiguous option: $opt" }
+      set zSwitch [lindex $c 0]
+      set spec $idx($zSwitch)
+
+      if {[llength $spec]==3} {
+        if {$i==$nArg-1} {
+          cmdline_error $O $E "Option requires argument: $zSwitch"
+        }
+        incr i
+        set A([lindex $spec 0]) [lindex $lArgs $i]
+      } else {
+        set A([lindex $spec 0]) 1
+      }
+    }
+
+    # Deal with position arguments. $i is the index of the first of them.
+    #
+    set nPosarg [llength $lPosargs]
+    set nRem [expr {$nArg - $i}]
+    if {$nRem < $nPosarg || ($zTrailing eq "" && $nRem > $nPosarg)} {
+      cmdline_error $O $E
+    }
+    for {set j 0} {$j < $nPosarg} {incr j} {
+      set A([lindex $lPosargs $j]) [lindex $lArgs [expr {$j+$i}]]
+    }
+    if {$zTrailing ne ""} {
+      set A($zTrailing) [lrange $lArgs [expr {$j+$i}] end]
+    }
+  }
+} ;# namespace eval cmdline
+# End of command line options processor.
+###########################################################################
+###########################################################################
+
+# Parse the command line into the global array A.
+process_cmdline
+
+# Select the FTS module: fts4 if -fts4 was specified, fts5 otherwise.
+set A(fts) [expr {$A(fts4) ? "fts4" : "fts5"}]
+
+# Open the target database.
+sqlite3 db $A(database)
+
+# Create the FTS table in the db. Return a list of the table columns.
+#
+# One column is created for each element of $A(colsize). Columns are named
+# "a", "b", "c" and so on. An error is raised if -colsize lists no columns,
+# or more columns than there are names available. Previously the column
+# list was silently truncated in the latter case.
+#
+proc create_table {} {
+  global A
+  set names [list a b c d e f g h i j k l m n o p q r s t u v w x y z]
+
+  set nCol [llength $A(colsize)]
+  if {$nCol<1 || $nCol>[llength $names]} {
+    error "-colsize must list between 1 and [llength $names] column sizes"
+  }
+  set cols [lrange $names 0 [expr {$nCol-1}]]
+
+  set sql "CREATE VIRTUAL TABLE IF NOT EXISTS $A(tblname) USING $A(fts) ("
+  append sql [join $cols ,]
+  if {$A(fts) eq "fts5"} { append sql ",detail=$A(detail)" }
+  if {$A(trigram)} { append sql ",tokenize=trigram" }
+  append sql ", prefix='$A(prefix)');"
+
+  db eval $sql
+  return $cols
+}
+
+# Read the named file and return its contents as a list of tokens, formed
+# by splitting the text with [split] and its default separators.
+#
+proc readfile {file} {
+  set chan [open $file]
+  set text [read $chan]
+  close $chan
+  return [split $text]
+}
+
+# Return a list consisting of $n copies of list $L joined end to end.
+# If $n is zero or less the result is an empty list.
+#
+proc repeat {L n} {
+  set out [list]
+  set k 0
+  while {$k < $n} {
+    set out [concat $out $L]
+    incr k
+  }
+  return $out
+}
+
+
+# Load all the data into a big list of tokens.
+#
+set tokens [list]
+foreach f $A(file) {
+  set tokens [concat $tokens [repeat [readfile $f] $A(repeat)]]
+}
+
+# Validate the column sizes before creating anything. Each row must consume
+# at least one token and no size may be negative. Otherwise the index $i in
+# the load loop below would never reach the end of $tokens and the script
+# would insert rows forever.
+#
+set nPerRow 0
+foreach s $A(colsize) {
+  if {![string is integer -strict $s] || $s<0} {
+    error "bad -colsize value: \"$s\" (expected a non-negative integer)"
+  }
+  incr nPerRow $s
+}
+if {$nPerRow<1} {
+  error "-colsize must consume at least one token per row"
+}
+
+set N [llength $tokens]
+set i 0
+set cols [create_table]
+
+# Build "INSERT INTO tbl VALUES($R(a), $R(b), ...)". The $R(x) references
+# are left for [db eval] to resolve, which is why the "$" is escaped here.
+set sql "INSERT INTO $A(tblname) VALUES(\$R([lindex $cols 0])"
+foreach c [lrange $cols 1 end] {
+  append sql ", \$R($c)"
+}
+append sql ")"
+
+if {$A(trans)} { db eval BEGIN }
+while {$i < $N} {
+  # Take the next $s tokens for each column in turn.
+  foreach c $cols s $A(colsize) {
+    set R($c) [lrange $tokens $i [expr {$i+$s-1}]]
+    incr i $s
+  }
+  db eval $sql
+}
+if {$A(trans)} { db eval COMMIT }
+
+
+
diff --git a/ext/fts5/tool/loadfts5.tcl b/ext/fts5/tool/loadfts5.tcl
new file mode 100644
index 0000000..96fd692
--- /dev/null
+++ b/ext/fts5/tool/loadfts5.tcl
@@ -0,0 +1,172 @@
+
+
+# Return the entire contents of file $f.
+#
+proc loadfile {f} {
+  set chan [open $f]
+  set contents [read $chan]
+  close $chan
+  set contents
+}
+
+# Number of rows inserted into t1 so far (incremented by load_hierachy).
+set ::nRow 0
+# load_hierachy prints one progress dot each time this many rows have been
+# inserted.
+set ::nRowPerDot 1000
+
+# Recursively insert one row into table t1 for every file found beneath
+# directory $dir. Each row holds the file's path and, via the SQL function
+# loadfile(), its contents.
+#
+# Uses the globals ::O(limit) (stop after this many rows, 0 for no limit),
+# ::O(trans) (commit after every N rows, 0 for never), ::nRow and
+# ::nRowPerDot.
+#
+proc load_hierachy {dir} {
+  foreach f [glob -nocomplain -dir $dir *] {
+    # Stop scanning this directory once the row limit has been reached.
+    if {$::O(limit) && $::nRow>=$::O(limit)} break
+
+    if {[file isdirectory $f]} {
+      load_hierachy $f
+      continue
+    }
+
+    db eval { INSERT INTO t1 VALUES($f, loadfile($f)) }
+    set nDone [incr ::nRow]
+
+    # Periodically commit, run an integrity-check and start a new
+    # transaction.
+    set nTrans $::O(trans)
+    if {$nTrans && ($nDone % $nTrans)==0} {
+      db eval { COMMIT }
+      db eval { INSERT INTO t1(t1) VALUES('integrity-check') }
+      db eval { BEGIN }
+    }
+
+    # Progress indicator: a dot every ::nRowPerDot rows, 65 dots per line.
+    if {($nDone % $::nRowPerDot)==0} {
+      puts -nonewline .
+      if {($nDone % (65*$::nRowPerDot))==0} { puts "" }
+      flush stdout
+    }
+  }
+}
+
+# Print the usage message, including the list of supported switches, to
+# stderr and exit with status 1.
+#
+proc usage {} {
+  puts stderr "Usage: $::argv0 ?SWITCHES? DATABASE PATH"
+  puts stderr ""
+  puts stderr "Switches are:"
+  foreach zSwitch {
+    " -fts4 (use fts4 instead of fts5)"
+    " -fts5 (use fts5)"
+    " -porter (use porter tokenizer)"
+    " -delete (delete the database file before starting)"
+    " -limit N (load no more than N documents)"
+    " -automerge N (set the automerge parameter to N)"
+    " -crisismerge N (set the crisismerge parameter to N)"
+    " -prefix PREFIX (comma separated prefix= argument)"
+    " -trans N (commit after N inserts - 0 == never)"
+    " -hashsize N (set the fts5 hashsize parameter to N)"
+    " -detail MODE (detail mode for fts5 tables)"
+  } {
+    puts stderr $zSwitch
+  }
+  exit 1
+}
+
+# Default option values.
+array set O {
+  vtab        fts5
+  tok         ""
+  limit       0
+  delete      0
+  automerge   -1
+  crisismerge -1
+  prefix      ""
+  trans       0
+  hashsize    -1
+  detail      full
+}
+
+# The last two arguments are DATABASE and PATH. Everything before them is
+# a switch, or the value belonging to a switch.
+if {[llength $argv]<2} usage
+set nOpt [expr {[llength $argv]-2}]
+for {set i 0} {$i < $nOpt} {incr i} {
+  set arg [lindex $argv $i]
+  switch -- $arg {
+    -fts4   { set O(vtab) fts4 }
+    -fts5   { set O(vtab) fts5 }
+    -porter { set O(tok) ", tokenize=porter" }
+    -delete { set O(delete) 1 }
+
+    -limit - -trans - -automerge - -crisismerge - -prefix - -hashsize - -detail {
+      # Switches that take a value: the O() key is the switch name without
+      # its leading "-", and the value is the next argument.
+      if { [incr i]>=$nOpt } usage
+      set O([string range $arg 1 end]) [lindex $argv $i]
+    }
+
+    default { usage }
+  }
+}
+
+# The second-to-last argument is the database file.
+set dbfile [lindex $argv end-1]
+if {$O(delete)} { file delete -force $dbfile }
+sqlite3 db $dbfile
+catch { load_static_extension db fts5 }
+db func loadfile loadfile
+db eval "PRAGMA page_size=4096"
+
+db eval BEGIN
+
+# Assemble the trailing options for the CREATE VIRTUAL TABLE statement.
+set pref ""
+if {$O(prefix) ne ""} { set pref ", prefix='$O(prefix)'" }
+if {$O(vtab) eq "fts5"} { append pref ", detail=$O(detail)" }
+
+# Errors from creating and configuring the table are deliberately ignored.
+# NOTE(review): presumably so that an existing table can be reused and so
+# that the fts5-only 'pgsz' setting does not abort an fts4 load - confirm.
+catch {
+  db eval "CREATE VIRTUAL TABLE t1 USING $O(vtab) (path, content$O(tok)$pref)"
+  db eval "INSERT INTO t1(t1, rank) VALUES('pgsz', 4050);"
+}
+
+if {$O(hashsize)>=0} {
+  catch {
+    db eval "INSERT INTO t1(t1, rank) VALUES('hashsize', $O(hashsize));"
+  }
+}
+
+# Negative values mean "not specified on the command line".
+if {$O(automerge)>=0} {
+  if {$O(vtab) eq "fts5"} {
+    db eval { INSERT INTO t1(t1, rank) VALUES('automerge', $O(automerge)) }
+  } else {
+    db eval { INSERT INTO t1(t1) VALUES('automerge=' || $O(automerge)) }
+  }
+}
+
+# The crisismerge setting is applied to fts5 tables only.
+if {$O(crisismerge)>=0 && $O(vtab) eq "fts5"} {
+  db eval {INSERT INTO t1(t1, rank) VALUES('crisismerge', $O(crisismerge))}
+}
+
+# The last argument is the directory to load.
+load_hierachy [lindex $argv end]
+db eval COMMIT
+puts ""
+
+
+
diff --git a/ext/fts5/tool/mkfts5c.tcl b/ext/fts5/tool/mkfts5c.tcl
new file mode 100644
index 0000000..b1a55fa
--- /dev/null
+++ b/ext/fts5/tool/mkfts5c.tcl
@@ -0,0 +1,114 @@
+#!/bin/sh
+# restart with tclsh \
+exec tclsh "$0" "$@"
+
+# Directory two levels above this script.
+set srcdir [file dirname [file dirname [info script]]]
+
+# G(src): the list of files concatenated, in this order, into the output.
+# Each %dir% is replaced by $srcdir. fts5parse.h and fts5parse.c have no
+# directory component, so they are opened relative to the current working
+# directory.
+set G(src) [string map [list %dir% $srcdir] {
+ %dir%/fts5.h
+ %dir%/fts5Int.h
+ fts5parse.h
+ fts5parse.c
+ %dir%/fts5_aux.c
+ %dir%/fts5_buffer.c
+ %dir%/fts5_config.c
+ %dir%/fts5_expr.c
+ %dir%/fts5_hash.c
+ %dir%/fts5_index.c
+ %dir%/fts5_main.c
+ %dir%/fts5_storage.c
+ %dir%/fts5_tokenize.c
+ %dir%/fts5_unicode2.c
+ %dir%/fts5_varint.c
+ %dir%/fts5_vocab.c
+}]
+
+# G(hdr): text written at the very start of the output file. It opens the
+# #if block that G(footer) closes.
+set G(hdr) {
+
+#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS5)
+
+#if !defined(NDEBUG) && !defined(SQLITE_DEBUG)
+# define NDEBUG 1
+#endif
+#if defined(NDEBUG) && defined(SQLITE_DEBUG)
+# undef NDEBUG
+#endif
+
+}
+
+# G(footer): text written at the very end of the output file.
+set G(footer) {
+
+#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS5) */
+}
+
+#-------------------------------------------------------------------------
+# Return the full text of file $zFile as a single string.
+#
+proc readfile {zFile} {
+  set chan [open $zFile]
+  set zText [read $chan]
+  close $chan
+  set zText
+}
+
+#-------------------------------------------------------------------------
+# This command returns a string identifying the current sqlite version -
+# the equivalent of the SQLITE_SOURCE_ID string.
+#
+# $zDir is the fts5 source directory. The files "manifest" and
+# "manifest.uuid" are read from the directory two levels above it. The
+# result has the form "fts5: DATE TIME UUID".
+#
+# An error is raised if the manifest contains no "D" card. Previously the
+# first token of the manifest was silently used as the date in that case.
+#
+proc fts5_source_id {zDir} {
+  set top [file dirname [file dirname $zDir]]
+  set uuid [string trim [readfile [file join $top manifest.uuid]]]
+
+  # The check-in date is the token that follows the "D" card.
+  set zManifest [file join $top manifest]
+  set L [split [readfile $zManifest]]
+  set iD [lsearch -exact $L D]
+  if {$iD<0} {
+    error "no D card found in $zManifest"
+  }
+  set date [lindex $L [expr {$iD+1}]]
+
+  # Strip the fractional seconds, if there are any. A date without a "."
+  # is kept as it is rather than being truncated to an empty string.
+  set iDot [string last . $date]
+  if {$iDot>=0} {
+    set date [string range $date 0 [expr {$iDot-1}]]
+  }
+  set date [string map {T { }} $date]
+
+  return "fts5: $date $uuid"
+}
+
+# Open the output and write the fixed header text G(hdr) to it.
+#
+# $zOut is the name of the file to create. If it is an empty string the
+# output is written to stdout instead. The channel is stored in G(fd) for
+# use by fts5c_printfile and fts5c_close.
+#
+proc fts5c_init {zOut} {
+  global G
+  if {$zOut eq ""} {
+    set G(fd) stdout
+  } else {
+    set G(fd) [open $zOut w]
+  }
+
+  puts -nonewline $G(fd) $G(hdr)
+}
+
+# Append the contents of source file $zIn to the output channel G(fd),
+# preceded by a "#line 1" directive naming the file. Each line is
+# transformed as follows:
+#
+#   * #include lines that mention fts5 are commented out.
+#   * Lines that look like the start of a definition of an sqlite3Fts5*
+#     symbol, other than sqlite3Fts5Init(), are prefixed with "static".
+#   * --FTS5-SOURCE-ID-- is replaced by the source-id string. In
+#     fts5parse.c only, "yy", "YY" and "TOKEN" are also renamed.
+#
+proc fts5c_printfile {zIn} {
+  global G
+  set zText [readfile $zIn]
+  set zName [file tail $zIn]
+  puts $G(fd) "#line 1 \"$zName\""
+
+  set lMap [list --FTS5-SOURCE-ID-- [fts5_source_id $::srcdir]]
+  if {$zName eq "fts5parse.c"} {
+    lappend lMap yy fts5yy YY fts5YY TOKEN FTS5TOKEN
+  }
+
+  foreach zLine [split $zText "\n"] {
+    if {[regexp {^#include.*fts5} $zLine]} {
+      set zLine "/* $zLine */"
+    } else {
+      set bDecl [regexp {^(const )?[a-zA-Z][a-zA-Z0-9]* [*]?sqlite3Fts5} $zLine]
+      set bInit [regexp { sqlite3Fts5Init\(} $zLine]
+      if {$bDecl && !$bInit} {
+        set zLine "static $zLine"
+      }
+    }
+    puts $G(fd) [string map $lMap $zLine]
+  }
+}
+
+# Write the footer text G(footer) to the output channel G(fd), then close
+# the channel unless it is stdout.
+#
+proc fts5c_close {} {
+  global G
+  set fd $G(fd)
+  puts -nonewline $fd $G(footer)
+  if {$fd eq "stdout"} return
+  close $fd
+}
+
+
+# Main program: create fts5.c and write the header to it, append each of
+# the files listed in G(src) in order, then write the footer and close.
+fts5c_init fts5.c
+foreach f $G(src) { fts5c_printfile $f }
+fts5c_close
diff --git a/ext/fts5/tool/showfts5.tcl b/ext/fts5/tool/showfts5.tcl
new file mode 100644
index 0000000..75ac0f1
--- /dev/null
+++ b/ext/fts5/tool/showfts5.tcl
@@ -0,0 +1,97 @@
+
+
+
+#-------------------------------------------------------------------------
+# Process command line arguments.
+#
+# Print the usage message to stderr and exit with status 1.
+#
+proc usage {} {
+  set lMsg [list \
+    "usage: $::argv0 ?OPTIONS? database table" \
+    "" \
+    " -nterm (count number of terms in each segment)" \
+    " -segments (output segment contents)" \
+    "" \
+  ]
+  foreach zMsg $lMsg {
+    puts stderr $zMsg
+  }
+  exit 1
+}
+
+# Both options are off by default.
+array set O {nterm 0 segments 0}
+
+# The last two arguments are the database and the table. Everything before
+# them must be a recognized option.
+if {[llength $argv]<2} usage
+foreach a [lrange $argv 0 end-2] {
+  if {$a eq "-nterm"} {
+    set O(nterm) 1
+  } elseif {$a eq "-segments"} {
+    set O(segments) 1
+  } else {
+    usage
+  }
+}
+
+set database [lindex $argv end-1]
+set tbl [lindex $argv end]
+
+
+#-------------------------------------------------------------------------
+# Count the number of terms in each segment of fts5 table $tbl. Store the
+# counts in the array variable in the parent context named by parameter
+# $arrayname, indexed by segment-id. Example:
+#
+#   count_terms fts_tbl A
+#   foreach {k v} [array get A] { puts "segid=$k nTerm=$v" }
+#
+# Any existing contents of the array are discarded first. Segments that
+# contain no terms get no entry in the array.
+#
+proc count_terms {tbl arrayname} {
+  # Link local array A to the caller's variable named by $arrayname. The
+  # arguments to [upvar] are "otherVar myVar", in that order.
+  upvar $arrayname A
+  array unset A
+  db eval "SELECT fts5_decode(rowid, block) AS d FROM ${tbl}_data" {
+    # The first element of the decoded record describes it. Only records
+    # whose description begins with "segid=N" are counted.
+    set desc [lindex $d 0]
+    if {[regexp {^segid=([0-9]*)} $desc -> id]} {
+      foreach i [lrange $d 1 end] {
+        if {[string match {term=*} $i]} { incr A($id) }
+      }
+    }
+  }
+}
+
+
+#-------------------------------------------------------------------------
+# Main program: open the database, then print the structure record and,
+# if requested, the decoded contents of all other records.
+#
+sqlite3 db $database
+catch { load_static_extension db fts5 }
+
+if {$O(nterm)} { count_terms $tbl A }
+
+# Common prefix of the queries below.
+set zDecode "SELECT fts5_decode(rowid, block) AS d FROM ${tbl}_data"
+
+# The record with id=10: one list element per level after the first.
+db eval "$zDecode WHERE id=10" {
+  foreach lvl [lrange $d 1 end] {
+    puts [lrange $lvl 0 2]
+
+    foreach seg [lrange $lvl 3 end] {
+      set zLine [format " % -28s" $seg]
+      if {$::O(nterm)} {
+        # Look up the term count for this segment. It is 0 if the segment
+        # has no entry in A().
+        regexp {^id=([0-9]*)} $seg -> id
+        set nTerm 0
+        catch { set nTerm $A($id) }
+        append zLine [format " nTerm=%d" $nTerm]
+      }
+      puts $zLine
+    }
+  }
+}
+
+if {$O(segments)} {
+  puts ""
+  db eval "$zDecode WHERE id>10" {
+    puts $d
+  }
+}
+
+
+
+
+