summaryrefslogtreecommitdiffstats
path: root/ext/fts5/tool/fts5txt2db.tcl
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--ext/fts5/tool/fts5txt2db.tcl231
1 files changed, 231 insertions, 0 deletions
diff --git a/ext/fts5/tool/fts5txt2db.tcl b/ext/fts5/tool/fts5txt2db.tcl
new file mode 100644
index 0000000..1996b2c
--- /dev/null
+++ b/ext/fts5/tool/fts5txt2db.tcl
@@ -0,0 +1,231 @@
+##########################################################################
+# 2016 Jan 27
+#
+# The author disclaims copyright to this source code. In place of
+# a legal notice, here is a blessing:
+#
+# May you do good and not evil.
+# May you find forgiveness for yourself and forgive others.
+# May you share freely, never taking more than you give.
+#
+proc process_cmdline {} {
+ cmdline::process ::A $::argv {
+ {fts5 "use fts5 (this is the default)"}
+ {fts4 "use fts4"}
+ {trigram "Use tokenize=trigram"}
+ {colsize "10 10 10" "list of column sizes"}
+ {tblname "t1" "table name to create"}
+ {detail "full" "Fts5 detail mode to use"}
+ {repeat 1 "Load each file this many times"}
+ {prefix "" "Fts prefix= option"}
+ {trans 1 "True to use a transaction"}
+ database
+ file...
+ } {
+ This script is designed to create fts4/5 tables with more than one column.
+ The -colsize option should be set to a Tcl list of integer values, one for
+ each column in the table. Each value is the number of tokens that will be
+ inserted into the column value for each row. For example, setting the -colsize
+ option to "5 10" creates an FTS table with 2 columns, with roughly 5 and 10
+ tokens per row in each, respectively.
+
+ Each "FILE" argument should be a text file. The contents of these text files
+ is split on whitespace characters to form a list of tokens. The first N1
+ tokens are used for the first column of the first row, where N1 is the first
+ element of the -colsize list. The next N2 are used for the second column of
+ the first row, and so on. Rows are added to the table until the entire list
+ of tokens is exhausted.
+ }
+}
+
+###########################################################################
+###########################################################################
+# Command line options processor. This is generic code that can be copied
+# between scripts.
+#
+namespace eval cmdline {
+ proc cmdline_error {O E {msg ""}} {
+ if {$msg != ""} {
+ puts stderr "Error: $msg"
+ puts stderr ""
+ }
+
+ set L [list]
+ foreach o $O {
+ if {[llength $o]==1} {
+ lappend L [string toupper $o]
+ }
+ }
+
+ puts stderr "Usage: $::argv0 ?SWITCHES? $L"
+ puts stderr ""
+ puts stderr "Switches are:"
+ foreach o $O {
+ if {[llength $o]==3} {
+ foreach {a b c} $o {}
+ puts stderr [format " -%-15s %s (default \"%s\")" "$a VAL" $c $b]
+ } elseif {[llength $o]==2} {
+ foreach {a b} $o {}
+ puts stderr [format " -%-15s %s" $a $b]
+ }
+ }
+ puts stderr ""
+ puts stderr $E
+ exit -1
+ }
+
+ proc process {avar lArgs O E} {
+ upvar $avar A
+ set zTrailing "" ;# True if ... is present in $O
+ set lPosargs [list]
+
+ # Populate A() with default values. Also, for each switch in the command
+ # line spec, set an entry in the idx() array as follows:
+ #
+ # {tblname t1 "table name to use"}
+ # -> [set idx(-tblname) {tblname t1 "table name to use"}
+ #
+ # For each position parameter, append its name to $lPosargs. If the ...
+ # specifier is present, set $zTrailing to the name of the prefix.
+ #
+ foreach o $O {
+ set nm [lindex $o 0]
+ set nArg [llength $o]
+ switch -- $nArg {
+ 1 {
+ if {[string range $nm end-2 end]=="..."} {
+ set zTrailing [string range $nm 0 end-3]
+ } else {
+ lappend lPosargs $nm
+ }
+ }
+ 2 {
+ set A($nm) 0
+ set idx(-$nm) $o
+ }
+ 3 {
+ set A($nm) [lindex $o 1]
+ set idx(-$nm) $o
+ }
+ default {
+ error "Error in command line specification"
+ }
+ }
+ }
+
+ # Set explicitly specified option values
+ #
+ set nArg [llength $lArgs]
+ for {set i 0} {$i < $nArg} {incr i} {
+ set opt [lindex $lArgs $i]
+ if {[string range $opt 0 0]!="-" || $opt=="--"} break
+ set c [array names idx "${opt}*"]
+ if {[llength $c]==0} { cmdline_error $O $E "Unrecognized option: $opt"}
+ if {[llength $c]>1} { cmdline_error $O $E "Ambiguous option: $opt"}
+
+ if {[llength $idx($c)]==3} {
+ if {$i==[llength $lArgs]-1} {
+ cmdline_error $O $E "Option requires argument: $c"
+ }
+ incr i
+ set A([lindex $idx($c) 0]) [lindex $lArgs $i]
+ } else {
+ set A([lindex $idx($c) 0]) 1
+ }
+ }
+
+ # Deal with position arguments.
+ #
+ set nPosarg [llength $lPosargs]
+ set nRem [expr $nArg - $i]
+ if {$nRem < $nPosarg || ($zTrailing=="" && $nRem > $nPosarg)} {
+ cmdline_error $O $E
+ }
+ for {set j 0} {$j < $nPosarg} {incr j} {
+ set A([lindex $lPosargs $j]) [lindex $lArgs [expr $j+$i]]
+ }
+ if {$zTrailing!=""} {
+ set A($zTrailing) [lrange $lArgs [expr $j+$i] end]
+ }
+ }
+} ;# namespace eval cmdline
+# End of command line options processor.
+###########################################################################
+###########################################################################
+
+process_cmdline
+
+# If -fts4 was specified, use fts4. Otherwise, fts5.
+if {$A(fts4)} {
+ set A(fts) fts4
+} else {
+ set A(fts) fts5
+}
+
+sqlite3 db $A(database)
+
+# Create the FTS table in the db. Return a list of the table columns.
+#
+proc create_table {} {
+ global A
+ set cols [list a b c d e f g h i j k l m n o p q r s t u v w x y z]
+
+ set nCol [llength $A(colsize)]
+ set cols [lrange $cols 0 [expr $nCol-1]]
+
+ set sql "CREATE VIRTUAL TABLE IF NOT EXISTS $A(tblname) USING $A(fts) ("
+ append sql [join $cols ,]
+ if {$A(fts)=="fts5"} { append sql ",detail=$A(detail)" }
+ if {$A(trigram)} { append sql ",tokenize=trigram" }
+ append sql ", prefix='$A(prefix)');"
+
+ db eval $sql
+ return $cols
+}
+
+# Return a list of tokens from the named file.
+#
+proc readfile {file} {
+ set fd [open $file]
+ set data [read $fd]
+ close $fd
+ split $data
+}
+
+proc repeat {L n} {
+ set res [list]
+ for {set i 0} {$i < $n} {incr i} {
+ set res [concat $res $L]
+ }
+ set res
+}
+
+
+# Load all the data into a big list of tokens.
+#
+set tokens [list]
+foreach f $A(file) {
+ set tokens [concat $tokens [repeat [readfile $f] $A(repeat)]]
+}
+
+set N [llength $tokens]
+set i 0
+set cols [create_table]
+set sql "INSERT INTO $A(tblname) VALUES(\$R([lindex $cols 0])"
+foreach c [lrange $cols 1 end] {
+ append sql ", \$R($c)"
+}
+append sql ")"
+
+if {$A(trans)} { db eval BEGIN }
+ while {$i < $N} {
+ foreach c $cols s $A(colsize) {
+ set R($c) [lrange $tokens $i [expr $i+$s-1]]
+ incr i $s
+ }
+ db eval $sql
+ }
+if {$A(trans)} { db eval COMMIT }
+
+
+