diff options
Diffstat (limited to '')
-rw-r--r-- | ext/fts5/tool/fts5txt2db.tcl | 231 |
1 files changed, 231 insertions, 0 deletions
diff --git a/ext/fts5/tool/fts5txt2db.tcl b/ext/fts5/tool/fts5txt2db.tcl new file mode 100644 index 0000000..1996b2c --- /dev/null +++ b/ext/fts5/tool/fts5txt2db.tcl @@ -0,0 +1,231 @@ +########################################################################## +# 2016 Jan 27 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +proc process_cmdline {} { + cmdline::process ::A $::argv { + {fts5 "use fts5 (this is the default)"} + {fts4 "use fts4"} + {trigram "Use tokenize=trigram"} + {colsize "10 10 10" "list of column sizes"} + {tblname "t1" "table name to create"} + {detail "full" "Fts5 detail mode to use"} + {repeat 1 "Load each file this many times"} + {prefix "" "Fts prefix= option"} + {trans 1 "True to use a transaction"} + database + file... + } { + This script is designed to create fts4/5 tables with more than one column. + The -colsize option should be set to a Tcl list of integer values, one for + each column in the table. Each value is the number of tokens that will be + inserted into the column value for each row. For example, setting the -colsize + option to "5 10" creates an FTS table with 2 columns, with roughly 5 and 10 + tokens per row in each, respectively. + + Each "FILE" argument should be a text file. The contents of these text files + is split on whitespace characters to form a list of tokens. The first N1 + tokens are used for the first column of the first row, where N1 is the first + element of the -colsize list. The next N2 are used for the second column of + the first row, and so on. Rows are added to the table until the entire list + of tokens is exhausted. + } +} + +########################################################################### +########################################################################### +# Command line options processor. This is generic code that can be copied +# between scripts. +# +namespace eval cmdline { + proc cmdline_error {O E {msg ""}} { + if {$msg != ""} { + puts stderr "Error: $msg" + puts stderr "" + } + + set L [list] + foreach o $O { + if {[llength $o]==1} { + lappend L [string toupper $o] + } + } + + puts stderr "Usage: $::argv0 ?SWITCHES? $L" + puts stderr "" + puts stderr "Switches are:" + foreach o $O { + if {[llength $o]==3} { + foreach {a b c} $o {} + puts stderr [format " -%-15s %s (default \"%s\")" "$a VAL" $c $b] + } elseif {[llength $o]==2} { + foreach {a b} $o {} + puts stderr [format " -%-15s %s" $a $b] + } + } + puts stderr "" + puts stderr $E + exit -1 + } + + proc process {avar lArgs O E} { + upvar $avar A + set zTrailing "" ;# True if ... is present in $O + set lPosargs [list] + + # Populate A() with default values. Also, for each switch in the command + # line spec, set an entry in the idx() array as follows: + # + # {tblname t1 "table name to use"} + # -> [set idx(-tblname) {tblname t1 "table name to use"} + # + # For each position parameter, append its name to $lPosargs. If the ... + # specifier is present, set $zTrailing to the name of the prefix. + # + foreach o $O { + set nm [lindex $o 0] + set nArg [llength $o] + switch -- $nArg { + 1 { + if {[string range $nm end-2 end]=="..."} { + set zTrailing [string range $nm 0 end-3] + } else { + lappend lPosargs $nm + } + } + 2 { + set A($nm) 0 + set idx(-$nm) $o + } + 3 { + set A($nm) [lindex $o 1] + set idx(-$nm) $o + } + default { + error "Error in command line specification" + } + } + } + + # Set explicitly specified option values + # + set nArg [llength $lArgs] + for {set i 0} {$i < $nArg} {incr i} { + set opt [lindex $lArgs $i] + if {[string range $opt 0 0]!="-" || $opt=="--"} break + set c [array names idx "${opt}*"] + if {[llength $c]==0} { cmdline_error $O $E "Unrecognized option: $opt"} + if {[llength $c]>1} { cmdline_error $O $E "Ambiguous option: $opt"} + + if {[llength $idx($c)]==3} { + if {$i==[llength $lArgs]-1} { + cmdline_error $O $E "Option requires argument: $c" + } + incr i + set A([lindex $idx($c) 0]) [lindex $lArgs $i] + } else { + set A([lindex $idx($c) 0]) 1 + } + } + + # Deal with position arguments. + # + set nPosarg [llength $lPosargs] + set nRem [expr $nArg - $i] + if {$nRem < $nPosarg || ($zTrailing=="" && $nRem > $nPosarg)} { + cmdline_error $O $E + } + for {set j 0} {$j < $nPosarg} {incr j} { + set A([lindex $lPosargs $j]) [lindex $lArgs [expr $j+$i]] + } + if {$zTrailing!=""} { + set A($zTrailing) [lrange $lArgs [expr $j+$i] end] + } + } +} ;# namespace eval cmdline +# End of command line options processor. +########################################################################### +########################################################################### + +process_cmdline + +# If -fts4 was specified, use fts4. Otherwise, fts5. +if {$A(fts4)} { + set A(fts) fts4 +} else { + set A(fts) fts5 +} + +sqlite3 db $A(database) + +# Create the FTS table in the db. Return a list of the table columns. +# +proc create_table {} { + global A + set cols [list a b c d e f g h i j k l m n o p q r s t u v w x y z] + + set nCol [llength $A(colsize)] + set cols [lrange $cols 0 [expr $nCol-1]] + + set sql "CREATE VIRTUAL TABLE IF NOT EXISTS $A(tblname) USING $A(fts) (" + append sql [join $cols ,] + if {$A(fts)=="fts5"} { append sql ",detail=$A(detail)" } + if {$A(trigram)} { append sql ",tokenize=trigram" } + append sql ", prefix='$A(prefix)');" + + db eval $sql + return $cols +} + +# Return a list of tokens from the named file. +# +proc readfile {file} { + set fd [open $file] + set data [read $fd] + close $fd + split $data +} + +proc repeat {L n} { + set res [list] + for {set i 0} {$i < $n} {incr i} { + set res [concat $res $L] + } + set res +} + + +# Load all the data into a big list of tokens. +# +set tokens [list] +foreach f $A(file) { + set tokens [concat $tokens [repeat [readfile $f] $A(repeat)]] +} + +set N [llength $tokens] +set i 0 +set cols [create_table] +set sql "INSERT INTO $A(tblname) VALUES(\$R([lindex $cols 0])" +foreach c [lrange $cols 1 end] { + append sql ", \$R($c)" +} +append sql ")" + +if {$A(trans)} { db eval BEGIN } + while {$i < $N} { + foreach c $cols s $A(colsize) { + set R($c) [lrange $tokens $i [expr $i+$s-1]] + incr i $s + } + db eval $sql + } +if {$A(trans)} { db eval COMMIT } + + + |