diff options
Diffstat (limited to '')
-rw-r--r-- | ext/fts5/test/fts5tokenizer.test | 304 |
1 files changed, 304 insertions, 0 deletions
# 2014 Dec 20
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
# Tests focusing on the built-in fts5 tokenizers.
#

source [file join [file dirname [info script]] fts5_common.tcl]
set testprefix fts5tokenizer

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}

#-------------------------------------------------------------------------
# 1.0-1.4: The tokenize= option is accepted with or without quotes around
#          its value and with or without whitespace around the '='.
#
do_execsql_test 1.0 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
  DROP TABLE ft1;
}
do_execsql_test 1.1 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize='porter');
  DROP TABLE ft1;
}
do_execsql_test 1.2 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = porter);
  DROP TABLE ft1;
}
do_execsql_test 1.3 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter');
  DROP TABLE ft1;
}
do_execsql_test 1.4 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter ascii');
  DROP TABLE ft1;
}

# 1.5: Naming a tokenizer that does not exist is an error.
do_catchsql_test 1.5 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'nosuch');
} {1 {no such tokenizer: nosuch}}

# 1.6: The same unknown name following "porter" is reported as a
#      constructor error instead (presumably because porter treats the
#      word as the name of its parent tokenizer).
do_catchsql_test 1.6 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter nosuch');
} {1 {error in tokenizer constructor}}

#-------------------------------------------------------------------------
# 2.*: With the porter tokenizer, queries for 'embedding' and 'database'
#      match a row that contains 'embedded databases'.
#
do_execsql_test 2.0 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
  INSERT INTO ft1 VALUES('embedded databases');
}
do_execsql_test 2.1 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'embedding' } 1
do_execsql_test 2.2 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'database' } 1
do_execsql_test 2.3 {
  SELECT rowid FROM ft1 WHERE ft1 MATCH 'database embedding'
} 1

#-------------------------------------------------------------------------
# 3.*: Arguments that follow the tokenizer name are handed to the
#      tokenizer constructor.
#
# tcl_create is the constructor for the test tokenizer "tcl". It records
# the arguments it was invoked with in ::targs and then always fails, so
# that each CREATE VIRTUAL TABLE below returns a constructor error while
# leaving the received arguments available for inspection.
#
proc tcl_create {args} {
  set ::targs $args
  error "failed"
}
sqlite3_fts5_create_tokenizer db tcl tcl_create

# Each row is: test number, tokenize directive, arguments the constructor
# is expected to receive (SQL quoting removed).
foreach {tn directive expected} {
  1 {tokenize='tcl a b c'}              {a b c}
  2 {tokenize='tcl ''d'' ''e'' ''f'''}  {d e f}
  3 {tokenize="tcl 'g' 'h' 'i'"}        {g h i}
  4 {tokenize = tcl}                    {}
} {
  do_catchsql_test 3.$tn.1 "
    CREATE VIRTUAL TABLE ft2 USING fts5(x, $directive)
  " {1 {error in tokenizer constructor}}
  do_test 3.$tn.2 { set ::targs } $expected
}

#-------------------------------------------------------------------------
# 4.1: An unquoted tokenize= value consisting of more than one word is a
#      parse error.
# 4.2: A second bare word in a column definition is reported as an
#      unrecognized column option.
#
do_catchsql_test 4.1 {
  CREATE VIRTUAL TABLE ft2 USING fts5(x, tokenize = tcl abc);
} {1 {parse error in "tokenize = tcl abc"}}
do_catchsql_test 4.2 {
  CREATE VIRTUAL TABLE ft2 USING fts5(x y)
} {1 {unrecognized column option: y}}

#-------------------------------------------------------------------------
# Test the "separators" and "tokenchars" options a bit.
#
# The same checks run against both the ascii and unicode61 tokenizers.
# With ',', '.' and ':' configured as token characters and 'x', 'y' and
# 'z' as separators, the inserted text is expected to match the phrases
# "abc", "def", "ghi", "jkl.mno,pqr:stu" and "vw", but not "jkl", "mno",
# "pqr" or "stu" on their own.
#
# These tests are numbered 5.0.* so that their names cannot collide with
# the 5.1.* tests of the ascii tokenizer section that follows.
#
foreach {tn tokenizer} {1 ascii 2 unicode61} {
  reset_db
  set T "$tokenizer tokenchars ',.:' separators 'xyz'"
  execsql "CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = \"$T\")"
  do_execsql_test 5.0.$tn.1 {
    INSERT INTO t1 VALUES('abcxdefyghizjkl.mno,pqr:stu/vwx+yz');
  }
  foreach {tn2 token res} {
    1 abc 1    2 def 1    3 ghi 1    4 jkl {}
    5 mno {}   6 pqr {}   7 stu {}   8 jkl.mno,pqr:stu 1
    9 vw 1
  } {
    do_execsql_test 5.0.$tn.2.$tn2 "
      SELECT rowid FROM t1 WHERE t1 MATCH '\"$token\"'
    " $res
  }
}

#-------------------------------------------------------------------------
# Miscellaneous tests for the ascii tokenizer.
#
# 5.1.*: Test that the ascii tokenizer ignores non-ASCII characters in the
#        'separators' option. But unicode61 does not.
#
# 5.2, 5.3: An option without an argument (5.2) and an unknown option
#        name (5.3) are both errors.
#

do_test 5.1.1 {
  execsql "
    CREATE VIRTUAL TABLE a1 USING fts5(x, tokenize=`ascii separators '\u1234'`);
    INSERT INTO a1 VALUES('abc\u1234def');
  "
  execsql { SELECT rowid FROM a1 WHERE a1 MATCH 'def' }
} {}

do_test 5.1.2 {
  execsql "
    CREATE VIRTUAL TABLE a2 USING fts5(
        x, tokenize=`unicode61 separators '\u1234'`);
    INSERT INTO a2 VALUES('abc\u1234def');
  "
  execsql { SELECT rowid FROM a2 WHERE a2 MATCH 'def' }
} {1}

do_catchsql_test 5.2 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii tokenchars');
} {1 {error in tokenizer constructor}}
do_catchsql_test 5.3 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii opt arg');
} {1 {error in tokenizer constructor}}

#-------------------------------------------------------------------------
# Test that the ASCII and unicode61 tokenizers both handle SQLITE_DONE
# correctly.
#

# Token callback. Appends $token to the list variable named $varname in
# the calling frame. Returns "SQLITE_DONE" once that list holds exactly
# three tokens and "SQLITE_OK" otherwise. iStart and iEnd are accepted
# but not used.
proc test_token_cb {varname token iStart iEnd} {
  upvar $varname tokens
  lappend tokens $token
  return [expr {[llength $tokens] == 3 ? "SQLITE_DONE" : "SQLITE_OK"}]
}

# Auxiliary function body. Runs the xTokenize method of $cmd over the
# text of column 0, collecting tokens via test_token_cb, and returns the
# collected list.
proc tokenize {cmd} {
  set collected [list]
  set text [$cmd xColumnText 0]
  $cmd xTokenize $text [list test_token_cb collected]
  return $collected
}
sqlite3_fts5_create_function db tokenize tokenize

do_execsql_test 6.0 {
  CREATE VIRTUAL TABLE x1 USING fts5(a, tokenize=ascii);
  INSERT INTO x1 VALUES('q w e r t y');
  INSERT INTO x1 VALUES('y t r e w q');
  SELECT tokenize(x1) FROM x1 WHERE x1 MATCH 'e AND r';
} {
  {q w e} {y t r}
}

do_execsql_test 6.1 {
  CREATE VIRTUAL TABLE x2 USING fts5(a, tokenize=unicode61);
  INSERT INTO x2 VALUES('q w e r t y');
  INSERT INTO x2 VALUES('y t r e w q');
  SELECT tokenize(x2) FROM x2 WHERE x2 MATCH 'e AND r';
} {
  {q w e} {y t r}
}


#-------------------------------------------------------------------------
# Miscellaneous tests for the unicode tokenizer.
#
# Each of the following tokenizer specifications must be rejected by the
# unicode61 constructor: an option with no argument, a word that is not
# followed by an argument pair the constructor accepts, and the
# remove_diacritics values 3 and 10.
#
# These tests are numbered from 6.2 because the names 6.0 and 6.1 are
# already used by the SQLITE_DONE tests earlier in this file.
#
do_catchsql_test 6.2 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 tokenchars');
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.3 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 a b');
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.4 {
  CREATE VIRTUAL TABLE a3 USING fts5(
    x, y, tokenize = 'unicode61 remove_diacritics 3'
  );
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.5 {
  CREATE VIRTUAL TABLE a3 USING fts5(
    x, y, tokenize = 'unicode61 remove_diacritics 10'
  );
} {1 {error in tokenizer constructor}}

#-------------------------------------------------------------------------
# Porter tokenizer with very large tokens.
#
# $a, $b and $c are single-character runs of 100, 500 and 1000 characters.
# Each row holds two of them, and each query must find exactly the two
# rows that contain the queried token.
#
set a [string repeat a 100]
set b [string repeat b 500]
set c [string repeat c 1000]
do_execsql_test 7.0 {
  CREATE VIRTUAL TABLE e5 USING fts5(x, tokenize=porter);
  INSERT INTO e5 VALUES($a || ' ' || $b);
  INSERT INTO e5 VALUES($b || ' ' || $c);
  INSERT INTO e5 VALUES($c || ' ' || $a);
}

do_execsql_test 7.1 {SELECT rowid FROM e5 WHERE e5 MATCH $a} { 1 3 }
do_execsql_test 7.2 {SELECT rowid FROM e5 WHERE e5 MATCH $b} { 1 2 }
do_execsql_test 7.3 {SELECT rowid FROM e5 WHERE e5 MATCH $c} { 2 3 }

#-------------------------------------------------------------------------
# Test the 'separators' option with the unicode61 tokenizer.
#
# Each test creates its tables inside BEGIN/ROLLBACK so that the names e6
# and e7 can be reused by the next test. The fts5vocab table e7 is used to
# read back the terms that were actually indexed.
#
# 8.1: Every upper-case ASCII letter is configured as a separator, so the
#      inserted string is expected to index as its lower-case words only.
do_execsql_test 8.1 {
  BEGIN;
    CREATE VIRTUAL TABLE e6 USING fts5(x,
      tokenize="unicode61 separators ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    );
    INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog');
    CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
    SELECT term FROM e7;
  ROLLBACK;
} {
  brown dog fox jumped lazy over quick the
}

# 8.2: As 8.1, but with the non-ASCII characters U+0E01..U+0E07 as the
#      separators. The second row checks that U+0E07 splits its neighbours
#      U+0E08 and U+0E09 into two separate terms. [subst] is used so that
#      the \uXXXX escapes are expanded by Tcl in both the SQL and the
#      expected result.
do_execsql_test 8.2 [subst {
  BEGIN;
    CREATE VIRTUAL TABLE e6 USING fts5(x,
      tokenize="unicode61 separators '\u0E01\u0E02\u0E03\u0E04\u0E05\u0E06\u0E07'"
    );
    INSERT INTO e6 VALUES('the\u0E01quick\u0E01brown\u0E01fox\u0E01'
      || 'jumped\u0E01over\u0E01the\u0E01lazy\u0E01dog'
    );
    INSERT INTO e6 VALUES('\u0E08\u0E07\u0E09');
    CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
    SELECT term FROM e7;
  ROLLBACK;
}] [subst {
  brown dog fox jumped lazy over quick the \u0E08 \u0E09
}]

# Test that the porter tokenizer correctly passes arguments through to
# its parent tokenizer.
#
# 8.3: Same input as 8.1, so the separators must have taken effect; the
#      expected terms differ from 8.1 only in "jump" and "lazi".
do_execsql_test 8.3 {
  BEGIN;
    CREATE VIRTUAL TABLE e6 USING fts5(x,
      tokenize="porter unicode61 separators ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    );
    INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog');
    CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
    SELECT term FROM e7;
  ROLLBACK;
} {
  brown dog fox jump lazi over quick the
}

#-------------------------------------------------------------------------
# Check that the FTS5_TOKENIZE_PREFIX flag is passed to the tokenizer
# implementation.
#
reset_db

# Constructor for the test tokenizer "tcl": returns the name of the
# command to invoke for tokenizing.
proc tcl_create {args} {
  return "tcl_tokenize"
}
sqlite3_fts5_create_tokenizer db tcl tcl_create

# Tokenizer body. Appends the flags value of every invocation to the
# global list ::flags, then reports each token produced by
# fts5_tokenize_split together with its start and end offsets.
set ::flags {}
proc tcl_tokenize {tflags text} {
  lappend ::flags $tflags
  foreach {word first last} [fts5_tokenize_split $text] {
    sqlite3_fts5_token $word $first $last
  }
}

# 9.1: Inserting two rows invokes the tokenizer once per row, each time
#      with the "document" flag.
do_execsql_test 9.1.1 {
  CREATE VIRTUAL TABLE t1 USING fts5(a, tokenize=tcl);
  INSERT INTO t1 VALUES('abc');
  INSERT INTO t1 VALUES('xyz');
} {}
do_test 9.1.2 { set ::flags } {document document}

# 9.2-9.5: Each row is: test number, fts5 query string, expected query
# result, expected contents of ::flags after running the query. ::flags
# is cleared before each query. The loop variable is deliberately not
# called "flags", as at global scope that would overwrite ::flags.
foreach {tn query rows seen} {
  2 {abc}          {abc}  {query}
  3 {ab*}          {abc}  {prefixquery}
  4 {"abc xyz" *}  {}     {prefixquery}
  5 {"abc xyz*"}   {}     {query}
} {
  set ::flags {}
  do_execsql_test 9.$tn.1 " SELECT * FROM t1('$query'); " $rows
  do_test 9.$tn.2 { set ::flags } $seen
}


finish_test