summaryrefslogtreecommitdiffstats
path: root/ext/fts5/test/fts5tokenizer.test
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-05 17:28:19 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-05 17:28:19 +0000
commit18657a960e125336f704ea058e25c27bd3900dcb (patch)
tree17b438b680ed45a996d7b59951e6aa34023783f2 /ext/fts5/test/fts5tokenizer.test
parentInitial commit. (diff)
downloadsqlite3-upstream.tar.xz
sqlite3-upstream.zip
Adding upstream version 3.40.1.upstream/3.40.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r--ext/fts5/test/fts5tokenizer.test304
1 files changed, 304 insertions, 0 deletions
diff --git a/ext/fts5/test/fts5tokenizer.test b/ext/fts5/test/fts5tokenizer.test
new file mode 100644
index 0000000..2737065
--- /dev/null
+++ b/ext/fts5/test/fts5tokenizer.test
@@ -0,0 +1,304 @@
+# 2014 Dec 20
+#
+# The author disclaims copyright to this source code. In place of
+# a legal notice, here is a blessing:
+#
+# May you do good and not evil.
+# May you find forgiveness for yourself and forgive others.
+# May you share freely, never taking more than you give.
+#
+#***********************************************************************
+#
+# Tests focusing on the built-in fts5 tokenizers.
+#
+
+source [file join [file dirname [info script]] fts5_common.tcl]
+set testprefix fts5tokenizer
+
+# If SQLITE_ENABLE_FTS5 is defined, omit this file.
+ifcapable !fts5 {
+ finish_test
+ return
+}
+
+
+do_execsql_test 1.0 {
+ CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
+ DROP TABLE ft1;
+}
+do_execsql_test 1.1 {
+ CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize='porter');
+ DROP TABLE ft1;
+}
+do_execsql_test 1.2 {
+ CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = porter);
+ DROP TABLE ft1;
+}
+do_execsql_test 1.3 {
+ CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter');
+ DROP TABLE ft1;
+}
+do_execsql_test 1.4 {
+ CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter ascii');
+ DROP TABLE ft1;
+}
+
+do_catchsql_test 1.5 {
+ CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'nosuch');
+} {1 {no such tokenizer: nosuch}}
+
+do_catchsql_test 1.6 {
+ CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter nosuch');
+} {1 {error in tokenizer constructor}}
+
+do_execsql_test 2.0 {
+ CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
+ INSERT INTO ft1 VALUES('embedded databases');
+}
+do_execsql_test 2.1 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'embedding' } 1
+do_execsql_test 2.2 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'database' } 1
+do_execsql_test 2.3 {
+ SELECT rowid FROM ft1 WHERE ft1 MATCH 'database embedding'
+} 1
+
+proc tcl_create {args} {
+ set ::targs $args
+ error "failed"
+}
+sqlite3_fts5_create_tokenizer db tcl tcl_create
+
+foreach {tn directive expected} {
+ 1 {tokenize='tcl a b c'} {a b c}
+ 2 {tokenize='tcl ''d'' ''e'' ''f'''} {d e f}
+ 3 {tokenize="tcl 'g' 'h' 'i'"} {g h i}
+ 4 {tokenize = tcl} {}
+} {
+ do_catchsql_test 3.$tn.1 "
+ CREATE VIRTUAL TABLE ft2 USING fts5(x, $directive)
+ " {1 {error in tokenizer constructor}}
+ do_test 3.$tn.2 { set ::targs } $expected
+}
+
+do_catchsql_test 4.1 {
+ CREATE VIRTUAL TABLE ft2 USING fts5(x, tokenize = tcl abc);
+} {1 {parse error in "tokenize = tcl abc"}}
+do_catchsql_test 4.2 {
+ CREATE VIRTUAL TABLE ft2 USING fts5(x y)
+} {1 {unrecognized column option: y}}
+
+#-------------------------------------------------------------------------
+# Test the "separators" and "tokenchars" options a bit.
+#
+foreach {tn tokenizer} {1 ascii 2 unicode61} {
+ reset_db
+ set T "$tokenizer tokenchars ',.:' separators 'xyz'"
+ execsql "CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = \"$T\")"
+ do_execsql_test 5.$tn.1 {
+ INSERT INTO t1 VALUES('abcxdefyghizjkl.mno,pqr:stu/vwx+yz');
+ }
+ foreach {tn2 token res} {
+ 1 abc 1 2 def 1 3 ghi 1 4 jkl {}
+ 5 mno {} 6 pqr {} 7 stu {} 8 jkl.mno,pqr:stu 1
+ 9 vw 1
+ } {
+ do_execsql_test 5.$tn.2.$tn2 "
+ SELECT rowid FROM t1 WHERE t1 MATCH '\"$token\"'
+ " $res
+ }
+}
+
+#-------------------------------------------------------------------------
+# Miscellaneous tests for the ascii tokenizer.
+#
+# 5.1.*: Test that the ascii tokenizer ignores non-ASCII characters in the
+# 'separators' option. But unicode61 does not.
+#
+# 5.2.*: An option without an argument is an error.
+#
+
+do_test 5.1.1 {
+ execsql "
+ CREATE VIRTUAL TABLE a1 USING fts5(x, tokenize=`ascii separators '\u1234'`);
+ INSERT INTO a1 VALUES('abc\u1234def');
+ "
+ execsql { SELECT rowid FROM a1 WHERE a1 MATCH 'def' }
+} {}
+
+do_test 5.1.2 {
+ execsql "
+ CREATE VIRTUAL TABLE a2 USING fts5(
+ x, tokenize=`unicode61 separators '\u1234'`);
+ INSERT INTO a2 VALUES('abc\u1234def');
+ "
+ execsql { SELECT rowid FROM a2 WHERE a2 MATCH 'def' }
+} {1}
+
+do_catchsql_test 5.2 {
+ CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii tokenchars');
+} {1 {error in tokenizer constructor}}
+do_catchsql_test 5.3 {
+ CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii opt arg');
+} {1 {error in tokenizer constructor}}
+
+#-------------------------------------------------------------------------
+# Test that the ASCII and unicode61 tokenizers both handle SQLITE_DONE
+# correctly.
+#
+
+proc test_token_cb {varname token iStart iEnd} {
+ upvar $varname var
+ lappend var $token
+ if {[llength $var]==3} { return "SQLITE_DONE" }
+ return "SQLITE_OK"
+}
+
+proc tokenize {cmd} {
+ set res [list]
+ $cmd xTokenize [$cmd xColumnText 0] [list test_token_cb res]
+ set res
+}
+sqlite3_fts5_create_function db tokenize tokenize
+
+do_execsql_test 6.0 {
+ CREATE VIRTUAL TABLE x1 USING fts5(a, tokenize=ascii);
+ INSERT INTO x1 VALUES('q w e r t y');
+ INSERT INTO x1 VALUES('y t r e w q');
+ SELECT tokenize(x1) FROM x1 WHERE x1 MATCH 'e AND r';
+} {
+ {q w e} {y t r}
+}
+
+do_execsql_test 6.1 {
+ CREATE VIRTUAL TABLE x2 USING fts5(a, tokenize=unicode61);
+ INSERT INTO x2 VALUES('q w e r t y');
+ INSERT INTO x2 VALUES('y t r e w q');
+ SELECT tokenize(x2) FROM x2 WHERE x2 MATCH 'e AND r';
+} {
+ {q w e} {y t r}
+}
+
+
+#-------------------------------------------------------------------------
+# Miscellaneous tests for the unicode tokenizer.
+#
+do_catchsql_test 6.1 {
+ CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 tokenchars');
+} {1 {error in tokenizer constructor}}
+do_catchsql_test 6.2 {
+ CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 a b');
+} {1 {error in tokenizer constructor}}
+do_catchsql_test 6.3 {
+ CREATE VIRTUAL TABLE a3 USING fts5(
+ x, y, tokenize = 'unicode61 remove_diacritics 3'
+ );
+} {1 {error in tokenizer constructor}}
+do_catchsql_test 6.4 {
+ CREATE VIRTUAL TABLE a3 USING fts5(
+ x, y, tokenize = 'unicode61 remove_diacritics 10'
+ );
+} {1 {error in tokenizer constructor}}
+
+#-------------------------------------------------------------------------
+# Porter tokenizer with very large tokens.
+#
+set a [string repeat a 100]
+set b [string repeat b 500]
+set c [string repeat c 1000]
+do_execsql_test 7.0 {
+ CREATE VIRTUAL TABLE e5 USING fts5(x, tokenize=porter);
+ INSERT INTO e5 VALUES($a || ' ' || $b);
+ INSERT INTO e5 VALUES($b || ' ' || $c);
+ INSERT INTO e5 VALUES($c || ' ' || $a);
+}
+
+do_execsql_test 7.1 {SELECT rowid FROM e5 WHERE e5 MATCH $a} { 1 3 }
+do_execsql_test 7.2 {SELECT rowid FROM e5 WHERE e5 MATCH $b} { 1 2 }
+do_execsql_test 7.3 {SELECT rowid FROM e5 WHERE e5 MATCH $c} { 2 3 }
+
+#-------------------------------------------------------------------------
+# Test the 'separators' option with the unicode61 tokenizer.
+#
+do_execsql_test 8.1 {
+ BEGIN;
+ CREATE VIRTUAL TABLE e6 USING fts5(x,
+ tokenize="unicode61 separators ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ );
+ INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog');
+ CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
+ SELECT term FROM e7;
+ ROLLBACK;
+} {
+ brown dog fox jumped lazy over quick the
+}
+
+do_execsql_test 8.2 [subst {
+ BEGIN;
+ CREATE VIRTUAL TABLE e6 USING fts5(x,
+ tokenize="unicode61 separators '\u0E01\u0E02\u0E03\u0E04\u0E05\u0E06\u0E07'"
+ );
+ INSERT INTO e6 VALUES('the\u0E01quick\u0E01brown\u0E01fox\u0E01'
+ || 'jumped\u0E01over\u0E01the\u0E01lazy\u0E01dog'
+ );
+ INSERT INTO e6 VALUES('\u0E08\u0E07\u0E09');
+ CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
+ SELECT term FROM e7;
+ ROLLBACK;
+}] [subst {
+ brown dog fox jumped lazy over quick the \u0E08 \u0E09
+}]
+
+# Test that the porter tokenizer correctly passes arguments through to
+# its parent tokenizer.
+do_execsql_test 8.3 {
+ BEGIN;
+ CREATE VIRTUAL TABLE e6 USING fts5(x,
+ tokenize="porter unicode61 separators ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ );
+ INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog');
+ CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
+ SELECT term FROM e7;
+ ROLLBACK;
+} {
+ brown dog fox jump lazi over quick the
+}
+
+#-------------------------------------------------------------------------
+# Check that the FTS5_TOKENIZE_PREFIX flag is passed to the tokenizer
+# implementation.
+#
+reset_db
+proc tcl_create {args} { return "tcl_tokenize" }
+sqlite3_fts5_create_tokenizer db tcl tcl_create
+set ::flags [list]
+proc tcl_tokenize {tflags text} {
+ lappend ::flags $tflags
+ foreach {w iStart iEnd} [fts5_tokenize_split $text] {
+ sqlite3_fts5_token $w $iStart $iEnd
+ }
+}
+
+do_execsql_test 9.1.1 {
+ CREATE VIRTUAL TABLE t1 USING fts5(a, tokenize=tcl);
+ INSERT INTO t1 VALUES('abc');
+ INSERT INTO t1 VALUES('xyz');
+} {}
+do_test 9.1.2 { set ::flags } {document document}
+
+set ::flags [list]
+do_execsql_test 9.2.1 { SELECT * FROM t1('abc'); } {abc}
+do_test 9.2.2 { set ::flags } {query}
+
+set ::flags [list]
+do_execsql_test 9.3.1 { SELECT * FROM t1('ab*'); } {abc}
+do_test 9.3.2 { set ::flags } {prefixquery}
+
+set ::flags [list]
+do_execsql_test 9.4.1 { SELECT * FROM t1('"abc xyz" *'); } {}
+do_test 9.4.2 { set ::flags } {prefixquery}
+
+set ::flags [list]
+do_execsql_test 9.5.1 { SELECT * FROM t1('"abc xyz*"'); } {}
+do_test 9.5.2 { set ::flags } {query}
+
+
+finish_test