summaryrefslogtreecommitdiffstats
path: root/ext/fts5/test/fts5unicode.test
diff options
context:
space:
mode:
Diffstat (limited to 'ext/fts5/test/fts5unicode.test')
-rw-r--r--ext/fts5/test/fts5unicode.test85
1 files changed, 85 insertions, 0 deletions
diff --git a/ext/fts5/test/fts5unicode.test b/ext/fts5/test/fts5unicode.test
new file mode 100644
index 0000000..e2d0f60
--- /dev/null
+++ b/ext/fts5/test/fts5unicode.test
@@ -0,0 +1,85 @@
+# 2014 Dec 20
+#
+# The author disclaims copyright to this source code. In place of
+# a legal notice, here is a blessing:
+#
+# May you do good and not evil.
+# May you find forgiveness for yourself and forgive others.
+# May you share freely, never taking more than you give.
+#
+#***********************************************************************
+#
+# Tests focusing on the fts5 tokenizers
+#
+
+source [file join [file dirname [info script]] fts5_common.tcl]
+set testprefix fts5unicode
+
+# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
+ifcapable !fts5 {
+ finish_test
+ return
+}
+
+proc tokenize_test {tn tokenizer input output} {
+ uplevel [list do_test $tn [subst -nocommands {
+ set ret {}
+ foreach {z s e} [sqlite3_fts5_tokenize db {$tokenizer} {$input}] {
+ lappend ret [set z]
+ }
+ set ret
+ }] [list {*}$output]]
+}
+
+foreach {tn t} {1 ascii 2 unicode61} {
+ tokenize_test 1.$tn.0 $t {A B C D} {a b c d}
+ tokenize_test 1.$tn.1 $t {May you share freely,} {may you share freely}
+ tokenize_test 1.$tn.2 $t {..May...you.shAre.freely} {may you share freely}
+ tokenize_test 1.$tn.3 $t {} {}
+}
+
+#-------------------------------------------------------------------------
+# Check that "unicode61" really is the default tokenizer.
+#
+do_execsql_test 2.0 "
+ CREATE VIRTUAL TABLE t1 USING fts5(x);
+ CREATE VIRTUAL TABLE t2 USING fts5(x, tokenize = unicode61);
+ CREATE VIRTUAL TABLE t3 USING fts5(x, tokenize = ascii);
+ INSERT INTO t1 VALUES('\xC0\xC8\xCC');
+ INSERT INTO t2 VALUES('\xC0\xC8\xCC');
+ INSERT INTO t3 VALUES('\xC0\xC8\xCC');
+"
+do_execsql_test 2.1 "
+ SELECT 't1' FROM t1 WHERE t1 MATCH '\xE0\xE8\xEC';
+ SELECT 't2' FROM t2 WHERE t2 MATCH '\xE0\xE8\xEC';
+ SELECT 't3' FROM t3 WHERE t3 MATCH '\xE0\xE8\xEC';
+" {t1 t2}
+
+#-------------------------------------------------------------------------
+# Check that codepoints that require 4 bytes to store in utf-8 (those that
+# require 17 or more bits to store) are tokenized correctly.
+#
+
+set A [db one {SELECT char(0x1F75E)}] ;# Type So
+set B [db one {SELECT char(0x1F5FD)}] ;# Type So
+set C [db one {SELECT char(0x2F802)}] ;# Type Lo
+set D [db one {SELECT char(0x2F808)}] ;# Type Lo
+
+do_execsql_test 3.0 "
+ CREATE VIRTUAL TABLE xyz USING fts5(x,
+ tokenize = \"unicode61 separators '$C' tokenchars '$A'\"
+ );
+ CREATE VIRTUAL TABLE xyz_v USING fts5vocab(xyz, row);
+
+ INSERT INTO xyz VALUES('$A$B$C$D');
+"
+
+do_execsql_test 3.1 {
+ SELECT * FROM xyz_v;
+} [list $A 1 1 $D 1 1]
+
+
+
+
+
+finish_test