summaryrefslogtreecommitdiffstats
path: root/www/fts5.html
diff options
context:
space:
mode:
Diffstat (limited to 'www/fts5.html')
-rw-r--r--www/fts5.html3364
1 files changed, 3364 insertions, 0 deletions
diff --git a/www/fts5.html b/www/fts5.html
new file mode 100644
index 0000000..36f3379
--- /dev/null
+++ b/www/fts5.html
@@ -0,0 +1,3364 @@
+<!DOCTYPE html>
+<html><head>
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<meta http-equiv="content-type" content="text/html; charset=UTF-8">
+<link href="sqlite.css" rel="stylesheet">
+<title>SQLite FTS5 Extension</title>
+<!-- path= -->
+</head>
+<body>
+<div class=nosearch>
+<a href="index.html">
+<img class="logo" src="images/sqlite370_banner.gif" alt="SQLite" border="0">
+</a>
+<div><!-- IE hack to prevent disappearing logo --></div>
+<div class="tagline desktoponly">
+Small. Fast. Reliable.<br>Choose any three.
+</div>
+<div class="menu mainmenu">
+<ul>
+<li><a href="index.html">Home</a>
+<li class='mobileonly'><a href="javascript:void(0)" onclick='toggle_div("submenu")'>Menu</a>
+<li class='wideonly'><a href='about.html'>About</a>
+<li class='desktoponly'><a href="docs.html">Documentation</a>
+<li class='desktoponly'><a href="download.html">Download</a>
+<li class='wideonly'><a href='copyright.html'>License</a>
+<li class='desktoponly'><a href="support.html">Support</a>
+<li class='desktoponly'><a href="prosupport.html">Purchase</a>
+<li class='search' id='search_menubutton'>
+<a href="javascript:void(0)" onclick='toggle_search()'>Search</a>
+</ul>
+</div>
+<div class="menu submenu" id="submenu">
+<ul>
+<li><a href='about.html'>About</a>
+<li><a href='docs.html'>Documentation</a>
+<li><a href='download.html'>Download</a>
+<li><a href='support.html'>Support</a>
+<li><a href='prosupport.html'>Purchase</a>
+</ul>
+</div>
+<div class="searchmenu" id="searchmenu">
+<form method="GET" action="search">
+<select name="s" id="searchtype">
+<option value="d">Search Documentation</option>
+<option value="c">Search Changelog</option>
+</select>
+<input type="text" name="q" id="searchbox" value="">
+<input type="submit" value="Go">
+</form>
+</div>
+</div>
+<script>
+function toggle_div(nm) {
+var w = document.getElementById(nm);
+if( w.style.display=="block" ){
+w.style.display = "none";
+}else{
+w.style.display = "block";
+}
+}
+function toggle_search() {
+var w = document.getElementById("searchmenu");
+if( w.style.display=="block" ){
+w.style.display = "none";
+} else {
+w.style.display = "block";
+setTimeout(function(){
+document.getElementById("searchbox").focus()
+}, 30);
+}
+}
+function div_off(nm){document.getElementById(nm).style.display="none";}
+window.onbeforeunload = function(e){div_off("submenu");}
+/* Disable the Search feature if we are not operating from CGI, since */
+/* Search is accomplished using CGI and will not work without it. */
+if( !location.origin || !location.origin.match || !location.origin.match(/http/) ){
+document.getElementById("search_menubutton").style.display = "none";
+}
+/* Used by the Hide/Show button beside syntax diagrams, to toggle the */
+function hideorshow(btn,obj){
+var x = document.getElementById(obj);
+var b = document.getElementById(btn);
+if( x.style.display!='none' ){
+x.style.display = 'none';
+b.innerHTML='show';
+}else{
+x.style.display = '';
+b.innerHTML='hide';
+}
+return false;
+}
+var antiRobot = 0;
+function antiRobotGo(){
+if( antiRobot!=3 ) return;
+antiRobot = 7;
+var j = document.getElementById("mtimelink");
+if(j && j.hasAttribute("data-href")) j.href=j.getAttribute("data-href");
+}
+function antiRobotDefense(){
+document.body.onmousedown=function(){
+antiRobot |= 2;
+antiRobotGo();
+document.body.onmousedown=null;
+}
+document.body.onmousemove=function(){
+antiRobot |= 2;
+antiRobotGo();
+document.body.onmousemove=null;
+}
+setTimeout(function(){
+antiRobot |= 1;
+antiRobotGo();
+}, 100)
+antiRobotGo();
+}
+antiRobotDefense();
+</script>
+<div class=fancy>
+<div class=nosearch>
+<div class="fancy_title">
+SQLite FTS5 Extension
+</div>
+<div class="fancy_toc">
+<a onclick="toggle_toc()">
+<span class="fancy_toc_mark" id="toc_mk">&#x25ba;</span>
+Table Of Contents
+</a>
+<div id="toc_sub"><div class="fancy-toc1"><a href="#overview_of_fts5">1. Overview of FTS5</a></div>
+<div class="fancy-toc1"><a href="#compiling_and_using_fts5">2. Compiling and Using FTS5</a></div>
+<div class="fancy-toc2"><a href="#building_fts5_as_part_of_sqlite">2.1. Building FTS5 as part of SQLite</a></div>
+<div class="fancy-toc2"><a href="#building_a_loadable_extension">2.2. Building a Loadable Extension</a></div>
+<div class="fancy-toc1"><a href="#full_text_query_syntax">3. Full-text Query Syntax</a></div>
+<div class="fancy-toc2"><a href="#fts5_strings">3.1. FTS5 Strings</a></div>
+<div class="fancy-toc2"><a href="#fts5_phrases">3.2. FTS5 Phrases</a></div>
+<div class="fancy-toc2"><a href="#fts5_prefix_queries">3.3. FTS5 Prefix Queries</a></div>
+<div class="fancy-toc2"><a href="#fts5_initial_token_queries">3.4. FTS5 Initial Token Queries</a></div>
+<div class="fancy-toc2"><a href="#fts5_near_queries">3.5. FTS5 NEAR Queries</a></div>
+<div class="fancy-toc2"><a href="#fts5_column_filters">3.6. FTS5 Column Filters</a></div>
+<div class="fancy-toc2"><a href="#fts5_boolean_operators">3.7. FTS5 Boolean Operators</a></div>
+<div class="fancy-toc1"><a href="#fts5_table_creation_and_initialization">4. FTS5 Table Creation and Initialization</a></div>
+<div class="fancy-toc2"><a href="#the_unindexed_column_option">4.1. The UNINDEXED column option</a></div>
+<div class="fancy-toc2"><a href="#prefix_indexes">4.2. Prefix Indexes</a></div>
+<div class="fancy-toc2"><a href="#tokenizers">4.3. Tokenizers</a></div>
+<div class="fancy-toc3"><a href="#unicode61_tokenizer">4.3.1. Unicode61 Tokenizer</a></div>
+<div class="fancy-toc3"><a href="#ascii_tokenizer">4.3.2. Ascii Tokenizer</a></div>
+<div class="fancy-toc3"><a href="#porter_tokenizer">4.3.3. Porter Tokenizer</a></div>
+<div class="fancy-toc3"><a href="#the_experimental_trigram_tokenizer">4.3.4. The Experimental Trigram Tokenizer</a></div>
+<div class="fancy-toc2"><a href="#external_content_and_contentless_tables">4.4. External Content and Contentless Tables</a></div>
+<div class="fancy-toc3"><a href="#contentless_tables">4.4.1. Contentless Tables</a></div>
+<div class="fancy-toc3"><a href="#external_content_tables">4.4.2. External Content Tables</a></div>
+<div class="fancy-toc2"><a href="#the_columnsize_option">4.5. The Columnsize Option</a></div>
+<div class="fancy-toc2"><a href="#the_detail_option">4.6. The Detail Option</a></div>
+<div class="fancy-toc1"><a href="#_auxiliary_functions_">5. Auxiliary Functions </a></div>
+<div class="fancy-toc2"><a href="#built_in_auxiliary_functions">5.1. Built-in Auxiliary Functions</a></div>
+<div class="fancy-toc3"><a href="#the_bm25_function">5.1.1. The bm25() function</a></div>
+<div class="fancy-toc3"><a href="#the_highlight_function">5.1.2. The highlight() function</a></div>
+<div class="fancy-toc3"><a href="#the_snippet_function">5.1.3. The snippet() function</a></div>
+<div class="fancy-toc2"><a href="#sorting_by_auxiliary_function_results">5.2. Sorting by Auxiliary Function Results</a></div>
+<div class="fancy-toc1"><a href="#special_insert_commands">6. Special INSERT Commands</a></div>
+<div class="fancy-toc2"><a href="#the_automerge_configuration_option">6.1. The 'automerge' Configuration Option</a></div>
+<div class="fancy-toc2"><a href="#the_crisismerge_configuration_option">6.2. The 'crisismerge' Configuration Option</a></div>
+<div class="fancy-toc2"><a href="#the_delete_command">6.3. The 'delete' Command</a></div>
+<div class="fancy-toc2"><a href="#the_delete_all_command">6.4. The 'delete-all' Command</a></div>
+<div class="fancy-toc2"><a href="#the_integrity_check_command">6.5. The 'integrity-check' Command</a></div>
+<div class="fancy-toc2"><a href="#the_merge_command">6.6. The 'merge' Command</a></div>
+<div class="fancy-toc2"><a href="#the_optimize_command">6.7. The 'optimize' Command</a></div>
+<div class="fancy-toc2"><a href="#the_pgsz_configuration_option">6.8. The 'pgsz' Configuration Option</a></div>
+<div class="fancy-toc2"><a href="#the_rank_configuration_option">6.9. The 'rank' Configuration Option</a></div>
+<div class="fancy-toc2"><a href="#the_rebuild_command">6.10. The 'rebuild' Command</a></div>
+<div class="fancy-toc2"><a href="#the_usermerge_configuration_option">6.11. The 'usermerge' Configuration Option</a></div>
+<div class="fancy-toc1"><a href="#extending_fts5">7. Extending FTS5</a></div>
+<div class="fancy-toc2"><a href="#custom_tokenizers">7.1. Custom Tokenizers</a></div>
+<div class="fancy-toc3"><a href="#synonym_support">7.1.1. Synonym Support</a></div>
+<div class="fancy-toc2"><a href="#custom_auxiliary_functions">7.2. Custom Auxiliary Functions</a></div>
+<div class="fancy-toc3"><a href="#_custom_auxiliary_functions_api_reference_">7.2.1.
+Custom Auxiliary Functions API Reference
+</a></div>
+<div class="fancy-toc1"><a href="#the_fts5vocab_virtual_table_module">8. The fts5vocab Virtual Table Module</a></div>
+<div class="fancy-toc1"><a href="#fts5_data_structures">9. FTS5 Data Structures</a></div>
+<div class="fancy-toc2"><a href="#varint_format">9.1. Varint Format</a></div>
+<div class="fancy-toc2"><a href="#the_fts_index_idx_and_data_tables_">9.2. The FTS Index (%_idx and %_data tables)</a></div>
+<div class="fancy-toc3"><a href="#data_structure">9.2.1. The %_data Table Rowid Space</a></div>
+<div class="fancy-toc3"><a href="#structure_record_format">9.2.2. Structure Record Format</a></div>
+<div class="fancy-toc3"><a href="#averages_record_format">9.2.3. Averages Record Format</a></div>
+<div class="fancy-toc3"><a href="#segment_b_tree_format">9.2.4. Segment B-Tree Format</a></div>
+<div class="fancy-toc4"><a href="#the_key_doclist_format">9.2.4.1. The Key/Doclist Format</a></div>
+<div class="fancy-toc4"><a href="#data_pagination">9.2.4.2. Pagination</a></div>
+<div class="fancy-toc4"><a href="#data_term_index">9.2.4.3. Segment Index Format</a></div>
+<div class="fancy-toc4"><a href="#doclist_index_format">9.2.4.4. Doclist Index Format</a></div>
+<div class="fancy-toc2"><a href="#document_sizes_table_docsize_table_">9.3. Document Sizes Table (%_docsize table)</a></div>
+<div class="fancy-toc2"><a href="#the_table_contents_content_table_">9.4. The Table Contents (%_content table)</a></div>
+<div class="fancy-toc2"><a href="#configuration_options_config_table_">9.5. Configuration Options (%_config table)</a></div>
+<div class="fancy-toc1"><a href="#appendix_a">
+Appendix A: Comparison with FTS3/4
+</a></div>
+<div class="fancy-toc2"><a href="#_application_porting_guide_"> Application Porting Guide </a></div>
+<div class="fancy-toc3"><a href="#_changes_to_create_virtual_table_statements_"> Changes to CREATE VIRTUAL TABLE statements </a></div>
+<div class="fancy-toc3"><a href="#_changes_to_select_statements_"> Changes to SELECT statements </a></div>
+<div class="fancy-toc3"><a href="#_auxiliary_function_changes_"> Auxiliary Function Changes </a></div>
+<div class="fancy-toc3"><a href="#_other_issues"> Other Issues</a></div>
+<div class="fancy-toc2"><a href="#_summary_of_technical_differences_">
+Summary of Technical Differences
+</a></div>
+</div>
+</div>
+<script>
+function toggle_toc(){
+var sub = document.getElementById("toc_sub")
+var mk = document.getElementById("toc_mk")
+if( sub.style.display!="block" ){
+sub.style.display = "block";
+mk.innerHTML = "&#x25bc;";
+} else {
+sub.style.display = "none";
+mk.innerHTML = "&#x25ba;";
+}
+}
+</script>
+</div>
+
+
+
+
+
+
+<h1 id="overview_of_fts5"><span>1. </span>Overview of FTS5</h1>
+
+<p>FTS5 is an SQLite <a href="c3ref/module.html">virtual table module</a> that provides
+<a href="http://en.wikipedia.org/wiki/Full_text_search">full-text search</a>
+functionality to database applications. In their most elementary form,
+full-text search engines allow the user to efficiently search a large
+collection of documents for the subset that contain one or more instances of a
+search term. The search functionality provided to world wide web users by
+<a href="https://www.google.com/">Google</a> is, among other things, a full-text search
+engine, as it allows users to search for all documents on the web that contain,
+for example, the term "fts5".
+
+</p><p>To use FTS5, the user creates an FTS5 virtual table with one or more
+columns. For example:
+
+</p><div class="codeblock"><pre>CREATE VIRTUAL TABLE email USING fts5(sender, title, body);
+</pre></div>
+
+<p>It is an error to add types, constraints or <a href="lang_createtable.html#primkeyconst">PRIMARY KEY</a> declarations to
+a CREATE VIRTUAL TABLE statement used to create an FTS5 table. Once created,
+an FTS5 table may be populated using <a href="lang_insert.html">INSERT</a>, <a href="lang_update.html">UPDATE</a> or <a href="lang_delete.html">DELETE</a> statements
+like any other table. Like any other table with no PRIMARY KEY declaration, an
+FTS5 table has an implicit INTEGER PRIMARY KEY field named rowid.
+
+</p><p>Not shown in the example above is that there are also
+<a href="fts5.html#fts5_table_creation_and_initialization">various options</a> that may be provided to FTS5 as
+part of the CREATE VIRTUAL TABLE statement to configure various aspects of the
+new table. These may be used to modify the way in which the FTS5 table extracts
+terms from documents and queries, to create extra indexes on disk to speed up
+prefix queries, or to create an FTS5 table that acts as an index on content
+stored elsewhere.
+
+</p><p>Once populated, there are three ways to execute a full-text query against
+the contents of an FTS5 table:
+
+</p><ul><li> Using a MATCH operator in the WHERE clause of a SELECT statement, or
+ </li><li> Using an equals ("=") operator in the WHERE clause of a SELECT statement, or
+ </li><li> using the <a href="vtab.html#tabfunc2">table-valued function</a> syntax.
+</li></ul>
+
+<p>If using the MATCH or = operators, the expression to the left of the MATCH
+ operator is usually the name of the FTS5 table (the exception is when
+ <a href="fts5.html#fts5_column_filters">specifying a column-filter</a>). The expression on the right
+ must be a text value specifying the term to search for. For the table-valued
+ function syntax, the term to search for is specified as the first table argument.
+ For example:
+
+</p><div class="codeblock"><pre><i>-- Query for all rows that contain at least once instance of the term</i>
+<i>-- "fts5" (in any column). The following three queries are equivalent.</i>
+SELECT * FROM email WHERE email MATCH 'fts5';
+SELECT * FROM email WHERE email = 'fts5';
+SELECT * FROM email('fts5');
+</pre></div>
+
+<p> By default, FTS5 full-text searches are case-independent. Like any other
+SQL query that does not contain an ORDER BY clause, the example above returns
+results in an arbitrary order. To sort results by relevance (most to least
+relevant), an ORDER BY may be added to a full-text query as follows:
+
+</p><div class="codeblock"><pre><i>-- Query for all rows that contain at least once instance of the term</i>
+<i>-- "fts5" (in any column). Return results in order from best to worst</i>
+<i>-- match. </i>
+SELECT * FROM email WHERE email MATCH 'fts5' ORDER BY rank;
+</pre></div>
+
+<p> As well as the column values and rowid of a matching row, an application
+may use <a href="fts5.html#_auxiliary_functions_">FTS5 auxiliary functions</a> to retrieve extra information regarding
+the matched row. For example, an auxiliary function may be used to retrieve
+a copy of a column value for a matched row with all instances of the matched
+term surrounded by html &lt;b&gt;&lt;/b&gt; tags. Auxiliary functions are
+invoked in the same way as SQLite <a href="lang_corefunc.html">scalar functions</a>, except that the name
+of the FTS5 table is specified as the first argument. For example:
+
+</p><div class="codeblock"><pre><i>-- Query for rows that match "fts5". Return a copy of the "body" column</i>
+<i>-- of each row with the matches surrounded by &lt;b&gt;&lt;/b&gt; tags.</i>
+SELECT highlight(email, 2, '&lt;b&gt;', '&lt;/b&gt;') FROM email('fts5');
+</pre></div>
+
+<p>A description of the available auxiliary functions, and more details
+regarding configuration of the special "rank" column, are
+<a href="fts5.html#_auxiliary_functions_">available below</a>. <a href="fts5.html#custom_auxiliary_functions">Custom auxiliary functions</a> may also be implemented in C and registered with
+FTS5, just as custom SQL functions may be registered with the SQLite core.
+
+</p><p> As well as searching for all rows that contain a term, FTS5 allows
+the user to search for rows that contain:
+
+</p><ul>
+ <li> any terms that begin with a specified prefix,
+ </li><li> "phrases" - sequences of terms or prefix terms that must feature in a
+ document for it to match the query,
+ </li><li> sets of terms, prefix terms or phrases that appear within a specified
+ proximity of each other (these are called "NEAR queries"), or
+ </li><li> boolean combinations of any of the above.
+</li></ul>
+
+<p> Such advanced searches are requested by providing a more complicated
+FTS5 query string as the text to the right of the MATCH operator (or =
+operator, or as the first argument to a table-valued function syntax). The
+full query syntax is <a href="fts5.html#full_text_query_syntax">described here</a>.
+
+</p><a name="compiling_and_using_fts5"></a>
+<h1 tags="FTS5 building" id="compiling_and_using_fts5"><span>2. </span>Compiling and Using FTS5</h1>
+
+<h2 id="building_fts5_as_part_of_sqlite"><span>2.1. </span>Building FTS5 as part of SQLite</h2>
+
+<p>As of <a href="releaselog/3_9_0.html">version 3.9.0</a> (2015-10-14),
+FTS5 is included as part of the SQLite <a href="amalgamation.html">amalgamation</a>.
+If using one of the two autoconf build system, FTS5 is
+enabled by specifying the "--enable-fts5" option when running the configure
+script. (FTS5 is currently disabled by default for the
+source-tree configure script and enabled by default for
+the amalgamation configure script, but these defaults might
+change in the future.)
+
+</p><p>Or, if sqlite3.c is compiled using some other build system, by arranging for
+the SQLITE_ENABLE_FTS5 pre-processor symbol to be defined.
+
+</p><h2 id="building_a_loadable_extension"><span>2.2. </span>Building a Loadable Extension</h2>
+
+<p>Alternatively, FTS5 may be built as a loadable extension.
+
+</p><p>The canonical FTS5 source code consists of a series of *.c and other files
+in the "ext/fts5" directory of the SQLite source tree. A build process reduces
+this to just two files - "fts5.c" and "fts5.h" - which may be used to build an
+SQLite loadable extension.
+
+</p><ol>
+ <li> Obtain the latest SQLite code from fossil.
+ </li><li> Create a Makefile as described in <a href="howtocompile.html">How To Compile SQLite</a>.
+ </li><li> Build the "fts5.c" target. Which also creates fts5.h.
+</li></ol>
+
+<div class="codeblock"><pre>$ wget -c http://www.sqlite.org/src/tarball/SQLite-trunk.tgz?uuid=trunk -O SQLite-trunk.tgz
+.... output ...
+$ tar -xzf SQLite-trunk.tgz
+$ cd SQLite-trunk
+$ ./configure && make fts5.c
+... lots of output ...
+$ ls fts5.&#91;ch]
+fts5.c fts5.h
+</pre></div>
+
+<p>
+ The code in "fts5.c" may then be compiled into a loadable extension or
+ statically linked into an application as described in
+ <a href="loadext.html#build">Compiling Loadable Extensions</a>. There are two entry points defined, both
+ of which do the same thing:
+
+</p><ul>
+ <li> sqlite3_fts_init
+ </li><li> sqlite3_fts5_init
+</li></ul>
+
+<p>
+ The other file, "fts5.h", is not required to compile the FTS5 extension.
+ It is used by applications that implement <a href="fts5.html#extending_fts5">custom FTS5 tokenizers or auxiliary functions</a>.
+
+</p><a name="full_text_query_syntax"></a>
+<h1 tags="FTS5 query syntax" id="full_text_query_syntax"><span>3. </span>Full-text Query Syntax</h1>
+
+<p>
+The following block contains a summary of the FTS query syntax in BNF form.
+A detailed explanation follows.
+
+</p><div class="codeblock"><pre>&lt;phrase&gt; := string &#91;*]
+&lt;phrase&gt; := &lt;phrase&gt; + &lt;phrase&gt;
+&lt;neargroup&gt; := NEAR ( &lt;phrase&gt; &lt;phrase&gt; ... &#91;, N] )
+&lt;query&gt; := &#91; &#91;-] &lt;colspec&gt; :] &#91;&#94;] &lt;phrase&gt;
+&lt;query&gt; := &#91; &#91;-] &lt;colspec&gt; :] &lt;neargroup&gt;
+&lt;query&gt; := &#91; &#91;-] &lt;colspec&gt; :] ( &lt;query&gt; )
+&lt;query&gt; := &lt;query&gt; AND &lt;query&gt;
+&lt;query&gt; := &lt;query&gt; OR &lt;query&gt;
+&lt;query&gt; := &lt;query&gt; NOT &lt;query&gt;
+&lt;colspec&gt; := colname
+&lt;colspec&gt; := { colname1 colname2 ... }
+</pre></div>
+
+<a name="fts5_strings"></a>
+<h2 tags="FTS5 Strings" id="fts5_strings"><span>3.1. </span>FTS5 Strings</h2>
+<p>
+Within an FTS expression a <b>string</b> may be specified in one of two ways:
+
+</p><ul>
+ <li> <p>By enclosing it in double quotes ("). Within a string, any embedded
+ double quote characters may be escaped SQL-style - by adding a second
+ double-quote character.
+
+ </p></li><li> <p>As an FTS5 bareword that is not "AND", "OR" or "NOT" (case sensitive).
+ An FTS5 bareword is a string of one or more consecutive characters that
+ are all either:
+
+ </p><ul>
+ <li> Non-ASCII range characters (i.e. unicode codepoints greater
+ than 127), or
+ </li><li> One of the 52 upper and lower case ASCII characters, or
+ </li><li> One of the 10 decimal digit ASCII characters, or
+ </li><li> The underscore character (unicode codepoint 96).
+ </li><li> The substitute character (unicode codepoint 26).
+ </li></ul>
+
+ Strings that include any other characters must be quoted. Characters
+ that are not currently allowed in barewords, are not quote characters and
+ do not currently serve any special purpose in FTS5 query expressions may
+ at some point in the future be allowed in barewords or used to implement
+ new query functionality. This means that queries that are currently
+ syntax errors because they include such a character outside of a quoted
+ string may be interpreted differently by some future version of FTS5.
+</li></ul>
+
+<a name="fts5_phrases"></a>
+<h2 tags="FTS5 Phrases" id="fts5_phrases"><span>3.2. </span>FTS5 Phrases</h2>
+<p>
+FTS queries are made up of <b>phrases</b>. A phrase is an ordered list of
+one or more tokens. A string is transformed into a phrase by passing it to
+the FTS table tokenizer. Two phrases can be concatenated into a single
+large phrase using the "+" operator. For example, assuming the tokenizer
+module being used tokenizes the input "one.two.three" to three separate
+tokens, the following four queries all specify the same phrase:
+
+</p><div class="codeblock"><pre>... MATCH '"one two three"'
+... MATCH 'one + two + three'
+... MATCH '"one two" + three'
+... MATCH 'one.two.three'
+</pre></div>
+
+<p>
+A phrase matches a document if the document contains at least one sub-sequence
+of tokens that matches the sequence of tokens that make up the phrase.
+
+</p><a name="fts5_prefix_queries"></a>
+<h2 tags="FTS5 prefix queries" id="fts5_prefix_queries"><span>3.3. </span>FTS5 Prefix Queries</h2>
+<p>
+If a "*" character follows a string within an FTS expression, then the final
+token extracted from the string is marked as a <b>prefix token</b>. As you
+might expect, a prefix token matches any document token of which it is a
+prefix. For example, the first two queries in the following block will match
+any document that contains the token "one" immediately followed by the token
+"two" and then any token that begins with "thr".
+
+</p><div class="codeblock"><pre>... MATCH '"one two thr" * '
+... MATCH 'one + two + thr*'
+... MATCH '"one two thr*"' <b>-- May not work as expected!</b>
+</pre></div>
+
+<p>The final query in the block above may not work as expected. Because the
+"*" character is inside the double-quotes, it will be passed to the tokenizer,
+which will likely discard it (or perhaps, depending on the specific tokenizer
+in use, include it as part of the final token) instead of recognizing it as
+a special FTS character.
+
+<a name="carrotq"></a>
+
+</p><a name="fts5_initial_token_queries"></a>
+<h2 tags="FTS5 initial token queries" id="fts5_initial_token_queries"><span>3.4. </span>FTS5 Initial Token Queries</h2>
+<p>
+If a "&#94;" character appears immediately before a phrase that is not part of a
+NEAR query, then that phrase only matches a document only if it starts at the
+first token in a column. The "&#94;" syntax may be combined with a
+<a href="fts5.html#fts5_column_filters">column filter</a>, but may not be inserted into the middle of
+a phrase.
+
+</p><div class="codeblock"><pre>... MATCH '&#94;one' <i>-- first token in any column must be "one"</i>
+... MATCH '&#94; one + two' <i>-- phrase "one two" must appear at start of a column</i>
+... MATCH '&#94; "one two"' <i>-- same as previous </i>
+... MATCH 'a : &#94;two' <i>-- first token of column "a" must be "two"</i>
+... MATCH 'NEAR(&#94;one, two)' <b>-- syntax error! </b>
+... MATCH 'one + &#94;two' <b>-- syntax error! </b>
+... MATCH '"&#94;one two"' <b>-- May not work as expected!</b>
+</pre></div>
+
+<a name="fts5_near_queries"></a>
+<h2 tags="FTS5 NEAR queries" id="fts5_near_queries"><span>3.5. </span>FTS5 NEAR Queries</h2>
+
+<p>Two or more phrases may be grouped into a <b>NEAR group</b>. A NEAR group
+is specified by the token "NEAR" (case sensitive) followed by an open
+parenthesis character, followed by two or more whitespace separated phrases, optionally followed by a comma and the numeric parameter <i>N</i>, followed by
+a close parenthesis. For example:
+
+</p><div class="codeblock"><pre>... MATCH 'NEAR("one two" "three four", 10)'
+... MATCH 'NEAR("one two" thr* + four)'
+</pre></div>
+
+<p>If no <i>N</i> parameter is supplied, it defaults to 10. A NEAR group
+matches a document if the document contains at least one clump of tokens that:
+
+</p><ol>
+ <li> contains at least one instance of each phrase, and
+ </li><li> for which the number of tokens between the end of the first phrase
+ and the beginning of the last phrase in the clump is less than or equal to <i>N</i>.
+</li></ol>
+
+<p>For example:
+
+</p><div class="codeblock"><pre>CREATE VIRTUAL TABLE f USING fts5(x);
+INSERT INTO f(rowid, x) VALUES(1, 'A B C D x x x E F x');
+
+... MATCH 'NEAR(e d, 4)'; <i>-- Matches!</i>
+... MATCH 'NEAR(e d, 3)'; <i>-- Matches!</i>
+... MATCH 'NEAR(e d, 2)'; <i>-- Does not match!</i>
+
+... MATCH 'NEAR("c d" "e f", 3)'; <i>-- Matches!</i>
+... MATCH 'NEAR("c" "e f", 3)'; <i>-- Does not match!</i>
+
+... MATCH 'NEAR(a d e, 6)'; <i>-- Matches!</i>
+... MATCH 'NEAR(a d e, 5)'; <i>-- Does not match!</i>
+
+... MATCH 'NEAR("a b c d" "b c" "e f", 4)'; <i>-- Matches!</i>
+... MATCH 'NEAR("a b c d" "b c" "e f", 3)'; <i>-- Does not match!</i>
+</pre></div>
+
+<a name="fts5_column_filters"></a>
+<h2 tags="FTS5 column filters" id="fts5_column_filters"><span>3.6. </span>FTS5 Column Filters</h2>
+
+<p>
+A single phrase or NEAR group may be restricted to matching text within a
+specified column of the FTS table by prefixing it with the column name
+followed by a colon character. Or to a set of columns by prefixing it
+with a whitespace separated list of column names enclosed in parenthesis
+("curly brackets") followed by a colon character. Column names may be specified
+using either of the two forms described for strings above. Unlike strings that
+are part of phrases, column names are not passed to the tokenizer module.
+Column names are case-insensitive in the usual way for SQLite column names -
+upper/lower case equivalence is understood for ASCII-range characters only.
+
+</p><div class="codeblock"><pre>... MATCH 'colname : NEAR("one two" "three four", 10)'
+... MATCH '"colname" : one + two + three'
+
+... MATCH '{col1 col2} : NEAR("one two" "three four", 10)'
+... MATCH '{col2 col1 col3} : one + two + three'
+</pre></div>
+
+<p>
+If a column filter specification is preceded by a "-" character, then
+it is interpreted as a list of column not to match against. For example:
+
+</p><div class="codeblock"><pre><i>-- Search for matches in all columns except "colname"</i>
+... MATCH '- colname : NEAR("one two" "three four", 10)'
+
+<i>-- Search for matches in all columns except "col1", "col2" and "col3"</i>
+... MATCH '- {col2 col1 col3} : one + two + three'
+</pre></div>
+
+<p>
+Column filter specifications may also be applied to arbitrary expressions
+enclosed in parenthesis. In this case the column filter applies to all
+phrases within the expression. Nested column filter operations may only
+further restrict the subset of columns matched, they can not be used to
+re-enable filtered columns. For example:
+
+</p><div class="codeblock"><pre><i>-- The following are equivalent:</i>
+... MATCH '{a b} : ( {b c} : "hello" AND "world" )'
+... MATCH '(b : "hello") AND ({a b} : "world")'
+</pre></div>
+
+<p>
+Finally, a column filter for a single column may be specified by using
+the column name as the LHS of a MATCH operator (instead of the usual
+table name). For example:
+
+</p><div class="codeblock"><pre><i>-- Given the following table</i>
+CREATE VIRTUAL TABLE ft USING fts5(a, b, c);
+
+<i>-- The following are equivalent</i>
+SELECT * FROM ft WHERE b MATCH 'uvw AND xyz';
+SELECT * FROM ft WHERE ft MATCH 'b : (uvw AND xyz)';
+
+<i>-- This query cannot match any rows (since all columns are filtered out): </i>
+SELECT * FROM ft WHERE b MATCH 'a : xyz';
+</pre></div>
+
+<a name="fts5_boolean_operators"></a>
+<h2 tags="FTS5 boolean operators" id="fts5_boolean_operators"><span>3.7. </span>FTS5 Boolean Operators</h2>
+
+<p>
+Phrases and NEAR groups may be arranged into expressions using <b>boolean
+operators</b>. In order of precedence, from highest (tightest grouping) to
+lowest (loosest grouping), the operators are:
+
+</p><table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
+ <tr style="text-align:left"><th>Operator </th><th>Function
+
+ </th></tr><tr style="text-align:left;background-color:#DDDDDD"><td><code>&lt;query1&gt; NOT &lt;query2&gt;</code>
+ </td><td>Matches if query1 matches and query2 does not match.
+
+ </td></tr><tr style="text-align:left"><td><code>&lt;query1&gt; AND &lt;query2&gt;</code>
+ </td><td>Matches if both query1 and query2 match.
+
+ </td></tr><tr style="text-align:left;background-color:#DDDDDD"><td><code>&lt;query1&gt; OR &lt;query2&gt;</code>
+ </td><td>Matches if either query1 or query2 match.
+
+</td></tr></table>
+
+<p>
+Parenthesis may be used to group expressions in order to modify operator
+precedence in the usual ways. For example:
+
+</p><div class="codeblock"><pre><i>-- Matches documents that contain at least one instance of either "one"</i>
+<i>-- or "two", but do not contain any instances of token "three".</i>
+... MATCH 'one OR two NOT three'
+
+<i>-- Match all documents that contain the token "two" but not "three", or</i>
+<i>-- contain the token "one".</i>
+... MATCH 'one OR (two NOT three)'
+</pre></div>
+
+<p>
+Phrases and NEAR groups may also be connected by <b>implicit AND operators</b>.
+For simplicity, these are not shown in the BNF grammar above. Essentially, any
+sequence of phrases or NEAR groups (including those restricted to matching
+specified columns) separated only by whitespace are handled as if there were an
+implicit AND operator between each pair of phrases or NEAR groups. Implicit
+AND operators are never inserted after or before an expression enclosed in
+parenthesis. For example:
+
+</p><div class="codeblock"><pre>... MATCH 'one two three' <i>-- 'one AND two AND three'</i>
+... MATCH 'three "one two"' <i>-- 'three AND "one two"'</i>
+... MATCH 'NEAR(one two) three' <i>-- 'NEAR(one two) AND three'</i>
+... MATCH 'one OR two three' <i>-- 'one OR two AND three'</i>
+
+... MATCH '(one OR two) three' <i>-- Syntax error!</i>
+... MATCH 'func(one two)' <i>-- Syntax error!</i>
+</pre></div>
+
+<a name="fts5_table_creation_and_initialization"></a>
+<h1 tags="FTS5 CREATE TABLE Options" id="fts5_table_creation_and_initialization"><span>4. </span>FTS5 Table Creation and Initialization</h1>
+
+<p>Each argument specified as part of a "CREATE VIRTUAL TABLE ... USING fts5
+..." statement is either a column declaration or a configuration option. A
+<b>column declaration</b> consists of one or more whitespace separated FTS5
+barewords or string literals quoted in any manner acceptable to SQLite.
+
+</p><p>The first string or bareword in a column declaration is the column name. It
+is an error to attempt to name an fts5 table column "rowid" or "rank", or to
+assign the same name to a column as is used by the table itself. This is not
+supported.
+
+</p><p>Each subsequent string or bareword in a column declaration is a column
+option that modifies the behaviour of that column. Column options are
+case-independent. Unlike the SQLite core, FTS5 considers unrecognized column
+options to be errors. Currently, the only option recognized is
+<a href="fts5.html#the_unindexed_column_option">"UNINDEXED" (see below)</a>.
+
+</p><p>A <b>configuration option</b> consists of an FTS5 bareword - the option name -
+followed by an "=" character, followed by the option value. The option value is
+specified using either a single FTS5 bareword or a string literal, again quoted
+in any manner acceptable to the SQLite core. For example:
+
+</p><div class="codeblock"><pre>CREATE VIRTUAL TABLE mail USING fts5(sender, title, body, tokenize = 'porter ascii');
+</pre></div>
+
+<p> There are currently the following configuration options:
+
+</p><ul>
+ <li> The "tokenize" option, used to configure a <a href="fts5.html#tokenizers">custom tokenizer</a>.
+ </li><li> The "prefix" option, used to add <a href="fts5.html#prefix_indexes">prefix indexes</a>
+ to an FTS5 table.
+ </li><li> The "content" option, used to make the FTS5 table an
+ <a href="fts5.html#external_content_and_contentless_tables">external content or contentless table</a>.
+ </li><li> The "content_rowid" option, used to set the rowid field of an
+ <a href="fts5.html#external_content_tables">external content table</a>.
+ </li><li> The <a href="fts5.html#the_columnsize_option">"columnsize" option</a>, used to configure
+ whether or not the size in tokens of each value in the FTS5 table is
+ stored separately within the database.
+ </li><li> The <a href="fts5.html#the_detail_option">"detail" option</a>. This option may be used
+ to reduce the size of the FTS index on disk by omitting some information
+ from it.
+</li></ul>
+
+<a name="the_unindexed_column_option"></a>
+<h2 tags="unindexed" id="the_unindexed_column_option"><span>4.1. </span>The UNINDEXED column option</h2>
+
+<p>The contents of columns qualified with the UNINDEXED column option are not
+added to the FTS index. This means that for the purposes of MATCH queries and
+<a href="fts5.html#_auxiliary_functions_">FTS5 auxiliary functions</a>, the column contains no matchable tokens.
+
+</p><p>For example, to avoid adding the contents of the "uuid" field to the FTS
+index:
+</p><div class="codeblock"><pre>CREATE VIRTUAL TABLE customers USING fts5(name, addr, uuid UNINDEXED);
+</pre></div>
+
+<a name="prefix_indexes"></a>
+<h2 tags="FTS5 prefix indexes" id="prefix_indexes"><span>4.2. </span>Prefix Indexes</h2>
+
+<p> By default, FTS5 maintains a single index recording the location of each
+token instance within the document set. This means that querying for complete
+tokens is fast, as it requires a single lookup, but querying for a prefix
+token can be slow, as it requires a range scan. For example, to query for
+the prefix token "abc*" requires a range scan of all tokens greater than
+or equal to "abc" and less than "abd".
+
+</p><p> A prefix index is a separate index that records the location of all
+instances of prefix tokens of a certain length in characters used to speed
+up queries for prefix tokens. For example, optimizing a query for prefix
+token "abc*" requires a prefix index of three-character prefixes.
+
+</p><p> To add prefix indexes to an FTS5 table, the "prefix" option is set to
+either a single positive integer or a text value containing a white-space
+separated list of one or more positive integer values. A prefix index is
+created for each integer specified. If more than one "prefix" option is
+specified as part of a single CREATE VIRTUAL TABLE statement, all apply.
+
+</p><div class="codeblock"><pre><i>-- Two ways to create an FTS5 table that maintains prefix indexes for
+-- two and three character prefix tokens.</i>
+CREATE VIRTUAL TABLE ft USING fts5(a, b, prefix='2 3');
+CREATE VIRTUAL TABLE ft USING fts5(a, b, prefix=2, prefix=3);
+</pre></div>
+
+<a name="tokenizers"></a>
+<h2 tags="FTS5 tokenizers" id="tokenizers"><span>4.3. </span>Tokenizers</h2>
+
+<p> The CREATE VIRTUAL TABLE "tokenize" option is used to configure the
+specific tokenizer used by the FTS5 table. The option argument must be either
+an FTS5 bareword, or an SQL text literal. The text of the argument is itself
+treated as a white-space series of one or more FTS5 barewords or SQL text
+literals. The first of these is the name of the tokenizer to use. The second
+and subsequent list elements, if they exist, are arguments passed to the
+tokenizer implementation.
+
+</p><p> Unlike option values and column names, SQL text literals intended as
+tokenizers must be quoted using single quote characters. For example:
+
+</p><div class="codeblock"><pre><i>-- The following are all equivalent</i>
+CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'porter ascii');
+CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = "porter ascii");
+CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = "'porter' 'ascii'");
+CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = '''porter'' ''ascii''');
+
+<i>-- But this will fail:</i>
+CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = '"porter" "ascii"');
+
+<i>-- This will fail too:</i>
+CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'porter' 'ascii');
+</pre></div>
+
+
+<p>
+FTS5 features three built-in tokenizer modules, described in subsequent
+sections:
+
+</p><ul>
+ <li> The <b>unicode61</b> tokenizer, based on the Unicode 6.1 standard. This
+ is the default.
+
+ </li><li> The <b>ascii</b> tokenizer, which assumes all characters outside of
+ the ASCII codepoint range (0-127) are to be treated as token characters.
+
+ </li><li> The <b>porter</b> tokenizer, which implements the
+<a href="http://tartarus.org/martin/PorterStemmer/">porter stemming algorithm</a>.
+</li></ul>
+
+<p> It is also possible to create custom tokenizers for FTS5. The API for doing so is <a href="fts5.html#custom_tokenizers">described here</a>.
+
+</p><h3 id="unicode61_tokenizer"><span>4.3.1. </span>Unicode61 Tokenizer</h3>
+
+<p> The unicode tokenizer classifies all unicode characters as either
+"separator" or "token" characters. By default all space and punctuation
+characters, as defined by Unicode 6.1, are considered separators, and all
+other characters as token characters. More specifically, all unicode
+characters assigned to a
+<a href="https://en.wikipedia.org/wiki/Unicode_character_property#General_Category">
+general category</a> beginning with "L" or "N" (letters and numbers,
+specifically) or to category "Co" ("other, private use") are considered tokens.
+All other characters are separators.
+
+</p><p>Each contiguous run of one or more token characters is considered to be a
+token. The tokenizer is case-insensitive according to the rules defined by
+Unicode 6.1.
+
+</p><p> By default, diacritics are removed from all Latin script characters. This
+means, for example, that "A", "a", "&#192;", "&#224;", "&#194;" and "&#226;"
+are all considered to be equivalent.
+
+</p><p> Any arguments following "unicode61" in the token specification are treated
+as a list of alternating option names and values. Unicode61 supports the
+following options:
+
+</p><table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
+ <tr style="text-align:left"><th> Option </th><th> Usage
+ </th></tr><tr style="text-align:left;background-color:#DDDDDD"><td> remove_diacritics
+ </td><td>This option should be set to "0", "1" or "2". The default value is "1".
+ If it is set to "1" or "2", then diacritics are removed from Latin script
+ characters as described above. However, if it is set to "1", then diacritics
+ are not removed in the fairly uncommon case where a single unicode codepoint
+ is used to represent a character with more that one diacritic. For example,
+ diacritics are not removed from codepoint 0x1ED9 ("LATIN SMALL LETTER O WITH
+ CIRCUMFLEX AND DOT BELOW"). This is technically a bug, but cannot be fixed
+ without creating backwards compatibility problems. If this option is set to
+ "2", then diacritics are correctly removed from all Latin characters.
+
+ </td></tr><tr style="text-align:left"><td> categories
+ </td><td>This option may be used to modify the set of Unicode general categories
+ that are considered to correspond to token characters. The argument must
+ consist of a space separated list of two-character general category
+ abbreviations (e.g. "Lu" or "Nd"), or of the same with the second character
+ replaced with an asterisk ("*"), interpreted as a glob pattern. The default
+ value is "L* N* Co".
+
+ </td></tr><tr style="text-align:left;background-color:#DDDDDD"><td> tokenchars
+ </td><td> This option is used to specify additional unicode characters that
+ should be considered token characters, even if they are white-space or
+ punctuation characters according to Unicode 6.1. All characters in the
+ string that this option is set to are considered token characters.
+
+ </td></tr><tr style="text-align:left"><td> separators
+ </td><td> This option is used to specify additional unicode characters that
+ should be considered as separator characters, even if they are token
+ characters according to Unicode 6.1. All characters in the string that
+ this option is set to are considered separators.
+</td></tr></table>
+
+<p> For example:
+
+</p><div class="codeblock"><pre><i>-- Create an FTS5 table that does not remove diacritics from Latin
+-- script characters, and that considers hyphens and underscore characters
+-- to be part of tokens. </i>
+CREATE VIRTUAL TABLE ft USING fts5(a, b,
+ tokenize = "unicode61 remove_diacritics 0 tokenchars '-_'"
+);
+</pre></div>
+
+<p> or:
+
+</p><div class="codeblock"><pre><i>-- Create an FTS5 table that, as well as the default token character classes,</i>
+<i>-- considers characters in class "Mn" to be token characters.</i>
+CREATE VIRTUAL TABLE ft USING fts5(a, b,
+ tokenize = "unicode61 categories 'L* N* Co Mn'"
+);
+</pre></div>
+
+<p> The fts5 unicode61 tokenizer is byte-for-byte compatible with the fts3/4
+unicode61 tokenizer.
+
+</p><h3 id="ascii_tokenizer"><span>4.3.2. </span>Ascii Tokenizer</h3>
+
+<p> The Ascii tokenizer is similar to the Unicode61 tokenizer, except that:
+
+</p><ul>
+ <li> All non-ASCII characters (those with codepoints greater than 127) are
+ always considered token characters. If any non-ASCII characters are specified
+ as part of the separators option, they are ignored.
+
+ </li><li> Case-folding is only performed for ASCII characters. So while "A" and
+ "a" are considered to be equivalent, "&#195;" and "&#227;" are distinct.
+
+ </li><li> The remove_diacritics option is not supported.
+</li></ul>
+
+<p> For example:
+
+</p><div class="codeblock"><pre><i>-- Create an FTS5 table that uses the ascii tokenizer, but does not
+-- consider numeric characters to be part of tokens.</i>
+CREATE VIRTUAL TABLE ft USING fts5(a, b,
+ tokenize = "ascii separators '0123456789'"
+);
+</pre></div>
+
+<h3 id="porter_tokenizer"><span>4.3.3. </span>Porter Tokenizer</h3>
+
+<p> The porter tokenizer is a wrapper tokenizer. It takes the output of some
+other tokenizer and applies the
+<a href="http://tartarus.org/martin/PorterStemmer/">porter stemming algorithm</a>
+to each token before it returns it to FTS5. This allows search terms like
+"correction" to match similar words such as "corrected" or "correcting". The
+porter stemmer algorithm is designed for use with English language terms
+only - using it with other languages may or may not improve search utility.
+
+</p><p> By default, the porter tokenizer operates as a wrapper around the default
+tokenizer (unicode61). Or, if one or more extra arguments are added to the
+"tokenize" option following "porter", they are treated as a specification for
+the underlying tokenizer that the porter stemmer uses. For example:
+
+</p><div class="codeblock"><pre><i>-- Two ways to create an FTS5 table that uses the porter tokenizer to
+-- stem the output of the default tokenizer (unicode61). </i>
+CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = porter);
+CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'porter unicode61');
+
+<i>-- A porter tokenizer used to stem the output of the unicode61 tokenizer,
+-- with diacritics removed before stemming.</i>
+CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'porter unicode61 remove_diacritics 1');
+</pre></div>
+
+<a name="trigramidx"></a>
+
+<h3 id="the_experimental_trigram_tokenizer"><span>4.3.4. </span>The Experimental Trigram Tokenizer</h3>
+
+<p>
+The <b>experimental</b> trigram tokenizer extends FTS5 to support substring
+matching in general, instead of the usual token matching. When using the
+trigram tokenizer, a query or phrase token may match any sequence of characters
+within a row, not just a complete token. For example:
+
+</p><div class="codeblock"><pre>CREATE VIRTUAL TABLE tri USING fts5(a, tokenize="trigram");
+INSERT INTO tri VALUES('abcdefghij KLMNOPQRST uvwxyz');
+
+<i>-- The following queries all match the single row in the table</i>
+SELECT * FROM tri('cdefg');
+SELECT * FROM tri('cdefg AND pqr');
+SELECT * FROM tri('"hij klm" NOT stuv');
+</pre></div>
+
+<p>
+The trigram tokenizer supports a single option - "case_sensitive". With the default
+value, 0, matching is case-insensitive. If this value is set to 1, then all matches
+are case-sensitive.
+
+</p><div class="codeblock"><pre><i>-- A case-sensitive trigram index</i>
+CREATE VIRTUAL TABLE tri USING fts5(a, tokenize="trigram case_sensitive 1");
+</pre></div>
+
+<p>
+FTS5 tables that use the trigram tokenizer also support indexed GLOB and LIKE
+pattern matching. For example:
+
+</p><div class="codeblock"><pre>SELECT * FROM tri WHERE a LIKE '%cdefg%';
+SELECT * FROM tri WHERE a GLOB '*ij klm*xyz';
+</pre></div>
+
+<p>
+If an FTS5 trigram tokenizer is created with the case_sensitive option set to 1,
+it may only index GLOB queries, not LIKE.
+
+</p><p>
+Notes:
+
+</p><ul>
+ <li> Substrings consisting of fewer than 3 unicode characters do not match any
+ rows when used with a full-text query. If a LIKE or GLOB pattern does not
+ contain at least one sequence of non-wildcard unicode characters, FTS5
+ falls back to a linear scan of the entire table.
+
+ </li><li> If the FTS5 table is created with the detail=none or detail=column option
+ specified, full-text queries may not contain any tokens longer than 3
+ unicode characters. LIKE and GLOB pattern matching may be slightly slower,
+ but still works. If the index is to be used only for LIKE and/or GLOB
+ pattern matching, these options are worth experimenting with to reduce
+ the index size.
+</li></ul>
+
+<a name="external_content_and_contentless_tables"></a>
+<h2 tags="FTS5 content option" id="external_content_and_contentless_tables"><span>4.4. </span>External Content and Contentless Tables</h2>
+
+<p>
+Normally, when a row is inserted into an FTS5 table, as well as the various
+full-text index entries and other data a copy of the row is stored in a private
+table managed by the FTS5 module. When column values are requested from the
+FTS5 table by the user or by an auxiliary function implementation, they are
+read from this private table. The "content" option may be used to create an
+FTS5 table that stores only FTS full-text index entries. Because the column
+values themselves are usually much larger than the associated full-text index
+entries, this can save significant database space.
+
+</p><p>
+There are two ways to use the "content" option:
+</p><ul>
+ <li> By setting it to an empty string to create a contentless FTS5 table. In
+ this case FTS5 assumes that the original column values are unavailable
+ to it when processing queries. Full-text queries and some auxiliary
+ functions can still be used, but no column values apart from the rowid
+ may be read from the table.
+
+ </li><li> By setting it to the name of a database object (table, virtual table or
+ view) that may be queried by FTS5 at any time to retrieve the column
+ values. This is known as an "external content" table. In this case all
+ FTS5 functionality may be used, but it is the responsibility of the user
+ to ensure that the contents of the full-text index are consistent with
+ the named database object. If they are not, query results may be
+ unpredictable.
+</li></ul>
+
+<a name="contentless_tables"></a>
+<h3 tags="FTS5 contentless tables" id="contentless_tables"><span>4.4.1. </span>Contentless Tables</h3>
+
+<p> A contentless FTS5 table is created by setting the "content" option to
+an empty string. For example:
+
+</p><div class="codeblock"><pre>CREATE VIRTUAL TABLE f1 USING fts5(a, b, c, content='');
+</pre></div>
+
+<p> Contentless FTS5 tables do not support UPDATE or DELETE statements, or
+INSERT statements that do not supply a non-NULL value for the rowid field.
+Contentless tables do not support REPLACE conflict handling. REPLACE
+and INSERT OR REPLACE statements are treated as regular INSERT statements.
+Rows may be deleted from a contentless table using an <a href="fts5.html#the_delete_command">FTS5 delete command</a>.
+
+</p><p> Attempting to read any column value except the rowid from a contentless
+FTS5 table returns an SQL NULL value.
+
+</p><a name="external_content_tables"></a>
+<h3 tags="FTS5 external content tables" id="external_content_tables"><span>4.4.2. </span>External Content Tables</h3>
+
+<p> An external content FTS5 table is created by setting the content
+option to the name of a table, virtual table or view (hereafter the "content
+table") within the same database. Whenever column values are required by
+FTS5, it queries the content table as follows, with the rowid of the row
+for which values are required bound to the SQL variable:
+
+</p><div class="codeblock"><pre>SELECT &lt;content_rowid&gt;, &lt;cols&gt; FROM &lt;content&gt; WHERE &lt;content_rowid&gt; = ?;
+</pre></div>
+
+<p> In the above, &lt;content&gt; is replaced by the name of the content table.
+By default, &lt;content_rowid&gt; is replaced by the literal text "rowid". Or,
+if the "content_rowid" option is set within the CREATE VIRTUAL TABLE statement,
+by the value of that option. &lt;cols&gt; is replaced by a comma-separated list
+of the FTS5 table column names. For example:
+
+</p><div class="codeblock"><pre><i>-- If the database schema is: </i>
+CREATE TABLE tbl (a, b, c, d INTEGER PRIMARY KEY);
+CREATE VIRTUAL TABLE fts USING fts5(a, c, content=tbl, content_rowid=d);
+
+<i>-- Fts5 may issue queries such as:</i>
+SELECT d, a, c FROM tbl WHERE d = ?;
+</pre></div>
+
+<p> The content table may also be queried as follows:
+
+</p><div class="codeblock"><pre>SELECT &lt;content_rowid&gt;, &lt;cols&gt; FROM &lt;content&gt; ORDER BY &lt;content_rowid&gt; ASC;
+SELECT &lt;content_rowid&gt;, &lt;cols&gt; FROM &lt;content&gt; ORDER BY &lt;content_rowid&gt; DESC;
+</pre></div>
+
+<p> It is still the responsibility of the user to ensure that the contents of
+an external content FTS5 table are kept up to date with the content table.
+One way to do this is with triggers. For example:
+
+</p><div class="codeblock"><pre><i>-- Create a table. And an external content fts5 table to index it.</i>
+CREATE TABLE tbl(a INTEGER PRIMARY KEY, b, c);
+CREATE VIRTUAL TABLE fts_idx USING fts5(b, c, content='tbl', content_rowid='a');
+
+<i>-- Triggers to keep the FTS index up to date.</i>
+CREATE TRIGGER tbl_ai AFTER INSERT ON tbl BEGIN
+ INSERT INTO fts_idx(rowid, b, c) VALUES (new.a, new.b, new.c);
+END;
+CREATE TRIGGER tbl_ad AFTER DELETE ON tbl BEGIN
+ INSERT INTO fts_idx(fts_idx, rowid, b, c) VALUES('delete', old.a, old.b, old.c);
+END;
+CREATE TRIGGER tbl_au AFTER UPDATE ON tbl BEGIN
+ INSERT INTO fts_idx(fts_idx, rowid, b, c) VALUES('delete', old.a, old.b, old.c);
+ INSERT INTO fts_idx(rowid, b, c) VALUES (new.a, new.b, new.c);
+END;
+</pre></div>
+
+<p> Like contentless tables, external content tables do not support REPLACE
+conflict handling. Any operations that specify REPLACE conflict handling are
+handled using ABORT.
+
+</p><a name="the_columnsize_option"></a>
+<h2 tags="FTS5 columnsize option" id="the_columnsize_option"><span>4.5. </span>The Columnsize Option</h2>
+
+<p>Normally, FTS5 maintains a special backing table within the database that
+stores the size of each column value in tokens inserted into the main FTS5
+table in a separate table. This backing table is used by the
+<a href="#xColumnSize">xColumnSize</a><a> API function, which is in turn used by
+the built-in <a href="fts5.html#the_bm25_function">bm25 ranking function</a> (and is likely to be useful
+to other ranking functions as well).
+
+</a></p><p>In order to save space, this backing table may be omitted by setting the
+columnsize option to zero. For example:
+
+</p><div class="codeblock"><pre><i>-- A table without the xColumnSize() values stored on disk:</i>
+CREATE VIRTUAL TABLE ft USING fts5(a, b, c, columnsize=0);
+
+<i>-- Three equivalent ways of creating a table that does store the</i>
+<i>-- xColumnSize() values on disk:</i>
+CREATE VIRTUAL TABLE ft USING fts5(a, b, c);
+CREATE VIRTUAL TABLE ft USING fts5(a, b, c, columnsize=1);
+CREATE VIRTUAL TABLE ft USING fts5(a, b, columnsize='1', c);
+</pre></div>
+
+<p> It is an error to set the columnsize option to any value other than
+0 or 1.
+
+</p><p> If an FTS5 table is configured with columnsize=0 but is not a
+<a href="fts5.html#contentless_tables">contentless table</a>, the xColumnSize API function
+still works, but runs much more slowly. In this case, instead of reading
+the value to return directly from the database, it reads the text value
+itself and count the tokens within it on demand.
+
+</p><p>Or, if the table is also a <a href="fts5.html#contentless_tables">contentless table</a>,
+then the following apply:
+
+</p><ul>
+ <li> <p>The xColumnSize API always returns -1. There is no way to determine
+ the number of tokens in a value stored within a contentless FTS5 table
+ configured with columnsize=0.
+
+ </p></li><li> <p>Each inserted row must be accompanied by an explicitly specified rowid
+ value. If a contentless table is configured with columnsize=0,
+ attempting to insert a NULL value into the rowid is an SQLITE_MISMATCH
+ error.
+
+ </p></li><li> <p>All queries on the table must be full-text queries. In other words,
+ they must use the MATCH or = operator with the table-name column as the
+ left-hand operand, or else use the table-valued function syntax. Any
+ query that is not a full-text query results in an error.
+</p></li></ul>
+
+<p> The name of the table in which the xColumnSize values are stored
+(unless columnsize=0 is specified) is "&lt;name&gt;_docsize", where
+&lt;name&gt; is the name of the FTS5 table itself. The
+<a href="https://www.sqlite.org/download.html">sqlite3_analyzer</a>
+tool may be used on an existing database in order to determine how much
+space might be saved by recreating an FTS5 table using columnsize=0.
+
+</p><a name="the_detail_option"></a>
+<h2 tags="FTS5 detail option" id="the_detail_option"><span>4.6. </span>The Detail Option</h2>
+
+<p> For each term in a document, the FTS index maintained by FTS5
+stores the rowid of the document, the column number of the column that contains
+the term and the offset of the term within the column value. The "detail"
+option may be used to omit some of this information. This reduces the space
+that the index consumes within the database file, but also reduces the
+capability and efficiency of the system.
+
+</p><p> The detail option may be set to "full" (the default value), "column" or
+"none". For example:
+
+</p><div class="codeblock"><pre><i>-- The following two lines are equivalent (because the default value</i>
+<i>-- of "detail" is "full". </i>
+CREATE VIRTUAL TABLE ft1 USING fts5(a, b, c);
+CREATE VIRTUAL TABLE ft1 USING fts5(a, b, c, detail=full);
+
+CREATE VIRTUAL TABLE ft2 USING fts5(a, b, c, detail=column);
+CREATE VIRTUAL TABLE ft3 USING fts5(a, b, c, detail=none);
+</pre></div>
+
+<p>If the detail option is set to <b>column</b>, then for each term the FTS
+index records the rowid and column number only, omitting the term offset
+information. This results in the following restrictions:
+
+</p><ul>
+ <li> NEAR queries are not available.
+ </li><li> Phrase queries are not available.
+ </li><li> Assuming the table is not also a
+ <a href="fts5.html#contentless_tables">contentless table</a>, the
+ <a href="#xInstCount">xInstCount</a>, <a href="#xInst">xInst</a>,
+ <a href="#xPhraseFirst">xPhraseFirst</a> and <a href="#xPhraseNext">xPhraseNext</a>
+ are slower than usual. This is because instead of reading the required data
+ directly from the FTS index they have to load and tokenize the document text
+ on demand.
+ </li><li> If the table is also a contentless table, the xInstCount, xInst,
+ xPhraseFirst and xPhraseNext APIs behave as if the current row contains no
+ phrase matches at all (i.e. xInstCount() returns 0).
+</li></ul>
+
+<p>If the detail option is set to <b>none</b>, then for each term the FTS
+index records just the rowid is stored. Both column and offset information
+are omitted. As well as the restrictions itemized above for detail=column
+mode, this imposes the following extra limitations:
+
+</p><ul>
+ <li> Column filter queries are not available.
+ </li><li> Assuming the table is not also a contentless table, the
+ <a href="#xPhraseFirstColumn">xPhraseFirstColumn</a> and
+ <a href="#xPhraseNextColumn">xPhraseNextColumn</a> are slower than usual.
+
+ </li><li> If the table is also a contentless table, the xPhraseFirstColumn and
+ xPhraseNextColumn APIs behave as if the current row contains no phrase
+ matches at all (i.e. xPhraseFirstColumn() sets the iterator to EOF).
+</li></ul>
+
+<p> In one test that indexed a large set of emails (1636 MiB on disk), the FTS
+index was 743 MiB on disk with detail=full, 340 MiB with detail=column and 134
+MiB with detail=none.
+
+</p><a name="_auxiliary_functions_"></a>
+<h1 tags="FTS5 auxiliary functions" id="_auxiliary_functions_"><span>5. </span> Auxiliary Functions </h1>
+
+<p> Auxiliary functions are similar to <a href="lang_corefunc.html">SQL scalar functions</a>,
+except that they may only be used within full-text queries (those that use
+the MATCH operator) on an FTS5 table. Their results are calculated based not
+only on the arguments passed to them, but also on the current match and
+matched row. For example, an auxiliary function may return a numeric value
+indicating the accuracy of the match (see the <a href="fts5.html#the_bm25_function">bm25()</a> function),
+or a fragment of text from the matched row that contains one or more
+instances of the search terms (see the <a href="fts5.html#the_snippet_function">snippet()</a> function).
+
+</p><p>To invoke an auxiliary function, the name of the FTS5 table should be
+specified as the first argument. Other arguments may follow the first,
+depending on the specific auxiliary function being invoked. For example, to
+invoke the "highlight" function:
+
+</p><div class="codeblock"><pre>SELECT highlight(email, 2, '&lt;b&gt;', '&lt;/b&gt;') FROM email WHERE email MATCH 'fts5'
+</pre></div>
+
+<p>The built-in auxiliary functions provided as part of FTS5 are described in
+the following section. Applications may also implement
+<a href="fts5.html#custom_auxiliary_functions">custom auxiliary functions in C</a>.
+
+</p><h2 id="built_in_auxiliary_functions"><span>5.1. </span>Built-in Auxiliary Functions</h2>
+
+<p> FTS5 provides three built-in auxiliary functions:
+
+</p><ul>
+ <li> The <a href="fts5.html#the_bm25_function">bm25() auxiliary function</a> returns a real value
+ reflecting the accuracy of the current match. Better matches are
+ assigned numerically lower values.
+
+ </li><li> The <a href="fts5.html#the_highlight_function">highlight() auxiliary function</a> returns a copy
+ of the text from one of the columns of the current match with each
+ instance of a queried term within the result surrounded by specified
+ markup (for example "&lt;b&gt;" and "&lt;/b&gt;").
+
+ </li><li> The <a href="fts5.html#the_snippet_function">snippet() auxiliary function</a> selects a short
+ fragment of text from one of the columns of the matched row and returns
+ it with each instance of a queried term surrounded by markup in
+ the same manner as the highlight() function. The fragment of text is
+ selected so as to maximize the number of queried terms it contains.
+</li></ul>
+
+<a name="the_bm25_function"></a>
+<h3 tags="FTS5 bm25" id="the_bm25_function"><span>5.1.1. </span>The bm25() function</h3>
+
+<p> The built-in auxiliary function bm25() returns a real value indicating
+how well the current row matches the full-text query. The better the match,
+the numerically smaller the value returned. A query such as the following may
+be used to return matches in order from best to worst match:
+
+</p><div class="codeblock"><pre>SELECT * FROM fts WHERE fts MATCH ? ORDER BY bm25(fts)
+</pre></div>
+
+<p> In order to calculate a documents score, the full-text query is separated
+ into its component phrases. The bm25 score for document <i>D</i> and
+ query <i>Q</i> is then calculated as follows:
+
+</p><p> <img src="images/fts5_formula1.png" style="width:75ex;margin-left:5ex">
+
+</p><p> In the above, <i>nPhrase</i> is the number of phrases in the query.
+ <i>|D|</i> is the number of tokens in the current document, and
+ <i>avgdl</i> is the average number of tokens in all documents within the
+ FTS5 table. <i>k<sub>1</sub></i> and <i>b</i> are both constants,
+ hard-coded at 1.2 and 0.75 respectively.
+
+</p><p> The "-1" term at the start of the formula is not found in most
+implementations of the BM25 algorithm. Without it, a better match is assigned
+a numerically higher BM25 score. Since the default sorting order is
+"ascending", this means that appending "ORDER BY bm25(fts)" to a query would
+cause results to be returned in order from worst to best. The "DESC" keyword
+would be required in order to return the best matches first. In order to
+avoid this pitfall, the FTS5 implementation of BM25 multiplies the result
+by -1 before returning it, ensuring that better matches are assigned
+numerically lower scores.
+
+</p><p> <i>IDF(q<sub>i</sub>)</i> is the inverse-document-frequency of query
+ phrase <i>i</i>. It is calculated as follows, where <i>N</i> is the total
+ number of rows in the FTS5 table and <i>n(q<sub>i</sub>)</i> is the total
+ number of rows that contain at least one instance of phrase <i>i</i>:
+
+</p><p> <img src="images/fts5_formula2.png" style="width:75ex;margin-left:5ex">
+
+</p><p> Finally, <i>f(q<sub>i</sub>,D)</i> is the phrase frequency of phrase
+<i>i</i>. By default, this is simply the number of occurrences of the phrase
+within the current row. However, by passing extra real value arguments to
+the bm25() SQL function, each column of the table may be assigned a different
+weight and the phrase frequency calculated as follows:
+
+</p><p> <img src="images/fts5_formula3.png" style="width:75ex;margin-left:5ex">
+
+</p><p> where <i>w<sub>c</sub></i> is the weight assigned to column <i>c</i> and
+<i>n(q<sub>i</sub>,c)</i> is the number of occurrences of phrase <i>i</i> in
+column <i>c</i> of the current row. The first argument passed to bm25()
+following the table name is the weight assigned to the leftmost column of
+the FTS5 table. The second is the weight assigned to the second leftmost
+column, and so on. If there are not enough arguments for all table columns,
+remaining columns are assigned a weight of 1.0. If there are too many
+trailing arguments, the extras are ignored. For example:
+
+</p><div class="codeblock"><pre><i>-- Assuming the following schema:</i>
+CREATE VIRTUAL TABLE email USING fts5(sender, title, body);
+
+<i>-- Return results in bm25 order, with each phrase hit in the "sender"</i>
+<i>-- column considered the equal of 10 hits in the "body" column, and</i>
+<i>-- each hit in the "title" column considered as valuable as 5 hits in</i>
+<i>-- the "body" column.</i>
+SELECT * FROM email WHERE email MATCH ? ORDER BY bm25(email, 10.0, 5.0);
+</pre></div>
+
+<p>Refer to wikipedia for
+<a href="http://en.wikipedia.org/wiki/Okapi_BM25">more information regarding
+BM25</a> and its variants.
+
+</p><a name="the_highlight_function"></a>
+<h3 tags="FTS5 highlight" id="the_highlight_function"><span>5.1.2. </span>The highlight() function</h3>
+
+<p> The highlight() function returns a copy of the text from a specified
+column of the current row with extra markup text inserted to mark the start
+and end of phrase matches.
+
+</p><p>The highlight() must be invoked with exactly three arguments following
+the table name. To be interpreted as follows:
+
+</p><ol>
+ <li> An integer indicating the index of the FTS table column to read the
+ text from. Columns are numbered from left to right starting at zero.
+
+ </li><li> The text to insert before each phrase match.
+
+ </li><li> The text to insert after each phrase match.
+</li></ol>
+
+<p>For example:
+
+</p><div class="codeblock"><pre><i>-- Return a copy of the text from the leftmost column of the current</i>
+<i>-- row, with phrase matches marked using html "b" tags.</i>
+SELECT highlight(fts, 0, '&lt;b&gt;', '&lt;/b&gt;') FROM fts WHERE fts MATCH ?
+</pre></div>
+
+<p>In cases where two or more phrase instances overlap (share one or more
+tokens in common), a single open and close marker is inserted for each set
+of overlapping phrases. For example:
+
+</p><div class="codeblock"><pre><i>-- Assuming this:</i>
+CREATE VIRTUAL TABLE ft USING fts5(a);
+INSERT INTO ft VALUES('a b c x c d e');
+INSERT INTO ft VALUES('a b c c d e');
+INSERT INTO ft VALUES('a b c d e');
+
+<i>-- The following SELECT statement returns these three rows:</i>
+<i>-- '&#91;a b c&#93; x &#91;c d e&#93;'</i>
+<i>-- '&#91;a b c&#93; &#91;c d e&#93;'</i>
+<i>-- '&#91;a b c d e&#93;'</i>
+SELECT highlight(ft, 0, '&#91;', '&#93;') FROM ft WHERE ft MATCH 'a+b+c AND c+d+e';
+</pre></div>
+
+<a name="the_snippet_function"></a>
+<h3 tags="FTS5 snippet" id="the_snippet_function"><span>5.1.3. </span>The snippet() function</h3>
+
+<p>The snippet() function is similar to highlight(), except that instead of
+returning entire column values, it automatically selects and extracts a
+short fragment of document text to process and return. The snippet() function
+must be passed five parameters following the table name argument:
+
+</p><ol>
+ <li> An integer indicating the index of the FTS table column to select
+ the returned text from. Columns are numbered from left to right
+ starting at zero. A negative value indicates that the column should
+ be automatically selected.
+
+ </li><li> The text to insert before each phrase match within the returned text.
+
+ </li><li> The text to insert after each phrase match within the returned text.
+
+ </li><li> The text to add to the start or end of the selected text to indicate
+ that the returned text does not occur at the start or end of its column,
+ respectively.
+
+ </li><li> The maximum number of tokens in the returned text. This must be greater
+ than zero and equal to or less than 64.
+</li></ol>
+
+<a name="sorting_by_auxiliary_function_results"></a>
+<h2 tags="auxiliary function mapping" id="sorting_by_auxiliary_function_results"><span>5.2. </span>Sorting by Auxiliary Function Results</h2>
+
+<p> All FTS5 tables feature a special hidden column named "rank". If the
+current query is not a full-text query (i.e. if it does not include a MATCH
+operator), the value of the "rank" column is always NULL. Otherwise, in a
+full-text query, column rank contains by default the same value as would be
+returned by executing the bm25() auxiliary function with no trailing
+arguments.
+
+</p><p> The difference between reading from the rank column and using the bm25()
+function directly within the query is only significant when sorting by the
+returned value. In this case, using "rank" is faster than using bm25().
+
+</p><div class="codeblock"><pre><i>-- The following queries are logically equivalent. But the second may</i>
+<i>-- be faster, particularly if the caller abandons the query before</i>
+<i>-- all rows have been returned (or if the queries were modified to </i>
+<i>-- include LIMIT clauses).</i>
+SELECT * FROM fts WHERE fts MATCH ? ORDER BY bm25(fts);
+SELECT * FROM fts WHERE fts MATCH ? ORDER BY rank;
+</pre></div>
+
+<p> Instead of using bm25() with no trailing arguments, the specific auxiliary
+function mapped to the rank column may be configured either on a per-query
+basis, or by setting a different persistent default for the FTS table.
+
+</p><p> In order to change the mapping of the rank column for a single query,
+a term similar to either of the following is added to the WHERE clause of a
+query:
+
+</p><div class="codeblock"><pre>rank MATCH 'auxiliary-function-name(arg1, arg2, ...)'
+rank = 'auxiliary-function-name(arg1, arg2, ...)'
+</pre></div>
+
+<p> The right-hand-side of the MATCH or = operator must be a constant
+expression that evaluates to a string consisting of the auxiliary function to
+invoke, followed by zero or more comma separated arguments within parenthesis.
+Arguments must be SQL literals. For example:
+
+</p><div class="codeblock"><pre><i>-- The following queries are logically equivalent. But the second may</i>
+<i>-- be faster. See above. </i>
+SELECT * FROM fts WHERE fts MATCH ? ORDER BY bm25(fts, 10.0, 5.0);
+SELECT * FROM fts WHERE fts MATCH ? AND rank MATCH 'bm25(10.0, 5.0)' ORDER BY rank;
+</pre></div>
+
+<p> The table-valued function syntax may also be used to specify an alternative
+ranking function. In this case the text describing the ranking function should
+be specified as the second table-valued function argument. The following three
+queries are equivalent:
+
+</p><div class="codeblock"><pre>SELECT * FROM fts WHERE fts MATCH ? AND rank MATCH 'bm25(10.0, 5.0)' ORDER BY rank;
+SELECT * FROM fts WHERE fts = ? AND rank = 'bm25(10.0, 5.0)' ORDER BY rank;
+SELECT * FROM fts WHERE fts(?, 'bm25(10.0, 5.0)') ORDER BY rank;
+</pre></div>
+
+<p> The default mapping of the rank column for a table may be modified
+using the <a href="fts5.html#the_rank_configuration_option">FTS5 rank configuration option</a>.
+
+</p><h1 id="special_insert_commands"><span>6. </span>Special INSERT Commands</h1>
+
+<a name="the_automerge_configuration_option"></a>
+<h2 tags="FTS5 automerge option" id="the_automerge_configuration_option"><span>6.1. </span>The 'automerge' Configuration Option</h2>
+
+<p>
+ Instead of using a single data structure on disk to store the full-text
+ index, FTS5 uses a series of b-trees. Each time a new transaction is
+ committed, a new b-tree containing the contents of the committed transaction
+ is written into the database file. When the full-text index is queried, each
+ b-tree must be queried individually and the results merged before being
+ returned to the user.
+
+</p><p>
+ In order to prevent the number of b-trees in the database from becoming too
+ large (slowing down queries), smaller b-trees are periodically merged into
+ single larger b-trees containing the same data. By default, this happens
+ automatically within INSERT, UPDATE or DELETE statements that modify the
+ full-text index. The 'automerge' parameter determines how many smaller
+ b-trees are merged together at a time. Setting it to a small value can
+ speed up queries (as they have to query and merge the results from fewer
+ b-trees), but can also slow down writing to the database (as each INSERT,
+ UPDATE or DELETE statement has to do more work as part of the automatic
+ merging process).
+
+</p><p>
+ Each of the b-trees that make up the full-text index is assigned to a "level"
+ based on its size. Level-0 b-trees are the smallest, as they contain the
+ contents of a single transaction. Higher level b-trees are the result of
+ merging two or more level-0 b-trees together and so they are larger. FTS5
+ begins to merge b-trees together once there exist <i>M</i> or more b-trees
+ with the same level, where <i>M</i> is the value of the 'automerge'
+ parameter.
+
+</p><p>
+ The maximum allowed value for the 'automerge' parameter is 16. The default
+ value is 4. Setting the 'automerge' parameter to 0 disables the automatic
+ incremental merging of b-trees altogether.
+
+</p><div class="codeblock"><pre>INSERT INTO ft(ft, rank) VALUES('automerge', 8);
+</pre></div>
+
+<h2 id="the_crisismerge_configuration_option"><span>6.2. </span>The 'crisismerge' Configuration Option</h2>
+
+<p>The 'crisismerge' option is similar to 'automerge', in that it determines
+how and how often the component b-trees that make up the full-text index are
+merged together. Once there exist <i>C</i> or more b-trees on a single level
+within the full-text index, where <i>C</i> is the value of the 'crisismerge'
+option, all b-trees on the level are immediately merged into a single b-tree.
+
+</p><p>The difference between this option and the 'automerge' option is that when
+the 'automerge' limit is reached FTS5 only begins to merge the b-trees
+together. Most of the work is performed as part of subsequent INSERT,
+UPDATE or DELETE operations. Whereas when the 'crisismerge' limit is reached,
+the offending b-trees are all merged immediately. This means that an INSERT,
+UPDATE or DELETE that triggers a crisis-merge may take a long time to
+complete.
+
+</p><p>The default 'crisismerge' value is 16. There is no maximum limit. Attempting
+to set the 'crisismerge' parameter to a value of 0 or 1 is equivalent to
+setting it to the default value (16). It is an error to attempt to set the
+'crisismerge' option to a negative value.
+
+</p><div class="codeblock"><pre>INSERT INTO ft(ft, rank) VALUES('crisismerge', 16);
+</pre></div>
+
+<a name="the_delete_command"></a>
+<h2 tags="FTS5 delete command" id="the_delete_command"><span>6.3. </span>The 'delete' Command</h2>
+
+<p> This command is only available with <a href="fts5.html#external_content_tables">external content</a> and <a href="fts5.html#contentless_tables">contentless</a> tables. It
+is used to delete the index entries associated with a single row from the
+full-text index. This command and the <a href="fts5.html#the_delete_all_command">delete-all</a>
+command are the only ways to remove entries from the full-text index of a
+contentless table.
+
+</p><p> In order to use this command to delete a row, the text value 'delete'
+must be inserted into the special column with the same name as the table.
+The rowid of the row to delete is inserted into the rowid column. The
+values inserted into the other columns must match the values currently
+stored in the table. For example:
+
+</p><div class="codeblock"><pre><i>-- Insert a row with rowid=14 into the fts5 table.</i>
+INSERT INTO ft(rowid, a, b, c) VALUES(14, $a, $b, $c);
+
+<i>-- Remove the same row from the fts5 table.</i>
+INSERT INTO ft(ft, rowid, a, b, c) VALUES('delete', 14, $a, $b, $c);
+</pre></div>
+
+<p> If the values "inserted" into the text columns as part of a 'delete'
+command are not the same as those currently stored within the table, the
+results may be unpredictable.
+
+</p><p> The reason for this is easy to understand: When a document is inserted
+into the FTS5 table, an entry is added to the full-text index to record the
+position of each token within the new document. When a document is removed,
+the original data is required in order to determine the set of entries that
+need to be removed from the full-text index. So if the data supplied to FTS5
+when a row is deleted using this command is different from that used to
+determine the set of token instances when it was inserted, some full-text
+index entries may not be correctly deleted, or FTS5 may try to remove index
+entries that do not exist. This can leave the full-text index in an
+unpredictable state, making future query results unreliable.
+
+</p><a name="the_delete_all_command"></a>
+<h2 tags="FTS5 delete-all command" id="the_delete_all_command"><span>6.4. </span>The 'delete-all' Command</h2>
+
+<p> This command is only available with <a href="fts5.html#external_content_tables">external content</a> and <a href="fts5.html#contentless_tables">contentless</a> tables. It
+deletes all entries from the full-text index.
+
+</p><div class="codeblock"><pre>INSERT INTO ft(ft) VALUES('delete-all');
+</pre></div>
+
+<h2 id="the_integrity_check_command"><span>6.5. </span>The 'integrity-check' Command</h2>
+
+<p> This command is used to verify that the full-text index is internally
+consistent, and, optionally, that it is consistent with any
+<a href="fts5.html#external_content_tables">external content</a> table.
+
+</p><p>The integrity-check command is invoked by inserting the text value
+'integrity-check' into the special column with the same name as the FTS5
+table. If a value is supplied for the "rank" column, it must be either
+0 or 1. For example:
+
+</p><div class="codeblock"><pre>INSERT INTO ft(ft) VALUES('integrity-check');
+INSERT INTO ft(ft, rank) VALUES('integrity-check', 0);
+INSERT INTO ft(ft, rank) VALUES('integrity-check', 1);
+</pre></div>
+
+<p>The three forms above are equivalent for all FTS tables that are
+not external content tables. They check that the index data structures are
+not corrupt, and, if the FTS table is not contentless, that the contents of
+the index match the contents of the table itself.
+
+</p><p>For an external content table, the contents of the index are only
+compared to the contents of the external content table if the value
+specified for the rank column is 1.
+
+</p><p>In all cases, if any discrepancies are found, the command fails
+with an <a href="rescode.html#corrupt_vtab">SQLITE_CORRUPT_VTAB</a> error.
+
+</p><a name="the_merge_command"></a>
+<h2 tags="FTS5 merge command" id="the_merge_command"><span>6.6. </span>The 'merge' Command</h2>
+
+<div class="codeblock"><pre>INSERT INTO ft(ft, rank) VALUES('merge', 500);
+</pre></div>
+
+<p> This command merges b-tree structures together until roughly N pages
+of merged data have been written to the database, where N is the absolute
+value of the parameter specified as part of the 'merge' command. The size of
+each page is as configured by the <a href="fts5.html#the_pgsz_configuration_option">FTS5 pgsz option</a>.
+
+</p><p> If the parameter is a positive value, B-tree structures are only eligible
+for merging if one of the following is true:
+
+</p><ul>
+ <li> There are U or more such b-trees on a
+ single level (see the documentation for the <a href="fts5.html#the_automerge_configuration_option">FTS5 automerge option</a>
+ for an explanation of b-tree levels), where U is the value assigned
+ to the <a href="fts5.html#the_usermerge_configuration_option">FTS5 usermerge option</a> option.
+ </li><li> A merge has already been started (perhaps by a 'merge' command that
+ specified a negative parameter).
+</li></ul>
+
+<p> It is possible to tell whether or not the 'merge' command found any
+b-trees to merge together by checking the value returned by the
+<a href="c3ref/total_changes.html">sqlite3_total_changes()</a> API before and after the command is executed. If
+the difference between the two values is 2 or greater, then work was performed.
+If the difference is less than 2, then the 'merge' command was a no-op. In this
+case there is no reason to execute the same 'merge' command again, at least
+until after the FTS table is next updated.
+
+</p><p> If the parameter is negative, and there are B-tree structures on more than
+one level within the FTS index, all B-tree structures are assigned to the same
+level before the merge operation is commenced. Additionally, if the parameter
+is negative, the value of the usermerge configuration option is not
+respected - as few as two b-trees from the same level may be merged together.
+
+</p><p> The above means that executing the 'merge' command with a negative
+parameter until the before and after difference in the return value of
+<a href="c3ref/total_changes.html">sqlite3_total_changes()</a> is less than two optimizes the FTS index in the
+same way as the <a href="fts5.html#the_optimize_command">FTS5 optimize command</a>. However, if a new b-tree is added
+to the FTS index while this process is ongoing, FTS5 will move the new
+b-tree to the same level as the existing b-trees and restart the merge. To
+avoid this, only the first call to 'merge' should specify a negative parameter.
+Each subsequent call to 'merge' should specify a positive value so that the
+merge started by the first call is run to completion even if new b-trees are
+added to the FTS index.
+
+</p><a name="the_optimize_command"></a>
+<h2 tags="FTS5 optimize command" id="the_optimize_command"><span>6.7. </span>The 'optimize' Command</h2>
+
+<p>This command merges all individual b-trees that currently make up the
+full-text index into a single large b-tree structure. This ensures that the
+full-text index consumes the minimum space within the database and is in the
+fastest form to query.
+
+</p><p>Refer to the documentation for the <a href="fts5.html#the_automerge_configuration_option">FTS5 automerge option</a> for more details
+regarding the relationship between the full-text index and its component
+b-trees.
+
+</p><div class="codeblock"><pre>INSERT INTO ft(ft) VALUES('optimize');
+</pre></div>
+
+<p>Because it reorganizes the entire FTS index, the optimize command can
+take a long time to run. The <a href="fts5.html#the_merge_command">FTS5 merge command</a> can be used to divide
+the work of optimizing the FTS index into multiple steps. To do this:
+
+</p><ul>
+ <li> Invoke the 'merge' command once with the parameter set to -N, then
+ </li><li> Invoke the 'merge' command zero or more times with the parameter set to N.
+</li></ul>
+
+<p>where N is the number of pages of data to merge within each invocation of
+the merge command. The application should stop invoking merge when the
+difference in the value returned by the sqlite3_total_changes() function before
+and after the merge command drops to below two. The merge commands may be
+issued as part of the same or separate transactions, and by the same or
+different database clients. Refer to the documentation for the
+<a href="fts5.html#the_merge_command">merge command</a> for further details.
+
+</p><a name="the_pgsz_configuration_option"></a>
+<h2 tags="FTS5 pgsz option" id="the_pgsz_configuration_option"><span>6.8. </span>The 'pgsz' Configuration Option</h2>
+
+<p> This command is used to set the persistent "pgsz" option.
+
+</p><p> The full-text index maintained by FTS5 is stored as a series of fixed-size
+blobs in a database table. It is not strictly necessary for all blobs that make
+up a full-text index to be the same size. The pgsz option determines the size
+of all blobs created by subsequent index writers. The default value is 1000.
+
+</p><div class="codeblock"><pre>INSERT INTO ft(ft, rank) VALUES('pgsz', 4072);
+</pre></div>
+
+<a name="the_rank_configuration_option"></a>
+<h2 tags="FTS5 rank configuration option" id="the_rank_configuration_option"><span>6.9. </span>The 'rank' Configuration Option</h2>
+
+<p> This command is used to set the persistent "rank" option.
+
+</p><p> The rank option is used to change the default auxiliary function mapping
+for the rank column. The option should be set to a text value in the same
+format as described for <a href="fts5.html#sorting_by_auxiliary_function_results">"rank MATCH ?"</a> terms
+above. For example:
+
+</p><div class="codeblock"><pre>INSERT INTO ft(ft, rank) VALUES('rank', 'bm25(10.0, 5.0)');
+</pre></div>
+
+<a name="the_rebuild_command"></a>
+<h2 tags="FTS5 rebuild command" id="the_rebuild_command"><span>6.10. </span>The 'rebuild' Command</h2>
+
+<p> This command first deletes the entire full-text index, then rebuilds it
+based on the contents of the table or <a href="fts5.html#external_content_tables">content
+table</a>. It is not available with <a href="fts5.html#contentless_tables">contentless
+tables</a>.
+
+</p><div class="codeblock"><pre>INSERT INTO ft(ft) VALUES('rebuild');
+</pre></div>
+
+<a name="the_usermerge_configuration_option"></a>
+<h2 tags="FTS5 usermerge option" id="the_usermerge_configuration_option"><span>6.11. </span>The 'usermerge' Configuration Option</h2>
+
+<p> This command is used to set the persistent "usermerge" option.
+
+</p><p> The usermerge option is similar to the automerge and crisismerge options.
+It is the minimum number of b-tree segments that will be merged together by
+a 'merge' command with a positive parameter. For example:
+
+</p><div class="codeblock"><pre>INSERT INTO ft(ft, rank) VALUES('usermerge', 4);
+</pre></div>
+
+<p> The default value of the usermerge option is 4. The minimum allowed value
+is 2, and the maximum 16.
+
+</p><a name="extending_fts5"></a>
+<h1 tags="Extending FTS5" id="extending_fts5"><span>7. </span>Extending FTS5</h1>
+
+<p>FTS5 features APIs allowing it to be extended by:
+
+</p><ul>
+ <li> Adding new auxiliary functions implemented in C, and
+ </li><li> Adding new tokenizers, also implemented in C.
+</li></ul>
+
+<p> The built-in tokenizers and auxiliary functions described in this
+document are all implemented using the publicly available API described
+below.
+
+</p><p> Before a new auxiliary function or tokenizer implementation may be
+registered with FTS5, an application must obtain a pointer to the "fts5_api"
+structure. There is one fts5_api structure for each database connection with
+which the FTS5 extension is registered. To obtain the pointer, the application
+invokes the SQL user-defined function fts5() with a single argument. That
+argument must be set to a pointer to a pointer to an fts5_api object
+using the <a href="c3ref/bind_blob.html">sqlite3_bind_pointer()</a> interface.
+The following example code demonstrates the technique:
+
+</p><div class="codeblock"><pre><i>/*
+** Return a pointer to the fts5_api pointer for database connection db.
+** If an error occurs, return NULL and leave an error in the database
+** handle (accessible using sqlite3_errcode()/errmsg()).
+*/</i>
+fts5_api *fts5_api_from_db(sqlite3 *db){
+ fts5_api *pRet = 0;
+ sqlite3_stmt *pStmt = 0;
+
+ if( SQLITE_OK==sqlite3_prepare(db, "SELECT fts5(?1)", -1, &pStmt, 0) ){
+ sqlite3_bin_pointer(pStmt, (void*)&pRet, "fts5_api_ptr", NULL);
+ sqlite3_step(pStmt);
+ }
+ sqlite3_finalize(pStmt);
+ return pRet;
+}
+</pre></div>
+
+<p><b>Backwards Compatibility Warning:</b>
+Prior to SQLite version 3.20.0 (2017-08-01), the fts5() worked slightly
+differently. Older applications that extend FTS5 must be revised to use
+the new technique shown above.
+
+</p><p> The fts5_api structure is defined as follows. It exposes three methods,
+one each for registering new auxiliary functions and tokenizers, and one for
+retrieving existing tokenizer. The latter is intended to facilitate the
+implementation of "tokenizer wrappers" similar to the built-in
+porter tokenizer.
+
+</p><div class="codeblock"><pre>typedef struct fts5_api fts5_api;
+struct fts5_api {
+ int iVersion; <i>/* Currently always set to 2 */</i>
+
+ <i>/* Create a new tokenizer */</i>
+ int (*xCreateTokenizer)(
+ fts5_api *pApi,
+ const char *zName,
+ void *pContext,
+ fts5_tokenizer *pTokenizer,
+ void (*xDestroy)(void*)
+ );
+
+ <i>/* Find an existing tokenizer */</i>
+ int (*xFindTokenizer)(
+ fts5_api *pApi,
+ const char *zName,
+ void **ppContext,
+ fts5_tokenizer *pTokenizer
+ );
+
+ <i>/* Create a new auxiliary function */</i>
+ int (*xCreateFunction)(
+ fts5_api *pApi,
+ const char *zName,
+ void *pContext,
+ fts5_extension_function xFunction,
+ void (*xDestroy)(void*)
+ );
+};
+</pre></div>
+
+<p> To invoke a method of the fts5_api object, the fts5_api pointer itself
+should be passed as the methods first argument followed by the other, method
+specific, arguments. For example:
+
+</p><div class="codeblock"><pre>rc = pFts5Api->xCreateTokenizer(pFts5Api, ... other args ...);
+</pre></div>
+
+<p> The fts5_api structure methods are described individually in the following
+sections.
+
+</p><a name="custom_tokenizers"></a>
+<h2 tags="custom tokenizers" id="custom_tokenizers"><span>7.1. </span>Custom Tokenizers</h2>
+
+<p> To create a custom tokenizer, an application must implement three
+functions: a tokenizer constructor (xCreate), a destructor (xDelete) and a
+function to do the actual tokenization (xTokenize). The type of each
+function is as for the member variables of the fts5_tokenizer struct:
+
+</p><div class="codeblock"><pre>typedef struct Fts5Tokenizer Fts5Tokenizer;
+typedef struct fts5_tokenizer fts5_tokenizer;
+struct fts5_tokenizer {
+ int (*xCreate)(void*, const char **azArg, int nArg, Fts5Tokenizer **ppOut);
+ void (*xDelete)(Fts5Tokenizer*);
+ int (*xTokenize)(Fts5Tokenizer*,
+ void *pCtx,
+ int flags, <i>/* Mask of FTS5_TOKENIZE_* flags */</i>
+ const char *pText, int nText,
+ int (*xToken)(
+ void *pCtx, <i>/* Copy of 2nd argument to xTokenize() */</i>
+ int tflags, <i>/* Mask of FTS5_TOKEN_* flags */</i>
+ const char *pToken, <i>/* Pointer to buffer containing token */</i>
+ int nToken, <i>/* Size of token in bytes */</i>
+ int iStart, <i>/* Byte offset of token within input text */</i>
+ int iEnd <i>/* Byte offset of end of token within input text */</i>
+ )
+ );
+};
+
+<i>/* Flags that may be passed as the third argument to xTokenize() */</i>
+#define FTS5_TOKENIZE_QUERY 0x0001
+#define FTS5_TOKENIZE_PREFIX 0x0002
+#define FTS5_TOKENIZE_DOCUMENT 0x0004
+#define FTS5_TOKENIZE_AUX 0x0008
+
+<i>/* Flags that may be passed by the tokenizer implementation back to FTS5
+** as the third argument to the supplied xToken callback. */</i>
+#define FTS5_TOKEN_COLOCATED 0x0001 <i>/* Same position as prev. token */</i>
+</pre></div>
+
+<p> The implementation is registered with the FTS5 module by calling the
+xCreateTokenizer() method of the fts5_api object. If there is already a
+tokenizer with the same name, it is replaced. If a non-NULL xDestroy parameter
+is passed to xCreateTokenizer(), it is invoked with a copy of the pContext
+pointer passed as the only argument when the database handle is closed or when
+the tokenizer is replaced.
+
+</p><p> If successful, xCreateTokenizer() returns SQLITE_OK. Otherwise, it
+returns an SQLite error code. In this case the xDestroy function is <b>not</b>
+invoked.
+
+</p><p> When an FTS5 table uses the custom tokenizer, the FTS5 core calls xCreate()
+once to create a tokenizer, then xTokenize() zero or more times to tokenize
+strings, then xDelete() to free any resources allocated by xCreate(). More
+specifically:
+
+</p><dl>
+<dt><b>xCreate:</b></dt><dd><p style="margin-top:0">
+ This function is used to allocate and initialize a tokenizer instance.
+ A tokenizer instance is required to actually tokenize text.
+</p><p>
+ The first argument passed to this function is a copy of the (void*)
+ pointer provided by the application when the fts5_tokenizer object
+ was registered with FTS5 (the third argument to xCreateTokenizer()).
+ The second and third arguments are an array of nul-terminated strings
+ containing the tokenizer arguments, if any, specified following the
+ tokenizer name as part of the CREATE VIRTUAL TABLE statement used
+ to create the FTS5 table.
+</p><p>
+ The final argument is an output variable. If successful, (*ppOut)
+ should be set to point to the new tokenizer handle and SQLITE_OK
+ returned. If an error occurs, some value other than SQLITE_OK should
+ be returned. In this case, fts5 assumes that the final value of *ppOut
+ is undefined.
+</p><p>
+</p></dd><dt><b> xDelete:</b></dt><dd><p style="margin-top:0">
+ This function is invoked to delete a tokenizer handle previously
+ allocated using xCreate(). Fts5 guarantees that this function will
+ be invoked exactly once for each successful call to xCreate().
+</p><p>
+</p></dd><dt><b> xTokenize:</b></dt><dd><p style="margin-top:0">
+ This function is expected to tokenize the nText byte string indicated
+ by argument pText. pText may or may not be nul-terminated. The first
+ argument passed to this function is a pointer to an Fts5Tokenizer object
+ returned by an earlier call to xCreate().
+</p><p>
+ The second argument indicates the reason that FTS5 is requesting
+ tokenization of the supplied text. This is always one of the following
+ four values:
+</p><p>
+ </p><ul><li> <b>FTS5_TOKENIZE_DOCUMENT</b> - A document is being inserted into
+ or removed from the FTS table. The tokenizer is being invoked to
+ determine the set of tokens to add to (or delete from) the
+ FTS index.
+<p>
+ </p></li><li> <b>FTS5_TOKENIZE_QUERY</b> - A MATCH query is being executed
+ against the FTS index. The tokenizer is being called to tokenize
+ a bareword or quoted string specified as part of the query.
+<p>
+ </p></li><li> <b>(FTS5_TOKENIZE_QUERY | FTS5_TOKENIZE_PREFIX)</b> - Same as
+ FTS5_TOKENIZE_QUERY, except that the bareword or quoted string is
+ followed by a "*" character, indicating that the last token
+ returned by the tokenizer will be treated as a token prefix.
+<p>
+ </p></li><li> <b>FTS5_TOKENIZE_AUX</b> - The tokenizer is being invoked to
+ satisfy an fts5_api.xTokenize() request made by an auxiliary
+ function. Or an fts5_api.xColumnSize() request made by the same
+ on a columnsize=0 database.
+ </li></ul>
+<p>
+ For each token in the input string, the supplied callback xToken() must
+ be invoked. The first argument to it should be a copy of the pointer
+ passed as the second argument to xTokenize(). The third and fourth
+ arguments are a pointer to a buffer containing the token text, and the
+ size of the token in bytes. The 4th and 5th arguments are the byte offsets
+ of the first byte of and first byte immediately following the text from
+ which the token is derived within the input.
+</p><p>
+ The second argument passed to the xToken() callback ("tflags") should
+ normally be set to 0. The exception is if the tokenizer supports
+ synonyms. In this case see the discussion below for details.
+</p><p>
+ FTS5 assumes the xToken() callback is invoked for each token in the
+ order that they occur within the input text.
+</p><p>
+ If an xToken() callback returns any value other than SQLITE_OK, then
+ the tokenization should be abandoned and the xTokenize() method should
+ immediately return a copy of the xToken() return value. Or, if the
+ input buffer is exhausted, xTokenize() should return SQLITE_OK. Finally,
+ if an error occurs with the xTokenize() implementation itself, it
+ may abandon the tokenization and return any error code other than
+ SQLITE_OK or SQLITE_DONE.
+</p><p>
+</p></dd></dl><h3 id="synonym_support"><span>7.1.1. </span>Synonym Support</h3>
+<p>
+ Custom tokenizers may also support synonyms. Consider a case in which a
+ user wishes to query for a phrase such as "first place". Using the
+ built-in tokenizers, the FTS5 query 'first + place' will match instances
+ of "first place" within the document set, but not alternative forms
+ such as "1st place". In some applications, it would be better to match
+ all instances of "first place" or "1st place" regardless of which form
+ the user specified in the MATCH query text.
+</p><p>
+ There are several ways to approach this in FTS5:
+</p><p>
+ </p><ol><li> By mapping all synonyms to a single token. In this case, using
+ the above example, this means that the tokenizer returns the
+ same token for inputs "first" and "1st". Say that token is in
+ fact "first", so that when the user inserts the document "I won
+ 1st place" entries are added to the index for tokens "i", "won",
+ "first" and "place". If the user then queries for '1st + place',
+ the tokenizer substitutes "first" for "1st" and the query works
+ as expected.
+<p>
+ </p></li><li> By querying the index for all synonyms of each query term
+ separately. In this case, when tokenizing query text, the
+ tokenizer may provide multiple synonyms for a single term
+ within the document. FTS5 then queries the index for each
+ synonym individually. For example, faced with the query:
+<p>
+ </p><div class="codeblock"><pre>... MATCH 'first place'
+</pre></div>
+<p>
+ the tokenizer offers both "1st" and "first" as synonyms for the
+ first token in the MATCH query and FTS5 effectively runs a query
+ similar to:
+</p><p>
+ </p><div class="codeblock"><pre>... MATCH '(first OR 1st) place'
+</pre></div>
+<p>
+ except that, for the purposes of auxiliary functions, the query
+ still appears to contain just two phrases - "(first OR 1st)"
+ being treated as a single phrase.
+</p><p>
+ </p></li><li> By adding multiple synonyms for a single term to the FTS index.
+ Using this method, when tokenizing document text, the tokenizer
+ provides multiple synonyms for each token. So that when a
+ document such as "I won first place" is tokenized, entries are
+ added to the FTS index for "i", "won", "first", "1st" and
+ "place".
+<p>
+ This way, even if the tokenizer does not provide synonyms
+ when tokenizing query text (it should not - to do so would be
+ inefficient), it doesn't matter if the user queries for
+ 'first + place' or '1st + place', as there are entries in the
+ FTS index corresponding to both forms of the first token.
+ </p></li></ol>
+<p>
+ Whether it is parsing document or query text, any call to xToken that
+ specifies a <i>tflags</i> argument with the FTS5_TOKEN_COLOCATED bit
+ is considered to supply a synonym for the previous token. For example,
+ when parsing the document "I won first place", a tokenizer that supports
+ synonyms would call xToken() 5 times, as follows:
+</p><p>
+ </p><div class="codeblock"><pre>xToken(pCtx, 0, "i", 1, 0, 1);
+xToken(pCtx, 0, "won", 3, 2, 5);
+xToken(pCtx, 0, "first", 5, 6, 11);
+xToken(pCtx, FTS5_TOKEN_COLOCATED, "1st", 3, 6, 11);
+xToken(pCtx, 0, "place", 5, 12, 17);
+</pre></div>
+<p>
+ It is an error to specify the FTS5_TOKEN_COLOCATED flag the first time
+ xToken() is called. Multiple synonyms may be specified for a single token
+ by making multiple calls to xToken(FTS5_TOKEN_COLOCATED) in sequence.
+ There is no limit to the number of synonyms that may be provided for a
+ single token.
+</p><p>
+ In many cases, method (1) above is the best approach. It does not add
+ extra data to the FTS index or require FTS5 to query for multiple terms,
+ so it is efficient in terms of disk space and query speed. However, it
+ does not support prefix queries very well. If, as suggested above, the
+ token "first" is substituted for "1st" by the tokenizer, then the query:
+</p><p>
+ </p><div class="codeblock"><pre>... MATCH '1s*'
+</pre></div>
+<p>
+ will not match documents that contain the token "1st" (as the tokenizer
+ will probably not map "1s" to any prefix of "first").
+</p><p>
+ For full prefix support, method (3) may be preferred. In this case,
+ because the index contains entries for both "first" and "1st", prefix
+ queries such as 'fi*' or '1s*' will match correctly. However, because
+ extra entries are added to the FTS index, this method uses more space
+ within the database.
+</p><p>
+ Method (2) offers a midpoint between (1) and (3). Using this method,
+ a query such as '1s*' will match documents that contain the literal
+ token "1st", but not "first" (assuming the tokenizer is not able to
+ provide synonyms for prefixes). However, a non-prefix query like '1st'
+ will match against "1st" and "first". This method does not require
+ extra disk space, as no extra entries are added to the FTS index.
+ On the other hand, it may require more CPU cycles to run MATCH queries,
+ as separate queries of the FTS index are required for each synonym.
+</p><p>
+ When using methods (2) or (3), it is important that the tokenizer only
+ provide synonyms when tokenizing document text (method (2)) or query
+ text (method (3)), not both. Doing so will not cause any errors, but is
+ inefficient.
+
+
+
+</p><a name="custom_auxiliary_functions"></a>
+<h2 tags="FTS5 custom auxiliary functions" id="custom_auxiliary_functions"><span>7.2. </span>Custom Auxiliary Functions</h2>
+
+<p> Implementing a custom auxiliary function is similar to implementing a
+<a href="appfunc.html">scalar SQL function</a>. The implementation
+should be a C function of type fts5_extension_function, defined as follows:
+
+</p><div class="codeblock"><pre>typedef struct Fts5ExtensionApi Fts5ExtensionApi;
+typedef struct Fts5Context Fts5Context;
+typedef struct Fts5PhraseIter Fts5PhraseIter;
+
+typedef void (*fts5_extension_function)(
+ const Fts5ExtensionApi *pApi, <i>/* API offered by current FTS version */</i>
+ Fts5Context *pFts, <i>/* First arg to pass to pApi functions */</i>
+ sqlite3_context *pCtx, <i>/* Context for returning result/error */</i>
+ int nVal, <i>/* Number of values in apVal[] array */</i>
+ sqlite3_value **apVal <i>/* Array of trailing arguments */</i>
+);
+</pre></div>
+
+<p> The implementation is registered with the FTS5 module by calling the
+xCreateFunction() method of the fts5_api object. If there is already an
+auxiliary function with the same name, it is replaced by the new function.
+If a non-NULL xDestroy parameter is passed to xCreateFunction(), it is invoked
+with a copy of the pContext pointer passed as the only argument when the
+database handle is closed or when the registered auxiliary function is
+replaced.
+
+</p><p> If successful, xCreateFunction() returns SQLITE_OK. Otherwise, it
+returns an SQLite error code. In this case the xDestroy function is <b>not</b>
+invoked.
+
+</p><p> The final three arguments passed to the auxiliary function callback are
+similar to the three arguments passed to the implementation of a scalar SQL
+function. All arguments except the first passed to the auxiliary function are
+available to the implementation in the apVal&#91;&#93; array. The
+implementation should return a result or error via the content handle pCtx.
+
+</p><p> The first argument passed to an auxiliary function callback is a pointer
+to a structure containing methods that may be invoked in order to obtain
+information regarding the current query or row. The second argument is an
+opaque handle that should be passed as the first argument to any such method
+invocation. For example, the following auxiliary function definition returns
+the total number of tokens in all columns of the current row:
+
+</p><div class="codeblock"><pre><i>/*
+** Implementation of an auxiliary function that returns the number
+** of tokens in the current row (including all columns).
+*/</i>
+static void column_size_imp(
+ const Fts5ExtensionApi *pApi,
+ Fts5Context *pFts,
+ sqlite3_context *pCtx,
+ int nVal,
+ sqlite3_value **apVal
+){
+ int rc;
+ int nToken;
+ rc = pApi->xColumnSize(pFts, -1, &nToken);
+ if( rc==SQLITE_OK ){
+ sqlite3_result_int(pCtx, nToken);
+ }else{
+ sqlite3_result_error_code(pCtx, rc);
+ }
+}
+</pre></div>
+
+<p>The following section describes the API offered to auxiliary function
+implementations in detail. Further examples may be found in the "fts5_aux.c"
+file of the source code.
+
+</p><a name="_custom_auxiliary_functions_api_reference_"></a>
+<h3 tags="custom auxiliary functions" id="_custom_auxiliary_functions_api_reference_"><span>7.2.1. </span>
+ Custom Auxiliary Functions API Reference
+</h3>
+
+<div class="codeblock"><pre>struct Fts5ExtensionApi {
+ int iVersion; <i>/* Currently always set to 3 */</i>
+
+ void *(*<a href="#xUserData">xUserData</a>)(Fts5Context*);
+
+ int (*<a href="#xColumnCount">xColumnCount</a>)(Fts5Context*);
+ int (*<a href="#xRowCount">xRowCount</a>)(Fts5Context*, sqlite3_int64 *pnRow);
+ int (*<a href="#xColumnTotalSize">xColumnTotalSize</a>)(Fts5Context*, int iCol, sqlite3_int64 *pnToken);
+
+ int (*<a href="#xTokenize">xTokenize</a>)(Fts5Context*,
+ const char *pText, int nText, <i>/* Text to tokenize */</i>
+ void *pCtx, <i>/* Context passed to xToken() */</i>
+ int (*xToken)(void*, int, const char*, int, int, int) <i>/* Callback */</i>
+ );
+
+ int (*<a href="#xPhraseCount">xPhraseCount</a>)(Fts5Context*);
+ int (*<a href="#xPhraseSize">xPhraseSize</a>)(Fts5Context*, int iPhrase);
+
+ int (*<a href="#xInstCount">xInstCount</a>)(Fts5Context*, int *pnInst);
+ int (*<a href="#xInst">xInst</a>)(Fts5Context*, int iIdx, int *piPhrase, int *piCol, int *piOff);
+
+ sqlite3_int64 (*<a href="#xRowid">xRowid</a>)(Fts5Context*);
+ int (*<a href="#xColumnText">xColumnText</a>)(Fts5Context*, int iCol, const char **pz, int *pn);
+ int (*<a href="#xColumnSize">xColumnSize</a>)(Fts5Context*, int iCol, int *pnToken);
+
+ int (*<a href="#xQueryPhrase">xQueryPhrase</a>)(Fts5Context*, int iPhrase, void *pUserData,
+ int(*)(const Fts5ExtensionApi*,Fts5Context*,void*)
+ );
+ int (*<a href="#xSetAuxdata">xSetAuxdata</a>)(Fts5Context*, void *pAux, void(*xDelete)(void*));
+ void *(*<a href="#xGetAuxdata">xGetAuxdata</a>)(Fts5Context*, int bClear);
+
+ int (*<a href="#xPhraseFirst">xPhraseFirst</a>)(Fts5Context*, int iPhrase, Fts5PhraseIter*, int*, int*);
+ void (*<a href="#xPhraseNext">xPhraseNext</a>)(Fts5Context*, Fts5PhraseIter*, int *piCol, int *piOff);
+
+ int (*<a href="#xPhraseFirst">xPhraseFirst</a>Column)(Fts5Context*, int iPhrase, Fts5PhraseIter*, int*);
+ void (*<a href="#xPhraseNext">xPhraseNext</a>Column)(Fts5Context*, Fts5PhraseIter*, int *piCol);
+};
+</pre></div>
+
+<dl>
+<dt style="white-space:pre;font-family:monospace;font-size:120%" id="xUserData">
+<b>void *(*xUserData)(Fts5Context*)</b></dt><dd>
+<p style="margin-top:0.1em">
+Return a copy of the context pointer the extension function was
+ registered with.
+</p>
+</dd>
+<dt style="white-space:pre;font-family:monospace;font-size:120%" id="xColumnTotalSize">
+<b>int (*xColumnTotalSize)(Fts5Context*, int iCol, sqlite3_int64 *pnToken)</b></dt><dd>
+<p style="margin-top:0.1em">
+If parameter iCol is less than zero, set output variable *pnToken
+ to the total number of tokens in the FTS5 table. Or, if iCol is
+ non-negative but less than the number of columns in the table, return
+ the total number of tokens in column iCol, considering all rows in
+ the FTS5 table.
+</p>
+
+<p>
+ If parameter iCol is greater than or equal to the number of columns
+ in the table, SQLITE_RANGE is returned. Or, if an error occurs (e.g.
+ an OOM condition or IO error), an appropriate SQLite error code is
+ returned.
+</p>
+</dd>
+<dt style="white-space:pre;font-family:monospace;font-size:120%" id="xColumnCount">
+<b>int (*xColumnCount)(Fts5Context*)</b></dt><dd>
+<p style="margin-top:0.1em">
+Return the number of columns in the table.
+</p>
+</dd>
+<dt style="white-space:pre;font-family:monospace;font-size:120%" id="xColumnSize">
+<b>int (*xColumnSize)(Fts5Context*, int iCol, int *pnToken)</b></dt><dd>
+<p style="margin-top:0.1em">
+If parameter iCol is less than zero, set output variable *pnToken
+ to the total number of tokens in the current row. Or, if iCol is
+ non-negative but less than the number of columns in the table, set
+ *pnToken to the number of tokens in column iCol of the current row.
+</p>
+
+<p>
+ If parameter iCol is greater than or equal to the number of columns
+ in the table, SQLITE_RANGE is returned. Or, if an error occurs (e.g.
+ an OOM condition or IO error), an appropriate SQLite error code is
+ returned.
+</p>
+
+<p>
+ This function may be quite inefficient if used with an FTS5 table
+ created with the "columnsize=0" option.
+</p>
+</dd>
+<dt style="white-space:pre;font-family:monospace;font-size:120%" id="xColumnText">
+<b>int (*xColumnText)(Fts5Context*, int iCol, const char **pz, int *pn)</b></dt><dd>
+<p style="margin-top:0.1em">
+This function attempts to retrieve the text of column iCol of the
+ current document. If successful, (*pz) is set to point to a buffer
+ containing the text in utf-8 encoding, (*pn) is set to the size in bytes
+ (not characters) of the buffer and SQLITE_OK is returned. Otherwise,
+ if an error occurs, an SQLite error code is returned and the final values
+ of (*pz) and (*pn) are undefined.
+</p>
+</dd>
+<dt style="white-space:pre;font-family:monospace;font-size:120%" id="xPhraseCount">
+<b>int (*xPhraseCount)(Fts5Context*)</b></dt><dd>
+<p style="margin-top:0.1em">
+Returns the number of phrases in the current query expression.
+</p>
+</dd>
+<dt style="white-space:pre;font-family:monospace;font-size:120%" id="xPhraseSize">
+<b>int (*xPhraseSize)(Fts5Context*, int iPhrase)</b></dt><dd>
+<p style="margin-top:0.1em">
+Returns the number of tokens in phrase iPhrase of the query. Phrases
+ are numbered starting from zero.
+</p>
+</dd>
+<dt style="white-space:pre;font-family:monospace;font-size:120%" id="xInstCount">
+<b>int (*xInstCount)(Fts5Context*, int *pnInst)</b></dt><dd>
+<p style="margin-top:0.1em">
+Set *pnInst to the total number of occurrences of all phrases within
+ the query within the current row. Return SQLITE_OK if successful, or
+ an error code (i.e. SQLITE_NOMEM) if an error occurs.
+</p>
+
+<p>
+ This API can be quite slow if used with an FTS5 table created with the
+ "detail=none" or "detail=column" option. If the FTS5 table is created
+ with either "detail=none" or "detail=column" and "content=" option
+ (i.e. if it is a contentless table), then this API always returns 0.
+</p>
+</dd>
+<dt style="white-space:pre;font-family:monospace;font-size:120%" id="xInst">
+<b>int (*xInst)(Fts5Context*, int iIdx, int *piPhrase, int *piCol, int *piOff)</b></dt><dd>
+<p style="margin-top:0.1em">
+Query for the details of phrase match iIdx within the current row.
+ Phrase matches are numbered starting from zero, so the iIdx argument
+ should be greater than or equal to zero and smaller than the value
+ output by xInstCount().
+</p>
+
+<p>
+ Usually, output parameter *piPhrase is set to the phrase number, *piCol
+ to the column in which it occurs and *piOff the token offset of the
+ first token of the phrase. Returns SQLITE_OK if successful, or an error
+ code (i.e. SQLITE_NOMEM) if an error occurs.
+</p>
+
+<p>
+ This API can be quite slow if used with an FTS5 table created with the
+ "detail=none" or "detail=column" option.
+</p>
+</dd>
+<dt style="white-space:pre;font-family:monospace;font-size:120%" id="xRowid">
+<b>sqlite3_int64 (*xRowid)(Fts5Context*)</b></dt><dd>
+<p style="margin-top:0.1em">
+Returns the rowid of the current row.
+</p>
+</dd>
+<dt style="white-space:pre;font-family:monospace;font-size:120%" id="xTokenize">
+<b>int (*xTokenize)(Fts5Context*,
+ const char *pText, int nText,
+ void *pCtx,
+ int (*xToken)(void*, int, const char*, int, int, int)
+)</b></dt><dd>
+<p style="margin-top:0.1em">
+Tokenize text using the tokenizer belonging to the FTS5 table.
+</p>
+</dd>
+<dt style="white-space:pre;font-family:monospace;font-size:120%" id="xQueryPhrase">
+<b>int (*xQueryPhrase)(Fts5Context*, int iPhrase, void *pUserData,
+ int(*)(const Fts5ExtensionApi*,Fts5Context*,void*)
+)</b></dt><dd>
+<p style="margin-top:0.1em">
+This API function is used to query the FTS table for phrase iPhrase
+ of the current query. Specifically, a query equivalent to:
+</p>
+
+<div class="codeblock"><pre>... FROM ftstable WHERE ftstable MATCH $p ORDER BY rowid
+</pre></div>
+
+<p>
+ with $p set to a phrase equivalent to the phrase iPhrase of the
+ current query is executed. Any column filter that applies to
+ phrase iPhrase of the current query is included in $p. For each
+ row visited, the callback function passed as the fourth argument
+ is invoked. The context and API objects passed to the callback
+ function may be used to access the properties of each matched row.
+ Invoking Api.xUserData() returns a copy of the pointer passed as
+ the third argument to pUserData.
+</p>
+
+<p>
+ If the callback function returns any value other than SQLITE_OK, the
+ query is abandoned and the xQueryPhrase function returns immediately.
+ If the returned value is SQLITE_DONE, xQueryPhrase returns SQLITE_OK.
+ Otherwise, the error code is propagated upwards.
+</p>
+
+<p>
+ If the query runs to completion without incident, SQLITE_OK is returned.
+ Or, if some error occurs before the query completes or is aborted by
+ the callback, an SQLite error code is returned.
+</p>
+</dd>
+<dt style="white-space:pre;font-family:monospace;font-size:120%" id="xSetAuxdata">
+<b>int (*xSetAuxdata)(Fts5Context*, void *pAux, void(*xDelete)(void*))</b></dt><dd>
+<p style="margin-top:0.1em">
+Save the pointer passed as the second argument as the extension function's
+ "auxiliary data". The pointer may then be retrieved by the current or any
+ future invocation of the same fts5 extension function made as part of
+ the same MATCH query using the xGetAuxdata() API.
+</p>
+
+<p>
+ Each extension function is allocated a single auxiliary data slot for
+ each FTS query (MATCH expression). If the extension function is invoked
+ more than once for a single FTS query, then all invocations share a
+ single auxiliary data context.
+</p>
+
+<p>
+ If there is already an auxiliary data pointer when this function is
+ invoked, then it is replaced by the new pointer. If an xDelete callback
+ was specified along with the original pointer, it is invoked at this
+ point.
+</p>
+
+<p>
+ The xDelete callback, if one is specified, is also invoked on the
+ auxiliary data pointer after the FTS5 query has finished.
+</p>
+
+<p>
+ If an error (e.g. an OOM condition) occurs within this function,
+ the auxiliary data is set to NULL and an error code returned. If the
+ xDelete parameter was not NULL, it is invoked on the auxiliary data
+ pointer before returning.
+</p>
+</dd>
+<dt style="white-space:pre;font-family:monospace;font-size:120%" id="xGetAuxdata">
+<b>void *(*xGetAuxdata)(Fts5Context*, int bClear)</b></dt><dd>
+<p style="margin-top:0.1em">
+Returns the current auxiliary data pointer for the fts5 extension
+ function. See the xSetAuxdata() method for details.
+</p>
+
+<p>
+ If the bClear argument is non-zero, then the auxiliary data is cleared
+ (set to NULL) before this function returns. In this case the xDelete,
+ if any, is not invoked.
+</p>
+</dd>
+<dt style="white-space:pre;font-family:monospace;font-size:120%" id="xRowCount">
+<b>int (*xRowCount)(Fts5Context*, sqlite3_int64 *pnRow)</b></dt><dd>
+<p style="margin-top:0.1em">
+This function is used to retrieve the total number of rows in the table.
+ In other words, the same value that would be returned by:
+</p>
+
+<div class="codeblock"><pre>SELECT count(*) FROM ftstable;
+</pre></div>
+</dd>
+<dt style="white-space:pre;font-family:monospace;font-size:120%" id="xPhraseFirst">
+<b>int (*xPhraseFirst)(Fts5Context*, int iPhrase, Fts5PhraseIter*, int*, int*)</b></dt><dd>
+<p style="margin-top:0.1em">
+This function is used, along with type Fts5PhraseIter and the xPhraseNext
+ method, to iterate through all instances of a single query phrase within
+ the current row. This is the same information as is accessible via the
+ xInstCount/xInst APIs. While the xInstCount/xInst APIs are more convenient
+ to use, this API may be faster under some circumstances. To iterate
+ through instances of phrase iPhrase, use the following code:
+</p>
+
+<div class="codeblock"><pre>Fts5PhraseIter iter;
+int iCol, iOff;
+for(pApi->xPhraseFirst(pFts, iPhrase, &iter, &iCol, &iOff);
+ iCol>=0;
+ pApi->xPhraseNext(pFts, &iter, &iCol, &iOff)
+){
+ // An instance of phrase iPhrase at offset iOff of column iCol
+}
+</pre></div>
+
+<p>
+ The Fts5PhraseIter structure is defined above. Applications should not
+ modify this structure directly - it should only be used as shown above
+ with the xPhraseFirst() and xPhraseNext() API methods (and by
+ xPhraseFirstColumn() and xPhraseNextColumn() as illustrated below).
+</p>
+
+<p>
+ This API can be quite slow if used with an FTS5 table created with the
+ "detail=none" or "detail=column" option. If the FTS5 table is created
+ with either "detail=none" or "detail=column" and "content=" option
+ (i.e. if it is a contentless table), then this API always iterates
+ through an empty set (all calls to xPhraseFirst() set iCol to -1).
+</p>
+</dd>
+<dt style="white-space:pre;font-family:monospace;font-size:120%" id="xPhraseNext">
+<b>void (*xPhraseNext)(Fts5Context*, Fts5PhraseIter*, int *piCol, int *piOff)</b></dt><dd>
+<p style="margin-top:0.1em">
+See xPhraseFirst above.
+</p>
+</dd>
+<dt style="white-space:pre;font-family:monospace;font-size:120%" id="xPhraseFirstColumn">
+<b>int (*xPhraseFirstColumn)(Fts5Context*, int iPhrase, Fts5PhraseIter*, int*)</b></dt><dd>
+<p style="margin-top:0.1em">
+This function and xPhraseNextColumn() are similar to the xPhraseFirst()
+ and xPhraseNext() APIs described above. The difference is that instead
+ of iterating through all instances of a phrase in the current row, these
+ APIs are used to iterate through the set of columns in the current row
+ that contain one or more instances of a specified phrase. For example:
+</p>
+
+<div class="codeblock"><pre>Fts5PhraseIter iter;
+int iCol;
+for(pApi->xPhraseFirstColumn(pFts, iPhrase, &iter, &iCol);
+ iCol>=0;
+ pApi->xPhraseNextColumn(pFts, &iter, &iCol)
+){
+ // Column iCol contains at least one instance of phrase iPhrase
+}
+</pre></div>
+
+<p>
+ This API can be quite slow if used with an FTS5 table created with the
+ "detail=none" option. If the FTS5 table is created with either
+ "detail=none" "content=" option (i.e. if it is a contentless table),
+ then this API always iterates through an empty set (all calls to
+ xPhraseFirstColumn() set iCol to -1).
+</p>
+
+<p>
+ The information accessed using this API and its companion
+ xPhraseFirstColumn() may also be obtained using xPhraseFirst/xPhraseNext
+ (or xInst/xInstCount). The chief advantage of this API is that it is
+ significantly more efficient than those alternatives when used with
+ "detail=column" tables.
+</p>
+</dd>
+<dt style="white-space:pre;font-family:monospace;font-size:120%" id="xPhraseNextColumn">
+<b>void (*xPhraseNextColumn)(Fts5Context*, Fts5PhraseIter*, int *piCol)</b></dt><dd>
+<p style="margin-top:0.1em">
+See xPhraseFirstColumn above.
+</p>
+</dd>
+</dl>
+
+
+<a name="the_fts5vocab_virtual_table_module"></a>
+<h1 tags="fts5vocab" id="the_fts5vocab_virtual_table_module"><span>8. </span>The fts5vocab Virtual Table Module</h1>
+
+<p> The fts5vocab virtual table module allows users to extract information from
+an FTS5 full-text index directly. The fts5vocab module is a part of FTS5 - it
+is available whenever FTS5 is.
+
+</p><p> Each fts5vocab table is associated with a single FTS5 table. An fts5vocab
+table is usually created by specifying two arguments in place of column names
+in the CREATE VIRTUAL TABLE statement - the name of the associated FTS5 table
+and the type of fts5vocab table. Currently there are three types of fts5vocab
+table; "row", "col" and "instance". Unless the fts5vocab table is created
+within the "temp" database, it must be part of the same database as the
+associated FTS5 table.
+
+</p><div class="codeblock"><pre><i>-- Create an fts5vocab "row" table to query the full-text index belonging
+-- to FTS5 table "ft1".</i>
+CREATE VIRTUAL TABLE ft1_v USING fts5vocab('ft1', 'row');
+
+<i>-- Create an fts5vocab "col" table to query the full-text index belonging
+-- to FTS5 table "ft2".</i>
+CREATE VIRTUAL TABLE ft2_v USING fts5vocab(ft2, col);
+
+<i>-- Create an fts5vocab "instance" table to query the full-text index
+-- belonging to FTS5 table "ft3".</i>
+CREATE VIRTUAL TABLE ft3_v USING fts5vocab(ft3, instance);
+</pre></div>
+
+<p> If an fts5vocab table is created in the temp database, it may be associated
+with an FTS5 table in any attached database. In order to attach the fts5vocab
+table to an FTS5 table located in a database other than "temp", the name of the
+database is inserted before the FTS5 table name in the CREATE VIRTUAL TABLE
+arguments. For example:
+
+</p><div class="codeblock"><pre><i>-- Create an fts5vocab "row" table to query the full-text index belonging
+-- to FTS5 table "ft1" in database "main".</i>
+CREATE VIRTUAL TABLE temp.ft1_v USING fts5vocab(main, 'ft1', 'row');
+
+<i>-- Create an fts5vocab "col" table to query the full-text index belonging
+-- to FTS5 table "ft2" in attached database "aux".</i>
+CREATE VIRTUAL TABLE temp.ft2_v USING fts5vocab('aux', ft2, col);
+
+<i>-- Create an fts5vocab "instance" table to query the full-text index
+-- belonging to FTS5 table "ft3" in attached database "other".</i>
+CREATE VIRTUAL TABLE temp.ft2_v USING fts5vocab('aux', ft3, 'instance');
+</pre></div>
+
+<p> Specifying three arguments when creating an fts5vocab table in any database
+other than "temp" results in an error.
+
+</p><p> An fts5vocab table of type "row" contains one row for each distinct term
+in the associated FTS5 table. The table columns are as follows:
+
+</p><table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
+ <tr style="text-align:left"><th>Column</th><th>Contents
+ </th></tr><tr style="text-align:left;background-color:#DDDDDD"><td>term</td><td> The term, as stored in the FTS5 index.
+ </td></tr><tr style="text-align:left"><td>doc</td><td> The number of rows that contain at least one instance of the term.
+ </td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>cnt</td><td> The total number of instances of the term in the entire FTS5 table.
+</td></tr></table>
+
+<p> An fts5vocab table of type "col" contains one row for each distinct term/column
+combination in the associated FTS5 table. Table columns are as follows:
+
+</p><table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
+ <tr style="text-align:left"><th>Column</th><th>Contents
+ </th></tr><tr style="text-align:left;background-color:#DDDDDD"><td>term</td><td> The term, as stored in the FTS5 index.
+ </td></tr><tr style="text-align:left"><td>col</td><td> The name of the FTS5 table column that contains the term.
+ </td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>doc</td><td> The number of rows in the FTS5 table for which column $col
+ contains at least one instance of the term.
+ </td></tr><tr style="text-align:left"><td>cnt</td><td> The total number of instances of the term that appear in
+ column $col of the FTS5 table (considering all rows).
+</td></tr></table>
+
+<p> An fts5vocab table of type "instance" contains one row for each term
+instance stored in the associated FTS index. Assuming the FTS5 table is
+created with the 'detail' option set to 'full', table columns are as follows:
+
+</p><table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
+ <tr style="text-align:left"><th>Column</th><th>Contents
+ </th></tr><tr style="text-align:left;background-color:#DDDDDD"><td>term</td><td> The term, as stored in the FTS5 index.
+ </td></tr><tr style="text-align:left"><td>doc</td><td> The rowid of the document that contains the term instance.
+ </td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>col</td><td> The name of the column that contains the term instance.
+ </td></tr><tr style="text-align:left"><td>offset</td><td> The index of the term instance within its column. Terms
+ are numbered in order of occurrence starting from 0.
+</td></tr></table>
+
+<p> If the FTS5 table is created with the 'detail' option set to 'col', then
+the <i>offset</i> column of an instance virtual table always contains NULL.
+In this case there is one row in the table for each unique term/doc/col
+combination. Or, if the FTS5 table is created with 'detail' set to 'none',
+then both <i>offset</i> and <i>col</i> always contain NULL values. For
+detail=none FTS5 tables, there is one row in the fts5vocab table for each
+unique term/doc combination.
+
+</p><p>Example:
+
+</p><div class="codeblock"><pre><i>-- Assuming a database created using:</i>
+CREATE VIRTUAL TABLE ft1 USING fts5(c1, c2);
+INSERT INTO ft1 VALUES('apple banana cherry', 'banana banana cherry');
+INSERT INTO ft1 VALUES('cherry cherry cherry', 'date date date');
+
+<i>-- Then querying the following fts5vocab table (type "col") returns:
+--
+-- apple | c1 | 1 | 1
+-- banana | c1 | 1 | 1
+-- banana | c2 | 1 | 2
+-- cherry | c1 | 2 | 4
+-- cherry | c2 | 1 | 1
+-- date | c3 | 1 | 3
+--</i>
+CREATE VIRTUAL TABLE ft1_v_col USING fts5vocab(ft1, col);
+
+<i>-- Querying an fts5vocab table of type "row" returns:
+--
+-- apple | 1 | 1
+-- banana | 1 | 3
+-- cherry | 2 | 5
+-- date | 1 | 3
+--</i>
+CREATE VIRTUAL TABLE ft1_v_row USING fts5vocab(ft1, row);
+
+<i>-- And, for type "instance"
+INSERT INTO ft1 VALUES('apple banana cherry', 'banana banana cherry');
+INSERT INTO ft1 VALUES('cherry cherry cherry', 'date date date');
+--
+-- apple | 1 | c1 | 0
+-- banana | 1 | c1 | 1
+-- banana | 1 | c2 | 0
+-- banana | 1 | c2 | 1
+-- cherry | 1 | c1 | 2
+-- cherry | 1 | c2 | 2
+-- cherry | 2 | c1 | 0
+-- cherry | 2 | c1 | 1
+-- cherry | 2 | c1 | 2
+-- date | 2 | c2 | 0
+-- date | 2 | c2 | 1
+-- date | 2 | c2 | 2
+--</i>
+CREATE VIRTUAL TABLE ft1_v_instance USING fts5vocab(ft1, instance);
+</pre></div>
+
+<a name="fts5shadowtables"></a>
+
+<h1 id="fts5_data_structures"><span>9. </span>FTS5 Data Structures</h1>
+
+<p>This section describes at a high-level the way the FTS module stores its
+index and content in the database. It is not necessary to read or understand
+the material in this section in order to use FTS in an application. However, it
+may be useful to application developers attempting to analyze and understand
+FTS performance characteristics, or to developers contemplating enhancements to
+the existing FTS feature set.
+
+</p><p>
+When an FTS5 virtual table is created in a database, between 3 and 5 real
+tables are created in the database. These are known as "<a href="vtab.html#xshadowname">shadow tables</a>", and
+are used by the virtual table module to store persistent data. They should not
+be accessed directly by the user. Many other virtual table modules, including
+<a href="fts3.html">FTS3</a> and <a href="rtree.html">rtree</a>, also create and use shadow tables.
+
+</p><p>FTS5 creates the following shadow tables. In each case the actual table name
+is based on the name of the FTS5 virtual table (in the following, replace
+% with the name of the virtual table to find the actual shadow table name).
+
+</p><div class="codeblock"><pre><i>-- This table contains most of the full-text index data. </i>
+CREATE TABLE %_data(id INTEGER PRIMARY KEY, block BLOB);
+
+<i>-- This table contains the remainder of the full-text index data. </i>
+<i>-- It is almost always much smaller than the %_data table. </i>
+CREATE TABLE %_idx(segid, term, pgno, PRIMARY KEY(segid, term)) WITHOUT ROWID;
+
+<i>-- Contains the values of persistent configuration parameters.</i>
+CREATE TABLE %_config(k PRIMARY KEY, v) WITHOUT ROWID;
+
+<i>-- Contains the size of each column of each row in the virtual table</i>
+<i>-- in tokens. This shadow table is not present if the "columnsize"</i>
+<i>-- option is set to 0.</i>
+CREATE TABLE %_docsize(id INTEGER PRIMARY KEY, sz BLOB);
+
+<i>-- Contains the actual data inserted into the FTS5 table. There</i>
+<i>-- is one "cN" column for each indexed column in the FTS5 table.</i>
+<i>-- This shadow table is not present for contentless or external </i>
+<i>-- content FTS5 tables. </i>
+CREATE TABLE %_content(id INTEGER PRIMARY KEY, c0, c1...);
+</pre></div>
+
+<p>The following sections describe in more detail how these five tables are
+used to store FTS5 data.
+
+</p><h2 id="varint_format"><span>9.1. </span>Varint Format</h2>
+
+<p>The sections below refer to 64-bit signed integers stored in "varint" form.
+FTS5 uses the same varint format as used in various places by the SQLite core.
+
+</p><p>A varint is between 1 and 9 bytes in length. The varint consists of either
+zero or more bytes which have the high-order bit set followed by a single byte
+with the high-order bit clear, or nine bytes, whichever is shorter. The lower
+seven bits of each of the first eight bytes and all 8 bits of the ninth byte
+are used to reconstruct the 64-bit twos-complement integer. Varints are
+big-endian: bits taken from the earlier byte of the varint are more significant
+than bits taken from the later bytes.
+
+</p><h2 id="the_fts_index_idx_and_data_tables_"><span>9.2. </span>The FTS Index (%_idx and %_data tables)</h2>
+
+<p>The FTS index is an ordered key-value store where the keys are document
+terms or term prefixes and the associated values are "doclists". A doclist is a
+packed array of varints that encodes the position of each instance of the term
+within the FTS5 table. The position of a single term instance is defined as the
+combination of:
+
+</p><ul>
+ <li> The rowid of the FTS5 table row it appears in,
+ </li><li> The index of the column the term instance appears in (columns are
+ numbered from left to right starting from zero), and
+ </li><li> The offset of the term within the column value (i.e. the number of
+ tokens that appear within the column value before this one).
+</li></ul>
+
+<p>The FTS index contains up to (nPrefix+1) entries for each token in the
+data set, where nPrefix is the number of defined <a href="#prefix_indexes">
+prefix indexes</a>.
+
+</p><p>Keys associated with the main FTS index (the one that is not a prefix
+index) are prefixed with the character "0". Keys for the first prefix
+index are prefixed with "1". Keys for the second prefix index are
+prefixed with "2", and so on. For example, if the token "document" is
+inserted into an FTS5 table with <a href="#prefix_indexes">prefix indexes</a>
+specified by prefix="2 4", then the keys added to the FTS index would be
+"0document", "1do" and "2docu".
+
+</p><p>The FTS index entries are not stored in a single tree or hash table
+structure. Instead, they are stored in a series of immutable b-tree like
+structures referred to as "segment b-trees". Each time a write to the FTS5
+table is committed, one or more (but usually just one) new segment b-trees
+are added containing both the new entries and tombstones for any deleted
+entries. When the FTS index is queried, the reader queries each segment
+b-tree in turn and merges the results, giving priority to newer data.
+
+</p><p>Each segment b-tree is assigned a numerical level. When a new segment
+b-tree is written to the database as part of committing a transaction,
+it is assigned to level 0. Segment b-trees belonging to a single level are
+periodically merged together to create a single, larger segment b-tree
+that is assigned to the next level (i.e. level 0 segment b-trees are
+merged to become a single level 1 segment b-tree). Thus the numerically
+larger levels contain older data in (usually) larger segment b-trees.
+Refer to the
+<a href="#the_automerge_configuration_option">'automerge'</a>,
+<a href="#the_crisismerge_configuration_option">'crisismerge'</a> and
+<a href="#the_usermerge_configuration_option">'usermerge'</a> options, along
+with the
+<a href="#the_merge_command">'merge'</a> and
+<a href="#the_optimize_command">'optimize'</a> commands for details on how to
+control the merging.
+
+</p><p>In cases where the doclist associated with a term or term prefix is very
+large, there may be an associated <a href="#doclist_index_format">doclist
+index</a>. A doclist index is similar to the set of internal nodes of a b-tree.
+It allows a large doclist to be efficiently queried for rowids or ranges of
+rowids. For example, when processing a query like:
+
+</p><div class="codeblock"><pre>SELECT ... FROM fts_table('term') WHERE rowid BETWEEN ? AND ?
+</pre></div>
+
+<p>FTS5 uses the segment b-tree index to locate the doclist for term "term",
+then uses its doclist index (assuming it is present) to efficiently identify
+the subset of matches with rowids in the required range.
+
+</p><h3 id="data_structure"><span>9.2.1. </span>The %_data Table Rowid Space</h3>
+
+<div class="codeblock"><pre>CREATE TABLE %_data(
+ id INTEGER PRIMARY KEY,
+ block BLOB
+);
+</pre></div>
+
+<p>The %_data table is used to store three types of records:
+
+</p><ul>
+ <li> The special <a href="#structure_record_format">structure record</a>,
+ stored with id=1.
+ </li><li> The special <a href="#averages_record_format">averages record</a>,
+ stored with id=10.
+ </li><li> A record to store each <a href="#segment_b_tree_format">segment b-tree</a>
+ leaf and <a href="#doclist_index_format">doclist index</a> leaf and
+ internal node. See below for how id values are calculated for these
+ records.
+</li></ul>
+
+<p>Each segment b-tree in the system is assigned a unique 16-bit segment id.
+Segment ids may only be reused after the original owner segment b-tree is
+completely merged into a higher level segment b-tree. Within a segment b-tree,
+each leaf page is assigned a unique page number - 1 for the first leaf page, 2
+for the second, and so on.
+
+</p><p>Each doclist index leaf page is also assigned a page number. The first
+(leftmost) leaf page in a doclist index is assigned the same page number as
+the segment b-tree leaf page on which its term appears (because doclist indexes
+are only created for terms with very long doclists, at most one term per
+segment b-tree leaf has an associated doclist index). Call this page number P.
+If the doclist is so large that it requires a second leaf, the second leaf is
+assigned page number P+1. The third leaf P+2. Each tier of a doclist index
+b-tree (leaves, parents of leaves, grandparents etc.) is assigned page numbers
+in this fashion, starting with page number P.
+
+</p><p>The "id" value used in the %_data table to store any given segment b-tree
+leaf or doclist index leaf or node is composed as follows:
+
+</p><table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
+<tr style="text-align:left"><th>Rowid&nbsp;Bits </th><th>Contents
+</th></tr><tr style="text-align:left;background-color:#DDDDDD"><td>38..43 </td><td> (16 bit) Segment b-tree id value.
+</td></tr><tr style="text-align:left"><td>37 </td><td> (1 bit) Doclist index flag. Set for doclist index pages, clear
+ for segment b-tree leaves.
+</td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>32..36 </td><td> (5 bits) Height in tree. This is set to 0 for segment b-tree
+ and doclist index leaves, to 1 for the parents of doclist
+ index leaves, 2 for the grandparents, etc.
+</td></tr><tr style="text-align:left"><td>0..31 </td><td> (32 bits) Page number
+</td></tr></table>
+
+<h3 id="structure_record_format"><span>9.2.2. </span>Structure Record Format</h3>
+
+<p>The structure record identifies the set of segment b-trees that make up the
+current FTS index, along with details of any ongoing incremental merge
+operations. It is stored in the %_data table with id=1.
+
+A structure record begins with a single 32-bit unsigned value - the cookie
+value. This value is incremented each time the structure is modified.
+Following the cookie value are three varint values, as follows:
+
+</p><ul>
+ <li> The number of levels in the index (i.e. the maximum level associated
+ with any segment b-tree plus one).
+ </li><li> The total number of segment b-trees in the index.
+ </li><li> The total number of segment b-tree leaves written to level 0 trees
+ since the FTS5 table was created.
+</li></ul>
+
+<p>Then, for each level from 0 to nLevel:
+
+</p><ul>
+ <li> The number of input segments from the previous level being used as
+ inputs for the current incremental merge, or zero if there is no
+ ongoing incremental merge to create a new segment b-tree for this level.
+ </li><li> The total number of segment b-trees on the level.
+ </li><li> Then, for each segment b-tree, from oldest to newest:
+ <ul>
+ <li> The segment id.
+ </li><li> Page number of first leaf (often 1, always >0).
+ </li><li> Page number of last leaf (always >0).
+ </li></ul>
+</li></ul>
+
+<h3 id="averages_record_format"><span>9.2.3. </span>Averages Record Format</h3>
+
+<p>The averages record, which is always stored with id=10 in the %_data table,
+does not store the average of anything. Instead, it contains a vector of
+(nCol+1) packed varint values, where nCol is the number of columns in the FTS5
+table, including unindexed columns. The first varint contains the total
+number of rows in the FTS5 table. The second contains the total number of
+tokens in all values stored in the leftmost FTS5 table column. The third the
+number of tokens in all values for the next leftmost, and so on. The value for
+unindexed columns is always zero.
+
+</p><h3 id="segment_b_tree_format"><span>9.2.4. </span>Segment B-Tree Format</h3>
+
+<h4 id="the_key_doclist_format"><span>9.2.4.1. </span>The Key/Doclist Format</h4>
+
+<p>The key/doclist format is a format used to store a series of keys (document
+terms or term prefixes prefixed by a single character to indentify the specific
+index to which they belong) in sorted order, each with their associated
+doclist. The format consists of alternating keys and doclists packed together.
+
+</p><p>The first key is stored as:
+</p><ul>
+ <li> A varint indicating the number of bytes in the key (N), followed by
+ </li><li> The key data itself (N bytes).
+</li></ul>
+
+<p>Each subsequent key is stored as:
+</p><ul>
+ <li> A varint indicating the size of the prefix that the key has in common
+ with the previous key in bytes,
+ </li><li> A varint indicating the number of bytes in the key following the
+ common prefix (N), followed by
+ </li><li> The key suffix data itself (N bytes).
+</li></ul>
+
+<p>For example, if the first two keys in an FTS5 key/doclist record are
+"0challenger" and "0chandelier", then the first key is stored as varint 11
+followed by the 11 bytes "0challenger", and the second key is stored as varints
+4 and 7, followed by the 7 bytes "ndelier".
+
+</p><center><div style="max-width:956px;"><svg xmlns='http://www.w3.org/2000/svg' class="pikchr" viewBox="0 0 956.102 126.72">
+<path d="M62,77L91,77L91,48L62,48Z" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<path d="M91,77L149,77L149,48L91,48Z" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<path d="M149,77L437,77L437,48L149,48Z" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="293" y="63" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">doclist 0</text>
+<path d="M437,77L466,77L466,48L437,48Z" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<path d="M466,77L494,77L494,48L466,48Z" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<path d="M494,77L538,77L538,48L494,48Z" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<path d="M538,77L826,77L826,48L538,48Z" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="682" y="63" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">doclist 1</text>
+<text x="892" y="63" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">key/doclist 2...</text>
+<path d="M826,48L883,48" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<path d="M826,77L883,77" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="120" y="114" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">key 0 data</text>
+<polygon points="120,77 124,89 116,89" style="fill:rgb(0,0,0)"/>
+<path d="M120,99L120,83" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="77" y="12" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">key 0 size (varint)</text>
+<polygon points="77,48 72,37 81,37" style="fill:rgb(0,0,0)"/>
+<path d="M77,27L77,43" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="358" y="114" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">key 1 prefix size (varint)</text>
+<polygon points="451,77 441,84 439,76" style="fill:rgb(0,0,0)"/>
+<path d="M358,99L446,79" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="480" y="12" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">key 1 suffix size (varint)</text>
+<polygon points="480,48 476,37 484,37" style="fill:rgb(0,0,0)"/>
+<path d="M480,27L480,43" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="585" y="114" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">key 1 prefix data</text>
+<polygon points="516,77 528,77 526,85" style="fill:rgb(0,0,0)"/>
+<path d="M585,99L521,79" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+</svg>
+</div>
+<p><i>Figure 1 - Term/Doclist Format
+</i></p></center>
+
+<p>Each doclist identifies the rows (by their rowid values) that contain at
+least one instance of the term or term prefix and an associated position list,
+or "poslist" enumerating the position of each term instance within the row. In
+this sense a "position" is defined as a column number and term offset within
+the column value.
+
+</p><p>Within a doclist, documents are always stored in order sorted by rowid. The
+first rowid in a doclist is stored as is, as a varint. It is immediately
+followed by its associated position list. Following this, the difference
+between the first rowid and the second, as a varint, followed by the doclist
+associated with the second rowid in the doclist. And so on.
+
+</p><p>There is no way to determine the size of a doclist by parsing it. This must
+be stored externally. See the <a href="#data_pagination">section below</a> for
+details of how this is accomplished in FTS5.
+
+</p><center><div style="max-width:889px;"><svg xmlns='http://www.w3.org/2000/svg' class="pikchr" viewBox="0 0 889.459 79.92">
+<path d="M46,77L89,77L89,48L46,48Z" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<path d="M89,77L377,77L377,48L89,48Z" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="233" y="63" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">position list 0</text>
+<path d="M377,77L420,77L420,48L377,48Z" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<path d="M420,77L708,77L708,48L420,48Z" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="564" y="63" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">position list 1</text>
+<path d="M708,77L751,77L751,48L708,48Z" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="822" y="63" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">position list 2...</text>
+<path d="M751,48L809,48" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<path d="M751,77L809,77" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="67" y="12" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">rowid 0 (varint)</text>
+<polygon points="67,48 63,37 72,37" style="fill:rgb(0,0,0)"/>
+<path d="M67,27L67,43" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="398" y="12" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">rowid 1 (delta-encoded varint)</text>
+<polygon points="398,48 394,37 403,37" style="fill:rgb(0,0,0)"/>
+<path d="M398,27L398,43" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="730" y="12" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">rowid 3 (delta-encoded varint)</text>
+<polygon points="730,48 725,37 734,37" style="fill:rgb(0,0,0)"/>
+<path d="M730,27L730,43" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+</svg>
+</div>
+<p><i>Figure 2 - Doclist Format
+</i></p></center>
+
+<p>A position list - often shortened to "poslist" - identifies the column
+and token offset within the row of each instance of the token in question.
+The format of a poslist is:
+
+</p><ul>
+ <li> Varint set to twice the size of the poslist, not including this field,
+ plus one if the "delete" flag is set on the entry.
+ </li><li> A (possibly empty) list of offsets for column 0 (the leftmost column) of
+ the row. Each offset is stored as a varint. The first varint contains
+ the value of the first offset, plus 2. The second variant contains the
+ difference between the second and first offsets, plus 2. etc. For
+ example, if the offset list is to contain offsets 0, 10, 15 and 16, it
+ is encoded by packing the following values, encoded as varints, end to
+ end:
+ <pre>
+ 2, 12, 7, 3
+</pre>
+ </li><li> For each column other than column 0 that contains one of more instances
+ of the token:
+ <ul>
+ <li> Byte value 0x01.
+ </li><li> The column number, as a varint.
+ </li><li> An offset list, in the same format as the offset list for column 0.
+ </li></ul>
+</li></ul>
+
+<center><div style="max-width:785px;"><svg xmlns='http://www.w3.org/2000/svg' class="pikchr" viewBox="0 0 785.606 133.92">
+<path d="M77,77L121,77L121,48L77,48Z" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<path d="M121,77L409,77L409,48L121,48Z" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="265" y="63" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">col 0 offset-list</text>
+<path d="M409,77L452,77L452,48L409,48Z" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="430" y="63" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">0x01</text>
+<path d="M452,77L495,77L495,48L452,48Z" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<path d="M495,77L783,77L783,48L495,48Z" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="639" y="63" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">col i offset-list</text>
+<text x="99" y="12" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">nSize*2 + bDel (varint)</text>
+<polygon points="99,48 95,37 103,37" style="fill:rgb(0,0,0)"/>
+<path d="M99,27L99,43" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="473" y="12" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">column number (i)</text>
+<polygon points="473,48 469,37 478,37" style="fill:rgb(0,0,0)"/>
+<path d="M473,27L473,43" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<path d="M121,92L121,106L783,106L783,92" style="fill:none;stroke-width:2.16;stroke-linejoin:round;stroke:rgb(0,0,0);" />
+<text x="452" y="121" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">nSize bytes</text>
+</svg>
+</div>
+<p><i>Figure 3 - Position List (poslist) With Offsets in Columns 0 and i
+</i></p></center>
+
+<h4 id="data_pagination"><span>9.2.4.2. </span>Pagination</h4>
+
+<p>If it is small enough (by default this means smaller than 4000 bytes), the
+entire contents of a segment b-tree may be stored in the key/doclist format
+described in the previous section as a single blob within the %_data table.
+Otherwise, the key/doclist is split into pages (by default, of approximately
+4000 bytes each) and stored in a contiguous set of entries in the %_data table
+(<a href="#data_structure">see above</a> for details).
+
+</p><p>When a key/doclist is divided into pages, the following modifications are
+made to the format:
+
+</p><ul>
+ <li> A single varint or key data field never spans two pages.
+ </li><li> The first key on each page is not prefix-compressed. It is stored in
+ the format described above for the first key of a doclist - its size as
+ a varint followed by the key data.
+ </li><li> If there are one or more rowids on a page before the first key, then
+ the first of them is not delta compressed. It is stored as is, just as
+ if it were the first rowid of its doclist (which it may or may not be).
+</li></ul>
+
+<p>Each page also has fixed-size 4-byte header and a variably-sized footer.
+The header is divided into 2 16-bit big-endian integer fields. They
+contain:
+
+</p><ul>
+ <li> The byte offset of the first rowid value on the page, if it occurs
+ before the first key, or 0 otherwise.
+ </li><li> The byte offset of the page footer.
+</li></ul>
+
+<p>The page footer consists of a series of varints containing the byte offset
+of each key that appears on the page. The page footer is zero bytes in size
+if there are no keys on the page.
+
+</p><center><div style="max-width:721px;"><svg xmlns='http://www.w3.org/2000/svg' class="pikchr" viewBox="0 0 721.267 79.92">
+<path d="M10,30L54,30L54,2L10,2Z" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="32" y="16" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">hdr</text>
+<path d="M54,30L630,30L630,2L54,2Z" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="342" y="16" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">modified key/doclist data</text>
+<path d="M630,30L702,30L702,2L630,2Z" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="666" y="16" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">footer</text>
+<text x="32" y="67" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">4 bytes</text>
+<text x="666" y="67" text-anchor="middle" fill="rgb(0,0,0)" dominant-baseline="central">variable size</text>
+<polygon points="32,30 36,42 28,42" style="fill:rgb(0,0,0)"/>
+<path d="M32,52L32,36" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<polygon points="666,30 670,42 661,42" style="fill:rgb(0,0,0)"/>
+<path d="M666,52L666,36" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+</svg>
+</div>
+<p><i>Figure 4 - Page Format
+</i></p></center>
+
+<h4 id="data_term_index"><span>9.2.4.3. </span>Segment Index Format</h4>
+
+<p>The result of formatting the contents of the segment b-tree in the
+key/doclist format and then splitting it into pages is something very
+similar to the leaves of a b+tree. Instead of creating a format for
+the internal nodes of this b+tree and storing them in the %_data table
+alongside the leaves, the keys that would have been stored on such nodes are
+added to the %_idx table, defined as:
+
+</p><div class="codeblock"><pre>CREATE TABLE %_idx(
+ segid INTEGER, <i>-- segment id</i>
+ term TEXT, <i>-- prefix of first key on page</i>
+ pgno INTEGER, <i>-- (2*pgno + bDoclistIndex)</i>
+ PRIMARY KEY(segid, term)
+);
+</pre></div>
+
+<p>For each "leaf" page that contains at least one key, an entry is added
+to the %_idx table. Fields are set as follows:
+
+</p><table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
+ <tr style="text-align:left"><th>Column</th><th>Contents
+ </th></tr><tr style="text-align:left;background-color:#DDDDDD"><td>segid</td><td> The integer segment id.
+ </td></tr><tr style="text-align:left"><td>term</td><td> The smallest prefix of the first key on the page that
+ is larger than all keys on the previous page. For the
+ first page in a segment, this prefix is zero bytes in
+ size.
+ </td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>pgno</td><td> This field encodes both the page number (within the
+ segment - starting from 1) and the doclist index flag.
+ The doclist index flag is set if the final key on the
+ page has an <a href="#data_doclist_index">associated
+ doclist index</a>. The value of this field is:
+ <pre>
+ (pgno*2 + bDoclistIndexFlag)
+</pre>
+</td></tr></table>
+
+<p>Then, to find the leaf for segment i that may contain term t, instead of
+searching through internal nodes, FTS5 runs the query:
+
+</p><div class="codeblock"><pre>SELECT pgno FROM %_idx WHERE segid=$i AND term>=$t ORDER BY term LIMIT 1
+</pre></div>
+
+<h4 data_doclist_index="1" id="doclist_index_format"><span>9.2.4.4. </span>Doclist Index Format</h4>
+
+<p>The segment index described in the <a href="#data_term_index">previous
+section</a> allows a segment b-tree to be efficiently queried by term or,
+assuming there is a prefix index of the required size, a term prefix. The data
+structure described in this section, doclist indexes, allows FTS5 to
+efficiently search for a rowid or range or rowids within the doclist associated
+with a single term or term prefix.
+
+</p><p>Not all keys have associated doclists indexes. By default, a doclist index
+is only added for a key if its doclist spans more than 4 segment b-tree leaf
+pages. Doclist indexes are themselves b-trees, with both leaves and internal
+nodes stored as entries in the %_data table, but in practice most doclists are
+small enough to fit on a single leaf. FTS5 uses the same rough size for doclist
+index node and leaves as it does for segment b-tree leaves (by default 4000
+bytes).
+
+</p><p>Doclist index leaves and internal nodes use the same page format. The first
+byte is a "flags" byte. This is set to 0x00 for the root page of the doclist
+index b-tree, and 0x01 for all other pages. The remainder of the page is a
+series of tightly packed varints, as follows:
+
+</p><ul>
+ <li> page number of leftmost child page, followed by
+ </li><li> the smallest rowid value on the left most child page, followed by
+ </li><li> one varint for each subsequent child page, containing the value:
+ <ul>
+ <li> 0x00 if there are no rowids on the child page (this can only happen
+ when the "child" page is actually a segment b-tree leaf), or
+ </li><li> the difference between the smallest rowid on the child page and
+ the previous rowid value stored on the doclist index page.
+ </li></ul>
+</li></ul>
+
+<p>For the leftmost doclist index leaf in a doclist index, the leftmost child
+page is the first segment b-tree leaf after the one that contains the key
+itself.
+
+</p><h2 id="document_sizes_table_docsize_table_"><span>9.3. </span>Document Sizes Table (%_docsize table)</h2>
+
+<div class="codeblock"><pre>CREATE TABLE %_docsize(
+ id INTEGER PRIMARY KEY, -- id of FTS5 row this record pertains to
+ sz BLOB -- blob containing nCol packed varints
+);
+</pre></div>
+
+<p>Many common search result ranking functions require as an input the size
+in tokens of the result document (as a search term hit in a short document is
+considered more significant than one in a long document). To provide fast
+access to this information, for each row in the FTS5 table there exists a
+corresponding record (with the same rowid) in the %_docsize shadow table
+that contains the size of each column value in the row, in tokens.
+
+</p><p>The column value sizes are stored in a blob containing one packed varint for
+each column of the FTS5 table, from left to right. The varint contains, of
+course, the total number of tokens in the corresponding column value. Unindexed
+columns are included in this vector of varints; for them the value is always
+set to zero.
+
+</p><p>This table is used by the <a href="#xColumnSize">xColumnSize</a> API. It can
+be omitted altogether by specifying the
+<a href="#the_columnsize_option">columnsize=0</a> option. In that case the
+xColumnSize API is still available to auxiliary functions, but runs much more
+slowly.
+
+</p><h2 id="the_table_contents_content_table_"><span>9.4. </span>The Table Contents (%_content table)</h2>
+
+<div class="codeblock"><pre>CREATE TABLE %_content(id INTEGER PRIMARY KEY, c0, c1...);
+</pre></div>
+
+<p>The actual table content - the values inserted into the FTS5 table, is
+stored in the %_content table. This table is created with one "c*" column for
+each column of the FTS5 table, including any unindexed columns. The values for
+the leftmost FTS5 table column are stored in column "c0" of the %_content
+table, the values from the next FTS5 table column in column "c1", and so on.
+
+</p><p>This table is omitted completely for <a href="#content_and_contentless_tables">
+external content or contentless</a> FTS5 tables.
+tables.
+
+</p><h2 id="configuration_options_config_table_"><span>9.5. </span>Configuration Options (%_config table)</h2>
+
+<div class="codeblock"><pre>CREATE TABLE %_config(k PRIMARY KEY, v) WITHOUT ROWID;
+</pre></div>
+
+<p>This table stores the values of any persistent configuration options.
+Column "k" stores the name of the option (text) and column "v" the value.
+Example contents:
+
+</p><div class="codeblock"><pre>sqlite> SELECT * FROM fts_tbl_config;
+┌─────────────┬──────┐
+│ k │ v │
+├─────────────┼──────┤
+│ crisismerge │ 8 │
+│ pgsz │ 8000 │
+│ usermerge │ 4 │
+│ version │ 4 │
+└─────────────┴──────┘
+</pre></div>
+
+<a name="appendix_a"></a>
+<h1 id="appendix_a" nonumber="1" tags="comparison with fts4">
+ Appendix A: Comparison with FTS3/4
+</h1>
+
+<p> Also available is the similar but more mature <a href="fts3.html">FTS3/4</a> module.
+FTS5 is a new version of FTS4 that includes various fixes and solutions for
+problems that could not be fixed in FTS4 without sacrificing backwards
+compatibility. Some of these problems are
+<a href="fts5.html#_summary_of_technical_differences_">described below</a>.
+
+</p><h2 nonumber="1" id="_application_porting_guide_"> Application Porting Guide </h2>
+
+<p> In order to use FTS5 instead of FTS3 or FTS4, applications usually require
+minimal modifications. Most of these fall into three categories - changes
+required to the CREATE VIRTUAL TABLE statement used to create the FTS table,
+changes required to SELECT queries used to execute queries against the table,
+and changes required to applications that use <a href="fts3.html#snippet">FTS auxiliary functions</a>.
+
+</p><h3 nonumber="1" id="_changes_to_create_virtual_table_statements_"> Changes to CREATE VIRTUAL TABLE statements </h3>
+
+<ol>
+<li> <p>The module name must be changed from "fts3" or "fts4" to "fts5".
+
+</p></li><li> <p>All type information or constraint specifications must be removed from
+ column definitions. FTS3/4 ignores everything following the column name in
+ a column definition, FTS5 attempts to parse it (and will report an error
+ if it fails to).
+
+</p></li><li> <p>The "matchinfo=fts3" option is not available. The
+ <a href="fts5.html#the_columnsize_option">"columnsize=0"</a> option is equivalent.
+
+</p></li><li> <p>The notindexed= option is not available. Adding <a href="fts5.html#the_unindexed_column_option">UNINDEXED</a>
+ to the column definition is equivalent.
+
+</p></li><li> <p>The ICU tokenizer is not available.
+
+</p></li><li> <p>The compress=, uncompress= and languageid= options are not available.
+ There is as of yet no equivalent for their functionality.
+</p></li></ol>
+
+<div class="codeblock"><pre><i> -- FTS3/4 statement </i>
+CREATE VIRTUAL TABLE t1 USING fts4(
+ linkid INTEGER,
+ header CHAR(20),
+ text VARCHAR,
+ notindexed=linkid,
+ matchinfo=fts3,
+ tokenizer=unicode61
+);
+
+<i> -- FTS5 equivalent (note - the "tokenizer=unicode61" option is not</i>
+<i> -- required as this is the default for FTS5 anyway)</i>
+CREATE VIRTUAL TABLE t1 USING fts5(
+ linkid UNINDEXED,
+ header,
+ text,
+ columnsize=0
+);
+</pre></div>
+
+<h3 nonumber="1" id="_changes_to_select_statements_"> Changes to SELECT statements </h3>
+
+<ol>
+ <li> <p>The "docid" alias does not exist. Applications must use "rowid"
+ instead.
+
+ </p></li><li> <p>The behaviour of queries when a column-filter is specified both as
+ part of the FTS query and by using a column as the LHS of a MATCH
+ operator is slightly different. For a table with columns "a" and "b"
+ and a query similar to:
+</p><div class="codeblock"><pre>... a MATCH 'b: string'
+</pre></div>
+ <p>FTS3/4 searches for matches in column "b". However, FTS5 always
+ returns zero rows, as results are first filtered for column "b", then
+ for column "a", leaving no results. In other words, in FTS3/4 the
+ inner filter overrides the outer, in FTS5 both filters are applied.
+
+ </p></li><li> <p>The FTS query syntax (right hand side of the MATCH operator) has
+ changed in some ways. The FTS5 syntax is quite close to the FTS4
+ "enhanced syntax". The main difference is that FTS5 is fussier
+ about unrecognized punctuation characters and similar within query
+ strings. Most queries that work with FTS3/4 should also work with
+ FTS5, and those that do not should return parse errors.
+</p></li></ol>
+
+<h3 nonumber="1" id="_auxiliary_function_changes_"> Auxiliary Function Changes </h3>
+
+<p> FTS5 has no matchinfo() or offsets() function, and the snippet() function
+is not as fully-featured as in FTS3/4. However, since FTS5 does provide
+an API allowing applications to create <a href="fts5.html#_custom_auxiliary_functions_api_reference_">custom auxiliary functions</a>, any
+required functionality may be implemented within the application code.
+
+</p><p> The set of built-in auxiliary functions provided by FTS5 may be
+improved upon in the future.
+
+</p><h3 nonumber="1" id="_other_issues"> Other Issues</h3>
+
+<ol>
+ <li><p> The functionality provided by the fts4aux module is now provided
+ by <a href="fts5.html#the_fts5vocab_virtual_table_module">fts5vocab</a>. The schema of these two tables is slightly different.
+
+ </p></li><li><p> The FTS3/4 "merge=X,Y" command has been replaced by the
+ <a href="fts5.html#the_merge_command">FTS5 merge command</a>.
+
+ </p></li><li><p> The FTS3/4 "automerge=X" command has been replaced by the
+ <a href="fts5.html#the_automerge_configuration_option">FTS5 automerge option</a>.
+</p></li></ol>
+
+<a name="_summary_of_technical_differences_"></a>
+<h2 nonumber="1" tags="fts5 technical differences" id="_summary_of_technical_differences_">
+ Summary of Technical Differences
+</h2>
+
+<p>FTS5 is similar to FTS3/4 in that the primary task of each is to maintain
+an index mapping from each unique token to a list of instances of that token
+within a set of documents, where each instance is identified by the document
+in which it appears and its position within that document. For example:
+
+</p><div class="codeblock"><pre><i>-- Given the following SQL:</i>
+CREATE VIRTUAL TABLE ft USING fts5(a, b);
+INSERT INTO ft(rowid, a, b) VALUES(1, 'X Y', 'Y Z');
+INSERT INTO ft(rowid, a, b) VALUES(2, 'A Z', 'Y Y');
+
+<i>-- The FTS5 module creates the following mapping on disk:</i>
+A --&gt; (2, 0, 0)
+X --&gt; (1, 0, 0)
+Y --&gt; (1, 0, 1) (1, 1, 0) (2, 1, 0) (2, 1, 1)
+Z --&gt; (1, 1, 1) (2, 0, 1)
+</pre></div>
+
+<p>In the example above, each triple identifies the location of a token
+instance by rowid, column number (columns are numbered sequentially
+starting at 0 from left to right) and position within the column value (the
+first token in a column value is 0, the second is 1, and so on). Using this
+index, FTS5 is able to provide timely answers to queries such as "the set
+of all documents that contain the token 'A'", or "the set of all documents
+that contain the sequence 'Y Z'". The list of instances associated with a
+single token is called an "instance-list".
+
+</p><p>The principle difference between FTS3/4 and FTS5 is that in FTS3/4,
+each instance-list is stored as a single large database record, whereas
+in FTS5 large instance-lists are divided between multiple database records.
+This has the following implications for dealing with large databases that
+contain large lists:
+
+</p><ul>
+ <li> <p>FTS5 is able to load instance-lists into memory incrementally in
+ order to reduce memory usage and peak allocation size. FTS3/4 very
+ often loads entire instance-lists into memory.
+
+ </p></li><li> <p>When processing queries that feature more than one token, FTS5 is
+ sometimes able to determine that the query can be answered by
+ inspecting a subset of a large instance-list. FTS3/4 almost always
+ has to traverse entire instance-lists.
+
+ </p></li><li> If an instance-list grows so large that it exceeds
+ the <a href="limits.html#max_length">SQLITE_MAX_LENGTH</a> limit, FTS3/4 is unable to handle it. FTS5
+ does not have this problem.
+</li></ul>
+
+<p>For these reasons, many complex queries may use less memory and run faster
+using FTS5.
+
+</p><p>Some other ways in which FTS5 differs from FTS3/4 are:
+
+</p><ul>
+ <li> <p>FTS5 supports "ORDER BY rank" for returning results in order of
+ decreasing relevancy.
+
+ </p></li><li> <p>FTS5 features an API allowing users to create custom auxiliary
+ functions for advanced ranking and text processing applications. The
+ special "rank" column may be mapped to a custom auxiliary function
+ so that adding "ORDER BY rank" to a query works as expected.
+
+ </p></li><li> <p>FTS5 recognizes unicode separator characters and case equivalence by
+ default. This is also possible using FTS3/4, but must be explicitly
+ enabled.
+
+ </p></li><li> <p>The query syntax has been revised where necessary to remove
+ ambiguities and to make it possible to escape special characters
+ in query terms.
+
+ </p></li><li> <p>By default, FTS3/4 occasionally merges together two or more of the
+ b-trees that make up its full-text index within an INSERT, UPDATE or
+ DELETE statement executed by the user. This means that any operation
+ on an FTS3/4 table may turn out to be surprisingly slow, as FTS3/4
+ may unpredictably choose to merge together two or more large b-trees
+ within it. FTS5 uses incremental merging by default, which limits
+ the amount of processing that may take place within any given
+ INSERT, UPDATE or DELETE operation.
+</p></li></ul>
+<p align="center"><small><i>This page last modified on <a href="https://sqlite.org/docsrc/honeypot" id="mtimelink" data-href="https://sqlite.org/docsrc/finfo/pages/fts5.in?m=13ff6fb4a9d162419">2022-09-24 16:31:11</a> UTC </small></i></p>
+