diff options
Diffstat (limited to '')
-rw-r--r-- | www/optoverview.html | 1623 |
1 files changed, 1623 insertions, 0 deletions
diff --git a/www/optoverview.html b/www/optoverview.html new file mode 100644 index 0000000..5fa1146 --- /dev/null +++ b/www/optoverview.html @@ -0,0 +1,1623 @@ +<!DOCTYPE html> +<html><head> +<meta name="viewport" content="width=device-width, initial-scale=1.0"> +<meta http-equiv="content-type" content="text/html; charset=UTF-8"> +<link href="sqlite.css" rel="stylesheet"> +<title>The SQLite Query Optimizer Overview</title> +<!-- path= --> +</head> +<body> +<div class=nosearch> +<a href="index.html"> +<img class="logo" src="images/sqlite370_banner.gif" alt="SQLite" border="0"> +</a> +<div><!-- IE hack to prevent disappearing logo --></div> +<div class="tagline desktoponly"> +Small. Fast. Reliable.<br>Choose any three. +</div> +<div class="menu mainmenu"> +<ul> +<li><a href="index.html">Home</a> +<li class='mobileonly'><a href="javascript:void(0)" onclick='toggle_div("submenu")'>Menu</a> +<li class='wideonly'><a href='about.html'>About</a> +<li class='desktoponly'><a href="docs.html">Documentation</a> +<li class='desktoponly'><a href="download.html">Download</a> +<li class='wideonly'><a href='copyright.html'>License</a> +<li class='desktoponly'><a href="support.html">Support</a> +<li class='desktoponly'><a href="prosupport.html">Purchase</a> +<li class='search' id='search_menubutton'> +<a href="javascript:void(0)" onclick='toggle_search()'>Search</a> +</ul> +</div> +<div class="menu submenu" id="submenu"> +<ul> +<li><a href='about.html'>About</a> +<li><a href='docs.html'>Documentation</a> +<li><a href='download.html'>Download</a> +<li><a href='support.html'>Support</a> +<li><a href='prosupport.html'>Purchase</a> +</ul> +</div> +<div class="searchmenu" id="searchmenu"> +<form method="GET" action="search"> +<select name="s" id="searchtype"> +<option value="d">Search Documentation</option> +<option value="c">Search Changelog</option> +</select> +<input type="text" name="q" id="searchbox" value=""> +<input type="submit" value="Go"> +</form> +</div> +</div> +<script> +function toggle_div(nm) { +var w = document.getElementById(nm); +if( w.style.display=="block" ){ +w.style.display = "none"; +}else{ +w.style.display = "block"; +} +} +function toggle_search() { +var w = document.getElementById("searchmenu"); +if( w.style.display=="block" ){ +w.style.display = "none"; +} else { +w.style.display = "block"; +setTimeout(function(){ +document.getElementById("searchbox").focus() +}, 30); +} +} +function div_off(nm){document.getElementById(nm).style.display="none";} +window.onbeforeunload = function(e){div_off("submenu");} +/* Disable the Search feature if we are not operating from CGI, since */ +/* Search is accomplished using CGI and will not work without it. */ +if( !location.origin || !location.origin.match || !location.origin.match(/http/) ){ +document.getElementById("search_menubutton").style.display = "none"; +} +/* Used by the Hide/Show button beside syntax diagrams, to toggle the */ +function hideorshow(btn,obj){ +var x = document.getElementById(obj); +var b = document.getElementById(btn); +if( x.style.display!='none' ){ +x.style.display = 'none'; +b.innerHTML='show'; +}else{ +x.style.display = ''; +b.innerHTML='hide'; +} +return false; +} +var antiRobot = 0; +function antiRobotGo(){ +if( antiRobot!=3 ) return; +antiRobot = 7; +var j = document.getElementById("mtimelink"); +if(j && j.hasAttribute("data-href")) j.href=j.getAttribute("data-href"); +} +function antiRobotDefense(){ +document.body.onmousedown=function(){ +antiRobot |= 2; +antiRobotGo(); +document.body.onmousedown=null; +} +document.body.onmousemove=function(){ +antiRobot |= 2; +antiRobotGo(); +document.body.onmousemove=null; +} +setTimeout(function(){ +antiRobot |= 1; +antiRobotGo(); +}, 100) +antiRobotGo(); +} +antiRobotDefense(); +</script> +<div class=fancy> +<div class=nosearch> +<div class="fancy_title"> +The SQLite Query Optimizer Overview +</div> +<div class="fancy_toc"> +<a onclick="toggle_toc()"> +<span class="fancy_toc_mark" id="toc_mk">►</span> +Table Of Contents +</a> +<div id="toc_sub"><div class="fancy-toc1"><a href="#introduction">1. Introduction</a></div> +<div class="fancy-toc1"><a href="#where_clause_analysis">2. WHERE Clause Analysis</a></div> +<div class="fancy-toc2"><a href="#index_term_usage_examples">2.1. Index Term Usage Examples</a></div> +<div class="fancy-toc1"><a href="#the_between_optimization">3. The BETWEEN Optimization</a></div> +<div class="fancy-toc1"><a href="#or_optimizations">4. OR Optimizations</a></div> +<div class="fancy-toc1"><a href="#the_like_optimization">5. The LIKE Optimization</a></div> +<div class="fancy-toc1"><a href="#the_skip_scan_optimization">6. The Skip-Scan Optimization</a></div> +<div class="fancy-toc1"><a href="#joins">7. Joins</a></div> +<div class="fancy-toc2"><a href="#order_of_tables_in_a_join">7.1. Order of Tables in a Join</a></div> +<div class="fancy-toc2"><a href="#manual_control_of_query_plans_using_sqlite_stat_tables">7.2. Manual Control Of Query Plans Using SQLITE_STAT Tables</a></div> +<div class="fancy-toc2"><a href="#manual_control_of_query_plans_using_cross_join">7.3. Manual Control of Query Plans using CROSS JOIN</a></div> +<div class="fancy-toc1"><a href="#choosing_between_multiple_indexes">8. Choosing Between Multiple Indexes</a></div> +<div class="fancy-toc2"><a href="#disqualifying_where_clause_terms_using_unary_">8.1. Disqualifying WHERE Clause Terms using Unary-"+"</a></div> +<div class="fancy-toc2"><a href="#range_queries">8.2. Range Queries</a></div> +<div class="fancy-toc1"><a href="#covering_indexes">9. Covering Indexes</a></div> +<div class="fancy-toc1"><a href="#order_by_optimizations">10. ORDER BY Optimizations</a></div> +<div class="fancy-toc2"><a href="#partial_order_by_via_index">10.1. Partial ORDER BY via Index</a></div> +<div class="fancy-toc1"><a href="#subquery_flattening">11. Subquery Flattening</a></div> +<div class="fancy-toc1"><a href="#subquery_co_routines">12. Subquery Co-routines</a></div> +<div class="fancy-toc2"><a href="#using_co_routines_to_defer_work_until_after_the_sorting">12.1. Using Co-routines to Defer Work until after the Sorting</a></div> +<div class="fancy-toc1"><a href="#the_min_max_optimization">13. The MIN/MAX Optimization</a></div> +<div class="fancy-toc1"><a href="#automatic_indexes">14. Automatic Indexes</a></div> +<div class="fancy-toc2"><a href="#hash_joins">14.1. Hash Joins</a></div> +<div class="fancy-toc1"><a href="#the_push_down_optimization">15. The Push-Down Optimization</a></div> +<div class="fancy-toc1"><a href="#the_left_join_strength_reduction_optimization">16. The LEFT JOIN Strength Reduction Optimization</a></div> +<div class="fancy-toc1"><a href="#the_omit_left_join_optimization">17. The Omit LEFT JOIN Optimization</a></div> +<div class="fancy-toc1"><a href="#the_constant_propagation_optimization">18. The Constant Propagation Optimization</a></div> +</div> +</div> +<script> +function toggle_toc(){ +var sub = document.getElementById("toc_sub") +var mk = document.getElementById("toc_mk") +if( sub.style.display!="block" ){ +sub.style.display = "block"; +mk.innerHTML = "▼"; +} else { +sub.style.display = "none"; +mk.innerHTML = "►"; +} +} +</script> +</div> + + + + + +<h1 id="introduction"><span>1. </span>Introduction</h1> +<p> + This document provides an overview of how the query planner and optimizer + for SQLite works. + + +</p><p> + Given a single SQL statement, there might be dozens, hundreds, or even + thousands of ways to implement that statement, depending on the complexity + of the statement itself and of the underlying database schema. The + task of the query planner is to select the algorithm that minimizes + disk I/O and CPU overhead. + + +</p><p> + Additional background information is available in the + <a href="queryplanner.html">indexing tutorial</a> document. + + +</p><p> + With release 3.8.0 (2013-08-26), + the SQLite query planner was reimplemented as the + <a href="queryplanner-ng.html">Next Generation Query Planner</a> or "NGQP". All of the features, techniques, + and algorithms described in this document are applicable to both the + pre-3.8.0 legacy query planner and to the NGQP. For further information on + how the NGQP differs from the legacy query planner, see the + <a href="queryplanner-ng.html">detailed description of the NGQP</a>. + + + + +<a name="where_clause"></a> + +</p><h1 id="where_clause_analysis"><span>2. </span>WHERE Clause Analysis</h1> + +<p> + The WHERE clause on a query is broken up into "terms" where each term + is separated from the others by an AND operator. + If the WHERE clause is composed of constraints separated by the OR + operator then the entire clause is considered to be a single "term" + to which the <a href="#or_opt">OR-clause optimization</a> is applied. + +</p><p> + All terms of the WHERE clause are analyzed to see if they can be + satisfied using indexes. + To be usable by an index a term must usually be of one of the following + forms: + +<blockquote><pre><b> + </b><i>column</i><b> = </b><i>expression</i><b> + </b><i>column</i><b> IS </b><i>expression</i><b> + </b><i>column</i><b> > </b><i>expression</i><b> + </b><i>column</i><b> >= </b><i>expression</i><b> + </b><i>column</i><b> < </b><i>expression</i><b> + </b><i>column</i><b> <= </b><i>expression</i><b> + </b><i>expression</i><b> = </b><i>column</i><b> + </b><i>expression</i><b> > </b><i>column</i><b> + </b><i>expression</i><b> >= </b><i>column</i><b> + </b><i>expression</i><b> < </b><i>column</i><b> + </b><i>expression</i><b> <= </b><i>column</i><b> + </b><i>column</i><b> IN (</b><i>expression-list</i><b>) + </b><i>column</i><b> IN (</b><i>subquery</i><b>) + </b><i>column</i><b> IS NULL + </b><i>column</i><b> LIKE </b><i>pattern</i><b> + </b><i>column</i><b> GLOB </b><i>pattern</i><b> +</b></pre></blockquote> + +</p><p> + If an index is created using a statement like this: + +</p><div class="codeblock"><pre>CREATE INDEX idx_ex1 ON ex1(a,b,c,d,e,...,y,z); +</pre></div> + +<p> + Then the index might be used if the initial columns of the index + (columns a, b, and so forth) appear in WHERE clause terms. + The initial columns of the index must be used with + the <b>=</b> or <b>IN</b> or <b>IS</b> operators. + The right-most column that is used can employ inequalities. + For the right-most + column of an index that is used, there can be up to two inequalities + that must sandwich the allowed values of the column between two extremes. + +</p><p> + It is not necessary for every column of an index to appear in a + WHERE clause term in order for that index to be used. + However, there cannot be gaps in the columns of the index that are used. + Thus for the example index above, if there is no WHERE clause term + that constrains column c, then terms that constrain columns a and b can + be used with the index but not terms that constrain columns d through z. + Similarly, index columns will not normally be used (for indexing purposes) + if they are to the right of a + column that is constrained only by inequalities. + (See the <a href="optoverview.html#skipscan">skip-scan optimization</a> below for the exception.) + +</p><p> + In the case of <a href="expridx.html">indexes on expressions</a>, whenever the word "column" is + used in the foregoing text, one can substitute "indexed expression" + (meaning a copy of the expression that appears in the <a href="lang_createindex.html">CREATE INDEX</a> + statement) and everything will work the same. + + +<a name="idxexamp"></a> + +</p><h2 id="index_term_usage_examples"><span>2.1. </span>Index Term Usage Examples</h2> +<p> + For the index above and WHERE clause like this: + +</p><div class="codeblock"><pre>... WHERE a=5 AND b IN (1,2,3) AND c IS NULL AND d='hello' +</pre></div> +<p> + The first four columns a, b, c, and d of the index would be usable since + those four columns form a prefix of the index and are all bound by + equality constraints. + +</p><p> + For the index above and WHERE clause like this: + +</p><div class="codeblock"><pre>... WHERE a=5 AND b IN (1,2,3) AND c>12 AND d='hello' +</pre></div> +<p> + Only columns a, b, and c of the index would be usable. The d column + would not be usable because it occurs to the right of c and c is + constrained only by inequalities. + +</p><p> + For the index above and WHERE clause like this: + +</p><div class="codeblock"><pre>... WHERE a=5 AND b IN (1,2,3) AND d='hello' +</pre></div> +<p> + Only columns a and b of the index would be usable. The d column + would not be usable because column c is not constrained and there can + be no gaps in the set of columns that usable by the index. + +</p><p> + For the index above and WHERE clause like this: + +</p><div class="codeblock"><pre>... WHERE b IN (1,2,3) AND c NOT NULL AND d='hello' +</pre></div> +<p> + The index is not usable at all because the left-most column of the + index (column "a") is not constrained. Assuming there are no other + indexes, the query above would result in a full table scan. + +</p><p> + For the index above and WHERE clause like this: + +</p><div class="codeblock"><pre>... WHERE a=5 OR b IN (1,2,3) OR c NOT NULL OR d='hello' +</pre></div> +<p> + The index is not usable because the WHERE clause terms are connected + by OR instead of AND. This query would result in a full table scan. + However, if three additional indexes where added that contained columns + b, c, and d as their left-most columns, then the + <a href="#or_opt">OR-clause optimization</a> might apply. + + +<a name="between_opt"></a> + +</p><h1 id="the_between_optimization"><span>3. </span>The BETWEEN Optimization</h1> + +<p> + If a term of the WHERE clause is of the following form: + +<blockquote><pre><b> + </b><i>expr1</i><b> BETWEEN </b><i>expr2</i><b> AND </b><i>expr3</i><b> +</b></pre></blockquote> +</p><p> + Then two "virtual" terms are added as follows: + +<blockquote><pre><b> + </b><i>expr1</i><b> >= </b><i>expr2</i><b> AND </b><i>expr1</i><b> <= </b><i>expr3</i><b> +</b></pre></blockquote> +</p><p> + Virtual terms are used for analysis only and do not cause any byte-code + to be generated. + If both virtual terms end up being used as constraints on an index, + then the original BETWEEN term is omitted and the corresponding test + is not performed on input rows. + Thus if the BETWEEN term ends up being used as an index constraint + no tests are ever performed on that term. + On the other hand, the + virtual terms themselves never causes tests to be performed on + input rows. + Thus if the BETWEEN term is not used as an index constraint and + instead must be used to test input rows, the <i>expr1</i> expression is + only evaluated once. + +<a name="or_opt"></a> + +</p><h1 id="or_optimizations"><span>4. </span>OR Optimizations</h1> + +<p> + WHERE clause constraints that are connected by OR instead of AND can + be handled in two different ways. + If a term consists of multiple subterms containing a common column + name and separated by OR, like this: + +<blockquote><pre><b> + </b><i>column</i><b> = </b><i>expr1</i><b> OR </b><i>column</i><b> = </b><i>expr2</i><b> OR </b><i>column</i><b> = </b><i>expr3</i><b> OR ... +</b></pre></blockquote> +</p><p> + Then that term is rewritten as follows: + +<blockquote><pre><b> + </b><i>column</i><b> IN (</b><i>expr1</i><b>,</b><i>expr2</i><b>,</b><i>expr3</i><b>,...) +</b></pre></blockquote> +</p><p> + The rewritten term then might go on to constrain an index using the + normal rules for <b>IN</b> operators. Note that <i>column</i> must be + the same column in every OR-connected subterm, + although the column can occur on either the left or the right side of + the <b>=</b> operator. + +</p><p> + If and only if the previously described conversion of OR to an IN operator + does not work, the second OR-clause optimization is attempted. + Suppose the OR clause consists of multiple subterms as follows: + +<blockquote><pre><b> + </b><i>expr1</i><b> OR </b><i>expr2</i><b> OR </b><i>expr3</i><b> +</b></pre></blockquote> +</p><p> + Individual subterms might be a single comparison expression like + <b>a=5</b> or <b>x>y</b> or they can be + LIKE or BETWEEN expressions, or a subterm + can be a parenthesized list of AND-connected sub-subterms. + Each subterm is analyzed as if it were itself the entire WHERE clause + in order to see if the subterm is indexable by itself. + If <u>every</u> subterm of an OR clause is separately indexable + then the OR clause might be coded such that a separate index is used + to evaluate each term of the OR clause. One way to think about how + SQLite uses separate indexes for each OR clause term is to imagine + that the WHERE clause where rewritten as follows: + +<blockquote><pre><b> + rowid IN (SELECT rowid FROM </b><i>table</i><b> WHERE </b><i>expr1</i><b> + UNION SELECT rowid FROM </b><i>table</i><b> WHERE </b><i>expr2</i><b> + UNION SELECT rowid FROM </b><i>table</i><b> WHERE </b><i>expr3</i><b>) +</b></pre></blockquote> +</p><p> + The rewritten expression above is conceptual; WHERE clauses containing + OR are not really rewritten this way. + The actual implementation of the OR clause uses a mechanism that is + more efficient and that works even for <a href="withoutrowid.html">WITHOUT ROWID</a> tables or + tables in which the "rowid" is inaccessible. Nevertheless, + the essence of the implementation is captured by the statement + above: Separate indexes are used to find candidate result rows + from each OR clause term and the final result is the union of + those rows. + +</p><p> + Note that in most cases, SQLite will only use a single index for each + table in the FROM clause of a query. The second OR-clause optimization + described here is the exception to that rule. With an OR-clause, + a different index might be used for each subterm in the OR-clause. + +</p><p> + For any given query, the fact that the OR-clause optimization described + here can be used does not guarantee that it will be used. + SQLite uses a cost-based query planner that estimates the CPU and + disk I/O costs of various competing query plans and chooses the plan + that it thinks will be the fastest. If there are many OR terms in + the WHERE clause or if some of the indexes on individual OR-clause + subterms are not very selective, then SQLite might decide that it is + faster to use a different query algorithm, or even a full-table scan. + Application developers can use the + <a href="lang_explain.html">EXPLAIN QUERY PLAN</a> prefix on a statement to get a + high-level overview of the chosen query strategy. + +<a name="like_opt"></a> + +</p><h1 id="the_like_optimization"><span>5. </span>The LIKE Optimization</h1> + +<p> + A WHERE-clause term that uses the <a href="lang_expr.html#like">LIKE</a> or <a href="lang_expr.html#glob">GLOB</a> operator + can sometimes be used with an index to do a range search, + almost as if the LIKE or GLOB were an alternative to a <a href="lang_expr.html#between">BETWEEN</a> + operator. + There are many conditions on this optimization: + +</p><p> + </p><ol> + <li>The right-hand side of the LIKE or GLOB must be either a string literal + or a <a href="lang_expr.html#varparam">parameter</a> bound to a string literal + that does not begin with a wildcard character.</li> + <li>It must not be possible to make the LIKE or GLOB operator true by + having a numeric value (instead of a string or blob) on the + left-hand side. This means that either: + <ol type="A"> + <li> the left-hand side of the LIKE or GLOB operator is the name + of an indexed column with <a href="datatype3.html#affinity">TEXT affinity</a>, or</li> + <li> the right-hand side pattern argument does not begin with a + minus sign ("-") or a digit.</li> + </ol> + This constraint arises from the fact that numbers do not sort in + lexicographical order. For example: 9<10 but '9'>'10'.</li> + <li>The built-in functions used to implement LIKE and GLOB must not + have been overloaded using the <a href="c3ref/create_function.html">sqlite3_create_function()</a> API.</li> + <li>For the GLOB operator, the column must be indexed using the + built-in BINARY collating sequence.</li> + <li>For the LIKE operator, if <a href="pragma.html#pragma_case_sensitive_like">case_sensitive_like</a> mode is enabled then + the column must indexed using BINARY collating sequence, or if + <a href="pragma.html#pragma_case_sensitive_like">case_sensitive_like</a> mode is disabled then the column must indexed + using built-in NOCASE collating sequence.</li> + <li>If the ESCAPE option is used, the ESCAPE character must be ASCII, + or a single-byte character in UTF-8. + </li></ol> + +<p> + The LIKE operator has two modes that can be set by a + <a href="pragma.html#pragma_case_sensitive_like">pragma</a>. The + default mode is for LIKE comparisons to be insensitive to differences + of case for latin1 characters. Thus, by default, the following + expression is true: + +</p><div class="codeblock"><pre>'a' LIKE 'A' +</pre></div> +<p> + If the case_sensitive_like pragma is enabled as follows: + +</p><div class="codeblock"><pre>PRAGMA case_sensitive_like=ON; +</pre></div> +<p> + Then the LIKE operator pays attention to case and the example above would + evaluate to false. Note that case insensitivity only applies to + latin1 characters - basically the upper and lower case letters of English + in the lower 127 byte codes of ASCII. International character sets + are case sensitive in SQLite unless an application-defined + <a href="datatype3.html#collation">collating sequence</a> and <a href="lang_corefunc.html#like">like() SQL function</a> are provided that + take non-ASCII characters into account. + If an application-defined collating sequence and/or like() SQL + function are provided, the LIKE optimization described here will never + be taken. + +</p><p> + The LIKE operator is case insensitive by default because this is what + the SQL standard requires. You can change the default behavior at + compile time by using the <a href="compile.html#case_sensitive_like">SQLITE_CASE_SENSITIVE_LIKE</a> command-line option + to the compiler. + +</p><p> + The LIKE optimization might occur if the column named on the left of the + operator is indexed using the built-in BINARY collating sequence and + case_sensitive_like is turned on. Or the optimization might occur if + the column is indexed using the built-in NOCASE collating sequence and the + case_sensitive_like mode is off. These are the only two combinations + under which LIKE operators will be optimized. + +</p><p> + The GLOB operator is always case sensitive. The column on the left side + of the GLOB operator must always use the built-in BINARY collating sequence + or no attempt will be made to optimize that operator with indexes. + +</p><p> + The LIKE optimization will only be attempted if + the right-hand side of the GLOB or LIKE operator is either + literal string or a <a href="lang_expr.html#varparam">parameter</a> that has been <a href="c3ref/bind_blob.html">bound</a> + to a string literal. The string literal must not + begin with a wildcard; if the right-hand side begins with a wildcard + character then this optimization is not attempted. If the right-hand side + is a <a href="lang_expr.html#varparam">parameter</a> that is bound to a string, then this optimization is + only attempted if the <a href="c3ref/stmt.html">prepared statement</a> containing the expression + was compiled with <a href="c3ref/prepare.html">sqlite3_prepare_v2()</a> or <a href="c3ref/prepare.html">sqlite3_prepare16_v2()</a>. + The LIKE optimization is not attempted if the + right-hand side is a <a href="lang_expr.html#varparam">parameter</a> and the statement was prepared using + <a href="c3ref/prepare.html">sqlite3_prepare()</a> or <a href="c3ref/prepare.html">sqlite3_prepare16()</a>. + +</p><p> + Suppose the initial sequence of non-wildcard characters on the right-hand + side of the LIKE or GLOB operator is <i>x</i>. We are using a single + character to denote this non-wildcard prefix but the reader should + understand that the prefix can consist of more than 1 character. + Let <i>y</i> be the smallest string that is the same length as /x/ but which + compares greater than <i>x</i>. For example, if <i>x</i> is + <tt>'hello'</tt> then + <i>y</i> would be <tt>'hellp'</tt>. + The LIKE and GLOB optimizations consist of adding two virtual terms + like this: + +<blockquote><pre><b> + </b><i>column</i><b> >= </b><i>x</i><b> AND </b><i>column</i><b> < </b><i>y</i><b> +</b></pre></blockquote> +</p><p> + Under most circumstances, the original LIKE or GLOB operator is still + tested against each input row even if the virtual terms are used to + constrain an index. This is because we do not know what additional + constraints may be imposed by characters to the right + of the <i>x</i> prefix. However, if there is only a single + global wildcard to the right of <i>x</i>, then the original LIKE or + GLOB test is disabled. + In other words, if the pattern is like this: + +<blockquote><pre><b> + </b><i>column</i><b> LIKE </b><i>x</i><b>% + </b><i>column</i><b> GLOB </b><i>x</i><b>* +</b></pre></blockquote> +</p><p> + then the original LIKE or GLOB tests are disabled when the virtual + terms constrain an index because in that case we know that all of the + rows selected by the index will pass the LIKE or GLOB test. + +</p><p> + Note that when the right-hand side of a LIKE or GLOB operator is + a <a href="lang_expr.html#varparam">parameter</a> and the statement is prepared using <a href="c3ref/prepare.html">sqlite3_prepare_v2()</a> + or <a href="c3ref/prepare.html">sqlite3_prepare16_v2()</a> then the statement is automatically reparsed + and recompiled on the first <a href="c3ref/step.html">sqlite3_step()</a> call of each run if the binding + to the right-hand side parameter has changed since the previous run. + This reparse and recompile is essentially the same action that occurs + following a schema change. The recompile is necessary so that the query + planner can examine the new value bound to the right-hand side of the + LIKE or GLOB operator and determine whether or not to employ the + optimization described above. + +<a name="skipscan"></a> + +</p><h1 id="the_skip_scan_optimization"><span>6. </span>The Skip-Scan Optimization</h1> + +<p> + The general rule is that indexes are only useful if there are + WHERE-clause constraints on the left-most columns of the index. + However, in some cases, + SQLite is able to use an index even if the first few columns of + the index are omitted from the WHERE clause but later columns + are included. + + +</p><p> + Consider a table such as the following: + +</p><div class="codeblock"><pre>CREATE TABLE people( + name TEXT PRIMARY KEY, + role TEXT NOT NULL, + height INT NOT NULL, -- in cm + CHECK( role IN ('student','teacher') ) +); +CREATE INDEX people_idx1 ON people(role, height); +</pre></div> + +<p> + The people table has one entry for each person in a large + organization. Each person is either a "student" or a "teacher", + as determined by the "role" field. The table also records the height in + centimeters of each person. The role and height are indexed. + Notice that the left-most column of the index is not very + selective - it only contains two possible values. + + +</p><p> + Now consider a query to find the names of everyone in the + organization that is 180cm tall or taller: + + +</p><div class="codeblock"><pre>SELECT name FROM people WHERE height>=180; +</pre></div> + +<p> + Because the left-most column of the index does not appear in the + WHERE clause of the query, one is tempted to conclude that the + index is not usable here. However, SQLite is able to use the index. + Conceptually, SQLite uses the index as if the query were more + like the following: + + +</p><div class="codeblock"><pre>SELECT name FROM people + WHERE role IN (SELECT DISTINCT role FROM people) + AND height>=180; +</pre></div> + +<p> + Or this: + +</p><div class="codeblock"><pre>SELECT name FROM people WHERE role='teacher' AND height>=180 +UNION ALL +SELECT name FROM people WHERE role='student' AND height>=180; +</pre></div> + +<p> + The alternative query formulations shown above are conceptual only. + SQLite does not really transform the query. + The actual query plan is like this: + SQLite locates the first possible value for "role", which it + can do by rewinding the "people_idx1" index to the beginning and reading + the first record. SQLite stores this first "role" value in an + internal variable that we will here call "$role". Then SQLite + runs a query like: "SELECT name FROM people WHERE role=$role AND height>=180". + This query has an equality constraint on the left-most column of the + index and so the index can be used to resolve that query. Once + that query is finished, SQLite then uses the "people_idx1" index to + locate the next value of the "role" column, using code that is logically + similar to "SELECT role FROM people WHERE role>$role LIMIT 1". + This new "role" value overwrites the $role variable, and the process + repeats until all possible values for "role" have been examined. + +</p><p> + We call this kind of index usage a "skip-scan" because the database + engine is basically doing a full scan of the index but it optimizes the + scan (making it less than "full") by occasionally skipping ahead to the + next candidate value. + +</p><p> + SQLite might use a skip-scan on an index if it knows that the first + one or more columns contain many duplication values. + If there are too few duplicates + in the left-most columns of the index, then it would + be faster to simply step ahead to the next value, and thus do + a full table scan, than to do a binary search on an index to locate + the next left-column value. + +</p><p> + The only way that SQLite can know that there are many duplicates + in the left-most columns of an index + is if the <a href="lang_analyze.html">ANALYZE</a> command has been run + on the database. + Without the results of ANALYZE, SQLite has to guess at the "shape" of + the data in the table, and the default guess is that there are an average + of 10 duplicates for every value in the left-most column of the index. + Skip-scan only becomes profitable (it only gets to be faster than + a full table scan) when the number of duplicates is about 18 or more. + Hence, a skip-scan is never used on a database that has not been analyzed. + +<a name="joins"></a> + +</p><h1 id="joins"><span>7. </span>Joins</h1> + +<p> + The ON and USING clauses of an inner join are converted into additional + terms of the WHERE clause prior to WHERE clause analysis described + <a href="#where_clause">above in paragraph 2.0</a>. + Thus with SQLite, there is no computational + advantage to use the newer SQL92 join syntax + over the older SQL89 comma-join syntax. They both end up accomplishing + exactly the same thing on inner joins. + +</p><p> + For an OUTER JOIN the situation is more complex. The following + two queries are not equivalent: + +</p><div class="codeblock"><pre>SELECT * FROM tab1 LEFT JOIN tab2 ON tab1.x=tab2.y; +SELECT * FROM tab1 LEFT JOIN tab2 WHERE tab1.x=tab2.y; +</pre></div> +<p> + For an inner join, the two queries above would be identical. However, + special processing applies to the ON and USING clauses of an OUTER join: + specifically, the constraints in an ON or USING clause do not apply if + the right table of the join is on a null row, but the constraints do apply + in the WHERE clause. The net effect is that putting the ON or USING + clause expressions for a LEFT JOIN in the WHERE clause effectively converts + the query to an + ordinary INNER JOIN - albeit an inner join that runs more slowly. + +<a name="table_order"></a> + +</p><h2 id="order_of_tables_in_a_join"><span>7.1. </span>Order of Tables in a Join</h2> + +<p> + The current implementation of + SQLite uses only loop joins. That is to say, joins are implemented as + nested loops. + +</p><p> + The default order of the nested loops in a join is for the left-most + table in the FROM clause to form the outer loop and the right-most + table to form the inner loop. + However, SQLite will nest the loops in a different order if doing so + will help it to select better indexes. + +</p><p> + Inner joins can be freely reordered. However a left outer join is + neither commutative nor associative and hence will not be reordered. + Inner joins to the left and right of the outer join might be reordered + if the optimizer thinks that is advantageous but the outer joins are + always evaluated in the order in which they occur. + +</p><p> + SQLite <a href="lang_select.html#crossjoin">treats the CROSS JOIN operator specially</a>. + The CROSS JOIN operator is commutative, in theory. However, SQLite chooses to + never reorder tables in a CROSS JOIN. This provides a mechanism + by which the programmer can force SQLite to choose a particular loop nesting + order. + +</p><p> + When selecting the order of tables in a join, SQLite uses an efficient + polynomial-time algorithm. Because of this, + SQLite is able to plan queries with 50- or 60-way joins in a matter of + microseconds + +</p><p> + Join reordering is automatic and usually works well enough that + programmers do not have to think about it, especially if <a href="lang_analyze.html">ANALYZE</a> + has been used to gather statistics about the available indexes, + though occasionally some hints from the programmer are needed. + Consider, for example, the following schema: + +</p><div class="codeblock"><pre>CREATE TABLE node( + id INTEGER PRIMARY KEY, + name TEXT +); +CREATE INDEX node_idx ON node(name); +CREATE TABLE edge( + orig INTEGER REFERENCES node, + dest INTEGER REFERENCES node, + PRIMARY KEY(orig, dest) +); +CREATE INDEX edge_idx ON edge(dest,orig); +</pre></div> +<p> + The schema above defines a directed graph with the ability to store a + name at each node. Now consider a query against this schema: + +</p><div class="codeblock"><pre>SELECT * + FROM edge AS e, + node AS n1, + node AS n2 + WHERE n1.name = 'alice' + AND n2.name = 'bob' + AND e.orig = n1.id + AND e.dest = n2.id; +</pre></div> +<p> + This query asks for is all information about edges that go from + nodes labeled "alice" to nodes labeled "bob". + The query optimizer in SQLite has basically two choices on how to + implement this query. (There are actually six different choices, but + we will only consider two of them here.) + Pseudocode below demonstrating these two choices. + +<a name="option1"></a> + +</p><p>Option 1: +</p><div class="codeblock"><pre>foreach n1 where n1.name='alice' do: + foreach n2 where n2.name='bob' do: + foreach e where e.orig=n1.id and e.dest=n2.id + return n1.*, n2.*, e.* + end + end +end +</pre></div> +<a name="option2"></a> + +<p>Option 2: +</p><div class="codeblock"><pre>foreach n1 where n1.name='alice' do: + foreach e where e.orig=n1.id do: + foreach n2 where n2.id=e.dest and n2.name='bob' do: + return n1.*, n2.*, e.* + end + end +end +</pre></div> +<p> + The same indexes are used to speed up every loop in both implementation + options. + The only difference in these two query plans is the order in which + the loops are nested. + +</p><p> + So which query plan is better? It turns out that the answer depends on + what kind of data is found in the node and edge tables. + +</p><p> + Let the number of alice nodes be M and the number of bob nodes be N. + Consider two scenarios. In the first scenario, M and N are both 2 but + there are thousands of edges on each node. In this case, option 1 is + preferred. With option 1, the inner loop checks for the existence of + an edge between a pair of nodes and outputs the result if found. + Because there are only 2 alice and bob nodes each, the inner loop + only has to run four times and the query is very quick. Option 2 would + take much longer here. The outer loop of option 2 only executes twice, + but because there are a large number of edges leaving each alice node, + the middle loop has to iterate many thousands of times. It will be + much slower. So in the first scenario, we prefer to use option 1. + +</p><p> + Now consider the case where M and N are both 3500. Alice nodes are + abundant. This time suppose each of these nodes is connected by only one + or two edges. Now option 2 is preferred. With option 2, + the outer loop still has to run 3500 times, but the middle loop only + runs once or twice for each outer loop and the inner loop will only + run once for each middle loop, if at all. So the total number of + iterations of the inner loop is around 7000. Option 1, on the other + hand, has to run both its outer loop and its middle loop 3500 times + each, resulting in 12 million iterations of the middle loop. + Thus in the second scenario, option 2 is nearly 2000 times faster + than option 1. + +</p><p> + So you can see that depending on how the data is structured in the table, + either query plan 1 or query plan 2 might be better. Which plan does + SQLite choose by default? As of version 3.6.18, without running <a href="lang_analyze.html">ANALYZE</a>, + SQLite will choose option 2. + If the <a href="lang_analyze.html">ANALYZE</a> command is run in order to gather statistics, + a different choice might be made if the statistics indicate that the + alternative is likely to run faster. + +<a name="manctrl"></a> + +</p><h2 id="manual_control_of_query_plans_using_sqlite_stat_tables"><span>7.2. </span>Manual Control Of Query Plans Using SQLITE_STAT Tables</h2> + +<p> + SQLite provides the ability for advanced programmers to exercise control + over the query plan chosen by the optimizer. One method for doing this + is to fudge the <a href="lang_analyze.html">ANALYZE</a> results in the <a href="fileformat2.html#stat1tab">sqlite_stat1</a>, + <a href="fileformat2.html#stat3tab">sqlite_stat3</a>, and/or <a href="fileformat2.html#stat4tab">sqlite_stat4</a> tables. This is not + recommended for most situations. + +<a name="crossjoin"></a> + +</p><h2 id="manual_control_of_query_plans_using_cross_join"><span>7.3. </span>Manual Control of Query Plans using CROSS JOIN</h2> + +<p> + Programmers can force SQLite to use a particular loop nesting order + for a join by using the CROSS JOIN operator instead of just JOIN, + INNER JOIN, NATURAL JOIN, or a "," join. Though CROSS JOINs are + commutative in theory, SQLite chooses to never reorder the tables in + a CROSS JOIN. Hence, the left table of a CROSS JOIN will always be + in an outer loop relative to the right table. + +</p><p> + In the following query, the optimizer is free to reorder the + tables of FROM clause any way it sees fit: + +</p><div class="codeblock"><pre>SELECT * + FROM node AS n1, + edge AS e, + node AS n2 + WHERE n1.name = 'alice' + AND n2.name = 'bob' + AND e.orig = n1.id + AND e.dest = n2.id; +</pre></div> +<p> + In the following logically equivalent formulation of the same query, + the substitution of "CROSS JOIN" for the "," means that the order + of tables must be N1, E, N2. + +</p><div class="codeblock"><pre>SELECT * + FROM node AS n1 CROSS JOIN + edge AS e CROSS JOIN + node AS n2 + WHERE n1.name = 'alice' + AND n2.name = 'bob' + AND e.orig = n1.id + AND e.dest = n2.id; +</pre></div> +<p> + In the latter query, the query plan must be + <a href="#option2">option 2</a>. Note that + you must use the keyword "CROSS" in order to disable the table reordering + optimization; INNER JOIN, NATURAL JOIN, JOIN, and other similar + combinations work just like a comma join in that the optimizer is + free to reorder tables as it sees fit. (Table reordering is also + disabled on an outer join, but that is because outer joins are not + associative or commutative. Reordering tables in OUTER JOIN changes + the result.) + +</p><p> + See "<a href="queryplanner-ng.html#fossilcasestudy">The Fossil NGQP Upgrade Case Study</a>" for another real-world example + of using CROSS JOIN to manually control the nesting order of a join. + The <a href="queryplanner-ng.html#howtofix">query planner checklist</a> found later in the same document provides + further guidance on manual control of the query planner. + +<a name="multi_index"></a> + +</p><h1 id="choosing_between_multiple_indexes"><span>8. </span>Choosing Between Multiple Indexes</h1> + +<p> + Each table in the FROM clause of a query can use at most one index + (except when the <a href="#or_opt">OR-clause optimization</a> comes into + play) + and SQLite strives to use at least one index on each table. Sometimes, + two or more indexes might be candidates for use on a single table. + For example: + +</p><div class="codeblock"><pre>CREATE TABLE ex2(x,y,z); +CREATE INDEX ex2i1 ON ex2(x); +CREATE INDEX ex2i2 ON ex2(y); +SELECT z FROM ex2 WHERE x=5 AND y=6; +</pre></div> +<p> + For the SELECT statement above, the optimizer can use the ex2i1 index + to lookup rows of ex2 that contain x=5 and then test each row against + the y=6 term. Or it can use the ex2i2 index to lookup rows + of ex2 that contain y=6 then test each of those rows against the + x=5 term. + +</p><p> + When faced with a choice of two or more indexes, SQLite tries to estimate + the total amount of work needed to perform the query using each option. + It then selects the option that gives the least estimated work. + +</p><p> + To help the optimizer get a more accurate estimate of the work involved + in using various indexes, the user may optionally run the <a href="lang_analyze.html">ANALYZE</a> command. + The <a href="lang_analyze.html">ANALYZE</a> command scans all indexes of database where there might + be a choice between two or more indexes and gathers statistics on the + selectiveness of those indexes. The statistics gathered by + this scan are stored in special database tables names shows names all + begin with "<b>sqlite_stat</b>". + The content of these tables is not updated as the database + changes so after making significant changes it might be prudent to + rerun <a href="lang_analyze.html">ANALYZE</a>. + The results of an ANALYZE command are only available to database connections + that are opened after the ANALYZE command completes. + +</p><p> + The various <b>sqlite_stat</b><i>N</i> tables contain information on how + selective the various indexes are. For example, the <a href="fileformat2.html#stat1tab">sqlite_stat1</a> + table might indicate that an equality constraint on column x reduces the + search space to 10 rows on average, whereas an equality constraint on + column y reduces the search space to 3 rows on average. In that case, + SQLite would prefer to use index ex2i2 since that index is more selective. + +<a name="uplus"></a> + +</p><h2 id="disqualifying_where_clause_terms_using_unary_"><span>8.1. </span>Disqualifying WHERE Clause Terms using Unary-"+"</h2> + +<p> + Terms of the WHERE clause can be manually disqualified for use with + indexes by prepending a unary <b>+</b> operator to the column name. The + unary <b>+</b> is a no-op and will not generate any byte code in the prepared + statement. + However, the unary <b>+</b> operator will prevent the term from + constraining an index. + So, in the example above, if the query were rewritten as: + +</p><div class="codeblock"><pre>SELECT z FROM ex2 WHERE +x=5 AND y=6; +</pre></div> +<p> + The <b>+</b> operator on the <b>x</b> column will prevent that term from + constraining an index. This would force the use of the ex2i2 index. + +</p><p> + Note that the unary <b>+</b> operator also removes + <a href="datatype3.html#affinity">type affinity</a> from + an expression, and in some cases this can cause subtle changes in + the meaning of an expression. + In the example above, + if column <b>x</b> has <a href="datatype3.html#affinity">TEXT affinity</a> + then the comparison "x=5" will be done as text. The <b>+</b> operator + removes the affinity. So the comparison "<b>+x=5</b>" will compare the text + in column <b>x</b> with the numeric value 5 and will always be false. + +<a name="rangequery"></a> + +</p><h2 id="range_queries"><span>8.2. </span>Range Queries</h2> + +<p> + Consider a slightly different scenario: + +</p><div class="codeblock"><pre>CREATE TABLE ex2(x,y,z); +CREATE INDEX ex2i1 ON ex2(x); +CREATE INDEX ex2i2 ON ex2(y); +SELECT z FROM ex2 WHERE x BETWEEN 1 AND 100 AND y BETWEEN 1 AND 100; +</pre></div> +<p> + Further suppose that column x contains values spread out + between 0 and 1,000,000 and column y contains values + that span between 0 and 1,000. In that scenario, + the range constraint on column x should reduce the search space by + a factor of 10,000 whereas the range constraint on column y should + reduce the search space by a factor of only 10. So the ex2i1 index + should be preferred. + +</p><p> + SQLite will make this determination, but only if it has been compiled + with <a href="compile.html#enable_stat3">SQLITE_ENABLE_STAT3</a> or <a href="compile.html#enable_stat4">SQLITE_ENABLE_STAT4</a>. + The <a href="compile.html#enable_stat3">SQLITE_ENABLE_STAT3</a> and <a href="compile.html#enable_stat4">SQLITE_ENABLE_STAT4</a> options causes + the <a href="lang_analyze.html">ANALYZE</a> command to collect a histogram of column content in the + <a href="fileformat2.html#stat3tab">sqlite_stat3</a> or <a href="fileformat2.html#stat4tab">sqlite_stat4</a> tables and to use this histogram to + make a better guess at the best query to use for range constraints + such as the above. The main difference between STAT3 and STAT4 is + that STAT3 records histogram data for only the left-most column of + an index whereas STAT4 records histogram data for all columns of an + index. For single-column indexes, STAT3 and STAT4 work the same. + +</p><p> + The histogram data is only useful if the right-hand side of the constraint + is a simple compile-time constant or <a href="lang_expr.html#varparam">parameter</a> and not an expression. + +</p><p> + Another limitation of the histogram data is that it only applies to the + left-most column on an index. Consider this scenario: + +</p><div class="codeblock"><pre>CREATE TABLE ex3(w,x,y,z); +CREATE INDEX ex3i1 ON ex2(w, x); +CREATE INDEX ex3i2 ON ex2(w, y); +SELECT z FROM ex3 WHERE w=5 AND x BETWEEN 1 AND 100 AND y BETWEEN 1 AND 100; +</pre></div> +<p> + Here the inequalities are on columns x and y which are not the + left-most index columns. Hence, the histogram data which is collected no + left-most column of indexes is useless in helping to choose between the + range constraints on columns x and y. + +<a name="covidx"></a> + +</p><h1 id="covering_indexes"><span>9. </span>Covering Indexes</h1> + +<p> + When doing an indexed lookup of a row, the usual procedure is to + do a binary search on the index to find the index entry, then extract + the <a href="lang_createtable.html#rowid">rowid</a> from the index and use that <a href="lang_createtable.html#rowid">rowid</a> to do a binary search on + the original table. Thus a typical indexed lookup involves two + binary searches. + If, however, all columns that were to be fetched from the table are + already available in the index itself, SQLite will use the values + contained in the index and will never look up the original table + row. This saves one binary search for each row and can make many + queries run twice as fast. + +</p><p> + When an index contains all of the data needed for a query and when the + original table never needs to be consulted, we call that index a + "covering index". + +<a name="order_by"></a> + +</p><h1 id="order_by_optimizations"><span>10. </span>ORDER BY Optimizations</h1> + +<p> + SQLite attempts to use an index to satisfy the ORDER BY clause of a + query when possible. + When faced with the choice of using an index to satisfy WHERE clause + constraints or satisfying an ORDER BY clause, SQLite does the same + cost analysis described above + and chooses the index that it believes will result in the fastest answer. + +</p><p> + SQLite will also attempt to use indexes to help satisfy GROUP BY clauses + and the DISTINCT keyword. If the nested loops of the join can be arranged + such that rows that are equivalent for the GROUP BY or for the DISTINCT are + consecutive, then the GROUP BY or DISTINCT logic can determine if the + current row is part of the same group or if the current row is distinct + simply by comparing the current row to the previous row. + This can be much faster than the alternative of comparing each row to + all prior rows. + +<a name="partsort"></a> + +</p><h2 id="partial_order_by_via_index"><span>10.1. </span>Partial ORDER BY via Index</h2> + +<p> + If a query contains an ORDER BY clause with multiple terms, it might + be that SQLite can use indexes to cause rows to come out in the order + of some prefix of the terms in the ORDER BY but that later terms in + the ORDER BY are not satisfied. In that case, SQLite does block sorting. + Suppose the ORDER BY clause has four terms and the natural order of the + query results in rows appearing in order of the first two terms. As + each row is output by the query engine and enters the sorter, the + outputs in the current row corresponding to the first two terms of + the ORDER BY are compared against the previous row. If they have + changed, the current sort is finished and output and a new sort is + started. This results in a slightly faster sort. Event bigger + advantages are that many fewer rows need to be held in memory, + reducing memory requirements, and outputs can begin to appear before + the core query has run to completion. + +<a name="flattening"></a> + +</p><h1 id="subquery_flattening"><span>11. </span>Subquery Flattening</h1> + +<p> + When a subquery occurs in the FROM clause of a SELECT, the simplest + behavior is to evaluate the subquery into a transient table, then run + the outer SELECT against the transient table. Such a plan + can be suboptimal since the transient table will not have any indexes + and the outer query (which is likely a join) will be forced to do a + full table scan on the transient table. + +</p><p> + To overcome this problem, SQLite attempts to flatten subqueries in + the FROM clause of a SELECT. + This involves inserting the FROM clause of the subquery into the + FROM clause of the outer query and rewriting expressions in + the outer query that refer to the result set of the subquery. + For example: + +</p><div class="codeblock"><pre>SELECT t1.a, t2.b FROM t2, (SELECT x+y AS a FROM t1 WHERE z<100) WHERE a>5 +</pre></div> +<p> + Would be rewritten using query flattening as: + +</p><div class="codeblock"><pre>SELECT t1.x+t1.y AS a, t2.b FROM t2, t1 WHERE z<100 AND a>5 +</pre></div> +<p> + There is a long list of conditions that must all be met in order for + query flattening to occur. Some of the constraints are marked as + obsolete by italic text. These extra constraints are retained in the + documentation to preserve the numbering of the other constraints. + +</p><p> + Casual readers are not expected to understand all of these rules. + A key take-away from this section is that the rules for determining + if query flatting is safe or unsafe are subtle and + complex. There have been multiple bugs over the years caused by + over-aggressive query flattening. On the other hand, performance + of complex queries and/or queries involving views tends to suffer + if query flattening is more conservative. + +</p><p> + </p><ol> + <li value="1"> <i>(Obsolete. Query flattening is no longer + attempted for aggregate subqueries.)</i> + + </li><li value="2"> <i>(Obsolete. Query flattening is no longer + attempted for aggregate subqueries.)</i> + + </li><li value="3"> + If the subquery is the right operand of a LEFT JOIN then + <ol type="a"><li> the subquery may not be a join, and + </li><li> the FROM clause of the subquery may + not contain a virtual table, and + </li><li> the outer query may not be an aggregate.</li></ol></li> + + <li value="4"> The subquery is not DISTINCT. + + </li><li value="5"> <i>(Subsumed into constraint 4)</i> + + </li><li value="6"> <i>(Obsolete. Query flattening is no longer + attempted for aggregate subqueries.)</i> + + </li><li value="7"> + The subquery has a FROM clause. + + </li><li value="8"> + The subquery does not use LIMIT or the outer query is not a join. + + </li><li value="9"> + The subquery does not use LIMIT or the outer query does not use + aggregates. + + </li><li value="10"> <i>(Restriction relaxed in 2005)</i> + + </li><li value="11"> + The subquery and the outer query do not both have ORDER BY clauses. + + </li><li value="12"> <i>(Subsumed into constraint 3)</i> + + </li><li value="13"> The subquery and outer query do not both use LIMIT. + + </li><li value="14"> The subquery does not use OFFSET. + + </li><li value="15"> + If the outer query is part of a compound select, then the + subquery may not have a LIMIT clause. + + </li><li value="16"> + If the outer query is an aggregate, then the subquery may + not contain ORDER BY. + + </li><li value="17"> + If the sub-query is a compound SELECT, then + <ol type="a"> + <li> all compound operators must be UNION ALL, and + </li><li> no terms with the subquery compound may be aggregate + or DISTINCT, and + </li><li> every term within the subquery must have a FROM clause, and + </li><li> the outer query may not be an aggregate, DISTINCT query, or join. + </li></ol> + + The parent and sub-query may contain WHERE clauses. Subject to + rules (11), (12) and (13), they may also contain ORDER BY, + LIMIT and OFFSET clauses. + + </li><li value="18"> + If the sub-query is a compound select, then all terms of the + ORDER by clause of the parent must be simple references to + columns of the sub-query. + + </li><li value="19"> + If the subquery uses LIMIT then the outer query may not + have a WHERE clause. + + </li><li value="20"> + If the sub-query is a compound select, then it must not use + an ORDER BY clause. + + </li><li value="21"> + If the subquery uses LIMIT, then the outer query may not be + DISTINCT. + + </li><li value="22"> The subquery may not be a recursive CTE. + + </li><li value="23"> <i>(Subsumed into constraint 17d.)</i> + + </li><li value="24"> <i>(Obsolete. Query flattening is no longer + attempted for aggregate subqueries.)</i> + </li></ol> + + +<p> + Query flattening is an important optimization when views are used as + each use of a view is translated into a subquery. + +<a name="coroutines"></a> + +</p><h1 id="subquery_co_routines"><span>12. </span>Subquery Co-routines</h1> + +<p> + Prior to SQLite 3.7.15 (2012-12-12), + a subquery in the FROM clause would be + either flattened into the outer query, or else the subquery would be run + to completion + before the outer query started, the result set from the subquery + would be stored in a transient table, + and then the transient table would be used in the outer query. Newer + versions of SQLite have a third option, which is to implement the subquery + using a co-routine. + +</p><p> + A co-routine is like a subroutine in that it runs in the same thread + as the caller and eventually returns control back to the caller. The + difference is that a co-routine also has the ability to return + before it has finished, and then resume where it left off the next + time it is called. + +</p><p> + When a subquery is implemented as a co-routine, byte-code is generated + to implement the subquery as if it were a standalone query, except + instead of returning rows of results back to the application, the + co-routine yields control back to the caller after each row is computed. + The caller can then use that one computed row as part of its computation, + then invoke the co-routine again when it is ready for the next row. + +</p><p> + Co-routines are better than storing the complete result set of the subquery + in a transient table because co-routines use less memory. With a co-routine, + only a single row of the result needs to be remembered, whereas all rows of + the result must be stored for a transient table. Also, because the + co-routine does not need to run to completion before the outer query + begins its work, the first rows of output can appear much sooner, and if + the overall query is aborted, less work is done overall. + +</p><p> + On the other hand, if the result of the subquery must be scanned multiple + times (because, for example, it is just one table in a join) then it + is better to use a transient table to remember the entire result of the + subquery, in order to avoid computing the subquery more than once. + +<a name="deferred_work"></a> + +</p><h2 id="using_co_routines_to_defer_work_until_after_the_sorting"><span>12.1. </span>Using Co-routines to Defer Work until after the Sorting</h2> + +<p> + As of SQLite version 3.21.0 (2017-10-24), the query planner will + always prefer to use a co-routine to implement FROM-clause subqueries + that contains an ORDER BY clause and that are not part of a join when + the result set of the outer query is "complex". This feature allows + applications to shift expensive computations from before the + sorter until after the sorter, which can result in faster operation. + For example, consider this query: + +</p><div class="codeblock"><pre>SELECT expensive_function(a) FROM tab ORDER BY date DESC LIMIT 5; +</pre></div> +<p> + The goal of this query is to compute some value for the five most + recent entries in the table. In the query above, the + "expensive_function()" is invoked prior to the sort and thus is + invoked on every row of the table, even + rows that are ultimately omitted due to the LIMIT clause. + A co-routine can be used to work around this: + +</p><div class="codeblock"><pre>SELECT expensive_function(a) FROM ( + SELECT a FROM tab ORDER BY date DESC LIMIT 5 +); +</pre></div> +<p> + In the revised query, the subquery implemented by a co-routine computes + the five most recent values for "a". Those five values are passed from the + co-routine up into the outer query where the "expensive_function()" is + invoked on only the specific rows that the application cares about. + +</p><p> + The query planner in future versions of SQLite might grow smart enough + to make transformations such as the above automatically, in both directions. + That is to say, future versions of SQLite might transform queries of the + first form into the second, or queries written the second way into the + first. As of SQLite version 3.22.0 (2018-01-22), the query planner + will flatten the subquery if the outer query does not make use of any + user-defined functions or subqueries in its result set. For the examples + shown above, however, SQLite implements each of the queries as + written. + +<a name="minmax"></a> + +</p><h1 id="the_min_max_optimization"><span>13. </span>The MIN/MAX Optimization</h1> + +<p> + Queries that contain a single MIN() or MAX() aggregate function whose + argument is the left-most column of an index might be satisfied + by doing a single index lookup rather than by scanning the entire table. + Examples: + +</p><div class="codeblock"><pre>SELECT MIN(x) FROM table; +SELECT MAX(x)+1 FROM table; +</pre></div> + +<a name="autoindex"></a> + +<h1 id="automatic_indexes"><span>14. </span>Automatic Indexes</h1> + +<p> + When no indexes are available to aid the evaluation of a query, SQLite + might create an automatic index that lasts only for the duration + of a single SQL statement. + Since the cost of constructing the automatic index is + O(NlogN) (where N is the number of entries in the table) and the cost of + doing a full table scan is only O(N), an automatic index will + only be created if SQLite expects that the lookup will be run more than + logN times during the course of the SQL statement. Consider an example: + +</p><div class="codeblock"><pre>CREATE TABLE t1(a,b); +CREATE TABLE t2(c,d); +-- Insert many rows into both t1 and t2 +SELECT * FROM t1, t2 WHERE a=c; +</pre></div> +<p> + In the query above, if both t1 and t2 have approximately N rows, then + without any indexes the query will require O(N*N) time. On the other + hand, creating an index on table t2 requires O(NlogN) time and using + that index to evaluate the query requires an additional O(NlogN) time. + In the absence of <a href="lang_analyze.html">ANALYZE</a> information, SQLite guesses that N is one + million and hence it believes that constructing the automatic index will + be the cheaper approach. + +</p><p> + An automatic index might also be used for a subquery: + +</p><div class="codeblock"><pre>CREATE TABLE t1(a,b); +CREATE TABLE t2(c,d); +-- Insert many rows into both t1 and t2 +SELECT a, (SELECT d FROM t2 WHERE c=b) FROM t1; +</pre></div> +<p> + In this example, the t2 table is used in a subquery to translate values + of the t1.b column. If each table contains N rows, SQLite expects that + the subquery will run N times, and hence it will believe it is faster + to construct an automatic, transient index on t2 first and then use + that index to satisfy the N instances of the subquery. + +</p><p> + The automatic indexing capability can be disabled at run-time using + the <a href="pragma.html#pragma_automatic_index">automatic_index pragma</a>. Automatic indexing is turned on by + default, but this can be changed so that automatic indexing is off + by default using the <a href="compile.html#default_automatic_index">SQLITE_DEFAULT_AUTOMATIC_INDEX</a> compile-time option. + The ability to create automatic indexes can be completely disabled by + compiling with the <a href="compile.html#omit_automatic_index">SQLITE_OMIT_AUTOMATIC_INDEX</a> compile-time option. + +</p><p> + In SQLite <a href="releaselog/3_8_0.html">version 3.8.0</a> (2013-08-26) and later, + an <a href="rescode.html#warning_autoindex">SQLITE_WARNING_AUTOINDEX</a> message is sent + to the <a href="errlog.html">error log</a> every time a statement is prepared that uses an + automatic index. Application developers can and should use these warnings + to identify the need for new persistent indexes in the schema. + +</p><p> + Do not confuse automatic indexes with the <a href="fileformat2.html#intschema">internal indexes</a> (having names + like "sqlite_autoindex_<i>table</i>_<i>N</i>") that are sometimes + created to implement a <a href="lang_createtable.html#primkeyconst">PRIMARY KEY constraint</a> or <a href="lang_createtable.html#uniqueconst">UNIQUE constraint</a>. + The automatic indexes described here exist only for the duration of a + single query, are never persisted to disk, and are only visible to a + single database connection. Internal indexes are part of the implementation + of PRIMARY KEY and UNIQUE constraints, are long-lasting and persisted + to disk, and are visible to all database connections. The term "autoindex" + appears in the names of <a href="fileformat2.html#intschema">internal indexes</a> for legacy reasons and does + not indicate that internal indexes and automatic indexes are related. + +<a name="hashjoin"></a> + +</p><h2 id="hash_joins"><span>14.1. </span>Hash Joins</h2> + +<p> + An automatic index is about the same thing as a + <a href="https://en.wikipedia.org/wiki/Hash_join">hash join</a>. The only difference + is that a B-Tree is used instead of a hash table. If you are willing to + say that the transient B-Tree constructed for an automatic index is + really just a fancy hash table, then a query that uses an automatic + index is just a hash join. + +</p><p> + SQLite constructs an transient index instead of a hash table in this + instance because it already has a robust and high performance B-Tree + implementation at hand, whereas a hash-table would need to be added. + Adding a separate hash table implementation to handle this one case + would increase the size of the library (which is designed for use on + low-memory embedded devices) for minimal performance gain. SQLite might + be enhanced with a hash-table implementation someday, but for now it seems + better to continue using automatic indexes in cases where client/server + database engines might use a hash join. + +<a name="pushdown"></a> + +</p><h1 id="the_push_down_optimization"><span>15. </span>The Push-Down Optimization</h1> + +<p> + If a subquery cannot be <a href="optoverview.html#flattening">flattened</a> into the outer query, it might + still be possible to enhance performance by "pushing down" WHERE clause + terms from the outer query into the subquery. Consider an example: + +</p><div class="codeblock"><pre>CREATE TABLE t1(a INT, b INT); +CREATE TABLE t2(x INT, y INT); +CREATE VIEW v1(a,b) AS SELECT DISTINCT a, b FROM t1; + +SELECT x, y, b + FROM t2 JOIN v1 ON (x=a) + WHERE b BETWEEN 10 AND 20; +</pre></div> + +<p> + The view v1 cannot be <a href="optoverview.html#flattening">flattened</a> because it is DISTINCT. It must + instead be run as a subquery with the results being stored in a + transient table, then the join is performed between t2 and the + transient table. The push-down optimization pushes down the + "b BETWEEN 10 AND 20" term into the view. This makes the transient + table smaller, and helps the subquery to run faster if there + is an index on t1.b. The resulting evaluation is like this: + +</p><div class="codeblock"><pre>SELECT x, y, b + FROM t2 + JOIN (SELECT DISTINCT a, b FROM t1 WHERE b BETWEEN 10 AND 20) + WHERE b BETWEEN 10 AND 20; +</pre></div> + +<p> + The push-down optimization cannot always be used. For example, + if the subquery contains a LIMIT, then pushing down any part of + the WHERE clause from the outer query could change the result of + the inner query. There are other restrictions, explained in a + comment in the source code on the pushDownWhereTerms() routine + that implements this optimization. + +<a name="leftjoinreduction"></a> + +</p><h1 id="the_left_join_strength_reduction_optimization"><span>16. </span>The LEFT JOIN Strength Reduction Optimization</h1> + +<p> + A LEFT JOIN can sometimes be converted into an ordinary JOIN + if there are terms in the WHERE clause that guarantee that the + two joins will give identical results. In particular, if any + column in the right-hand table of the LEFT JOIN must be non-NULL + in order for the WHERE clause to be true, then the LEFT JOIN is + demoted to an ordinary JOIN. + +</p><p> + The prover that determines whether any column of the right-hand + table of a LEFT JOIN must be non-NULL in the WHERE clause is + imperfect. It sometimes returns a false negative. In other words, + it sometimes fails to reduce the strength of a LEFT JOIN when doing + so was in fact possible. For example, the prover does not know + the <a href="lang_datefunc.html">datetime() SQL function</a> will always return NULL if its first + argument is NULL, and so it will not recognize that the LEFT JOIN + in the following query could be strength-reduced: + +</p><div class="codeblock"><pre>SELECT urls.url + FROM urls + LEFT JOIN + (SELECT * + FROM (SELECT url_id AS uid, max(retrieval_time) AS rtime + FROM lookups GROUP BY 1 ORDER BY 1) + WHERE uid IN (358341,358341,358341) + ) recent + ON u.source_seed_id = recent.xyz OR u.url_id = recent.xyz + WHERE + DATETIME(recent.rtime) > DATETIME('now', '-5 days'); +</pre></div> + +<p> + It is possible that future enhancements to the prover might enable it + to recognize that NULL inputs to certain built-in functions + always result in a NULL answer. However, not all built-in + functions have that property (for example <a href="lang_corefunc.html#coalesce">coalesce()</a>) and, of + course, the prover will never be able to reason about + <a href="appfunc.html">application-defined SQL functions</a>. + + +<a name="omitnoopjoin"></a> + +</p><h1 id="the_omit_left_join_optimization"><span>17. </span>The Omit LEFT JOIN Optimization</h1> + +<p> + Sometimes a LEFT JOIN can be completely omitted from a query without + changing the result. This can happen if all of the following are + true: + +</p><p> + </p><ol> + <li> The query is not an aggregate + </li><li> Either the query is DISTINCT or else the ON or USING clause + on the LEFT JOIN constrains the join such that it matches + only a single row + </li><li> The right-hand table of the LEFT JOIN is not be used anywhere + in the query outside of its own USING or ON clause. + </li></ol> + +<p> + LEFT JOIN elimination often comes up when LEFT JOINs are used + inside of views, and then the view is used in such as way that + none of the columns of the right-hand table of the LEFT JOIN are + referenced. + +</p><p> + Here is a simple example of omitting a LEFT JOIN: + +</p><div class="codeblock"><pre>CREATE TABLE t1(ipk INTEGER PRIMARY KEY, v1); +CREATE TABLE t2(ipk INTEGER PRIMARY KEY, v2); +CREATE TABLE t3(ipk INTEGER PRIMARY KEY, v3); + +SELECT v1, v3 FROM t1 + LEFT JOIN t2 ON (t1.ipk=t2.ipk) + LEFT JOIN t3 ON (t1.ipk=t3.ipk) +</pre></div> + +<p> + The t2 table is completely unused in the query above, and so the + query planner is able to implement the query as if it were written: + +</p><div class="codeblock"><pre>SELECT v1, v3 FROM t1 + LEFT JOIN t3 ON (t1.ipk=t3.ipk) +</pre></div> + + +<a name="constprop"></a> + +<h1 id="the_constant_propagation_optimization"><span>18. </span>The Constant Propagation Optimization</h1> + +<p> + When a WHERE clause contains two or more equality constraints connected + by the AND operator such that all of the <a href="datatype3.html#affinity">affinities</a> of the various + constraints are the same, then SQLite might use the transitive property + of equality to construct new "virtual" constraints that can be used to + simplify expressions and/or improve performance. This is called the + "constant-propagation optimization". + +</p><p> + For example, consider the following schema and query: + +</p><div class="codeblock"><pre>CREATE TABLE t1(a INTEGER PRIMARY KEY, b INT, c INT); +SELECT * FROM t1 WHERE a=b AND b=5; +</pre></div> + +<p> + SQLite looks at the "a=b" and "b=5" constraints and deduces that + if those two constraints are true, then it must also be the case + that "a=5" is true. This means that the desired row can be looked up + quickly using a value of 5 for the INTEGER PRIMARY KEY. +</p><p align="center"><small><i>This page last modified on <a href="https://sqlite.org/docsrc/honeypot" id="mtimelink" data-href="https://sqlite.org/docsrc/finfo/pages/optoverview.in?m=f697eab2d82b4d002">2022-09-19 12:27:16</a> UTC </small></i></p> + |