summaryrefslogtreecommitdiffstats
path: root/www/queryplanner-ng.html
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--www/queryplanner-ng.html1052
1 files changed, 1052 insertions, 0 deletions
diff --git a/www/queryplanner-ng.html b/www/queryplanner-ng.html
new file mode 100644
index 0000000..79efbc9
--- /dev/null
+++ b/www/queryplanner-ng.html
@@ -0,0 +1,1052 @@
+<!DOCTYPE html>
+<html><head>
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<meta http-equiv="content-type" content="text/html; charset=UTF-8">
+<link href="sqlite.css" rel="stylesheet">
+<title>The Next-Generation Query Planner</title>
+<!-- path= -->
+</head>
+<body>
+<div class=nosearch>
+<a href="index.html">
+<img class="logo" src="images/sqlite370_banner.gif" alt="SQLite" border="0">
+</a>
+<div><!-- IE hack to prevent disappearing logo --></div>
+<div class="tagline desktoponly">
+Small. Fast. Reliable.<br>Choose any three.
+</div>
+<div class="menu mainmenu">
+<ul>
+<li><a href="index.html">Home</a>
+<li class='mobileonly'><a href="javascript:void(0)" onclick='toggle_div("submenu")'>Menu</a>
+<li class='wideonly'><a href='about.html'>About</a>
+<li class='desktoponly'><a href="docs.html">Documentation</a>
+<li class='desktoponly'><a href="download.html">Download</a>
+<li class='wideonly'><a href='copyright.html'>License</a>
+<li class='desktoponly'><a href="support.html">Support</a>
+<li class='desktoponly'><a href="prosupport.html">Purchase</a>
+<li class='search' id='search_menubutton'>
+<a href="javascript:void(0)" onclick='toggle_search()'>Search</a>
+</ul>
+</div>
+<div class="menu submenu" id="submenu">
+<ul>
+<li><a href='about.html'>About</a>
+<li><a href='docs.html'>Documentation</a>
+<li><a href='download.html'>Download</a>
+<li><a href='support.html'>Support</a>
+<li><a href='prosupport.html'>Purchase</a>
+</ul>
+</div>
+<div class="searchmenu" id="searchmenu">
+<form method="GET" action="search">
+<select name="s" id="searchtype">
+<option value="d">Search Documentation</option>
+<option value="c">Search Changelog</option>
+</select>
+<input type="text" name="q" id="searchbox" value="">
+<input type="submit" value="Go">
+</form>
+</div>
+</div>
+<script>
+function toggle_div(nm) {
+var w = document.getElementById(nm);
+if( w.style.display=="block" ){
+w.style.display = "none";
+}else{
+w.style.display = "block";
+}
+}
+function toggle_search() {
+var w = document.getElementById("searchmenu");
+if( w.style.display=="block" ){
+w.style.display = "none";
+} else {
+w.style.display = "block";
+setTimeout(function(){
+document.getElementById("searchbox").focus()
+}, 30);
+}
+}
+function div_off(nm){document.getElementById(nm).style.display="none";}
+window.onbeforeunload = function(e){div_off("submenu");}
+/* Disable the Search feature if we are not operating from CGI, since */
+/* Search is accomplished using CGI and will not work without it. */
+if( !location.origin || !location.origin.match || !location.origin.match(/http/) ){
+document.getElementById("search_menubutton").style.display = "none";
+}
+/* Used by the Hide/Show button beside syntax diagrams, to toggle the */
+function hideorshow(btn,obj){
+var x = document.getElementById(obj);
+var b = document.getElementById(btn);
+if( x.style.display!='none' ){
+x.style.display = 'none';
+b.innerHTML='show';
+}else{
+x.style.display = '';
+b.innerHTML='hide';
+}
+return false;
+}
+var antiRobot = 0;
+function antiRobotGo(){
+if( antiRobot!=3 ) return;
+antiRobot = 7;
+var j = document.getElementById("mtimelink");
+if(j && j.hasAttribute("data-href")) j.href=j.getAttribute("data-href");
+}
+function antiRobotDefense(){
+document.body.onmousedown=function(){
+antiRobot |= 2;
+antiRobotGo();
+document.body.onmousedown=null;
+}
+document.body.onmousemove=function(){
+antiRobot |= 2;
+antiRobotGo();
+document.body.onmousemove=null;
+}
+setTimeout(function(){
+antiRobot |= 1;
+antiRobotGo();
+}, 100)
+antiRobotGo();
+}
+antiRobotDefense();
+</script>
+<div class=fancy>
+<div class=nosearch>
+<div class="fancy_title">
+The Next-Generation Query Planner
+</div>
+<div class="fancy_toc">
+<a onclick="toggle_toc()">
+<span class="fancy_toc_mark" id="toc_mk">&#x25ba;</span>
+Table Of Contents
+</a>
+<div id="toc_sub"><div class="fancy-toc1"><a href="#_introduction">1. Introduction</a></div>
+<div class="fancy-toc1"><a href="#_background">2. Background</a></div>
+<div class="fancy-toc2"><a href="#_query_planning_in_sqlite">2.1. Query Planning In SQLite</a></div>
+<div class="fancy-toc2"><a href="#_the_sqlite_query_planner_stability_guarantee">2.2. The SQLite Query Planner Stability Guarantee</a></div>
+<div class="fancy-toc1"><a href="#_a_difficult_case">3. A Difficult Case</a></div>
+<div class="fancy-toc2"><a href="#_query_details">3.1. Query Details</a></div>
+<div class="fancy-toc2"><a href="#_complications">3.2. Complications</a></div>
+<div class="fancy-toc2"><a href="#_finding_the_best_query_plan">3.3. Finding The Best Query Plan</a></div>
+<div class="fancy-toc2"><a href="#_the_n_nearest_neighbors_or_n3_heuristic">3.4. The N Nearest Neighbors or "N3" Heuristic</a></div>
+<div class="fancy-toc1"><a href="#_hazards_of_upgrading_to_ngqp">4. Hazards Of Upgrading To NGQP</a></div>
+<div class="fancy-toc2"><a href="#_case_study_upgrading_fossil_to_the_ngqp">4.1. Case Study: Upgrading Fossil to the NGQP</a></div>
+<div class="fancy-toc2"><a href="#_fixing_the_problem">4.2. Fixing The Problem</a></div>
+<div class="fancy-toc2"><a href="#update_2017_a_better_fix">4.3. Update 2017: A Better Fix</a></div>
+<div class="fancy-toc1"><a href="#_checklist_for_avoiding_or_fixing_query_planner_problems">5. Checklist For Avoiding Or Fixing Query Planner Problems</a></div>
+<div class="fancy-toc1"><a href="#_summary">6. Summary</a></div>
+</div>
+</div>
+<script>
+function toggle_toc(){
+var sub = document.getElementById("toc_sub")
+var mk = document.getElementById("toc_mk")
+if( sub.style.display!="block" ){
+sub.style.display = "block";
+mk.innerHTML = "&#x25bc;";
+} else {
+sub.style.display = "none";
+mk.innerHTML = "&#x25ba;";
+}
+}
+</script>
+</div>
+
+
+
+
+
+<h1 id="_introduction"><span>1. </span> Introduction</h1>
+
+<p>
+The task of the "query planner" is to figure
+out the best algorithm or "query plan" to accomplish an SQL statement.
+Beginning with SQLite <a href="releaselog/3_8_0.html">version 3.8.0</a> (2013-08-26),
+the query planner component has been
+rewritten so that it runs faster and generates better plans. The
+rewrite is called the "next generation query planner" or "NGQP".
+</p>
+
+<p>This article overviews the importance of query planning, describes some
+of the problems inherent to query planning, and outlines how the NGQP
+solves those problems.</p>
+
+<p>The NGQP is almost always better than the legacy query planner.
+However, there may exist legacy applications that unknowingly depend on
+undefined and/or suboptimal behavior in the legacy query planner, and
+upgrading to the NGQP on those legacy applications could cause performance
+regressions. This risk is considered and a checklist is provided
+for reducing the risk and for fixing any issues that do arise.</p>
+
+<p>This document focuses on the NGQP. For a more general overview of the
+SQLite query planner that encompasses the entire history of SQLite, see the
+"<a href="optoverview.html">The SQLite Query Optimizer Overview</a>" and
+"<a href="queryplanner.html">How Indexes Work</a>" documents.</p>
+
+<h1 id="_background"><span>2. </span> Background</h1>
+
+<p>For simple queries against a single table with few indexes, there is usually
+an obvious choice for the best algorithm.
+But for larger and more complex queries, such as
+multi-way joins with many indexes
+and subqueries, there can be hundreds, thousands, or millions
+of reasonable algorithms for computing the result.
+The job of the query planner is to choose the single "best" query plan from
+this multitude of possibilities.</p>
+
+<p>Query planners are what make SQL database engines so amazingly useful and powerful.
+(This is true of all SQL database engines, not just SQLite.)
+The query planner frees the programmer from the chore of selecting
+a particular query plan, and thereby allows the programmer to
+focus more mental energy on higher-level application issues and on
+providing more value to the end user. For simple queries where the choice
+of query plan is obvious, this is convenient but not hugely important.
+But as applications and schemas and queries grow more complex, a
+clever query planner can greatly speed and simplify the work of application
+development.
+There is amazing power in being about to tell
+the database engine what content is desired, and then let the database
+engine figure out the best way to retrieve that content.</p>
+
+<p>Writing a good query planner is more art than science.
+The query planner must work with incomplete information.
+It cannot determine how long any particular plan will take
+without actually running that plan. So when comparing two
+or more plans to figure out which is "best", the query planner has to make
+some guesses and assumptions and those guesses and assumptions will
+sometimes be wrong. A good query planner is one that will
+find the correct solution often enough that application
+programmers rarely need to get involved.</p>
+
+<h2 id="_query_planning_in_sqlite"><span>2.1. </span> Query Planning In SQLite</h2>
+
+<p>SQLite computes joins using nested loops,
+one loop for each table
+in the join. (Additional loops might be inserted for IN
+and OR operators in the WHERE clause. SQLite considers those too,
+but for simplicity we will ignore them in this essay.)
+One or more indexes might be used on each loop to speed the search,
+or a loop might be a "full table scan" that reads every row in the
+table. Thus query planning decomposes into two subtasks:</p>
+<ol>
+<li> Picking the nested order of the various loops
+</li><li> Choosing good indexes for each loop
+</li></ol>
+<p>Picking the nesting order is generally the more challenging problem.
+Once the nesting order of the join is established, the choice of indexes
+for each loop is normally obvious.</p>
+
+<a name="qpstab"></a>
+
+<h2 id="_the_sqlite_query_planner_stability_guarantee"><span>2.2. </span> The SQLite Query Planner Stability Guarantee</h2>
+
+<p>When the Query Planner Stability Guarantee (QPSG) is enabled
+SQLite will always pick the same query plan for any
+given SQL statement as long as:
+
+</p><ol type="a">
+<li>the database schema does not change in significant ways such as
+ adding or dropping indexes,</li>
+<li>the ANALYZE command is not rerun, </li>
+<li>the same version of SQLite is used.</li>
+</ol>
+
+<p>The QPSG is disabled by default. It can be enabled at compile-time
+using the <a href="compile.html#enable_qpsg">SQLITE_ENABLE_QPSG</a> compile-time option, or at run-time by
+invoking <a href="c3ref/db_config.html">sqlite3_db_config</a>(db,<a href="c3ref/c_dbconfig_defensive.html#sqlitedbconfigenableqpsg">SQLITE_DBCONFIG_ENABLE_QPSG</a>,1,0).
+
+</p><p>The QPSG means that if all of your queries run efficiently
+during testing, and if your application does not change the schema,
+then SQLite will not suddenly decide to start using a different
+query plan, possibly causing a performance problem after your application
+is released to users. If your application works in the lab, it
+will continue working the same way after deployment.</p>
+
+<p>Enterprise-class client/server SQL database engines do not normally
+make this guarantee.
+In client/server SQL database engines, the server keeps track of
+statistics on the sizes of tables and on the quality of indexes
+and the query planner uses those statistics to help select the best plans.
+As content is added, deleted, or changed in the database, the statistics
+will evolve and may cause the query planner to begin using a different
+query plan for some particular query. Usually the new plan will be better
+for the evolving structure of the data. But sometimes the new query plan will
+cause a performance reduction. With a client/server database engine, there
+is typically a Database Administrator (DBA) on hand to deal with these
+rare problems as they come up. But DBAs are not available to fix problems
+in an embedded database like SQLite, and hence SQLite is careful to
+ensure that plans do not change unexpectedly after deployment.</p>
+
+<p>It is important to note that changing versions of SQLite might cause
+changes in query plans.
+The same version of SQLite will
+always pick the same query plan, but if you relink your application to use
+a different version of SQLite, then query plans might change. In rare
+cases, an SQLite version change might lead to a performance regression.
+This is one reason
+you should consider statically linking your applications against SQLite
+rather than use a system-wide SQLite shared library which might
+change without your knowledge or control.</p>
+
+<h1 id="_a_difficult_case"><span>3. </span> A Difficult Case</h1>
+
+<p>
+"TPC-H Q8" is a test query from the
+<a href="http://www.tpc.org/tpch/">Transaction Processing Performance
+Council</a>. The query planners in SQLite versions 3.7.17 and earlier
+do not choose good plans for TPC-H Q8. And it has been determined that
+no amount
+of tweaking of the legacy query planner will fix that. In order to find
+a good solution to the TPC-H Q8 query, and to continue improving the
+quality of SQLite's query planner, it became necessary to redesign the
+query planner. This section tries to explain why this redesign was
+necessary and how the NGQP is different and addresses the TPC-H Q8 problem.
+</p>
+
+<h2 id="_query_details"><span>3.1. </span> Query Details</h2>
+
+<p>
+TPC-H Q8 is an eight-way join.
+As observed above, the main task of the query planner is
+to figure out the best nesting order of the eight loops in order to minimize
+the work needed to complete the join.
+A simplified model of this problem for the case of TPC-H Q8 is shown
+by the following diagram:
+</p>
+
+<center>
+<div style="max-width:677px;"><svg xmlns='http://www.w3.org/2000/svg' class="pikchr" viewBox="0 0 677.503 310.031">
+<circle cx="132" cy="104" r="28.3066" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="132" y="104" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" font-size="156.25%" dominant-baseline="central">S</text>
+<circle cx="234" cy="104" r="28.3066" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="234" y="104" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" font-size="156.25%" dominant-baseline="central">L</text>
+<circle cx="336" cy="104" r="28.3066" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="336" y="104" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" font-size="156.25%" dominant-baseline="central">O</text>
+<circle cx="438" cy="104" r="28.3066" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="438" y="104" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" font-size="156.25%" dominant-baseline="central">C</text>
+<circle cx="539" cy="104" r="28.3066" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="539" y="104" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" font-size="156.25%" dominant-baseline="central">N1</text>
+<circle cx="641" cy="104" r="28.3066" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="641" y="104" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" font-size="156.25%" dominant-baseline="central">R</text>
+<circle cx="234" cy="205" r="28.3066" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="234" y="205" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" font-size="156.25%" dominant-baseline="central">P</text>
+<circle cx="30" cy="104" r="28.3066" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="30" y="104" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" font-size="156.25%" dominant-baseline="central">N2</text>
+<polygon points="104,101 91,101 94,93" style="fill:rgb(0,0,0)"/>
+<path d="M58,101 L 70,96 Q 81,92 90,95 L 98,99" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="81" y="84" text-anchor="middle" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central">6.00</text>
+<polygon points="58,106 71,106 68,114" style="fill:rgb(0,0,0)"/>
+<path d="M104,106 L 92,111 Q 81,115 72,112 L 64,108" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="81" y="123" text-anchor="middle" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central">2.08</text>
+<polygon points="205,101 193,101 196,93" style="fill:rgb(0,0,0)"/>
+<path d="M160,101 L 171,96 Q 183,92 191,95 L 200,99" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="183" y="84" text-anchor="middle" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central">9.17</text>
+<polygon points="160,106 172,106 169,114" style="fill:rgb(0,0,0)"/>
+<path d="M205,106 L 194,111 Q 183,115 174,112 L 166,108" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="183" y="123" text-anchor="middle" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central">2.30</text>
+<polygon points="307,101 295,101 298,93" style="fill:rgb(0,0,0)"/>
+<path d="M262,101 L 273,96 Q 285,92 293,95 L 302,99" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="285" y="84" text-anchor="middle" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central">2.77</text>
+<polygon points="262,106 274,106 271,114" style="fill:rgb(0,0,0)"/>
+<path d="M307,106 L 296,111 Q 285,115 276,112 L 267,108" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="285" y="123" text-anchor="middle" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central">4.03</text>
+<polygon points="409,101 397,101 400,93" style="fill:rgb(0,0,0)"/>
+<path d="M364,101 L 375,96 Q 387,92 395,95 L 404,99" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="387" y="84" text-anchor="middle" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central">2.64</text>
+<polygon points="364,106 376,106 373,114" style="fill:rgb(0,0,0)"/>
+<path d="M409,106 L 398,111 Q 387,115 378,112 L 369,108" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="387" y="123" text-anchor="middle" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central">5.30</text>
+<polygon points="511,101 499,101 502,93" style="fill:rgb(0,0,0)"/>
+<path d="M466,101 L 477,96 Q 489,92 497,95 L 506,99" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="489" y="84" text-anchor="middle" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central">2.08</text>
+<polygon points="466,106 478,106 475,114" style="fill:rgb(0,0,0)"/>
+<path d="M511,106 L 500,111 Q 489,115 480,112 L 471,108" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="489" y="123" text-anchor="middle" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central">6.40</text>
+<polygon points="613,101 601,101 604,93" style="fill:rgb(0,0,0)"/>
+<path d="M568,101 L 579,96 Q 590,92 599,95 L 608,99" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="590" y="84" text-anchor="middle" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central">1.79</text>
+<polygon points="568,106 580,106 577,114" style="fill:rgb(0,0,0)"/>
+<path d="M613,106 L 602,111 Q 590,115 582,112 L 573,108" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="590" y="123" text-anchor="middle" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central">3.47</text>
+<polygon points="237,177 237,165 245,168" style="fill:rgb(0,0,0)"/>
+<path d="M237,132 L 241,143 Q 245,155 242,163 L 239,172" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="245" y="155" text-anchor="start" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central">2.64</text>
+<polygon points="231,132 231,144 223,141" style="fill:rgb(0,0,0)"/>
+<path d="M231,177 L 227,166 Q 222,155 226,146 L 229,137" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="222" y="155" text-anchor="end" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central">6.01</text>
+<circle cx="30" cy="16" r="14.1533" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="30" y="16" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" dominant-baseline="central">*</text>
+<polygon points="30,75 26,64 34,64" style="fill:rgb(0,0,0)"/>
+<path d="M30,30L30,69" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="30" y="53" text-anchor="start" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central"> 5.52</text>
+<circle cx="132" cy="16" r="14.1533" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="132" y="16" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" dominant-baseline="central">*</text>
+<polygon points="132,75 128,64 136,64" style="fill:rgb(0,0,0)"/>
+<path d="M132,30L132,69" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="132" y="53" text-anchor="start" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central"> 9.47</text>
+<circle cx="234" cy="16" r="14.1533" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="234" y="16" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" dominant-baseline="central">*</text>
+<polygon points="234,75 229,64 238,64" style="fill:rgb(0,0,0)"/>
+<path d="M234,30L234,69" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="234" y="53" text-anchor="start" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central"> 16.40</text>
+<circle cx="336" cy="16" r="14.1533" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="336" y="16" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" dominant-baseline="central">*</text>
+<polygon points="336,75 331,64 340,64" style="fill:rgb(0,0,0)"/>
+<path d="M336,30L336,69" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="336" y="53" text-anchor="start" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central"> 13.87</text>
+<circle cx="438" cy="16" r="14.1533" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="438" y="16" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" dominant-baseline="central">*</text>
+<polygon points="438,75 433,64 442,64" style="fill:rgb(0,0,0)"/>
+<path d="M438,30L438,69" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="438" y="53" text-anchor="start" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central"> 12.56</text>
+<circle cx="539" cy="16" r="14.1533" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="539" y="16" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" dominant-baseline="central">*</text>
+<polygon points="539,75 535,64 544,64" style="fill:rgb(0,0,0)"/>
+<path d="M539,30L539,69" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="539" y="53" text-anchor="start" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central"> 5.52</text>
+<circle cx="641" cy="16" r="14.1533" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="641" y="16" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" dominant-baseline="central">*</text>
+<polygon points="641,75 637,64 646,64" style="fill:rgb(0,0,0)"/>
+<path d="M641,30L641,69" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="641" y="53" text-anchor="start" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central"> 3.56</text>
+<circle cx="234" cy="293" r="14.1533" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="234" y="293" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" dominant-baseline="central">*</text>
+<polygon points="234,234 238,245 229,245" style="fill:rgb(0,0,0)"/>
+<path d="M234,279L234,240" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="234" y="256" text-anchor="start" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central"> 7.71</text>
+</svg>
+</div>
+</center>
+
+<p>
+In the diagram, each of the 8 tables in the FROM clause of the query is
+identified by a large circle with the label of the FROM-clause term:
+N2, S, L, P, O, C, N1 and R.
+The arcs in the graph represent the estimated cost of computing each term
+assuming that the origin of the arc is in an outer loop. For example, the
+cost of running the S loop as an inner loop to L is 2.30 whereas the
+cost of running the S loop as an outer loop to L is 9.17.</p>
+
+<p>The "cost" here is logarithmic. With nested loops, the work
+is multiplied, not added. But it is customary to think of graphs
+with additive weights and so the graph shows the logarithm of the
+various costs. The graph shows a cost advantage of S being inside of
+L of about 6.87, but this translates into the query running about
+963 times faster when S loop is inside of the L loop rather than
+being outside of it.</p>
+
+<p>The arrows from the small circles labeled with "*" indicate the cost
+of running each loop with no dependencies. The outermost loop must use this
+*-cost. Inner loops have the option of using the *-cost or a cost assuming
+one of the other terms is in an outer loop, whichever gives the best
+result. One can think of the *-costs as a short-hand notation indicating
+multiple arcs, one from each of the other nodes in the
+graph. The graph is therefore "complete", meaning that there are arcs
+(some explicit and some implied) in both directions between every pair of
+nodes in the graph.</p>
+
+<p>The problem of finding the best query plan is equivalent to finding
+a minimum-cost path through the graph that visits each node
+exactly once.</p>
+
+<p>(Side note: The cost estimates in the TPC-H Q8 graph above were computed
+by the query planner in SQLite 3.7.16 and converted using a natural logarithm.)
+</p>
+
+<h2 id="_complications"><span>3.2. </span> Complications</h2>
+
+<p>The presentation of the query planner problem above is a simplification.
+The costs are estimates. We cannot
+know what the true cost of running a loop is until we actually run the loop.
+SQLite makes guesses for the cost of running a loop based on
+the availability of indexes and constraints found in the WHERE
+clause. These guesses are usually pretty good, but they can sometimes be
+off. Using the <a href="lang_analyze.html">ANALYZE</a> command to collect additional statistical
+information about the database can sometimes enable SQLite to make better
+guesses about the cost.</p>
+
+<p>The costs are comprised of multiple numbers, not a single number as
+shown in the graph.
+SQLite computes several different estimated costs for each loop that apply at
+different times. For example, there is a "setup" cost that is incurred
+just once when the query starts. The setup cost is the cost of computing
+an <a href="optoverview.html#autoindex">automatic index</a> for a table that does not already
+have an index. Then there
+is the cost of running each step of the loop. Finally, there is an estimate
+of the number rows generated by the loop, which is information needed in
+estimating the costs of inner loops. Sorting costs may come into play
+if the query has an ORDER BY clause.</p>
+
+<p>In a general query, dependencies need not be on a single loop, and hence
+the matrix of dependencies might not be representable as a graph.
+For example, one of the WHERE clause constraints might be
+S.a=L.b+P.c, implying that the S loop must be an inner loop of both
+L and P. Such dependencies cannot be drawn as a graph
+since there is no way for an arc to originate at two or more nodes at
+once.</p>
+
+<p>If the query contains an ORDER BY clause or a GROUP BY clause or if
+the query uses the DISTINCT keyword then it is advantageous to select a
+path through the graph that causes rows to naturally appear in sorted order,
+so that no separate sorting step is required. Automatic elimination of
+ORDER BY clauses
+can make a large performance difference, so this is another factor
+that needs to be considered in a complete implementation.</p>
+
+<p>In the TPC-H Q8 query, the setup costs are all negligible,
+all dependencies are between individual nodes, and there is no ORDER BY,
+GROUP BY, or DISTINCT clause. So for TPC-H Q8,
+the graph above is a reasonable representation of what needs to be computed.
+The general case involves a lot of extra complication, which for clarity
+is neglected in the remainder of this article.</p>
+
+<h2 id="_finding_the_best_query_plan"><span>3.3. </span> Finding The Best Query Plan</h2>
+
+<p>Prior to <a href="releaselog/3_8_0.html">version 3.8.0</a> (2013-08-26), SQLite always used
+the "Nearest Neighbor" or "NN" heuristic when searching for the best query plan.
+The NN heuristic makes a single traversal of the graph, always choosing
+the lowest-cost arc as the next step.
+The NN heuristic works surprisingly well in most cases.
+And NN is fast, so that SQLite is able to quickly find good plans
+for even large 64-way joins. In contrast, other SQL database engines that
+do more extensive searching tend to bog down when the
+number of tables in a join goes above 10 or 15.</p>
+
+<p>Unfortunately, the query plan computed by NN for TPC-H Q8 is not optimal.
+The plan computed using NN is R-N1-N2-S-C-O-L-P with a cost of 36.92.
+The notation
+in the previous sentence means that the R table is run in the outer loop,
+N1 is in the next inner loop, N2 is in the third loop, and so forth down
+to P which is in the inner-most loop. The shortest path through the
+graph (as found via exhaustive search) is P-L-O-C-N1-R-S-N2
+with a cost of 27.38. The difference might not seem like much, but
+remember that
+the costs are logarithmic, so the shortest path is nearly 750 times
+faster than that path found using the NN heuristic.</p>
+
+<p>One solution to this problem is to change SQLite to do an exhaustive
+search for the best path. But an exhaustive search requires time
+proportional to
+K! (where K is the number of tables in the join) and so when you get
+beyond a 10-way join, the time
+to run <a href="c3ref/prepare.html">sqlite3_prepare()</a> becomes very large.</p>
+
+<h2 id="_the_n_nearest_neighbors_or_n3_heuristic"><span>3.4. </span> The N Nearest Neighbors or "N3" Heuristic</h2>
+
+<p>The NGQP uses a new heuristic for seeking the best path through the
+graph: "N Nearest Neighbors" (hereafter "N3"). With N3, instead of
+choosing just one nearest neighbor for each step, the algorithm keeps
+track of the N bests paths at each step for some small integer N.</p>
+
+<p>Suppose N=4. Then for the TPC-H Q8 graph, the first step finds
+the four shortest paths to visit any single node in the graph:
+
+</p><blockquote>
+ R (cost: 3.56) <br>
+ N1 (cost: 5.52) <br>
+ N2 (cost: 5.52) <br>
+ P (cost: 7.71) <br>
+</blockquote>
+
+<p>The second step finds the four shortest paths to visit two nodes
+beginning with one of the four paths from the previous step. In the
+case where two or more paths are equivalent (they have the same set of
+visited nodes, though possibly in a different order) only the
+first and lowest-cost path is retained. We have:</p>
+
+<blockquote>
+ R-N1 (cost: 7.03) <br>
+ R-N2 (cost: 9.08) <br>
+ N2-N1 (cost: 11.04) <br>
+ R-P {cost: 11.27} <br>
+</blockquote>
+
+<p>The third step starts with the four shortest two-node paths and finds
+the four shortest three-node paths:</p>
+
+<blockquote>
+ R-N1-N2 (cost: 12.55) <br>
+ R-N1-C (cost: 13.43) <br>
+ R-N1-P (cost: 14.74) <br>
+ R-N2-S (cost: 15.08) <br>
+</blockquote>
+
+<p>And so forth. There are 8 nodes in the TPC-H Q8 query,
+so this process repeats a total of 8
+times. In the general case of a K-way join, the storage requirement
+is O(N) and the computation time is O(K*N), which is significantly faster
+than the O(2<small><sup>K</sup></small>) exact solution.</p>
+
+<p>But what value to choose for N? One might try N=K. This makes the
+algorithm O(K<small><sup>2</sup></small>)
+which is actually still quite efficient, since the
+maximum value of K is 64 and K rarely exceeds 10.
+But that is not enough for the TPC-H Q8
+problem. With N=8 on TPC-H Q8 the N3 algorithm finds
+the solution R-N1-C-O-L-S-N2-P with a cost of 29.78.
+That is a big improvement over NN, but it is still
+not optimal. N3 finds the optimal solution for TPC-H Q8
+when N is 10 or greater.</p>
+
+<p>The initial implementation of NGQP chooses N=1 for simple queries, N=5
+for two-way joins and N=10 for all joins with three or more tables. This
+formula for selecting N might change in subsequent releases.</p>
+
+<a name="hazards"></a>
+
+<h1 id="_hazards_of_upgrading_to_ngqp"><span>4. </span> Hazards Of Upgrading To NGQP</h1>
+
+<p><i>Update on 2018-11-24: This section was important
+when the NGQP was new. But five years have elapsed, the NGQP has been
+deployed successfully to billions of devices, and everyone has upgraded.
+The upgrade hazard has vanished.
+This section is retained for historical reference only.
+Modern reads can skip ahead to the <a href="queryplanner-ng.html#howtofix">query planner checklist</a>.</i>
+
+</p><p>For most applications, upgrading from the legacy query planner to the NGQP
+requires little thought or effort.
+Simply replace the older SQLite version with the newer version of SQLite
+and recompile and the application will run faster.
+There are no API changes nor modifications
+to compilation procedures.</p>
+
+<p>But as with any query planner change, upgrading to the NGQP does carry
+a small risk of introducing performance regressions. The problem here is
+not that the NGQP is incorrect or buggy or inferior to the legacy query
+planner. Given reliable information about the selectivity of indexes,
+the NGQP should always pick a plan that is as good or better than before.
+The problem is that some applications may be using low-quality and
+low-selectivity indexes without having run <a href="lang_analyze.html">ANALYZE</a>. The older query
+planners look at many fewer possible implementations for each query and
+so they may have stumbled over a good plan by stupid luck. The NGQP, on
+the other hand, looks at many more query plan possibilities, and it may
+choose a different query plan that
+works better in theory, assuming good indexes, but which gives a performance
+regression in practice, because of the shape of the data.</p>
+
+<p>Key points:</p>
+
+<ul>
+<li><p>The NGQP will always find an equal or better query plan, compared to
+ prior query planners, as long as it
+ has access to accurate <a href="lang_analyze.html">ANALYZE</a> data in the <a href="fileformat2.html#stat1tab">SQLITE_STAT1</a> file.</p>
+</li><li><p>The NGQP will always find a good query plan
+ as long as the schema does not contain indexes that have more than
+ about 10 or 20 rows with the same value in the left-most column of the
+ index.</p>
+</li></ul>
+
+<p>Not all applications meet these conditions. Fortunately,
+the NGQP will still usually find good query plans, even without these conditions.
+However, cases do arise (rarely) where performance regressions can occur.</p>
+
+<a name="fossilcasestudy"></a>
+
+<h2 id="_case_study_upgrading_fossil_to_the_ngqp"><span>4.1. </span> Case Study: Upgrading Fossil to the NGQP</h2>
+
+<p>The <a href="http://www.fossil-scm.org/">Fossil DVCS</a> is the version
+control system used to track all of the SQLite source code.
+A Fossil repository is an SQLite database file.
+(Readers are invited to ponder this recursion as an independent exercise.)
+Fossil is both the version-control system for SQLite and a test
+platform for SQLite. Whenever enhancements are made to SQLite,
+Fossil is one of the first applications to test and evaluate those
+enhancements. So Fossil was an early adopter of the NGQP.</p>
+
+<p>Unfortunately, the NGQP caused a
+performance regression in Fossil.</p>
+
+<p>One of the many reports that Fossil makes available is a timeline of
+changes to a single branch showing all merges in and out of that branch. See
+<a href="http://www.sqlite.org/src/timeline?nd&n=200&r=trunk">http://www.sqlite.org/src/timeline?nd&n=200&r=trunk</a>
+for a typical
+example of such a report. Generating such a report normally takes just
+a few milliseconds. But after upgrading to the NGQP we noticed that
+this one report was taking closer to 10 seconds for the trunk of the
+repository.</p>
+
+<p>The core query used to generate the branch timeline is shown below.
+(Readers are not expected to understand the details of this query.
+Commentary will follow.)</p>
+
+<blockquote><pre>
+SELECT
+ blob.rid AS blobRid,
+ uuid AS uuid,
+ datetime(event.mtime,'localtime') AS timestamp,
+ coalesce(ecomment, comment) AS comment,
+ coalesce(euser, user) AS user,
+ blob.rid IN leaf AS leaf,
+ bgcolor AS bgColor,
+ event.type AS eventType,
+ (SELECT group_concat(substr(tagname,5), ', ')
+ FROM tag, tagxref
+ WHERE tagname GLOB 'sym-*'
+ AND tag.tagid=tagxref.tagid
+ AND tagxref.rid=blob.rid
+ AND tagxref.tagtype>0) AS tags,
+ tagid AS tagid,
+ brief AS brief,
+ event.mtime AS mtime
+ FROM event CROSS JOIN blob
+ WHERE blob.rid=event.objid
+ AND (EXISTS(SELECT 1 FROM tagxref
+ WHERE tagid=11 AND tagtype>0 AND rid=blob.rid)
+ OR EXISTS(SELECT 1 FROM plink JOIN tagxref ON rid=cid
+ WHERE tagid=11 AND tagtype>0 AND pid=blob.rid)
+ OR EXISTS(SELECT 1 FROM plink JOIN tagxref ON rid=pid
+ WHERE tagid=11 AND tagtype>0 AND cid=blob.rid))
+ ORDER BY event.mtime DESC
+ LIMIT 200;
+</pre></blockquote>
+
+<p>This query is not
+especially complicated, but even so it replaces hundreds or
+perhaps thousands of lines of procedural code.
+The gist of the query is this: Scan down the EVENT table looking
+for the most recent 200 check-ins that satisfy any one of three conditions:
+
+</p><p></p><ol>
+<li> The check-in has a "trunk" tag.
+</li><li> The check-in has a child that has a "trunk" tag.
+</li><li> The check-in has a parent that has a "trunk" tag.
+</li></ol>
+
+<p>
+The first condition causes all of the trunk check-ins to be displayed and
+the second and third cause check-ins that merge into or fork from
+the trunk to also be included.
+The three conditions are implemented by the three OR-connected
+EXISTS statements in the WHERE clause of the query.
+The slowdown that occurred with the NGQP was caused by the second and
+third conditions. The problem is the same in each, so we will examine
+just the second one.
+The subquery of the second condition can be rewritten (with minor
+and immaterial simplifications) as follows:</p>
+
+<blockquote><pre>
+SELECT 1
+ FROM plink JOIN tagxref ON tagxref.rid=plink.cid
+ WHERE tagxref.tagid=$trunk
+ AND plink.pid=$ckid;
+</pre></blockquote>
+
+<p>The PLINK table holds parent-child relationships between
+check-ins. The TAGXREF table maps tags into check-ins.
+For reference, the relevant portions of the schemas
+for these two tables is shown here:</p>
+
+<blockquote><pre>
+CREATE TABLE plink(
+ pid INTEGER REFERENCES blob,
+ cid INTEGER REFERENCES blob
+);
+CREATE UNIQUE INDEX plink_i1 ON plink(pid,cid);
+
+CREATE TABLE tagxref(
+ tagid INTEGER REFERENCES tag,
+ mtime TIMESTAMP,
+ rid INTEGER REFERENCE blob,
+ UNIQUE(rid, tagid)
+);
+CREATE INDEX tagxref_i1 ON tagxref(tagid, mtime);
+</pre></blockquote>
+
+<p>There are only two reasonable ways to implement this query.
+(There are many other possible algorithms, but none of the
+others are contenders for being the "best" algorithm.)</p>
+
+<ol type="1">
+<li value="1"><p>
+Find all children of check-in $ckid and test each one to see if
+it has the $trunk tag.
+</p></li><li value="2"><p>
+Find all check-ins with the $trunk tag and test each one to see if
+it is a child of $ckid.
+</p></li></ol>
+
+<p>
+Intuitively, we humans understand that algorithm-1 is best.
+Each check-in is likely to have few children (one child is
+the most common case) and each child can be tested for the
+$trunk tag in logarithmic time. Indeed, algorithm-1 is the
+faster choice in practice. But the NGQP has no intuition. The
+NGQP must use hard math, and algorithm-2 is slightly
+better mathematically. This is because, in the absence of other information,
+the NGQP must assume that the indexes PLINK_I1 and TAGXREF_I1 are of
+equal quality and are equally selective. Algorithm-2 uses one field
+of the TAGXREF_I1 index and both fields of the PLINK_I1 index whereas
+algorithm-1 only uses the first field of each index. Since
+algorithm-2 uses more index material, the NGQP is correct
+to judge it to be the better algorithm. The scores are close and
+algorithm-2 just barely squeaks ahead of algorithm-1. But
+algorithm-2 really is the correct choice here.
+</p>
+
+<p>
+Unfortunately, algorithm-2 is slower than algorithm-1 in
+this application.
+</p>
+
+<p>
+The problem is that the indexes are not of equal quality.
+A check-in is likely to only have one child. So the first
+field of PLINK_I1 will usually narrow down the search to just a single
+row. But there are thousands and thousands check-ins tagged with "trunk",
+so the first field of TAGXREF_I1 will be
+of little help in narrowing down the search.
+</p>
+
+<p>
+The NGQP has no way of knowing that TAGXREF_I1 is almost useless in this
+query, unless <a href="lang_analyze.html">ANALYZE</a> has been run on the database. The <a href="lang_analyze.html">ANALYZE</a> command
+gathers statistics on the quality of the various indexes and stores those
+statistics in <a href="fileformat2.html#stat1tab">SQLITE_STAT1</a> table.
+Having access to this statistical information,
+the NGQP easily chooses algorithm-1 as the best algorithm, by a wide
+margin.</p>
+
+<p>Why didn't the legacy query planner choose algorithm-2?
+Easy: because the NN algorithm
+never even considered algorithm-2. Graphs of the planning
+problem look like this:</p>
+
+<center>
+<div style="max-width:523px;"><svg xmlns='http://www.w3.org/2000/svg' class="pikchr" viewBox="0 0 523.232 186.76">
+<circle cx="55" cy="104" r="28.3066" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="55" y="104" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" font-size="156.25%" dominant-baseline="central">P</text>
+<circle cx="157" cy="104" r="28.3066" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="157" y="104" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" font-size="156.25%" dominant-baseline="central">T</text>
+<polygon points="129,101 117,101 120,93" style="fill:rgb(0,0,0)"/>
+<path d="M84,101 L 95,96 Q 106,92 115,95 L 124,99" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="106" y="84" text-anchor="middle" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central">4.8</text>
+<polygon points="84,106 96,106 93,114" style="fill:rgb(0,0,0)"/>
+<path d="M129,106 L 118,111 Q 106,115 98,112 L 89,108" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="106" y="123" text-anchor="middle" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central">4.4</text>
+<circle cx="55" cy="16" r="14.1533" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="55" y="16" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" dominant-baseline="central">*</text>
+<polygon points="55,75 51,64 60,64" style="fill:rgb(0,0,0)"/>
+<path d="M55,30L55,69" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="55" y="53" text-anchor="start" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central"> 4.9</text>
+<circle cx="157" cy="16" r="14.1533" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="157" y="16" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" dominant-baseline="central">*</text>
+<polygon points="157,75 153,64 162,64" style="fill:rgb(0,0,0)"/>
+<path d="M157,30L157,69" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="157" y="53" text-anchor="start" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central"> 5.2</text>
+<text x="106" y="171" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" font-size="125%" dominant-baseline="central">without ANALYZE</text>
+<circle cx="384" cy="104" r="28.3066" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="384" y="104" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" font-size="156.25%" dominant-baseline="central">P</text>
+<circle cx="486" cy="104" r="28.3066" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="486" y="104" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" font-size="156.25%" dominant-baseline="central">T</text>
+<polygon points="457,101 445,101 448,93" style="fill:rgb(0,0,0)"/>
+<path d="M412,101 L 423,96 Q 435,92 443,95 L 452,99" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="435" y="84" text-anchor="middle" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central">4.4</text>
+<polygon points="412,106 424,106 421,114" style="fill:rgb(0,0,0)"/>
+<path d="M457,106 L 446,111 Q 435,115 426,112 L 417,108" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="435" y="123" text-anchor="middle" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central">3.8</text>
+<circle cx="384" cy="16" r="14.1533" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="384" y="16" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" dominant-baseline="central">*</text>
+<polygon points="384,75 379,64 388,64" style="fill:rgb(0,0,0)"/>
+<path d="M384,30L384,69" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="384" y="53" text-anchor="start" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central"> 3.9</text>
+<circle cx="486" cy="16" r="14.1533" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="486" y="16" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" dominant-baseline="central">*</text>
+<polygon points="486,75 481,64 490,64" style="fill:rgb(0,0,0)"/>
+<path d="M486,30L486,69" style="fill:none;stroke-width:2.16;stroke:rgb(0,0,0);" />
+<text x="486" y="53" text-anchor="start" fill="rgb(0,0,0)" font-size="80%" dominant-baseline="central"> 6.1</text>
+<text x="435" y="171" text-anchor="middle" font-weight="bold" fill="rgb(0,0,0)" font-size="125%" dominant-baseline="central">with ANALYZE</text>
+</svg>
+</div>
+</center>
+
+<p>
+In the "without ANALYZE" case on the left, the NN algorithm chooses
+loop P (PLINK) as the outer loop because 4.9 is less than 5.2, resulting
+in path P-T which is algorithm-1. NN only looks at the single best choice
+at each step so it completely misses the fact that
+5.2+4.4 makes a slightly cheaper plan than 4.9+4.8. But the N3 algorithm
+keeps track of the 5 best paths for a 2-way join, so it ends up
+selecting path T-P because of its slightly lower overall cost.
+Path T-P is algorithm-2.
+</p>
+
+<p>
+Note that with ANALYZE the cost estimates are
+better aligned with reality and algorithm-1 is
+selected by both NN and N3.
+</p>
+
+<p>(Side note: The costs estimates in the two most recent graphs
+were computed by the NGQP using a base-2 logarithm and slightly different
+cost assumptions compared to the legacy query planner.
+Hence, the cost estimates in
+these latter two graphs are not directly comparable to the cost estimates
+in the TPC-H Q8 graph.)</p>
+
+<h2 id="_fixing_the_problem"><span>4.2. </span> Fixing The Problem</h2>
+
+<p>Running <a href="lang_analyze.html">ANALYZE</a> on the repository database immediately fixed the
+performance problem. However, we want Fossil to be robust and to always
+work quickly regardless of whether or not its repository has been analyzed.
+For this reason, the query was modified to use the CROSS JOIN operator
+instead of the plain JOIN operator.
+SQLite will not reorder the tables of a CROSS JOIN.
+This is a long-standing feature of SQLite that is specifically designed
+to allow knowledgeable programmers
+to enforce a particular loop nesting order. Once the join
+was changed to CROSS JOIN (the addition of a single keyword) the NGQP was
+forced to choose the faster algorithm-1 regardless of whether or not
+statistical information had been gathered using ANALYZE.</p>
+
+<p>We say that algorithm-1 is "faster", but this
+is not strictly true. Algorithm-1 is faster in common repositories, but
+it is possible to construct a repository in which
+every check-in is on a different uniquely-named branch and
+all check-ins are children of the root check-in.
+In that case, TAGXREF_I1 would become more selective
+than PLINK_I1 and algorithm-2 really would be the faster choice.
+However such repositories are very unlikely to appear in
+practice and so hard-coding the loop nested order using the
+CROSS JOIN syntax is a reasonable solution
+to the problem in this case.</p>
+
+<h2 id="update_2017_a_better_fix"><span>4.3. </span>Update 2017: A Better Fix</h2>
+
+<p>The prior text was written in early 2013, before the first release of
+SQLite version 3.8.0. This paragraph was added in mid 2021.
+While all of the previous discussion remains true, a lot of improvements
+have been made to the query planner, making this whole section largely moot.
+
+</p><p>In 2017, Fossil was enhanced to make use of the new
+<a href="pragma.html#pragma_optimize">PRAGMA optimize</a> statement. Whenever Fossil is about to close the
+database connection to its repository, it first runs
+"PRAGMA optimize", which will in turn cause ANALYZE to be run if it
+is needed. Usually an ANALYZE is not needed, and so there is no
+measurable performance penalty for doing this. But every now and
+then ANALYZE might be run on a few of the tables in the repository
+database. Because if this, query planning issues such as the one
+described here no longer come up in Fossil. The fact that ANALYZE
+is run periodically to keep the <a href="fileformat2.html#stat1tab">sqlite_stat1</a> table up-to-date means
+that hand-tuning of queries is no longer required. We have not had
+to tweak a query in Fossil in ages.
+
+</p><p>Therefore, the current recommendation for avoiding problems such
+as this one is to simply run "PRAGMA optimize" (possibly preceded by
+"<a href="pragma.html#pragma_analysis_limit">PRAGMA analysis_limit=200</a>") just prior to closing each database
+connection. The CROSS JOIN hack is still available, but if you keep
+the query planner statistics in the <a href="fileformat2.html#stat1tab">sqlite_stat1</a> table up-to-date,
+it usually won't be necessary.
+
+<a name="howtofix"></a>
+
+</p><h1 id="_checklist_for_avoiding_or_fixing_query_planner_problems"><span>5. </span> Checklist For Avoiding Or Fixing Query Planner Problems</h1>
+
+<ol>
+<li><p><b>Don't panic!</b>
+Cases where the query planner picks an inferior plan are actually quite
+rare. You are unlikely to run across any problems in your application.
+If you are not having performance issues, you do not need to worry
+about any of this.</p>
+
+</li><li><p><b>Create appropriate indexes.</b>
+Most SQL performance problems arise not because of query planner issues
+but rather due to lack of appropriate indexes. Make sure indexes are
+available to assist all large queries. Most performance issues can be
+resolved by one or two CREATE INDEX commands and with no changes to
+application code.</p>
+
+</li><li><p><b>Avoid creating low-quality indexes.</b>.
+A low-quality index (for the purpose of this checklist) is one where
+there are more than 10 or 20 rows in the table that have the same value
+for the left-most column of the index. In particular, avoid using
+boolean or "enum" columns as the left-most columns of your indexes.</p>
+
+<p>The Fossil performance problem described in the previous section of
+this document arose because there were over
+ten-thousand entries in the TAGXREF table with the same value for the
+left-most column (the TAGID column) of the TAGXREF_I1 index.</p>
+
+</li><li><p><b>If you must use a low-quality index, be sure to run <a href="lang_analyze.html">ANALYZE</a>.</b>
+Low-quality indexes will not confuse the query planner as long as the
+query planner knows that the indexes are of low quality. And the way
+the query planner knows this is by the content of the <a href="fileformat2.html#stat1tab">SQLITE_STAT1</a> table,
+which is computed by the ANALYZE command.</p>
+
+<p>Of course, ANALYZE only works effectively if you have a significant
+amount of content in your database in the first place. When creating a
+new database that you expect to accumulate a lot of data, you can run
+the command "ANALYZE sqlite_schema" to create the SQLITE_STAT1 table,
+then prepopulate the <a href="fileformat2.html#stat1tab">sqlite_stat1</a> table (using ordinary INSERT statements)
+with content that describes a typical
+database for your application - perhaps content that you extracted after
+running ANALYZE on a well-populated template database in the lab.
+Or, you could just run "<a href="pragma.html#pragma_optimize">PRAGMA optimize</a>" before shutting down
+database connections so that ANALYZE will be run automatically as
+needed to keep the <a href="fileformat2.html#stat1tab">sqlite_stat1</a> table current.</p></li>
+
+<li><p><b>Instrument your code.</b>
+Add logic that lets you know quickly and easily which queries are taking
+too much time. Then work on just those specific queries.</p>
+
+</li><li><p><b>Use <a href="lang_corefunc.html#unlikely">unlikely()</a> and <a href="lang_corefunc.html#likelihood">likelihood()</a> SQL functions.</b>
+SQLite normally assumes that terms in the WHERE clause that cannot be used
+by indexes have a strong probability of being true. If this assumption
+is incorrect, it could lead to a suboptimal query plan. The
+<a href="lang_corefunc.html#unlikely">unlikely()</a> and <a href="lang_corefunc.html#likelihood">likelihood()</a> SQL functions can be used to provide
+hints to the query planner about WHERE clause terms that are probably
+not true, and thus aid the query planner in selecting the best possible
+plan.
+
+</p></li><li><p><b>Use the <a href="optoverview.html#crossjoin">CROSS JOIN</a> syntax to enforce a particular
+loop nesting order on queries that might use low-quality indexes in an
+unanalyzed database.</b>
+SQLite <a href="lang_select.html#crossjoin">treats the CROSS JOIN operator specially</a>, forcing the table to
+the left to be an outer loop relative to the table on the right.</p>
+
+<p>Avoid this step if possible, as it defeats one of the huge advantages
+of the whole SQL language concept, specifically that the application
+programmer does not need to get involved with query planning. If you
+do use CROSS JOIN, wait until late in your development cycle to do so,
+and comment the use of CROSS JOIN carefully so that you can take it out
+later if possible. Avoid using CROSS JOIN early in the development
+cycle as doing so is a premature optimization, which is well known to
+be <a href="http://c2.com/cgi/wiki?PrematureOptimization">the root of
+all evil</a>.</p>
+
+</li><li><p><b>Use unary "+" operators to disqualify WHERE clause terms.</b>
+If the query planner insists on selecting a poor-quality index for a particular
+query when a much higher-quality index is available, then
+<a href="optoverview.html#uplus">careful use of unary "+" operators</a> in the WHERE clause
+can force the query planner away from the poor-quality index.
+Avoid using this trick if at all possible, and especially avoid it
+early in the application development cycle. Beware that
+adding a unary "+" operator to an equality expression might change
+the result of that expression if
+<a href="datatype3.html#affinity">type affinity</a> is involved.</p>
+
+</li><li><p><b>Use the <a href="lang_indexedby.html">INDEXED BY</a> syntax to enforce the selection of
+particular indexes on problem queries.</b>
+As with the previous two bullets, avoid this step if possible, and
+especially avoid doing this early in development as it is clearly a
+premature optimization.</p>
+</li></ol>
+
+<h1 id="_summary"><span>6. </span> Summary</h1>
+
+<p>The query planner in SQLite normally does a terrific job of selecting
+fast algorithms for running your SQL statements. This is true of the
+legacy query planner and even more true of the new NGQP. There may be
+an occasional situation where, due to incomplete information, the query
+planner selects a suboptimal plan. This will happen less often with the
+NGQP than with the legacy query planner, but it might still happen. Only
+in those rare cases do application developers need to get involved and
+help the query planner to do the right thing. In the common case, the
+NGQP is just a new enhancement to SQLite that makes the application run
+a little faster and which requires no new developer thought or action.</p>
+<p align="center"><small><i>This page last modified on <a href="https://sqlite.org/docsrc/honeypot" id="mtimelink" data-href="https://sqlite.org/docsrc/finfo/pages/queryplanner-ng.in?m=0789a542ae0d36ce1">2022-09-19 12:27:16</a> UTC </small></i></p>
+