summaryrefslogtreecommitdiffstats
path: root/www/whybytecode.html
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--www/whybytecode.html435
1 files changed, 435 insertions, 0 deletions
diff --git a/www/whybytecode.html b/www/whybytecode.html
new file mode 100644
index 0000000..98da8b3
--- /dev/null
+++ b/www/whybytecode.html
@@ -0,0 +1,435 @@
+<!DOCTYPE html>
+<html><head>
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<meta http-equiv="content-type" content="text/html; charset=UTF-8">
+<link href="sqlite.css" rel="stylesheet">
+<title>Why SQLite Uses Bytecode</title>
+<!-- path= -->
+</head>
+<body>
+<div class=nosearch>
+<a href="index.html">
+<img class="logo" src="images/sqlite370_banner.gif" alt="SQLite" border="0">
+</a>
+<div><!-- IE hack to prevent disappearing logo --></div>
+<div class="tagline desktoponly">
+Small. Fast. Reliable.<br>Choose any three.
+</div>
+<div class="menu mainmenu">
+<ul>
+<li><a href="index.html">Home</a>
+<li class='mobileonly'><a href="javascript:void(0)" onclick='toggle_div("submenu")'>Menu</a>
+<li class='wideonly'><a href='about.html'>About</a>
+<li class='desktoponly'><a href="docs.html">Documentation</a>
+<li class='desktoponly'><a href="download.html">Download</a>
+<li class='wideonly'><a href='copyright.html'>License</a>
+<li class='desktoponly'><a href="support.html">Support</a>
+<li class='desktoponly'><a href="prosupport.html">Purchase</a>
+<li class='search' id='search_menubutton'>
+<a href="javascript:void(0)" onclick='toggle_search()'>Search</a>
+</ul>
+</div>
+<div class="menu submenu" id="submenu">
+<ul>
+<li><a href='about.html'>About</a>
+<li><a href='docs.html'>Documentation</a>
+<li><a href='download.html'>Download</a>
+<li><a href='support.html'>Support</a>
+<li><a href='prosupport.html'>Purchase</a>
+</ul>
+</div>
+<div class="searchmenu" id="searchmenu">
+<form method="GET" action="search">
+<select name="s" id="searchtype">
+<option value="d">Search Documentation</option>
+<option value="c">Search Changelog</option>
+</select>
+<input type="text" name="q" id="searchbox" value="">
+<input type="submit" value="Go">
+</form>
+</div>
+</div>
+<script>
+function toggle_div(nm) {
+var w = document.getElementById(nm);
+if( w.style.display=="block" ){
+w.style.display = "none";
+}else{
+w.style.display = "block";
+}
+}
+function toggle_search() {
+var w = document.getElementById("searchmenu");
+if( w.style.display=="block" ){
+w.style.display = "none";
+} else {
+w.style.display = "block";
+setTimeout(function(){
+document.getElementById("searchbox").focus()
+}, 30);
+}
+}
+function div_off(nm){document.getElementById(nm).style.display="none";}
+window.onbeforeunload = function(e){div_off("submenu");}
+/* Disable the Search feature if we are not operating from CGI, since */
+/* Search is accomplished using CGI and will not work without it. */
+if( !location.origin || !location.origin.match || !location.origin.match(/http/) ){
+document.getElementById("search_menubutton").style.display = "none";
+}
+/* Used by the Hide/Show button beside syntax diagrams, to toggle the */
+function hideorshow(btn,obj){
+var x = document.getElementById(obj);
+var b = document.getElementById(btn);
+if( x.style.display!='none' ){
+x.style.display = 'none';
+b.innerHTML='show';
+}else{
+x.style.display = '';
+b.innerHTML='hide';
+}
+return false;
+}
+var antiRobot = 0;
+function antiRobotGo(){
+if( antiRobot!=3 ) return;
+antiRobot = 7;
+var j = document.getElementById("mtimelink");
+if(j && j.hasAttribute("data-href")) j.href=j.getAttribute("data-href");
+}
+function antiRobotDefense(){
+document.body.onmousedown=function(){
+antiRobot |= 2;
+antiRobotGo();
+document.body.onmousedown=null;
+}
+document.body.onmousemove=function(){
+antiRobot |= 2;
+antiRobotGo();
+document.body.onmousemove=null;
+}
+setTimeout(function(){
+antiRobot |= 1;
+antiRobotGo();
+}, 100)
+antiRobotGo();
+}
+antiRobotDefense();
+</script>
+<div class=fancy>
+<div class=nosearch>
+<div class="fancy_title">
+Why SQLite Uses Bytecode
+</div>
+<div class="fancy_toc">
+<a onclick="toggle_toc()">
+<span class="fancy_toc_mark" id="toc_mk">&#x25ba;</span>
+Table Of Contents
+</a>
+<div id="toc_sub"><div class="fancy-toc1"><a href="#introduction">1. Introduction</a></div>
+<div class="fancy-toc2"><a href="#how_to_provide_feedback">1.1. How To Provide Feedback</a></div>
+<div class="fancy-toc2"><a href="#definition_of_bytecode_">1.2. Definition Of "Bytecode"</a></div>
+<div class="fancy-toc2"><a href="#definition_of_abstract_syntax_tree_or_ast_">1.3. Definition Of "Abstract Syntax Tree" or "AST"</a></div>
+<div class="fancy-toc2"><a href="#dataflow_programming">1.4. Dataflow Programming</a></div>
+<div class="fancy-toc1"><a href="#advantages_to_compiling_into_bytecode">2. Advantages To Compiling Into Bytecode</a></div>
+<div class="fancy-toc2"><a href="#bytecode_is_easier_to_understand">2.1. Bytecode Is Easier To Understand</a></div>
+<div class="fancy-toc2"><a href="#bytecode_is_easier_to_debug">2.2. Bytecode Is Easier To Debug</a></div>
+<div class="fancy-toc2"><a href="#bytecode_can_be_run_incrementally">2.3. Bytecode Can Be Run Incrementally</a></div>
+<div class="fancy-toc2"><a href="#bytecode_is_smaller">2.4. Bytecode Is Smaller</a></div>
+<div class="fancy-toc2"><a href="#bytecode_is_faster">2.5. Bytecode Is Faster</a></div>
+<div class="fancy-toc1"><a href="#advantages_of_compiling_into_a_tree_of_objects">3. Advantages Of Compiling Into A Tree Of Objects</a></div>
+<div class="fancy-toc2"><a href="#query_planning_decisions_can_be_deferred_until_runtime">3.1. Query Planning Decisions Can Be Deferred Until Runtime</a></div>
+<div class="fancy-toc2"><a href="#dataflow_programs_are_easy_to_parallelize">3.2. Dataflow Programs Are Easy To Parallelize</a></div>
+</div>
+</div>
+<script>
+function toggle_toc(){
+var sub = document.getElementById("toc_sub")
+var mk = document.getElementById("toc_mk")
+if( sub.style.display!="block" ){
+sub.style.display = "block";
+mk.innerHTML = "&#x25bc;";
+} else {
+sub.style.display = "none";
+mk.innerHTML = "&#x25ba;";
+}
+}
+</script>
+</div>
+
+
+
+
+
+<h1 id="introduction"><span>1. </span>Introduction</h1>
+
+<p>Every SQL database engine works in roughly the same way: It first
+translates the input SQL text into a "prepared statement". Then it "executes"
+the prepared statement to generate a result.
+
+</p><p>A prepared statement is an object that represents the steps needed
+to accomplish the input SQL. Or, to think of it in another way,
+the prepared statement is the SQL statement translated into a form that is
+more easily understood by the computer.
+
+</p><p>In SQLite, a prepared statement is an instance of the
+<a href="c3ref/stmt.html">sqlite3_stmt object</a>. In other systems, the prepared
+statement is usually an internal data structure that is not directly visible to
+the application programmer. Developers of other SQL database engines
+do not necessarily call these objects "prepared statements".
+But such objects exists, whatever they might be called.
+This paper will use the term "prepared statement".
+
+</p><p>There are countless ways of implementing a prepared statement. This
+paper will look at the two most common methods:
+
+</p><ol>
+<li><p>
+<b>Bytecode</b> &rarr; The input SQL is translated into a virtual machine language
+that is then run by a virtual machine interpreter. This is the technique
+used by SQLite.
+
+</p></li><li><p>
+<b>Tree-Of-Objects</b> &rarr; The input SQL is translated in a tree of objects
+that represent the processing to be done. The SQL is executed by walking this
+tree. This is the technique used by MySQL and PostgreSQL.
+</p></li></ol>
+
+<p>
+There are advantages and disadvantages to each of these representations of
+a prepared statement. The purpose of this paper is to articulate some of those
+advantages and disadvantages.
+
+</p><h2 id="how_to_provide_feedback"><span>1.1. </span>How To Provide Feedback</h2>
+
+<p>
+This document is written from the perspective of the original author of SQLite.
+If you disagree with any of the opinions offered in this document, you are
+welcomed to offer corrections and/or contrary views on the
+<a href="https://sqlite.org/forum">SQLite Forum</a>. Or you can email the author
+directly.
+
+</p><h2 id="definition_of_bytecode_"><span>1.2. </span>Definition Of "Bytecode"</h2>
+
+<p>The <a href="opcode.html">bytecode generated by SQLite</a> might be a little different from what
+many readers think of as bytecode. The bytecode used (for example) by the
+<a href="https://en.wikipedia.org/wiki/Java_virtual_machine">Java virtual machine</a> or
+by <a href="https://en.wikipedia.org/wiki/WebAssembly">WebAssembly</a> consists almost
+entirely of low-level operations, similar to what physical CPUs implement:
+basic math operators, comparisons, conditional jumps, and
+instructions to move content between different memory locations. SQLite bytecode
+has these kinds of low-level instructions, too. But SQLite bytecode also contains
+some high-level operations that are specific to the needs of a database engine.
+Here are just a few examples:
+
+</p><p>
+</p><ul>
+<li><p><b>OP_Column</b> &rarr; Extract the value from the N-th column of the
+database row that a particular cursor is currently pointing at.
+</p></li><li><p><b>OP_CreateBtree</b> &rarr; Allocate space for a new B-Tree in the
+database file.
+</p></li><li><p><b>OP_ParseSchema</b> &rarr; Reread and reparse all or part of the
+<a href="schematab.html">sqlite_schema table</a> and refresh internal symbol tables accordingly.
+</p></li><li><p><b>OP_SeekGE</b> &rarr; Move a cursor on a particular B-Tree to the first
+entry that is greater than or equal to a given key.
+</p></li><li><p><b>OP_Next</b> &rarr; Advance a cursor on a particular B-Tree to the next
+entry in the B-Tree and jump, or fall through if there are no more entries in that
+B-Tree.
+</p></li></ul>
+
+<p>In other words, the "bytecode" used by SQLite is not so much a set of CPU
+instructions as it is a list of database primitives that are to be run
+in a particular order.
+
+</p><h2 id="definition_of_abstract_syntax_tree_or_ast_"><span>1.3. </span>Definition Of "Abstract Syntax Tree" or "AST"</h2>
+
+<p>
+An "<a href="https://en.wikipedia.org/wiki/Abstract_syntax_tree">Abstract Syntax Tree</a>" or AST
+is a data structure that describes a program or statement in some kind of formal
+language. In our context, the formal language is SQL. An AST is typically
+implemented as a tree of objects where each object represents one small part of
+the overall SQL statement. ASTs emerge naturally from parsers for formal languages.
+The usual technique is to use an
+<a href="https://en.wikipedia.org/wiki/LALR_parser">LALR(1) parser</a>.
+With such a parser, each terminal
+symbol holds metadata that will become a leaf of the AST, and each non-terminal
+symbol holds metadata that will become a sub-branch of the overall AST.
+As rules of the grammar are "reduced" by the parser, new nodes of the AST are
+allocated and connected to subnodes.
+After the parse completes, the start-symbol of the grammar is left holding the
+root of the AST.
+
+</p><p>An AST is a tree of objects. But an AST is not a suitable form for
+a prepared statement. After being generated, an AST first needs to be
+transformed in various ways before it can executed. Symbols need to be
+resolved. Semantic rules need to be checked. Optimizations need to be
+applied that transform input SQL statement into different forms that
+execute more quickly. Finally, the AST needs to be translated into an
+alternative representation that is more amenable to execution.
+
+</p><p>Some people refer to the tree of objects that is used as the
+executable form for MySQL and PostgreSQL as an AST. This is probably a
+misuse of the term "AST", because by the time the tree of objects is
+ready to be executed, it has been changed so much that it has little
+resemblance to the original SQL text. The confusion arises in part because
+both the final prepared statement object and the original AST are both
+trees of objects. The usual technique is for the original AST that comes
+directly out of the parser to be transformed little by little, in multiple
+passes, until at the end it is fully converted into an tree of objects
+that is no longer strictly an AST but that can be evaluated to
+generate a result. There is not necessarily a clear point during this
+process when the tree-of-objects ceases to be an AST and becomes a
+prepared statement instead. And because there is no clear boundary between an
+AST and a prepared statement, people often refer to a prepared statement
+that is represented as a tree of objects as an "AST", even though that
+description is not precise.
+
+</p><h2 id="dataflow_programming"><span>1.4. </span>Dataflow Programming</h2>
+
+<p><a href="https://en.wikipedia.org/wiki/Dataflow_programming">Dataflow programming</a>
+is a style of programming in which individual nodes specialize in doing
+one small part of the overall computation. Each node receives inputs from
+other nodes and sends its output to other nodes. Thus the nodes form a
+directed graph that carry inputs into outputs.
+
+</p><p>
+A "dataflow program" is perhaps a better description than "AST" for
+the tree of objects that an SQL database engine uses as a prepared statement.
+
+</p><h1 id="advantages_to_compiling_into_bytecode"><span>2. </span>Advantages To Compiling Into Bytecode</h1>
+
+<p>SQLite compiles to bytecode, and the SQLite developers are very happy
+with this approach. Here is why:
+
+</p><h2 id="bytecode_is_easier_to_understand"><span>2.1. </span>Bytecode Is Easier To Understand</h2>
+
+<p>A flat list of opcodes can be easily printed to see exactly
+how an SQL statement is being implemented. This is what happens in SQLite
+when you preface an SQL statement with the "EXPLAIN" keyword: Instead of
+actually running the SQL, the result is a listing of the bytecode
+that would have been used to implement that SQL.
+
+</p><p>Bytecode lends itself to this because a bytecode program is easily
+represented as a table. In SQLite bytecode, each instruction
+has one opcode and five operands. Thus a prepared statement can be
+rendered as if it were a query against a six-column table.
+
+</p><p>A tree-of-objects representation is more difficult to publish in
+a human-readable form. The objects that comprise the tree tend to
+all be very different, and thus it is tricky to come up with a
+consistent and simple table representation with which to display
+the objects. Any such table representation that you do come up
+with would almost certainly have more than six columns, probably many more.
+The problem of rendering a tree-of-objects as a table is sufficiently
+difficult that nobody does it, as far as I know. Hence, no
+tree-of-objects database engine provides the level
+of detail in their "EXPLAIN" output that SQLite provides.
+
+</p><h2 id="bytecode_is_easier_to_debug"><span>2.2. </span>Bytecode Is Easier To Debug</h2>
+
+<p>Bytecode provides a clear separation between front-end parsing and
+analysis and back-end evaluation of an SQL statement. When problems arise
+(incorrect answers and/or poor performance), the developers can examine
+the bytecode to quickly determine if the source of the trouble is either the
+front-end analysis or the back-end data storage section of the product.
+
+</p><p>In debugging builds of SQLite, the <a href="pragma.html#pragma_vdbe_trace">PRAGMA vdbe_trace=ON;</a> command will
+cause a trace of the bytecode execution to appear on the console.
+
+</p><h2 id="bytecode_can_be_run_incrementally"><span>2.3. </span>Bytecode Can Be Run Incrementally</h2>
+
+<p>
+SQL statements written in bytecode can be evaluated incrementally.
+For example, a statement can be run until it generates just its first
+row of output. The statement then pauses until it is stepped again.
+It is not necessary to run the statement to completion before examining
+the first row of output.
+
+</p><p>
+This is more difficult to achieve in a tree-of-objects design. When
+the prepared statement is a tree-of-objects, execution is normally
+accomplished by walking the tree. To pause the statement in the middle
+of a computation means unwinding the stack back up to the caller, all
+the while saving enough state to resume evaluation where it last left
+off. This is not impossible to do, but is sufficiently difficult that
+I have never seen it actually done.
+
+</p><p>
+Most SQL database engines do not really need to do incremental
+execution of prepared statements because most SQL database engines
+are client/server. In client/server engines, a single SQL statement is sent
+to the server, then the complete reply comes back over the wire
+all at once. Thus each statement runs to completion in a single go.
+But SQLite is not client/server. SQLite is a library
+that runs in the same address space and using the same stack as the
+application. Being able to easily and reliably perform incremental
+execution of an SQL statement is important to SQLite.
+
+</p><h2 id="bytecode_is_smaller"><span>2.4. </span>Bytecode Is Smaller</h2>
+
+<p>
+The bytecode generated by SQLite is usually smaller than the corresponding
+AST coming out of the parser. During initial processing of SQL text
+(during the call to <a href="c3ref/prepare.html">sqlite3_prepare()</a> and similar) both the AST and the
+bytecode exist in memory at the same time, so more memory is used then.
+But that is a transient state. The AST
+is quickly discarded and its memory recycled, even before the call to
+<a href="c3ref/prepare.html">sqlite3_prepare()</a> returns, so the resulting <a href="c3ref/stmt.html">prepared statement</a> ends
+up consuming less memory in its bytecode representation than it did as an AST.
+This is important because calls to <a href="c3ref/prepare.html">sqlite3_prepare()</a> are transient, but
+prepared statements are often cached for possible reuse and persist in memory
+for a long time.
+
+</p><h2 id="bytecode_is_faster"><span>2.5. </span>Bytecode Is Faster</h2>
+
+<p>
+I <i>believe</i> that a bytecode representation of a prepared statement
+runs faster, because fewer decisions need to be made for each step of
+the computation. Emphasis on "believe" in the previous sentence
+&rarr; it is difficult to verify
+this claim experimentally since nobody has ever put in the multiple years
+of effort necessary to generate equivalent bytecode and tree-of-object
+representations of a prepared statement to see which one actually runs faster.
+We do know that <a href="fasterthanfs.html">SQLite is very fast</a>, but we
+do not have good side-by-side comparisons with other SQL databases since
+the other databases spend a lot of time doing client/server message processing,
+and it is difficult to untangle the message round-trip overhead from the
+actual processing time.
+
+</p><h1 id="advantages_of_compiling_into_a_tree_of_objects"><span>3. </span>Advantages Of Compiling Into A Tree Of Objects</h1>
+
+<p>The SQLite developers think that the bytecode approach is
+best, at least for the use cases the SQLite tries to fill, but the
+tree-of-objects approach to processing SQL does have some advantages over
+bytecode. There are always tradeoffs.
+
+</p><h2 id="query_planning_decisions_can_be_deferred_until_runtime"><span>3.1. </span>Query Planning Decisions Can Be Deferred Until Runtime</h2>
+
+<p>
+When a prepared statement is bytecode, once the bytecode has been generated,
+the algorithm is fixed and cannot be subsequently changed without completely
+rewriting the bytecode.
+This is not the case with a tree-of-objects prepared
+statement. A tree-of-objects is easier to modify on-the-fly. The query
+plan is mutable and can be tweaked as it is running, based on the progress
+of the query. Thus a query can be dynamically self-tuning.
+
+</p><h2 id="dataflow_programs_are_easy_to_parallelize"><span>3.2. </span>Dataflow Programs Are Easy To Parallelize</h2>
+
+<p>In a dataflow program, each processing node can be assigned to a
+different thread. There needs to be some kind of threadsafe queuing
+mechanism for transferring intermediate results from one node to the
+next. But no synchronization primitives are typically needed within
+each node of the program. Node schedule is trivial: A node becomes
+eligible to run when it has data available and there is space in its
+output queue.
+
+</p><p>This is an important consideration for database engines that are
+designed to run large analytic queries
+(<a href="https://en.wikipedia.org/wiki/Online_analytical_processing">OLAP</a>)
+on large multi-core servers.
+The primary focus of SQLite is transaction processing
+(<a href="https://en.wikipedia.org/wiki/Online_transaction_processing">OLTP</a>)
+on the internet-of-things, so there is less need to
+represent prepared statements as dataflow programs in SQLite.
+</p><p align="center"><small><i>This page last modified on <a href="https://sqlite.org/docsrc/honeypot" id="mtimelink" data-href="https://sqlite.org/docsrc/finfo/pages/whybytecode.in?m=d62430c8d6">2024-05-09 17:38:03</a> UTC </small></i></p>
+