summaryrefslogtreecommitdiffstats
path: root/www/walformat.html
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--www/walformat.html893
1 files changed, 893 insertions, 0 deletions
diff --git a/www/walformat.html b/www/walformat.html
new file mode 100644
index 0000000..3634a44
--- /dev/null
+++ b/www/walformat.html
@@ -0,0 +1,893 @@
+<!DOCTYPE html>
+<html><head>
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<meta http-equiv="content-type" content="text/html; charset=UTF-8">
+<link href="sqlite.css" rel="stylesheet">
+<title>WAL-mode File Format</title>
+<!-- path= -->
+</head>
+<body>
+<div class=nosearch>
+<a href="index.html">
+<img class="logo" src="images/sqlite370_banner.gif" alt="SQLite" border="0">
+</a>
+<div><!-- IE hack to prevent disappearing logo --></div>
+<div class="tagline desktoponly">
+Small. Fast. Reliable.<br>Choose any three.
+</div>
+<div class="menu mainmenu">
+<ul>
+<li><a href="index.html">Home</a>
+<li class='mobileonly'><a href="javascript:void(0)" onclick='toggle_div("submenu")'>Menu</a>
+<li class='wideonly'><a href='about.html'>About</a>
+<li class='desktoponly'><a href="docs.html">Documentation</a>
+<li class='desktoponly'><a href="download.html">Download</a>
+<li class='wideonly'><a href='copyright.html'>License</a>
+<li class='desktoponly'><a href="support.html">Support</a>
+<li class='desktoponly'><a href="prosupport.html">Purchase</a>
+<li class='search' id='search_menubutton'>
+<a href="javascript:void(0)" onclick='toggle_search()'>Search</a>
+</ul>
+</div>
+<div class="menu submenu" id="submenu">
+<ul>
+<li><a href='about.html'>About</a>
+<li><a href='docs.html'>Documentation</a>
+<li><a href='download.html'>Download</a>
+<li><a href='support.html'>Support</a>
+<li><a href='prosupport.html'>Purchase</a>
+</ul>
+</div>
+<div class="searchmenu" id="searchmenu">
+<form method="GET" action="search">
+<select name="s" id="searchtype">
+<option value="d">Search Documentation</option>
+<option value="c">Search Changelog</option>
+</select>
+<input type="text" name="q" id="searchbox" value="">
+<input type="submit" value="Go">
+</form>
+</div>
+</div>
+<script>
+function toggle_div(nm) {
+var w = document.getElementById(nm);
+if( w.style.display=="block" ){
+w.style.display = "none";
+}else{
+w.style.display = "block";
+}
+}
+function toggle_search() {
+var w = document.getElementById("searchmenu");
+if( w.style.display=="block" ){
+w.style.display = "none";
+} else {
+w.style.display = "block";
+setTimeout(function(){
+document.getElementById("searchbox").focus()
+}, 30);
+}
+}
+function div_off(nm){document.getElementById(nm).style.display="none";}
+window.onbeforeunload = function(e){div_off("submenu");}
+/* Disable the Search feature if we are not operating from CGI, since */
+/* Search is accomplished using CGI and will not work without it. */
+if( !location.origin || !location.origin.match || !location.origin.match(/http/) ){
+document.getElementById("search_menubutton").style.display = "none";
+}
+/* Used by the Hide/Show button beside syntax diagrams, to toggle the */
+function hideorshow(btn,obj){
+var x = document.getElementById(obj);
+var b = document.getElementById(btn);
+if( x.style.display!='none' ){
+x.style.display = 'none';
+b.innerHTML='show';
+}else{
+x.style.display = '';
+b.innerHTML='hide';
+}
+return false;
+}
+var antiRobot = 0;
+function antiRobotGo(){
+if( antiRobot!=3 ) return;
+antiRobot = 7;
+var j = document.getElementById("mtimelink");
+if(j && j.hasAttribute("data-href")) j.href=j.getAttribute("data-href");
+}
+function antiRobotDefense(){
+document.body.onmousedown=function(){
+antiRobot |= 2;
+antiRobotGo();
+document.body.onmousedown=null;
+}
+document.body.onmousemove=function(){
+antiRobot |= 2;
+antiRobotGo();
+document.body.onmousemove=null;
+}
+setTimeout(function(){
+antiRobot |= 1;
+antiRobotGo();
+}, 100)
+antiRobotGo();
+}
+antiRobotDefense();
+</script>
+<div class=fancy>
+<div class=nosearch>
+<div class="fancy_title">
+WAL-mode File Format
+</div>
+<div class="fancy_toc">
+<a onclick="toggle_toc()">
+<span class="fancy_toc_mark" id="toc_mk">&#x25ba;</span>
+Table Of Contents
+</a>
+<div id="toc_sub"><div class="fancy-toc1"><a href="#files_on_disk">1. Files On Disk</a></div>
+<div class="fancy-toc2"><a href="#the_main_database_file">1.1. The Main Database File</a></div>
+<div class="fancy-toc2"><a href="#the_write_ahead_log_or_wal_file">1.2. The Write-Ahead-Log or "-wal" File</a></div>
+<div class="fancy-toc2"><a href="#the_wal_index_or_shm_file">1.3. The Wal-Index or "-shm" file</a></div>
+<div class="fancy-toc2"><a href="#file_lifecycles">1.4. File Lifecycles</a></div>
+<div class="fancy-toc2"><a href="#variations">1.5. Variations</a></div>
+<div class="fancy-toc1"><a href="#the_wal_index_file_format">2. The WAL-Index File Format</a></div>
+<div class="fancy-toc2"><a href="#the_wal_index_header">2.1. The WAL-Index Header</a></div>
+<div class="fancy-toc3"><a href="#the_mxframe_field">2.1.1. The mxFrame field</a></div>
+<div class="fancy-toc3"><a href="#the_nbackfill_field">2.1.2. The nBackfill field</a></div>
+<div class="fancy-toc3"><a href="#wal_locks">2.1.3. WAL Locks</a></div>
+<div class="fancy-toc2"><a href="#wal_index_hash_tables">2.2. WAL-Index Hash Tables</a></div>
+<div class="fancy-toc2"><a href="#locking_matrix">2.3. Locking Matrix</a></div>
+<div class="fancy-toc3"><a href="#how_the_various_locks_are_used">2.3.1. How the various locks are used</a></div>
+<div class="fancy-toc3"><a href="#operations_that_require_locks_and_which_locks_those_operations_use">2.3.2. Operations that require locks and which locks those operations use</a></div>
+<div class="fancy-toc1"><a href="#recovery">3. Recovery</a></div>
+</div>
+</div>
+<script>
+function toggle_toc(){
+var sub = document.getElementById("toc_sub")
+var mk = document.getElementById("toc_mk")
+if( sub.style.display!="block" ){
+sub.style.display = "block";
+mk.innerHTML = "&#x25bc;";
+} else {
+sub.style.display = "none";
+mk.innerHTML = "&#x25ba;";
+}
+}
+</script>
+</div>
+
+
+
+
+
+<p>This document describes low-level details on how <a href="wal.html">WAL mode</a> is
+implemented on unix and windows.
+
+</p><p>The separate <a href="fileformat2.html">file format</a> description provides details on the
+structure of a database file and of the write-head log file used in
+<a href="wal.html">WAL mode</a>. But details of the locking protocol and of the format
+of the WAL-index are deliberately omitted since those details
+are left to discretion of individual <a href="vfs.html">VFS</a> implementations. This
+document fills in those missing details for the unix and windows <a href="vfs.html">VFSes</a>.
+
+</p><p>For completeness, some of the higher level formatting information
+contains in the <a href="fileformat2.html">file format</a> document and elsewhere is replicated here,
+when it pertains to WAL mode processing.
+
+</p><h1 id="files_on_disk"><span>1. </span>Files On Disk</h1>
+
+<p>When in active use, the state of a WAL mode database is described
+by three separate files:
+
+</p><ol>
+<li> The main database file with an arbitrary name "X".
+</li><li> The write-ahead log file, usually named "X-wal".
+</li><li> The wal-index file, usually named "X-shm".
+</li></ol>
+
+<h2 id="the_main_database_file"><span>1.1. </span>The Main Database File</h2>
+
+<p>The format of the main database file is as described in the
+<a href="fileformat2.html">file format</a> document. The <a href="fileformat2.html#vnums">file format version numbers</a> at offsets
+18 and 19 into the main database must both be 2 to indicate that the
+database is in WAL mode. The main database may have an arbitrary
+name allowed by the underlying filesystem. No special file suffixes
+are required, though ".db", ".sqlite", and ".sqlite3" seem to be
+popular choices.
+
+</p><h2 id="the_write_ahead_log_or_wal_file"><span>1.2. </span>The Write-Ahead-Log or "-wal" File</h2>
+<p>The write-ahead log or "wal" file is a roll-forward journal
+that records transactions that have been committed but not yet applied
+to the main database. Details on the format of the wal file are
+describe in the <a href="fileformat2.html#walformat">WAL format</a> subsection of the main <a href="fileformat2.html">file format</a>
+document. The wal file is named by appending the four characters
+"-wal" to the end of the name of the main database file. Except
+on 8+3 filesystems, such names are not allowed, and in that case
+the file suffix is changed to ".WAL". But as 8+3 filesystems are
+increasingly rare, that exceptional case can usually be ignored.
+
+<a name="shm"></a>
+
+</p><h2 id="the_wal_index_or_shm_file"><span>1.3. </span>The Wal-Index or "-shm" file</h2>
+<p>The wal-index file or "shm" file is not actually used as a file.
+Rather, individual database clients mmap the shm file and use it
+as shared memory for coordinating access to the database and as a cache
+for quickly locating frame within the wal file. The name
+of the shm file is the main database file name with the four characters
+"-shm" appended. Or, for 8+3 filesystems, the shm file is the main
+database file with the suffix changed to ".SHM".
+
+</p><p>The shm does not contain any database content and is not required
+to recover the database following a crash. For that reason, the first
+client to connect to a quiescent database will normally truncate the
+shm file if it exists. Since the content of the shm file does not need
+to be preserved across a crash, the shm file is never fsync()-ed to disk.
+In fact, if there were a mechanism by which SQLite could tell the
+operating system to never persist the shm file to disk but always hold
+it in cache memory, SQLite would use that mechanism to avoid any
+unnecessary disk I/O associated with the shm file. However, no such
+mechanism exists in standard posix.
+
+</p><p>Because the shm is only used to coordinate access between concurrent
+clients, the shm file is omitted if <a href="pragma.html#pragma_locking_mode">exclusive locking mode</a>
+is set, as an optimization. When <a href="pragma.html#pragma_locking_mode">exclusive locking mode</a> is set,
+SQLite uses heap memory in place of the memory-mapped shm file.
+
+</p><h2 id="file_lifecycles"><span>1.4. </span>File Lifecycles</h2>
+
+<p>When a WAL mode database is in active use, all three of the above
+files usually exist. Except, the Wal-Index file is omitted if
+<a href="pragma.html#pragma_locking_mode">exclusive locking mode</a> is set.
+
+</p><p>If the last client using the database shuts down cleanly by
+calling <a href="c3ref/close.html">sqlite3_close()</a>, then a <a href="wal.html#ckpt">checkpoint</a> is run automatically
+in order to transfer all information from the wal file
+over into the main database, and both the shm file
+and the wal file are unlinked. Thus, when the database is not in
+use by any client, it is usually the case that only the main
+database file exists on disk.
+However, if the last client did not call <a href="c3ref/close.html">sqlite3_close()</a> before it
+shut down, or if the last client to disconnect was a read-only client,
+then the final cleanup operation does not occur and the
+shm and wal files may still exist on disk even when the database is
+not in use.
+
+</p><h2 id="variations"><span>1.5. </span>Variations</h2>
+
+<p>When <a href="pragma.html#pragma_locking_mode">PRAGMA locking_mode=EXCLUSIVE</a> (exclusive locking mode) is set,
+only a single client is allowed to have the database open at one time. Since
+only a single client can use the database, the shm file is omitted.
+The single client uses a buffer in heap memory as a substitute for the
+memory-mapped shm file.
+
+</p><p>If a read/write client invokes
+<a href="c3ref/file_control.html">sqlite3_file_control</a>(<a href="c3ref/c_fcntl_begin_atomic_write.html#sqlitefcntlpersistwal">SQLITE_FCNTL_PERSIST_WAL</a>) prior to shutdown,
+then at shutdown a checkpoint is still run, but the shm file and wal
+file are not deleted.
+This allows subsequent read-only clients to connect to and read the
+database.
+
+<a name="walidxfmt"></a>
+
+</p><h1 id="the_wal_index_file_format"><span>2. </span>The WAL-Index File Format</h1>
+
+<p>
+The WAL-index or "shm" file is used to coordinate access to the database
+by multiple clients, and as a cache to help clients quickly locate frames
+within the wal file.
+
+</p><p>
+Because the shm file is not involved in recovery, the shm file does not
+need to be machine byte-order independent. Hence, numeric values in
+the shm file are written in the native byte order of the host computer,
+rather than being converted into a specific cross-platform byte order as
+is done with the main database file and the wal file.
+
+</p><p>
+The shm file consists of one or more hash tables, where each hash table
+is 32768 bytes in size. Except, a 136-byte header is carved out of the
+front of the very first hash table, so the first hash table is only
+32632 bytes in size. The total size of the shm file is always a multiple
+of 32768. In most cases, the total size of the shm file is exactly 32768
+bytes. The shm file only needs to grow beyond a single hash table if
+when the wal file grows very large (more than 4079 frames). Since the
+default <a href="c3ref/wal_autocheckpoint.html">automatic checkpoint threshold</a> is
+1000, WAL files rare reach the 4079 threshold needed to make the shm file
+grow.
+
+</p><h2 id="the_wal_index_header"><span>2.1. </span>The WAL-Index Header</h2>
+
+<p>The first 136 bytes of the shm file are a header. The shm header has
+three main divisions as follows:
+
+</p><center>
+<i>WAL-Index Header Divisions</i><br>
+<table width="80%" border="1">
+<tr><th>Bytes</th><th>Description
+</th></tr><tr><td valign="top">0..47</td><td>First copy of the WAL Index Information</td>
+</tr><tr><td valign="top">48..95</td><td>Second copy of the WAL Index Information</td>
+</tr><tr><td valign="top">96..135</td><td>Checkpoint Information and Locks</td>
+</tr></table>
+</center>
+
+<p>Individual fields of the shm header, except for the
+salt values copied from the WAL header, are unsigned integers
+in the native byte-order of the host machine. The salt values
+are exact copies from the WAL header and are in whatever byte
+order is used by the WAL file.
+The size of integers may be 8, 16, 32, or 64 bits.
+A detailed breakout of the individual fields of the shm header
+follows:
+
+</p><center>
+<i>WAL-Index Header Details</i><br>
+<table width="80%" border="1">
+<tr><th>Bytes</th><th>Name</th><th>Meaning</th></tr>
+<tr>
+<td>0..3</td><td>iVersion</td>
+<td>The WAL-index format version number. Always 3007000.</td>
+</tr>
+<tr>
+<td>4..7</td><td>&nbsp;</td>
+<td>Unused padding space. Must be zero.
+</td></tr>
+<tr>
+<td>8..11</td><td>iChange</td>
+<td>Unsigned integer counter, incremented with each transaction
+</td></tr>
+<tr>
+<td>12</td><td>isInit</td>
+<td>The "isInit" flag. 1 when the shm file has been initialized.
+</td></tr>
+<tr>
+<td>13</td><td>bigEndCksum</td>
+<td>True if the WAL file uses big-ending checksums. 0 if the WAL
+ uses little-endian checksums.
+</td></tr>
+<tr>
+<td>14..15</td><td>szPage</td>
+<td>The database page size in bytes, or 1 if the page size is 65536.
+</td></tr>
+<tr>
+<td>16..19</td><td>mxFrame</td>
+<td>Number of valid and committed frames in the WAL file.
+</td></tr>
+<tr>
+<td>20..23</td><td>nPage</td>
+<td>Size of the database file in pages.
+</td></tr>
+<tr>
+<td>24..31</td><td>aFrameCksum</td>
+<td>Checksum of the last frame in the WAL file.
+</td></tr>
+<tr>
+<td>32..39</td><td>aSalt</td>
+<td>The two salt value copied from the WAL file header.
+These values are in the byte-order of the WAL file, which
+might be different from the native byte-order of the
+machine.
+</td></tr>
+<tr>
+<td>40..47</td><td>aCksum</td>
+<td>A checksum over bytes 0 through 39 of this header.
+</td></tr>
+<tr>
+<td>48..95</td><td>&nbsp;</td>
+<td>A copy of bytes 0 through 47 of this header.
+</td></tr>
+<tr>
+<td>96..99</td><td>nBackfill</td>
+<td>Number of WAL frames that have already been backfilled into the database
+ by prior checkpoints
+</td></tr>
+<tr>
+<td>100..119</td><td>read-mark&#91;0..4&#93;
+</td><td>Five "read marks". Each read mark is a 32-bit unsigned integer (4 bytes).
+</td></tr>
+<tr>
+<td>120..127</td><td>&nbsp;</td>
+<td>Unused space set aside for 8 file locks.
+</td></tr>
+<tr>
+<td>128..132</td><td>nBackfillAttempted</td>
+<td>Number of WAL frames that have attempted to be backfilled but which might
+ not have been backfilled successfully.
+</td></tr>
+<tr>
+<td>132..136</td><td>&nbsp;</td>
+<td>Unused space reserved for further expansion.
+</td></tr>
+</table>
+</center>
+
+<a name="mxframe"></a>
+
+<h3 id="the_mxframe_field"><span>2.1.1. </span>The mxFrame field</h3>
+
+<p>The 32-bit unsigned integer at offset 16 (and repeated at offset 64)
+is the number of valid frames in the WAL. Because WAL frame are numbered
+starting with 1, mxFrame is also the index of the last valid commit frame
+in the WAL. A commit frame is a frame that has a non-zero "size of database"
+value in bytes 4 through 7 of the frame header, and that indicates the end
+of a transaction.
+</p><p>
+When mxFrame field is zero, it indicates that the WAL is empty and that
+all content should be obtained directly from the database file.
+</p><p>
+When mxFrame is equal to <a href="walformat.html#nbackfill">nBackfill</a>, that indicates that all content
+in the WAL has been written back into the database.
+In that case, all content can be read directly from the database.
+Furthermore, the next writer is free to <a href="fileformat2.html#walreset">reset the WAL</a> if no other
+connections hold locks on WAL_READ_LOCK(N) for N>0.
+</p><p>
+The mxFrame value is always greater than or equal to both
+<a href="walformat.html#nbackfill">nBackfill</a> and nBackfillAttempted.
+
+
+<a name="nbackfill"></a>
+
+</p><h3 id="the_nbackfill_field"><span>2.1.2. </span>The nBackfill field</h3>
+
+<p>The 32-bit unsigned integer at offset 128 in the WAL-index header
+is called the "nBackfill".
+this field holds the number of frames in the WAL file which
+have been copied back into the main database.
+
+</p><p>The nBackfill number is never greater than <a href="walformat.html#mxframe">mxFrame</a>.
+When nBackfill equals <a href="walformat.html#mxframe">mxFrame</a>, that means that the WAL content
+has been completely written back into the database and it is
+ok to <a href="fileformat2.html#walreset">reset the WAL</a> if there are no locks held on any of WAL_READ_LOCK(N)
+for N>0.
+
+</p><p>The nBackfill can only be increased while holding the
+WAL_CKPT_LOCK. However, nBackfill is changed to zero
+during a <a href="fileformat2.html#walreset">WAL reset</a>, and this happens while holding
+the WAL_WRITE_LOCK.
+
+<a name="locks"></a>
+
+</p><h3 id="wal_locks"><span>2.1.3. </span>WAL Locks</h3>
+
+<p>Eight bytes of space are set aside in the header to support
+file locking using the xShmLock() method in the <a href="c3ref/io_methods.html">sqlite3_io_methods</a>
+object. These eight bytes are never read nor written by SQLite since
+some VFSes (ex: Windows) might implement locks using mandatory file locks.
+
+</p><p>These are the eight locks supported:
+
+</p><center>
+<i>WAL-Index Locks Controlled By xShmLock()</i><br>
+<table border="1">
+<tr><th rowspan="2">Name</th><th colspan="2">Offset
+</th></tr><tr><th>xShmLock</th><th>File
+</th></tr><tr>
+<td>WAL_WRITE_LOCK
+</td><td>0
+</td><td>120
+</td></tr>
+
+<tr>
+<td>WAL_CKPT_LOCK
+</td><td>1
+</td><td>121
+</td></tr>
+
+<tr>
+<td>WAL_RECOVER_LOCK
+</td><td>2
+</td><td>122
+</td></tr>
+
+<tr>
+<td>WAL_READ_LOCK(0)
+</td><td>3
+</td><td>123
+</td></tr>
+
+<tr>
+<td>WAL_READ_LOCK(1)
+</td><td>4
+</td><td>124
+</td></tr>
+
+<tr>
+<td>WAL_READ_LOCK(2)
+</td><td>5
+</td><td>125
+</td></tr>
+
+<tr>
+<td>WAL_READ_LOCK(3)
+</td><td>6
+</td><td>126
+</td></tr>
+
+<tr>
+<td>WAL_READ_LOCK(4)
+</td><td>7
+</td><td>127
+</td></tr>
+</table>
+</center>
+
+
+
+<p><i>TBD: More information about the header</i>
+
+</p><h2 id="wal_index_hash_tables"><span>2.2. </span>WAL-Index Hash Tables</h2>
+
+<p>The hash tables in the shm file are designed to answer the
+following question quickly:
+
+</p><blockquote><i>
+FindFrame(P,M):
+Given a page number P and a maximum WAL frame index M,
+return the largest WAL frame index for page P that does not exceed M,
+or return NULL if there are no frames for page P that do not exceed M.
+</i></blockquote>
+
+<p>
+Let the datatypes "u8", "u16", and "u32" mean unsigned integers of
+length 8, 16, and 32 bits, respectively. Then, the first 32768-byte unit
+of the shm file is organized as follows:
+
+</p><blockquote><pre>
+u8 aWalIndexHeader[136];
+u32 aPgno[4062];
+u16 aHash[8192];
+</pre></blockquote>
+
+<p>The second and all subsequent 32768-byte units of the shm file are
+like this:
+
+</p><blockquote><pre>
+u32 aPgno[4096];
+u16 aHash[8192];
+</pre></blockquote>
+
+<p>Collectively, the aPgno entries record the database page number stored
+in all frames of the WAL file. The aPgno[0] entry on the first hash table
+records the database page number stored in the very first frame in the WAL
+file. The aPgno[i] entry from the first hash table is the database page number
+for the i-th frame in the WAL file. The aPgno[k] entry for the second
+hash table is the database page number for the (k+4062)-th frame in the
+WAL file. The aPgno[k] entry for the n-th 32768-byte hash table in the
+shm file (for n>1) holds the database page number stored in the
+(k+4062+4096*(n-2))-th frame of the WAL file.
+
+</p><p>Here is a slightly different way to describe the aPgno values:
+If you think of all aPgno values as a contiguous array, then
+the database page number stored in the i-th frame of the WAL file
+is stored in aPgno[i]. Of course, aPgno is not a contiguous array.
+The first 4062 entries are on the first 32768-byte unit of the shm
+file and subsequent values are in 4096 entry chunks in later units
+of the shm file.
+
+</p><p>One way to compute FindFrame(P,M) would be to scan the aPgno
+array starting with the M-th entry and working backwards towards
+the beginning and return J where aPgno[J]==P. Such an algorithm would
+work, and it would be faster than searching the whole WAL file for
+the latest frame with page number P. But the search can be made
+much faster still by using the aHash structure.
+
+</p><p>A database page number P is mapped into a hash value
+using the following hash function:
+
+</p><blockquote>
+h = (P * 383)%8192
+</blockquote>
+
+<p>This function maps every page number into an integer between 0
+and 8191 inclusive. The aHash field of each 32768-byte shm file unit
+maps P values into indexes of the aPgno field of the same unit as
+follows:
+
+</p><ol>
+<li> Compute the hash value: h = P * 383
+</li><li> Let X be the largest set of consecutive integers {h, h+1, h+2, ..., h+N}
+ such that for every j in X, aPgno&#91;j%8192]!=0. The X set will be empty
+ if aPgno&#91;h%8192]==0. The X set is easily computed by starting with
+ the value h%8192, and adding h%8192 to X and incrementing h until
+ encountering the first aPgno&#91;h%8192] entry that is zero.
+</li><li> The set X contains the index in aPgno of every entry in the current
+ 32768-byte unit of the shm file that might possible be a solution
+ to the FindFrame(P,M) function. Each of these entries must be
+ checked separately to ensure that the aPgno value is P and that the
+ frame number does not exceed M. The largest frame number that passes
+ those two tests is the answer.
+</li></ol>
+
+<p>Each entry in the aPgno array has a single corresponding entry
+in the aHash array. There are more available slots in aHash than
+there are in aPgno. The unused slots in aHash are filled with zero.
+And since there are guaranteed to be unused slots in aHash, that means
+the loop that computes X is guaranteed to terminate. The expected size
+of X is less than 2. The worst case is that X will be the same as the
+number of entries in aPgno, in which case the algorithm runs at about
+the same speed as a linear scan of aPgno. But that worst case performance
+is exceedingly rare. Usually, the size of X will be small and the use
+of the aHash array allows one to compute FindFrame(P,M) much faster.
+
+</p><p>Here is an alternative way of describing the hash look-up algorithm:
+Start with h = (P * 383)%8192 and look at aHash[h] and subsequent entries,
+wrapping around to zero when h reaches 8192, until finding an entry with
+aHash[h]==0. All aPgno entries having a page number of P will have an
+index that is one of the aHash[h] values thusly computed.
+But not all the computed aHash[h] values will
+meet the matching criteria, so you must check them independently. The
+speed advantage comes about because normally this set of h values is
+very small.
+
+</p><p>Note that each 32768-byte unit of the shm file has its own aHash and
+aPgno arrays. The aHash array for a single unit is only helpful in finding
+aPgno entries in that same unit. The overall FindFrame(P,M) function
+needs to do hash lookups beginning with the latest unit and working
+backwards to the oldest unit until it finds an answer.
+
+</p><h2 id="locking_matrix"><span>2.3. </span>Locking Matrix</h2>
+
+<p>Access is coordinated in WAL mode using both the legacy DELETE-mode
+locks controlled by the xLock and xUnlock methods of the <a href="c3ref/io_methods.html">sqlite3_io_methods</a>
+object and the WAL locks controlled by the xShmLock method of the
+<a href="c3ref/io_methods.html">sqlite3_io_methods</a> object.
+
+</p><p>Conceptually, there is just a single DELETE-mode lock. The DELETE-mode
+lock for a single database connection can be in exactly one of the
+following states:
+</p><ol>
+<li value="0"> SQLITE_LOCK_NONE (unlocked)
+</li><li value="1"> SQLITE_LOCK_SHARED (reading)
+</li><li value="2"> SQLITE_LOCK_RESERVED (reading, waiting to write)
+</li><li value="3"> SQLITE_LOCK_PENDING (new readers blocked, waiting to write)
+</li><li value="4"> SQLITE_LOCK_EXCLUSIVE (writing)
+</li></ol>
+<p>The DELETE-mode locks are stored on the <a href="fileformat2.html#lockbyte">lock-byte page</a> of the
+main database file.
+Only SQLITE_LOCK_SHARED and SQLITE_LOCK_EXCLUSIVE are factors for WAL-mode
+databases.
+The other locking states are used in rollback-mode, but not in WAL-mode.
+
+</p><p>The <a href="walformat.html#locks">WAL-mode locks</a> are described above.
+
+</p><h3 id="how_the_various_locks_are_used"><span>2.3.1. </span>How the various locks are used</h3>
+
+<p>The following rules show how each of the locks is used.
+
+</p><ul>
+<li><p><b>SQLITE_LOCK_SHARED</b>
+</p><p>
+All connections hold SQLITE_LOCK_SHARED continuously while attached
+to a WAL-mode database. This is true for both read/write connections
+and read-only connections.
+The SQLITE_LOCK_SHARED lock is held even by connections that are
+not within transaction.
+This is different from rollback mode, where the SQLITE_LOCK_SHARED is
+released at the end of each transaction.
+
+</p></li><li><p><b>SQLITE_LOCK_EXCLUSIVE</b>
+</p><p>
+Connections hold an exclusive lock when change in between WAL mode
+and any of the various rollback-modes. Connections might also attempt to
+obtain an EXCLUSIVE lock when they disconnect from WAL mode. If
+a connection is able to obtain an EXCLUSIVE lock, that means it is the
+only connection to the database and so it may attempt to checkpoint
+and then delete the WAL-index and WAL files.
+</p><p>
+When a connection is holding a SHARED lock on the main database,
+that will prevent any other connection from acquiring the EXCLUSIVE
+lock, which in turn prevents the WAL-index and WAL files from being
+deleted out from under other users, and prevents a transition out of
+WAL-mode while other users are accessing the database in WAL-mode.
+
+
+</p></li><li><p><b>WAL_WRITE_LOCK</b>
+</p><p>
+The WAL_WRITE_LOCK is only locked exclusively. There is never a shared
+lock taken on WAL_WRITE_LOCK.
+</p><p>
+An EXCLUSIVE WAL_WRITE_LOCK is held by any connection that is appending
+content to the end of the WAL. Hence, only a single process at a time
+can append content to the WAL. If a <a href="fileformat2.html#walreset">WAL reset</a> occurs as a consequence of
+a write, then the <a href="walformat.html#nbackfill">nBackfill</a> field of the WAL-index header is reset to
+zero while holding this lock.
+</p><p>An EXCLUSIVE is also held WAL_WRITE_LOCK, and on several other locking
+bytes, when a connection is running <a href="walformat.html#recovery">recovery</a> on the shared WAL-index.
+
+</p></li><li><p><b>WAL_CKPT_LOCK</b>
+</p><p>
+The WAL_CKPT_LOCK is only locked exclusively. There is never a shared
+lock taken on WAL_CKPT_LOCK.
+</p><p>
+An EXCLUSIVE WAL_CKPT_LOCK is held by any connection that is running
+a <a href="wal.html#ckpt">checkpoint</a>. The <a href="walformat.html#nbackfill">nBackfill</a> field of the WAL-index header may be
+increased while holding this exclusive lock, but it may not be decreased.
+</p><p>An EXCLUSIVE is also held WAL_CKPT_LOCK, and on several other locking
+bytes, when a connection is running <a href="walformat.html#recovery">recovery</a> on the shared WAL-index.
+
+</p></li><li><p><b>WAL_RECOVER_LOCK</b>
+</p><p>
+The WAL_RECOVER_LOCK is only locked exclusively. There is never a shared
+lock taken on WAL_RECOVER_LOCK.
+</p><p>
+An EXCLUSIVE WAL_RECOVER_LOCK is held by any connection that is running
+<a href="walformat.html#recovery">recovery</a> to reconstruct the shared WAL-index.
+</p><p>
+A read-only connection that is rebuilding its private heap-memory WAL-index
+does not hold this lock. (It cannot, since read-only connections are not
+allowed to hold any exclusive locks.) This lock is only held when rebuilding
+the global shared WAL-index contained in the memory-mapped SHM file.
+</p><p>
+In addition to locking this byte, a connection running <a href="walformat.html#recovery">recovery</a> also
+gets an exclusive lock on all other WAL locks except for WAL_READ_LOCK(0).
+
+</p></li><li><p><b>WAL_READ_LOCK(N)</b>
+</p><p>
+There are five separate read locks, numbers 0 through 4.
+Read locks may be either SHARED or EXCLUSIVE.
+Connections obtain a shared lock on one of the read locks bytes while
+they are within a transaction.
+Connections also obtain an exclusive lock on read locks, one at a time,
+for the brief moment while they are updating the values of the corresponding
+read-marks.
+Read locks 1 through 4 are held exclusively when running <a href="walformat.html#recovery">recovery</a>.
+</p><p>
+Each read lock byte corresponds to one of the five 32-bit read-mark
+integers located in bytes 100 through 119 of the WAL-index header, as
+follows:
+</p><center>
+<table border="1">
+<tr><th>Lock Name</th><th>Lock offset</th><th>Read-mark name</th><th>Read-mark offset
+</th></tr><tr><td>WAL_READ_LOCK(0)</td><td>123</td><td>read-mark[0]</td><td>100..103
+</td></tr><tr><td>WAL_READ_LOCK(1)</td><td>124</td><td>read-mark[1]</td><td>104..107
+</td></tr><tr><td>WAL_READ_LOCK(2)</td><td>125</td><td>read-mark[2]</td><td>108..111
+</td></tr><tr><td>WAL_READ_LOCK(3)</td><td>126</td><td>read-mark[3]</td><td>112..115
+</td></tr><tr><td>WAL_READ_LOCK(4)</td><td>127</td><td>read-mark[4]</td><td>116..119
+</td></tr></table>
+</center>
+<p>
+When a connection holds a shared lock on WAL_READ_LOCK(N), that is a
+promise by the connection that it will use the WAL and not the database
+file for any database pages that are modified by the first
+read-mark[N] entries in the WAL.
+The read-mark[0] is always zero. If a connection holds a shared lock
+on WAL_READ_LOCK(0), that means the connection expects to be able to ignore
+the WAL and read any content it wants from the main database.
+If N>0 then the connection is free to use more of the WAL file beyond
+read-mark[N] if it wants to, up to the first mxFrame frames.
+But when a connection holds a shared lock on WAL_READ_LOCK(0), that is a
+promise that it will never read content from the WAL
+and will acquire all content directly from the main database.
+</p><p>
+When a checkpoint runs, if it sees a lock on WAL_READ_LOCK(N), then it
+must not move WAL content into the main database for more than the first
+read-mark[N] frames. Were it to do so, it would overwrite content that
+the process holding the lock was expecting to be able to read out of the
+main database file. A consequence of if this is that f the WAL file
+contains more than read-mark[N] frames (if mxFrame>read-mark[N]
+for any read-mark for which WAL_READ_LOCK(N) is held by another process),
+then the checkpoint cannot run to completion.
+</p><p>
+When a writer wants to <a href="fileformat2.html#walreset">reset the WAL</a>, it must ensure that there are
+no locks on WAL_READ_LOCK(N) for N>0 because such locks indicate
+that some other connection is still using the current WAL file and
+a <a href="fileformat2.html#walreset">WAL reset</a> would delete content out from those other connections. It is
+ok for a <a href="fileformat2.html#walreset">WAL reset</a> to occur if other connections are holding WAL_READ_LOCK(0)
+because by holding WAL_READ_LOCK(0), those other connections are promising
+not to use any content from the WAL.
+</p></li></ul>
+
+<h3 id="operations_that_require_locks_and_which_locks_those_operations_use"><span>2.3.2. </span>Operations that require locks and which locks those operations use</h3>
+
+<ul>
+<li><p><b>Transition into and out of WAL-mode</b>
+
+</p><p>The SQLITE_LOCK_EXCLUSIVE lock must be held by a connection that wants
+to transition into our out of WAL mode.
+Transitioning into WAL mode is, therefore, just like any other write
+transaction, since every write transaction in rollback mode requires
+the SQLITE_LOCK_EXCLUSIVE lock.
+If the database file is already in WAL mode (hence if the desire it to change
+it back into rollback mode) and if there are two
+or more connections to the database, then each of these connections will
+be holding an SQLITE_LOCK_SHARED lock. That means that the
+SQLITE_LOCK_EXCLUSIVE cannot be obtained, and the transition out of
+WAL mode will not be allowed. This prevents one connection from deleting
+WAL mode out from under another. It also means that the only way to move
+a database from WAL mode into rollback mode is to close all but one
+connection to the database.
+
+
+</p></li><li><p><b>Close a connection to a WAL mode database</b>
+
+</p><p>When a database connection closes (via <a href="c3ref/close.html">sqlite3_close()</a> or
+<a href="c3ref/close.html">sqlite3_close_v2()</a>), an attempt is made to acquire
+SQLITE_LOCK_EXCLUSIVE. If this attempt is successful, that means
+the connection that is closing is the last connection to the database.
+In that case, it is desirable to clean up the WAL and WAL-index files,
+so the closing connection runs a <a href="wal.html#ckpt">checkpoint</a> (while holding
+SQLITE_LOCK_EXCLUSIVE) and the deletes both the WAL and WAL-index files.
+The SQLITE_LOCK_EXCLUSIVE is not released until after both the
+WAL and WAL-index files have been deleted.
+
+</p><p>If the application invokes
+<a href="c3ref/file_control.html">sqlite3_file_control</a>(<a href="c3ref/c_fcntl_begin_atomic_write.html#sqlitefcntlpersistwal">SQLITE_FCNTL_PERSIST_WAL</a>) on the database
+connection prior to closing, then the final checkpoint is still
+run but the WAL and WAL-index files are
+not deleted as they normally would be.
+This leaves the database in a state that allows other processes
+without write permission on the database, WAL, or WAL-index files
+to open the database read-only.
+If the WAL and WAL-index files are missing, then a process that
+lacks permission to create and initialize those files will not be
+able to open the database, unless the database is designated
+as immutable using the <a href="uri.html#uriimmutable">immutable query parameter</a>.
+
+</p></li><li><p><b>Reconstruct the global shared WAL-index during <a href="walformat.html#recovery">recovery</a></b>
+
+</p><p>All of the WAL-index locks, except for WAL_READ_LOCK(0),
+are held exclusively while reconstructing the global shared WAL-index
+during <a href="walformat.html#recovery">recovery</a>.
+
+</p></li><li><p><b>Append a new transaction to the end of the WAL</b>
+
+</p><p>An exclusive lock is held on WAL_WRITE_LOCK while adding new
+frame onto the end of a WAL file.
+
+</p></li><li><p><b>Read content from the database and WAL as part of a transaction</b>
+
+</p></li><li><p><b>Run a checkpoint</b>
+
+</p></li><li><p><b>Reset the WAL file</b>
+
+</p><p>A <a href="fileformat2.html#walreset">WAL reset</a> means to rewind the WAL and start adding new frames
+at the beginning. This occurs while appending new frames to a WAL
+that has <a href="walformat.html#mxframe">mxFrame</a> equal to <a href="walformat.html#nbackfill">nBackfill</a> and which has no locks on
+WAL_READ_LOCK(1) through WAL_READ_LOCK(4). The WAL_WRITE_LOCK is
+held.
+
+</p></li></ul>
+
+<a name="recovery"></a>
+
+<h1 id="recovery"><span>3. </span>Recovery</h1>
+
+<p>
+Recovery is the process of rebuilding the WAL-index so that it is
+synchronized with the WAL.
+
+</p><p>
+Recovery is run by the first thread to connect to a WAL-mode database.
+Recovery restores the WAL-index so that it accurately describes the
+WAL file. If there is no WAL file present when the first thread connects
+to the database, there is nothing to recover, but the recovery process
+still runs to initialize the WAL-index.
+
+</p><p>
+If the WAL-index is implemented as a memory-mapped file and that file is
+read-only to the first thread to connect, then that thread creates an
+private heap-memory ersazt WAL-index and runs the recovery routine to
+populate that private WAL-index. The same data results, but it is held
+privately rather that being written into the public shared memory area.
+
+</p><p>
+Recovery works by doing a single pass over the WAL, from beginning to end.
+The checksums are verified on each frame of the WAL as it is read. The
+scan stops at the end of the file or at the first invalid checksum.
+The <a href="walformat.html#mxframe">mxFrame</a> field is set to the index of the last valid commit frame
+in WAL. Since WAL frame numbers are indexed starting with 1, mxFrame is
+also the number of valid frames in the WAL. A "commit frame" is a frame
+that has a non-zero value in bytes 4 through 7 of the frame header.
+Since the recovery procedure has no way of knowing how many frames of the
+WAL might have previously been copied back into the database, it initializes
+the <a href="walformat.html#nbackfill">nBackfill</a> value to zero.
+
+</p><p>
+During recovery of the global shared-memory WAL-index, exclusive locks are
+held on WAL_WRITE_LOCK, WAL_CKPT_LOCK, WAL_RECOVER_LOCK, and WAL_READ_LOCK(1) through WAL_READ_LOCK(4). In other words, all locks associated with the
+WAL-index except for WAL_READ_LOCK(0) are held exclusively. This prevents
+any other thread from writing the database and from reading any transactions
+that are held in the WAL, until the recovery is complete.
+</p><p align="center"><small><i>This page last modified on <a href="https://sqlite.org/docsrc/honeypot" id="mtimelink" data-href="https://sqlite.org/docsrc/finfo/pages/walformat.in?m=24994748471a7cea1">2019-04-17 15:17:39</a> UTC </small></i></p>
+