summaryrefslogtreecommitdiffstats
path: root/www/lockingv3.html
blob: da623f8d1f0991f6c50886b89a034b7c07d5952e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
<!DOCTYPE html>
<html><head>
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<link href="sqlite.css" rel="stylesheet">
<title>File Locking And Concurrency In SQLite Version 3</title>
<!-- path= -->
</head>
<body>
<div class=nosearch>
<a href="index.html">
<img class="logo" src="images/sqlite370_banner.gif" alt="SQLite" border="0">
</a>
<div><!-- IE hack to prevent disappearing logo --></div>
<div class="tagline desktoponly">
Small. Fast. Reliable.<br>Choose any three.
</div>
<div class="menu mainmenu">
<ul>
<li><a href="index.html">Home</a>
<li class='mobileonly'><a href="javascript:void(0)" onclick='toggle_div("submenu")'>Menu</a>
<li class='wideonly'><a href='about.html'>About</a>
<li class='desktoponly'><a href="docs.html">Documentation</a>
<li class='desktoponly'><a href="download.html">Download</a>
<li class='wideonly'><a href='copyright.html'>License</a>
<li class='desktoponly'><a href="support.html">Support</a>
<li class='desktoponly'><a href="prosupport.html">Purchase</a>
<li class='search' id='search_menubutton'>
<a href="javascript:void(0)" onclick='toggle_search()'>Search</a>
</ul>
</div>
<div class="menu submenu" id="submenu">
<ul>
<li><a href='about.html'>About</a>
<li><a href='docs.html'>Documentation</a>
<li><a href='download.html'>Download</a>
<li><a href='support.html'>Support</a>
<li><a href='prosupport.html'>Purchase</a>
</ul>
</div>
<div class="searchmenu" id="searchmenu">
<form method="GET" action="search">
<select name="s" id="searchtype">
<option value="d">Search Documentation</option>
<option value="c">Search Changelog</option>
</select>
<input type="text" name="q" id="searchbox" value="">
<input type="submit" value="Go">
</form>
</div>
</div>
<script>
function toggle_div(nm) {
var w = document.getElementById(nm);
if( w.style.display=="block" ){
w.style.display = "none";
}else{
w.style.display = "block";
}
}
function toggle_search() {
var w = document.getElementById("searchmenu");
if( w.style.display=="block" ){
w.style.display = "none";
} else {
w.style.display = "block";
setTimeout(function(){
document.getElementById("searchbox").focus()
}, 30);
}
}
function div_off(nm){document.getElementById(nm).style.display="none";}
window.onbeforeunload = function(e){div_off("submenu");}
/* Disable the Search feature if we are not operating from CGI, since */
/* Search is accomplished using CGI and will not work without it. */
if( !location.origin || !location.origin.match || !location.origin.match(/http/) ){
document.getElementById("search_menubutton").style.display = "none";
}
/* Used by the Hide/Show button beside syntax diagrams, to toggle the */
function hideorshow(btn,obj){
var x = document.getElementById(obj);
var b = document.getElementById(btn);
if( x.style.display!='none' ){
x.style.display = 'none';
b.innerHTML='show';
}else{
x.style.display = '';
b.innerHTML='hide';
}
return false;
}
var antiRobot = 0;
function antiRobotGo(){
if( antiRobot!=3 ) return;
antiRobot = 7;
var j = document.getElementById("mtimelink");
if(j && j.hasAttribute("data-href")) j.href=j.getAttribute("data-href");
}
function antiRobotDefense(){
document.body.onmousedown=function(){
antiRobot |= 2;
antiRobotGo();
document.body.onmousedown=null;
}
document.body.onmousemove=function(){
antiRobot |= 2;
antiRobotGo();
document.body.onmousemove=null;
}
setTimeout(function(){
antiRobot |= 1;
antiRobotGo();
}, 100)
antiRobotGo();
}
antiRobotDefense();
</script>


<font color="#7f001f"><p>
<p>This document was originally created in early 2004 when SQLite version 2
was still in widespread use and was written to introduce
the new concepts of SQLite version 3 to readers who were already familiar
with SQLite version 2.  But these days, most readers of this document have
probably never seen SQLite version 2 and are only familiar with SQLite
version 3.  Nevertheless, this document continues to serve as an
authoritative reference to how database file locking works in SQLite 
version 3.</p>

<p>The document only describes locking for the older rollback-mode
transaction mechanism.  Locking for the newer <a href="wal.html">write-ahead log</a> or <a href="wal.html">WAL mode</a>
is described separately.</p>
</font>

<h2>1.0 File Locking And Concurrency In SQLite Version 3</h2>

<p>SQLite <a href="releaselog/3_0_0.html">Version 3.0.0</a> introduced a new locking and journaling 
mechanism designed to improve concurrency over SQLite version 2
and to reduce the writer starvation 
problem.  The new mechanism also allows atomic commits of transactions
involving multiple database files.
This document describes the new locking mechanism.
The intended audience is programmers who want to understand and/or modify
the pager code and reviewers working to verify the design
of SQLite version 3.
</p>

<a name="overview"></a>
<h2>2.0 Overview</h2>

<p>
Locking and concurrency control are handled by the 
<a href="https://www.sqlite.org/src/finfo?name=src/pager.c">
pager module</a>.
The pager module is responsible for making SQLite "ACID" (Atomic,
Consistent, Isolated, and Durable).  The pager module makes sure changes
happen all at once, that either all changes occur or none of them do,
that two or more processes do not try to access the database
in incompatible ways at the same time, and that once changes have been
written they persist until explicitly deleted.  The pager also provides
a memory cache of some of the contents of the disk file.</p>

<p>The pager is unconcerned
with the details of B-Trees, text encodings, indices, and so forth.
From the point of view of the pager the database consists of
a single file of uniform-sized blocks.  Each block is called a
"page" and is usually 1024 bytes in size.   The pages are numbered
beginning with 1.  So the first 1024 bytes of the database are called
"page 1" and the second 1024 bytes are call "page 2" and so forth. All 
other encoding details are handled by higher layers of the library.  
The pager communicates with the operating system using one of several
modules 
(Examples:
<a href="https://www.sqlite.org/src/finfo?name=src/os_unix.c">
os_unix.c</a>,
<a href="https://www.sqlite.org/src/finfo?name=src/os_win.c">
os_win.c</a>)
that provides a uniform abstraction for operating system services.
</p>

<p>The pager module effectively controls access for separate threads, or
separate processes, or both.  Throughout this document whenever the
word "process" is written you may substitute the word "thread" without
changing the truth of the statement.</p>

<a name="locking"></a>
<h2>3.0 Locking</h2>

<p>
From the point of view of a single process, a database file
can be in one of five locking states:
</p>

<p>
<table cellpadding="20">
<tr><td valign="top">UNLOCKED</td>
<td valign="top">
No locks are held on the database.  The database may be neither read nor
written.  Any internally cached data is considered suspect and subject to
verification against the database file before being used.  Other 
processes can read or write the database as their own locking states
permit.  This is the default state.
</td></tr>

<tr><td valign="top">SHARED</td>
<td valign="top">
<a name="shared_lock"></a>

The database may be read but not written.  Any number of 
processes can hold SHARED locks at the same time, hence there can be
many simultaneous readers.  But no other thread or process is allowed
to write to the database file while one or more SHARED locks are active.
</td></tr>

<tr><td valign="top">RESERVED</td>
<td valign="top">
<a name="reserved_lock"></a>

A RESERVED lock means that the process is planning on writing to the
database file at some point in the future but that it is currently just
reading from the file.  Only a single RESERVED lock may be active at one
time, though multiple SHARED locks can coexist with a single RESERVED lock.
RESERVED differs from PENDING in that new SHARED locks can be acquired
while there is a RESERVED lock.
</td></tr>

<tr><td valign="top">PENDING</td>
<td valign="top">
<a name="pending_lock"></a>

A PENDING lock means that the process holding the lock wants to write
to the database as soon as possible and is just waiting on all current
SHARED locks to clear so that it can get an EXCLUSIVE lock.  No new 
SHARED locks are permitted against the database if
a PENDING lock is active, though existing SHARED locks are allowed to
continue.
</td></tr>

<tr><td valign="top">EXCLUSIVE</td>
<td valign="top">
<a name="excl_lock"></a>

An EXCLUSIVE lock is needed in order to write to the database file.
Only one EXCLUSIVE lock is allowed on the file and no other locks of
any kind are allowed to coexist with an EXCLUSIVE lock.  In order to
maximize concurrency, SQLite works to minimize the amount of time that
EXCLUSIVE locks are held.
</td></tr>
</table>
</p>

<p>
The operating system interface layer understands and tracks all five
locking states described above.  
The pager module only tracks four of the five locking states.
A PENDING lock is always just a temporary
stepping stone on the path to an EXCLUSIVE lock and so the pager module
does not track PENDING locks.
</p>

<a name="rollback"></a>
<h2>4.0 The Rollback Journal</h2>

<p>When a process wants to change a database file (and it is not
in <a href="wal.html">WAL</a> mode), it
first records the original unchanged database content
in a <em>rollback journal</em>.  The rollback journal is an ordinary
disk file that is always located
in the same directory or folder as the database file and has the
same name as the database file with the addition of a <tt>-journal</tt>
suffix. The rollback journal also records the initial
size of the database so that if the database file grows it can be truncated
back to its original size on a rollback.</p>

<p>If SQLite is working with multiple databases at the same time
(using the <a href="lang_attach.html">ATTACH</a> command) then each database has its own rollback journal.
But there is also a separate aggregate journal
called the <em>super-journal</em>.
The super-journal does not contain page data used for rolling back
changes.  Instead the super-journal contains the names of the
individual database rollback journals for each of the ATTACHed databases. 
Each of the individual database rollback journals also contain the name 
of the super-journal.
If there are no ATTACHed databases (or if none of the ATTACHed database
is participating in the current transaction) no super-journal is
created and the normal rollback journal contains an empty string
in the place normally reserved for recording the name of the
super-journal.</p>

<p>A rollback journal is said to be <a href="fileformat2.html#hotjrnl">hot</a>
if it needs to be rolled back
in order to restore the integrity of its database.  
A hot journal is created when a process is in the middle of a database
update and a program or operating system crash or power failure prevents 
the update from completing.
Hot journals are an exception condition. 
Hot journals exist to recover from crashes and power failures.
If everything is working correctly 
(that is, if there are no crashes or power failures)
you will never get a hot journal.
</p>

<p>
If no super-journal is involved, then
a journal is hot if it exists and has a non-zero header
and its corresponding database file
does not have a RESERVED lock.
If a super-journal is named in the file journal, then the file journal
is hot if its super-journal exists and there is no RESERVED
lock on the corresponding database file.
It is important to understand when a journal is hot so the
preceding rules will be repeated in bullets:
</p>

<ul>
<li>A journal is hot if...
    <ul>
    <li>It exists, and</li>
    <li>Its size is greater than 512 bytes, and</li>
    <li>The journal header is non-zero and well-formed, and</li>
    <li>Its super-journal exists or the super-journal name is an
        empty string, and</li>
    <li>There is no RESERVED lock on the corresponding database file.</li>
    </ul>
</li>
</ul>

<a name="hot_journals"></a>
<h3>4.1 Dealing with hot journals</h3>

<p>
Before reading from a database file, SQLite always checks to see if that
database file has a hot journal.  If the file does have a hot journal, then
the journal is rolled back before the file is read.  In this way, we ensure
that the database file is in a consistent state before it is read.
</p>

<p>When a process wants to read from a database file, it followed
the following sequence of steps:
</p>

<ol>
<li>Open the database file and obtain a SHARED lock.  If the SHARED lock
    cannot be obtained, fail immediately and return SQLITE_BUSY.</li>
<li>Check to see if the database file has a hot journal.   If the file
    does not have a hot journal, we are done.  Return immediately.
    If there is a hot journal, that journal must be rolled back by
    the subsequent steps of this algorithm.</li>
<li>Acquire a PENDING lock then an EXCLUSIVE lock on the database file.
    (Note: Do not acquire a RESERVED lock because that would make
    other processes think the journal was no longer hot.)  If we
    fail to acquire these locks it means another process
    is already trying to do the rollback.  In that case,
    drop all locks, close the database, and return SQLITE_BUSY. </li>
<li>Read the journal file and roll back the changes.</li>
<li>Wait for the rolled back changes to be written onto 
    persistent storage.  This protects the integrity of the database
    in case another power failure or crash occurs.</li>
<li>Delete the journal file (or truncate the journal to zero bytes in
    length if <a href="pragma.html#pragma_journal_mode">PRAGMA journal_mode=TRUNCATE</a> is
    set, or zero the journal header if
    <a href="pragma.html#pragma_journal_mode">PRAGMA journal_mode=PERSIST</a> is set).</li>
<li>Delete the super-journal file if it is safe to do so.
    This step is optional.  It is here only to prevent stale
    super-journals from cluttering up the disk drive.
    See the discussion below for details.</li>
<li>Drop the EXCLUSIVE and PENDING locks but retain the SHARED lock.</li>
</ol>

<p>After the algorithm above completes successfully, it is safe to 
read from the database file.  Once all reading has completed, the
SHARED lock is dropped.</p>

<a name="stale_super_journals"></a>
<h3>4.2 Deleting stale super-journals</h3>

<p>A stale super-journal is a super-journal that is no longer being
used for anything.  There is no requirement that stale super-journals
be deleted.  The only reason for doing so is to free up disk space.</p>

<p>A super-journal is stale if no individual file journals are pointing
to it.  To figure out if a super-journal is stale, we first read the
super-journal to obtain the names of all of its file journals.  Then
we check each of those file journals.  If any of the file journals named
in the super-journal exists and points back to the super-journal, then
the super-journal is not stale.  If all file journals are either missing
or refer to other super-journals or no super-journal at all, then the
super-journal we are testing is stale and can be safely deleted.</p>

<a name="writing"></a>
<h2>5.0 Writing to a database file</h2>

<p>To write to a database, a process must first acquire a SHARED lock
as described above (possibly rolling back incomplete changes if there
is a hot journal). 
After a SHARED lock is obtained, a RESERVED lock must be acquired.
The RESERVED lock signals that the process intends to write to the
database at some point in the future.  Only one process at a time
can hold a RESERVED lock.  But other processes can continue to read
the database while the RESERVED lock is held.
</p>

<p>If the process that wants to write is unable to obtain a RESERVED
lock, it must mean that another process already has a RESERVED lock.
In that case, the write attempt fails and returns SQLITE_BUSY.</p>

<p>After obtaining a RESERVED lock, the process that wants to write
creates a rollback journal.  The header of the journal is initialized
with the original size of the database file.  Space in the journal header
is also reserved for a super-journal name, though the super-journal
name is initially empty.</p>

<p>Before making changes to any page of the database, the process writes
the original content of that page into the rollback journal.  Changes
to pages are held in memory at first and are not written to the disk.
The original database file remains unaltered, which means that other
processes can continue to read the database.</p>

<p>Eventually, the writing process will want to update the database
file, either because its memory cache has filled up or because it is
ready to commit its changes.  Before this happens, the writer must
make sure no other process is reading the database and that the rollback
journal data is safely on the disk surface so that it can be used to
rollback incomplete changes in the event of a power failure.
The steps are as follows:</p>

<ol>
<li>Make sure all rollback journal data has actually been written to
    the surface of the disk (and is not just being held in the operating
    system's  or disk controllers cache) so that if a power failure occurs
    the data will still be there after power is restored.</li>
<li>Obtain a PENDING lock and then an EXCLUSIVE lock on the database file.
    If other processes still have SHARED locks, the writer might have
    to wait until those SHARED locks clear before it is able to obtain
    an EXCLUSIVE lock.</li>
<li>Write all page modifications currently held in memory out to the
    original database disk file.</li>
</ol>

<p>
If the reason for writing to the database file is because the memory
cache was full, then the writer will not commit right away.  Instead,
the writer might continue to make changes to other pages.  Before 
subsequent changes are written to the database file, the rollback
journal must be flushed to disk again.  Note also that the EXCLUSIVE
lock that the writer obtained in order to write to the database initially
must be held until all changes are committed.  That means that no other
processes are able to access the database from the
time the memory cache first spills to disk until the transaction
commits.
</p>

<p>
When a writer is ready to commit its changes, it executes the following
steps:
</p>

<ol>
<li value="4">
   Obtain an EXCLUSIVE lock on the database file and
   make sure all memory changes have been written to the database file
   using the algorithm of steps 1-3 above.</li>
<li>Flush all database file changes to the disk.  Wait for those changes
    to actually be written onto the disk surface.</li>
<li>Delete the journal file.  (Or if the <a href="pragma.html#pragma_journal_mode">PRAGMA journal_mode</a> is TRUNCATE or
    PERSIST, truncate the journal file or zero the header of the journal file,
    respectively.)  This is the instant when the changes are
    committed.  Prior to deleting the journal file, if a power failure
    or crash occurs, the next process to open the database will see that
    it has a hot journal and will roll the changes back.
    After the journal is deleted, there will no longer be a hot journal
    and the changes will persist.
    </li>
<li>Drop the EXCLUSIVE and PENDING locks from the database file.
    </li>
</ol>

<p>As soon as the PENDING lock is released from the database file, other
processes can begin reading the database again.  In the current implementation,
the RESERVED lock is also released, but that is not essential for
correct operation.</p>

<p>If a transaction involves multiple databases, then a more complex
commit sequence is used, as follows:</p>

<ol>
<li value="4">
   Make sure all individual database files have an EXCLUSIVE lock and a
   valid journal.
<li>Create a super-journal.  The name of the super-journal is arbitrary.
    (The current implementation appends random suffixes to the name of the
    main database file until it finds a name that does not previously exist.)
    Fill the super-journal with the names of all the individual journals
    and flush its contents to disk.
<li>Write the name of the super-journal into
    all individual journals (in space set aside for that purpose in the
    headers of the individual journals) and flush the contents of the
    individual journals to disk and wait for those changes to reach the
    disk surface.
<li>Flush all database file changes to the disk.  Wait for those changes
    to actually be written onto the disk surface.</li>
<li>Delete the super-journal file.  This is the instant when the changes are
    committed.  Prior to deleting the super-journal file, if a power failure
    or crash occurs, the individual file journals will be considered hot
    and will be rolled back by the next process that
    attempts to read them.  After the super-journal has been deleted,
    the file journals will no longer be considered hot and the changes
    will persist.
    </li>
<li>Delete all individual journal files.
<li>Drop the EXCLUSIVE and PENDING locks from all database files.
    </li>
</ol>

<a name="writer_starvation"></a>
<h3>5.1 Writer starvation</h3>

<p>In SQLite version 2, if many processes are reading from the database,
it might be the case that there is never a time when there are
no active readers.  And if there is always at least one read lock on the
database, no process would ever be able to make changes to the database
because it would be impossible to acquire a write lock.  This situation
is called <em>writer starvation</em>.</p>

<p>SQLite version 3 seeks to avoid writer starvation through the use of
the PENDING lock.  The PENDING lock allows existing readers to continue
but prevents new readers from connecting to the database.  So when a
process wants to write a busy database, it can set a PENDING lock which
will prevent new readers from coming in.  Assuming existing readers do
eventually complete, all SHARED locks will eventually clear and the
writer will be given a chance to make its changes.</p>

<a name="how_to_corrupt"></a>
<h2>6.0 How To Corrupt Your Database Files</h2>

<p>The pager module is very robust but it can be subverted.  
This section attempts to identify and explain the risks.
(See also the <a href="atomiccommit.html#sect_9_0">Things That Can Go Wrong</a> section of the article
on <a href="atomiccommit.html">Atomic Commit</a>.</p>

<p>
Clearly, a hardware or operating system fault that introduces incorrect data
into the middle of the database file or journal will cause problems.
Likewise, 
if a rogue process opens a database file or journal and writes malformed
data into the middle of it, then the database will become corrupt.
There is not much that can be done about these kinds of problems
so they are given no further attention.
</p>

<p>
SQLite uses POSIX advisory locks to implement locking on Unix.  On
Windows it uses the LockFile(), LockFileEx(), and UnlockFile() system
calls.  SQLite assumes that these system calls all work as advertised.  If
that is not the case, then database corruption can result.  One should
note that POSIX advisory locking is known to be buggy or even unimplemented
on many NFS implementations (including recent versions of Mac OS X)
and that there are reports of locking problems
for network filesystems under Windows.  Your best defense is to not
use SQLite for files on a network filesystem.
</p>

<p>
SQLite uses the fsync() system call to flush data to the disk under Unix and
it uses the FlushFileBuffers() to do the same under Windows.  Once again,
SQLite assumes that these operating system services function as advertised.
But it has been reported that fsync() and FlushFileBuffers() do not always
work correctly, especially with some network filesystems or inexpensive IDE disks.
Apparently some manufactures of IDE disks have controller chips that report
that data has reached the disk surface when in fact the data is still
in volatile cache memory in the disk drive electronics.  There are also
reports that Windows sometimes chooses to ignore FlushFileBuffers() for
unspecified reasons.  The author cannot verify any of these reports.
But if they are true, it means that database corruption is a possibility
following an unexpected power loss.  These are hardware and/or operating
system bugs that SQLite is unable to defend against.
</p>

<a name="ext3-barrier-problem"></a>

<p>If a Linux <a href="http://en.wikipedia.org/wiki/Ext3">ext3</a>
filesystem is mounted without the "barrier=1" option
in the <a href="http://en.wikipedia.org/wiki/fstab">/etc/fstab</a>
and the disk drive write cache is enabled
then filesystem corruption can occur following a power loss or OS crash.
Whether or not corruption can occur depends on the details of the disk control
hardware; corruption is more likely with inexpensive consumer-grade disks
and less of a problem for enterprise-class storage devices with advanced
features such as non-volatile write caches.
Various ext3 experts
<a href="http://www.redhat.com/archives/ext3-users/2010-July/msg00001.html">
confirm this behavior</a>.
We are told that most Linux distributions do not use barrier=1 and do
not disable the write cache so most
Linux distributions are vulnerable to this problem.  Note that this is an
operating system and hardware issue and that there is nothing that SQLite
can do to work around it.  
<a href="http://ozlabs.org/~rusty/index.cgi/tech/2009-10-20.html">
Other database engines</a> have also run into this same problem.</p>

<p>
If a crash or power failure occurs and results in a hot journal but that
journal is deleted, the next process to open the database will not
know that it contains changes that need to be rolled back.  The rollback
will not occur and the database will be left in an inconsistent state.
Rollback journals might be deleted for any number of reasons:
</p>

<ul>
<li>An administrator might be cleaning up after an OS crash or power failure,
    see the journal file, think it is junk, and delete it.</li>
<li>Someone (or some process) might rename the database file but fail to
    also rename its associated journal.</li>
<li>If the database file has aliases (hard or soft links) and the file
    is opened by a different alias than the one used to create the journal,
    then the journal will not be found.  To avoid this problem, you should
    not create links to SQLite database files.</li>
<li>Filesystem corruption following a power failure might cause the
    journal to be renamed or deleted.</li>
</ul>

<p>
The last (fourth) bullet above merits additional comment.  When SQLite creates
a journal file on Unix, it opens the directory that contains that file and
calls fsync() on the directory, in an effort to push the directory information
to disk.  But suppose some other process is adding or removing unrelated
files to the directory that contains the database and journal at the
moment of a power failure.  The supposedly unrelated actions of this other
process might result in the journal file being dropped from the directory and
moved into "lost+found".  This is an unlikely scenario, but it could happen.
The best defenses are to use a journaling filesystem or to keep the
database and journal in a directory by themselves.
</p>

<p>
For a commit involving multiple databases and a super-journal, if the
various databases were on different disk volumes and a power failure occurs
during the commit, then when the machine comes back up the disks might
be remounted with different names.  Or some disks might not be mounted
at all.   When this happens the individual file journals and the
super-journal might not be able to find each other. The worst outcome from
this scenario is that the commit ceases to be atomic.  
Some databases might be rolled back and others might not. 
All databases will continue to be self-consistent.
To defend against this problem, keep all databases
on the same disk volume and/or remount disks using exactly the same names
after a power failure.
</p>

<a name="transaction_control"></a>
<h2>7.0 Transaction Control At The SQL Level</h2>

<p>
The changes to locking and concurrency control in SQLite version 3 also
introduce some subtle changes in the way transactions work at the SQL
language level.
By default, SQLite version 3 operates in <em>autocommit</em> mode.
In autocommit mode,
all changes to the database are committed as soon as all operations associated
with the current database connection complete.</p>

<p>The SQL command "BEGIN TRANSACTION" (the TRANSACTION keyword
is optional) is used to take SQLite out of autocommit mode.
Note that the BEGIN command does not acquire any locks on the database.
After a BEGIN command, a SHARED lock will be acquired when the first
SELECT statement is executed.  A RESERVED lock will be acquired when
the first INSERT, UPDATE, or DELETE statement is executed.  No EXCLUSIVE
lock is acquired until either the memory cache fills up and must
be spilled to disk or until the transaction commits.  In this way,
the system delays blocking read access to the file until the
last possible moment.
</p>

<p>The SQL command "COMMIT"  does not actually commit the changes to
disk.  It just turns autocommit back on.  Then, at the conclusion of
the command, the regular autocommit logic takes over and causes the
actual commit to disk to occur.
The SQL command "ROLLBACK" also operates by turning autocommit back on,
but it also sets a flag that tells the autocommit logic to rollback rather
than commit.</p>

<p>If the SQL COMMIT command turns autocommit on and the autocommit logic
then tries to commit change but fails because some other process is holding
a SHARED lock, then autocommit is turned back off automatically.  This
allows the user to retry the COMMIT at a later time after the SHARED lock
has had an opportunity to clear.</p>

<p align="center"><small><i>This page last modified on  <a href="https://sqlite.org/docsrc/honeypot" id="mtimelink"  data-href="https://sqlite.org/docsrc/finfo/pages/lockingv3.in?m=a1350147b5">2024-04-08 14:35:46</a> UTC </small></i></p>