1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
|
<!DOCTYPE html>
<html><head>
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<link href="sqlite.css" rel="stylesheet">
<title>Measuring and Reducing CPU Usage in SQLite</title>
<!-- path= -->
</head>
<body>
<div class=nosearch>
<a href="index.html">
<img class="logo" src="images/sqlite370_banner.gif" alt="SQLite" border="0">
</a>
<div><!-- IE hack to prevent disappearing logo --></div>
<div class="tagline desktoponly">
Small. Fast. Reliable.<br>Choose any three.
</div>
<div class="menu mainmenu">
<ul>
<li><a href="index.html">Home</a>
<li class='mobileonly'><a href="javascript:void(0)" onclick='toggle_div("submenu")'>Menu</a>
<li class='wideonly'><a href='about.html'>About</a>
<li class='desktoponly'><a href="docs.html">Documentation</a>
<li class='desktoponly'><a href="download.html">Download</a>
<li class='wideonly'><a href='copyright.html'>License</a>
<li class='desktoponly'><a href="support.html">Support</a>
<li class='desktoponly'><a href="prosupport.html">Purchase</a>
<li class='search' id='search_menubutton'>
<a href="javascript:void(0)" onclick='toggle_search()'>Search</a>
</ul>
</div>
<div class="menu submenu" id="submenu">
<ul>
<li><a href='about.html'>About</a>
<li><a href='docs.html'>Documentation</a>
<li><a href='download.html'>Download</a>
<li><a href='support.html'>Support</a>
<li><a href='prosupport.html'>Purchase</a>
</ul>
</div>
<div class="searchmenu" id="searchmenu">
<form method="GET" action="search">
<select name="s" id="searchtype">
<option value="d">Search Documentation</option>
<option value="c">Search Changelog</option>
</select>
<input type="text" name="q" id="searchbox" value="">
<input type="submit" value="Go">
</form>
</div>
</div>
<script>
function toggle_div(nm) {
var w = document.getElementById(nm);
if( w.style.display=="block" ){
w.style.display = "none";
}else{
w.style.display = "block";
}
}
function toggle_search() {
var w = document.getElementById("searchmenu");
if( w.style.display=="block" ){
w.style.display = "none";
} else {
w.style.display = "block";
setTimeout(function(){
document.getElementById("searchbox").focus()
}, 30);
}
}
function div_off(nm){document.getElementById(nm).style.display="none";}
window.onbeforeunload = function(e){div_off("submenu");}
/* Disable the Search feature if we are not operating from CGI, since */
/* Search is accomplished using CGI and will not work without it. */
if( !location.origin || !location.origin.match || !location.origin.match(/http/) ){
document.getElementById("search_menubutton").style.display = "none";
}
/* Used by the Hide/Show button beside syntax diagrams, to toggle the */
function hideorshow(btn,obj){
var x = document.getElementById(obj);
var b = document.getElementById(btn);
if( x.style.display!='none' ){
x.style.display = 'none';
b.innerHTML='show';
}else{
x.style.display = '';
b.innerHTML='hide';
}
return false;
}
var antiRobot = 0;
function antiRobotGo(){
if( antiRobot!=3 ) return;
antiRobot = 7;
var j = document.getElementById("mtimelink");
if(j && j.hasAttribute("data-href")) j.href=j.getAttribute("data-href");
}
function antiRobotDefense(){
document.body.onmousedown=function(){
antiRobot |= 2;
antiRobotGo();
document.body.onmousedown=null;
}
document.body.onmousemove=function(){
antiRobot |= 2;
antiRobotGo();
document.body.onmousemove=null;
}
setTimeout(function(){
antiRobot |= 1;
antiRobotGo();
}, 100)
antiRobotGo();
}
antiRobotDefense();
</script>
<div class=fancy>
<div class=nosearch>
<div class="fancy_title">
Measuring and Reducing CPU Usage in SQLite
</div>
<div class="fancy_toc">
<a onclick="toggle_toc()">
<span class="fancy_toc_mark" id="toc_mk">►</span>
Table Of Contents
</a>
<div id="toc_sub"><div class="fancy-toc1"><a href="#overview">1. Overview</a></div>
<div class="fancy-toc1"><a href="#measuring_performance">2. Measuring Performance</a></div>
<div class="fancy-toc2"><a href="#compile_options">2.1. Compile Options</a></div>
<div class="fancy-toc2"><a href="#workload">2.2. Workload</a></div>
<div class="fancy-toc2"><a href="#performance_measurement">2.3. Performance Measurement</a></div>
<div class="fancy-toc2"><a href="#microoptimizations">2.4. Microoptimizations</a></div>
<div class="fancy-toc1"><a href="#performance_measurement_workflow">3. Performance Measurement Workflow</a></div>
<div class="fancy-toc1"><a href="#limitations">4. Limitations</a></div>
</div>
</div>
<script>
function toggle_toc(){
var sub = document.getElementById("toc_sub")
var mk = document.getElementById("toc_mk")
if( sub.style.display!="block" ){
sub.style.display = "block";
mk.innerHTML = "▼";
} else {
sub.style.display = "none";
mk.innerHTML = "►";
}
}
</script>
</div>
<h1 id="overview"><span>1. </span>Overview</h1>
<p>The graph below shows the number of CPU cycles used by SQLite on a
standard workload, for versions of SQLite going back about 10 years.
Recent versions of SQLite use about one third as many the CPU cycles
compared to older versions.
</p><p>
This article describes how the SQLite developers measure CPU usage,
what those measurements actually mean, and the techniques used by
SQLite developers on their continuing quest to further reduce the
CPU usage of the SQLite library.
</p>
<center>
<hr>
<div class="imgcontainer">
<img src="./images/cpu-usage.jpg"></div><br>
Measured using cachegrind on Ubuntu 16.04 on x64 with gcc 5.4.0 and -Os.<br>
<hr>
</center>
<h1 id="measuring_performance"><span>2. </span>Measuring Performance</h1>
<p>In brief, the CPU performance of SQLite is measured as follows:
</p><p></p><ol>
<li> Compile SQLite in an as-delivered configuration, without any special
telemetry or debugging options.
</li><li> Link SQLite against a test program that runs approximately 30,000
SQL statements representing a typical workload.
</li><li> Count the number of CPU cycles consumed using
<a href="http://valgrind.org/docs/manual/cg-manual.html">cachegrind</a>.
</li></ol>
<h2 id="compile_options"><span>2.1. </span>Compile Options</h2>
<p>For performance measurement, SQLite is compiled in approximately the same
way as it would be for use in production systems. The compile-time configuration
is "approximate" in the sense that every production use of SQLite is
different. Compile-time options used by one system are not necessarily
the same as those used by others. The key point is that options that
significantly impact the generated machine code are avoided. For example,
the -DSQLITE_DEBUG option is omitted because that option inserts thousands
of assert() statements in the middle of performance critical sections of the
SQLite library. The -pg option (on GCC) is omitted because it causes the
compiler to emit extra probabilistic performance measuring code which interferes
with actual performance measurements.
</p><p>
For performance measurements,
the -Os option is used (optimize for size) rather than -O2 because the
-O2 option creates so much code movement that it is difficult to associate
specific CPU instructions to C source code lines.
</p><h2 id="workload"><span>2.2. </span>Workload</h2>
<p>
The "typical" workload is generated by the
<a href="https://sqlite.org/src/file/test/speedtest1.c">speedtest1.c</a>
program in the canonical SQLite source tree. This program strives to
exercise the SQLite library in a way that is typical of real-world
applications. Of course, every application is different, and so
no test program can exactly mirror the behavior of all applications.
</p><p>
The speedtest1.c program is updated from time to time as the SQLite
developers' understanding of what constitutes "typical" usage evolves.
</p><p>
The
<a href="https://sqlite.org/src/file/tool/speed-check.sh">speed-check.sh</a> shell
script, also in the canonical source tree, is used to run the speedtest1.c
program. To replicate the performance measurements, collect the following
files into a single directory:
</p><ul>
<li> the "speed-check.sh" script,
</li><li> the "speedtest1.c" test program, and
</li><li> the <a href="amalgamation.html">SQLite amalgamation</a> source files "sqlite3.c" and
"sqlite3.h"
</li></ul>
<p>
Then run "sh speed-check.sh trunk".
</p><h2 id="performance_measurement"><span>2.3. </span>Performance Measurement</h2>
<p>
<a href="http://valgrind.org/docs/manual/cg-manual.html">Cachegrind</a> is used to
measure performance because it gives answers that are repeatable to
7 or more significant digits. In comparison, actual (wall-clock)
run times are scarcely repeatable beyond one significant digit.
<a name="microopt"></a>
</p><h2 id="microoptimizations"><span>2.4. </span>Microoptimizations</h2>
<p>
The high repeatability of cachegrind allows the SQLite developers to
implement and measure "microoptimizations". A microoptimization is
a change to the code that results in a very small performance increase.
Typical micro-optimizations reduce the number of CPU cycles by 0.1% or
0.05% or even less. Such improvements are impossible to measure with
real-world timings. But hundreds or thousands of microoptimizations
add up, resulting in measurable real-world performance gains.
</p><h1 id="performance_measurement_workflow"><span>3. </span>Performance Measurement Workflow</h1>
<p>
As SQLite developers edit the SQLite source code, they run the
<a href="https://sqlite.org/src/file/tool/speed-check.sh">speed-check.sh</a>
shell script to track the performance impact of changes. This
script compiles the speedtest1.c program, runs it under cachegrind,
processes the cachegrind output using the
<a href="https://sqlite.org/src/file/tool/cg_anno.tcl">cg_anno.tcl</a> TCL
script, then saves the results in a series of text files.
Typical output from the speed-check.sh script looks like this:
</p><blockquote><pre>
==8683==
==8683== I refs: <font color="red">1,060,925,768</font>
==8683== I1 misses: 23,731,246
==8683== LLi misses: 5,176
==8683== I1 miss rate: 2.24%
==8683== LLi miss rate: 0.00%
==8683==
==8683== D refs: 557,686,925 (361,828,925 rd + 195,858,000 wr)
==8683== D1 misses: 5,067,063 ( 3,544,278 rd + 1,522,785 wr)
==8683== LLd misses: 57,958 ( 16,067 rd + 41,891 wr)
==8683== D1 miss rate: 0.9% ( 1.0% + 0.8% )
==8683== LLd miss rate: 0.0% ( 0.0% + 0.0% )
==8683==
==8683== LL refs: 28,798,309 ( 27,275,524 rd + 1,522,785 wr)
==8683== LL misses: 63,134 ( 21,243 rd + 41,891 wr)
==8683== LL miss rate: 0.0% ( 0.0% + 0.0% )
text data bss dec hex filename
523044 8240 1976 <font color="red">533260</font> 8230c sqlite3.o
220507 1007870 7769352 sqlite3.c
</pre></blockquote>
<p>The important parts of the output (the parts that the developers pay
the most attention to) are shown in red.
Basically, the developers want to know the size of the compiled SQLite
library and how many CPU cycles were needed to run the performance test.
</p><p>The output from the
<a href="https://sqlite.org/src/file/tool/cg_anno.tcl">cg_anno.tcl</a> script
shows the number of CPU cycles spent on each line of code.
The report is approximately 80,000 lines long. The following is a brief
snippet taken from the middle of the report to show what it looks like:
</p><blockquote><pre>
. SQLITE_PRIVATE int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
. MemPage *pPage;
. assert( cursorOwnsBtShared(pCur) );
. assert( pRes!=0 );
. assert( *pRes==0 || *pRes==1 );
. assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
369,648 pCur->info.nSize = 0;
369,648 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
369,648 *pRes = 0;
739,296 if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur, pRes);
1,473,580 pPage = pCur->apPage[pCur->iPage];
1,841,975 if( (++pCur->aiIdx[pCur->iPage])>=pPage->nCell ){
4,340 pCur->aiIdx[pCur->iPage]--;
5,593 return btreeNext(pCur, pRes);
. }
728,110 if( pPage->leaf ){
. return SQLITE_OK;
. }else{
3,117 return moveToLeftmost(pCur);
. }
721,876 }
</pre></blockquote>
<p>
The numbers on the left are the CPU cycle counts for that line of code,
of course.
</p><p>
The cg_anno.tcl script removes extraneous details from the default
cachegrind annotation
output so that before-and-after reports can be compared using a
side-by-side diff to view specific details of how a
micro-optimization attempt affected performance.
</p><h1 id="limitations"><span>4. </span>Limitations</h1>
<p>The use of the standardized speedtest1.c workload and cachegrind has
enabled significant performance improvement.
However, it is important to recognize the limitations of this approach:
</p><ul>
<li><p>
Performance measurements are done with a single compiler (gcc 5.4.0),
optimization setting (-Os), and
on a single platform (Ubuntu 16.04 LTS on x64). The performance of
other compilers and processors may vary.
</p></li><li><p>
The speedtest1.c workload that is being measured tries to be representative
of a wide range of typical uses of SQLite. But every application is
different. The speedtest1.c workload might not be a good proxy for the
kinds of activities performed by some applications. The SQLite developers
are constantly working to improve the speedtest1.c program, to make it
a better proxy for actual SQLite usage. Community feedback is welcomed.
</p></li><li><p>
The cycle counts provided by cachegrind are a good proxy for actual
performance, but they are not 100% accurate.
</p></li><li><p>
Only CPU cycle counts are being measured here.
CPU cycle counts are a good proxy for energy consumption,
but do not necessary correlate well with real-world timings.
Time spent doing I/O is not reflected in the CPU cycle counts,
and I/O time predominates in many SQLite usage scenarios.
</p></li></ul>
<p align="center"><small><i>This page last modified on <a href="https://sqlite.org/docsrc/honeypot" id="mtimelink" data-href="https://sqlite.org/docsrc/finfo/pages/cpu.in?m=30de04a5bf">2022-01-08 05:02:57</a> UTC </small></i></p>
|