1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
|
<!DOCTYPE html>
<html><head>
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<link href="sqlite.css" rel="stylesheet">
<title>SQLite FTS3 and FTS4 Extensions</title>
<!-- path= -->
</head>
<body>
<div class=nosearch>
<a href="index.html">
<img class="logo" src="images/sqlite370_banner.gif" alt="SQLite" border="0">
</a>
<div><!-- IE hack to prevent disappearing logo --></div>
<div class="tagline desktoponly">
Small. Fast. Reliable.<br>Choose any three.
</div>
<div class="menu mainmenu">
<ul>
<li><a href="index.html">Home</a>
<li class='mobileonly'><a href="javascript:void(0)" onclick='toggle_div("submenu")'>Menu</a>
<li class='wideonly'><a href='about.html'>About</a>
<li class='desktoponly'><a href="docs.html">Documentation</a>
<li class='desktoponly'><a href="download.html">Download</a>
<li class='wideonly'><a href='copyright.html'>License</a>
<li class='desktoponly'><a href="support.html">Support</a>
<li class='desktoponly'><a href="prosupport.html">Purchase</a>
<li class='search' id='search_menubutton'>
<a href="javascript:void(0)" onclick='toggle_search()'>Search</a>
</ul>
</div>
<div class="menu submenu" id="submenu">
<ul>
<li><a href='about.html'>About</a>
<li><a href='docs.html'>Documentation</a>
<li><a href='download.html'>Download</a>
<li><a href='support.html'>Support</a>
<li><a href='prosupport.html'>Purchase</a>
</ul>
</div>
<div class="searchmenu" id="searchmenu">
<form method="GET" action="search">
<select name="s" id="searchtype">
<option value="d">Search Documentation</option>
<option value="c">Search Changelog</option>
</select>
<input type="text" name="q" id="searchbox" value="">
<input type="submit" value="Go">
</form>
</div>
</div>
<script>
function toggle_div(nm) {
var w = document.getElementById(nm);
if( w.style.display=="block" ){
w.style.display = "none";
}else{
w.style.display = "block";
}
}
function toggle_search() {
var w = document.getElementById("searchmenu");
if( w.style.display=="block" ){
w.style.display = "none";
} else {
w.style.display = "block";
setTimeout(function(){
document.getElementById("searchbox").focus()
}, 30);
}
}
function div_off(nm){document.getElementById(nm).style.display="none";}
window.onbeforeunload = function(e){div_off("submenu");}
/* Disable the Search feature if we are not operating from CGI, since */
/* Search is accomplished using CGI and will not work without it. */
if( !location.origin || !location.origin.match || !location.origin.match(/http/) ){
document.getElementById("search_menubutton").style.display = "none";
}
/* Used by the Hide/Show button beside syntax diagrams, to toggle the */
function hideorshow(btn,obj){
var x = document.getElementById(obj);
var b = document.getElementById(btn);
if( x.style.display!='none' ){
x.style.display = 'none';
b.innerHTML='show';
}else{
x.style.display = '';
b.innerHTML='hide';
}
return false;
}
var antiRobot = 0;
function antiRobotGo(){
if( antiRobot!=3 ) return;
antiRobot = 7;
var j = document.getElementById("mtimelink");
if(j && j.hasAttribute("data-href")) j.href=j.getAttribute("data-href");
}
function antiRobotDefense(){
document.body.onmousedown=function(){
antiRobot |= 2;
antiRobotGo();
document.body.onmousedown=null;
}
document.body.onmousemove=function(){
antiRobot |= 2;
antiRobotGo();
document.body.onmousemove=null;
}
setTimeout(function(){
antiRobot |= 1;
antiRobotGo();
}, 100)
antiRobotGo();
}
antiRobotDefense();
</script>
<div class=fancy>
<div class=nosearch>
<div class="fancy_title">
SQLite FTS3 and FTS4 Extensions
</div>
<div class="fancy_toc">
<a onclick="toggle_toc()">
<span class="fancy_toc_mark" id="toc_mk">►</span>
Table Of Contents
</a>
<div id="toc_sub"><div class="fancy-toc1"><a href="#introduction_to_fts3_and_fts4">1. Introduction to FTS3 and FTS4</a></div>
<div class="fancy-toc2"><a href="#differences_between_fts3_and_fts4">1.1. Differences between FTS3 and FTS4</a></div>
<div class="fancy-toc2"><a href="#creating_and_destroying_fts_tables">1.2. Creating and Destroying FTS Tables</a></div>
<div class="fancy-toc2"><a href="#populating_fts_tables">1.3. Populating FTS Tables</a></div>
<div class="fancy-toc2"><a href="#simple_fts_queries">1.4. Simple FTS Queries</a></div>
<div class="fancy-toc2"><a href="#summary">1.5. Summary</a></div>
<div class="fancy-toc1"><a href="#compiling_and_enabling_fts3_and_fts4">2. Compiling and Enabling FTS3 and FTS4</a></div>
<div class="fancy-toc1"><a href="#full_text_index_queries">3. Full-text Index Queries</a></div>
<div class="fancy-toc2"><a href="#_set_operations_using_the_enhanced_query_syntax">3.1.
Set Operations Using The Enhanced Query Syntax</a></div>
<div class="fancy-toc2"><a href="#set_operations_using_the_standard_query_syntax">3.2. Set Operations Using The Standard Query Syntax</a></div>
<div class="fancy-toc1"><a href="#auxiliary_functions_snippet_offsets_and_matchinfo">4. Auxiliary Functions - Snippet, Offsets and Matchinfo</a></div>
<div class="fancy-toc2"><a href="#the_offsets_function">4.1. The Offsets Function</a></div>
<div class="fancy-toc2"><a href="#the_snippet_function">4.2. The Snippet Function</a></div>
<div class="fancy-toc2"><a href="#matchinfo">4.3. The Matchinfo Function</a></div>
<div class="fancy-toc1"><a href="#fts4aux">5. Fts4aux - Direct Access to the Full-Text Index</a></div>
<div class="fancy-toc1"><a href="#fts4_options">6. FTS4 Options</a></div>
<div class="fancy-toc2"><a href="#the_compress_and_uncompress_options">6.1. The compress= and uncompress= options</a></div>
<div class="fancy-toc2"><a href="#the_content_option_">6.2. The content= option </a></div>
<div class="fancy-toc3"><a href="#_contentless_fts4_tables_">6.2.1. Contentless FTS4 Tables </a></div>
<div class="fancy-toc3"><a href="#_external_content_fts4_tables_">6.2.2. External Content FTS4 Tables </a></div>
<div class="fancy-toc2"><a href="#the_languageid_option">6.3. The languageid= option</a></div>
<div class="fancy-toc2"><a href="#the_matchinfo_option">6.4. The matchinfo= option</a></div>
<div class="fancy-toc2"><a href="#the_notindexed_option">6.5. The notindexed= option</a></div>
<div class="fancy-toc2"><a href="#the_prefix_option">6.6. The prefix= option</a></div>
<div class="fancy-toc1"><a href="#commands">7. Special Commands For FTS3 and FTS4</a></div>
<div class="fancy-toc2"><a href="#optimize">7.1. The "optimize" command</a></div>
<div class="fancy-toc2"><a href="#rebuild">7.2. The "rebuild" command</a></div>
<div class="fancy-toc2"><a href="#integcheck">7.3. The "integrity-check" command</a></div>
<div class="fancy-toc2"><a href="#mergecmd">7.4. The "merge=X,Y" command</a></div>
<div class="fancy-toc2"><a href="#automerge"">7.5. The "automerge=N" command</a></div>
<div class="fancy-toc1"><a href="#tokenizer">8. Tokenizers</a></div>
<div class="fancy-toc2"><a href="#custom_application_defined_tokenizers">8.1. Custom (Application Defined) Tokenizers</a></div>
<div class="fancy-toc2"><a href="#querying_tokenizers">8.2. Querying Tokenizers</a></div>
<div class="fancy-toc1"><a href="#data_structures">9. Data Structures</a></div>
<div class="fancy-toc2"><a href="#shadow_tables">9.1. Shadow Tables</a></div>
<div class="fancy-toc2"><a href="#variable_length_integer_varint_format">9.2. Variable Length Integer (varint) Format</a></div>
<div class="fancy-toc2"><a href="#segment_b_tree_format">9.3. Segment B-Tree Format</a></div>
<div class="fancy-toc3"><a href="#segment_b_tree_leaf_nodes">9.3.1. Segment B-Tree Leaf Nodes</a></div>
<div class="fancy-toc3"><a href="#segment_b_tree_interior_nodes">9.3.2. Segment B-Tree Interior Nodes</a></div>
<div class="fancy-toc2"><a href="#doclist_format">9.4. Doclist Format</a></div>
<div class="fancy-toc1"><a href="#limitations">10. Limitations</a></div>
<div class="fancy-toc2"><a href="#_utf_16_byte_order_mark_problem_">10.1. UTF-16 byte-order-mark problem </a></div>
<div class="fancy-toc1"><a href="#appendix_a">
Appendix A: Search Application Tips
</a></div>
</div>
</div>
<script>
function toggle_toc(){
var sub = document.getElementById("toc_sub")
var mk = document.getElementById("toc_mk")
if( sub.style.display!="block" ){
sub.style.display = "block";
mk.innerHTML = "▼";
} else {
sub.style.display = "none";
mk.innerHTML = "►";
}
}
</script>
</div>
<h2 id="overview" style="margin-left:1.0em" notoc="1"> Overview</h2>
<p>
FTS3 and FTS4 are SQLite virtual table modules that allows users to perform
full-text searches on a set of documents. The most common (and effective)
way to describe full-text searches is "what Google, Yahoo, and Bing do
with documents placed on the World Wide Web". Users input a term, or series
of terms, perhaps connected by a binary operator or grouped together into a
phrase, and the full-text query system finds the set of documents that best
matches those terms considering the operators and groupings the user has
specified. This article describes the deployment and usage of FTS3 and FTS4.
</p><p>
FTS1 and FTS2 are obsolete full-text search modules for SQLite. There are known
issues with these older modules and their use should be avoided.
Portions of the original FTS3 code were contributed to the SQLite project
by Scott Hess of <a href="http://www.google.com">Google</a>. It is now
developed and maintained as part of SQLite.
</p><h1 id="introduction_to_fts3_and_fts4"><span>1. </span>Introduction to FTS3 and FTS4</h1>
<p>
The FTS3 and FTS4 extension modules allows users to create special tables with a
built-in full-text index (hereafter "FTS tables"). The full-text index
allows the user to efficiently query the database for all rows that contain
one or more words (hereafter "tokens"), even if the table
contains many large documents.
</p><p>
For example, if each of the 517430 documents in the
"<a href="http://www.cs.cmu.edu/~enron/">Enron E-Mail Dataset</a>"
is inserted into both an FTS table and an ordinary SQLite table
created using the following SQL script:
</p><div class="codeblock"><pre>CREATE VIRTUAL TABLE enrondata1 USING fts3(content TEXT); /* FTS3 table */
CREATE TABLE enrondata2(content TEXT); /* Ordinary table */
</pre></div>
<p>
Then either of the two queries below may be executed to find the number of
documents in the database that contain the word "linux" (351). Using one
desktop PC hardware configuration, the query on the FTS3 table returns in
approximately 0.03 seconds, versus 22.5 for querying the ordinary table.
</p><div class="codeblock"><pre>SELECT count(*) FROM enrondata1 WHERE content MATCH 'linux'; /* 0.03 seconds */
SELECT count(*) FROM enrondata2 WHERE content LIKE '%linux%'; /* 22.5 seconds */
</pre></div>
<p>
Of course, the two queries above are not entirely equivalent. For example
the LIKE query matches rows that contain terms such as "linuxophobe"
or "EnterpriseLinux" (as it happens, the Enron E-Mail Dataset does not
actually contain any such terms), whereas the MATCH query on the FTS3 table
selects only those rows that contain "linux" as a discrete token. Both
searches are case-insensitive. The FTS3 table consumes around 2006 MB on
disk compared to just 1453 MB for the ordinary table. Using the same
hardware configuration used to perform the SELECT queries above, the FTS3
table took just under 31 minutes to populate, versus 25 for the ordinary
table.
</p><h2 id="differences_between_fts3_and_fts4"><span>1.1. </span>Differences between FTS3 and FTS4</h2>
<a name="fts4"></a>
<p>
FTS3 and FTS4 are nearly identical. They share most of their code in common,
and their interfaces are the same. The differences are:
</p><ul>
<li> <p>FTS4 contains query performance optimizations that may significantly
improve the performance of full-text queries that contain terms that are
very common (present in a large percentage of table rows).
</p></li><li> <p>FTS4 supports some additional options that may used with the <a href="fts3.html#matchinfo">matchinfo()</a>
function.
</p></li><li> <p>Because it stores extra information on disk in two new
<a href="fts3.html#*shadowtab">shadow tables</a> in order to support the performance
optimizations and extra matchinfo() options, FTS4 tables may consume more
disk space than the equivalent table created using FTS3. Usually the overhead
is 1-2% or less, but may be as high as 10% if the documents stored in the
FTS table are very small. The overhead may be reduced by specifying the
directive <a href="fts3.html#fts4matchinfo">"matchinfo=fts3"</a> as part of the FTS4 table
declaration, but this comes at the expense of sacrificing some of the
extra supported matchinfo() options.
</p></li><li> <p>FTS4 provides hooks (the compress and uncompress
<a href="fts3.html#fts4_options">options</a>) allowing data to be stored in a compressed
form, reducing disk usage and IO.
</p></li></ul>
<p>
FTS4 is an enhancement to FTS3.
FTS3 has been available since SQLite <a href="releaselog/3_5_0.html">version 3.5.0</a> (2007-09-04)
The enhancements for FTS4 were added with SQLite <a href="releaselog/3_7_4.html">version 3.7.4</a>
(2010-12-07).
</p><p>
Which module, FTS3 or FTS4, should you use in your application? FTS4 is
sometimes significantly faster than FTS3, even orders of magnitude faster
depending on the query, though in the common case the performance of the two
modules is similar. FTS4 also offers the enhanced <a href="fts3.html#matchinfo">matchinfo()</a> outputs which
can be useful in ranking the results of a <a href="fts3.html#full_text_index_queries">MATCH</a> operation. On the
other hand, in the absence of a <a href="fts3.html#fts4matchinfo">matchinfo=fts3</a> directive FTS4 requires a little
more disk space than FTS3, though only a percent of two in most cases.
</p><p>
For newer applications, FTS4 is recommended; though if compatibility with older
versions of SQLite is important, then FTS3 will usually serve just as well.
</p><h2 id="creating_and_destroying_fts_tables"><span>1.2. </span>Creating and Destroying FTS Tables</h2>
<p>
Like other virtual table types, new FTS tables are created using a
<a href="lang_createvtab.html">CREATE VIRTUAL TABLE</a> statement. The module name, which follows
the USING keyword, is either "fts3" or "fts4". The virtual table module arguments may
be left empty, in which case an FTS table with a single user-defined
column named "content" is created. Alternatively, the module arguments
may be passed a list of comma separated column names.
</p><p>
If column names are explicitly provided for the FTS table as part of
the CREATE VIRTUAL TABLE statement, then a datatype name may be optionally
specified for each column. This is pure syntactic sugar, the
supplied typenames are not used by FTS or the SQLite core for any
purpose. The same applies to any constraints specified along with an
FTS column name - they are parsed but not used or recorded by the system
in any way.
</p><div class="codeblock"><pre><i>-- Create an FTS table named "data" with one column - "content":</i>
CREATE VIRTUAL TABLE data USING fts3();
<i>-- Create an FTS table named "pages" with three columns:</i>
CREATE VIRTUAL TABLE pages USING fts4(title, keywords, body);
<i>-- Create an FTS table named "mail" with two columns. Datatypes
-- and column constraints are specified along with each column. These
-- are completely ignored by FTS and SQLite. </i>
CREATE VIRTUAL TABLE mail USING fts3(
subject VARCHAR(256) NOT NULL,
body TEXT CHECK(length(body)<10240)
);
</pre></div>
<p>
As well as a list of columns, the module arguments passed to a CREATE
VIRTUAL TABLE statement used to create an FTS table may be used to specify
a <a href="fts3.html#tokenizer">tokenizer</a>. This is done by specifying a string of the form
"tokenize=<tokenizer name> <tokenizer args>" in place of a column
name, where <tokenizer name> is the name of the tokenizer to use and
<tokenizer args> is an optional list of whitespace separated qualifiers
to pass to the tokenizer implementation. A tokenizer specification may be
placed anywhere in the column list, but at most one tokenizer declaration is
allowed for each CREATE VIRTUAL TABLE statement. <a href="fts3.html#tokenizer">See below</a> for a
detailed description of using (and, if necessary, implementing) a tokenizer.
</p><div class="codeblock"><pre><i>-- Create an FTS table named "papers" with two columns that uses</i>
<i>-- the tokenizer "porter".</i>
CREATE VIRTUAL TABLE papers USING fts3(author, document, tokenize=porter);
<i>-- Create an FTS table with a single column - "content" - that uses</i>
<i>-- the "simple" tokenizer.</i>
CREATE VIRTUAL TABLE data USING fts4(tokenize=simple);
<i>-- Create an FTS table with two columns that uses the "icu" tokenizer.</i>
<i>-- The qualifier "en_AU" is passed to the tokenizer implementation</i>
CREATE VIRTUAL TABLE names USING fts3(a, b, tokenize=icu en_AU);
</pre></div>
<p>
FTS tables may be dropped from the database using an ordinary <a href="lang_droptable.html">DROP TABLE</a>
statement. For example:
</p><div class="codeblock"><pre><i>-- Create, then immediately drop, an FTS4 table.</i>
CREATE VIRTUAL TABLE data USING fts4();
DROP TABLE data;
</pre></div>
<h2 id="populating_fts_tables"><span>1.3. </span>Populating FTS Tables</h2>
<p>
FTS tables are populated using <a href="lang_insert.html">INSERT</a>, <a href="lang_update.html">UPDATE</a> and <a href="lang_delete.html">DELETE</a>
statements in the same way as ordinary SQLite tables are.
</p><p>
As well as the columns named by the user (or the "content" column if no
module arguments were specified as part of the <a href="lang_createvtab.html">CREATE VIRTUAL TABLE</a>
statement), each FTS table has a "rowid" column. The rowid of an FTS
table behaves in the same way as the rowid column of an ordinary SQLite
table, except that the values stored in the rowid column of an FTS table
remain unchanged if the database is rebuilt using the <a href="lang_vacuum.html">VACUUM</a> command.
For FTS tables, "docid" is allowed as an alias along with the usual "rowid",
"oid" and "_oid_" identifiers. Attempting to insert or update a row with a
docid value that already exists in the table is an error, just as it would
be with an ordinary SQLite table.
</p><p>
There is one other subtle difference between "docid" and the normal SQLite
aliases for the rowid column. Normally, if an INSERT or UPDATE statement
assigns discrete values to two or more aliases of the rowid column, SQLite
writes the rightmost of such values specified in the INSERT or UPDATE
statement to the database. However, assigning a non-NULL value to both
the "docid" and one or more of the SQLite rowid aliases when inserting or
updating an FTS table is considered an error. See below for an example.
</p><div class="codeblock"><pre><i>-- Create an FTS table</i>
CREATE VIRTUAL TABLE pages USING fts4(title, body);
<i>-- Insert a row with a specific docid value.</i>
INSERT INTO pages(docid, title, body) VALUES(53, 'Home Page', 'SQLite is a software...');
<i>-- Insert a row and allow FTS to assign a docid value using the same algorithm as</i>
<i>-- SQLite uses for ordinary tables. In this case the new docid will be 54,</i>
<i>-- one greater than the largest docid currently present in the table.</i>
INSERT INTO pages(title, body) VALUES('Download', 'All SQLite source code...');
<i>-- Change the title of the row just inserted.</i>
UPDATE pages SET title = 'Download SQLite' WHERE rowid = 54;
<i>-- Delete the entire table contents.</i>
DELETE FROM pages;
<i>-- The following is an error. It is not possible to assign non-NULL values to both</i>
<i>-- the rowid and docid columns of an FTS table.</i>
INSERT INTO pages(rowid, docid, title, body) VALUES(1, 2, 'A title', 'A document body');
</pre></div>
<p>
To support full-text queries, FTS maintains an inverted index that maps
from each unique term or word that appears in the dataset to the locations
in which it appears within the table contents. For the curious, a
complete description of the <a href="fts3.html#data_structures">data structure</a> used to store
this index within the database file appears below. A feature of
this data structure is that at any time the database may contain not
one index b-tree, but several different b-trees that are incrementally
merged as rows are inserted, updated and deleted. This technique improves
performance when writing to an FTS table, but causes some overhead for
full-text queries that use the index. Evaluating the special <a href="fts3.html#*fts4optcmd">"optimize" command</a>,
an SQL statement of the
form "INSERT INTO <fts-table>(<fts-table>) VALUES('optimize')",
causes FTS to merge all existing index b-trees into a single large
b-tree containing the entire index. This can be an expensive operation,
but may speed up future queries.
</p><p>
For example, to optimize the full-text index for an FTS table named
"docs":
</p><div class="codeblock"><pre><i>-- Optimize the internal structure of FTS table "docs".</i>
INSERT INTO docs(docs) VALUES('optimize');
</pre></div>
<p>
The statement above may appear syntactically incorrect to some. Refer to
the section describing the <a href="fts3.html#simple_fts_queries">simple fts queries</a> for an explanation.
</p><p>
There is another, deprecated, method for invoking the optimize
operation using a SELECT statement. New code should use statements
similar to the INSERT above to optimize FTS structures.
</p><a name="simple_fts_queries"></a>
<h2 tags="simple fts queries" id="simple_fts_queries"><span>1.4. </span>Simple FTS Queries</h2>
<p>
As for all other SQLite tables, virtual or otherwise, data is retrieved
from FTS tables using a <a href="lang_select.html">SELECT</a> statement.
</p><p>
FTS tables can be queried efficiently using SELECT statements of two
different forms:
</p><ul>
<li><p>
<b>Query by rowid</b>. If the WHERE clause of the SELECT statement
contains a sub-clause of the form "rowid = ?", where ? is an SQL expression,
FTS is able to retrieve the requested row directly using the equivalent
of an SQLite <a href="lang_createtable.html#rowid">INTEGER PRIMARY KEY</a> index.
</p></li><li><p>
<b>Full-text query</b>. If the WHERE clause of the SELECT statement contains
a sub-clause of the form "<column> MATCH ?", FTS is able to use
the built-in full-text index to restrict the search to those documents
that match the full-text query string specified as the right-hand operand
of the MATCH clause.
</p></li></ul>
<p>
If neither of these two query strategies can be used, all
queries on FTS tables are implemented using a linear scan of the entire
table. If the table contains large amounts of data, this may be an
impractical approach (the first example on this page shows that a linear
scan of 1.5 GB of data takes around 30 seconds using a modern PC).
</p><div class="codeblock"><pre><i>-- The examples in this block assume the following FTS table:</i>
CREATE VIRTUAL TABLE mail USING fts3(subject, body);
SELECT * FROM mail WHERE rowid = 15; <i>-- Fast. Rowid lookup.</i>
SELECT * FROM mail WHERE body MATCH 'sqlite'; <i>-- Fast. Full-text query.</i>
SELECT * FROM mail WHERE mail MATCH 'search'; <i>-- Fast. Full-text query.</i>
SELECT * FROM mail WHERE rowid BETWEEN 15 AND 20; <i>-- Fast. Rowid lookup.</i>
SELECT * FROM mail WHERE subject = 'database'; <i>-- Slow. Linear scan.</i>
SELECT * FROM mail WHERE subject MATCH 'database'; <i>-- Fast. Full-text query.</i>
</pre></div>
<p>
In all of the full-text queries above, the right-hand operand of the MATCH
operator is a string consisting of a single term. In this case, the MATCH
expression evaluates to true for all documents that contain one or more
instances of the specified word ("sqlite", "search" or "database", depending
on which example you look at). Specifying a single term as the right-hand
operand of the MATCH operator results in the simplest and most common type
of full-text query possible. However more complicated queries are possible,
including phrase searches, term-prefix searches and searches for documents
containing combinations of terms occurring within a defined proximity of each
other. The various ways in which the full-text index may be queried are
<a href="fts3.html#full_text_index_queries">described below</a>.
</p><p>
Normally, full-text queries are case-insensitive. However, this
is dependent on the specific <a href="fts3.html#tokenizer">tokenizer</a> used by the FTS table
being queried. Refer to the section on <a href="fts3.html#tokenizer">tokenizers</a> for details.
</p><p>
The paragraph above notes that a MATCH operator with a simple term as the
right-hand operand evaluates to true for all documents that contain the
specified term. In this context, the "document" may refer to either the
data stored in a single column of a row of an FTS table, or to the contents
of all columns in a single row, depending on the identifier used as the
left-hand operand to the MATCH operator. If the identifier specified as
the left-hand operand of the MATCH operator is an FTS table column name,
then the document that the search term must be contained in is the value
stored in the specified column. However, if the identifier is the name
of the FTS <i>table</i> itself, then the MATCH operator evaluates to true
for each row of the FTS table for which any column contains the search
term. The following example demonstrates this:
</p><div class="codeblock"><pre><i>-- Example schema</i>
CREATE VIRTUAL TABLE mail USING fts3(subject, body);
<i>-- Example table population</i>
INSERT INTO mail(docid, subject, body) VALUES(1, 'software feedback', 'found it too slow');
INSERT INTO mail(docid, subject, body) VALUES(2, 'software feedback', 'no feedback');
INSERT INTO mail(docid, subject, body) VALUES(3, 'slow lunch order', 'was a software problem');
<i>-- Example queries</i>
SELECT * FROM mail WHERE subject MATCH 'software'; <i>-- Selects rows 1 and 2</i>
SELECT * FROM mail WHERE body MATCH 'feedback'; <i>-- Selects row 2</i>
SELECT * FROM mail WHERE mail MATCH 'software'; <i>-- Selects rows 1, 2 and 3</i>
SELECT * FROM mail WHERE mail MATCH 'slow'; <i>-- Selects rows 1 and 3</i>
</pre></div>
<p>
At first glance, the final two full-text queries in the example above seem
to be syntactically incorrect, as there is a table name ("mail") used as
an SQL expression. The reason this is acceptable is that each FTS table
actually has a <a href="c3ref/declare_vtab.html">HIDDEN</a> column with the same name
as the table itself (in this case, "mail"). The value stored in this
column is not meaningful to the application, but can be used as the
left-hand operand to a MATCH operator. This special column may also be
passed as an argument to the <a href="fts3.html#snippet">FTS auxiliary functions</a>.
</p><p>
The following example illustrates the above. The expressions "docs",
"docs.docs" and "main.docs.docs" all refer to column "docs". However, the
expression "main.docs" does not refer to any column. It could be used to
refer to a table, but a table name is not allowed in the context in which
it is used below.
</p><div class="codeblock"><pre><i>-- Example schema</i>
CREATE VIRTUAL TABLE docs USING fts4(content);
<i>-- Example queries</i>
SELECT * FROM docs WHERE docs MATCH 'sqlite'; <i>-- OK.</i>
SELECT * FROM docs WHERE docs.docs MATCH 'sqlite'; <i>-- OK.</i>
SELECT * FROM docs WHERE main.docs.docs MATCH 'sqlite'; <i>-- OK.</i>
SELECT * FROM docs WHERE main.docs MATCH 'sqlite'; <i>-- Error.</i>
</pre></div>
<h2 id="summary"><span>1.5. </span>Summary</h2>
<p>
From the users point of view, FTS tables are similar to ordinary SQLite
tables in many ways. Data may be added to, modified within and removed
from FTS tables using the INSERT, UPDATE and DELETE commands just as
it may be with ordinary tables. Similarly, the SELECT command may be used
to query data. The following list summarizes the differences between FTS
and ordinary tables:
</p><ol>
<li><p>
As with all virtual table types, it is not possible to create indices or
triggers attached to FTS tables. Nor is it possible to use the ALTER TABLE
command to add extra columns to FTS tables (although it is possible to use
ALTER TABLE to rename an FTS table).
</p></li><li><p>
Data-types specified as part of the "CREATE VIRTUAL TABLE" statement
used to create an FTS table are ignored completely. Instead of the
normal rules for applying type <a href="datatype3.html#affinity">affinity</a> to inserted values, all
values inserted into FTS table columns (except the special rowid
column) are converted to type TEXT before being stored.
</p></li><li><p>
FTS tables permit the special alias "docid" to be used to refer to the
rowid column supported by all <a href="vtab.html">virtual tables</a>.
</p></li><li><p>
The <a href="fts3.html#full_text_index_queries">FTS MATCH</a> operator is supported for queries based on the built-in
full-text index.
</p></li><li><p>
The <a href="fts3.html#snippet">FTS auxiliary functions</a>, <a href="fts3.html#snippet">snippet()</a>, <a href="fts3.html#offsets">offsets()</a>, and <a href="fts3.html#matchinfo">matchinfo()</a> are
available to support full-text queries.
</p></li><li><p>
<a name="hiddencol"></a>
Every FTS table has a <a href="vtab.html#hiddencol">hidden column</a> with the
same name as the table itself. The value contained in each row for the
hidden column is a blob that is only useful as the left operand of a
<a href="fts3.html#full_text_index_queries">MATCH</a> operator, or as the left-most argument to one
of the <a href="fts3.html#snippet">FTS auxiliary functions</a>.
</p></li></ol>
<a name="compiling_and_enabling_fts3_and_fts4"></a>
<h1 tags="compile fts" id="compiling_and_enabling_fts3_and_fts4"><span>2. </span>Compiling and Enabling FTS3 and FTS4</h1>
<p>
Although FTS3 and FTS4 are included with the SQLite core source code, they are not
enabled by default. To build SQLite with FTS functionality enabled, define
the preprocessor macro <a href="compile.html#enable_fts3">SQLITE_ENABLE_FTS3</a> when compiling. New applications
should also define the <a href="compile.html#enable_fts3_parenthesis">SQLITE_ENABLE_FTS3_PARENTHESIS</a> macro to enable the
<a href="fts3.html#_set_operations_using_the_enhanced_query_syntax">enhanced query syntax</a> (see below). Usually, this is done by adding the
following two switches to the compiler command line:
</p><div class="codeblock"><pre>-DSQLITE_ENABLE_FTS3
-DSQLITE_ENABLE_FTS3_PARENTHESIS
</pre></div>
<p>
Note that enabling FTS3 also makes FTS4 available. There is not a separate
SQLITE_ENABLE_FTS4 compile-time option. A build of SQLite either supports
both FTS3 and FTS4 or it supports neither.
</p><p>
If using the amalgamation autoconf based build system, setting the CPPFLAGS
environment variable while running the 'configure' script is an easy
way to set these macros. For example, the following command:
</p><div class="codeblock"><pre>CPPFLAGS="-DSQLITE_ENABLE_FTS3 -DSQLITE_ENABLE_FTS3_PARENTHESIS" ./configure <configure options>
</pre></div>
<p>
where <i><configure options></i> are those options normally passed to
the configure script, if any.
</p><p>
Because FTS3 and FTS4 are virtual tables, The <a href="compile.html#enable_fts3">SQLITE_ENABLE_FTS3</a> compile-time option
is incompatible with the <a href="compile.html#omit_virtualtable">SQLITE_OMIT_VIRTUALTABLE</a> option.
</p><p>
If a build of SQLite does not include the FTS modules, then any attempt to prepare an
SQL statement to create an FTS3 or FTS4 table or to drop or access an existing
FTS table in any way will fail. The error message returned will be similar
to "no such module: ftsN" (where N is either 3 or 4).
</p><p>
If the C version of the <a href="http://site.icu-project.org/">ICU library</a>
is available, then FTS may also be compiled with the SQLITE_ENABLE_ICU
pre-processor macro defined. Compiling with this macro enables an FTS
<a href="fts3.html#tokenizer">tokenizer</a> that uses the ICU library to split a document into terms
(words) using the conventions for a specified language and locale.
</p><div class="codeblock"><pre>-DSQLITE_ENABLE_ICU
</pre></div>
<a name="full_text_index_queries"></a>
<h1 tags="FTS MATCH" id="full_text_index_queries"><span>3. </span>Full-text Index Queries</h1>
<p>
The most useful thing about FTS tables is the queries that may be
performed using the built-in full-text index. Full-text queries are
performed by specifying a clause of the form
"<column> MATCH <full-text query expression>" as part of the WHERE
clause of a SELECT statement that reads data from an FTS table.
<a href="fts3.html#simple_fts_queries">Simple FTS queries</a> that return all documents that
contain a given term are described above. In that discussion the right-hand
operand of the MATCH operator was assumed to be a string consisting of a
single term. This section describes the more complex query types supported
by FTS tables, and how they may be utilized by specifying a more
complex query expression as the right-hand operand of a MATCH operator.
</p><p>
FTS tables support three basic query types:
</p><ul>
<a name="termprefix"></a>
<li><p><b>Token or token prefix queries</b>.
An FTS table may be queried for all documents that contain a specified
term (the <a href="fts3.html#simple_fts_queries">simple case</a> described above), or for
all documents that contain a term with a specified prefix. As we have
seen, the query expression for a specific term is simply the term itself.
The query expression used to search for a term prefix is the prefix
itself with a '*' character appended to it. For example:
</p></li></ul>
<div class="codeblock"><pre><i>-- Virtual table declaration</i>
CREATE VIRTUAL TABLE docs USING fts3(title, body);
<i>-- Query for all documents containing the term "linux":</i>
SELECT * FROM docs WHERE docs MATCH 'linux';
<i>-- Query for all documents containing a term with the prefix "lin". This will match</i>
<i>-- all documents that contain "linux", but also those that contain terms "linear",</i>
<i>--"linker", "linguistic" and so on.</i>
SELECT * FROM docs WHERE docs MATCH 'lin*';
</pre></div>
<ul>
<li style="list-style:none"><p>
Normally, a token or token prefix query is matched against the FTS table
column specified as the left-hand side of the MATCH operator. Or, if the
special column with the same name as the FTS table itself is specified,
against all columns. This may be overridden by specifying a column-name
followed by a ":" character before a basic term query. There may be space
between the ":" and the term to query for, but not between the column-name
and the ":" character. For example:
</p></li></ul>
<div class="codeblock"><pre><i>-- Query the database for documents for which the term "linux" appears in</i>
<i>-- the document title, and the term "problems" appears in either the title</i>
<i>-- or body of the document.</i>
SELECT * FROM docs WHERE docs MATCH 'title:linux problems';
<i>-- Query the database for documents for which the term "linux" appears in</i>
<i>-- the document title, and the term "driver" appears in the body of the document</i>
<i>-- ("driver" may also appear in the title, but this alone will not satisfy the</i>
<i>-- query criteria).</i>
SELECT * FROM docs WHERE body MATCH 'title:linux driver';
</pre></div>
<ul>
<li style="list-style:none"><p>
If the FTS table is an FTS4 table (not FTS3), a token may also be prefixed
with a "^" character. In this case, in order to match the token must
appear as the very first token in any column of the matching row. Examples:
</p></li></ul>
<div class="codeblock"><pre><i>-- All documents for which "linux" is the first token of at least one</i>
<i>-- column.</i>
SELECT * FROM docs WHERE docs MATCH '^linux';
<i>-- All documents for which the first token in column "title" begins with "lin".</i>
SELECT * FROM docs WHERE body MATCH 'title: ^lin*';
</pre></div>
<a name="phrase"></a>
<ul>
<li><p><b>Phrase queries</b>.
A phrase query is a query that retrieves all documents that contain a
nominated set of terms or term prefixes in a specified order with no
intervening tokens. Phrase queries are specified by enclosing a space
separated sequence of terms or term prefixes in double quotes (").
For example:
</p></li></ul>
<div class="codeblock"><pre><i>-- Query for all documents that contain the phrase "linux applications".</i>
SELECT * FROM docs WHERE docs MATCH '"linux applications"';
<i>-- Query for all documents that contain a phrase that matches "lin* app*". As well as</i>
<i>-- "linux applications", this will match common phrases such as "linoleum appliances"</i>
<i>-- or "link apprentice".</i>
SELECT * FROM docs WHERE docs MATCH '"lin* app*"';
</pre></div>
<a name="near"></a>
<ul>
<li><p><b>NEAR queries</b>.
A NEAR query is a query that returns documents that contain a two or
more nominated terms or phrases within a specified proximity of each
other (by default with 10 or less intervening terms). A NEAR query is
specified by putting the keyword "NEAR" between two phrase, token or
token prefix queries. To specify a proximity other than the default,
an operator of the form "NEAR/<i><N></i>" may be used, where
<i><N></i> is the maximum number of intervening terms allowed.
For example:
</p></li></ul>
<div class="codeblock"><pre><i>-- Virtual table declaration.</i>
CREATE VIRTUAL TABLE docs USING fts4();
<i>-- Virtual table data.</i>
INSERT INTO docs VALUES('SQLite is an ACID compliant embedded relational database management system');
<i>-- Search for a document that contains the terms "sqlite" and "database" with</i>
<i>-- not more than 10 intervening terms. This matches the only document in</i>
<i>-- table docs (since there are only six terms between "SQLite" and "database"</i>
<i>-- in the document)</i>.
SELECT * FROM docs WHERE docs MATCH 'sqlite NEAR database';
<i>-- Search for a document that contains the terms "sqlite" and "database" with</i>
<i>-- not more than 6 intervening terms. This also matches the only document in</i>
<i>-- table docs. Note that the order in which the terms appear in the document</i>
<i>-- does not have to be the same as the order in which they appear in the query.</i>
SELECT * FROM docs WHERE docs MATCH 'database NEAR/6 sqlite';
<i>-- Search for a document that contains the terms "sqlite" and "database" with</i>
<i>-- not more than 5 intervening terms. This query matches no documents.</i>
SELECT * FROM docs WHERE docs MATCH 'database NEAR/5 sqlite';
<i>-- Search for a document that contains the phrase "ACID compliant" and the term</i>
<i>-- "database" with not more than 2 terms separating the two. This matches the</i>
<i>-- document stored in table docs.</i>
SELECT * FROM docs WHERE docs MATCH 'database NEAR/2 "ACID compliant"';
<i>-- Search for a document that contains the phrase "ACID compliant" and the term</i>
<i>-- "sqlite" with not more than 2 terms separating the two. This also matches</i>
<i>-- the only document stored in table docs.</i>
SELECT * FROM docs WHERE docs MATCH '"ACID compliant" NEAR/2 sqlite';
</pre></div>
<ul>
<li style="list-style: none"><p>
More than one NEAR operator may appear in a single query. In this case each
pair of terms or phrases separated by a NEAR operator must appear within the
specified proximity of each other in the document. Using the same table and
data as in the block of examples above:
</p></li></ul>
<div class="codeblock"><pre>
<i>-- The following query selects documents that contains an instance of the term </i>
<i>-- "sqlite" separated by two or fewer terms from an instance of the term "acid",</i>
<i>-- which is in turn separated by two or fewer terms from an instance of the term</i>
<i>-- "relational".</i>
SELECT * FROM docs WHERE docs MATCH 'sqlite NEAR/2 acid NEAR/2 relational';
<i>-- This query matches no documents. There is an instance of the term "sqlite" with</i>
<i>-- sufficient proximity to an instance of "acid" but it is not sufficiently close</i>
<i>-- to an instance of the term "relational".</i>
SELECT * FROM docs WHERE docs MATCH 'acid NEAR/2 sqlite NEAR/2 relational';
</pre></div>
<p>
Phrase and NEAR queries may not span multiple columns within a row.
</p><p>
The three basic query types described above may be used to query the full-text
index for the set of documents that match the specified criteria. Using the
FTS query expression language it is possible to perform various set
operations on the results of basic queries. There are currently three
supported operations:
</p><ul>
<li> The AND operator determines the <b>intersection</b> of two sets of documents.
</li><li> The OR operator calculates the <b>union</b> of two sets of documents.
</li><li> The NOT operator (or, if using the standard syntax, a unary "-" operator)
may be used to compute the <b>relative complement</b> of one set of
documents with respect to another.
</li></ul>
<p>
The FTS modules may be compiled to use one of two slightly different versions
of the full-text query syntax, the "standard" query syntax and the "enhanced"
query syntax. The basic term, term-prefix, phrase and NEAR queries described
above are the same in both versions of the syntax. The way in which set
operations are specified is slightly different. The following two sub-sections
describe the part of the two query syntaxes that pertains to set operations.
Refer to the description of how to <a href="fts3.html#compiling_and_enabling_fts3_and_fts4">compile fts</a> for compilation notes.
</p><a name="_set_operations_using_the_enhanced_query_syntax"></a>
<h2 tags="enhanced query syntax" id="_set_operations_using_the_enhanced_query_syntax"><span>3.1. </span>
Set Operations Using The Enhanced Query Syntax</h2>
<p>
The enhanced query syntax supports the AND, OR and NOT binary set operators.
Each of the two operands to an operator may be a basic FTS query, or the
result of another AND, OR or NOT set operation. Operators must be entered
using capital letters. Otherwise, they are interpreted as basic term queries
instead of set operators.
</p><p>
The AND operator may be implicitly specified. If two basic queries appear
with no operator separating them in an FTS query string, the results are
the same as if the two basic queries were separated by an AND operator.
For example, the query expression "implicit operator" is a more succinct
version of "implicit AND operator".
</p><div class="codeblock"><pre><i>-- Virtual table declaration</i>
CREATE VIRTUAL TABLE docs USING fts3();
<i>-- Virtual table data</i>
INSERT INTO docs(docid, content) VALUES(1, 'a database is a software system');
INSERT INTO docs(docid, content) VALUES(2, 'sqlite is a software system');
INSERT INTO docs(docid, content) VALUES(3, 'sqlite is a database');
<i>-- Return the set of documents that contain the term "sqlite", and the</i>
<i>-- term "database". This query will return the document with docid 3 only.</i>
SELECT * FROM docs WHERE docs MATCH 'sqlite AND database';
<i>-- Again, return the set of documents that contain both "sqlite" and</i>
<i>-- "database". This time, use an implicit AND operator. Again, document</i>
<i>-- 3 is the only document matched by this query. </i>
SELECT * FROM docs WHERE docs MATCH 'database sqlite';
<i>-- Query for the set of documents that contains either "sqlite" or "database".</i>
<i>-- All three documents in the database are matched by this query.</i>
SELECT * FROM docs WHERE docs MATCH 'sqlite OR database';
<i>-- Query for all documents that contain the term "database", but do not contain</i>
<i>-- the term "sqlite". Document 1 is the only document that matches this criteria.</i>
SELECT * FROM docs WHERE docs MATCH 'database NOT sqlite';
<i>-- The following query matches no documents. Because "and" is in lowercase letters,</i>
<i>-- it is interpreted as a basic term query instead of an operator. Operators must</i>
<i>-- be specified using capital letters. In practice, this query will match any documents</i>
<i>-- that contain each of the three terms "database", "and" and "sqlite" at least once.</i>
<i>-- No documents in the example data above match this criteria.</i>
SELECT * FROM docs WHERE docs MATCH 'database and sqlite';
</pre></div>
<p>
The examples above all use basic full-text term queries as both operands of
the set operations demonstrated. Phrase and NEAR queries may also be used,
as may the results of other set operations. When more than one set operation
is present in an FTS query, the precedence of operators is as follows:
</p><table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
<tr style="text-align:left"><th>Operator</th><th>Enhanced Query Syntax Precedence
</th></tr><tr style="text-align:left;background-color:#DDDDDD"><td>NOT </td><td> Highest precedence (tightest grouping).
</td></tr><tr style="text-align:left"><td>AND </td><td>
</td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>OR </td><td> Lowest precedence (loosest grouping).
</td></tr></table>
<p>
When using the enhanced query syntax, parenthesis may be used to override
the default precedence of the various operators. For example:
</p><div class="codeblock"><pre><i>-- Return the docid values associated with all documents that contain the</i>
<i>-- two terms "sqlite" and "database", and/or contain the term "library".</i>
SELECT docid FROM docs WHERE docs MATCH 'sqlite AND database OR library';
<i>-- This query is equivalent to the above.</i>
SELECT docid FROM docs WHERE docs MATCH 'sqlite AND database'
UNION
SELECT docid FROM docs WHERE docs MATCH 'library';
<i>-- Query for the set of documents that contains the term "linux", and at least</i>
<i>-- one of the phrases "sqlite database" and "sqlite library".</i>
SELECT docid FROM docs WHERE docs MATCH '("sqlite database" OR "sqlite library") AND linux';
<i>-- This query is equivalent to the above.</i>
SELECT docid FROM docs WHERE docs MATCH 'linux'
INTERSECT
SELECT docid FROM (
SELECT docid FROM docs WHERE docs MATCH '"sqlite library"'
UNION
SELECT docid FROM docs WHERE docs MATCH '"sqlite database"'
);
</pre></div>
<h2 id="set_operations_using_the_standard_query_syntax"><span>3.2. </span>Set Operations Using The Standard Query Syntax</h2>
<p>
FTS query set operations using the standard query syntax are similar, but
not identical, to set operations with the enhanced query syntax. There
are four differences, as follows:
</p><ol>
<li value="1"><p> Only the implicit version of the AND operator is supported.
Specifying the string "AND" as part of a standard query syntax query is
interpreted as a term query for the set of documents containing the term
"and".
</p></li></ol>
<ol>
<li value="2"><p> Parenthesis are not supported.
</p></li></ol>
<ol>
<li value="3"><p> The NOT operator is not supported. Instead of the NOT
operator, the standard query syntax supports a unary "-" operator that
may be applied to basic term and term-prefix queries (but not to phrase
or NEAR queries). A term or term-prefix that has a unary "-" operator
attached to it may not appear as an operand to an OR operator. An FTS
query may not consist entirely of terms or term-prefix queries with unary
"-" operators attached to them.
</p></li></ol>
<div class="codeblock"><pre><i>-- Search for the set of documents that contain the term "sqlite" but do</i>
<i>-- not contain the term "database".</i>
SELECT * FROM docs WHERE docs MATCH 'sqlite -database';
</pre></div>
<ol>
<li value="4"><p> The relative precedence of the set operations is different.
In particular, using the standard query syntax the "OR" operator has a
higher precedence than "AND". The precedence of operators when using the
standard query syntax is:
</p></li></ol>
<table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
<tr style="text-align:left"><th>Operator</th><th>Standard Query Syntax Precedence
</th></tr><tr style="text-align:left;background-color:#DDDDDD"><td>Unary "-" </td><td> Highest precedence (tightest grouping).
</td></tr><tr style="text-align:left"><td>OR </td><td>
</td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>AND </td><td> Lowest precedence (loosest grouping).
</td></tr></table>
<ol><li style="list-style:none">
The following example illustrates precedence of operators using the standard
query syntax:
</li></ol>
<div class="codeblock"><pre><i>-- Search for documents that contain at least one of the terms "database"</i>
<i>-- and "sqlite", and also contain the term "library". Because of the differences</i>
<i>-- in operator precedences, this query would have a different interpretation using</i>
<i>-- the enhanced query syntax.</i>
SELECT * FROM docs WHERE docs MATCH 'sqlite OR database library';
</pre></div>
<a name="snippet"></a>
<h1 id="auxiliary_functions_snippet_offsets_and_matchinfo"><span>4. </span>Auxiliary Functions - Snippet, Offsets and Matchinfo</h1>
<p>
The FTS3 and FTS4 modules provide three special SQL scalar functions that may be useful
to the developers of full-text query systems: "snippet", "offsets" and
"matchinfo". The purpose of the "snippet" and "offsets" functions is to allow
the user to identify the location of queried terms in the returned documents.
The "matchinfo" function provides the user with metrics that may be useful
for filtering or sorting query results according to relevance.
</p><p>
The first argument to all three special SQL scalar functions
must be the <a href="fts3.html#hiddencol">FTS hidden column</a> of the FTS table that the function is
applied to. The <a href="fts3.html#hiddencol">FTS hidden column</a> is an automatically-generated column found on
all FTS tables that has the same name as the FTS table itself.
For example, given an FTS table named "mail":
</p><div class="codeblock"><pre>SELECT offsets(mail) FROM mail WHERE mail MATCH <full-text query expression>;
SELECT snippet(mail) FROM mail WHERE mail MATCH <full-text query expression>;
SELECT matchinfo(mail) FROM mail WHERE mail MATCH <full-text query expression>;
</pre></div>
<p>
The three auxiliary functions are only useful within a SELECT statement that
uses the FTS table's full-text index. If used within a SELECT that uses
the "query by rowid" or "linear scan" strategies, then the snippet and
offsets both return an empty string, and the matchinfo function returns
a blob value zero bytes in size.
</p><p id="matchable">
All three auxiliary functions extract a set of "matchable phrases" from
the FTS query expression to work with. The set of matchable phrases for
a given query consists of all phrases (including unquoted tokens and
token prefixes) in the expression except those that are prefixed with
a unary "-" operator (standard syntax) or are part of a sub-expression
that is used as the right-hand operand of a NOT operator.
</p><p>
With the following provisos, each series of tokens in the FTS table that
matches one of the matchable phrases in the query expression is known as a
"phrase match":
</p><ol>
<li> If a matchable phrase is part of a series of phrases connected by
NEAR operators in the FTS query expression, then each phrase match
must be sufficiently close to other phrase matches of the relevant
types to satisfy the NEAR condition.
</li><li> If the matchable phrase in the FTS query is restricted to matching
data in a specified FTS table column, then only phrase matches that
occur within that column are considered.
</li></ol>
<a name="offsets"></a>
<h2 id="the_offsets_function"><span>4.1. </span>The Offsets Function</h2>
<p>
For a SELECT query that uses the full-text index, the offsets() function
returns a text value containing a series of space-separated integers. For
each term in each <a href="#matchable">phrase match</a> of the current row,
there are four integers in the returned list. Each set of four integers is
interpreted as follows:
</p><table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
<tr style="text-align:left"><th>Integer </th><th>Interpretation
</th></tr><tr style="text-align:left;background-color:#DDDDDD"><td>0
</td><td>The column number that the term instance occurs in (0 for the
leftmost column of the FTS table, 1 for the next leftmost, etc.).
</td></tr><tr style="text-align:left"><td>1
</td><td>The term number of the matching term within the full-text query
expression. Terms within a query expression are numbered starting
from 0 in the order that they occur.
</td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>2
</td><td>The byte offset of the matching term within the column.
</td></tr><tr style="text-align:left"><td>3
</td><td>The size of the matching term in bytes.
</td></tr></table>
<p>
The following block contains examples that use the offsets function.
</p><div class="codeblock"><pre>CREATE VIRTUAL TABLE mail USING fts3(subject, body);
INSERT INTO mail VALUES('hello world', 'This message is a hello world message.');
INSERT INTO mail VALUES('urgent: serious', 'This mail is seen as a more serious mail');
<i>-- The following query returns a single row (as it matches only the first</i>
<i>-- entry in table "mail". The text returned by the offsets function is</i>
<i>-- "0 0 6 5 1 0 24 5".</i>
<i>--</i>
<i>-- The first set of four integers in the result indicate that column 0</i>
<i>-- contains an instance of term 0 ("world") at byte offset 6. The term instance</i>
<i>-- is 5 bytes in size. The second set of four integers shows that column 1</i>
<i>-- of the matched row contains an instance of term 0 ("world") at byte offset</i>
<i>-- 24. Again, the term instance is 5 bytes in size.</i>
SELECT offsets(mail) FROM mail WHERE mail MATCH 'world';
<i>-- The following query returns also matches only the first row in table "mail".</i>
<i>-- In this case the returned text is "1 0 5 7 1 0 30 7".</i>
SELECT offsets(mail) FROM mail WHERE mail MATCH 'message';
<i>-- The following query matches the second row in table "mail". It returns the</i>
<i>-- text "1 0 28 7 1 1 36 4". Only those occurrences of terms "serious" and "mail"</i>
<i>-- that are part of an instance of the phrase "serious mail" are identified; the</i>
<i>-- other occurrences of "serious" and "mail" are ignored.</i>
SELECT offsets(mail) FROM mail WHERE mail MATCH '"serious mail"';
</pre></div>
<a name="snippet"></a>
<h2 id="the_snippet_function"><span>4.2. </span>The Snippet Function</h2>
<p>
The snippet function is used to create formatted fragments of document text
for display as part of a full-text query results report. The snippet function
may be passed between one and six arguments, as follows:
</p><table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
<tr style="text-align:left"><th>Argument </th><th>Default Value </th><th>Description
</th></tr><tr style="text-align:left;background-color:#DDDDDD"><td>0 </td><td>N/A
</td><td> The first argument to the snippet function must always be the <a href="fts3.html#hiddencol">FTS hidden column</a>
of the FTS table being queried and from which the snippet is to be taken. The
<a href="fts3.html#hiddencol">FTS hidden column</a> is an automatically generated column with the same name as the
FTS table itself.
</td></tr><tr style="text-align:left"><td>1 </td><td>"<b>"
</td><td> The "start match" text.
</td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>2 </td><td>"</b>"
</td><td> The "end match" text.
</td></tr><tr style="text-align:left"><td>3 </td><td>"<b>...</b>"
</td><td> The "ellipses" text.
</td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>4 </td><td>-1
</td><td> The FTS table column number to extract the returned fragments of
text from. Columns are numbered from left to right starting with
zero. A negative value indicates that the text may be extracted
from any column.
</td></tr><tr style="text-align:left"><td>5 </td><td>-15
</td><td> The absolute value of this integer argument is used as the
(approximate) number of tokens to include in the returned text
value. The maximum allowable absolute value is 64. The value of
this argument is referred to as <i>N</i> in the discussion below.
</td></tr></table>
<p>
The snippet function first attempts to find a fragment of text consisting
of <i>|N|</i> tokens within the current row that contains at least one phrase
match for each matchable phrase matched somewhere in the current row,
where <i>|N|</i> is the absolute value of the sixth argument passed to the
snippet function. If the text stored in a single column contains less than
<i>|N|</i> tokens, then the entire column value is considered. Text fragments
may not span multiple columns.
</p><p>
If such a text fragment can be found, it is returned with the following
modifications:
</p><ul>
<li> If the text fragment does not begin at the start of a column value,
the "ellipses" text is prepended to it.
</li><li> If the text fragment does not finish at the end of a column value,
the "ellipses" text is appended to it.
</li><li> For each token in the text fragment that is part of a phrase match,
the "start match" text is inserted into the fragment before the token,
and the "end match" text is inserted immediately after it.
</li></ul>
<p>
If more than one such fragment can be found, then fragments that contain
a larger number of "extra" phrase matches are favored. The start of
the selected text fragment may be moved a few tokens forward or backward
to attempt to concentrate the phrase matches toward the center of the
fragment.
</p><p>
Assuming <i>N</i> is a positive value, if no fragments can be found that
contain a phrase match corresponding to each matchable phrase, the snippet
function attempts to find two fragments of approximately <i>N</i>/2 tokens
that between them contain at least one phrase match for each matchable phrase
matched by the current row. If this fails, attempts are made to find three
fragments of <i>N</i>/3 tokens each and finally four <i>N</i>/4 token
fragments. If a set of four fragments cannot be found that encompasses the
required phrase matches, the four fragments of <i>N</i>/4 tokens that provide
the best coverage are selected.
</p><p>
If <i>N</i> is a negative value, and no single fragment can be found
containing the required phrase matches, the snippet function searches
for two fragments of <i>|N|</i> tokens each, then three, then four. In
other words, if the specified value of <i>N</i> is negative, the sizes
of the fragments is not decreased if more than one fragment is required
to provide the desired phrase match coverage.
</p><p>
After the <i>M</i> fragments have been located, where <i>M</i> is between
two and four as described in the paragraphs above, they are joined together
in sorted order with the "ellipses" text separating them. The three
modifications enumerated earlier are performed on the text before it is
returned.
</p><div class="codeblock"><pre><b>Note: In this block of examples, newlines and whitespace characters have
been inserted into the document inserted into the FTS table, and the expected
results described in SQL comments. This is done to enhance readability only,
they would not be present in actual SQLite commands or output.</b>
<i>-- Create and populate an FTS table.</i>
CREATE VIRTUAL TABLE text USING fts4();
INSERT INTO text VALUES('
During 30 Nov-1 Dec, 2-3oC drops. Cool in the upper portion, minimum temperature 14-16oC
and cool elsewhere, minimum temperature 17-20oC. Cold to very cold on mountaintops,
minimum temperature 6-12oC. Northeasterly winds 15-30 km/hr. After that, temperature
increases. Northeasterly winds 15-30 km/hr.
');
<i>-- The following query returns the text value:</i>
<i>--</i>
<i>-- "<b>...</b>cool elsewhere, minimum temperature 17-20oC. <b>Cold</b> to very </i>
<i>-- <b>cold</b> on mountaintops, minimum temperature 6<b>...</b>".</i>
<i>--</i>
SELECT snippet(text) FROM text WHERE text MATCH 'cold';
<i>-- The following query returns the text value:</i>
<i>--</i>
<i>-- "...the upper portion, [minimum] [temperature] 14-16oC and cool elsewhere,</i>
<i>-- [minimum] [temperature] 17-20oC. Cold..."</i>
<i>--</i>
SELECT snippet(text, '[', ']', '...') FROM text WHERE text MATCH '"min* tem*"'
</pre></div>
<a name="matchinfo"></a>
<h2 id="matchinfo" tags="matchinfo"><span>4.3. </span>The Matchinfo Function</h2>
<p>
The matchinfo function returns a blob value. If it is used within a query
that does not use the full-text index (a "query by rowid" or "linear scan"),
then the blob is zero bytes in size. Otherwise, the blob consists of zero
or more 32-bit unsigned integers in machine byte-order. The exact number
of integers in the returned array depends on both the query and the value
of the second argument (if any) passed to the matchinfo function.
</p><p>
The matchinfo function is called with either one or two arguments. As for
all auxiliary functions, the first argument must be the special
<a href="fts3.html#hiddencol">FTS hidden column</a>. The second argument, if it is specified, must be a text value
comprised only of the characters 'p', 'c', 'n', 'a', 'l', 's', 'x', 'y' and 'b'.
If no second argument is explicitly supplied, it defaults to "pcx". The
second argument is referred to as the "format string" below.
</p><p>
Characters in the matchinfo format string are processed from left to right.
Each character in the format string causes one or more 32-bit unsigned
integer values to be added to the returned array. The "values" column in
the following table contains the number of integer values appended to the
output buffer for each supported format string character. In the formula
given, <i>cols</i> is the number of columns in the FTS table, and
<i>phrases</i> is the number of <a href="#matchable">matchable phrases</a> in
the query.
</p><table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
<tr style="text-align:left"><th>Character</th><th>Values</th><th>Description
</th></tr><tr style="text-align:left;background-color:#DDDDDD"><td>p </td><td>1 </td><td>The number of matchable phrases in the query.
</td></tr><tr style="text-align:left"><td>c </td><td>1 </td><td>The number of user defined columns in the FTS
table (i.e. not including the docid or the <a href="fts3.html#hiddencol">FTS hidden column</a>).
</td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>x </td><td style="white-space:nowrap">3 * <i>cols</i> * <i>phrases</i>
</td><td><a name="matchinfo-x"></a>
For each distinct combination of a phrase and table column, the
following three values:
<ul>
<li> In the current row, the number of times the phrase appears in
the column.
</li><li> The total number of times the phrase appears in the column in
all rows in the FTS table.
</li><li> The total number of rows in the FTS table for which the
column contains at least one instance of the phrase.
</li></ul>
The first set of three values corresponds to the left-most column
of the table (column 0) and the left-most matchable phrase in the
query (phrase 0). If the table has more than one column, the second
set of three values in the output array correspond to phrase 0 and
column 1. Followed by phrase 0, column 2 and so on for all columns of
the table. And so on for phrase 1, column 0, then phrase 1, column 1
etc. In other words, the data for occurrences of phrase <i>p</i> in
column <i>c</i> may be found using the following formula:
<pre>
hits_this_row = array[3 * (c + p*cols) + 0]
hits_all_rows = array[3 * (c + p*cols) + 1]
docs_with_hits = array[3 * (c + p*cols) + 2]
</pre>
</td></tr><tr style="text-align:left"><td>y</td><td style="white-space:nowrap"><i>cols</i> * <i>phrases</i>
</td><td><a name="matchinfo-y"></a>
For each distinct combination of a phrase and table column, the
number of usable phrase matches that appear in the column. This is
usually identical to the first value in each set of three returned by the
<a href="fts3.html#matchinfo-x">matchinfo 'x' flag</a>. However, the number of hits reported by the
'y' flag is zero for any phrase that is part of a sub-expression
that does not match the current row. This makes a difference for
expressions that contain AND operators that are descendants of OR
operators. For example, consider the expression:
<pre>
a OR (b AND c)
</pre>
and the document:
<pre>
"a c d"
</pre>
The <a href="fts3.html#matchinfo-x">matchinfo 'x' flag</a> would report a single hit for the phrases "a" and "c".
However, the 'y' directive reports the number of hits for "c" as zero, as
it is part of a sub-expression that does not match the document - (b AND c).
For queries that do not contain AND operators descended from OR
operators, the result values returned by 'y' are always the same as
those returned by 'x'.
<p style="margin-left:0;margin-right:0">
The first value in the array of integer values corresponds to the
leftmost column of the table (column 0) and the first phrase in the query
(phrase 0). The values corresponding to other column/phrase combinations
may be located using the following formula:
</p><pre>
hits_for_phrase_p_column_c = array[c + p*cols]
</pre>
For queries that use OR expressions, or those that use LIMIT or return
many rows, the 'y' matchinfo option may be faster than 'x'.
</td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>b</td><td style="white-space:nowrap"><i>((cols+31)/32)</i> * <i>phrases</i>
</td><td><a name="matchinfo-b"></a>
The matchinfo 'b' flag provides similar information to the
<a href="fts3.html#matchinfo-y">matchinfo 'y' flag</a>, but in a more
compact form. Instead of the precise number of hits, 'b' provides a single
boolean flag for each phrase/column combination. If the phrase is present in
the column at least once (i.e. if the corresponding integer output of 'y' would
be non-zero), the corresponding flag is set. Otherwise cleared.
<p style="margin-left:0;margin-right:0">
If the table has 32 or fewer columns, a single unsigned integer is output for
each phrase in the query. The least significant bit of the integer is set if the
phrase appears at least once in column 0. The second least significant bit is
set if the phrase appears once or more in column 1. And so on.
</p><p style="margin-left:0;margin-right:0">
If the table has more than 32 columns, an extra integer is added to the output
of each phrase for each extra 32 columns or part thereof. Integers
corresponding to the same phrase are clumped together. For example, if a table
with 45 columns is queried for two phrases, 4 integers are output. The first
corresponds to phrase 0 and columns 0-31 of the table. The second integer
contains data for phrase 0 and columns 32-44, and so on.
</p><p style="margin-left:0;margin-right:0">
For example, if nCol is the number of columns in the table, to determine if
phrase p is present in column c:
</p><pre>
p_is_in_c = array[p * ((nCol+31)/32)] & (1 << (c % 32))
</pre>
</td></tr><tr style="text-align:left"><td>n </td><td>1 </td><td>The number of rows in the FTS4 table. This value is
only available when querying FTS4 tables, not FTS3.
</td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>a </td><td><i>cols</i> </td><td>For each column, the average number of
tokens in the text values stored in the column (considering all rows in
the FTS4 table). This value is only available when querying FTS4 tables,
not FTS3.
</td></tr><tr style="text-align:left"><td>l </td><td><i>cols</i> </td><td>
For each column, the length of the value stored in the current row of the
FTS4 table, in tokens. This value is only available when querying
FTS4 tables, not FTS3. And only if the "matchinfo=fts3" directive was not
specified as part of the "CREATE VIRTUAL TABLE" statement used to create
the FTS4 table.
</td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>s </td><td><i>cols</i> </td><td>For each column, the length of the longest
subsequence of phrase matches that the column value has in common
with the query text. For example, if a table column contains the text
'a b c d e' and the query is 'a c "d e"', then the length of the longest
common subsequence is 2 (phrase "c" followed by phrase "d e").
</td></tr></table>
<p>
For example:
</p><div class="codeblock"><pre><i>-- Create and populate an FTS4 table with two columns:</i>
CREATE VIRTUAL TABLE t1 USING fts4(a, b);
INSERT INTO t1 VALUES('transaction default models default', 'Non transaction reads');
INSERT INTO t1 VALUES('the default transaction', 'these semantics present');
INSERT INTO t1 VALUES('single request', 'default data');
<i>-- In the following query, no format string is specified and so it defaults</i>
<i>-- to "pcx". It therefore returns a single row consisting of a single blob</i>
<i>-- value 80 bytes in size (20 32-bit integers - 1 for "p", 1 for "c" and</i>
<i>-- 3*2*3 for "x"). If each block of 4 bytes in</i> the blob is interpreted
<i>-- as an unsigned integer in machine byte-order, the values will be:</i>
<i>--</i>
<i>-- 3 2 1 3 2 0 1 1 1 2 2 0 1 1 0 0 0 1 1 1</i>
<i>--</i>
<i>-- The row returned corresponds to the second entry inserted into table t1.</i>
<i>-- The first two integers in the blob show that the query contained three</i>
<i>-- phrases and the table being queried has two columns. The next block of</i>
<i>-- three integers describes column 0 (in this case column "a") and phrase</i>
<i>-- 0 (in this case "default"). The current row contains 1 hit for "default"</i>
<i>-- in column 0, of a total of 3 hits for "default" that occur in column</i>
<i>-- 0 of any table row. The 3 hits are spread across 2 different rows.</i>
<i>--</i>
<i>-- The next set of three integers (0 1 1) pertain to the hits for "default"</i>
<i>-- in column 1 of the table (0 in this row, 1 in all rows, spread across </i>
<i>-- 1 rows).</i>
<i>--</i>
SELECT matchinfo(t1) FROM t1 WHERE t1 MATCH 'default transaction "these semantics"';
<i>-- The format string for this query is "ns". The output array will therefore</i>
<i>-- contain 3 integer values - 1 for "n" and 2 for "s". The query returns</i>
<i>-- two rows (the first two rows in the table match). The values returned are:</i>
<i>--</i>
<i>-- 3 1 1</i>
<i>-- 3 2 0</i>
<i>--</i>
<i>-- The first value in the matchinfo array returned for both rows is 3 (the </i>
<i>-- number of rows in the table). The following two values are the lengths </i>
<i>-- of the longest common subsequence of phrase matches in each column.</i>
SELECT matchinfo(t1, 'ns') FROM t1 WHERE t1 MATCH 'default transaction';
</pre></div>
<p>
The matchinfo function is much faster than either the snippet or offsets
functions. This is because the implementation of both snippet and offsets
is required to retrieve the documents being analyzed from disk, whereas
all data required by matchinfo is available as part of the same portions
of the full-text index that are required to implement the full-text query
itself. This means that of the following two queries, the first may be
an order of magnitude faster than the second:
</p><div class="codeblock"><pre>SELECT docid, matchinfo(tbl) FROM tbl WHERE tbl MATCH <query expression>;
SELECT docid, offsets(tbl) FROM tbl WHERE tbl MATCH <query expression>;
</pre></div>
<p>
The matchinfo function provides all the information required to calculate
probabilistic "bag-of-words" relevancy scores such as
<a href="http://en.wikipedia.org/wiki/Okapi_BM25">Okapi BM25/BM25F</a> that may
be used to order results in a full-text search application. Appendix A of this
document, "<a href="fts3.html#appendix_a">search application tips</a>", contains an example of using the
matchinfo() function efficiently.
</p><a name="fts4aux"></a>
<h1 id="fts4aux" tags="fts4aux"><span>5. </span>Fts4aux - Direct Access to the Full-Text Index</h1>
<p>
As of <a href="releaselog/3_7_6.html">version 3.7.6</a> (2011-04-12),
SQLite includes a new virtual table module called
"fts4aux", which can be used to inspect the full-text index of an existing
FTS table directly. Despite its name, fts4aux works just as well with FTS3
tables as it does with FTS4 tables. Fts4aux tables are read-only. The only
way to modify the contents of an fts4aux table is by modifying the
contents of the associated FTS table. The fts4aux module is automatically
included in all <a href="fts3.html#compiling_and_enabling_fts3_and_fts4">builds that include FTS</a>.
</p><p>
An fts4aux virtual table is constructed with one or two arguments. When
used with a single argument, that argument is the unqualified name of the
FTS table that it will be used to access. To access a table in a different
database (for example, to create a TEMP fts4aux table that will access an
FTS3 table in the MAIN database) use the two-argument form and give the
name of the target database (ex: "main") in the first argument and the name
of the FTS3/4 table as the second argument. (The two-argument form of
fts4aux was added for SQLite <a href="releaselog/3_7_17.html">version 3.7.17</a> (2013-05-20)
and will throw an error in prior releases.)
For example:
</p><div class="codeblock"><pre><i>-- Create an FTS4 table</i>
CREATE VIRTUAL TABLE ft USING fts4(x, y);
<i>-- Create an fts4aux table to access the full-text index for table "ft"</i>
CREATE VIRTUAL TABLE ft_terms USING fts4aux(ft);
<i>-- Create a TEMP fts4aux table accessing the "ft" table in "main"</i>
CREATE VIRTUAL TABLE temp.ft_terms_2 USING fts4aux(main,ft);
</pre></div>
<p>
For each term present in the FTS table, there are between 2 and N+1 rows
in the fts4aux table, where N is the number of user-defined columns in
the associated FTS table. An fts4aux table always has the same four columns,
as follows, from left to right:
</p><table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
<tr style="text-align:left"><th>Column Name</th><th>Column Contents
</th></tr><tr style="text-align:left;background-color:#DDDDDD"><td>term</td><td>
Contains the text of the term for this row.
</td></tr><tr style="text-align:left"><td>col</td><td>
This column may contain either the text value '*' (i.e. a single
character, U+002a) or an integer between 0 and N-1, where N is
again the number of user-defined columns in the corresponding FTS table.
</td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>documents</td><td>
This column always contains an integer value greater than zero.
<br><br>
If the "col" column contains the value '*', then this column
contains the number of rows of the FTS table that contain at least one
instance of the term (in any column). If col contains an integer
value, then this column contains the number of rows of the FTS table that
contain at least one instance of the term in the column identified by
the col value. As usual, the columns of the FTS table are numbered
from left to right, starting with zero.
</td></tr><tr style="text-align:left"><td>occurrences</td><td>
This column also always contains an integer value greater than zero.
<br><br>
If the "col" column contains the value '*', then this column
contains the total number of instances of the term in all rows of the
FTS table (in any column). Otherwise, if col contains an integer
value, then this column contains the total number of instances of the
term that appear in the FTS table column identified by the col
value.
</td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>languageid <i>(hidden)</i></td><td>
<a name="f4alid"></a>
This column determines which <a href="fts3.html#*fts4languageid">languageid</a> is used to
extract vocabulary from the FTS3/4 table.
<br><br>
The default value for languageid is 0. If an alternative language
is specified in WHERE clause constraints, then that alternative is
used instead of 0. There can only be a single languageid per query.
In other words, the WHERE clause cannot contain a range constraint
or IN operator on the languageid.
</td></tr></table>
<p>
For example, using the tables created above:
</p><div class="codeblock"><pre>INSERT INTO ft(x, y) VALUES('Apple banana', 'Cherry');
INSERT INTO ft(x, y) VALUES('Banana Date Date', 'cherry');
INSERT INTO ft(x, y) VALUES('Cherry Elderberry', 'Elderberry');
<i>-- The following query returns this data:</i>
<i>--</i>
<i>-- apple | * | 1 | 1</i>
<i>-- apple | 0 | 1 | 1</i>
<i>-- banana | * | 2 | 2</i>
<i>-- banana | 0 | 2 | 2</i>
<i>-- cherry | * | 3 | 3</i>
<i>-- cherry | 0 | 1 | 1</i>
<i>-- cherry | 1 | 2 | 2</i>
<i>-- date | * | 1 | 2</i>
<i>-- date | 0 | 1 | 2</i>
<i>-- elderberry | * | 1 | 2</i>
<i>-- elderberry | 0 | 1 | 1</i>
<i>-- elderberry | 1 | 1 | 1</i>
<i>--</i>
SELECT term, col, documents, occurrences FROM ft_terms;
</pre></div>
<p>
In the example, the values in the "term" column are all lower case,
even though they were inserted into table "ft" in mixed case. This is because
an fts4aux table contains the terms as extracted from the document text
by the <a href="fts3.html#tokenizer">tokenizer</a>. In this case, since table "ft" uses the
<a href="fts3.html#tokenizer">simple tokenizer</a>, this means all terms have been folded to
lower case. Also, there is (for example) no row with column "term"
set to "apple" and column "col" set to 1. Since there are no instances
of the term "apple" in column 1, no row is present in the fts4aux table.
</p><p>
During a transaction, some of the data written to an FTS table may be
cached in memory and written to the database only when the transaction is
committed. However the implementation of the fts4aux module is only able
to read data from the database. In practice this means that if an fts4aux
table is queried from within a transaction in which the associated
FTS table has been modified, the results of the query are likely to reflect
only a (possibly empty) subset of the changes made.
</p><a name="fts4_options"></a>
<h1 id="fts4_options" tags="FTS4 options"><span>6. </span>FTS4 Options</h1>
<p>
If the "CREATE VIRTUAL TABLE" statement specifies module FTS4 (not FTS3),
then special directives - FTS4 options - similar to the "tokenize=*" option
may also appear in place of column names. An FTS4 option consists of the
option name, followed by an "=" character, followed by the option value.
The option value may optionally be enclosed in single or double quotes, with
embedded quote characters escaped in the same way as for SQL literals. There
may not be whitespace on either side of the "=" character. For example,
to create an FTS4 table with the value of option "matchinfo" set to "fts3":
</p><div class="codeblock"><pre><i>-- Create a reduced-footprint FTS4 table.</i>
CREATE VIRTUAL TABLE papers USING fts4(author, document, matchinfo=fts3);
</pre></div>
<p>
FTS4 currently supports the following options:
</p><table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
<tr style="text-align:left"><th>Option</th><th>Interpretation
</th></tr><tr style="text-align:left;background-color:#DDDDDD"><td>compress</td><td>
The compress option is used to specify the compress function. It is an error to
specify a compress function without also specifying an uncompress
function. <a href="fts3.html#the_compress_and_uncompress_options">See below</a> for details.
</td></tr><tr style="text-align:left"><td>content</td><td>
The content allows the text being indexed to be
stored in a separate table distinct from the FTS4 table,
or even outside of SQLite.
</td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>languageid</td><td>
The languageid option causes the FTS4 table to have an additional hidden
integer column that identifies the language of the text contained in
each row. The use of the languageid option allows the same FTS4 table
to hold text in multiple languages or scripts, each with different tokenizer
rules, and to query each language independently of the others.
</td></tr><tr style="text-align:left"><td>matchinfo</td><td>
When set to the value "fts3", the matchinfo option reduces the amount of
information stored by FTS4 with the consequence that the "l" option of
<a href="fts3.html#matchinfo">matchinfo()</a> is no longer available.
</td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>notindexed</td><td>
This option is used to specify the name of a column for which data is
not indexed. Values stored in columns that are not indexed are not
matched by MATCH queries. Nor are they recognized by auxiliary functions.
A single CREATE VIRTUAL TABLE statement may have any number of notindexed
options.
</td></tr><tr style="text-align:left"><td>order</td><td>
<a name="fts4order"></a>
The "order" option may be set to either "DESC" or "ASC" (in upper or
lower case). If it is set to "DESC", then FTS4 stores its data in such
a way as to optimize returning results in descending order by docid.
If it is set to "ASC" (the default), then the data structures are
optimized for returning results in ascending order by docid. In other
words, if many of the queries run against the FTS4 table use "ORDER BY
docid DESC", then it may improve performance to add the "order=desc"
option to the CREATE VIRTUAL TABLE statement.
</td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>prefix</td><td>
This option may be set to a comma-separated list of positive non-zero
integers. For each integer N in the list, a separate index is created
in the database file to optimize <a href="fts3.html#termprefix">prefix queries</a> where
the query term is N bytes in length, not including the '*' character,
when encoded using UTF-8. <a href="fts3.html#the_prefix_option">See below</a> for details.
</td></tr><tr style="text-align:left"><td>uncompress</td><td>
This option is used to specify the uncompress function. It is an error to
specify an uncompress function without also specifying a compress
function. <a href="fts3.html#the_compress_and_uncompress_options">See below</a> for details.
</td></tr></table>
<p>
When using FTS4, specifying a column name that contains an "=" character
and is not either a "tokenize=*" specification or a recognized FTS4 option
is an error. With FTS3, the first token in the unrecognized directive is
interpreted as a column name. Similarly, specifying multiple "tokenize=*"
directives in a single table declaration is an error when using FTS4, whereas
the second and subsequent "tokenize=*" directives are interpreted as column
names by FTS3. For example:
</p><div class="codeblock"><pre><i>-- An error. FTS4 does not recognize the directive "xyz=abc".</i>
CREATE VIRTUAL TABLE papers USING fts4(author, document, xyz=abc);
<i>-- Create an FTS3 table with three columns - "author", "document"</i>
<i>-- and "xyz".</i>
CREATE VIRTUAL TABLE papers USING fts3(author, document, xyz=abc);
<i>-- An error. FTS4 does not allow multiple tokenize=* directives</i>
CREATE VIRTUAL TABLE papers USING fts4(tokenize=porter, tokenize=simple);
<i>-- Create an FTS3 table with a single column named "tokenize". The</i>
<i>-- table uses the "porter" tokenizer.</i>
CREATE VIRTUAL TABLE papers USING fts3(tokenize=porter, tokenize=simple);
<i>-- An error. Cannot create a table with two columns named "tokenize".</i>
CREATE VIRTUAL TABLE papers USING fts3(tokenize=porter, tokenize=simple, tokenize=icu);
</pre></div>
<a name="*fts4compression"></a>
<a name="the_compress_and_uncompress_options"></a>
<h2 tags="fts4 compress option" id="the_compress_and_uncompress_options"><span>6.1. </span>The compress= and uncompress= options</h2>
<p>
The compress and uncompress options allow FTS4 content to be stored in
the database in a compressed form. Both options should be set to the name
of an SQL scalar function registered using <a href="c3ref/create_function.html">sqlite3_create_function()</a>
that accepts a single argument.
</p><p>
The compress function should return a compressed version of the value
passed to it as an argument. Each time data is written to the FTS4 table,
each column value is passed to the compress function and the result value
stored in the database. The compress function may return any type of SQLite
value (blob, text, real, integer or null).
</p><p>
The uncompress function should uncompress data previously compressed by
the compress function. In other words, for all SQLite values X, it should
be true that uncompress(compress(X)) equals X. When data that has been
compressed by the compress function is read from the database by FTS4, it
is passed to the uncompress function before it is used.
</p><p>
If the specified compress or uncompress functions do not exist, the table
may still be created. An error is not returned until the FTS4 table is
read (if the uncompress function does not exist) or written (if it is the
compress function that does not exist).
</p><div class="codeblock"><pre><i>-- Create an FTS4 table that stores data in compressed form. This</i>
<i>-- assumes that the scalar functions zip() and unzip() have been (or</i>
<i>-- will be) added to the database handle.</i>
CREATE VIRTUAL TABLE papers USING fts4(author, document, compress=zip, uncompress=unzip);
</pre></div>
<p>
When implementing the compress and uncompress functions it is important to
pay attention to data types. Specifically, when a user reads a value from
a compressed FTS table, the value returned by FTS is exactly the same
as the value returned by the uncompress function, including the data type.
If that data type is not the same as the data type of the original value as
passed to the compress function (for example if the uncompress function is
returning BLOB when compress was originally passed TEXT), then the users
query may not function as expected.
<a name="*fts4content"></a>
</p><a name="the_content_option_"></a>
<h2 tags="fts4 content option" id="the_content_option_"><span>6.2. </span>The content= option </h2>
<p>
The content option allows FTS4 to forego storing the text being indexed.
The content option can be used in two ways:
</p><ul>
<li><p> The indexed documents are not stored within the SQLite database
at all (a "contentless" FTS4 table), or
</p></li><li><p> The indexed documents are stored in a database table created and
managed by the user (an "external content" FTS4 table).
</p></li></ul>
<p>
Because the indexed documents themselves are usually much larger than
the full-text index, the content option can be used to achieve
significant space savings.
</p><a name="_contentless_fts4_tables_"></a>
<h3 tags="contentless fts4 tables" id="_contentless_fts4_tables_"><span>6.2.1. </span> Contentless FTS4 Tables </h3>
<p>
In order to create an FTS4 table that does not store a copy of the indexed
documents at all, the content option should be set to an empty string.
For example, the following SQL creates such an FTS4 table with three
columns - "a", "b", and "c":
</p><div class="codeblock"><pre>CREATE VIRTUAL TABLE t1 USING fts4(content="", a, b, c);
</pre></div>
<p>
Data can be inserted into such an FTS4 table using an INSERT statements.
However, unlike ordinary FTS4 tables, the user must supply an explicit
integer docid value. For example:
</p><div class="codeblock"><pre><i>-- This statement is Ok:</i>
INSERT INTO t1(docid, a, b, c) VALUES(1, 'a b c', 'd e f', 'g h i');
<i>-- This statement causes an error, as no docid value has been provided:</i>
INSERT INTO t1(a, b, c) VALUES('j k l', 'm n o', 'p q r');
</pre></div>
<p>
It is not possible to UPDATE or DELETE a row stored in a contentless FTS4
table. Attempting to do so is an error.
</p><p>
Contentless FTS4 tables also support SELECT statements. However, it is
an error to attempt to retrieve the value of any table column other than
the docid column. The auxiliary function matchinfo() may be used, but
snippet() and offsets() may not. For example:
</p><div class="codeblock"><pre><i>-- The following statements are Ok:</i>
SELECT docid FROM t1 WHERE t1 MATCH 'xxx';
SELECT docid FROM t1 WHERE a MATCH 'xxx';
SELECT matchinfo(t1) FROM t1 WHERE t1 MATCH 'xxx';
<i>-- The following statements all cause errors, as the value of columns</i>
<i>-- other than docid are required to evaluate them.</i>
SELECT * FROM t1;
SELECT a, b FROM t1 WHERE t1 MATCH 'xxx';
SELECT docid FROM t1 WHERE a LIKE 'xxx%';
SELECT snippet(t1) FROM t1 WHERE t1 MATCH 'xxx';
</pre></div>
<p>
Errors related to attempting to retrieve column values other than docid
are runtime errors that occur within sqlite3_step(). In some cases, for
example if the MATCH expression in a SELECT query matches zero rows, there
may be no error at all even if a statement does refer to column values
other than docid.
</p><a name="_external_content_fts4_tables_"></a>
<h3 tags="external content fts4 tables" id="_external_content_fts4_tables_"><span>6.2.2. </span> External Content FTS4 Tables </h3>
<p>
An "external content" FTS4 table is similar to a contentless table, except
that if evaluation of a query requires the value of a column other than
docid, FTS4 attempts to retrieve that value from a table (or view, or
virtual table) nominated by the user (hereafter referred to as the "content
table"). The FTS4 module never writes to the content table, and writing
to the content table does not affect the full-text index. It is the
responsibility of the user to ensure that the content table and the
full-text index are consistent.
</p><p>
An external content FTS4 table is created by setting the content option
to the name of a table (or view, or virtual table) that may be queried by
FTS4 to retrieve column values when required. If the nominated table does
not exist, then an external content table behaves in the same way as
a contentless table. For example:
</p><div class="codeblock"><pre>CREATE TABLE t2(id INTEGER PRIMARY KEY, a, b, c);
CREATE VIRTUAL TABLE t3 USING fts4(content="t2", a, c);
</pre></div>
<p>
Assuming the nominated table does exist, then its columns must be the same
as or a superset of those defined for the FTS table. The external table
must also be in the same database file as the FTS table. In other words,
The external table cannot be in a different database file connected using
<a href="lang_attach.html">ATTACH</a> nor may one of the FTS table and the external content be in the
TEMP database when the other is in a persistent database file such as MAIN.
</p><p>
When a users query on the FTS table requires a column value other than
docid, FTS attempts to read the requested value from the corresponding column of
the row in the content table with a rowid value equal to the current FTS
docid. Only the subset of content-table columns duplicated in the FTS/34
table declaration can be queried for - to retrieve values from any other
columns the content table must be queried directly. Or, if such a row cannot
be found in the content table, a NULL value is used instead. For example:
</p><div class="codeblock"><pre>CREATE TABLE t2(id INTEGER PRIMARY KEY, a, b, c);
CREATE VIRTUAL TABLE t3 USING fts4(content="t2", b, c);
INSERT INTO t2 VALUES(2, 'a b', 'c d', 'e f');
INSERT INTO t2 VALUES(3, 'g h', 'i j', 'k l');
INSERT INTO t3(docid, b, c) SELECT id, b, c FROM t2;
<i>-- The following query returns a single row with two columns containing
-- the text values "i j" and "k l".
--
-- The query uses the full-text index to discover that the MATCH
-- term matches the row with docid=3. It then retrieves the values
-- of columns b and c from the row with rowid=3 in the content table
-- to return.
--</i>
SELECT * FROM t3 WHERE t3 MATCH 'k';
<i>-- Following the UPDATE, the query still returns a single row, this
-- time containing the text values "xxx" and "yyy". This is because the
-- full-text index still indicates that the row with docid=3 matches
-- the FTS4 query 'k', even though the documents stored in the content
-- table have been modified.
--</i>
UPDATE t2 SET b = 'xxx', c = 'yyy' WHERE rowid = 3;
SELECT * FROM t3 WHERE t3 MATCH 'k';
<i>-- Following the DELETE below, the query returns one row containing two
-- NULL values. NULL values are returned because FTS is unable to find
-- a row with rowid=3 within the content table.
--</i>
DELETE FROM t2;
SELECT * FROM t3 WHERE t3 MATCH 'k';
</pre></div>
<p>
When a row is deleted from an external content FTS4 table, FTS4 needs to
retrieve the column values of the row being deleted from the content table.
This is so that FTS4 can update the full-text index entries for each token
that occurs within the deleted row to indicate that row has been
deleted. If the content table row cannot be found, or if it contains values
inconsistent with the contents of the FTS index, the results can be difficult
to predict. The FTS index may be left containing entries corresponding to the
deleted row, which can lead to seemingly nonsensical results being returned
by subsequent SELECT queries. The same applies when a row is updated, as
internally an UPDATE is the same as a DELETE followed by an INSERT.
</p><p>
This means that in order to keep an FTS in sync with an external content
table, any UPDATE or DELETE operations must be applied first to the FTS
table, and then to the external content table. For example:
</p><div class="codeblock"><pre>CREATE TABLE t1_real(id INTEGER PRIMARY KEY, a, b, c, d);
CREATE VIRTUAL TABLE t1_fts USING fts4(content="t1_real", b, c);
<i>-- This works. When the row is removed from the FTS table, FTS retrieves
-- the row with rowid=123 and tokenizes it in order to determine the entries
-- that must be removed from the full-text index.
--</i>
DELETE FROM t1_fts WHERE rowid = 123;
DELETE FROM t1_real WHERE rowid = 123;
--<i> This <b>does not work</b>. By the time the FTS table is updated, the row
-- has already been deleted from the underlying content table. As a result
-- FTS is unable to determine the entries to remove from the FTS index and
-- so the index and content table are left out of sync.
--</i>
DELETE FROM t1_real WHERE rowid = 123;
DELETE FROM t1_fts WHERE rowid = 123;
</pre></div>
<p>
Instead of writing separately to the full-text index and the content table,
some users may wish to use database triggers to keep the full-text index
up to date with respect to the set of documents stored in the content table.
For example, using the tables from earlier examples:
</p><div class="codeblock"><pre>CREATE TRIGGER t2_bu BEFORE UPDATE ON t2 BEGIN
DELETE FROM t3 WHERE docid=old.rowid;
END;
CREATE TRIGGER t2_bd BEFORE DELETE ON t2 BEGIN
DELETE FROM t3 WHERE docid=old.rowid;
END;
CREATE TRIGGER t2_au AFTER UPDATE ON t2 BEGIN
INSERT INTO t3(docid, b, c) VALUES(new.rowid, new.b, new.c);
END;
CREATE TRIGGER t2_ai AFTER INSERT ON t2 BEGIN
INSERT INTO t3(docid, b, c) VALUES(new.rowid, new.b, new.c);
END;
</pre></div>
<p>
The DELETE trigger must be fired before the actual delete takes place
on the content table. This is so that FTS4 can still retrieve the original
values in order to update the full-text index. And the INSERT trigger must
be fired after the new row is inserted, so as to handle the case where the
rowid is assigned automatically within the system. The UPDATE trigger must
be split into two parts, one fired before and one after the update of the
content table, for the same reasons.
</p><p>
The <a href="fts3.html#*fts4rebuidcmd">FTS4 "rebuild" command</a>
deletes the entire full-text index and rebuilds it based on the current
set of documents in the content table. Assuming again that "t3" is the
name of the external content FTS4 table, the rebuild command looks like this:
</p><div class="codeblock"><pre>INSERT INTO t3(t3) VALUES('rebuild');
</pre></div>
<p>
This command may also be used with ordinary FTS4 tables, for example if
the implementation of the tokenizer changes. It is an
error to attempt to rebuild the full-text index maintained by a contentless
FTS4 table, since no content will be available to do the rebuilding.
<a name="*fts4languageid"></a>
</p><a name="the_languageid_option"></a>
<h2 tags="fts4 languageid option" id="the_languageid_option"><span>6.3. </span>The languageid= option</h2>
<p>
When the languageid option is present, it specifies the name of
another <a href="vtab.html#hiddencol">hidden column</a> that is added to the FTS4
table and which is used to specify the language stored in each row
of the FTS4 table. The name of the languageid hidden column must
be distinct from all other column names in the FTS4 table. Example:
</p><div class="codeblock"><pre>CREATE VIRTUAL TABLE t1 USING fts4(x, y, languageid="lid")
</pre></div>
<p>
The default value of a languageid column is 0. Any value inserted
into a languageid column is converted to a 32-bit (not 64) signed
integer.
</p><p>
By default, FTS queries (those that use the MATCH operator)
consider only those rows with the languageid column set to 0. To
query for rows with other languageid values, a constraint of the
form "</p><language-id> = <integer>" must be added to the queries
WHERE clause. For example:
<div class="codeblock"><pre>SELECT * FROM t1 WHERE t1 MATCH 'abc' AND lid=5;
</pre></div>
<p>
It is not possible for a single FTS query to return rows with
different languageid values. The results of adding WHERE clauses
that use other operators (e.g. lid!=5, or lid<=5) are undefined.
</p><p>
If the content option is used along with the languageid option,
then the named languageid column must exist in the content= table
(subject to the usual rules - if a query never needs to read the
content table then this restriction does not apply).
</p><p>
When the languageid option is used, SQLite invokes the xLanguageid()
on the sqlite3_tokenizer_module object immediately after the object
is created in order to pass in the language id that the
tokenizer should use. The xLanguageid() method will never be called
more than once for any single tokenizer object. The fact that different
languages might be tokenized differently is one reason why no single
FTS query can return rows with different languageid values.
<a name="fts4matchinfo"></a>
</p><a name="the_matchinfo_option"></a>
<h2 tags="fts4 matchinfo option" id="the_matchinfo_option"><span>6.4. </span>The matchinfo= option</h2>
<p>
The matchinfo option may only be set to the value "fts3".
Attempting to set matchinfo to anything other than "fts3" is an error.
If this option is specified, then some of the extra information stored by
FTS4 is omitted. This reduces the amount of disk space consumed by
an FTS4 table until it is almost the same as the amount that would
be used by the equivalent FTS3 table, but also means that the data
accessed by passing the 'l' flag to the <a href="fts3.html#matchinfo">matchinfo()</a> function is
not available.
<a name="fts4notindexed"></a>
</p><a name="the_notindexed_option"></a>
<h2 tags="fts4 notindexed option" id="the_notindexed_option"><span>6.5. </span>The notindexed= option</h2>
<p>
Normally, the FTS module maintains an inverted index of all terms in
all columns of the table. This option is used to specify the name of
a column for which entries should not be added to the index. Multiple
"notindexed" options may be used to specify that multiple columns should
be omitted from the index. For example:
</p><div class="codeblock"><pre><i>-- Create an FTS4 table for which only the contents of columns c2 and c4</i>
<i>-- are tokenized and added to the inverted index.</i>
CREATE VIRTUAL TABLE t1 USING fts4(c1, c2, c3, c4, notindexed=c1, notindexed=c3);
</pre></div>
<p>
Values stored in unindexed columns are not eligible to match MATCH
operators. They do not influence the results of the offsets() or matchinfo()
auxiliary functions. Nor will the snippet() function ever return a
snippet based on a value stored in an unindexed column.
<a name="fts4prefix"></a>
</p><a name="the_prefix_option"></a>
<h2 tags="fts4 prefix option" id="the_prefix_option"><span>6.6. </span>The prefix= option</h2>
<p>
The FTS4 prefix option causes FTS to index term prefixes of specified lengths
in the same way that it always indexes complete terms. The prefix option
must be set to a comma separated list of positive non-zero integers.
For each value N in the list, prefixes of length N bytes (when encoded
using UTF-8) are indexed. FTS4 uses term prefix indexes to speed up
<a href="fts3.html#termprefix">prefix queries</a>. The cost, of course, is that indexing term prefixes as
well as complete terms increases the database size and slows down write
operations on the FTS4 table.
</p><p>
Prefix indexes may be used to optimize <a href="fts3.html#termprefix">prefix queries</a> in two cases.
If the query is for a prefix of N bytes, then a prefix index created
with "prefix=N" provides the best optimization. Or, if no "prefix=N"
index is available, a "prefix=N+1" index may be used instead.
Using a "prefix=N+1" index is less
efficient than a "prefix=N" index, but is better than no prefix index at all.
</p><div class="codeblock"><pre><i>-- Create an FTS4 table with indexes to optimize 2 and 4 byte prefix queries.</i>
CREATE VIRTUAL TABLE t1 USING fts4(c1, c2, prefix="2,4");
<i>-- The following two queries are both optimized using the prefix indexes.</i>
SELECT * FROM t1 WHERE t1 MATCH 'ab*';
SELECT * FROM t1 WHERE t1 MATCH 'abcd*';
<i>-- The following two queries are both partially optimized using the prefix</i>
<i>-- indexes. The optimization is not as pronounced as it is for the queries</i>
<i>-- above, but still an improvement over no prefix indexes at all.</i>
SELECT * FROM t1 WHERE t1 MATCH 'a*';
SELECT * FROM t1 WHERE t1 MATCH 'abc*';
</pre></div>
<a name="*cmds"></a>
<a name="commands"></a>
<h1 id="commands" tags="commands"><span>7. </span>Special Commands For FTS3 and FTS4</h1>
<p>
Special INSERT operates can be used to issue commands to FTS3 and FTS4 tables.
Every FTS3 and FTS4 has a hidden, read-only column which is the same name as
the table itself. INSERTs into this hidden column are interpreted as commands
to the FTS3/4 table. For a table with the name "xyz" the following commands
are supported:
</p><ul>
<li><p>INSERT INTO xyz(xyz) VALUES('optimize');</p>
</li><li><p>INSERT INTO xyz(xyz) VALUES('rebuild');</p>
</li><li><p>INSERT INTO xyz(xyz) VALUES('integrity-check');</p>
</li><li><p>INSERT INTO xyz(xyz) VALUES('merge=X,Y');</p>
</li><li><p>INSERT INTO xyz(xyz) VALUES('automerge=N');</p>
</li></ul>
<a name="*fts4optcmd"></a>
<h2 id="optimize"><span>7.1. </span>The "optimize" command</h2>
<p>
The "optimize" command causes FTS3/4 to merge together all of its
inverted index b-trees into one large and complete b-tree. Doing
an optimize will make subsequent queries run faster since there are
fewer b-trees to search, and it may reduce disk usage by coalescing
redundant entries. However, for a large FTS table, running optimize
can be as expensive as running <a href="lang_vacuum.html">VACUUM</a>. The optimize command
essentially has to read and write the entire FTS table, resulting
in a large transaction.
</p><p>
In batch-mode operation, where an FTS table is initially built up
using a large number of INSERT operations, then queried repeatedly
without further changes, it is often a good idea
to run "optimize" after the last INSERT and before the first query.
<a name="*fts4rebuidcmd"></a>
</p><h2 id="rebuild"><span>7.2. </span>The "rebuild" command</h2>
<p>
The "rebuild" command causes SQLite to discard the entire FTS3/4
table and then rebuild it again from original text. The concept
is similar to <a href="lang_reindex.html">REINDEX</a>, only that it applies to an
FTS3/4 table instead of an ordinary index.
</p><p>
The "rebuild" command should be run whenever the implementation
of a custom tokenizer changes, so that all content can be retokenized.
The "rebuild" command is also useful when using the
<a href="fts3.html#*fts4content">FTS4 content option</a> after changes have been made to the original
content table.
<a name="*fts4ickcmd"></a>
</p><h2 id="integcheck"><span>7.3. </span>The "integrity-check" command</h2>
<p>
The "integrity-check" command causes SQLite to read and verify
the accuracy of all inverted indices in an FTS3/4 table by comparing
those inverted indices against the original content. The
"integrity-check" command silently succeeds if the inverted
indices are all ok, but will fail with an SQLITE_CORRUPT error
if any problems are found.
</p><p>
The "integrity-check" command is similar in concept to
<a href="pragma.html#pragma_integrity_check">PRAGMA integrity_check</a>. In a working system, the "integrity-command"
should always be successful. Possible causes of integrity-check
failures include:
</p><ul>
<li> The application has made changes to the <a href="fts3.html#*shadowtab">FTS shadow tables</a>
directly, without using the FTS3/4 virtual table, causing
the shadow tables to become out of sync with each other.
</li><li> Using the <a href="fts3.html#*fts4content">FTS4 content option</a> and failing to manually keep
the content in sync with the FTS4 inverted indices.
</li><li> Bugs in the FTS3/4 virtual table. (The "integrity-check"
command was original conceived as part of the test suite
for FTS3/4.)
</li><li> Corruption to the underlying SQLite database file. (See
documentation on <a href="howtocorrupt.html">how to corrupt</a> and SQLite database for
additional information.)
</li></ul>
<a name="*fts4mergecmd"></a>
<h2 id="mergecmd"><span>7.4. </span>The "merge=X,Y" command</h2>
<p>
The "merge=X,Y" command (where X and Y are integers) causes SQLite
to do a limited amount of work toward merging the various inverted
index b-trees of an FTS3/4 table together into one large b-tree.
The X value is the target number of "blocks" to be merged, and Y is
the minimum number of b-tree segments on a level required before
merging will be applied to that level. The value of Y should
be between 2 and 16 with a recommended value of 8. The value of X
can be any positive integer but values on the order of 100 to 300
are recommended.
</p><p>
When an FTS table accumulates 16 b-tree segments at the same level,
the next INSERT into that table will cause all 16 segments to be
merged into a single b-tree segment at the next higher level. The
effect of these level merges is that most INSERTs into an FTS table
are very fast and take minimal memory, but an occasional INSERT is
slow and generates a large transaction because of the need to
do merging. This results in "spiky" performance of INSERTs.
</p><p>
To avoid spiky INSERT performance, an application can run the
"merge=X,Y" command periodically, possibly in an idle thread or
idle process, to ensure that the FTS table never accumulates
too many b-tree segments at the same level. INSERT performance
spikes can generally be avoided, and performance of FTS3/4 can be
maximized, by running "merge=X,Y" after every few thousand
document inserts. Each "merge=X,Y" command will run in a separate
transaction (unless they are grouped together using <a href="lang_transaction.html">BEGIN</a>...<a href="lang_transaction.html">COMMIT</a>,
of course). The transactions can be kept small by choosing a value
for X in the range of 100 to 300. The idle thread that is running
the merge commands can know when it is done by checking the difference
in <a href="c3ref/total_changes.html">sqlite3_total_changes()</a> before and after each "merge=X,Y"
command and stopping the loop when the difference drops below two.
<a name="*fts4automergecmd"></a>
</p><h2 id="automerge""><span>7.5. </span>The "automerge=N" command</h2>
<p>
The "automerge=N" command (where N is an integer between 0 and 15,
inclusive) is used to configure an FTS3/4 tables "automerge" parameter,
which controls automatic incremental inverted index merging. The default
automerge value for new tables is 0, meaning that automatic incremental
merging is completely disabled. If the value of the automerge parameter
is modified using the "automerge=N" command, the new parameter value is
stored persistently in the database and is used by all subsequently
established database connections.
</p><p>
Setting the automerge parameter to a non-zero value enables automatic
incremental merging. This causes SQLite to do a small amount of inverted
index merging after every INSERT operation. The amount of merging
performed is designed so that the FTS3/4 table never reaches a point
where it has 16 segments at the same level and hence has to do a large
merge in order to complete an insert. In other words, automatic
incremental merging is designed to prevent spiky INSERT performance.
</p><p>
The downside of automatic incremental merging is that it makes
every INSERT, UPDATE, and DELETE operation on an FTS3/4 table run
a little slower, since extra time must be used to do the incremental
merge. For maximum performance, it is recommended that applications
disable automatic incremental merge and instead use the
<a href="fts3.html#*fts4mergecmd">"merge" command</a> in an idle process to keep the inverted indices
well merged. But if the structure of an application does not easily
allow for idle processes, the use of automatic incremental merge is
a very reasonable fallback solution.
</p><p>
The actual value of the automerge parameter determines the number of
index segments merged simultaneously by an automatic inverted index
merge. If the value is set to N, the system waits until there are at
least N segments on a single level before beginning to incrementally
merge them. Setting a lower value of N causes segments to be merged more
quickly, which may speed up full-text queries and, if the workload
contains UPDATE or DELETE operations as well as INSERTs, reduce the space
on disk consumed by the full-text index. However, it also increases the
amount of data written to disk.
</p><p>
For general use in cases where the workload contains few UPDATE or DELETE
operations, a good choice for automerge is 8. If the workload contains
many UPDATE or DELETE commands,
or if query speed is a concern, it may be advantageous to reduce automerge
to 2.
</p><p>
For reasons of backwards compatibility, the "automerge=1" command sets
the automerge parameter to 8, not 1 (a value of 1 would make no sense
anyway, as merging data from a single segment is a no-op).
</p><a name="tokenizer"></a>
<h1 id="tokenizer" tags="tokenizer"><span>8. </span>Tokenizers</h1>
<p>
An FTS tokenizer is a set of rules for extracting terms from a document
or basic FTS full-text query.
</p><p>
Unless a specific tokenizer is specified as part of the CREATE
VIRTUAL TABLE statement used to create the FTS table, the default
tokenizer, "simple", is used. The simple tokenizer extracts tokens from
a document or basic FTS full-text query according to the following
rules:
</p><ul>
<li><p> A term is a contiguous sequence of eligible characters, where
eligible characters are all alphanumeric characters and all characters with
Unicode codepoint values greater than or equal to 128.
All other characters are
discarded when splitting a document into terms. Their only contribution is
to separate adjacent terms.
</p></li><li><p> All uppercase characters within the ASCII range (Unicode codepoints
less than 128), are transformed to their lowercase equivalents as part
of the tokenization process. Thus, full-text queries are
case-insensitive when using the simple tokenizer.
</p></li></ul>
<p>
For example, when a document containing the text "Right now, they're very
frustrated.", the terms extracted from the document and added to the
full-text index are, in order, "right now they re very frustrated". Such
a document would match a full-text query such as "MATCH 'Frustrated'",
as the simple tokenizer transforms the term in the query to lowercase
before searching the full-text index.
</p><p>
As well as the "simple" tokenizer, the FTS source code features a tokenizer
that uses the <a href="http://tartarus.org/~martin/PorterStemmer/">Porter
Stemming algorithm</a>. This tokenizer uses the same rules to separate
the input document into terms including folding all terms into lower case,
but also uses the Porter Stemming algorithm to reduce related English language
words to a common root. For example, using the same input document as in the
paragraph above, the porter tokenizer extracts the following tokens:
"right now thei veri frustrat". Even though some of these terms are not even
English words, in some cases using them to build the full-text index is more
useful than the more intelligible output produced by the simple tokenizer.
Using the porter tokenizer, the document not only matches full-text queries
such as "MATCH 'Frustrated'", but also queries such as "MATCH 'Frustration'",
as the term "Frustration" is reduced by the Porter stemmer algorithm to
"frustrat" - just as "Frustrated" is. So, when using the porter tokenizer,
FTS is able to find not just exact matches for queried terms, but matches
against similar English language terms. For more information on the
Porter Stemmer algorithm, please refer to the page linked above.
</p><p>
Example illustrating the difference between the "simple" and "porter"
tokenizers:
</p><div class="codeblock"><pre><i>-- Create a table using the simple tokenizer. Insert a document into it.</i>
CREATE VIRTUAL TABLE simple USING fts3(tokenize=simple);
INSERT INTO simple VALUES('Right now they''re very frustrated');
<i>-- The first of the following two queries matches the document stored in</i>
<i>-- table "simple". The second does not.</i>
SELECT * FROM simple WHERE simple MATCH 'Frustrated';
SELECT * FROM simple WHERE simple MATCH 'Frustration';
<i>-- Create a table using the porter tokenizer. Insert the same document into it</i>
CREATE VIRTUAL TABLE porter USING fts3(tokenize=porter);
INSERT INTO porter VALUES('Right now they''re very frustrated');
<i>-- Both of the following queries match the document stored in table "porter".</i>
SELECT * FROM porter WHERE porter MATCH 'Frustrated';
SELECT * FROM porter WHERE porter MATCH 'Frustration';
</pre></div>
<p>
If this extension is compiled with the SQLITE_ENABLE_ICU pre-processor
symbol defined, then there exists a built-in tokenizer named "icu"
implemented using the ICU library. The first argument passed to the
xCreate() method (see fts3_tokenizer.h) of this tokenizer may be
an ICU locale identifier. For example "tr_TR" for Turkish as used
in Turkey, or "en_AU" for English as used in Australia. For example:
</p><div class="codeblock"><pre>CREATE VIRTUAL TABLE thai_text USING fts3(text, tokenize=icu th_TH)
</pre></div>
<p>
The ICU tokenizer implementation is very simple. It splits the input
text according to the ICU rules for finding word boundaries and discards
any tokens that consist entirely of white-space. This may be suitable
for some applications in some locales, but not all. If more complex
processing is required, for example to implement stemming or
discard punctuation, this can be done by creating a tokenizer
implementation that uses the ICU tokenizer as part of its implementation.
<a name="unicode61"></a>
</p><p>
The "unicode61" tokenizer is available beginning with SQLite <a href="releaselog/3_7_13.html">version 3.7.13</a>
(2012-06-11).
Unicode61 works very much like "simple" except that it does simple unicode
case folding according to rules in Unicode Version 6.1 and it recognizes
unicode space and punctuation characters and uses those to separate tokens.
The simple tokenizer only does case folding of ASCII characters and only
recognizes ASCII space and punctuation characters as token separators.
</p><p>
By default, "unicode61" attempts to remove diacritics from Latin script
characters. This behaviour can be overridden by adding the tokenizer argument
"remove_diacritics=0". For example:
</p><div class="codeblock"><pre><i>-- Create tables that remove <b>all</b>diacritics from Latin script characters</i>
<i>-- as part of tokenization.</i>
CREATE VIRTUAL TABLE txt1 USING fts4(tokenize=unicode61);
CREATE VIRTUAL TABLE txt2 USING fts4(tokenize=unicode61 "remove_diacritics=2");
<i>-- Create a table that does not remove diacritics from Latin script</i>
<i>-- characters as part of tokenization.</i>
CREATE VIRTUAL TABLE txt3 USING fts4(tokenize=unicode61 "remove_diacritics=0");
</pre></div>
<p>The remove_diacritics option may be set to "0", "1" or "2". The default
value is "1". If it is set to "1" or "2", then diacritics are removed from
Latin script characters as described above. However, if it is set to "1",
then diacritics are not removed in the fairly uncommon case where a single
unicode codepoint is used to represent a character with more that one
diacritic. For example, diacritics are not removed from codepoint 0x1ED9
("LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW"). This is technically
a bug, but cannot be fixed without creating backwards compatibility
problems. If this option is set to "2", then diacritics are correctly
removed from all Latin characters.
</p><p>
It is also possible to customize the set of codepoints that unicode61 treats
as separator characters. The "separators=" option may be used to specify one
or more extra characters that should be treated as separator characters, and
the "tokenchars=" option may be used to specify one or more extra characters
that should be treated as part of tokens instead of as separator characters.
For example:
</p><div class="codeblock"><pre><i>-- Create a table that uses the unicode61 tokenizer, but considers "."</i>
<i>-- and "=" characters to be part of tokens, and capital "X" characters to</i>
<i>-- function as separators.</i>
CREATE VIRTUAL TABLE txt3 USING fts4(tokenize=unicode61 "tokenchars=.=" "separators=X");
<i>-- Create a table that considers space characters (codepoint 32) to be</i>
<i>-- a token character</i>
CREATE VIRTUAL TABLE txt4 USING fts4(tokenize=unicode61 "tokenchars= ");
</pre></div>
<p>
If a character specified as part of the argument to "tokenchars=" is considered
to be a token character by default, it is ignored. This is true even if it has
been marked as a separator by an earlier "separators=" option. Similarly, if
a character specified as part of a "separators=" option is treated as a separator
character by default, it is ignored. If multiple "tokenchars=" or "separators="
options are specified, all are processed. For example:
</p><div class="codeblock"><pre><i>-- Create a table that uses the unicode61 tokenizer, but considers "."</i>
<i>-- and "=" characters to be part of tokens, and capital "X" characters to</i>
<i>-- function as separators. Both of the "tokenchars=" options are processed</i>
<i>-- The "separators=" option ignores the "." passed to it, as "." is by</i>
<i>-- default a separator character, even though it has been marked as a token</i>
<i>-- character by an earlier "tokenchars=" option.</i>
CREATE VIRTUAL TABLE txt5 USING fts4(
tokenize=unicode61 "tokenchars=." "separators=X." "tokenchars=="
);
</pre></div>
<p>
The arguments passed to the "tokenchars=" or "separators=" options are
case-sensitive. In the example above, specifying that "X" is a separator
character does not affect the way "x" is handled.
<a name="f3tknzr"></a>
</p><h2 id="custom_application_defined_tokenizers"><span>8.1. </span>Custom (Application Defined) Tokenizers</h2>
<p>
In addition to providing built-in "simple", "porter" and (possibly) "icu" and
"unicode61" tokenizers,
FTS provides an interface for applications to implement and register custom
tokenizers written in C. The interface used to create a new tokenizer is defined
and described in the fts3_tokenizer.h source file.
</p><p>
Registering a new FTS tokenizer is similar to registering a new
virtual table module with SQLite. The user passes a pointer to a
structure containing pointers to various callback functions that
make up the implementation of the new tokenizer type. For tokenizers,
the structure (defined in fts3_tokenizer.h) is called
"sqlite3_tokenizer_module".
</p><p>
FTS does not expose a C-function that users call to register new
tokenizer types with a database handle. Instead, the pointer must
be encoded as an SQL blob value and passed to FTS through the SQL
engine by evaluating a special scalar function, "fts3_tokenizer()".
The fts3_tokenizer() function may be called with one or two arguments,
as follows:
</p><div class="codeblock"><pre>SELECT fts3_tokenizer(<tokenizer-name>);
SELECT fts3_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>);
</pre></div>
<p>
Where <tokenizer-name> is <a href="lang_expr.html#varparam">parameter</a> to which a string is bound using
<a href="c3ref/bind_blob.html">sqlite3_bind_text()</a> where the string identifies the tokenizer and
<sqlite3_tokenizer_module ptr> is a <a href="lang_expr.html#varparam">parameter</a> to which a BLOB is
bound using <a href="c3ref/bind_blob.html">sqlite3_bind_blob()</a> where the value of the BLOB is a
pointer to an sqlite3_tokenizer_module structure.
If the second argument is present,
it is registered as tokenizer <tokenizer-name> and a copy of it
returned. If only one argument is passed, a pointer to the tokenizer
implementation currently registered as <tokenizer-name> is returned,
encoded as a blob. Or, if no such tokenizer exists, an SQL exception
(error) is raised.
</p><p>
Prior to SQLite <a href="releaselog/3_11_0.html">version 3.11.0</a> (2016-02-15), the arguments to
fts3_tokenizer() could be literal strings or BLOBs. They did not have to
be <a href="lang_expr.html#varparam">bound parameters</a>. But that could lead to security problems in the
event of an SQL injection. Hence, the legacy behavior is now disabled
by default. But the old legacy behavior can be enabled, for backwards
compatibility in applications that really need it,
by calling
<a href="c3ref/db_config.html">sqlite3_db_config</a>(db,<a href="c3ref/c_dbconfig_defensive.html#sqlitedbconfigenablefts3tokenizer">SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER</a>,1,0).
</p><p>
The following block contains an example of calling the fts3_tokenizer()
function from C code:
</p><div class="codeblock"><pre><i>/*
** Register a tokenizer implementation with FTS3 or FTS4.
*/</i>
int registerTokenizer(
sqlite3 *db,
char *zName,
const sqlite3_tokenizer_module *p
){
int rc;
sqlite3_stmt *pStmt;
const char *zSql = "SELECT fts3_tokenizer(?1, ?2)";
rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
if( rc!=SQLITE_OK ){
return rc;
}
sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
sqlite3_step(pStmt);
return sqlite3_finalize(pStmt);
}
<i>/*
** Query FTS for the tokenizer implementation named zName.
*/</i>
int queryTokenizer(
sqlite3 *db,
char *zName,
const sqlite3_tokenizer_module **pp
){
int rc;
sqlite3_stmt *pStmt;
const char *zSql = "SELECT fts3_tokenizer(?)";
*pp = 0;
rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
if( rc!=SQLITE_OK ){
return rc;
}
sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
if( SQLITE_ROW==sqlite3_step(pStmt) ){
if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
}
}
return sqlite3_finalize(pStmt);
}
</pre></div>
<a name="fts3tok"></a>
<h2 id="querying_tokenizers"><span>8.2. </span>Querying Tokenizers</h2>
<p>The "fts3tokenize" virtual table can be used to directly access any
tokenizer. The following SQL demonstrates how to create an instance
of the fts3tokenize virtual table:
</p><div class="codeblock"><pre>CREATE VIRTUAL TABLE tok1 USING fts3tokenize('porter');
</pre></div>
<p>The name of the desired tokenizer should be substituted in place of
'porter' in the example, of course. If the tokenizer requires one or
more arguments, they should be separated by commas in the fts3tokenize
declaration (even though they are separated by spaces in declarations
of regular fts4 tables). The following creates fts4 and fts3tokenize
tables that use the same tokenizer:
</p><div class="codeblock"><pre>CREATE VIRTUAL TABLE text1 USING fts4(tokenize=icu en_AU);
CREATE VIRTUAL TABLE tokens1 USING fts3tokenize(icu, en_AU);
CREATE VIRTUAL TABLE text2 USING fts4(tokenize=unicode61 "tokenchars=@." "separators=123");
CREATE VIRTUAL TABLE tokens2 USING fts3tokenize(unicode61, "tokenchars=@.", "separators=123");
</pre></div>
<p>
Once the virtual table is created, it can be queried as follows:
</p><div class="codeblock"><pre>SELECT token, start, end, position
FROM tok1
WHERE input='This is a test sentence.';
</pre></div>
<p>The virtual table will return one row of output for each token in the
input string. The "token" column is the text of the token. The "start"
and "end" columns are the byte offset to the beginning and end of the
token in the original input string.
The "position" column is the sequence number
of the token in the original input string. There is also an "input"
column which is simply a copy of the input string that is specified in
the WHERE clause. Note that a constraint of the form "input=?" must
appear in the WHERE clause or else the virtual table will have no input
to tokenize and will return no rows. The example above generates
the following output:
</p><div class="codeblock"><pre>thi|0|4|0
is|5|7|1
a|8|9|2
test|10|14|3
sentenc|15|23|4
</pre></div>
<p>Notice that the tokens in the result set from the fts3tokenize virtual
table have been transformed according to the rules of the tokenizer.
Since this example used the "porter" tokenizer, the "This" token was
converted into "thi". If the original text of the token is desired,
it can be retrieved using the "start" and "end" columns with the
<a href="lang_corefunc.html#substr">substr()</a> function. For example:
</p><div class="codeblock"><pre>SELECT substr(input, start+1, end-start), token, position
FROM tok1
WHERE input='This is a test sentence.';
</pre></div>
<p>The fts3tokenize virtual table can be used on any tokenizer, regardless
of whether or not there exists an FTS3 or FTS4 table that actually uses
that tokenizer.
</p><a name="data_structures"></a>
<h1 tags="segment btree" id="data_structures"><span>9. </span>Data Structures</h1>
<p>
This section describes at a high-level the way the FTS module stores its
index and content in the database. It is <b>not necessary to read or
understand the material in this section in order to use FTS</b> in an
application. However, it may be useful to application developers attempting
to analyze and understand FTS performance characteristics, or to developers
contemplating enhancements to the existing FTS feature set.
</p>
<a name="*shadowtab"></a>
<a name="shadow_tables"></a>
<h2 tags="shadowtabs" id="shadow_tables"><span>9.1. </span>Shadow Tables</h2>
<p>
For each FTS virtual table in a database, three to five real (non-virtual) tables
are created to store the underlying data. These real tables are called "shadow tables".
The real tables are named "%_content",
"%_segdir", "%_segments", "%_stat", and "%_docsize", where "%" is replaced by the name
of the FTS virtual table.
</p><p>
The leftmost column of the "%_content" table is an INTEGER PRIMARY KEY field
named "docid". Following this is one column for each column of the FTS
virtual table as declared by the user, named by prepending the column name
supplied by the user with "c<i>N</i>", where <i>N</i> is the index of the
column within the table, numbered from left to right starting with 0. Data
types supplied as part of the virtual table declaration are not used as
part of the %_content table declaration. For example:
</p><div class="codeblock"><pre><i>-- Virtual table declaration</i>
CREATE VIRTUAL TABLE abc USING fts4(a NUMBER, b TEXT, c);
<i>-- Corresponding %_content table declaration</i>
CREATE TABLE abc_content(docid INTEGER PRIMARY KEY, c0a, c1b, c2c);
</pre></div>
<p>
The %_content table contains the unadulterated data inserted by the user
into the FTS virtual table by the user. If the user does not explicitly
supply a "docid" value when inserting records, one is selected automatically
by the system.
</p><p>
The %_stat and %_docsize tables are only created if the FTS table uses the
FTS4 module, not FTS3. Furthermore, the %_docsize table is omitted if the
FTS4 table is created with the <a href="fts3.html#fts4matchinfo">"matchinfo=fts3"</a> directive
specified as part of the CREATE VIRTUAL TABLE statement. If they are created,
the schema of the two tables is as follows:
</p><div class="codeblock"><pre>CREATE TABLE %_stat(
id INTEGER PRIMARY KEY,
value BLOB
);
CREATE TABLE %_docsize(
docid INTEGER PRIMARY KEY,
size BLOB
);
</pre></div>
<p>
For each row in the FTS table, the %_docsize table contains a corresponding
row with the same "docid" value. The "size" field contains a blob consisting
of <i>N</i> FTS varints, where <i>N</i> is the number of user-defined columns
in the table. Each varint in the "size" blob is the number of tokens in the
corresponding column of the associated row in the FTS table. The %_stat table
always contains a single row with the "id" column set to 0. The "value"
column contains a blob consisting of <i>N+1</i> FTS varints, where <i>N</i>
is again the number of user-defined columns in the FTS table. The first
varint in the blob is set to the total number of rows in the FTS table. The
second and subsequent varints contain the total number of tokens stored in
the corresponding column for all rows of the FTS table.
</p><p>
The two remaining tables, %_segments and %_segdir, are used to store the
full-text index. Conceptually, this index is a lookup table that maps each
term (word) to the set of docid values corresponding to records in the
%_content table that contain one or more occurrences of the term. To
retrieve all documents that contain a specified term, the FTS module
queries this index to determine the set of docid values for records that
contain the term, then retrieves the required documents from the %_content
table. Regardless of the schema of the FTS virtual table, the %_segments
and %_segdir tables are always created as follows:
</p><div class="codeblock"><pre>CREATE TABLE %_segments(
blockid INTEGER PRIMARY KEY, <i>-- B-tree node id</i>
block blob <i>-- B-tree node data</i>
);
CREATE TABLE %_segdir(
level INTEGER,
idx INTEGER,
start_block INTEGER, <i>-- Blockid of first node in %_segments</i>
leaves_end_block INTEGER, <i>-- Blockid of last leaf node in %_segments</i>
end_block INTEGER, <i>-- Blockid of last node in %_segments</i>
root BLOB, <i>-- B-tree root node</i>
PRIMARY KEY(level, idx)
);
</pre></div>
<p>
The schema depicted above is not designed to store the full-text index
directly. Instead, it is used to store one or more b-tree structures. There
is one b-tree for each row in the %_segdir table. The %_segdir table
row contains the root node and various meta-data associated with the
b-tree structure, and the %_segments table contains all other (non-root)
b-tree nodes. Each b-tree is referred to as a "segment". Once it has
been created, a segment b-tree is never updated (although it may be
deleted altogether).
</p><p>
The keys used by each segment b-tree are terms (words). As well as the
key, each segment b-tree entry has an associated "doclist" (document list).
A doclist consists of zero or more entries, where each entry consists of:
</p><ul>
<li> A docid (document id), and
</li><li> A list of term offsets, one for each occurrence of the term within
the document. A term offset indicates the number of tokens (words)
that occur before the term in question, not the number of characters
or bytes. For example, the term offset of the term "war" in the
phrase "Ancestral voices prophesying war!" is 3.
</li></ul>
<p>
Entries within a doclist are sorted by docid. Positions within a doclist
entry are stored in ascending order.
</p><p>
The contents of the logical full-text index is found by merging the
contents of all segment b-trees. If a term is present in more than one
segment b-tree, then it maps to the union of each individual doclist. If,
for a single term, the same docid occurs in more than one doclist, then only
the doclist that is part of the most recently created segment b-tree is
considered valid.
</p><p>
Multiple b-tree structures are used instead of a single b-tree to reduce
the cost of inserting records into FTS tables. When a new record is
inserted into an FTS table that already contains a lot of data, it is
likely that many of the terms in the new record are already present in
a large number of existing records. If a single b-tree were used, then
large doclist structures would have to be loaded from the database,
amended to include the new docid and term-offset list, then written back
to the database. Using multiple b-tree tables allows this to be avoided
by creating a new b-tree which can be merged with the existing b-tree
(or b-trees) later on. Merging of b-tree structures can be performed as
a background task, or once a certain number of separate b-tree structures
have been accumulated. Of course, this scheme makes queries more expensive
(as the FTS code may have to look up individual terms in more than one
b-tree and merge the results), but it has been found that in practice this
overhead is often negligible.
</p><h2 id="variable_length_integer_varint_format"><span>9.2. </span>Variable Length Integer (varint) Format</h2>
<p>
Integer values stored as part of segment b-tree nodes are encoded using the
FTS varint format. This encoding is similar, but <b>not identical</b>, to
the <a href="fileformat.html#varint_format">SQLite varint format</a>.
</p><p>
An encoded FTS varint consumes between one and ten bytes of space. The
number of bytes required is determined by the sign and magnitude of the
integer value encoded. More accurately, the number of bytes used to store
the encoded integer depends on the position of the most significant set bit
in the 64-bit twos-complement representation of the integer value. Negative
values always have the most significant bit set (the sign bit), and so are
always stored using the full ten bytes. Positive integer values may be
stored using less space.
</p><p>
The final byte of an encoded FTS varint has its most significant bit
cleared. All preceding bytes have the most significant bit set. Data
is stored in the remaining seven least significant bits of each byte.
The first byte of the encoded representation contains the least significant
seven bits of the encoded integer value. The second byte of the encoded
representation, if it is present, contains the seven next least significant
bits of the integer value, and so on. The following table contains examples
of encoded integer values:
</p><table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
<tr style="text-align:left"><th>Decimal</th><th>Hexadecimal</th><th width="100%">Encoded Representation
</th></tr><tr style="text-align:left;background-color:#DDDDDD"><td>43</td><td>0x000000000000002B</td><td>0x2B
</td></tr><tr style="text-align:left"><td>200815</td><td>0x000000000003106F</td><td>0xEF 0xA0 0x0C
</td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>-1</td><td>0xFFFFFFFFFFFFFFFF</td><td>0xFF 0xFF 0xFF 0xFF 0xFF 0xFF 0xFF 0xFF 0xFF 0x01
</td></tr></table>
<h2 id="segment_b_tree_format"><span>9.3. </span>Segment B-Tree Format</h2>
<p>
Segment b-trees are prefix-compressed b+-trees. There is one segment b-tree
for each row in the %_segdir table (see above). The root node of the segment
b-tree is stored as a blob in the "root" field of the corresponding row
of the %_segdir table. All other nodes (if any exist) are stored in the
"blob" column of the %_segments table. Nodes within the %_segments table are
identified by the integer value in the blockid field of the corresponding
row. The following table describes the fields of the %_segdir table:
</p><table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
<tr style="text-align:left"><th>Column </th><th width="100%">Interpretation
</th></tr><tr style="text-align:left;background-color:#DDDDDD"><td>level </td><td>
Between them, the contents of the "level" and "idx" fields define the
relative age of the segment b-tree. The smaller the value stored in the
"level" field, the more recently the segment b-tree was created. If two
segment b-trees are of the same "level", the segment with the larger
value stored in the "idx" column is more recent. The PRIMARY KEY constraint
on the %_segdir table prevents any two segments from having the same value
for both the "level" and "idx" fields.
</td></tr><tr style="text-align:left"><td>idx </td><td> See above.
</td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>start_block </td><td>
The blockid that corresponds to the node with the smallest blockid that
belongs to this segment b-tree. Or zero if the entire segment b-tree
fits on the root node. If it exists, this node is always a leaf node.
</td></tr><tr style="text-align:left"><td>leaves_end_block </td><td>
The blockid that corresponds to the leaf node with the largest blockid
that belongs to this segment b-tree. Or zero if the entire segment b-tree
fits on the root node.
</td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>end_block </td><td>
This field may contain either an integer or a text field consisting of
two integers separated by a space character (unicode codepoint 0x20).
<p style="margin-left:0;margin-right:0">
The first, or only, integer is the blockid that corresponds to the interior
node with the largest blockid that belongs to this segment b-tree. Or zero
if the entire segment b-tree fits on the root node. If it exists, this node
is always an interior node.
</p><p style="margin-left:0;margin-right:0;margin-bottom:0">
The second integer, if it is present, is the aggregate size of all data
stored on leaf pages in bytes. If the value is negative, then the segment
is the output of an unfinished incremental-merge operation, and the
absolute value is current size in bytes.
</p></td></tr><tr style="text-align:left"><td>root </td><td>
Blob containing the root node of the segment b-tree.
</td></tr></table>
<p>
Apart from the root node, the nodes that make up a single segment b-tree are
always stored using a contiguous sequence of blockids. Furthermore, the
nodes that make up a single level of the b-tree are themselves stored as
a contiguous block, in b-tree order. The contiguous sequence of blockids
used to store the b-tree leaves are allocated starting with the blockid
value stored in the "start_block" column of the corresponding %_segdir row,
and finishing at the blockid value stored in the "leaves_end_block"
field of the same row. It is therefore possible to iterate through all the
leaves of a segment b-tree, in key order, by traversing the %_segments
table in blockid order from "start_block" to "leaves_end_block".
</p><h3 id="segment_b_tree_leaf_nodes"><span>9.3.1. </span>Segment B-Tree Leaf Nodes</h3>
<p>
The following diagram depicts the format of a segment b-tree leaf node.
</p><center>
<img src="images/fts3_leaf_node.png">
<p> Segment B-Tree Leaf Node Format
</p></center>
<p>
The first term stored on each node ("Term 1" in the figure above) is
stored verbatim. Each subsequent term is prefix-compressed with respect
to its predecessor. Terms are stored within a page in sorted (memcmp)
order.
</p><h3 id="segment_b_tree_interior_nodes"><span>9.3.2. </span>Segment B-Tree Interior Nodes</h3>
<p>
The following diagram depicts the format of a segment b-tree interior
(non-leaf) node.
</p><center>
<img src="images/fts3_interior_node.png">
<p> Segment B-Tree Interior Node Format
</p></center>
<h2 id="doclist_format"><span>9.4. </span>Doclist Format</h2>
<p>
A doclist consists of an array of 64-bit signed integers, serialized using
the FTS varint format. Each doclist entry is made up of a series of two
or more integers, as follows:
</p><ol>
<li> The docid value. The first entry in a doclist contains the literal docid
value. The first field of each subsequent doclist entry contains the
difference between the new docid and the previous one (always a positive
number).
</li><li> Zero or more term-offset lists. A term-offset list is present for each
column of the FTS virtual table that contains the term. A term-offset
list consists of the following:
<ol>
<li> Constant value 1. This field is omitted for any term-offset list
associated with column 0.
</li><li> The column number (1 for the second leftmost column, etc.). This
field is omitted for any term-offset list associated with column 0.
</li><li> A list of term-offsets, sorted from smallest to largest. Instead
of storing the term-offset value literally, each integer stored
is the difference between the current term-offset and the previous
one (or zero if the current term-offset is the first), plus 2.
</li></ol>
</li><li> Constant value 0.
</li></ol>
<center>
<img src="images/fts3_doclist2.png">
<p> FTS3 Doclist Format
</p></center>
<center>
<img src="images/fts3_doclist.png">
<p> FTS Doclist Entry Format
</p></center>
<p>
For doclists for which the term appears in more than one column of the FTS
virtual table, term-offset lists within the doclist are stored in column
number order. This ensures that the term-offset list associated with
column 0 (if any) is always first, allowing the first two fields of the
term-offset list to be omitted in this case.
</p><a name="limitations"></a>
<h1 tags="bugs" id="limitations"><span>10. </span>Limitations</h1>
<h2 id="_utf_16_byte_order_mark_problem_"><span>10.1. </span> UTF-16 byte-order-mark problem </h2>
For UTF-16 databases, when using the "simple" tokenizer, it is possible to use
malformed unicode strings to cause the
<a href="#integcheck">integrity-check special command</a> to falsely report
corruption, or for <a href="#snippet">auxiliary functions</a> to return
incorrect results. More specifically, the bug can be triggered by any of the following:
<ul>
<li><p>A UTF-16 byte-order-mark (BOM) is embedded at the beginning of an SQL string
literal value inserted into an FTS3 table. For example:
</p><div class="codeblock"><pre>INSERT INTO fts_table(col) VALUES(char(0xfeff)||'text...');
</pre></div>
</li><li><p>Malformed UTF-8 that SQLite converts to a UTF-16 byte-order-mark is
embedded at the beginning of an SQL string literal value inserted
into an FTS3 table.
</p></li><li><p>A text value created by casting a blob that begins with the two
bytes 0xFF and 0xFE, in either possible order, is inserted into an
FTS3 table. For example:
</p><div class="codeblock"><pre>INSERT INTO fts_table(col) VALUES(CAST(X'FEFF' AS TEXT));
</pre></div>
</li></ul>
Everything works correctly if any of the following are true:
<ul>
<li> The <a href="pragma.html#pragma_encoding">database encoding</a> is UTF-8.
</li><li> All text strings are insert using one of the
<a href="c3ref/bind_blob.html">sqlite3_bind_text()</a> family of functions.
</li><li> Literal strings contain no byte-order-marks.
</li><li> A tokenizer is used that recognizes byte-order-marks
as whitespace. (The default "simple" tokenizer for
FTS3/4 does not think that BOMs are whitespace, but
the unicode tokenizer does.)
</li></ul>
All of the above conditions must be false in order for problems
to occur. And even if all of the conditiona above are false,
most things will still operator correctly. Only the
<a href="#integcheck">integrity-check</a> command and the
<a href="#snippet">auxiliary functions</a> might given
unexpected results.
<a name="appendix_a"></a>
<h1 id="appendix_a" nonumber="1" tags="search application tips">
Appendix A: Search Application Tips
</h1>
<p>
FTS is primarily designed to support Boolean full-text queries - queries
to find the set of documents that match a specified criteria. However, many
(most?) search applications require that results are somehow ranked in order
of "relevance", where "relevance" is defined as the likelihood that the user
who performed the search is interested in a specific element of the returned
set of documents. When using a search engine to find documents on the world
wide web, the user expects that the most useful, or "relevant", documents
will be returned as the first page of results, and that each subsequent page
contains progressively less relevant results. Exactly how a machine can
determine document relevance based on a users query is a complicated problem
and the subject of much ongoing research.
</p><p>
One very simple scheme might be to count the number of instances of the
users search terms in each result document. Those documents that contain
many instances of the terms are considered more relevant than those with
a small number of instances of each term. In an FTS application, the
number of term instances in each result could be determined by counting
the number of integers in the return value of the <a href="fts3.html#offsets">offsets</a> function.
The following example shows a query that could be used to obtain the
ten most relevant results for a query entered by the user:
</p><div class="codeblock"><pre><i>-- This example (and all others in this section) assumes the following schema</i>
CREATE VIRTUAL TABLE documents USING fts3(title, content);
<i>-- Assuming the application has supplied an SQLite user function named "countintegers"</i>
<i>-- that returns the number of space-separated integers contained in its only argument,</i>
<i>-- the following query could be used to return the titles of the 10 documents that contain</i>
<i>-- the greatest number of instances of the users query terms. Hopefully, these 10</i>
<i>-- documents will be those that the users considers more or less the most "relevant".</i>
SELECT title FROM documents
WHERE documents MATCH <query>
ORDER BY countintegers(offsets(documents)) DESC
LIMIT 10 OFFSET 0
</pre></div>
<p>
The query above could be made to run faster by using the FTS <a href="fts3.html#matchinfo">matchinfo</a>
function to determine the number of query term instances that appear in each
result. The matchinfo function is much more efficient than the offsets
function. Furthermore, the matchinfo function provides extra information
regarding the overall number of occurrences of each query term in the entire
document set (not just the current row) and the number of documents in which
each query term appears. This may be used (for example) to attach a higher
weight to less common terms which may increase the overall computed relevancy
of those results the user considers more interesting.
</p><div class="codeblock"><pre><i>-- If the application supplies an SQLite user function called "rank" that</i>
<i>-- interprets the blob of data returned by matchinfo and returns a numeric</i>
<i>-- relevancy based on it, then the following SQL may be used to return the</i>
<i>-- titles of the 10 most relevant documents in the dataset for a users query.</i>
SELECT title FROM documents
WHERE documents MATCH <query>
ORDER BY rank(matchinfo(documents)) DESC
LIMIT 10 OFFSET 0
</pre></div>
<p>
The SQL query in the example above uses less CPU than the first example
in this section, but still has a non-obvious performance problem. SQLite
satisfies this query by retrieving the value of the "title" column and
matchinfo data from the FTS module for every row matched by the users
query before it sorts and limits the results. Because of the way SQLite's
virtual table interface works, retrieving the value of the "title" column
requires loading the entire row from disk (including the "content" field,
which may be quite large). This means that if the users query matches
several thousand documents, many megabytes of "title" and "content" data
may be loaded from disk into memory even though they will never be used
for any purpose.
</p><p>
The SQL query in the following example block is one solution to this
problem. In SQLite, when a <a href="optoverview.html#flattening">sub-query
used in a join contains a LIMIT clause</a>, the results of the sub-query are
calculated and stored in temporary table before the main query is executed.
This means that SQLite will load only the docid and matchinfo data for each
row matching the users query into memory, determine the docid values
corresponding to the ten most relevant documents, then load only the title
and content information for those 10 documents only. Because both the matchinfo
and docid values are gleaned entirely from the full-text index, this results
in dramatically less data being loaded from the database into memory.
</p><div class="codeblock"><pre>SELECT title FROM documents JOIN (
SELECT docid, rank(matchinfo(documents)) AS rank
FROM documents
WHERE documents MATCH <query>
ORDER BY rank DESC
LIMIT 10 OFFSET 0
) AS ranktable USING(docid)
ORDER BY ranktable.rank DESC
</pre></div>
<p>
The next block of SQL enhances the query with solutions to two other problems
that may arise in developing search applications using FTS:
</p><ol>
<li> <p>
The <a href="fts3.html#snippet">snippet</a> function cannot be used with the above query. Because
the outer query does not include a "WHERE ... MATCH" clause, the snippet
function may not be used with it. One solution is to duplicate the WHERE
clause used by the sub-query in the outer query. The overhead associated
with this is usually negligible.
</p></li><li> <p>
The relevancy of a document may depend on something other than just
the data available in the return value of matchinfo. For example
each document in the database may be assigned a static weight based
on factors unrelated to its content (origin, author, age, number
of references etc.). These values can be stored by the application
in a separate table that can be joined against the documents table
in the sub-query so that the rank function may access them.
</p></li></ol>
<p>
This version of the query is very similar to that used by the
<a href="http://www.sqlite.org/search?q=fts3">sqlite.org documentation search</a>
application.
</p><div class="codeblock"><pre><i>-- This table stores the static weight assigned to each document in FTS table</i>
<i>-- "documents". For each row in the documents table there is a corresponding row</i>
<i>-- with the same docid value in this table.</i>
CREATE TABLE documents_data(docid INTEGER PRIMARY KEY, weight);
<i>-- This query is similar to the one in the block above, except that:</i>
<i>--</i>
<i>-- 1. It returns a "snippet" of text along with the document title for display. So</i>
<i>-- that the snippet function may be used, the "WHERE ... MATCH ..." clause from</i>
<i>-- the sub-query is duplicated in the outer query.</i>
<i>--</i>
<i>-- 2. The sub-query joins the documents table with the document_data table, so that</i>
<i>-- implementation of the rank function has access to the static weight assigned</i>
<i>-- to each document.</i>
SELECT title, snippet(documents) FROM documents JOIN (
SELECT docid, rank(matchinfo(documents), documents_data.weight) AS rank
FROM documents JOIN documents_data USING(docid)
WHERE documents MATCH <query>
ORDER BY rank DESC
LIMIT 10 OFFSET 0
) AS ranktable USING(docid)
WHERE documents MATCH <query>
ORDER BY ranktable.rank DESC
</pre></div>
<p>
All the example queries above return the ten most relevant query results.
By modifying the values used with the OFFSET and LIMIT clauses, a query
to return (say) the next ten most relevant results is easy to construct.
This may be used to obtain the data required for a search applications second
and subsequent pages of results.
</p><p>
The next block contains an example rank function that uses matchinfo data
implemented in C. Instead of a single weight, it allows a weight to be
externally assigned to each column of each document. It may be registered
with SQLite like any other user function using <a href="c3ref/create_function.html">sqlite3_create_function</a>.
</p><p><b> Security Warning:</b> Because it is just an ordinary SQL function,
rank() may be invoked as part of any SQL query in any context. This means
that the first argument passed may not be a valid matchinfo blob.
Implementors should take care to handle this case without causing buffer
overruns or other potential security problems.
</p><div class="codeblock"><pre><i>/*</i>
<i>** SQLite user defined function to use with matchinfo() to calculate the</i>
<i>** relevancy of an FTS match. The value returned is the relevancy score</i>
<i>** (a real value greater than or equal to zero). A larger value indicates </i>
<i>** a more relevant document.</i>
<i>**</i>
<i>** The overall relevancy returned is the sum of the relevancies of each </i>
<i>** column value in the FTS table. The relevancy of a column value is the</i>
<i>** sum of the following for each reportable phrase in the FTS query:</i>
<i>**</i>
<i>** (<hit count> / <global hit count>) * <column weight></i>
<i>**</i>
<i>** where <hit count> is the number of instances of the phrase in the</i>
<i>** column value of the current row and <global hit count> is the number</i>
<i>** of instances of the phrase in the same column of all rows in the FTS</i>
<i>** table. The <column weight> is a weighting factor assigned to each</i>
<i>** column by the caller (see below).</i>
<i>**</i>
<i>** The first argument to this function must be the return value of the FTS </i>
<i>** matchinfo() function. Following this must be one argument for each column </i>
<i>** of the FTS table containing a numeric weight factor for the corresponding </i>
<i>** column. Example:</i>
<i>**</i>
<i>** CREATE VIRTUAL TABLE documents USING fts3(title, content)</i>
<i>**</i>
<i>** The following query returns the docids of documents that match the full-text</i>
<i>** query <query> sorted from most to least relevant. When calculating</i>
<i>** relevance, query term instances in the 'title' column are given twice the</i>
<i>** weighting of those in the 'content' column.</i>
<i>**</i>
<i>** SELECT docid FROM documents </i>
<i>** WHERE documents MATCH <query> </i>
<i>** ORDER BY rank(matchinfo(documents), 1.0, 0.5) DESC</i>
<i>*/</i>
static void rankfunc(sqlite3_context *pCtx, int nVal, sqlite3_value **apVal){
int *aMatchinfo; <i>/* Return value of matchinfo() */</i>
int nMatchinfo; <i>/* Number of elements in aMatchinfo[] */</i>
int nCol = 0; <i>/* Number of columns in the table */</i>
int nPhrase = 0; <i>/* Number of phrases in the query */</i>
int iPhrase; <i>/* Current phrase */</i>
double score = 0.0; <i>/* Value to return */</i>
assert( sizeof(int)==4 );
<i> /* Check that the number of arguments passed to this function is correct.</i>
<i> ** If not, jump to wrong_number_args. Set aMatchinfo to point to the array</i>
<i> ** of unsigned integer values returned by FTS function matchinfo. Set</i>
<i> ** nPhrase to contain the number of reportable phrases in the users full-text</i>
<i> ** query, and nCol to the number of columns in the table. Then check that the</i>
<i> ** size of the matchinfo blob is as expected. Return an error if it is not.</i>
<i> */</i>
if( nVal<1 ) goto wrong_number_args;
aMatchinfo = (unsigned int *)sqlite3_value_blob(apVal[0]);
nMatchinfo = sqlite3_value_bytes(apVal[0]) / sizeof(int);
if( nMatchinfo>=2 ){
nPhrase = aMatchinfo[0];
nCol = aMatchinfo[1];
}
if( nMatchinfo!=(2+3*nCol*nPhrase) ){
sqlite3_result_error(pCtx,
"invalid matchinfo blob passed to function rank()", -1);
return;
}
if( nVal!=(1+nCol) ) goto wrong_number_args;
<i> /* Iterate through each phrase in the users query. */</i>
for(iPhrase=0; iPhrase<nPhrase; iPhrase++){
int iCol; <i>/* Current column */</i>
<i> /* Now iterate through each column in the users query. For each column,</i>
<i> ** increment the relevancy score by:</i>
<i> **</i>
<i> ** (<hit count> / <global hit count>) * <column weight></i>
<i> **</i>
<i> ** aPhraseinfo[] points to the start of the data for phrase iPhrase. So</i>
<i> ** the hit count and global hit counts for each column are found in </i>
<i> ** aPhraseinfo[iCol*3] and aPhraseinfo[iCol*3+1], respectively.</i>
<i> */</i>
int *aPhraseinfo = &aMatchinfo[2 + iPhrase*nCol*3];
for(iCol=0; iCol<nCol; iCol++){
int nHitCount = aPhraseinfo[3*iCol];
int nGlobalHitCount = aPhraseinfo[3*iCol+1];
double weight = sqlite3_value_double(apVal[iCol+1]);
if( nHitCount>0 ){
score += ((double)nHitCount / (double)nGlobalHitCount) * weight;
}
}
}
sqlite3_result_double(pCtx, score);
return;
<i> /* Jump here if the wrong number of arguments are passed to this function */</i>
wrong_number_args:
sqlite3_result_error(pCtx, "wrong number of arguments to function rank()", -1);
}
</pre></div>
</integer></language-id><p align="center"><small><i>This page last modified on <a href="https://sqlite.org/docsrc/honeypot" id="mtimelink" data-href="https://sqlite.org/docsrc/finfo/pages/fts3.in?m=8a8aa7fb8b4ab5e5a">2022-08-10 18:11:15</a> UTC </small></i></p>
|