1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
|
/*-------------------------------------------------------------------------
*
* nbtinsert.c
* Item insertion in Lehman and Yao btrees for Postgres.
*
* Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/access/nbtree/nbtinsert.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/transam.h"
#include "access/xloginsert.h"
#include "common/pg_prng.h"
#include "lib/qunique.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/smgr.h"
/* Minimum tree height for application of fastpath optimization */
#define BTREE_FASTPATH_MIN_LEVEL 2
static BTStack _bt_search_insert(Relation rel, BTInsertState insertstate);
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
Relation heapRel,
IndexUniqueCheck checkUnique, bool *is_unique,
uint32 *speculativeToken);
static OffsetNumber _bt_findinsertloc(Relation rel,
BTInsertState insertstate,
bool checkingunique,
bool indexUnchanged,
BTStack stack,
Relation heapRel);
static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack);
static void _bt_insertonpg(Relation rel, BTScanInsert itup_key,
Buffer buf,
Buffer cbuf,
BTStack stack,
IndexTuple itup,
Size itemsz,
OffsetNumber newitemoff,
int postingoff,
bool split_only_page);
static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
Buffer cbuf, OffsetNumber newitemoff, Size newitemsz,
IndexTuple newitem, IndexTuple orignewitem,
IndexTuple nposting, uint16 postingoff);
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
BTStack stack, bool isroot, bool isonly);
static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
OffsetNumber itup_off, bool newfirstdataitem);
static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
BTInsertState insertstate,
bool simpleonly, bool checkingunique,
bool uniquedup, bool indexUnchanged);
static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
OffsetNumber *deletable, int ndeletable,
IndexTuple newitem, OffsetNumber minoff,
OffsetNumber maxoff);
static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
int ndeletable, IndexTuple newitem,
int *nblocks);
static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
/*
* _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
*
* This routine is called by the public interface routine, btinsert.
* By here, itup is filled in, including the TID.
*
* If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
* will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
* UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
* For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
* don't actually insert.
*
* indexUnchanged executor hint indicates if itup is from an
* UPDATE that didn't logically change the indexed value, but
* must nevertheless have a new entry to point to a successor
* version.
*
* The result value is only significant for UNIQUE_CHECK_PARTIAL:
* it must be true if the entry is known unique, else false.
* (In the current implementation we'll also return true after a
* successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
* that's just a coding artifact.)
*/
bool
_bt_doinsert(Relation rel, IndexTuple itup,
IndexUniqueCheck checkUnique, bool indexUnchanged,
Relation heapRel)
{
bool is_unique = false;
BTInsertStateData insertstate;
BTScanInsert itup_key;
BTStack stack;
bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
/* we need an insertion scan key to do our search, so build one */
itup_key = _bt_mkscankey(rel, itup);
if (checkingunique)
{
if (!itup_key->anynullkeys)
{
/* No (heapkeyspace) scantid until uniqueness established */
itup_key->scantid = NULL;
}
else
{
/*
* Scan key for new tuple contains NULL key values. Bypass
* checkingunique steps. They are unnecessary because core code
* considers NULL unequal to every value, including NULL.
*
* This optimization avoids O(N^2) behavior within the
* _bt_findinsertloc() heapkeyspace path when a unique index has a
* large number of "duplicates" with NULL key values.
*/
checkingunique = false;
/* Tuple is unique in the sense that core code cares about */
Assert(checkUnique != UNIQUE_CHECK_EXISTING);
is_unique = true;
}
}
/*
* Fill in the BTInsertState working area, to track the current page and
* position within the page to insert on.
*
* Note that itemsz is passed down to lower level code that deals with
* inserting the item. It must be MAXALIGN()'d. This ensures that space
* accounting code consistently considers the alignment overhead that we
* expect PageAddItem() will add later. (Actually, index_form_tuple() is
* already conservative about alignment, but we don't rely on that from
* this distance. Besides, preserving the "true" tuple size in index
* tuple headers for the benefit of nbtsplitloc.c might happen someday.
* Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
*/
insertstate.itup = itup;
insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
insertstate.itup_key = itup_key;
insertstate.bounds_valid = false;
insertstate.buf = InvalidBuffer;
insertstate.postingoff = 0;
search:
/*
* Find and lock the leaf page that the tuple should be added to by
* searching from the root page. insertstate.buf will hold a buffer that
* is locked in exclusive mode afterwards.
*/
stack = _bt_search_insert(rel, &insertstate);
/*
* checkingunique inserts are not allowed to go ahead when two tuples with
* equal key attribute values would be visible to new MVCC snapshots once
* the xact commits. Check for conflicts in the locked page/buffer (if
* needed) here.
*
* It might be necessary to check a page to the right in _bt_check_unique,
* though that should be very rare. In practice the first page the value
* could be on (with scantid omitted) is almost always also the only page
* that a matching tuple might be found on. This is due to the behavior
* of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
* only be allowed to cross a page boundary when there is no candidate
* leaf page split point that avoids it. Also, _bt_check_unique can use
* the leaf page high key to determine that there will be no duplicates on
* the right sibling without actually visiting it (it uses the high key in
* cases where the new item happens to belong at the far right of the leaf
* page).
*
* NOTE: obviously, _bt_check_unique can only detect keys that are already
* in the index; so it cannot defend against concurrent insertions of the
* same key. We protect against that by means of holding a write lock on
* the first page the value could be on, with omitted/-inf value for the
* implicit heap TID tiebreaker attribute. Any other would-be inserter of
* the same key must acquire a write lock on the same page, so only one
* would-be inserter can be making the check at one time. Furthermore,
* once we are past the check we hold write locks continuously until we
* have performed our insertion, so no later inserter can fail to see our
* insertion. (This requires some care in _bt_findinsertloc.)
*
* If we must wait for another xact, we release the lock while waiting,
* and then must perform a new search.
*
* For a partial uniqueness check, we don't wait for the other xact. Just
* let the tuple in and return false for possibly non-unique, or true for
* definitely unique.
*/
if (checkingunique)
{
TransactionId xwait;
uint32 speculativeToken;
xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
&is_unique, &speculativeToken);
if (unlikely(TransactionIdIsValid(xwait)))
{
/* Have to wait for the other guy ... */
_bt_relbuf(rel, insertstate.buf);
insertstate.buf = InvalidBuffer;
/*
* If it's a speculative insertion, wait for it to finish (ie. to
* go ahead with the insertion, or kill the tuple). Otherwise
* wait for the transaction to finish as usual.
*/
if (speculativeToken)
SpeculativeInsertionWait(xwait, speculativeToken);
else
XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
/* start over... */
if (stack)
_bt_freestack(stack);
goto search;
}
/* Uniqueness is established -- restore heap tid as scantid */
if (itup_key->heapkeyspace)
itup_key->scantid = &itup->t_tid;
}
if (checkUnique != UNIQUE_CHECK_EXISTING)
{
OffsetNumber newitemoff;
/*
* The only conflict predicate locking cares about for indexes is when
* an index tuple insert conflicts with an existing lock. We don't
* know the actual page we're going to insert on for sure just yet in
* checkingunique and !heapkeyspace cases, but it's okay to use the
* first page the value could be on (with scantid omitted) instead.
*/
CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
/*
* Do the insertion. Note that insertstate contains cached binary
* search bounds established within _bt_check_unique when insertion is
* checkingunique.
*/
newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
indexUnchanged, stack, heapRel);
_bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
itup, insertstate.itemsz, newitemoff,
insertstate.postingoff, false);
}
else
{
/* just release the buffer */
_bt_relbuf(rel, insertstate.buf);
}
/* be tidy */
if (stack)
_bt_freestack(stack);
pfree(itup_key);
return is_unique;
}
/*
* _bt_search_insert() -- _bt_search() wrapper for inserts
*
* Search the tree for a particular scankey, or more precisely for the first
* leaf page it could be on. Try to make use of the fastpath optimization's
* rightmost leaf page cache before actually searching the tree from the root
* page, though.
*
* Return value is a stack of parent-page pointers (though see notes about
* fastpath optimization and page splits below). insertstate->buf is set to
* the address of the leaf-page buffer, which is write-locked and pinned in
* all cases (if necessary by creating a new empty root page for caller).
*
* The fastpath optimization avoids most of the work of searching the tree
* repeatedly when a single backend inserts successive new tuples on the
* rightmost leaf page of an index. A backend cache of the rightmost leaf
* page is maintained within _bt_insertonpg(), and used here. The cache is
* invalidated here when an insert of a non-pivot tuple must take place on a
* non-rightmost leaf page.
*
* The optimization helps with indexes on an auto-incremented field. It also
* helps with indexes on datetime columns, as well as indexes with lots of
* NULL values. (NULLs usually get inserted in the rightmost page for single
* column indexes, since they usually get treated as coming after everything
* else in the key space. Individual NULL tuples will generally be placed on
* the rightmost leaf page due to the influence of the heap TID column.)
*
* Note that we avoid applying the optimization when there is insufficient
* space on the rightmost page to fit caller's new item. This is necessary
* because we'll need to return a real descent stack when a page split is
* expected (actually, caller can cope with a leaf page split that uses a NULL
* stack, but that's very slow and so must be avoided). Note also that the
* fastpath optimization acquires the lock on the page conditionally as a way
* of reducing extra contention when there are concurrent insertions into the
* rightmost page (we give up if we'd have to wait for the lock). We assume
* that it isn't useful to apply the optimization when there is contention,
* since each per-backend cache won't stay valid for long.
*/
static BTStack
_bt_search_insert(Relation rel, BTInsertState insertstate)
{
Assert(insertstate->buf == InvalidBuffer);
Assert(!insertstate->bounds_valid);
Assert(insertstate->postingoff == 0);
if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
{
/* Simulate a _bt_getbuf() call with conditional locking */
insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
if (_bt_conditionallockbuf(rel, insertstate->buf))
{
Page page;
BTPageOpaque opaque;
_bt_checkpage(rel, insertstate->buf);
page = BufferGetPage(insertstate->buf);
opaque = BTPageGetOpaque(page);
/*
* Check if the page is still the rightmost leaf page and has
* enough free space to accommodate the new tuple. Also check
* that the insertion scan key is strictly greater than the first
* non-pivot tuple on the page. (Note that we expect itup_key's
* scantid to be unset when our caller is a checkingunique
* inserter.)
*/
if (P_RIGHTMOST(opaque) &&
P_ISLEAF(opaque) &&
!P_IGNORE(opaque) &&
PageGetFreeSpace(page) > insertstate->itemsz &&
PageGetMaxOffsetNumber(page) >= P_HIKEY &&
_bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
{
/*
* Caller can use the fastpath optimization because cached
* block is still rightmost leaf page, which can fit caller's
* new tuple without splitting. Keep block in local cache for
* next insert, and have caller use NULL stack.
*
* Note that _bt_insert_parent() has an assertion that catches
* leaf page splits that somehow follow from a fastpath insert
* (it should only be passed a NULL stack when it must deal
* with a concurrent root page split, and never because a NULL
* stack was returned here).
*/
return NULL;
}
/* Page unsuitable for caller, drop lock and pin */
_bt_relbuf(rel, insertstate->buf);
}
else
{
/* Lock unavailable, drop pin */
ReleaseBuffer(insertstate->buf);
}
/* Forget block, since cache doesn't appear to be useful */
RelationSetTargetBlock(rel, InvalidBlockNumber);
}
/* Cannot use optimization -- descend tree, return proper descent stack */
return _bt_search(rel, insertstate->itup_key, &insertstate->buf, BT_WRITE,
NULL);
}
/*
* _bt_check_unique() -- Check for violation of unique index constraint
*
* Returns InvalidTransactionId if there is no conflict, else an xact ID
* we must wait for to see if it commits a conflicting tuple. If an actual
* conflict is detected, no return --- just ereport(). If an xact ID is
* returned, and the conflicting tuple still has a speculative insertion in
* progress, *speculativeToken is set to non-zero, and the caller can wait for
* the verdict on the insertion using SpeculativeInsertionWait().
*
* However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
* InvalidTransactionId because we don't want to wait. In this case we
* set *is_unique to false if there is a potential conflict, and the
* core code must redo the uniqueness check later.
*
* As a side-effect, sets state in insertstate that can later be used by
* _bt_findinsertloc() to reuse most of the binary search work we do
* here.
*
* This code treats NULLs as equal, unlike the default semantics for unique
* indexes. So do not call here when there are NULL values in scan key and
* the index uses the default NULLS DISTINCT mode.
*/
static TransactionId
_bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
IndexUniqueCheck checkUnique, bool *is_unique,
uint32 *speculativeToken)
{
IndexTuple itup = insertstate->itup;
IndexTuple curitup = NULL;
ItemId curitemid = NULL;
BTScanInsert itup_key = insertstate->itup_key;
SnapshotData SnapshotDirty;
OffsetNumber offset;
OffsetNumber maxoff;
Page page;
BTPageOpaque opaque;
Buffer nbuf = InvalidBuffer;
bool found = false;
bool inposting = false;
bool prevalldead = true;
int curposti = 0;
/* Assume unique until we find a duplicate */
*is_unique = true;
InitDirtySnapshot(SnapshotDirty);
page = BufferGetPage(insertstate->buf);
opaque = BTPageGetOpaque(page);
maxoff = PageGetMaxOffsetNumber(page);
/*
* Find the first tuple with the same key.
*
* This also saves the binary search bounds in insertstate. We use them
* in the fastpath below, but also in the _bt_findinsertloc() call later.
*/
Assert(!insertstate->bounds_valid);
offset = _bt_binsrch_insert(rel, insertstate);
/*
* Scan over all equal tuples, looking for live conflicts.
*/
Assert(!insertstate->bounds_valid || insertstate->low == offset);
Assert(!itup_key->anynullkeys);
Assert(itup_key->scantid == NULL);
for (;;)
{
/*
* Each iteration of the loop processes one heap TID, not one index
* tuple. Current offset number for page isn't usually advanced on
* iterations that process heap TIDs from posting list tuples.
*
* "inposting" state is set when _inside_ a posting list --- not when
* we're at the start (or end) of a posting list. We advance curposti
* at the end of the iteration when inside a posting list tuple. In
* general, every loop iteration either advances the page offset or
* advances curposti --- an iteration that handles the rightmost/max
* heap TID in a posting list finally advances the page offset (and
* unsets "inposting").
*
* Make sure the offset points to an actual index tuple before trying
* to examine it...
*/
if (offset <= maxoff)
{
/*
* Fastpath: In most cases, we can use cached search bounds to
* limit our consideration to items that are definitely
* duplicates. This fastpath doesn't apply when the original page
* is empty, or when initial offset is past the end of the
* original page, which may indicate that we need to examine a
* second or subsequent page.
*
* Note that this optimization allows us to avoid calling
* _bt_compare() directly when there are no duplicates, as long as
* the offset where the key will go is not at the end of the page.
*/
if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
{
Assert(insertstate->bounds_valid);
Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
Assert(insertstate->low <= insertstate->stricthigh);
Assert(_bt_compare(rel, itup_key, page, offset) < 0);
break;
}
/*
* We can skip items that are already marked killed.
*
* In the presence of heavy update activity an index may contain
* many killed items with the same key; running _bt_compare() on
* each killed item gets expensive. Just advance over killed
* items as quickly as we can. We only apply _bt_compare() when
* we get to a non-killed item. We could reuse the bounds to
* avoid _bt_compare() calls for known equal tuples, but it
* doesn't seem worth it.
*/
if (!inposting)
curitemid = PageGetItemId(page, offset);
if (inposting || !ItemIdIsDead(curitemid))
{
ItemPointerData htid;
bool all_dead = false;
if (!inposting)
{
/* Plain tuple, or first TID in posting list tuple */
if (_bt_compare(rel, itup_key, page, offset) != 0)
break; /* we're past all the equal tuples */
/* Advanced curitup */
curitup = (IndexTuple) PageGetItem(page, curitemid);
Assert(!BTreeTupleIsPivot(curitup));
}
/* okay, we gotta fetch the heap tuple using htid ... */
if (!BTreeTupleIsPosting(curitup))
{
/* ... htid is from simple non-pivot tuple */
Assert(!inposting);
htid = curitup->t_tid;
}
else if (!inposting)
{
/* ... htid is first TID in new posting list */
inposting = true;
prevalldead = true;
curposti = 0;
htid = *BTreeTupleGetPostingN(curitup, 0);
}
else
{
/* ... htid is second or subsequent TID in posting list */
Assert(curposti > 0);
htid = *BTreeTupleGetPostingN(curitup, curposti);
}
/*
* If we are doing a recheck, we expect to find the tuple we
* are rechecking. It's not a duplicate, but we have to keep
* scanning.
*/
if (checkUnique == UNIQUE_CHECK_EXISTING &&
ItemPointerCompare(&htid, &itup->t_tid) == 0)
{
found = true;
}
/*
* Check if there's any table tuples for this index entry
* satisfying SnapshotDirty. This is necessary because for AMs
* with optimizations like heap's HOT, we have just a single
* index entry for the entire chain.
*/
else if (table_index_fetch_tuple_check(heapRel, &htid,
&SnapshotDirty,
&all_dead))
{
TransactionId xwait;
/*
* It is a duplicate. If we are only doing a partial
* check, then don't bother checking if the tuple is being
* updated in another transaction. Just return the fact
* that it is a potential conflict and leave the full
* check till later. Don't invalidate binary search
* bounds.
*/
if (checkUnique == UNIQUE_CHECK_PARTIAL)
{
if (nbuf != InvalidBuffer)
_bt_relbuf(rel, nbuf);
*is_unique = false;
return InvalidTransactionId;
}
/*
* If this tuple is being updated by other transaction
* then we have to wait for its commit/abort.
*/
xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
SnapshotDirty.xmin : SnapshotDirty.xmax;
if (TransactionIdIsValid(xwait))
{
if (nbuf != InvalidBuffer)
_bt_relbuf(rel, nbuf);
/* Tell _bt_doinsert to wait... */
*speculativeToken = SnapshotDirty.speculativeToken;
/* Caller releases lock on buf immediately */
insertstate->bounds_valid = false;
return xwait;
}
/*
* Otherwise we have a definite conflict. But before
* complaining, look to see if the tuple we want to insert
* is itself now committed dead --- if so, don't complain.
* This is a waste of time in normal scenarios but we must
* do it to support CREATE INDEX CONCURRENTLY.
*
* We must follow HOT-chains here because during
* concurrent index build, we insert the root TID though
* the actual tuple may be somewhere in the HOT-chain.
* While following the chain we might not stop at the
* exact tuple which triggered the insert, but that's OK
* because if we find a live tuple anywhere in this chain,
* we have a unique key conflict. The other live tuple is
* not part of this chain because it had a different index
* entry.
*/
htid = itup->t_tid;
if (table_index_fetch_tuple_check(heapRel, &htid,
SnapshotSelf, NULL))
{
/* Normal case --- it's still live */
}
else
{
/*
* It's been deleted, so no error, and no need to
* continue searching
*/
break;
}
/*
* Check for a conflict-in as we would if we were going to
* write to this page. We aren't actually going to write,
* but we want a chance to report SSI conflicts that would
* otherwise be masked by this unique constraint
* violation.
*/
CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
/*
* This is a definite conflict. Break the tuple down into
* datums and report the error. But first, make sure we
* release the buffer locks we're holding ---
* BuildIndexValueDescription could make catalog accesses,
* which in the worst case might touch this same index and
* cause deadlocks.
*/
if (nbuf != InvalidBuffer)
_bt_relbuf(rel, nbuf);
_bt_relbuf(rel, insertstate->buf);
insertstate->buf = InvalidBuffer;
insertstate->bounds_valid = false;
{
Datum values[INDEX_MAX_KEYS];
bool isnull[INDEX_MAX_KEYS];
char *key_desc;
index_deform_tuple(itup, RelationGetDescr(rel),
values, isnull);
key_desc = BuildIndexValueDescription(rel, values,
isnull);
ereport(ERROR,
(errcode(ERRCODE_UNIQUE_VIOLATION),
errmsg("duplicate key value violates unique constraint \"%s\"",
RelationGetRelationName(rel)),
key_desc ? errdetail("Key %s already exists.",
key_desc) : 0,
errtableconstraint(heapRel,
RelationGetRelationName(rel))));
}
}
else if (all_dead && (!inposting ||
(prevalldead &&
curposti == BTreeTupleGetNPosting(curitup) - 1)))
{
/*
* The conflicting tuple (or all HOT chains pointed to by
* all posting list TIDs) is dead to everyone, so mark the
* index entry killed.
*/
ItemIdMarkDead(curitemid);
opaque->btpo_flags |= BTP_HAS_GARBAGE;
/*
* Mark buffer with a dirty hint, since state is not
* crucial. Be sure to mark the proper buffer dirty.
*/
if (nbuf != InvalidBuffer)
MarkBufferDirtyHint(nbuf, true);
else
MarkBufferDirtyHint(insertstate->buf, true);
}
/*
* Remember if posting list tuple has even a single HOT chain
* whose members are not all dead
*/
if (!all_dead && inposting)
prevalldead = false;
}
}
if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
{
/* Advance to next TID in same posting list */
curposti++;
continue;
}
else if (offset < maxoff)
{
/* Advance to next tuple */
curposti = 0;
inposting = false;
offset = OffsetNumberNext(offset);
}
else
{
int highkeycmp;
/* If scankey == hikey we gotta check the next page too */
if (P_RIGHTMOST(opaque))
break;
highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
Assert(highkeycmp <= 0);
if (highkeycmp != 0)
break;
/* Advance to next non-dead page --- there must be one */
for (;;)
{
BlockNumber nblkno = opaque->btpo_next;
nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
page = BufferGetPage(nbuf);
opaque = BTPageGetOpaque(page);
if (!P_IGNORE(opaque))
break;
if (P_RIGHTMOST(opaque))
elog(ERROR, "fell off the end of index \"%s\"",
RelationGetRelationName(rel));
}
/* Will also advance to next tuple */
curposti = 0;
inposting = false;
maxoff = PageGetMaxOffsetNumber(page);
offset = P_FIRSTDATAKEY(opaque);
/* Don't invalidate binary search bounds */
}
}
/*
* If we are doing a recheck then we should have found the tuple we are
* checking. Otherwise there's something very wrong --- probably, the
* index is on a non-immutable expression.
*/
if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
ereport(ERROR,
(errcode(ERRCODE_INTERNAL_ERROR),
errmsg("failed to re-find tuple within index \"%s\"",
RelationGetRelationName(rel)),
errhint("This may be because of a non-immutable index expression."),
errtableconstraint(heapRel,
RelationGetRelationName(rel))));
if (nbuf != InvalidBuffer)
_bt_relbuf(rel, nbuf);
return InvalidTransactionId;
}
/*
* _bt_findinsertloc() -- Finds an insert location for a tuple
*
* On entry, insertstate buffer contains the page the new tuple belongs
* on. It is exclusive-locked and pinned by the caller.
*
* If 'checkingunique' is true, the buffer on entry is the first page
* that contains duplicates of the new key. If there are duplicates on
* multiple pages, the correct insertion position might be some page to
* the right, rather than the first page. In that case, this function
* moves right to the correct target page.
*
* (In a !heapkeyspace index, there can be multiple pages with the same
* high key, where the new tuple could legitimately be placed on. In
* that case, the caller passes the first page containing duplicates,
* just like when checkingunique=true. If that page doesn't have enough
* room for the new tuple, this function moves right, trying to find a
* legal page that does.)
*
* If 'indexUnchanged' is true, this is for an UPDATE that didn't
* logically change the indexed value, but must nevertheless have a new
* entry to point to a successor version. This hint from the executor
* will influence our behavior when the page might have to be split and
* we must consider our options. Bottom-up index deletion can avoid
* pathological version-driven page splits, but we only want to go to the
* trouble of trying it when we already have moderate confidence that
* it's appropriate. The hint should not significantly affect our
* behavior over time unless practically all inserts on to the leaf page
* get the hint.
*
* On exit, insertstate buffer contains the chosen insertion page, and
* the offset within that page is returned. If _bt_findinsertloc needed
* to move right, the lock and pin on the original page are released, and
* the new buffer is exclusively locked and pinned instead.
*
* If insertstate contains cached binary search bounds, we will take
* advantage of them. This avoids repeating comparisons that we made in
* _bt_check_unique() already.
*/
static OffsetNumber
_bt_findinsertloc(Relation rel,
BTInsertState insertstate,
bool checkingunique,
bool indexUnchanged,
BTStack stack,
Relation heapRel)
{
BTScanInsert itup_key = insertstate->itup_key;
Page page = BufferGetPage(insertstate->buf);
BTPageOpaque opaque;
OffsetNumber newitemoff;
opaque = BTPageGetOpaque(page);
/* Check 1/3 of a page restriction */
if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
_bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
insertstate->itup);
Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
Assert(!insertstate->bounds_valid || checkingunique);
Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
if (itup_key->heapkeyspace)
{
/* Keep track of whether checkingunique duplicate seen */
bool uniquedup = indexUnchanged;
/*
* If we're inserting into a unique index, we may have to walk right
* through leaf pages to find the one leaf page that we must insert on
* to.
*
* This is needed for checkingunique callers because a scantid was not
* used when we called _bt_search(). scantid can only be set after
* _bt_check_unique() has checked for duplicates. The buffer
* initially stored in insertstate->buf has the page where the first
* duplicate key might be found, which isn't always the page that new
* tuple belongs on. The heap TID attribute for new tuple (scantid)
* could force us to insert on a sibling page, though that should be
* very rare in practice.
*/
if (checkingunique)
{
if (insertstate->low < insertstate->stricthigh)
{
/* Encountered a duplicate in _bt_check_unique() */
Assert(insertstate->bounds_valid);
uniquedup = true;
}
for (;;)
{
/*
* Does the new tuple belong on this page?
*
* The earlier _bt_check_unique() call may well have
* established a strict upper bound on the offset for the new
* item. If it's not the last item of the page (i.e. if there
* is at least one tuple on the page that goes after the tuple
* we're inserting) then we know that the tuple belongs on
* this page. We can skip the high key check.
*/
if (insertstate->bounds_valid &&
insertstate->low <= insertstate->stricthigh &&
insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
break;
/* Test '<=', not '!=', since scantid is set now */
if (P_RIGHTMOST(opaque) ||
_bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
break;
_bt_stepright(rel, insertstate, stack);
/* Update local state after stepping right */
page = BufferGetPage(insertstate->buf);
opaque = BTPageGetOpaque(page);
/* Assume duplicates (if checkingunique) */
uniquedup = true;
}
}
/*
* If the target page cannot fit newitem, try to avoid splitting the
* page on insert by performing deletion or deduplication now
*/
if (PageGetFreeSpace(page) < insertstate->itemsz)
_bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
checkingunique, uniquedup,
indexUnchanged);
}
else
{
/*----------
* This is a !heapkeyspace (version 2 or 3) index. The current page
* is the first page that we could insert the new tuple to, but there
* may be other pages to the right that we could opt to use instead.
*
* If the new key is equal to one or more existing keys, we can
* legitimately place it anywhere in the series of equal keys. In
* fact, if the new key is equal to the page's "high key" we can place
* it on the next page. If it is equal to the high key, and there's
* not room to insert the new tuple on the current page without
* splitting, then we move right hoping to find more free space and
* avoid a split.
*
* Keep scanning right until we
* (a) find a page with enough free space,
* (b) reach the last page where the tuple can legally go, or
* (c) get tired of searching.
* (c) is not flippant; it is important because if there are many
* pages' worth of equal keys, it's better to split one of the early
* pages than to scan all the way to the end of the run of equal keys
* on every insert. We implement "get tired" as a random choice,
* since stopping after scanning a fixed number of pages wouldn't work
* well (we'd never reach the right-hand side of previously split
* pages). The probability of moving right is set at 0.99, which may
* seem too high to change the behavior much, but it does an excellent
* job of preventing O(N^2) behavior with many equal keys.
*----------
*/
while (PageGetFreeSpace(page) < insertstate->itemsz)
{
/*
* Before considering moving right, see if we can obtain enough
* space by erasing LP_DEAD items
*/
if (P_HAS_GARBAGE(opaque))
{
/* Perform simple deletion */
_bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
false, false, false);
if (PageGetFreeSpace(page) >= insertstate->itemsz)
break; /* OK, now we have enough space */
}
/*
* Nope, so check conditions (b) and (c) enumerated above
*
* The earlier _bt_check_unique() call may well have established a
* strict upper bound on the offset for the new item. If it's not
* the last item of the page (i.e. if there is at least one tuple
* on the page that's greater than the tuple we're inserting to)
* then we know that the tuple belongs on this page. We can skip
* the high key check.
*/
if (insertstate->bounds_valid &&
insertstate->low <= insertstate->stricthigh &&
insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
break;
if (P_RIGHTMOST(opaque) ||
_bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
break;
_bt_stepright(rel, insertstate, stack);
/* Update local state after stepping right */
page = BufferGetPage(insertstate->buf);
opaque = BTPageGetOpaque(page);
}
}
/*
* We should now be on the correct page. Find the offset within the page
* for the new tuple. (Possibly reusing earlier search bounds.)
*/
Assert(P_RIGHTMOST(opaque) ||
_bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
newitemoff = _bt_binsrch_insert(rel, insertstate);
if (insertstate->postingoff == -1)
{
/*
* There is an overlapping posting list tuple with its LP_DEAD bit
* set. We don't want to unnecessarily unset its LP_DEAD bit while
* performing a posting list split, so perform simple index tuple
* deletion early.
*/
_bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
false, false, false);
/*
* Do new binary search. New insert location cannot overlap with any
* posting list now.
*/
Assert(!insertstate->bounds_valid);
insertstate->postingoff = 0;
newitemoff = _bt_binsrch_insert(rel, insertstate);
Assert(insertstate->postingoff == 0);
}
return newitemoff;
}
/*
* Step right to next non-dead page, during insertion.
*
* This is a bit more complicated than moving right in a search. We must
* write-lock the target page before releasing write lock on current page;
* else someone else's _bt_check_unique scan could fail to see our insertion.
* Write locks on intermediate dead pages won't do because we don't know when
* they will get de-linked from the tree.
*
* This is more aggressive than it needs to be for non-unique !heapkeyspace
* indexes.
*/
static void
_bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
{
Page page;
BTPageOpaque opaque;
Buffer rbuf;
BlockNumber rblkno;
page = BufferGetPage(insertstate->buf);
opaque = BTPageGetOpaque(page);
rbuf = InvalidBuffer;
rblkno = opaque->btpo_next;
for (;;)
{
rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
page = BufferGetPage(rbuf);
opaque = BTPageGetOpaque(page);
/*
* If this page was incompletely split, finish the split now. We do
* this while holding a lock on the left sibling, which is not good
* because finishing the split could be a fairly lengthy operation.
* But this should happen very seldom.
*/
if (P_INCOMPLETE_SPLIT(opaque))
{
_bt_finish_split(rel, rbuf, stack);
rbuf = InvalidBuffer;
continue;
}
if (!P_IGNORE(opaque))
break;
if (P_RIGHTMOST(opaque))
elog(ERROR, "fell off the end of index \"%s\"",
RelationGetRelationName(rel));
rblkno = opaque->btpo_next;
}
/* rbuf locked; unlock buf, update state for caller */
_bt_relbuf(rel, insertstate->buf);
insertstate->buf = rbuf;
insertstate->bounds_valid = false;
}
/*----------
* _bt_insertonpg() -- Insert a tuple on a particular page in the index.
*
* This recursive procedure does the following things:
*
* + if postingoff != 0, splits existing posting list tuple
* (since it overlaps with new 'itup' tuple).
* + if necessary, splits the target page, using 'itup_key' for
* suffix truncation on leaf pages (caller passes NULL for
* non-leaf pages).
* + inserts the new tuple (might be split from posting list).
* + if the page was split, pops the parent stack, and finds the
* right place to insert the new child pointer (by walking
* right using information stored in the parent stack).
* + invokes itself with the appropriate tuple for the right
* child page on the parent.
* + updates the metapage if a true root or fast root is split.
*
* On entry, we must have the correct buffer in which to do the
* insertion, and the buffer must be pinned and write-locked. On return,
* we will have dropped both the pin and the lock on the buffer.
*
* This routine only performs retail tuple insertions. 'itup' should
* always be either a non-highkey leaf item, or a downlink (new high
* key items are created indirectly, when a page is split). When
* inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
* we're inserting the downlink for. This function will clear the
* INCOMPLETE_SPLIT flag on it, and release the buffer.
*----------
*/
static void
_bt_insertonpg(Relation rel,
BTScanInsert itup_key,
Buffer buf,
Buffer cbuf,
BTStack stack,
IndexTuple itup,
Size itemsz,
OffsetNumber newitemoff,
int postingoff,
bool split_only_page)
{
Page page;
BTPageOpaque opaque;
bool isleaf,
isroot,
isrightmost,
isonly;
IndexTuple oposting = NULL;
IndexTuple origitup = NULL;
IndexTuple nposting = NULL;
page = BufferGetPage(buf);
opaque = BTPageGetOpaque(page);
isleaf = P_ISLEAF(opaque);
isroot = P_ISROOT(opaque);
isrightmost = P_RIGHTMOST(opaque);
isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
/* child buffer must be given iff inserting on an internal page */
Assert(isleaf == !BufferIsValid(cbuf));
/* tuple must have appropriate number of attributes */
Assert(!isleaf ||
BTreeTupleGetNAtts(itup, rel) ==
IndexRelationGetNumberOfAttributes(rel));
Assert(isleaf ||
BTreeTupleGetNAtts(itup, rel) <=
IndexRelationGetNumberOfKeyAttributes(rel));
Assert(!BTreeTupleIsPosting(itup));
Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
/* Caller must always finish incomplete split for us */
Assert(!P_INCOMPLETE_SPLIT(opaque));
/*
* Every internal page should have exactly one negative infinity item at
* all times. Only _bt_split() and _bt_newroot() should add items that
* become negative infinity items through truncation, since they're the
* only routines that allocate new internal pages.
*/
Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
/*
* Do we need to split an existing posting list item?
*/
if (postingoff != 0)
{
ItemId itemid = PageGetItemId(page, newitemoff);
/*
* The new tuple is a duplicate with a heap TID that falls inside the
* range of an existing posting list tuple on a leaf page. Prepare to
* split an existing posting list. Overwriting the posting list with
* its post-split version is treated as an extra step in either the
* insert or page split critical section.
*/
Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
oposting = (IndexTuple) PageGetItem(page, itemid);
/*
* postingoff value comes from earlier call to _bt_binsrch_posting().
* Its binary search might think that a plain tuple must be a posting
* list tuple that needs to be split. This can happen with corruption
* involving an existing plain tuple that is a duplicate of the new
* item, up to and including its table TID. Check for that here in
* passing.
*
* Also verify that our caller has made sure that the existing posting
* list tuple does not have its LP_DEAD bit set.
*/
if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
ItemPointerGetBlockNumber(&itup->t_tid),
ItemPointerGetOffsetNumber(&itup->t_tid),
newitemoff, BufferGetBlockNumber(buf),
RelationGetRelationName(rel))));
/* use a mutable copy of itup as our itup from here on */
origitup = itup;
itup = CopyIndexTuple(origitup);
nposting = _bt_swap_posting(itup, oposting, postingoff);
/* itup now contains rightmost/max TID from oposting */
/* Alter offset so that newitem goes after posting list */
newitemoff = OffsetNumberNext(newitemoff);
}
/*
* Do we need to split the page to fit the item on it?
*
* Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
* so this comparison is correct even though we appear to be accounting
* only for the item and not for its line pointer.
*/
if (PageGetFreeSpace(page) < itemsz)
{
Buffer rbuf;
Assert(!split_only_page);
/* split the buffer into left and right halves */
rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup,
origitup, nposting, postingoff);
PredicateLockPageSplit(rel,
BufferGetBlockNumber(buf),
BufferGetBlockNumber(rbuf));
/*----------
* By here,
*
* + our target page has been split;
* + the original tuple has been inserted;
* + we have write locks on both the old (left half)
* and new (right half) buffers, after the split; and
* + we know the key we want to insert into the parent
* (it's the "high key" on the left child page).
*
* We're ready to do the parent insertion. We need to hold onto the
* locks for the child pages until we locate the parent, but we can
* at least release the lock on the right child before doing the
* actual insertion. The lock on the left child will be released
* last of all by parent insertion, where it is the 'cbuf' of parent
* page.
*----------
*/
_bt_insert_parent(rel, buf, rbuf, stack, isroot, isonly);
}
else
{
Buffer metabuf = InvalidBuffer;
Page metapg = NULL;
BTMetaPageData *metad = NULL;
BlockNumber blockcache;
/*
* If we are doing this insert because we split a page that was the
* only one on its tree level, but was not the root, it may have been
* the "fast root". We need to ensure that the fast root link points
* at or above the current page. We can safely acquire a lock on the
* metapage here --- see comments for _bt_newroot().
*/
if (unlikely(split_only_page))
{
Assert(!isleaf);
Assert(BufferIsValid(cbuf));
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
metapg = BufferGetPage(metabuf);
metad = BTPageGetMeta(metapg);
if (metad->btm_fastlevel >= opaque->btpo_level)
{
/* no update wanted */
_bt_relbuf(rel, metabuf);
metabuf = InvalidBuffer;
}
}
/* Do the update. No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
if (postingoff != 0)
memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false,
false) == InvalidOffsetNumber)
elog(PANIC, "failed to add new item to block %u in index \"%s\"",
BufferGetBlockNumber(buf), RelationGetRelationName(rel));
MarkBufferDirty(buf);
if (BufferIsValid(metabuf))
{
/* upgrade meta-page if needed */
if (metad->btm_version < BTREE_NOVAC_VERSION)
_bt_upgrademetapage(metapg);
metad->btm_fastroot = BufferGetBlockNumber(buf);
metad->btm_fastlevel = opaque->btpo_level;
MarkBufferDirty(metabuf);
}
/*
* Clear INCOMPLETE_SPLIT flag on child if inserting the new item
* finishes a split
*/
if (!isleaf)
{
Page cpage = BufferGetPage(cbuf);
BTPageOpaque cpageop = BTPageGetOpaque(cpage);
Assert(P_INCOMPLETE_SPLIT(cpageop));
cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
MarkBufferDirty(cbuf);
}
/* XLOG stuff */
if (RelationNeedsWAL(rel))
{
xl_btree_insert xlrec;
xl_btree_metadata xlmeta;
uint8 xlinfo;
XLogRecPtr recptr;
uint16 upostingoff;
xlrec.offnum = newitemoff;
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
if (isleaf && postingoff == 0)
{
/* Simple leaf insert */
xlinfo = XLOG_BTREE_INSERT_LEAF;
}
else if (postingoff != 0)
{
/*
* Leaf insert with posting list split. Must include
* postingoff field before newitem/orignewitem.
*/
Assert(isleaf);
xlinfo = XLOG_BTREE_INSERT_POST;
}
else
{
/* Internal page insert, which finishes a split on cbuf */
xlinfo = XLOG_BTREE_INSERT_UPPER;
XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
if (BufferIsValid(metabuf))
{
/* Actually, it's an internal page insert + meta update */
xlinfo = XLOG_BTREE_INSERT_META;
Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
xlmeta.version = metad->btm_version;
xlmeta.root = metad->btm_root;
xlmeta.level = metad->btm_level;
xlmeta.fastroot = metad->btm_fastroot;
xlmeta.fastlevel = metad->btm_fastlevel;
xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
xlmeta.allequalimage = metad->btm_allequalimage;
XLogRegisterBuffer(2, metabuf,
REGBUF_WILL_INIT | REGBUF_STANDARD);
XLogRegisterBufData(2, (char *) &xlmeta,
sizeof(xl_btree_metadata));
}
}
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
if (postingoff == 0)
{
/* Just log itup from caller */
XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
}
else
{
/*
* Insert with posting list split (XLOG_BTREE_INSERT_POST
* record) case.
*
* Log postingoff. Also log origitup, not itup. REDO routine
* must reconstruct final itup (as well as nposting) using
* _bt_swap_posting().
*/
upostingoff = postingoff;
XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16));
XLogRegisterBufData(0, (char *) origitup,
IndexTupleSize(origitup));
}
recptr = XLogInsert(RM_BTREE_ID, xlinfo);
if (BufferIsValid(metabuf))
PageSetLSN(metapg, recptr);
if (!isleaf)
PageSetLSN(BufferGetPage(cbuf), recptr);
PageSetLSN(page, recptr);
}
END_CRIT_SECTION();
/* Release subsidiary buffers */
if (BufferIsValid(metabuf))
_bt_relbuf(rel, metabuf);
if (!isleaf)
_bt_relbuf(rel, cbuf);
/*
* Cache the block number if this is the rightmost leaf page. Cache
* may be used by a future inserter within _bt_search_insert().
*/
blockcache = InvalidBlockNumber;
if (isrightmost && isleaf && !isroot)
blockcache = BufferGetBlockNumber(buf);
/* Release buffer for insertion target block */
_bt_relbuf(rel, buf);
/*
* If we decided to cache the insertion target block before releasing
* its buffer lock, then cache it now. Check the height of the tree
* first, though. We don't go for the optimization with small
* indexes. Defer final check to this point to ensure that we don't
* call _bt_getrootheight while holding a buffer lock.
*/
if (BlockNumberIsValid(blockcache) &&
_bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
RelationSetTargetBlock(rel, blockcache);
}
/* be tidy */
if (postingoff != 0)
{
/* itup is actually a modified copy of caller's original */
pfree(nposting);
pfree(itup);
}
}
/*
* _bt_split() -- split a page in the btree.
*
* On entry, buf is the page to split, and is pinned and write-locked.
* newitemoff etc. tell us about the new item that must be inserted
* along with the data from the original page.
*
* itup_key is used for suffix truncation on leaf pages (internal
* page callers pass NULL). When splitting a non-leaf page, 'cbuf'
* is the left-sibling of the page we're inserting the downlink for.
* This function will clear the INCOMPLETE_SPLIT flag on it, and
* release the buffer.
*
* orignewitem, nposting, and postingoff are needed when an insert of
* orignewitem results in both a posting list split and a page split.
* These extra posting list split details are used here in the same
* way as they are used in the more common case where a posting list
* split does not coincide with a page split. We need to deal with
* posting list splits directly in order to ensure that everything
* that follows from the insert of orignewitem is handled as a single
* atomic operation (though caller's insert of a new pivot/downlink
* into parent page will still be a separate operation). See
* nbtree/README for details on the design of posting list splits.
*
* Returns the new right sibling of buf, pinned and write-locked.
* The pin and lock on buf are maintained.
*/
static Buffer
_bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
{
Buffer rbuf;
Page origpage;
Page leftpage,
rightpage;
BlockNumber origpagenumber,
rightpagenumber;
BTPageOpaque ropaque,
lopaque,
oopaque;
Buffer sbuf = InvalidBuffer;
Page spage = NULL;
BTPageOpaque sopaque = NULL;
Size itemsz;
ItemId itemid;
IndexTuple firstright,
lefthighkey;
OffsetNumber firstrightoff;
OffsetNumber afterleftoff,
afterrightoff,
minusinfoff;
OffsetNumber origpagepostingoff;
OffsetNumber maxoff;
OffsetNumber i;
bool newitemonleft,
isleaf,
isrightmost;
/*
* origpage is the original page to be split. leftpage is a temporary
* buffer that receives the left-sibling data, which will be copied back
* into origpage on success. rightpage is the new page that will receive
* the right-sibling data.
*
* leftpage is allocated after choosing a split point. rightpage's new
* buffer isn't acquired until after leftpage is initialized and has new
* high key, the last point where splitting the page may fail (barring
* corruption). Failing before acquiring new buffer won't have lasting
* consequences, since origpage won't have been modified and leftpage is
* only workspace.
*/
origpage = BufferGetPage(buf);
oopaque = BTPageGetOpaque(origpage);
isleaf = P_ISLEAF(oopaque);
isrightmost = P_RIGHTMOST(oopaque);
maxoff = PageGetMaxOffsetNumber(origpage);
origpagenumber = BufferGetBlockNumber(buf);
/*
* Choose a point to split origpage at.
*
* A split point can be thought of as a point _between_ two existing data
* items on origpage (the lastleft and firstright tuples), provided you
* pretend that the new item that didn't fit is already on origpage.
*
* Since origpage does not actually contain newitem, the representation of
* split points needs to work with two boundary cases: splits where
* newitem is lastleft, and splits where newitem is firstright.
* newitemonleft resolves the ambiguity that would otherwise exist when
* newitemoff == firstrightoff. In all other cases it's clear which side
* of the split every tuple goes on from context. newitemonleft is
* usually (but not always) redundant information.
*
* firstrightoff is supposed to be an origpage offset number, but it's
* possible that its value will be maxoff+1, which is "past the end" of
* origpage. This happens in the rare case where newitem goes after all
* existing items (i.e. newitemoff is maxoff+1) and we end up splitting
* origpage at the point that leaves newitem alone on new right page. Any
* "!newitemonleft && newitemoff == firstrightoff" split point makes
* newitem the firstright tuple, though, so this case isn't a special
* case.
*/
firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
newitem, &newitemonleft);
/* Allocate temp buffer for leftpage */
leftpage = PageGetTempPage(origpage);
_bt_pageinit(leftpage, BufferGetPageSize(buf));
lopaque = BTPageGetOpaque(leftpage);
/*
* leftpage won't be the root when we're done. Also, clear the SPLIT_END
* and HAS_GARBAGE flags.
*/
lopaque->btpo_flags = oopaque->btpo_flags;
lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
/* set flag in leftpage indicating that rightpage has no downlink yet */
lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
lopaque->btpo_prev = oopaque->btpo_prev;
/* handle btpo_next after rightpage buffer acquired */
lopaque->btpo_level = oopaque->btpo_level;
/* handle btpo_cycleid after rightpage buffer acquired */
/*
* Copy the original page's LSN into leftpage, which will become the
* updated version of the page. We need this because XLogInsert will
* examine the LSN and possibly dump it in a page image.
*/
PageSetLSN(leftpage, PageGetLSN(origpage));
/*
* Determine page offset number of existing overlapped-with-orignewitem
* posting list when it is necessary to perform a posting list split in
* passing. Note that newitem was already changed by caller (newitem no
* longer has the orignewitem TID).
*
* This page offset number (origpagepostingoff) will be used to pretend
* that the posting split has already taken place, even though the
* required modifications to origpage won't occur until we reach the
* critical section. The lastleft and firstright tuples of our page split
* point should, in effect, come from an imaginary version of origpage
* that has the nposting tuple instead of the original posting list tuple.
*
* Note: _bt_findsplitloc() should have compensated for coinciding posting
* list splits in just the same way, at least in theory. It doesn't
* bother with that, though. In practice it won't affect its choice of
* split point.
*/
origpagepostingoff = InvalidOffsetNumber;
if (postingoff != 0)
{
Assert(isleaf);
Assert(ItemPointerCompare(&orignewitem->t_tid,
&newitem->t_tid) < 0);
Assert(BTreeTupleIsPosting(nposting));
origpagepostingoff = OffsetNumberPrev(newitemoff);
}
/*
* The high key for the new left page is a possibly-truncated copy of
* firstright on the leaf level (it's "firstright itself" on internal
* pages; see !isleaf comments below). This may seem to be contrary to
* Lehman & Yao's approach of using a copy of lastleft as the new high key
* when splitting on the leaf level. It isn't, though.
*
* Suffix truncation will leave the left page's high key fully equal to
* lastleft when lastleft and firstright are equal prior to heap TID (that
* is, the tiebreaker TID value comes from lastleft). It isn't actually
* necessary for a new leaf high key to be a copy of lastleft for the L&Y
* "subtree" invariant to hold. It's sufficient to make sure that the new
* leaf high key is strictly less than firstright, and greater than or
* equal to (not necessarily equal to) lastleft. In other words, when
* suffix truncation isn't possible during a leaf page split, we take
* L&Y's exact approach to generating a new high key for the left page.
* (Actually, that is slightly inaccurate. We don't just use a copy of
* lastleft. A tuple with all the keys from firstright but the max heap
* TID from lastleft is used, to avoid introducing a special case.)
*/
if (!newitemonleft && newitemoff == firstrightoff)
{
/* incoming tuple becomes firstright */
itemsz = newitemsz;
firstright = newitem;
}
else
{
/* existing item at firstrightoff becomes firstright */
itemid = PageGetItemId(origpage, firstrightoff);
itemsz = ItemIdGetLength(itemid);
firstright = (IndexTuple) PageGetItem(origpage, itemid);
if (firstrightoff == origpagepostingoff)
firstright = nposting;
}
if (isleaf)
{
IndexTuple lastleft;
/* Attempt suffix truncation for leaf page splits */
if (newitemonleft && newitemoff == firstrightoff)
{
/* incoming tuple becomes lastleft */
lastleft = newitem;
}
else
{
OffsetNumber lastleftoff;
/* existing item before firstrightoff becomes lastleft */
lastleftoff = OffsetNumberPrev(firstrightoff);
Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
itemid = PageGetItemId(origpage, lastleftoff);
lastleft = (IndexTuple) PageGetItem(origpage, itemid);
if (lastleftoff == origpagepostingoff)
lastleft = nposting;
}
lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
itemsz = IndexTupleSize(lefthighkey);
}
else
{
/*
* Don't perform suffix truncation on a copy of firstright to make
* left page high key for internal page splits. Must use firstright
* as new high key directly.
*
* Each distinct separator key value originates as a leaf level high
* key; all other separator keys/pivot tuples are copied from one
* level down. A separator key in a grandparent page must be
* identical to high key in rightmost parent page of the subtree to
* its left, which must itself be identical to high key in rightmost
* child page of that same subtree (this even applies to separator
* from grandparent's high key). There must always be an unbroken
* "seam" of identical separator keys that guide index scans at every
* level, starting from the grandparent. That's why suffix truncation
* is unsafe here.
*
* Internal page splits will truncate firstright into a "negative
* infinity" data item when it gets inserted on the new right page
* below, though. This happens during the call to _bt_pgaddtup() for
* the new first data item for right page. Do not confuse this
* mechanism with suffix truncation. It is just a convenient way of
* implementing page splits that split the internal page "inside"
* firstright. The lefthighkey separator key cannot appear a second
* time in the right page (only firstright's downlink goes in right
* page).
*/
lefthighkey = firstright;
}
/*
* Add new high key to leftpage
*/
afterleftoff = P_HIKEY;
Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
IndexRelationGetNumberOfKeyAttributes(rel));
Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false,
false) == InvalidOffsetNumber)
elog(ERROR, "failed to add high key to the left sibling"
" while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel));
afterleftoff = OffsetNumberNext(afterleftoff);
/*
* Acquire a new right page to split into, now that left page has a new
* high key. From here on, it's not okay to throw an error without
* zeroing rightpage first. This coding rule ensures that we won't
* confuse future VACUUM operations, which might otherwise try to re-find
* a downlink to a leftover junk page as the page undergoes deletion.
*
* It would be reasonable to start the critical section just after the new
* rightpage buffer is acquired instead; that would allow us to avoid
* leftover junk pages without bothering to zero rightpage. We do it this
* way because it avoids an unnecessary PANIC when either origpage or its
* existing sibling page are corrupt.
*/
rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
rightpage = BufferGetPage(rbuf);
rightpagenumber = BufferGetBlockNumber(rbuf);
/* rightpage was initialized by _bt_getbuf */
ropaque = BTPageGetOpaque(rightpage);
/*
* Finish off remaining leftpage special area fields. They cannot be set
* before both origpage (leftpage) and rightpage buffers are acquired and
* locked.
*
* btpo_cycleid is only used with leaf pages, though we set it here in all
* cases just to be consistent.
*/
lopaque->btpo_next = rightpagenumber;
lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
/*
* rightpage won't be the root when we're done. Also, clear the SPLIT_END
* and HAS_GARBAGE flags.
*/
ropaque->btpo_flags = oopaque->btpo_flags;
ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
ropaque->btpo_prev = origpagenumber;
ropaque->btpo_next = oopaque->btpo_next;
ropaque->btpo_level = oopaque->btpo_level;
ropaque->btpo_cycleid = lopaque->btpo_cycleid;
/*
* Add new high key to rightpage where necessary.
*
* If the page we're splitting is not the rightmost page at its level in
* the tree, then the first entry on the page is the high key from
* origpage.
*/
afterrightoff = P_HIKEY;
if (!isrightmost)
{
IndexTuple righthighkey;
itemid = PageGetItemId(origpage, P_HIKEY);
itemsz = ItemIdGetLength(itemid);
righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
IndexRelationGetNumberOfKeyAttributes(rel));
if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
false, false) == InvalidOffsetNumber)
{
memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add high key to the right sibling"
" while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel));
}
afterrightoff = OffsetNumberNext(afterrightoff);
}
/*
* Internal page splits truncate first data item on right page -- it
* becomes "minus infinity" item for the page. Set this up here.
*/
minusinfoff = InvalidOffsetNumber;
if (!isleaf)
minusinfoff = afterrightoff;
/*
* Now transfer all the data items (non-pivot tuples in isleaf case, or
* additional pivot tuples in !isleaf case) to the appropriate page.
*
* Note: we *must* insert at least the right page's items in item-number
* order, for the benefit of _bt_restore_page().
*/
for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
{
IndexTuple dataitem;
itemid = PageGetItemId(origpage, i);
itemsz = ItemIdGetLength(itemid);
dataitem = (IndexTuple) PageGetItem(origpage, itemid);
/* replace original item with nposting due to posting split? */
if (i == origpagepostingoff)
{
Assert(BTreeTupleIsPosting(dataitem));
Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
dataitem = nposting;
}
/* does new item belong before this one? */
else if (i == newitemoff)
{
if (newitemonleft)
{
Assert(newitemoff <= firstrightoff);
if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
false))
{
memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add new item to the left sibling"
" while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel));
}
afterleftoff = OffsetNumberNext(afterleftoff);
}
else
{
Assert(newitemoff >= firstrightoff);
if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
afterrightoff == minusinfoff))
{
memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add new item to the right sibling"
" while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel));
}
afterrightoff = OffsetNumberNext(afterrightoff);
}
}
/* decide which page to put it on */
if (i < firstrightoff)
{
if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
{
memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add old item to the left sibling"
" while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel));
}
afterleftoff = OffsetNumberNext(afterleftoff);
}
else
{
if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
afterrightoff == minusinfoff))
{
memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add old item to the right sibling"
" while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel));
}
afterrightoff = OffsetNumberNext(afterrightoff);
}
}
/* Handle case where newitem goes at the end of rightpage */
if (i <= newitemoff)
{
/*
* Can't have newitemonleft here; that would imply we were told to put
* *everything* on the left page, which cannot fit (if it could, we'd
* not be splitting the page).
*/
Assert(!newitemonleft && newitemoff == maxoff + 1);
if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
afterrightoff == minusinfoff))
{
memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add new item to the right sibling"
" while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel));
}
afterrightoff = OffsetNumberNext(afterrightoff);
}
/*
* We have to grab the original right sibling (if any) and update its prev
* link. We are guaranteed that this is deadlock-free, since we couple
* the locks in the standard order: left to right.
*/
if (!isrightmost)
{
sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
spage = BufferGetPage(sbuf);
sopaque = BTPageGetOpaque(spage);
if (sopaque->btpo_prev != origpagenumber)
{
memset(rightpage, 0, BufferGetPageSize(rbuf));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("right sibling's left-link doesn't match: "
"block %u links to %u instead of expected %u in index \"%s\"",
oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
RelationGetRelationName(rel))));
}
/*
* Check to see if we can set the SPLIT_END flag in the right-hand
* split page; this can save some I/O for vacuum since it need not
* proceed to the right sibling. We can set the flag if the right
* sibling has a different cycleid: that means it could not be part of
* a group of pages that were all split off from the same ancestor
* page. If you're confused, imagine that page A splits to A B and
* then again, yielding A C B, while vacuum is in progress. Tuples
* originally in A could now be in either B or C, hence vacuum must
* examine both pages. But if D, our right sibling, has a different
* cycleid then it could not contain any tuples that were in A when
* the vacuum started.
*/
if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
ropaque->btpo_flags |= BTP_SPLIT_END;
}
/*
* Right sibling is locked, new siblings are prepared, but original page
* is not updated yet.
*
* NO EREPORT(ERROR) till right sibling is updated. We can get away with
* not starting the critical section till here because we haven't been
* scribbling on the original page yet; see comments above.
*/
START_CRIT_SECTION();
/*
* By here, the original data page has been split into two new halves, and
* these are correct. The algorithm requires that the left page never
* move during a split, so we copy the new left page back on top of the
* original. We need to do this before writing the WAL record, so that
* XLogInsert can WAL log an image of the page if necessary.
*/
PageRestoreTempPage(leftpage, origpage);
/* leftpage, lopaque must not be used below here */
MarkBufferDirty(buf);
MarkBufferDirty(rbuf);
if (!isrightmost)
{
sopaque->btpo_prev = rightpagenumber;
MarkBufferDirty(sbuf);
}
/*
* Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
* a split
*/
if (!isleaf)
{
Page cpage = BufferGetPage(cbuf);
BTPageOpaque cpageop = BTPageGetOpaque(cpage);
cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
MarkBufferDirty(cbuf);
}
/* XLOG stuff */
if (RelationNeedsWAL(rel))
{
xl_btree_split xlrec;
uint8 xlinfo;
XLogRecPtr recptr;
xlrec.level = ropaque->btpo_level;
/* See comments below on newitem, orignewitem, and posting lists */
xlrec.firstrightoff = firstrightoff;
xlrec.newitemoff = newitemoff;
xlrec.postingoff = 0;
if (postingoff != 0 && origpagepostingoff < firstrightoff)
xlrec.postingoff = postingoff;
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
/* Log original right sibling, since we've changed its prev-pointer */
if (!isrightmost)
XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
if (!isleaf)
XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
/*
* Log the new item, if it was inserted on the left page. (If it was
* put on the right page, we don't need to explicitly WAL log it
* because it's included with all the other items on the right page.)
* Show the new item as belonging to the left page buffer, so that it
* is not stored if XLogInsert decides it needs a full-page image of
* the left page. We always store newitemoff in the record, though.
*
* The details are sometimes slightly different for page splits that
* coincide with a posting list split. If both the replacement
* posting list and newitem go on the right page, then we don't need
* to log anything extra, just like the simple !newitemonleft
* no-posting-split case (postingoff is set to zero in the WAL record,
* so recovery doesn't need to process a posting list split at all).
* Otherwise, we set postingoff and log orignewitem instead of
* newitem, despite having actually inserted newitem. REDO routine
* must reconstruct nposting and newitem using _bt_swap_posting().
*
* Note: It's possible that our page split point is the point that
* makes the posting list lastleft and newitem firstright. This is
* the only case where we log orignewitem/newitem despite newitem
* going on the right page. If XLogInsert decides that it can omit
* orignewitem due to logging a full-page image of the left page,
* everything still works out, since recovery only needs to log
* orignewitem for items on the left page (just like the regular
* newitem-logged case).
*/
if (newitemonleft && xlrec.postingoff == 0)
XLogRegisterBufData(0, (char *) newitem, newitemsz);
else if (xlrec.postingoff != 0)
{
Assert(isleaf);
Assert(newitemonleft || firstrightoff == newitemoff);
Assert(newitemsz == IndexTupleSize(orignewitem));
XLogRegisterBufData(0, (char *) orignewitem, newitemsz);
}
/* Log the left page's new high key */
if (!isleaf)
{
/* lefthighkey isn't local copy, get current pointer */
itemid = PageGetItemId(origpage, P_HIKEY);
lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
}
XLogRegisterBufData(0, (char *) lefthighkey,
MAXALIGN(IndexTupleSize(lefthighkey)));
/*
* Log the contents of the right page in the format understood by
* _bt_restore_page(). The whole right page will be recreated.
*
* Direct access to page is not good but faster - we should implement
* some new func in page API. Note we only store the tuples
* themselves, knowing that they were inserted in item-number order
* and so the line pointers can be reconstructed. See comments for
* _bt_restore_page().
*/
XLogRegisterBufData(1,
(char *) rightpage + ((PageHeader) rightpage)->pd_upper,
((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
recptr = XLogInsert(RM_BTREE_ID, xlinfo);
PageSetLSN(origpage, recptr);
PageSetLSN(rightpage, recptr);
if (!isrightmost)
PageSetLSN(spage, recptr);
if (!isleaf)
PageSetLSN(BufferGetPage(cbuf), recptr);
}
END_CRIT_SECTION();
/* release the old right sibling */
if (!isrightmost)
_bt_relbuf(rel, sbuf);
/* release the child */
if (!isleaf)
_bt_relbuf(rel, cbuf);
/* be tidy */
if (isleaf)
pfree(lefthighkey);
/* split's done */
return rbuf;
}
/*
* _bt_insert_parent() -- Insert downlink into parent, completing split.
*
* On entry, buf and rbuf are the left and right split pages, which we
* still hold write locks on. Both locks will be released here. We
* release the rbuf lock once we have a write lock on the page that we
* intend to insert a downlink to rbuf on (i.e. buf's current parent page).
* The lock on buf is released at the same point as the lock on the parent
* page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
* atomic operation that completes the split by inserting a new downlink.
*
* stack - stack showing how we got here. Will be NULL when splitting true
* root, or during concurrent root split, where we can be inefficient
* isroot - we split the true root
* isonly - we split a page alone on its level (might have been fast root)
*/
static void
_bt_insert_parent(Relation rel,
Buffer buf,
Buffer rbuf,
BTStack stack,
bool isroot,
bool isonly)
{
/*
* Here we have to do something Lehman and Yao don't talk about: deal with
* a root split and construction of a new root. If our stack is empty
* then we have just split a node on what had been the root level when we
* descended the tree. If it was still the root then we perform a
* new-root construction. If it *wasn't* the root anymore, search to find
* the next higher level that someone constructed meanwhile, and find the
* right place to insert as for the normal case.
*
* If we have to search for the parent level, we do so by re-descending
* from the root. This is not super-efficient, but it's rare enough not
* to matter.
*/
if (isroot)
{
Buffer rootbuf;
Assert(stack == NULL);
Assert(isonly);
/* create a new root node and update the metapage */
rootbuf = _bt_newroot(rel, buf, rbuf);
/* release the split buffers */
_bt_relbuf(rel, rootbuf);
_bt_relbuf(rel, rbuf);
_bt_relbuf(rel, buf);
}
else
{
BlockNumber bknum = BufferGetBlockNumber(buf);
BlockNumber rbknum = BufferGetBlockNumber(rbuf);
Page page = BufferGetPage(buf);
IndexTuple new_item;
BTStackData fakestack;
IndexTuple ritem;
Buffer pbuf;
if (stack == NULL)
{
BTPageOpaque opaque;
elog(DEBUG2, "concurrent ROOT page split");
opaque = BTPageGetOpaque(page);
/*
* We should never reach here when a leaf page split takes place
* despite the insert of newitem being able to apply the fastpath
* optimization. Make sure of that with an assertion.
*
* This is more of a performance issue than a correctness issue.
* The fastpath won't have a descent stack. Using a phony stack
* here works, but never rely on that. The fastpath should be
* rejected within _bt_search_insert() when the rightmost leaf
* page will split, since it's faster to go through _bt_search()
* and get a stack in the usual way.
*/
Assert(!(P_ISLEAF(opaque) &&
BlockNumberIsValid(RelationGetTargetBlock(rel))));
/* Find the leftmost page at the next level up */
pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL);
/* Set up a phony stack entry pointing there */
stack = &fakestack;
stack->bts_blkno = BufferGetBlockNumber(pbuf);
stack->bts_offset = InvalidOffsetNumber;
stack->bts_parent = NULL;
_bt_relbuf(rel, pbuf);
}
/* get high key from left, a strict lower bound for new right page */
ritem = (IndexTuple) PageGetItem(page,
PageGetItemId(page, P_HIKEY));
/* form an index tuple that points at the new right page */
new_item = CopyIndexTuple(ritem);
BTreeTupleSetDownLink(new_item, rbknum);
/*
* Re-find and write lock the parent of buf.
*
* It's possible that the location of buf's downlink has changed since
* our initial _bt_search() descent. _bt_getstackbuf() will detect
* and recover from this, updating the stack, which ensures that the
* new downlink will be inserted at the correct offset. Even buf's
* parent may have changed.
*/
pbuf = _bt_getstackbuf(rel, stack, bknum);
/*
* Unlock the right child. The left child will be unlocked in
* _bt_insertonpg().
*
* Unlocking the right child must be delayed until here to ensure that
* no concurrent VACUUM operation can become confused. Page deletion
* cannot be allowed to fail to re-find a downlink for the rbuf page.
* (Actually, this is just a vestige of how things used to work. The
* page deletion code is expected to check for the INCOMPLETE_SPLIT
* flag on the left child. It won't attempt deletion of the right
* child until the split is complete. Despite all this, we opt to
* conservatively delay unlocking the right child until here.)
*/
_bt_relbuf(rel, rbuf);
if (pbuf == InvalidBuffer)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
RelationGetRelationName(rel), bknum, rbknum)));
/* Recursively insert into the parent */
_bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
new_item, MAXALIGN(IndexTupleSize(new_item)),
stack->bts_offset + 1, 0, isonly);
/* be tidy */
pfree(new_item);
}
}
/*
* _bt_finish_split() -- Finish an incomplete split
*
* A crash or other failure can leave a split incomplete. The insertion
* routines won't allow to insert on a page that is incompletely split.
* Before inserting on such a page, call _bt_finish_split().
*
* On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
* and unpinned.
*/
void
_bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
{
Page lpage = BufferGetPage(lbuf);
BTPageOpaque lpageop = BTPageGetOpaque(lpage);
Buffer rbuf;
Page rpage;
BTPageOpaque rpageop;
bool wasroot;
bool wasonly;
Assert(P_INCOMPLETE_SPLIT(lpageop));
/* Lock right sibling, the one missing the downlink */
rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
rpage = BufferGetPage(rbuf);
rpageop = BTPageGetOpaque(rpage);
/* Could this be a root split? */
if (!stack)
{
Buffer metabuf;
Page metapg;
BTMetaPageData *metad;
/* acquire lock on the metapage */
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
metapg = BufferGetPage(metabuf);
metad = BTPageGetMeta(metapg);
wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
_bt_relbuf(rel, metabuf);
}
else
wasroot = false;
/* Was this the only page on the level before split? */
wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
elog(DEBUG1, "finishing incomplete split of %u/%u",
BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
_bt_insert_parent(rel, lbuf, rbuf, stack, wasroot, wasonly);
}
/*
* _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
* tuple whose downlink points to child page.
*
* Caller passes child's block number, which is used to identify
* associated pivot tuple in parent page using a linear search that
* matches on pivot's downlink/block number. The expected location of
* the pivot tuple is taken from the stack one level above the child
* page. This is used as a starting point. Insertions into the
* parent level could cause the pivot tuple to move right; deletions
* could cause it to move left, but not left of the page we previously
* found it on.
*
* Caller can use its stack to relocate the pivot tuple/downlink for
* any same-level page to the right of the page found by its initial
* descent. This is necessary because of the possibility that caller
* moved right to recover from a concurrent page split. It's also
* convenient for certain callers to be able to step right when there
* wasn't a concurrent page split, while still using their original
* stack. For example, the checkingunique _bt_doinsert() case may
* have to step right when there are many physical duplicates, and its
* scantid forces an insertion to the right of the "first page the
* value could be on". (This is also relied on by all of our callers
* when dealing with !heapkeyspace indexes.)
*
* Returns write-locked parent page buffer, or InvalidBuffer if pivot
* tuple not found (should not happen). Adjusts bts_blkno &
* bts_offset if changed. Page split caller should insert its new
* pivot tuple for its new right sibling page on parent page, at the
* offset number bts_offset + 1.
*/
Buffer
_bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child)
{
BlockNumber blkno;
OffsetNumber start;
blkno = stack->bts_blkno;
start = stack->bts_offset;
for (;;)
{
Buffer buf;
Page page;
BTPageOpaque opaque;
buf = _bt_getbuf(rel, blkno, BT_WRITE);
page = BufferGetPage(buf);
opaque = BTPageGetOpaque(page);
if (P_INCOMPLETE_SPLIT(opaque))
{
_bt_finish_split(rel, buf, stack->bts_parent);
continue;
}
if (!P_IGNORE(opaque))
{
OffsetNumber offnum,
minoff,
maxoff;
ItemId itemid;
IndexTuple item;
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
/*
* start = InvalidOffsetNumber means "search the whole page". We
* need this test anyway due to possibility that page has a high
* key now when it didn't before.
*/
if (start < minoff)
start = minoff;
/*
* Need this check too, to guard against possibility that page
* split since we visited it originally.
*/
if (start > maxoff)
start = OffsetNumberNext(maxoff);
/*
* These loops will check every item on the page --- but in an
* order that's attuned to the probability of where it actually
* is. Scan to the right first, then to the left.
*/
for (offnum = start;
offnum <= maxoff;
offnum = OffsetNumberNext(offnum))
{
itemid = PageGetItemId(page, offnum);
item = (IndexTuple) PageGetItem(page, itemid);
if (BTreeTupleGetDownLink(item) == child)
{
/* Return accurate pointer to where link is now */
stack->bts_blkno = blkno;
stack->bts_offset = offnum;
return buf;
}
}
for (offnum = OffsetNumberPrev(start);
offnum >= minoff;
offnum = OffsetNumberPrev(offnum))
{
itemid = PageGetItemId(page, offnum);
item = (IndexTuple) PageGetItem(page, itemid);
if (BTreeTupleGetDownLink(item) == child)
{
/* Return accurate pointer to where link is now */
stack->bts_blkno = blkno;
stack->bts_offset = offnum;
return buf;
}
}
}
/*
* The item we're looking for moved right at least one page.
*
* Lehman and Yao couple/chain locks when moving right here, which we
* can avoid. See nbtree/README.
*/
if (P_RIGHTMOST(opaque))
{
_bt_relbuf(rel, buf);
return InvalidBuffer;
}
blkno = opaque->btpo_next;
start = InvalidOffsetNumber;
_bt_relbuf(rel, buf);
}
}
/*
* _bt_newroot() -- Create a new root page for the index.
*
* We've just split the old root page and need to create a new one.
* In order to do this, we add a new root page to the file, then lock
* the metadata page and update it. This is guaranteed to be deadlock-
* free, because all readers release their locks on the metadata page
* before trying to lock the root, and all writers lock the root before
* trying to lock the metadata page. We have a write lock on the old
* root page, so we have not introduced any cycles into the waits-for
* graph.
*
* On entry, lbuf (the old root) and rbuf (its new peer) are write-
* locked. On exit, a new root page exists with entries for the
* two new children, metapage is updated and unlocked/unpinned.
* The new root buffer is returned to caller which has to unlock/unpin
* lbuf, rbuf & rootbuf.
*/
static Buffer
_bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
{
Buffer rootbuf;
Page lpage,
rootpage;
BlockNumber lbkno,
rbkno;
BlockNumber rootblknum;
BTPageOpaque rootopaque;
BTPageOpaque lopaque;
ItemId itemid;
IndexTuple item;
IndexTuple left_item;
Size left_item_sz;
IndexTuple right_item;
Size right_item_sz;
Buffer metabuf;
Page metapg;
BTMetaPageData *metad;
lbkno = BufferGetBlockNumber(lbuf);
rbkno = BufferGetBlockNumber(rbuf);
lpage = BufferGetPage(lbuf);
lopaque = BTPageGetOpaque(lpage);
/* get a new root page */
rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
rootpage = BufferGetPage(rootbuf);
rootblknum = BufferGetBlockNumber(rootbuf);
/* acquire lock on the metapage */
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
metapg = BufferGetPage(metabuf);
metad = BTPageGetMeta(metapg);
/*
* Create downlink item for left page (old root). The key value used is
* "minus infinity", a sentinel value that's reliably less than any real
* key value that could appear in the left page.
*/
left_item_sz = sizeof(IndexTupleData);
left_item = (IndexTuple) palloc(left_item_sz);
left_item->t_info = left_item_sz;
BTreeTupleSetDownLink(left_item, lbkno);
BTreeTupleSetNAtts(left_item, 0, false);
/*
* Create downlink item for right page. The key for it is obtained from
* the "high key" position in the left page.
*/
itemid = PageGetItemId(lpage, P_HIKEY);
right_item_sz = ItemIdGetLength(itemid);
item = (IndexTuple) PageGetItem(lpage, itemid);
right_item = CopyIndexTuple(item);
BTreeTupleSetDownLink(right_item, rbkno);
/* NO EREPORT(ERROR) from here till newroot op is logged */
START_CRIT_SECTION();
/* upgrade metapage if needed */
if (metad->btm_version < BTREE_NOVAC_VERSION)
_bt_upgrademetapage(metapg);
/* set btree special data */
rootopaque = BTPageGetOpaque(rootpage);
rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
rootopaque->btpo_flags = BTP_ROOT;
rootopaque->btpo_level =
(BTPageGetOpaque(lpage))->btpo_level + 1;
rootopaque->btpo_cycleid = 0;
/* update metapage data */
metad->btm_root = rootblknum;
metad->btm_level = rootopaque->btpo_level;
metad->btm_fastroot = rootblknum;
metad->btm_fastlevel = rootopaque->btpo_level;
/*
* Insert the left page pointer into the new root page. The root page is
* the rightmost page on its level so there is no "high key" in it; the
* two items will go into positions P_HIKEY and P_FIRSTKEY.
*
* Note: we *must* insert the two items in item-number order, for the
* benefit of _bt_restore_page().
*/
Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
false, false) == InvalidOffsetNumber)
elog(PANIC, "failed to add leftkey to new root page"
" while splitting block %u of index \"%s\"",
BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
/*
* insert the right page pointer into the new root page.
*/
Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
Assert(BTreeTupleGetNAtts(right_item, rel) <=
IndexRelationGetNumberOfKeyAttributes(rel));
if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
false, false) == InvalidOffsetNumber)
elog(PANIC, "failed to add rightkey to new root page"
" while splitting block %u of index \"%s\"",
BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
/* Clear the incomplete-split flag in the left child */
Assert(P_INCOMPLETE_SPLIT(lopaque));
lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
MarkBufferDirty(lbuf);
MarkBufferDirty(rootbuf);
MarkBufferDirty(metabuf);
/* XLOG stuff */
if (RelationNeedsWAL(rel))
{
xl_btree_newroot xlrec;
XLogRecPtr recptr;
xl_btree_metadata md;
xlrec.rootblk = rootblknum;
xlrec.level = metad->btm_level;
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
md.version = metad->btm_version;
md.root = rootblknum;
md.level = metad->btm_level;
md.fastroot = rootblknum;
md.fastlevel = metad->btm_level;
md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
md.allequalimage = metad->btm_allequalimage;
XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
/*
* Direct access to page is not good but faster - we should implement
* some new func in page API.
*/
XLogRegisterBufData(0,
(char *) rootpage + ((PageHeader) rootpage)->pd_upper,
((PageHeader) rootpage)->pd_special -
((PageHeader) rootpage)->pd_upper);
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
PageSetLSN(lpage, recptr);
PageSetLSN(rootpage, recptr);
PageSetLSN(metapg, recptr);
}
END_CRIT_SECTION();
/* done with metapage */
_bt_relbuf(rel, metabuf);
pfree(left_item);
pfree(right_item);
return rootbuf;
}
/*
* _bt_pgaddtup() -- add a data item to a particular page during split.
*
* The difference between this routine and a bare PageAddItem call is
* that this code can deal with the first data item on an internal btree
* page in passing. This data item (which is called "firstright" within
* _bt_split()) has a key that must be treated as minus infinity after
* the split. Therefore, we truncate away all attributes when caller
* specifies it's the first data item on page (downlink is not changed,
* though). This extra step is only needed for the right page of an
* internal page split. There is no need to do this for the first data
* item on the existing/left page, since that will already have been
* truncated during an earlier page split.
*
* See _bt_split() for a high level explanation of why we truncate here.
* Note that this routine has nothing to do with suffix truncation,
* despite using some of the same infrastructure.
*/
static inline bool
_bt_pgaddtup(Page page,
Size itemsize,
IndexTuple itup,
OffsetNumber itup_off,
bool newfirstdataitem)
{
IndexTupleData trunctuple;
if (newfirstdataitem)
{
trunctuple = *itup;
trunctuple.t_info = sizeof(IndexTupleData);
BTreeTupleSetNAtts(&trunctuple, 0, false);
itup = &trunctuple;
itemsize = sizeof(IndexTupleData);
}
if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false,
false) == InvalidOffsetNumber))
return false;
return true;
}
/*
* _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
*
* There are three operations performed here: simple index deletion, bottom-up
* index deletion, and deduplication. If all three operations fail to free
* enough space for the incoming item then caller will go on to split the
* page. We always consider simple deletion first. If that doesn't work out
* we consider alternatives. Callers that only want us to consider simple
* deletion (without any fallback) ask for that using the 'simpleonly'
* argument.
*
* We usually pick only one alternative "complex" operation when simple
* deletion alone won't prevent a page split. The 'checkingunique',
* 'uniquedup', and 'indexUnchanged' arguments are used for that.
*
* Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
* level flag was found set. The flag was useful back when there wasn't
* necessarily one single page for a duplicate tuple to go on (before heap TID
* became a part of the key space in version 4 indexes). But we don't
* actually look at the flag anymore (it's not a gating condition for our
* caller). That would cause us to miss tuples that are safe to delete,
* without getting any benefit in return. We know that the alternative is to
* split the page; scanning the line pointer array in passing won't have
* noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite
* all this because !heapkeyspace indexes must still do a "getting tired"
* linear search, and so are likely to get some benefit from using it as a
* gating condition.)
*/
static void
_bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
BTInsertState insertstate,
bool simpleonly, bool checkingunique,
bool uniquedup, bool indexUnchanged)
{
OffsetNumber deletable[MaxIndexTuplesPerPage];
int ndeletable = 0;
OffsetNumber offnum,
minoff,
maxoff;
Buffer buffer = insertstate->buf;
BTScanInsert itup_key = insertstate->itup_key;
Page page = BufferGetPage(buffer);
BTPageOpaque opaque = BTPageGetOpaque(page);
Assert(P_ISLEAF(opaque));
Assert(simpleonly || itup_key->heapkeyspace);
Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
/*
* Scan over all items to see which ones need to be deleted according to
* LP_DEAD flags. We'll usually manage to delete a few extra items that
* are not marked LP_DEAD in passing. Often the extra items that actually
* end up getting deleted are items that would have had their LP_DEAD bit
* set before long anyway (if we opted not to include them as extras).
*/
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
for (offnum = minoff;
offnum <= maxoff;
offnum = OffsetNumberNext(offnum))
{
ItemId itemId = PageGetItemId(page, offnum);
if (ItemIdIsDead(itemId))
deletable[ndeletable++] = offnum;
}
if (ndeletable > 0)
{
_bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
insertstate->itup, minoff, maxoff);
insertstate->bounds_valid = false;
/* Return when a page split has already been avoided */
if (PageGetFreeSpace(page) >= insertstate->itemsz)
return;
/* Might as well assume duplicates (if checkingunique) */
uniquedup = true;
}
/*
* We're done with simple deletion. Return early with callers that only
* call here so that simple deletion can be considered. This includes
* callers that explicitly ask for this and checkingunique callers that
* probably don't have any version churn duplicates on the page.
*
* Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
* return at this point (or when we go on the try either or both of our
* other strategies and they also fail). We do not bother expending a
* separate write to clear it, however. Caller will definitely clear it
* when it goes on to split the page (note also that the deduplication
* process will clear the flag in passing, just to keep things tidy).
*/
if (simpleonly || (checkingunique && !uniquedup))
{
Assert(!indexUnchanged);
return;
}
/* Assume bounds about to be invalidated (this is almost certain now) */
insertstate->bounds_valid = false;
/*
* Perform bottom-up index deletion pass when executor hint indicated that
* incoming item is logically unchanged, or for a unique index that is
* known to have physical duplicates for some other reason. (There is a
* large overlap between these two cases for a unique index. It's worth
* having both triggering conditions in order to apply the optimization in
* the event of successive related INSERT and DELETE statements.)
*
* We'll go on to do a deduplication pass when a bottom-up pass fails to
* delete an acceptable amount of free space (a significant fraction of
* the page, or space for the new item, whichever is greater).
*
* Note: Bottom-up index deletion uses the same equality/equivalence
* routines as deduplication internally. However, it does not merge
* together index tuples, so the same correctness considerations do not
* apply. We deliberately omit an index-is-allequalimage test here.
*/
if ((indexUnchanged || uniquedup) &&
_bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
return;
/* Perform deduplication pass (when enabled and index-is-allequalimage) */
if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
_bt_dedup_pass(rel, buffer, heapRel, insertstate->itup,
insertstate->itemsz, (indexUnchanged || uniquedup));
}
/*
* _bt_simpledel_pass - Simple index tuple deletion pass.
*
* We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers
* of all such tuples are determined by caller (caller passes these to us as
* its 'deletable' argument).
*
* We might also delete extra index tuples that turn out to be safe to delete
* in passing (though they must be cheap to check in passing to begin with).
* There is no certainty that any extra tuples will be deleted, though. The
* high level goal of the approach we take is to get the most out of each call
* here (without noticeably increasing the per-call overhead compared to what
* we need to do just to be able to delete the page's LP_DEAD-marked index
* tuples).
*
* The number of extra index tuples that turn out to be deletable might
* greatly exceed the number of LP_DEAD-marked index tuples due to various
* locality related effects. For example, it's possible that the total number
* of table blocks (pointed to by all TIDs on the leaf page) is naturally
* quite low, in which case we might end up checking if it's possible to
* delete _most_ index tuples on the page (without the tableam needing to
* access additional table blocks). The tableam will sometimes stumble upon
* _many_ extra deletable index tuples in indexes where this pattern is
* common.
*
* See nbtree/README for further details on simple index tuple deletion.
*/
static void
_bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
OffsetNumber minoff, OffsetNumber maxoff)
{
Page page = BufferGetPage(buffer);
BlockNumber *deadblocks;
int ndeadblocks;
TM_IndexDeleteOp delstate;
OffsetNumber offnum;
/* Get array of table blocks pointed to by LP_DEAD-set tuples */
deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
&ndeadblocks);
/* Initialize tableam state that describes index deletion operation */
delstate.irel = rel;
delstate.iblknum = BufferGetBlockNumber(buffer);
delstate.bottomup = false;
delstate.bottomupfreespace = 0;
delstate.ndeltids = 0;
delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
for (offnum = minoff;
offnum <= maxoff;
offnum = OffsetNumberNext(offnum))
{
ItemId itemid = PageGetItemId(page, offnum);
IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
BlockNumber tidblock;
void *match;
if (!BTreeTupleIsPosting(itup))
{
tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
match = bsearch(&tidblock, deadblocks, ndeadblocks,
sizeof(BlockNumber), _bt_blk_cmp);
if (!match)
{
Assert(!ItemIdIsDead(itemid));
continue;
}
/*
* TID's table block is among those pointed to by the TIDs from
* LP_DEAD-bit set tuples on page -- add TID to deltids
*/
odeltid->tid = itup->t_tid;
odeltid->id = delstate.ndeltids;
ostatus->idxoffnum = offnum;
ostatus->knowndeletable = ItemIdIsDead(itemid);
ostatus->promising = false; /* unused */
ostatus->freespace = 0; /* unused */
delstate.ndeltids++;
}
else
{
int nitem = BTreeTupleGetNPosting(itup);
for (int p = 0; p < nitem; p++)
{
ItemPointer tid = BTreeTupleGetPostingN(itup, p);
tidblock = ItemPointerGetBlockNumber(tid);
match = bsearch(&tidblock, deadblocks, ndeadblocks,
sizeof(BlockNumber), _bt_blk_cmp);
if (!match)
{
Assert(!ItemIdIsDead(itemid));
continue;
}
/*
* TID's table block is among those pointed to by the TIDs
* from LP_DEAD-bit set tuples on page -- add TID to deltids
*/
odeltid->tid = *tid;
odeltid->id = delstate.ndeltids;
ostatus->idxoffnum = offnum;
ostatus->knowndeletable = ItemIdIsDead(itemid);
ostatus->promising = false; /* unused */
ostatus->freespace = 0; /* unused */
odeltid++;
ostatus++;
delstate.ndeltids++;
}
}
}
pfree(deadblocks);
Assert(delstate.ndeltids >= ndeletable);
/* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
_bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
pfree(delstate.deltids);
pfree(delstate.status);
}
/*
* _bt_deadblocks() -- Get LP_DEAD related table blocks.
*
* Builds sorted and unique-ified array of table block numbers from index
* tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table
* block from incoming newitem just in case it isn't among the LP_DEAD-related
* table blocks.
*
* Always counting the newitem's table block as an LP_DEAD related block makes
* sense because the cost is consistently low; it is practically certain that
* the table block will not incur a buffer miss in tableam. On the other hand
* the benefit is often quite high. There is a decent chance that there will
* be some deletable items from this block, since in general most garbage
* tuples became garbage in the recent past (in many cases this won't be the
* first logical row that core code added to/modified in table block
* recently).
*
* Returns final array, and sets *nblocks to its final size for caller.
*/
static BlockNumber *
_bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
IndexTuple newitem, int *nblocks)
{
int spacentids,
ntids;
BlockNumber *tidblocks;
/*
* Accumulate each TID's block in array whose initial size has space for
* one table block per LP_DEAD-set tuple (plus space for the newitem table
* block). Array will only need to grow when there are LP_DEAD-marked
* posting list tuples (which is not that common).
*/
spacentids = ndeletable + 1;
ntids = 0;
tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids);
/*
* First add the table block for the incoming newitem. This is the one
* case where simple deletion can visit a table block that doesn't have
* any known deletable items.
*/
Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
for (int i = 0; i < ndeletable; i++)
{
ItemId itemid = PageGetItemId(page, deletable[i]);
IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
Assert(ItemIdIsDead(itemid));
if (!BTreeTupleIsPosting(itup))
{
if (ntids + 1 > spacentids)
{
spacentids *= 2;
tidblocks = (BlockNumber *)
repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
}
tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
}
else
{
int nposting = BTreeTupleGetNPosting(itup);
if (ntids + nposting > spacentids)
{
spacentids = Max(spacentids * 2, ntids + nposting);
tidblocks = (BlockNumber *)
repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
}
for (int j = 0; j < nposting; j++)
{
ItemPointer tid = BTreeTupleGetPostingN(itup, j);
tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
}
}
}
qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
*nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
return tidblocks;
}
/*
* _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
*/
static inline int
_bt_blk_cmp(const void *arg1, const void *arg2)
{
BlockNumber b1 = *((BlockNumber *) arg1);
BlockNumber b2 = *((BlockNumber *) arg2);
if (b1 < b2)
return -1;
else if (b1 > b2)
return 1;
return 0;
}
|