diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 00:55:53 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 00:55:53 +0000 |
commit | 3d0386f27ca66379acf50199e1d1298386eeeeb8 (patch) | |
tree | f87bd4a126b3a843858eb447e8fd5893c3ee3882 /contrib | |
parent | Initial commit. (diff) | |
download | knot-resolver-3d0386f27ca66379acf50199e1d1298386eeeeb8.tar.xz knot-resolver-3d0386f27ca66379acf50199e1d1298386eeeeb8.zip |
Adding upstream version 3.2.1.upstream/3.2.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
52 files changed, 19486 insertions, 0 deletions
diff --git a/contrib/base32hex.c b/contrib/base32hex.c new file mode 100644 index 0000000..1684ac9 --- /dev/null +++ b/contrib/base32hex.c @@ -0,0 +1,289 @@ +/* Copyright (C) 2011-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +#include "contrib/base32hex.h" + +#include <stdlib.h> +#include <stdint.h> + +/*! \brief Maximal length of binary input to Base32hex encoding. */ +#define MAX_BIN_DATA_LEN ((INT32_MAX / 8) * 5) + +/*! \brief Base32hex padding character. */ +static const uint8_t base32hex_pad = '='; +/*! \brief Base32hex alphabet. Beware: original code was upper-case. */ +static const uint8_t base32hex_enc[] = "0123456789abcdefghijklmnopqrstuv"; + +/*! \brief Indicates bad Base32hex character. */ +#define KO 255 +/*! \brief Indicates Base32hex padding character. */ +#define PD 32 + +/*! \brief Transformation and validation table for decoding Base32hex. */ +static const uint8_t base32hex_dec[256] = { + [ 0] = KO, [ 43] = KO, ['V'] = 31, [129] = KO, [172] = KO, [215] = KO, + [ 1] = KO, [ 44] = KO, ['W'] = KO, [130] = KO, [173] = KO, [216] = KO, + [ 2] = KO, [ 45] = KO, ['X'] = KO, [131] = KO, [174] = KO, [217] = KO, + [ 3] = KO, [ 46] = KO, ['Y'] = KO, [132] = KO, [175] = KO, [218] = KO, + [ 4] = KO, [ 47] = KO, ['Z'] = KO, [133] = KO, [176] = KO, [219] = KO, + [ 5] = KO, ['0'] = 0, [ 91] = KO, [134] = KO, [177] = KO, [220] = KO, + [ 6] = KO, ['1'] = 1, [ 92] = KO, [135] = KO, [178] = KO, [221] = KO, + [ 7] = KO, ['2'] = 2, [ 93] = KO, [136] = KO, [179] = KO, [222] = KO, + [ 8] = KO, ['3'] = 3, [ 94] = KO, [137] = KO, [180] = KO, [223] = KO, + [ 9] = KO, ['4'] = 4, [ 95] = KO, [138] = KO, [181] = KO, [224] = KO, + [ 10] = KO, ['5'] = 5, [ 96] = KO, [139] = KO, [182] = KO, [225] = KO, + [ 11] = KO, ['6'] = 6, ['a'] = 10, [140] = KO, [183] = KO, [226] = KO, + [ 12] = KO, ['7'] = 7, ['b'] = 11, [141] = KO, [184] = KO, [227] = KO, + [ 13] = KO, ['8'] = 8, ['c'] = 12, [142] = KO, [185] = KO, [228] = KO, + [ 14] = KO, ['9'] = 9, ['d'] = 13, [143] = KO, [186] = KO, [229] = KO, + [ 15] = KO, [ 58] = KO, ['e'] = 14, [144] = KO, [187] = KO, [230] = KO, + [ 16] = KO, [ 59] = KO, ['f'] = 15, [145] = KO, [188] = KO, [231] = KO, + [ 17] = KO, [ 60] = KO, ['g'] = 16, [146] = KO, [189] = KO, [232] = KO, + [ 18] = KO, ['='] = PD, ['h'] = 17, [147] = KO, [190] = KO, [233] = KO, + [ 19] = KO, [ 62] = KO, ['i'] = 18, [148] = KO, [191] = KO, [234] = KO, + [ 20] = KO, [ 63] = KO, ['j'] = 19, [149] = KO, [192] = KO, [235] = KO, + [ 21] = KO, [ 64] = KO, ['k'] = 20, [150] = KO, [193] = KO, [236] = KO, + [ 22] = KO, ['A'] = 10, ['l'] = 21, [151] = KO, [194] = KO, [237] = KO, + [ 23] = KO, ['B'] = 11, ['m'] = 22, [152] = KO, [195] = KO, [238] = KO, + [ 24] = KO, ['C'] = 12, ['n'] = 23, [153] = KO, [196] = KO, [239] = KO, + [ 25] = KO, ['D'] = 13, ['o'] = 24, [154] = KO, [197] = KO, [240] = KO, + [ 26] = KO, ['E'] = 14, ['p'] = 25, [155] = KO, [198] = KO, [241] = KO, + [ 27] = KO, ['F'] = 15, ['q'] = 26, [156] = KO, [199] = KO, [242] = KO, + [ 28] = KO, ['G'] = 16, ['r'] = 27, [157] = KO, [200] = KO, [243] = KO, + [ 29] = KO, ['H'] = 17, ['s'] = 28, [158] = KO, [201] = KO, [244] = KO, + [ 30] = KO, ['I'] = 18, ['t'] = 29, [159] = KO, [202] = KO, [245] = KO, + [ 31] = KO, ['J'] = 19, ['u'] = 30, [160] = KO, [203] = KO, [246] = KO, + [ 32] = KO, ['K'] = 20, ['v'] = 31, [161] = KO, [204] = KO, [247] = KO, + [ 33] = KO, ['L'] = 21, ['w'] = KO, [162] = KO, [205] = KO, [248] = KO, + [ 34] = KO, ['M'] = 22, ['x'] = KO, [163] = KO, [206] = KO, [249] = KO, + [ 35] = KO, ['N'] = 23, ['y'] = KO, [164] = KO, [207] = KO, [250] = KO, + [ 36] = KO, ['O'] = 24, ['z'] = KO, [165] = KO, [208] = KO, [251] = KO, + [ 37] = KO, ['P'] = 25, [123] = KO, [166] = KO, [209] = KO, [252] = KO, + [ 38] = KO, ['Q'] = 26, [124] = KO, [167] = KO, [210] = KO, [253] = KO, + [ 39] = KO, ['R'] = 27, [125] = KO, [168] = KO, [211] = KO, [254] = KO, + [ 40] = KO, ['S'] = 28, [126] = KO, [169] = KO, [212] = KO, [255] = KO, + [ 41] = KO, ['T'] = 29, [127] = KO, [170] = KO, [213] = KO, + [ 42] = KO, ['U'] = 30, [128] = KO, [171] = KO, [214] = KO, +}; + +int32_t base32hex_decode(const uint8_t *in, + const uint32_t in_len, + uint8_t *out, + const uint32_t out_len) +{ + // Checking inputs. + if (in == NULL || out == NULL) { + return -1; + } + if (in_len > INT32_MAX || out_len < ((in_len + 7) / 8) * 5) { + return -1; + } + if ((in_len % 8) != 0) { + return -1; + } + + const uint8_t *stop = in + in_len; + uint8_t *bin = out; + uint8_t pad_len = 0; + uint8_t c1, c2, c3, c4, c5, c6, c7, c8; + + // Decoding loop takes 8 characters and creates 5 bytes. + while (in < stop) { + // Filling and transforming 8 Base32hex chars. + c1 = base32hex_dec[in[0]]; + c2 = base32hex_dec[in[1]]; + c3 = base32hex_dec[in[2]]; + c4 = base32hex_dec[in[3]]; + c5 = base32hex_dec[in[4]]; + c6 = base32hex_dec[in[5]]; + c7 = base32hex_dec[in[6]]; + c8 = base32hex_dec[in[7]]; + + // Check 8. char if is bad or padding. + if (c8 >= PD) { + if (c8 == PD && pad_len == 0) { + pad_len = 1; + } else { + return -1; + } + } + + // Check 7. char if is bad or padding (if so, 6. must be too). + if (c7 >= PD) { + if (c7 == PD && c6 == PD && pad_len == 1) { + pad_len = 3; + } else { + return -1; + } + } + + // Check 6. char if is bad or padding. + if (c6 >= PD) { + if (!(c6 == PD && pad_len == 3)) { + return -1; + } + } + + // Check 5. char if is bad or padding. + if (c5 >= PD) { + if (c5 == PD && pad_len == 3) { + pad_len = 4; + } else { + return -1; + } + } + + // Check 4. char if is bad or padding (if so, 3. must be too). + if (c4 >= PD) { + if (c4 == PD && c3 == PD && pad_len == 4) { + pad_len = 6; + } else { + return -1; + } + } + + // Check 3. char if is bad or padding. + if (c3 >= PD) { + if (!(c3 == PD && pad_len == 6)) { + return -1; + } + } + + // 1. and 2. chars must not be padding. + if (c2 >= PD || c1 >= PD) { + return -1; + } + + // Computing of output data based on padding length. + switch (pad_len) { + case 0: + bin[4] = (c7 << 5) + c8; + case 1: + bin[3] = (c5 << 7) + (c6 << 2) + (c7 >> 3); + case 3: + bin[2] = (c4 << 4) + (c5 >> 1); + case 4: + bin[1] = (c2 << 6) + (c3 << 1) + (c4 >> 4); + case 6: + bin[0] = (c1 << 3) + (c2 >> 2); + } + + // Update output end. + switch (pad_len) { + case 0: + bin += 5; + break; + case 1: + bin += 4; + break; + case 3: + bin += 3; + break; + case 4: + bin += 2; + break; + case 6: + bin += 1; + break; + } + + in += 8; + } + + return (bin - out); +} + +int32_t base32hex_encode(const uint8_t *in, + const uint32_t in_len, + uint8_t *out, + const uint32_t out_len) +{ + // Checking inputs. + if (in == NULL || out == NULL) { + return -1; + } + if (in_len > MAX_BIN_DATA_LEN || out_len < ((in_len + 4) / 5) * 8) { + return -1; + } + + uint8_t rest_len = in_len % 5; + const uint8_t *stop = in + in_len - rest_len; + uint8_t *text = out; + + // Encoding loop takes 5 bytes and creates 8 characters. + while (in < stop) { + text[0] = base32hex_enc[in[0] >> 3]; + text[1] = base32hex_enc[(in[0] & 0x07) << 2 | in[1] >> 6]; + text[2] = base32hex_enc[(in[1] & 0x3E) >> 1]; + text[3] = base32hex_enc[(in[1] & 0x01) << 4 | in[2] >> 4]; + text[4] = base32hex_enc[(in[2] & 0x0F) << 1 | in[3] >> 7]; + text[5] = base32hex_enc[(in[3] & 0x7C) >> 2]; + text[6] = base32hex_enc[(in[3] & 0x03) << 3 | in[4] >> 5]; + text[7] = base32hex_enc[in[4] & 0x1F]; + text += 8; + in += 5; + } + + // Processing of padding, if any. + switch (rest_len) { + case 4: + text[0] = base32hex_enc[in[0] >> 3]; + text[1] = base32hex_enc[(in[0] & 0x07) << 2 | in[1] >> 6]; + text[2] = base32hex_enc[(in[1] & 0x3E) >> 1]; + text[3] = base32hex_enc[(in[1] & 0x01) << 4 | in[2] >> 4]; + text[4] = base32hex_enc[(in[2] & 0x0F) << 1 | in[3] >> 7]; + text[5] = base32hex_enc[(in[3] & 0x7C) >> 2]; + text[6] = base32hex_enc[(in[3] & 0x03) << 3]; + text[7] = base32hex_pad; + text += 8; + break; + case 3: + text[0] = base32hex_enc[in[0] >> 3]; + text[1] = base32hex_enc[(in[0] & 0x07) << 2 | in[1] >> 6]; + text[2] = base32hex_enc[(in[1] & 0x3E) >> 1]; + text[3] = base32hex_enc[(in[1] & 0x01) << 4 | in[2] >> 4]; + text[4] = base32hex_enc[(in[2] & 0x0F) << 1]; + text[5] = base32hex_pad; + text[6] = base32hex_pad; + text[7] = base32hex_pad; + text += 8; + break; + case 2: + text[0] = base32hex_enc[in[0] >> 3]; + text[1] = base32hex_enc[(in[0] & 0x07) << 2 | in[1] >> 6]; + text[2] = base32hex_enc[(in[1] & 0x3E) >> 1]; + text[3] = base32hex_enc[(in[1] & 0x01) << 4]; + text[4] = base32hex_pad; + text[5] = base32hex_pad; + text[6] = base32hex_pad; + text[7] = base32hex_pad; + text += 8; + break; + case 1: + text[0] = base32hex_enc[in[0] >> 3]; + text[1] = base32hex_enc[(in[0] & 0x07) << 2]; + text[2] = base32hex_pad; + text[3] = base32hex_pad; + text[4] = base32hex_pad; + text[5] = base32hex_pad; + text[6] = base32hex_pad; + text[7] = base32hex_pad; + text += 8; + break; + } + + return (text - out); +} diff --git a/contrib/base32hex.h b/contrib/base32hex.h new file mode 100644 index 0000000..561d9ff --- /dev/null +++ b/contrib/base32hex.h @@ -0,0 +1,72 @@ +/* Copyright (C) 2011-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ +/*! + * \file + * + * \brief Base32hex implementation (RFC 4648). + * + * \note Input Base32hex string can contain a-v characters. These characters + * are considered as A-V equivalent. + * + * \addtogroup contrib + * @{ + */ + +#pragma once + +#include <stdint.h> + +/*! + * \brief Decodes text data using Base32hex. + * + * \note Input data needn't be terminated with '\0'. + * + * \note Input data must be continuous Base32hex string! + * + * \param in Input text data. + * \param in_len Length of input string. + * \param out Output data buffer. + * \param out_len Size of output buffer. + * + * \retval >=0 length of output data. + * \retval KNOT_E* if error. + */ +int32_t base32hex_decode(const uint8_t *in, + const uint32_t in_len, + uint8_t *out, + const uint32_t out_len); + + +/*! + * \brief Encodes binary data using Base32hex. Lower case is used! + * + * \note Output data buffer contains Base32hex text string which isn't + * terminated with '\0'! + * + * \param in Input binary data. + * \param in_len Length of input data. + * \param out Output data buffer. + * \param out_len Size of output buffer. + * + * \retval >=0 length of output string. + * \retval <0 if error. + */ +int32_t base32hex_encode(const uint8_t *in, + const uint32_t in_len, + uint8_t *out, + const uint32_t out_len); + +/*! @} */ diff --git a/contrib/base64.c b/contrib/base64.c new file mode 100644 index 0000000..e577901 --- /dev/null +++ b/contrib/base64.c @@ -0,0 +1,268 @@ +/* Copyright (C) 2011-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "contrib/base64.h" +#include "libknot/errcode.h" + +#include <stdlib.h> +#include <stdint.h> + +/*! \brief Maximal length of binary input to Base64 encoding. */ +#define MAX_BIN_DATA_LEN ((INT32_MAX / 4) * 3) + +/*! \brief Base64 padding character. */ +static const uint8_t base64_pad = '='; +/*! \brief Base64 alphabet. */ +static const uint8_t base64_enc[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + +/*! \brief Indicates bad Base64 character. */ +#define KO 255 +/*! \brief Indicates Base64 padding character. */ +#define PD 64 + +/*! \brief Transformation and validation table for decoding Base64. */ +static const uint8_t base64_dec[256] = { + [ 0] = KO, ['+'] = 62, ['V'] = 21, [129] = KO, [172] = KO, [215] = KO, + [ 1] = KO, [ 44] = KO, ['W'] = 22, [130] = KO, [173] = KO, [216] = KO, + [ 2] = KO, [ 45] = KO, ['X'] = 23, [131] = KO, [174] = KO, [217] = KO, + [ 3] = KO, [ 46] = KO, ['Y'] = 24, [132] = KO, [175] = KO, [218] = KO, + [ 4] = KO, ['/'] = 63, ['Z'] = 25, [133] = KO, [176] = KO, [219] = KO, + [ 5] = KO, ['0'] = 52, [ 91] = KO, [134] = KO, [177] = KO, [220] = KO, + [ 6] = KO, ['1'] = 53, [ 92] = KO, [135] = KO, [178] = KO, [221] = KO, + [ 7] = KO, ['2'] = 54, [ 93] = KO, [136] = KO, [179] = KO, [222] = KO, + [ 8] = KO, ['3'] = 55, [ 94] = KO, [137] = KO, [180] = KO, [223] = KO, + [ 9] = KO, ['4'] = 56, [ 95] = KO, [138] = KO, [181] = KO, [224] = KO, + [ 10] = KO, ['5'] = 57, [ 96] = KO, [139] = KO, [182] = KO, [225] = KO, + [ 11] = KO, ['6'] = 58, ['a'] = 26, [140] = KO, [183] = KO, [226] = KO, + [ 12] = KO, ['7'] = 59, ['b'] = 27, [141] = KO, [184] = KO, [227] = KO, + [ 13] = KO, ['8'] = 60, ['c'] = 28, [142] = KO, [185] = KO, [228] = KO, + [ 14] = KO, ['9'] = 61, ['d'] = 29, [143] = KO, [186] = KO, [229] = KO, + [ 15] = KO, [ 58] = KO, ['e'] = 30, [144] = KO, [187] = KO, [230] = KO, + [ 16] = KO, [ 59] = KO, ['f'] = 31, [145] = KO, [188] = KO, [231] = KO, + [ 17] = KO, [ 60] = KO, ['g'] = 32, [146] = KO, [189] = KO, [232] = KO, + [ 18] = KO, ['='] = PD, ['h'] = 33, [147] = KO, [190] = KO, [233] = KO, + [ 19] = KO, [ 62] = KO, ['i'] = 34, [148] = KO, [191] = KO, [234] = KO, + [ 20] = KO, [ 63] = KO, ['j'] = 35, [149] = KO, [192] = KO, [235] = KO, + [ 21] = KO, [ 64] = KO, ['k'] = 36, [150] = KO, [193] = KO, [236] = KO, + [ 22] = KO, ['A'] = 0, ['l'] = 37, [151] = KO, [194] = KO, [237] = KO, + [ 23] = KO, ['B'] = 1, ['m'] = 38, [152] = KO, [195] = KO, [238] = KO, + [ 24] = KO, ['C'] = 2, ['n'] = 39, [153] = KO, [196] = KO, [239] = KO, + [ 25] = KO, ['D'] = 3, ['o'] = 40, [154] = KO, [197] = KO, [240] = KO, + [ 26] = KO, ['E'] = 4, ['p'] = 41, [155] = KO, [198] = KO, [241] = KO, + [ 27] = KO, ['F'] = 5, ['q'] = 42, [156] = KO, [199] = KO, [242] = KO, + [ 28] = KO, ['G'] = 6, ['r'] = 43, [157] = KO, [200] = KO, [243] = KO, + [ 29] = KO, ['H'] = 7, ['s'] = 44, [158] = KO, [201] = KO, [244] = KO, + [ 30] = KO, ['I'] = 8, ['t'] = 45, [159] = KO, [202] = KO, [245] = KO, + [ 31] = KO, ['J'] = 9, ['u'] = 46, [160] = KO, [203] = KO, [246] = KO, + [ 32] = KO, ['K'] = 10, ['v'] = 47, [161] = KO, [204] = KO, [247] = KO, + [ 33] = KO, ['L'] = 11, ['w'] = 48, [162] = KO, [205] = KO, [248] = KO, + [ 34] = KO, ['M'] = 12, ['x'] = 49, [163] = KO, [206] = KO, [249] = KO, + [ 35] = KO, ['N'] = 13, ['y'] = 50, [164] = KO, [207] = KO, [250] = KO, + [ 36] = KO, ['O'] = 14, ['z'] = 51, [165] = KO, [208] = KO, [251] = KO, + [ 37] = KO, ['P'] = 15, [123] = KO, [166] = KO, [209] = KO, [252] = KO, + [ 38] = KO, ['Q'] = 16, [124] = KO, [167] = KO, [210] = KO, [253] = KO, + [ 39] = KO, ['R'] = 17, [125] = KO, [168] = KO, [211] = KO, [254] = KO, + [ 40] = KO, ['S'] = 18, [126] = KO, [169] = KO, [212] = KO, [255] = KO, + [ 41] = KO, ['T'] = 19, [127] = KO, [170] = KO, [213] = KO, + [ 42] = KO, ['U'] = 20, [128] = KO, [171] = KO, [214] = KO, +}; + +int32_t base64_encode(const uint8_t *in, + const uint32_t in_len, + uint8_t *out, + const uint32_t out_len) +{ + // Checking inputs. + if (in == NULL || out == NULL) { + return KNOT_EINVAL; + } + if (in_len > MAX_BIN_DATA_LEN || out_len < ((in_len + 2) / 3) * 4) { + return KNOT_ERANGE; + } + + uint8_t rest_len = in_len % 3; + const uint8_t *stop = in + in_len - rest_len; + uint8_t *text = out; + + // Encoding loop takes 3 bytes and creates 4 characters. + while (in < stop) { + text[0] = base64_enc[in[0] >> 2]; + text[1] = base64_enc[(in[0] & 0x03) << 4 | in[1] >> 4]; + text[2] = base64_enc[(in[1] & 0x0F) << 2 | in[2] >> 6]; + text[3] = base64_enc[in[2] & 0x3F]; + text += 4; + in += 3; + } + + // Processing of padding, if any. + switch (rest_len) { + case 2: + text[0] = base64_enc[in[0] >> 2]; + text[1] = base64_enc[(in[0] & 0x03) << 4 | in[1] >> 4]; + text[2] = base64_enc[(in[1] & 0x0F) << 2]; + text[3] = base64_pad; + text += 4; + break; + case 1: + text[0] = base64_enc[in[0] >> 2]; + text[1] = base64_enc[(in[0] & 0x03) << 4]; + text[2] = base64_pad; + text[3] = base64_pad; + text += 4; + break; + } + + return (text - out); +} + +int32_t base64_encode_alloc(const uint8_t *in, + const uint32_t in_len, + uint8_t **out) +{ + // Checking inputs. + if (out == NULL) { + return KNOT_EINVAL; + } + if (in_len > MAX_BIN_DATA_LEN) { + return KNOT_ERANGE; + } + + // Compute output buffer length. + uint32_t out_len = ((in_len + 2) / 3) * 4; + + // Allocate output buffer. + *out = malloc(out_len); + if (*out == NULL) { + return KNOT_ENOMEM; + } + + // Encode data. + int32_t ret = base64_encode(in, in_len, *out, out_len); + if (ret < 0) { + free(*out); + } + + return ret; +} + +int32_t base64_decode(const uint8_t *in, + const uint32_t in_len, + uint8_t *out, + const uint32_t out_len) +{ + // Checking inputs. + if (in == NULL || out == NULL) { + return KNOT_EINVAL; + } + if (in_len > INT32_MAX || out_len < ((in_len + 3) / 4) * 3) { + return KNOT_ERANGE; + } + if ((in_len % 4) != 0) { + return KNOT_BASE64_ESIZE; + } + + const uint8_t *stop = in + in_len; + uint8_t *bin = out; + uint8_t pad_len = 0; + uint8_t c1, c2, c3, c4; + + // Decoding loop takes 4 characters and creates 3 bytes. + while (in < stop) { + // Filling and transforming 4 Base64 chars. + c1 = base64_dec[in[0]]; + c2 = base64_dec[in[1]]; + c3 = base64_dec[in[2]]; + c4 = base64_dec[in[3]]; + + // Check 4. char if is bad or padding. + if (c4 >= PD) { + if (c4 == PD && pad_len == 0) { + pad_len = 1; + } else { + return KNOT_BASE64_ECHAR; + } + } + + // Check 3. char if is bad or padding. + if (c3 >= PD) { + if (c3 == PD && pad_len == 1) { + pad_len = 2; + } else { + return KNOT_BASE64_ECHAR; + } + } + + // Check 1. and 2. chars if are not padding. + if (c2 >= PD || c1 >= PD) { + return KNOT_BASE64_ECHAR; + } + + // Computing of output data based on padding length. + switch (pad_len) { + case 0: + bin[2] = (c3 << 6) + c4; + case 1: + bin[1] = (c2 << 4) + (c3 >> 2); + case 2: + bin[0] = (c1 << 2) + (c2 >> 4); + } + + // Update output end. + switch (pad_len) { + case 0: + bin += 3; + break; + case 1: + bin += 2; + break; + case 2: + bin += 1; + break; + } + + in += 4; + } + + return (bin - out); +} + +int32_t base64_decode_alloc(const uint8_t *in, + const uint32_t in_len, + uint8_t **out) +{ + // Checking inputs. + if (out == NULL) { + return KNOT_EINVAL; + } + + // Compute output buffer length. + uint32_t out_len = ((in_len + 3) / 4) * 3; + + // Allocate output buffer. + *out = malloc(out_len); + if (*out == NULL) { + return KNOT_ENOMEM; + } + + // Decode data. + int32_t ret = base64_decode(in, in_len, *out, out_len); + if (ret < 0) { + free(*out); + } + + return ret; +} diff --git a/contrib/base64.h b/contrib/base64.h new file mode 100644 index 0000000..0684274 --- /dev/null +++ b/contrib/base64.h @@ -0,0 +1,107 @@ +/* Copyright (C) 2011-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +/*! + * \file + * + * \brief Base64 implementation (RFC 4648). + * + * \addtogroup contrib + * @{ + */ + +#pragma once + +#include <stdint.h> + +/*! + * \brief Encodes binary data using Base64. + * + * \note Output data buffer contains Base64 text string which isn't + * terminated with '\0'! + * + * \param in Input binary data. + * \param in_len Length of input data. + * \param out Output data buffer. + * \param out_len Size of output buffer. + * + * \retval >=0 length of output string. + * \retval KNOT_E* if error. + */ +int32_t base64_encode(const uint8_t *in, + const uint32_t in_len, + uint8_t *out, + const uint32_t out_len); + +/*! + * \brief Encodes binary data using Base64 and output stores to own buffer. + * + * \note Output data buffer contains Base64 text string which isn't + * terminated with '\0'! + * + * \note Output buffer should be deallocated after use. + * + * \param in Input binary data. + * \param in_len Length of input data. + * \param out Output data buffer. + * + * \retval >=0 length of output string. + * \retval KNOT_E* if error. + */ +int32_t base64_encode_alloc(const uint8_t *in, + const uint32_t in_len, + uint8_t **out); + +/*! + * \brief Decodes text data using Base64. + * + * \note Input data needn't be terminated with '\0'. + * + * \note Input data must be continuous Base64 string! + * + * \param in Input text data. + * \param in_len Length of input string. + * \param out Output data buffer. + * \param out_len Size of output buffer. + * + * \retval >=0 length of output data. + * \retval KNOT_E* if error. + */ +int32_t base64_decode(const uint8_t *in, + const uint32_t in_len, + uint8_t *out, + const uint32_t out_len); + +/*! + * \brief Decodes text data using Base64 and output stores to own buffer. + * + * \note Input data needn't be terminated with '\0'. + * + * \note Input data must be continuous Base64 string! + * + * \note Output buffer should be deallocated after use. + * + * \param in Input text data. + * \param in_len Length of input string. + * \param out Output data buffer. + * + * \retval >=0 length of output data. + * \retval KNOT_E* if error. + */ +int32_t base64_decode_alloc(const uint8_t *in, + const uint32_t in_len, + uint8_t **out); + +/*! @} */ diff --git a/contrib/ccan/asprintf/LICENSE b/contrib/ccan/asprintf/LICENSE new file mode 100644 index 0000000..89de354 --- /dev/null +++ b/contrib/ccan/asprintf/LICENSE @@ -0,0 +1,17 @@ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/contrib/ccan/asprintf/_info b/contrib/ccan/asprintf/_info new file mode 100644 index 0000000..f72d668 --- /dev/null +++ b/contrib/ccan/asprintf/_info @@ -0,0 +1,45 @@ +#include "config.h" +#include <stdio.h> +#include <string.h> + +/** + * asprintf - asprintf wrapper (and if necessary, implementation). + * + * This provides a convenient wrapper for asprintf, and also implements + * asprintf if necessary. + * + * Author: Rusty Russell <rusty@rustcorp.com.au> + * + * License: MIT + * + * Example: + * #include <ccan/asprintf/asprintf.h> + * #include <unistd.h> + * #include <err.h> + * + * int main(int argc, char *argv[]) + * { + * char *p = afmt("This program has %i arguments", argc); + * int ret; + * + * while ((ret = write(STDOUT_FILENO, p, strlen(p))) > 0) { + * p += ret; + * if (!*p) + * exit(0); + * } + * err(1, "Writing to stdout"); + * } + */ +int main(int argc, char *argv[]) +{ + /* Expect exactly one argument */ + if (argc != 2) + return 1; + + if (strcmp(argv[1], "depends") == 0) { + printf("ccan/compiler\n"); + return 0; + } + + return 1; +} diff --git a/contrib/ccan/asprintf/asprintf.c b/contrib/ccan/asprintf/asprintf.c new file mode 100644 index 0000000..9e66e13 --- /dev/null +++ b/contrib/ccan/asprintf/asprintf.c @@ -0,0 +1,56 @@ +/* Licensed under BSD-MIT - see LICENSE file for details */ +#include <ccan/asprintf/asprintf.h> +#include <stdarg.h> +#include <stdio.h> + +char *PRINTF_FMT(1, 2) afmt(const char *fmt, ...) +{ + va_list ap; + char *ptr; + + va_start(ap, fmt); + /* The BSD version apparently sets ptr to NULL on fail. GNU loses. */ + if (vasprintf(&ptr, fmt, ap) < 0) + ptr = NULL; + va_end(ap); + return ptr; +} + +#if !HAVE_ASPRINTF && !defined(__USE_FORTIFY_LEVEL) +#include <stdarg.h> +#include <stdlib.h> + +int vasprintf(char **strp, const char *fmt, va_list ap) +{ + int len; + va_list ap_copy; + + /* We need to make a copy of ap, since it's a use-once. */ + va_copy(ap_copy, ap); + len = vsnprintf(NULL, 0, fmt, ap_copy); + va_end(ap_copy); + + /* Until version 2.0.6 glibc would return -1 on truncated output. + * OTOH, they had asprintf. */ + if (len < 0) + return -1; + + *strp = malloc(len+1); + if (!*strp) + return -1; + + return vsprintf(*strp, fmt, ap); +} + +int asprintf(char **strp, const char *fmt, ...) +{ + va_list ap; + int len; + + va_start(ap, fmt); + len = vasprintf(strp, fmt, ap); + va_end(ap); + + return len; +} +#endif /* !HAVE_ASPRINTF */ diff --git a/contrib/ccan/asprintf/asprintf.h b/contrib/ccan/asprintf/asprintf.h new file mode 100644 index 0000000..37da83e --- /dev/null +++ b/contrib/ccan/asprintf/asprintf.h @@ -0,0 +1,50 @@ +/* Licensed under BSD-MIT - see LICENSE file for details */ +#ifndef CCAN_ASPRINTF_H +#define CCAN_ASPRINTF_H +#include "config.h" +#include <ccan/compiler/compiler.h> + +/** + * afmt - allocate and populate a string with the given format. + * @fmt: printf-style format. + * + * This is a simplified asprintf interface. Returns NULL on error. + */ +char *PRINTF_FMT(1, 2) afmt(const char *fmt, ...); + +#if HAVE_ASPRINTF || defined(__USE_FORTIFY_LEVEL) +#include <stdio.h> +#else +#include <stdarg.h> +/** + * asprintf - printf to a dynamically-allocated string. + * @strp: pointer to the string to allocate. + * @fmt: printf-style format. + * + * Returns -1 (and leaves @strp undefined) on an error. Otherwise returns + * number of bytes printed into @strp. + * + * Example: + * static char *greeting(const char *name) + * { + * char *str; + * int len = asprintf(&str, "Hello %s", name); + * if (len < 0) + * return NULL; + * return str; + * } + */ +int PRINTF_FMT(2, 3) asprintf(char **strp, const char *fmt, ...); + +/** + * vasprintf - vprintf to a dynamically-allocated string. + * @strp: pointer to the string to allocate. + * @fmt: printf-style format. + * + * Returns -1 (and leaves @strp undefined) on an error. Otherwise returns + * number of bytes printed into @strp. + */ +int vasprintf(char **strp, const char *fmt, va_list ap); +#endif + +#endif /* CCAN_ASPRINTF_H */ diff --git a/contrib/ccan/compiler/LICENSE b/contrib/ccan/compiler/LICENSE new file mode 100644 index 0000000..feb9b11 --- /dev/null +++ b/contrib/ccan/compiler/LICENSE @@ -0,0 +1,28 @@ +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer exclusive Copyright and Related Rights (defined below) upon the creator and subsequent owner(s) (each and all, an "owner") of an original work of authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for the purpose of contributing to a commons of creative, cultural and scientific works ("Commons") that the public can reliably and without fear of later claims of infringement build upon, modify, incorporate in other works, reuse and redistribute as freely as possible in any form whatsoever and for any purposes, including without limitation commercial purposes. These owners may contribute to the Commons to promote the ideal of a free culture and the further production of creative, cultural and scientific works, or to gain reputation or greater distribution for their Work in part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any expectation of additional consideration or compensation, the person associating CC0 with a Work (the "Affirmer"), to the extent that he or she is an owner of Copyright and Related Rights in the Work, voluntarily elects to apply CC0 to the Work and publicly distribute the Work under its terms, with knowledge of his or her Copyright and Related Rights in the Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be protected by copyright and related or neighboring rights ("Copyright and Related Rights"). Copyright and Related Rights include, but are not limited to, the following: + + the right to reproduce, adapt, distribute, perform, display, communicate, and translate a Work; + moral rights retained by the original author(s) and/or performer(s); + publicity and privacy rights pertaining to a person's image or likeness depicted in a Work; + rights protecting against unfair competition in regards to a Work, subject to the limitations in paragraph 4(a), below; + rights protecting the extraction, dissemination, use and reuse of data in a Work; + database rights (such as those arising under Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, and under any national implementation thereof, including any amended or successor version of such directive); and + other similar, equivalent or corresponding rights throughout the world based on applicable law or treaty, and any national implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention of, applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and unconditionally waives, abandons, and surrenders all of Affirmer's Copyright and Related Rights and associated claims and causes of action, whether now known or unknown (including existing as well as future claims and causes of action), in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each member of the public at large and to the detriment of Affirmer's heirs and successors, fully intending that such Waiver shall not be subject to revocation, rescission, cancellation, termination, or any other legal or equitable action to disrupt the quiet enjoyment of the Work by the public as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason be judged legally invalid or ineffective under applicable law, then the Waiver shall be preserved to the maximum extent permitted taking into account Affirmer's express Statement of Purpose. In addition, to the extent the Waiver is so judged Affirmer hereby grants to each affected person a royalty-free, non transferable, non sublicensable, non exclusive, irrevocable and unconditional license to exercise Affirmer's Copyright and Related Rights in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "License"). The License shall be deemed effective as of the date CC0 was applied by Affirmer to the Work. Should any part of the License for any reason be judged legally invalid or ineffective under applicable law, such partial invalidity or ineffectiveness shall not invalidate the remainder of the License, and in such case Affirmer hereby affirms that he or she will not (i) exercise any of his or her remaining Copyright and Related Rights in the Work or (ii) assert any associated claims and causes of action with respect to the Work, in either case contrary to Affirmer's express Statement of Purpose. + +4. Limitations and Disclaimers. + + No trademark or patent rights held by Affirmer are waived, abandoned, surrendered, licensed or otherwise affected by this document. + Affirmer offers the Work as-is and makes no representations or warranties of any kind concerning the Work, express, implied, statutory or otherwise, including without limitation warranties of title, merchantability, fitness for a particular purpose, non infringement, or the absence of latent or other defects, accuracy, or the present or absence of errors, whether or not discoverable, all to the greatest extent permissible under applicable law. + Affirmer disclaims responsibility for clearing rights of other persons that may apply to the Work or any use thereof, including without limitation any person's Copyright and Related Rights in the Work. Further, Affirmer disclaims responsibility for obtaining any necessary consents, permissions or other rights required for any use of the Work. + Affirmer understands and acknowledges that Creative Commons is not a party to this document and has no duty or obligation with respect to this CC0 or use of the Work. diff --git a/contrib/ccan/compiler/_info b/contrib/ccan/compiler/_info new file mode 100644 index 0000000..d60dff4 --- /dev/null +++ b/contrib/ccan/compiler/_info @@ -0,0 +1,64 @@ +#include "config.h" +#include <string.h> +#include <stdio.h> + +/** + * compiler - macros for common compiler extensions + * + * Abstracts away some compiler hints. Currently these include: + * - COLD + * For functions not called in fast paths (aka. cold functions) + * - PRINTF_FMT + * For functions which take printf-style parameters. + * - CONST_FUNCTION + * For functions which return the same value for same parameters. + * - NEEDED + * For functions and variables which must be emitted even if unused. + * - UNNEEDED + * For functions and variables which need not be emitted if unused. + * - UNUSED + * For parameters which are not used. + * - IS_COMPILE_CONSTANT() + * For using different tradeoffs for compiletime vs runtime evaluation. + * + * License: CC0 (Public domain) + * Author: Rusty Russell <rusty@rustcorp.com.au> + * + * Example: + * #include <ccan/compiler/compiler.h> + * #include <stdio.h> + * #include <stdarg.h> + * + * // Example of a (slow-path) logging function. + * static int log_threshold = 2; + * static void COLD PRINTF_FMT(2,3) + * logger(int level, const char *fmt, ...) + * { + * va_list ap; + * va_start(ap, fmt); + * if (level >= log_threshold) + * vfprintf(stderr, fmt, ap); + * va_end(ap); + * } + * + * int main(int argc, char *argv[]) + * { + * if (argc != 1) { + * logger(3, "Don't want %i arguments!\n", argc-1); + * return 1; + * } + * return 0; + * } + */ +int main(int argc, char *argv[]) +{ + /* Expect exactly one argument */ + if (argc != 2) + return 1; + + if (strcmp(argv[1], "depends") == 0) { + return 0; + } + + return 1; +} diff --git a/contrib/ccan/compiler/compiler.h b/contrib/ccan/compiler/compiler.h new file mode 100644 index 0000000..bce4f25 --- /dev/null +++ b/contrib/ccan/compiler/compiler.h @@ -0,0 +1,231 @@ +/* CC0 (Public domain) - see LICENSE file for details */ +#ifndef CCAN_COMPILER_H +#define CCAN_COMPILER_H +#include "config.h" + +#ifndef COLD +#if HAVE_ATTRIBUTE_COLD +/** + * COLD - a function is unlikely to be called. + * + * Used to mark an unlikely code path and optimize appropriately. + * It is usually used on logging or error routines. + * + * Example: + * static void COLD moan(const char *reason) + * { + * fprintf(stderr, "Error: %s (%s)\n", reason, strerror(errno)); + * } + */ +#define COLD __attribute__((__cold__)) +#else +#define COLD +#endif +#endif + +#ifndef NORETURN +#if HAVE_ATTRIBUTE_NORETURN +/** + * NORETURN - a function does not return + * + * Used to mark a function which exits; useful for suppressing warnings. + * + * Example: + * static void NORETURN fail(const char *reason) + * { + * fprintf(stderr, "Error: %s (%s)\n", reason, strerror(errno)); + * exit(1); + * } + */ +#define NORETURN __attribute__((__noreturn__)) +#else +#define NORETURN +#endif +#endif + +#ifndef PRINTF_FMT +#if HAVE_ATTRIBUTE_PRINTF +/** + * PRINTF_FMT - a function takes printf-style arguments + * @nfmt: the 1-based number of the function's format argument. + * @narg: the 1-based number of the function's first variable argument. + * + * This allows the compiler to check your parameters as it does for printf(). + * + * Example: + * void PRINTF_FMT(2,3) my_printf(const char *prefix, const char *fmt, ...); + */ +#define PRINTF_FMT(nfmt, narg) \ + __attribute__((format(__printf__, nfmt, narg))) +#else +#define PRINTF_FMT(nfmt, narg) +#endif +#endif + +#ifndef CONST_FUNCTION +#if HAVE_ATTRIBUTE_CONST +/** + * CONST_FUNCTION - a function's return depends only on its argument + * + * This allows the compiler to assume that the function will return the exact + * same value for the exact same arguments. This implies that the function + * must not use global variables, or dereference pointer arguments. + */ +#define CONST_FUNCTION __attribute__((__const__)) +#else +#define CONST_FUNCTION +#endif + +#ifndef PURE_FUNCTION +#if HAVE_ATTRIBUTE_PURE +/** + * PURE_FUNCTION - a function is pure + * + * A pure function is one that has no side effects other than it's return value + * and uses no inputs other than it's arguments and global variables. + */ +#define PURE_FUNCTION __attribute__((__pure__)) +#else +#define PURE_FUNCTION +#endif +#endif +#endif + +#if HAVE_ATTRIBUTE_UNUSED +#ifndef UNNEEDED +/** + * UNNEEDED - a variable/function may not be needed + * + * This suppresses warnings about unused variables or functions, but tells + * the compiler that if it is unused it need not emit it into the source code. + * + * Example: + * // With some preprocessor options, this is unnecessary. + * static UNNEEDED int counter; + * + * // With some preprocessor options, this is unnecessary. + * static UNNEEDED void add_to_counter(int add) + * { + * counter += add; + * } + */ +#define UNNEEDED __attribute__((__unused__)) +#endif + +#ifndef NEEDED +#if HAVE_ATTRIBUTE_USED +/** + * NEEDED - a variable/function is needed + * + * This suppresses warnings about unused variables or functions, but tells + * the compiler that it must exist even if it (seems) unused. + * + * Example: + * // Even if this is unused, these are vital for debugging. + * static NEEDED int counter; + * static NEEDED void dump_counter(void) + * { + * printf("Counter is %i\n", counter); + * } + */ +#define NEEDED __attribute__((__used__)) +#else +/* Before used, unused functions and vars were always emitted. */ +#define NEEDED __attribute__((__unused__)) +#endif +#endif + +#ifndef UNUSED +/** + * UNUSED - a parameter is unused + * + * Some compilers (eg. gcc with -W or -Wunused) warn about unused + * function parameters. This suppresses such warnings and indicates + * to the reader that it's deliberate. + * + * Example: + * // This is used as a callback, so needs to have this prototype. + * static int some_callback(void *unused UNUSED) + * { + * return 0; + * } + */ +#define UNUSED __attribute__((__unused__)) +#endif +#else +#ifndef UNNEEDED +#define UNNEEDED +#endif +#ifndef NEEDED +#define NEEDED +#endif +#ifndef UNUSED +#define UNUSED +#endif +#endif + +#ifndef IS_COMPILE_CONSTANT +#if HAVE_BUILTIN_CONSTANT_P +/** + * IS_COMPILE_CONSTANT - does the compiler know the value of this expression? + * @expr: the expression to evaluate + * + * When an expression manipulation is complicated, it is usually better to + * implement it in a function. However, if the expression being manipulated is + * known at compile time, it is better to have the compiler see the entire + * expression so it can simply substitute the result. + * + * This can be done using the IS_COMPILE_CONSTANT() macro. + * + * Example: + * enum greek { ALPHA, BETA, GAMMA, DELTA, EPSILON }; + * + * // Out-of-line version. + * const char *greek_name(enum greek greek); + * + * // Inline version. + * static inline const char *_greek_name(enum greek greek) + * { + * switch (greek) { + * case ALPHA: return "alpha"; + * case BETA: return "beta"; + * case GAMMA: return "gamma"; + * case DELTA: return "delta"; + * case EPSILON: return "epsilon"; + * default: return "**INVALID**"; + * } + * } + * + * // Use inline if compiler knows answer. Otherwise call function + * // to avoid copies of the same code everywhere. + * #define greek_name(g) \ + * (IS_COMPILE_CONSTANT(greek) ? _greek_name(g) : greek_name(g)) + */ +#define IS_COMPILE_CONSTANT(expr) __builtin_constant_p(expr) +#else +/* If we don't know, assume it's not. */ +#define IS_COMPILE_CONSTANT(expr) 0 +#endif +#endif + +#ifndef WARN_UNUSED_RESULT +#if HAVE_WARN_UNUSED_RESULT +/** + * WARN_UNUSED_RESULT - warn if a function return value is unused. + * + * Used to mark a function where it is extremely unlikely that the caller + * can ignore the result, eg realloc(). + * + * Example: + * // buf param may be freed by this; need return value! + * static char *WARN_UNUSED_RESULT enlarge(char *buf, unsigned *size) + * { + * return realloc(buf, (*size) *= 2); + * } + */ +#define WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) +#else +#define WARN_UNUSED_RESULT +#endif +#endif +#endif /* CCAN_COMPILER_H */ diff --git a/contrib/ccan/compiler/test/compile_fail-printf.c b/contrib/ccan/compiler/test/compile_fail-printf.c new file mode 100644 index 0000000..8f34ae5 --- /dev/null +++ b/contrib/ccan/compiler/test/compile_fail-printf.c @@ -0,0 +1,22 @@ +#include <ccan/compiler/compiler.h> + +static void PRINTF_FMT(2,3) my_printf(int x, const char *fmt, ...) +{ +} + +int main(int argc, char *argv[]) +{ + unsigned int i = 0; + + my_printf(1, "Not a pointer " +#ifdef FAIL + "%p", +#if !HAVE_ATTRIBUTE_PRINTF +#error "Unfortunately we don't fail if !HAVE_ATTRIBUTE_PRINTF." +#endif +#else + "%i", +#endif + i); + return 0; +} diff --git a/contrib/ccan/compiler/test/run-is_compile_constant.c b/contrib/ccan/compiler/test/run-is_compile_constant.c new file mode 100644 index 0000000..a66f2e1 --- /dev/null +++ b/contrib/ccan/compiler/test/run-is_compile_constant.c @@ -0,0 +1,15 @@ +#include <ccan/compiler/compiler.h> +#include <ccan/tap/tap.h> + +int main(int argc, char *argv[]) +{ + plan_tests(2); + + ok1(!IS_COMPILE_CONSTANT(argc)); +#if HAVE_BUILTIN_CONSTANT_P + ok1(IS_COMPILE_CONSTANT(7)); +#else + pass("If !HAVE_BUILTIN_CONSTANT_P, IS_COMPILE_CONSTANT always false"); +#endif + return exit_status(); +} diff --git a/contrib/ccan/ilog/LICENSE b/contrib/ccan/ilog/LICENSE new file mode 100644 index 0000000..feb9b11 --- /dev/null +++ b/contrib/ccan/ilog/LICENSE @@ -0,0 +1,28 @@ +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer exclusive Copyright and Related Rights (defined below) upon the creator and subsequent owner(s) (each and all, an "owner") of an original work of authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for the purpose of contributing to a commons of creative, cultural and scientific works ("Commons") that the public can reliably and without fear of later claims of infringement build upon, modify, incorporate in other works, reuse and redistribute as freely as possible in any form whatsoever and for any purposes, including without limitation commercial purposes. These owners may contribute to the Commons to promote the ideal of a free culture and the further production of creative, cultural and scientific works, or to gain reputation or greater distribution for their Work in part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any expectation of additional consideration or compensation, the person associating CC0 with a Work (the "Affirmer"), to the extent that he or she is an owner of Copyright and Related Rights in the Work, voluntarily elects to apply CC0 to the Work and publicly distribute the Work under its terms, with knowledge of his or her Copyright and Related Rights in the Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be protected by copyright and related or neighboring rights ("Copyright and Related Rights"). Copyright and Related Rights include, but are not limited to, the following: + + the right to reproduce, adapt, distribute, perform, display, communicate, and translate a Work; + moral rights retained by the original author(s) and/or performer(s); + publicity and privacy rights pertaining to a person's image or likeness depicted in a Work; + rights protecting against unfair competition in regards to a Work, subject to the limitations in paragraph 4(a), below; + rights protecting the extraction, dissemination, use and reuse of data in a Work; + database rights (such as those arising under Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, and under any national implementation thereof, including any amended or successor version of such directive); and + other similar, equivalent or corresponding rights throughout the world based on applicable law or treaty, and any national implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention of, applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and unconditionally waives, abandons, and surrenders all of Affirmer's Copyright and Related Rights and associated claims and causes of action, whether now known or unknown (including existing as well as future claims and causes of action), in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each member of the public at large and to the detriment of Affirmer's heirs and successors, fully intending that such Waiver shall not be subject to revocation, rescission, cancellation, termination, or any other legal or equitable action to disrupt the quiet enjoyment of the Work by the public as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason be judged legally invalid or ineffective under applicable law, then the Waiver shall be preserved to the maximum extent permitted taking into account Affirmer's express Statement of Purpose. In addition, to the extent the Waiver is so judged Affirmer hereby grants to each affected person a royalty-free, non transferable, non sublicensable, non exclusive, irrevocable and unconditional license to exercise Affirmer's Copyright and Related Rights in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "License"). The License shall be deemed effective as of the date CC0 was applied by Affirmer to the Work. Should any part of the License for any reason be judged legally invalid or ineffective under applicable law, such partial invalidity or ineffectiveness shall not invalidate the remainder of the License, and in such case Affirmer hereby affirms that he or she will not (i) exercise any of his or her remaining Copyright and Related Rights in the Work or (ii) assert any associated claims and causes of action with respect to the Work, in either case contrary to Affirmer's express Statement of Purpose. + +4. Limitations and Disclaimers. + + No trademark or patent rights held by Affirmer are waived, abandoned, surrendered, licensed or otherwise affected by this document. + Affirmer offers the Work as-is and makes no representations or warranties of any kind concerning the Work, express, implied, statutory or otherwise, including without limitation warranties of title, merchantability, fitness for a particular purpose, non infringement, or the absence of latent or other defects, accuracy, or the present or absence of errors, whether or not discoverable, all to the greatest extent permissible under applicable law. + Affirmer disclaims responsibility for clearing rights of other persons that may apply to the Work or any use thereof, including without limitation any person's Copyright and Related Rights in the Work. Further, Affirmer disclaims responsibility for obtaining any necessary consents, permissions or other rights required for any use of the Work. + Affirmer understands and acknowledges that Creative Commons is not a party to this document and has no duty or obligation with respect to this CC0 or use of the Work. diff --git a/contrib/ccan/ilog/_info b/contrib/ccan/ilog/_info new file mode 100644 index 0000000..f1f3f2d --- /dev/null +++ b/contrib/ccan/ilog/_info @@ -0,0 +1,50 @@ +/** + * ilog - Integer logarithm. + * + * ilog_32() and ilog_64() compute the minimum number of bits required to store + * an unsigned 32-bit or 64-bit value without any leading zero bits. + * + * This can also be thought of as the location of the highest set bit, with + * counting starting from one (so that 0 returns 0, 1 returns 1, and 2**31 + * returns 32). + * + * When the value is known to be non-zero ilog32_nz() and ilog64_nz() can + * compile into as few as two instructions, one of which may get optimized out + * later. + * + * STATIC_ILOG_32 and STATIC_ILOG_64 allow computation on compile-time + * constants, so other compile-time constants can be derived from them. + * + * Example: + * #include <stdio.h> + * #include <limits.h> + * #include <ccan/ilog/ilog.h> + * + * int main(void){ + * int i; + * printf("ilog32(0x%08X)=%i\n",0,ilog32(0)); + * for(i=1;i<=STATIC_ILOG_32(USHRT_MAX);i++){ + * uint32_t v; + * v=(uint32_t)1U<<(i-1); + * //Here we know v is non-zero, so we can use ilog32_nz(). + * printf("ilog32(0x%08X)=%i\n",v,ilog32_nz(v)); + * } + * return 0; + * } + * + * License: CC0 (Public domain) + * Author: Timothy B. Terriberry <tterribe@xiph.org> + */ +#include "config.h" +#include <string.h> +#include <stdio.h> + +int main(int _argc,const char *_argv[]){ + /*Expect exactly one argument.*/ + if(_argc!=2)return 1; + if(strcmp(_argv[1],"depends")==0){ + printf("ccan/compiler\n"); + return 0; + } + return 1; +} diff --git a/contrib/ccan/ilog/ilog.c b/contrib/ccan/ilog/ilog.c new file mode 100644 index 0000000..5f5122d --- /dev/null +++ b/contrib/ccan/ilog/ilog.c @@ -0,0 +1,141 @@ +/*(C) Timothy B. Terriberry (tterribe@xiph.org) 2001-2009 CC0 (Public domain). + * See LICENSE file for details. */ +#include "ilog.h" +#include <limits.h> + +/*The fastest fallback strategy for platforms with fast multiplication appears + to be based on de Bruijn sequences~\cite{LP98}. + Tests confirmed this to be true even on an ARM11, where it is actually faster + than using the native clz instruction. + Define ILOG_NODEBRUIJN to use a simpler fallback on platforms where + multiplication or table lookups are too expensive. + + @UNPUBLISHED{LP98, + author="Charles E. Leiserson and Harald Prokop", + title="Using de {Bruijn} Sequences to Index a 1 in a Computer Word", + month=Jun, + year=1998, + note="\url{http://supertech.csail.mit.edu/papers/debruijn.pdf}" + }*/ +static UNNEEDED const unsigned char DEBRUIJN_IDX32[32]={ + 0, 1,28, 2,29,14,24, 3,30,22,20,15,25,17, 4, 8, + 31,27,13,23,21,19,16, 7,26,12,18, 6,11, 5,10, 9 +}; + +/* We always compile these in, in case someone takes address of function. */ +#undef ilog32_nz +#undef ilog32 +#undef ilog64_nz +#undef ilog64 + +int ilog32(uint32_t _v){ +/*On a Pentium M, this branchless version tested as the fastest version without + multiplications on 1,000,000,000 random 32-bit integers, edging out a + similar version with branches, and a 256-entry LUT version.*/ +# if defined(ILOG_NODEBRUIJN) + int ret; + int m; + ret=_v>0; + m=(_v>0xFFFFU)<<4; + _v>>=m; + ret|=m; + m=(_v>0xFFU)<<3; + _v>>=m; + ret|=m; + m=(_v>0xFU)<<2; + _v>>=m; + ret|=m; + m=(_v>3)<<1; + _v>>=m; + ret|=m; + ret+=_v>1; + return ret; +/*This de Bruijn sequence version is faster if you have a fast multiplier.*/ +# else + int ret; + ret=_v>0; + _v|=_v>>1; + _v|=_v>>2; + _v|=_v>>4; + _v|=_v>>8; + _v|=_v>>16; + _v=(_v>>1)+1; + ret+=DEBRUIJN_IDX32[_v*0x77CB531U>>27&0x1F]; + return ret; +# endif +} + +int ilog32_nz(uint32_t _v) +{ + return ilog32(_v); +} + +int ilog64(uint64_t _v){ +# if defined(ILOG_NODEBRUIJN) + uint32_t v; + int ret; + int m; + ret=_v>0; + m=(_v>0xFFFFFFFFU)<<5; + v=(uint32_t)(_v>>m); + ret|=m; + m=(v>0xFFFFU)<<4; + v>>=m; + ret|=m; + m=(v>0xFFU)<<3; + v>>=m; + ret|=m; + m=(v>0xFU)<<2; + v>>=m; + ret|=m; + m=(v>3)<<1; + v>>=m; + ret|=m; + ret+=v>1; + return ret; +# else +/*If we don't have a 64-bit word, split it into two 32-bit halves.*/ +# if LONG_MAX<9223372036854775807LL + uint32_t v; + int ret; + int m; + ret=_v>0; + m=(_v>0xFFFFFFFFU)<<5; + v=(uint32_t)(_v>>m); + ret|=m; + v|=v>>1; + v|=v>>2; + v|=v>>4; + v|=v>>8; + v|=v>>16; + v=(v>>1)+1; + ret+=DEBRUIJN_IDX32[v*0x77CB531U>>27&0x1F]; + return ret; +/*Otherwise do it in one 64-bit operation.*/ +# else + static const unsigned char DEBRUIJN_IDX64[64]={ + 0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40, + 5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57, + 63, 6,12,18,24,27,33,39,16,37,45,47,30,53,49,56, + 62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58 + }; + int ret; + ret=_v>0; + _v|=_v>>1; + _v|=_v>>2; + _v|=_v>>4; + _v|=_v>>8; + _v|=_v>>16; + _v|=_v>>32; + _v=(_v>>1)+1; + ret+=DEBRUIJN_IDX64[_v*0x218A392CD3D5DBF>>58&0x3F]; + return ret; +# endif +# endif +} + +int ilog64_nz(uint64_t _v) +{ + return ilog64(_v); +} + diff --git a/contrib/ccan/ilog/ilog.h b/contrib/ccan/ilog/ilog.h new file mode 100644 index 0000000..9adbb82 --- /dev/null +++ b/contrib/ccan/ilog/ilog.h @@ -0,0 +1,151 @@ +/* CC0 (Public domain) - see LICENSE file for details */ +#if !defined(_ilog_H) +# define _ilog_H (1) +# include "config.h" +# include <stdint.h> +# include <limits.h> +# include <ccan/compiler/compiler.h> + +/** + * ilog32 - Integer binary logarithm of a 32-bit value. + * @_v: A 32-bit value. + * Returns floor(log2(_v))+1, or 0 if _v==0. + * This is the number of bits that would be required to represent _v in two's + * complement notation with all of the leading zeros stripped. + * Note that many uses will resolve to the fast macro version instead. + * + * See Also: + * ilog32_nz(), ilog64() + * + * Example: + * // Rounds up to next power of 2 (if not a power of 2). + * static uint32_t round_up32(uint32_t i) + * { + * assert(i != 0); + * return 1U << ilog32(i-1); + * } + */ +int ilog32(uint32_t _v) CONST_FUNCTION; + +/** + * ilog32_nz - Integer binary logarithm of a non-zero 32-bit value. + * @_v: A 32-bit value. + * Returns floor(log2(_v))+1, or undefined if _v==0. + * This is the number of bits that would be required to represent _v in two's + * complement notation with all of the leading zeros stripped. + * Note that many uses will resolve to the fast macro version instead. + * See Also: + * ilog32(), ilog64_nz() + * Example: + * // Find Last Set (ie. highest bit set, 0 to 31). + * static uint32_t fls32(uint32_t i) + * { + * assert(i != 0); + * return ilog32_nz(i) - 1; + * } + */ +int ilog32_nz(uint32_t _v) CONST_FUNCTION; + +/** + * ilog64 - Integer binary logarithm of a 64-bit value. + * @_v: A 64-bit value. + * Returns floor(log2(_v))+1, or 0 if _v==0. + * This is the number of bits that would be required to represent _v in two's + * complement notation with all of the leading zeros stripped. + * Note that many uses will resolve to the fast macro version instead. + * See Also: + * ilog64_nz(), ilog32() + */ +int ilog64(uint64_t _v) CONST_FUNCTION; + +/** + * ilog64_nz - Integer binary logarithm of a non-zero 64-bit value. + * @_v: A 64-bit value. + * Returns floor(log2(_v))+1, or undefined if _v==0. + * This is the number of bits that would be required to represent _v in two's + * complement notation with all of the leading zeros stripped. + * Note that many uses will resolve to the fast macro version instead. + * See Also: + * ilog64(), ilog32_nz() + */ +int ilog64_nz(uint64_t _v) CONST_FUNCTION; + +/** + * STATIC_ILOG_32 - The integer logarithm of an (unsigned, 32-bit) constant. + * @_v: A non-negative 32-bit constant. + * Returns floor(log2(_v))+1, or 0 if _v==0. + * This is the number of bits that would be required to represent _v in two's + * complement notation with all of the leading zeros stripped. + * This macro should only be used when you need a compile-time constant, + * otherwise ilog32 or ilog32_nz are just as fast and more flexible. + * + * Example: + * #define MY_PAGE_SIZE 4096 + * #define MY_PAGE_BITS (STATIC_ILOG_32(PAGE_SIZE) - 1) + */ +#define STATIC_ILOG_32(_v) (STATIC_ILOG5((uint32_t)(_v))) + +/** + * STATIC_ILOG_64 - The integer logarithm of an (unsigned, 64-bit) constant. + * @_v: A non-negative 64-bit constant. + * Returns floor(log2(_v))+1, or 0 if _v==0. + * This is the number of bits that would be required to represent _v in two's + * complement notation with all of the leading zeros stripped. + * This macro should only be used when you need a compile-time constant, + * otherwise ilog64 or ilog64_nz are just as fast and more flexible. + */ +#define STATIC_ILOG_64(_v) (STATIC_ILOG6((uint64_t)(_v))) + +/* Private implementation details */ + +/*Note the casts to (int) below: this prevents "upgrading" + the type of an entire expression to an (unsigned) size_t.*/ +#if INT_MAX>=2147483647 && HAVE_BUILTIN_CLZ +#define builtin_ilog32_nz(v) \ + (((int)sizeof(unsigned)*CHAR_BIT) - __builtin_clz(v)) +#elif LONG_MAX>=2147483647L && HAVE_BUILTIN_CLZL +#define builtin_ilog32_nz(v) \ + (((int)sizeof(unsigned)*CHAR_BIT) - __builtin_clzl(v)) +#endif + +#if INT_MAX>=9223372036854775807LL && HAVE_BUILTIN_CLZ +#define builtin_ilog64_nz(v) \ + (((int)sizeof(unsigned)*CHAR_BIT) - __builtin_clz(v)) +#elif LONG_MAX>=9223372036854775807LL && HAVE_BUILTIN_CLZL +#define builtin_ilog64_nz(v) \ + (((int)sizeof(unsigned long)*CHAR_BIT) - __builtin_clzl(v)) +#elif HAVE_BUILTIN_CLZLL +#define builtin_ilog64_nz(v) \ + (((int)sizeof(unsigned long long)*CHAR_BIT) - __builtin_clzll(v)) +#endif + +#ifdef builtin_ilog32_nz +#define ilog32(_v) (builtin_ilog32_nz(_v)&-!!(_v)) +#define ilog32_nz(_v) builtin_ilog32_nz(_v) +#else +#define ilog32_nz(_v) ilog32(_v) +#define ilog32(_v) (IS_COMPILE_CONSTANT(_v) ? STATIC_ILOG_32(_v) : ilog32(_v)) +#endif /* builtin_ilog32_nz */ + +#ifdef builtin_ilog64_nz +#define ilog64(_v) (builtin_ilog64_nz(_v)&-!!(_v)) +#define ilog64_nz(_v) builtin_ilog64_nz(_v) +#else +#define ilog64_nz(_v) ilog64(_v) +#define ilog64(_v) (IS_COMPILE_CONSTANT(_v) ? STATIC_ILOG_64(_v) : ilog64(_v)) +#endif /* builtin_ilog64_nz */ + +/* Macros for evaluating compile-time constant ilog. */ +# define STATIC_ILOG0(_v) (!!(_v)) +# define STATIC_ILOG1(_v) (((_v)&0x2)?2:STATIC_ILOG0(_v)) +# define STATIC_ILOG2(_v) (((_v)&0xC)?2+STATIC_ILOG1((_v)>>2):STATIC_ILOG1(_v)) +# define STATIC_ILOG3(_v) \ + (((_v)&0xF0)?4+STATIC_ILOG2((_v)>>4):STATIC_ILOG2(_v)) +# define STATIC_ILOG4(_v) \ + (((_v)&0xFF00)?8+STATIC_ILOG3((_v)>>8):STATIC_ILOG3(_v)) +# define STATIC_ILOG5(_v) \ + (((_v)&0xFFFF0000)?16+STATIC_ILOG4((_v)>>16):STATIC_ILOG4(_v)) +# define STATIC_ILOG6(_v) \ + (((_v)&0xFFFFFFFF00000000ULL)?32+STATIC_ILOG5((_v)>>32):STATIC_ILOG5(_v)) + +#endif /* _ilog_H */ diff --git a/contrib/ccan/ilog/test/run-out-of-line.c b/contrib/ccan/ilog/test/run-out-of-line.c new file mode 100644 index 0000000..48205d3 --- /dev/null +++ b/contrib/ccan/ilog/test/run-out-of-line.c @@ -0,0 +1,65 @@ +#include <ccan/ilog/ilog.h> +#include <ccan/ilog/ilog.c> +#include <stdio.h> +#include <ccan/tap/tap.h> + +/*Dead simple (but slow) versions to compare against.*/ + +static int test_ilog32(uint32_t _v){ + int ret; + for(ret=0;_v;ret++)_v>>=1; + return ret; +} + +static int test_ilog64(uint64_t _v){ + int ret; + for(ret=0;_v;ret++)_v>>=1; + return ret; +} + +#define NTRIALS (64) + +int main(int _argc,const char *_argv[]){ + int i; + int j; + int (*il32)(uint32_t) = ilog32; + int (*il64)(uint64_t) = ilog64; + int (*il32_nz)(uint32_t) = ilog32_nz; + int (*il64_nz)(uint64_t) = ilog64_nz; + + /*This is how many tests you plan to run.*/ + plan_tests(33 * NTRIALS * 3 + 65 * NTRIALS * 3); + for(i=0;i<=32;i++){ + uint32_t v; + /*Test each bit in turn (and 0).*/ + v=i?(uint32_t)1U<<(i-1):0; + for(j=0;j<NTRIALS;j++){ + int l; + l=test_ilog32(v); + ok1(STATIC_ILOG_32(v)==l); + ok1(il32(v)==l); + ok1(il32_nz(v) == l || v == 0); + /*Also try a few more pseudo-random values with at most the same number + of bits.*/ + v=(1103515245U*v+12345U)&0xFFFFFFFFU>>((33-i)>>1)>>((32-i)>>1); + } + } + + for(i=0;i<=64;i++){ + uint64_t v; + /*Test each bit in turn (and 0).*/ + v=i?(uint64_t)1U<<(i-1):0; + for(j=0;j<NTRIALS;j++){ + int l; + l=test_ilog64(v); + ok1(STATIC_ILOG_64(v)==l); + ok1(il64(v)==l); + ok1(il64_nz(v) == l || v == 0); + /*Also try a few more pseudo-random values with at most the same number + of bits.*/ + v=(uint64_t)((2862933555777941757ULL*v+3037000493ULL) + &0xFFFFFFFFFFFFFFFFULL>>((65-i)>>1)>>((64-i)>>1)); + } + } + return exit_status(); +} diff --git a/contrib/ccan/ilog/test/run.c b/contrib/ccan/ilog/test/run.c new file mode 100644 index 0000000..bda59f9 --- /dev/null +++ b/contrib/ccan/ilog/test/run.c @@ -0,0 +1,60 @@ +#include <ccan/ilog/ilog.h> +#include <ccan/ilog/ilog.c> +#include <stdio.h> +#include <ccan/tap/tap.h> + +/*Dead simple (but slow) versions to compare against.*/ + +static int test_ilog32(uint32_t _v){ + int ret; + for(ret=0;_v;ret++)_v>>=1; + return ret; +} + +static int test_ilog64(uint64_t _v){ + int ret; + for(ret=0;_v;ret++)_v>>=1; + return ret; +} + +#define NTRIALS (64) + +int main(int _argc,const char *_argv[]){ + int i; + int j; + /*This is how many tests you plan to run.*/ + plan_tests(33 * NTRIALS * 3 + 65 * NTRIALS * 3); + for(i=0;i<=32;i++){ + uint32_t v; + /*Test each bit in turn (and 0).*/ + v=i?(uint32_t)1U<<(i-1):0; + for(j=0;j<NTRIALS;j++){ + int l; + l=test_ilog32(v); + ok1(STATIC_ILOG_32(v)==l); + ok1(ilog32(v)==l); + ok1(ilog32_nz(v) == l || v == 0); + /*Also try a few more pseudo-random values with at most the same number + of bits.*/ + v=(1103515245U*v+12345U)&0xFFFFFFFFU>>((33-i)>>1)>>((32-i)>>1); + } + } + + for(i=0;i<=64;i++){ + uint64_t v; + /*Test each bit in turn (and 0).*/ + v=i?(uint64_t)1U<<(i-1):0; + for(j=0;j<NTRIALS;j++){ + int l; + l=test_ilog64(v); + ok1(STATIC_ILOG_64(v)==l); + ok1(ilog64(v)==l); + ok1(ilog64_nz(v) == l || v == 0); + /*Also try a few more pseudo-random values with at most the same number + of bits.*/ + v=(uint64_t)((2862933555777941757ULL*v+3037000493ULL) + &0xFFFFFFFFFFFFFFFFULL>>((65-i)>>1)>>((64-i)>>1)); + } + } + return exit_status(); +} diff --git a/contrib/ccan/json/LICENSE b/contrib/ccan/json/LICENSE new file mode 100644 index 0000000..89de354 --- /dev/null +++ b/contrib/ccan/json/LICENSE @@ -0,0 +1,17 @@ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/contrib/ccan/json/_info b/contrib/ccan/json/_info new file mode 100644 index 0000000..3113b63 --- /dev/null +++ b/contrib/ccan/json/_info @@ -0,0 +1,119 @@ +#include "config.h" +#include <stdio.h> +#include <string.h> + +/** + * json - Parse and generate JSON (JavaScript Object Notation) + * + * This is a library for encoding and decoding JSON that strives to be + * easy to learn, use, and incorporate into an application. + * + * JSON (JavaScript Object Notation) facilitates passing data among different + * programming languages, particularly JavaScript. It looks like this: + * + * [ + * { + * "id": 1, + * "firstname": "John", + * "lastname": "Smith", + * "email": "john@example.com", + * "likes_pizza": false + * }, + * { + * "id": 2, + * "firstname": "Linda", + * "lastname": "Jones", + * "email": null, + * "likes_pizza": true + * } + * ] + * + * Example: + * #include <ccan/json/json.h> + * #include <math.h> + * #include <stdio.h> + * #include <stdlib.h> + * + * static int find_number(JsonNode *object, const char *name, double *out) + * { + * JsonNode *node = json_find_member(object, name); + * if (node && node->tag == JSON_NUMBER) { + * *out = node->number_; + * return 1; + * } + * return 0; + * } + * + * static void solve_pythagorean(JsonNode *triple) + * { + * double a = 0, b = 0, c = 0; + * int a_given, b_given, c_given; + * + * if (triple->tag != JSON_OBJECT) { + * fprintf(stderr, "Error: Expected a JSON object.\n"); + * exit(EXIT_FAILURE); + * } + * + * a_given = find_number(triple, "a", &a); + * b_given = find_number(triple, "b", &b); + * c_given = find_number(triple, "c", &c); + * + * if (a_given + b_given + c_given != 2) { + * fprintf(stderr, "Error: I need two sides to compute the length of the third.\n"); + * exit(EXIT_FAILURE); + * } + * + * if (a_given && b_given) { + * c = sqrt(a*a + b*b); + * json_append_member(triple, "c", json_mknumber(c)); + * } else if (a_given && c_given) { + * b = sqrt(c*c - a*a); + * json_append_member(triple, "b", json_mknumber(b)); + * } else if (b_given && c_given) { + * a = sqrt(c*c - b*b); + * json_append_member(triple, "a", json_mknumber(a)); + * } + * } + * + * int main(void) + * { + * JsonNode *triples = json_mkarray(); + * + * json_append_element(triples, json_decode("{\"a\": 3, \"b\": 4}")); + * json_append_element(triples, json_decode("{\"a\": 5, \"c\": 13}")); + * json_append_element(triples, json_decode("{\"b\": 24, \"c\": 25}")); + * + * JsonNode *triple; + * json_foreach(triple, triples) + * solve_pythagorean(triple); + * + * char *tmp = json_stringify(triples, "\t"); + * puts(tmp); + * free(tmp); + * + * json_delete(triples); + * return 0; + * } + * + * Author: Joey Adams + * Version: 0.1 + * License: MIT + */ +int main(int argc, char *argv[]) +{ + /* Expect exactly one argument */ + if (argc != 2) + return 1; + + if (strcmp(argv[1], "depends") == 0) { + /* Nothing */ + return 0; + } + + if (strcmp(argv[1], "libs") == 0) { + printf("m\n"); /* Needed for sqrt() used in example code above. */ + return 0; + } + + return 1; +} diff --git a/contrib/ccan/json/json.c b/contrib/ccan/json/json.c new file mode 100644 index 0000000..3fd3d43 --- /dev/null +++ b/contrib/ccan/json/json.c @@ -0,0 +1,1381 @@ +/* + Copyright (C) 2011 Joseph A. Adams (joeyadams3.14159@gmail.com) + All rights reserved. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include "json.h" + +#include <assert.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#define out_of_memory() do { \ + fprintf(stderr, "Out of memory.\n"); \ + exit(EXIT_FAILURE); \ + } while (0) + +/* Sadly, strdup is not portable. */ +static char *json_strdup(const char *str) +{ + char *ret = (char*) malloc(strlen(str) + 1); + if (ret == NULL) + out_of_memory(); + strcpy(ret, str); + return ret; +} + +/* String buffer */ + +typedef struct +{ + char *cur; + char *end; + char *start; +} SB; + +static void sb_init(SB *sb) +{ + sb->start = (char*) malloc(17); + if (sb->start == NULL) + out_of_memory(); + sb->cur = sb->start; + sb->end = sb->start + 16; +} + +/* sb and need may be evaluated multiple times. */ +#define sb_need(sb, need) do { \ + if ((sb)->end - (sb)->cur < (need)) \ + sb_grow(sb, need); \ + } while (0) + +static void sb_grow(SB *sb, int need) +{ + size_t length = sb->cur - sb->start; + size_t alloc = sb->end - sb->start; + + do { + alloc *= 2; + } while (alloc < length + need); + + sb->start = (char*) realloc(sb->start, alloc + 1); + if (sb->start == NULL) + out_of_memory(); + sb->cur = sb->start + length; + sb->end = sb->start + alloc; +} + +static void sb_put(SB *sb, const char *bytes, int count) +{ + sb_need(sb, count); + memcpy(sb->cur, bytes, count); + sb->cur += count; +} + +#define sb_putc(sb, c) do { \ + if ((sb)->cur >= (sb)->end) \ + sb_grow(sb, 1); \ + *(sb)->cur++ = (c); \ + } while (0) + +static void sb_puts(SB *sb, const char *str) +{ + sb_put(sb, str, strlen(str)); +} + +static char *sb_finish(SB *sb) +{ + *sb->cur = 0; + assert(sb->start <= sb->cur && strlen(sb->start) == (size_t)(sb->cur - sb->start)); + return sb->start; +} + +static void sb_free(SB *sb) +{ + free(sb->start); +} + +/* + * Unicode helper functions + * + * These are taken from the ccan/charset module and customized a bit. + * Putting them here means the compiler can (choose to) inline them, + * and it keeps ccan/json from having a dependency. + */ + +/* + * Type for Unicode codepoints. + * We need our own because wchar_t might be 16 bits. + */ +typedef uint32_t uchar_t; + +/* + * Validate a single UTF-8 character starting at @s. + * The string must be null-terminated. + * + * If it's valid, return its length (1 thru 4). + * If it's invalid or clipped, return 0. + * + * This function implements the syntax given in RFC3629, which is + * the same as that given in The Unicode Standard, Version 6.0. + * + * It has the following properties: + * + * * All codepoints U+0000..U+10FFFF may be encoded, + * except for U+D800..U+DFFF, which are reserved + * for UTF-16 surrogate pair encoding. + * * UTF-8 byte sequences longer than 4 bytes are not permitted, + * as they exceed the range of Unicode. + * * The sixty-six Unicode "non-characters" are permitted + * (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF). + */ +static int utf8_validate_cz(const char *s) +{ + unsigned char c = *s++; + + if (c <= 0x7F) { /* 00..7F */ + return 1; + } else if (c <= 0xC1) { /* 80..C1 */ + /* Disallow overlong 2-byte sequence. */ + return 0; + } else if (c <= 0xDF) { /* C2..DF */ + /* Make sure subsequent byte is in the range 0x80..0xBF. */ + if (((unsigned char)*s++ & 0xC0) != 0x80) + return 0; + + return 2; + } else if (c <= 0xEF) { /* E0..EF */ + /* Disallow overlong 3-byte sequence. */ + if (c == 0xE0 && (unsigned char)*s < 0xA0) + return 0; + + /* Disallow U+D800..U+DFFF. */ + if (c == 0xED && (unsigned char)*s > 0x9F) + return 0; + + /* Make sure subsequent bytes are in the range 0x80..0xBF. */ + if (((unsigned char)*s++ & 0xC0) != 0x80) + return 0; + if (((unsigned char)*s++ & 0xC0) != 0x80) + return 0; + + return 3; + } else if (c <= 0xF4) { /* F0..F4 */ + /* Disallow overlong 4-byte sequence. */ + if (c == 0xF0 && (unsigned char)*s < 0x90) + return 0; + + /* Disallow codepoints beyond U+10FFFF. */ + if (c == 0xF4 && (unsigned char)*s > 0x8F) + return 0; + + /* Make sure subsequent bytes are in the range 0x80..0xBF. */ + if (((unsigned char)*s++ & 0xC0) != 0x80) + return 0; + if (((unsigned char)*s++ & 0xC0) != 0x80) + return 0; + if (((unsigned char)*s++ & 0xC0) != 0x80) + return 0; + + return 4; + } else { /* F5..FF */ + return 0; + } +} + +/* Validate a null-terminated UTF-8 string. */ +static bool utf8_validate(const char *s) +{ + int len; + + for (; *s != 0; s += len) { + len = utf8_validate_cz(s); + if (len == 0) + return false; + } + + return true; +} + +/* + * Read a single UTF-8 character starting at @s, + * returning the length, in bytes, of the character read. + * + * This function assumes input is valid UTF-8, + * and that there are enough characters in front of @s. + */ +static int utf8_read_char(const char *s, uchar_t *out) +{ + const unsigned char *c = (const unsigned char*) s; + + assert(utf8_validate_cz(s)); + + if (c[0] <= 0x7F) { + /* 00..7F */ + *out = c[0]; + return 1; + } else if (c[0] <= 0xDF) { + /* C2..DF (unless input is invalid) */ + *out = ((uchar_t)c[0] & 0x1F) << 6 | + ((uchar_t)c[1] & 0x3F); + return 2; + } else if (c[0] <= 0xEF) { + /* E0..EF */ + *out = ((uchar_t)c[0] & 0xF) << 12 | + ((uchar_t)c[1] & 0x3F) << 6 | + ((uchar_t)c[2] & 0x3F); + return 3; + } else { + /* F0..F4 (unless input is invalid) */ + *out = ((uchar_t)c[0] & 0x7) << 18 | + ((uchar_t)c[1] & 0x3F) << 12 | + ((uchar_t)c[2] & 0x3F) << 6 | + ((uchar_t)c[3] & 0x3F); + return 4; + } +} + +/* + * Write a single UTF-8 character to @s, + * returning the length, in bytes, of the character written. + * + * @unicode must be U+0000..U+10FFFF, but not U+D800..U+DFFF. + * + * This function will write up to 4 bytes to @out. + */ +static int utf8_write_char(uchar_t unicode, char *out) +{ + unsigned char *o = (unsigned char*) out; + + assert(unicode <= 0x10FFFF && !(unicode >= 0xD800 && unicode <= 0xDFFF)); + + if (unicode <= 0x7F) { + /* U+0000..U+007F */ + *o++ = unicode; + return 1; + } else if (unicode <= 0x7FF) { + /* U+0080..U+07FF */ + *o++ = 0xC0 | unicode >> 6; + *o++ = 0x80 | (unicode & 0x3F); + return 2; + } else if (unicode <= 0xFFFF) { + /* U+0800..U+FFFF */ + *o++ = 0xE0 | unicode >> 12; + *o++ = 0x80 | (unicode >> 6 & 0x3F); + *o++ = 0x80 | (unicode & 0x3F); + return 3; + } else { + /* U+10000..U+10FFFF */ + *o++ = 0xF0 | unicode >> 18; + *o++ = 0x80 | (unicode >> 12 & 0x3F); + *o++ = 0x80 | (unicode >> 6 & 0x3F); + *o++ = 0x80 | (unicode & 0x3F); + return 4; + } +} + +/* + * Compute the Unicode codepoint of a UTF-16 surrogate pair. + * + * @uc should be 0xD800..0xDBFF, and @lc should be 0xDC00..0xDFFF. + * If they aren't, this function returns false. + */ +static bool from_surrogate_pair(uint16_t uc, uint16_t lc, uchar_t *unicode) +{ + if (uc >= 0xD800 && uc <= 0xDBFF && lc >= 0xDC00 && lc <= 0xDFFF) { + *unicode = 0x10000 + ((((uchar_t)uc & 0x3FF) << 10) | (lc & 0x3FF)); + return true; + } else { + return false; + } +} + +/* + * Construct a UTF-16 surrogate pair given a Unicode codepoint. + * + * @unicode must be U+10000..U+10FFFF. + */ +static void to_surrogate_pair(uchar_t unicode, uint16_t *uc, uint16_t *lc) +{ + uchar_t n; + + assert(unicode >= 0x10000 && unicode <= 0x10FFFF); + + n = unicode - 0x10000; + *uc = ((n >> 10) & 0x3FF) | 0xD800; + *lc = (n & 0x3FF) | 0xDC00; +} + +#define is_space(c) ((c) == '\t' || (c) == '\n' || (c) == '\r' || (c) == ' ') +#define is_digit(c) ((c) >= '0' && (c) <= '9') + +static bool parse_value (const char **sp, JsonNode **out); +static bool parse_string (const char **sp, char **out); +static bool parse_number (const char **sp, double *out); +static bool parse_array (const char **sp, JsonNode **out); +static bool parse_object (const char **sp, JsonNode **out); +static bool parse_hex16 (const char **sp, uint16_t *out); + +static bool expect_literal (const char **sp, const char *str); +static void skip_space (const char **sp); + +static void emit_value (SB *out, const JsonNode *node); +static void emit_value_indented (SB *out, const JsonNode *node, const char *space, int indent_level); +static void emit_string (SB *out, const char *str); +static void emit_number (SB *out, double num); +static void emit_array (SB *out, const JsonNode *array); +static void emit_array_indented (SB *out, const JsonNode *array, const char *space, int indent_level); +static void emit_object (SB *out, const JsonNode *object); +static void emit_object_indented (SB *out, const JsonNode *object, const char *space, int indent_level); + +static int write_hex16(char *out, uint16_t val); + +static JsonNode *mknode(JsonTag tag); +static void append_node(JsonNode *parent, JsonNode *child); +static void prepend_node(JsonNode *parent, JsonNode *child); +static void append_member(JsonNode *object, char *key, JsonNode *value); + +/* Assertion-friendly validity checks */ +static bool tag_is_valid(unsigned int tag); +static bool number_is_valid(const char *num); + +JsonNode *json_decode(const char *json) +{ + const char *s = json; + JsonNode *ret; + + skip_space(&s); + if (!parse_value(&s, &ret)) + return NULL; + + skip_space(&s); + if (*s != 0) { + json_delete(ret); + return NULL; + } + + return ret; +} + +char *json_encode(const JsonNode *node) +{ + return json_stringify(node, NULL); +} + +char *json_encode_string(const char *str) +{ + SB sb; + sb_init(&sb); + + emit_string(&sb, str); + + return sb_finish(&sb); +} + +char *json_stringify(const JsonNode *node, const char *space) +{ + SB sb; + sb_init(&sb); + + if (space != NULL) + emit_value_indented(&sb, node, space, 0); + else + emit_value(&sb, node); + + return sb_finish(&sb); +} + +void json_delete(JsonNode *node) +{ + if (node != NULL) { + json_remove_from_parent(node); + + switch (node->tag) { + case JSON_STRING: + free(node->string_); + break; + case JSON_ARRAY: + case JSON_OBJECT: + { + JsonNode *child, *next; + for (child = node->children.head; child != NULL; child = next) { + next = child->next; + json_delete(child); + } + break; + } + default:; + } + + free(node); + } +} + +bool json_validate(const char *json) +{ + const char *s = json; + + skip_space(&s); + if (!parse_value(&s, NULL)) + return false; + + skip_space(&s); + if (*s != 0) + return false; + + return true; +} + +JsonNode *json_find_element(JsonNode *array, int index) +{ + JsonNode *element; + int i = 0; + + if (array == NULL || array->tag != JSON_ARRAY) + return NULL; + + json_foreach(element, array) { + if (i == index) + return element; + i++; + } + + return NULL; +} + +JsonNode *json_find_member(JsonNode *object, const char *name) +{ + JsonNode *member; + + if (object == NULL || object->tag != JSON_OBJECT) + return NULL; + + json_foreach(member, object) + if (strcmp(member->key, name) == 0) + return member; + + return NULL; +} + +JsonNode *json_first_child(const JsonNode *node) +{ + if (node != NULL && (node->tag == JSON_ARRAY || node->tag == JSON_OBJECT)) + return node->children.head; + return NULL; +} + +static JsonNode *mknode(JsonTag tag) +{ + JsonNode *ret = (JsonNode*) calloc(1, sizeof(JsonNode)); + if (ret == NULL) + out_of_memory(); + ret->tag = tag; + return ret; +} + +JsonNode *json_mknull(void) +{ + return mknode(JSON_NULL); +} + +JsonNode *json_mkbool(bool b) +{ + JsonNode *ret = mknode(JSON_BOOL); + ret->bool_ = b; + return ret; +} + +static JsonNode *mkstring(char *s) +{ + JsonNode *ret = mknode(JSON_STRING); + ret->string_ = s; + return ret; +} + +JsonNode *json_mkstring(const char *s) +{ + return mkstring(json_strdup(s)); +} + +JsonNode *json_mknumber(double n) +{ + JsonNode *node = mknode(JSON_NUMBER); + node->number_ = n; + return node; +} + +JsonNode *json_mkarray(void) +{ + return mknode(JSON_ARRAY); +} + +JsonNode *json_mkobject(void) +{ + return mknode(JSON_OBJECT); +} + +static void append_node(JsonNode *parent, JsonNode *child) +{ + child->parent = parent; + child->prev = parent->children.tail; + child->next = NULL; + + if (parent->children.tail != NULL) + parent->children.tail->next = child; + else + parent->children.head = child; + parent->children.tail = child; +} + +static void prepend_node(JsonNode *parent, JsonNode *child) +{ + child->parent = parent; + child->prev = NULL; + child->next = parent->children.head; + + if (parent->children.head != NULL) + parent->children.head->prev = child; + else + parent->children.tail = child; + parent->children.head = child; +} + +static void append_member(JsonNode *object, char *key, JsonNode *value) +{ + value->key = key; + append_node(object, value); +} + +void json_append_element(JsonNode *array, JsonNode *element) +{ + assert(array->tag == JSON_ARRAY); + assert(element->parent == NULL); + + append_node(array, element); +} + +void json_prepend_element(JsonNode *array, JsonNode *element) +{ + assert(array->tag == JSON_ARRAY); + assert(element->parent == NULL); + + prepend_node(array, element); +} + +void json_append_member(JsonNode *object, const char *key, JsonNode *value) +{ + assert(object->tag == JSON_OBJECT); + assert(value->parent == NULL); + + append_member(object, json_strdup(key), value); +} + +void json_prepend_member(JsonNode *object, const char *key, JsonNode *value) +{ + assert(object->tag == JSON_OBJECT); + assert(value->parent == NULL); + + value->key = json_strdup(key); + prepend_node(object, value); +} + +void json_remove_from_parent(JsonNode *node) +{ + JsonNode *parent = node->parent; + + if (parent != NULL) { + if (node->prev != NULL) + node->prev->next = node->next; + else + parent->children.head = node->next; + if (node->next != NULL) + node->next->prev = node->prev; + else + parent->children.tail = node->prev; + + free(node->key); + + node->parent = NULL; + node->prev = node->next = NULL; + node->key = NULL; + } +} + +static bool parse_value(const char **sp, JsonNode **out) +{ + const char *s = *sp; + + switch (*s) { + case 'n': + if (expect_literal(&s, "null")) { + if (out) + *out = json_mknull(); + *sp = s; + return true; + } + return false; + + case 'f': + if (expect_literal(&s, "false")) { + if (out) + *out = json_mkbool(false); + *sp = s; + return true; + } + return false; + + case 't': + if (expect_literal(&s, "true")) { + if (out) + *out = json_mkbool(true); + *sp = s; + return true; + } + return false; + + case '"': { + char *str; + if (parse_string(&s, out ? &str : NULL)) { + if (out) + *out = mkstring(str); + *sp = s; + return true; + } + return false; + } + + case '[': + if (parse_array(&s, out)) { + *sp = s; + return true; + } + return false; + + case '{': + if (parse_object(&s, out)) { + *sp = s; + return true; + } + return false; + + default: { + double num; + if (parse_number(&s, out ? &num : NULL)) { + if (out) + *out = json_mknumber(num); + *sp = s; + return true; + } + return false; + } + } +} + +static bool parse_array(const char **sp, JsonNode **out) +{ + const char *s = *sp; + JsonNode *ret = out ? json_mkarray() : NULL; + JsonNode *element; + + if (*s++ != '[') + goto failure; + skip_space(&s); + + if (*s == ']') { + s++; + goto success; + } + + for (;;) { + if (!parse_value(&s, out ? &element : NULL)) + goto failure; + skip_space(&s); + + if (out) + json_append_element(ret, element); + + if (*s == ']') { + s++; + goto success; + } + + if (*s++ != ',') + goto failure; + skip_space(&s); + } + +success: + *sp = s; + if (out) + *out = ret; + return true; + +failure: + json_delete(ret); + return false; +} + +static bool parse_object(const char **sp, JsonNode **out) +{ + const char *s = *sp; + JsonNode *ret = out ? json_mkobject() : NULL; + char *key; + JsonNode *value; + + if (*s++ != '{') + goto failure; + skip_space(&s); + + if (*s == '}') { + s++; + goto success; + } + + for (;;) { + if (!parse_string(&s, out ? &key : NULL)) + goto failure; + skip_space(&s); + + if (*s++ != ':') + goto failure_free_key; + skip_space(&s); + + if (!parse_value(&s, out ? &value : NULL)) + goto failure_free_key; + skip_space(&s); + + if (out) + append_member(ret, key, value); + + if (*s == '}') { + s++; + goto success; + } + + if (*s++ != ',') + goto failure; + skip_space(&s); + } + +success: + *sp = s; + if (out) + *out = ret; + return true; + +failure_free_key: + if (out) + free(key); +failure: + json_delete(ret); + return false; +} + +bool parse_string(const char **sp, char **out) +{ + const char *s = *sp; + SB sb = { NULL, NULL, NULL }; + char throwaway_buffer[4]; + /* enough space for a UTF-8 character */ + char *b; + + if (*s++ != '"') + return false; + + if (out) { + sb_init(&sb); + sb_need(&sb, 4); + b = sb.cur; + } else { + b = throwaway_buffer; + } + + while (*s != '"') { + unsigned char c = *s++; + + /* Parse next character, and write it to b. */ + if (c == '\\') { + c = *s++; + switch (c) { + case '"': + case '\\': + case '/': + *b++ = c; + break; + case 'b': + *b++ = '\b'; + break; + case 'f': + *b++ = '\f'; + break; + case 'n': + *b++ = '\n'; + break; + case 'r': + *b++ = '\r'; + break; + case 't': + *b++ = '\t'; + break; + case 'u': + { + uint16_t uc, lc; + uchar_t unicode; + + if (!parse_hex16(&s, &uc)) + goto failed; + + if (uc >= 0xD800 && uc <= 0xDFFF) { + /* Handle UTF-16 surrogate pair. */ + if (*s++ != '\\' || *s++ != 'u' || !parse_hex16(&s, &lc)) + goto failed; /* Incomplete surrogate pair. */ + if (!from_surrogate_pair(uc, lc, &unicode)) + goto failed; /* Invalid surrogate pair. */ + } else if (uc == 0) { + /* Disallow "\u0000". */ + goto failed; + } else { + unicode = uc; + } + + b += utf8_write_char(unicode, b); + break; + } + default: + /* Invalid escape */ + goto failed; + } + } else if (c <= 0x1F) { + /* Control characters are not allowed in string literals. */ + goto failed; + } else { + /* Validate and echo a UTF-8 character. */ + int len; + + s--; + len = utf8_validate_cz(s); + if (len == 0) + goto failed; /* Invalid UTF-8 character. */ + + while (len--) + *b++ = *s++; + } + + /* + * Update sb to know about the new bytes, + * and set up b to write another character. + */ + if (out) { + sb.cur = b; + sb_need(&sb, 4); + b = sb.cur; + } else { + b = throwaway_buffer; + } + } + s++; + + if (out) + *out = sb_finish(&sb); + *sp = s; + return true; + +failed: + if (out) + sb_free(&sb); + return false; +} + +/* + * The JSON spec says that a number shall follow this precise pattern + * (spaces and quotes added for readability): + * '-'? (0 | [1-9][0-9]*) ('.' [0-9]+)? ([Ee] [+-]? [0-9]+)? + * + * However, some JSON parsers are more liberal. For instance, PHP accepts + * '.5' and '1.'. JSON.parse accepts '+3'. + * + * This function takes the strict approach. + */ +bool parse_number(const char **sp, double *out) +{ + const char *s = *sp; + + /* '-'? */ + if (*s == '-') + s++; + + /* (0 | [1-9][0-9]*) */ + if (*s == '0') { + s++; + } else { + if (!is_digit(*s)) + return false; + do { + s++; + } while (is_digit(*s)); + } + + /* ('.' [0-9]+)? */ + if (*s == '.') { + s++; + if (!is_digit(*s)) + return false; + do { + s++; + } while (is_digit(*s)); + } + + /* ([Ee] [+-]? [0-9]+)? */ + if (*s == 'E' || *s == 'e') { + s++; + if (*s == '+' || *s == '-') + s++; + if (!is_digit(*s)) + return false; + do { + s++; + } while (is_digit(*s)); + } + + if (out) + *out = strtod(*sp, NULL); + + *sp = s; + return true; +} + +static void skip_space(const char **sp) +{ + const char *s = *sp; + while (is_space(*s)) + s++; + *sp = s; +} + +static void emit_value(SB *out, const JsonNode *node) +{ + assert(tag_is_valid(node->tag)); + switch (node->tag) { + case JSON_NULL: + sb_puts(out, "null"); + break; + case JSON_BOOL: + sb_puts(out, node->bool_ ? "true" : "false"); + break; + case JSON_STRING: + emit_string(out, node->string_); + break; + case JSON_NUMBER: + emit_number(out, node->number_); + break; + case JSON_ARRAY: + emit_array(out, node); + break; + case JSON_OBJECT: + emit_object(out, node); + break; + default: + assert(false); + } +} + +void emit_value_indented(SB *out, const JsonNode *node, const char *space, int indent_level) +{ + assert(tag_is_valid(node->tag)); + switch (node->tag) { + case JSON_NULL: + sb_puts(out, "null"); + break; + case JSON_BOOL: + sb_puts(out, node->bool_ ? "true" : "false"); + break; + case JSON_STRING: + emit_string(out, node->string_); + break; + case JSON_NUMBER: + emit_number(out, node->number_); + break; + case JSON_ARRAY: + emit_array_indented(out, node, space, indent_level); + break; + case JSON_OBJECT: + emit_object_indented(out, node, space, indent_level); + break; + default: + assert(false); + } +} + +static void emit_array(SB *out, const JsonNode *array) +{ + const JsonNode *element; + + sb_putc(out, '['); + json_foreach(element, array) { + emit_value(out, element); + if (element->next != NULL) + sb_putc(out, ','); + } + sb_putc(out, ']'); +} + +static void emit_array_indented(SB *out, const JsonNode *array, const char *space, int indent_level) +{ + const JsonNode *element = array->children.head; + int i; + + if (element == NULL) { + sb_puts(out, "[]"); + return; + } + + sb_puts(out, "[\n"); + while (element != NULL) { + for (i = 0; i < indent_level + 1; i++) + sb_puts(out, space); + emit_value_indented(out, element, space, indent_level + 1); + + element = element->next; + sb_puts(out, element != NULL ? ",\n" : "\n"); + } + for (i = 0; i < indent_level; i++) + sb_puts(out, space); + sb_putc(out, ']'); +} + +static void emit_object(SB *out, const JsonNode *object) +{ + const JsonNode *member; + + sb_putc(out, '{'); + json_foreach(member, object) { + emit_string(out, member->key); + sb_putc(out, ':'); + emit_value(out, member); + if (member->next != NULL) + sb_putc(out, ','); + } + sb_putc(out, '}'); +} + +static void emit_object_indented(SB *out, const JsonNode *object, const char *space, int indent_level) +{ + const JsonNode *member = object->children.head; + int i; + + if (member == NULL) { + sb_puts(out, "{}"); + return; + } + + sb_puts(out, "{\n"); + while (member != NULL) { + for (i = 0; i < indent_level + 1; i++) + sb_puts(out, space); + emit_string(out, member->key); + sb_puts(out, ": "); + emit_value_indented(out, member, space, indent_level + 1); + + member = member->next; + sb_puts(out, member != NULL ? ",\n" : "\n"); + } + for (i = 0; i < indent_level; i++) + sb_puts(out, space); + sb_putc(out, '}'); +} + +void emit_string(SB *out, const char *str) +{ + bool escape_unicode = false; + const char *s = str; + char *b; + + assert(utf8_validate(str)); + + /* + * 14 bytes is enough space to write up to two + * \uXXXX escapes and two quotation marks. + */ + sb_need(out, 14); + b = out->cur; + + *b++ = '"'; + while (*s != 0) { + unsigned char c = *s++; + + /* Encode the next character, and write it to b. */ + switch (c) { + case '"': + *b++ = '\\'; + *b++ = '"'; + break; + case '\\': + *b++ = '\\'; + *b++ = '\\'; + break; + case '\b': + *b++ = '\\'; + *b++ = 'b'; + break; + case '\f': + *b++ = '\\'; + *b++ = 'f'; + break; + case '\n': + *b++ = '\\'; + *b++ = 'n'; + break; + case '\r': + *b++ = '\\'; + *b++ = 'r'; + break; + case '\t': + *b++ = '\\'; + *b++ = 't'; + break; + default: { + int len; + + s--; + len = utf8_validate_cz(s); + + if (len == 0) { + /* + * Handle invalid UTF-8 character gracefully in production + * by writing a replacement character (U+FFFD) + * and skipping a single byte. + * + * This should never happen when assertions are enabled + * due to the assertion at the beginning of this function. + */ + assert(false); + if (escape_unicode) { + strcpy(b, "\\uFFFD"); + b += 6; + } else { + *b++ = (char)0xEF; + *b++ = (char)0xBF; + *b++ = (char)0xBD; + } + s++; + } else if (c < 0x1F || (c >= 0x80 && escape_unicode)) { + /* Encode using \u.... */ + uint32_t unicode; + + s += utf8_read_char(s, &unicode); + + if (unicode <= 0xFFFF) { + *b++ = '\\'; + *b++ = 'u'; + b += write_hex16(b, unicode); + } else { + /* Produce a surrogate pair. */ + uint16_t uc, lc; + assert(unicode <= 0x10FFFF); + to_surrogate_pair(unicode, &uc, &lc); + *b++ = '\\'; + *b++ = 'u'; + b += write_hex16(b, uc); + *b++ = '\\'; + *b++ = 'u'; + b += write_hex16(b, lc); + } + } else { + /* Write the character directly. */ + while (len--) + *b++ = *s++; + } + + break; + } + } + + /* + * Update *out to know about the new bytes, + * and set up b to write another encoded character. + */ + out->cur = b; + sb_need(out, 14); + b = out->cur; + } + *b++ = '"'; + + out->cur = b; +} + +static void emit_number(SB *out, double num) +{ + /* + * This isn't exactly how JavaScript renders numbers, + * but it should produce valid JSON for reasonable numbers + * preserve precision well enough, and avoid some oddities + * like 0.3 -> 0.299999999999999988898 . + */ + char buf[64]; + sprintf(buf, "%.16g", num); + + if (number_is_valid(buf)) + sb_puts(out, buf); + else + sb_puts(out, "null"); +} + +static bool tag_is_valid(unsigned int tag) +{ + return (/* tag >= JSON_NULL && */ tag <= JSON_OBJECT); +} + +static bool number_is_valid(const char *num) +{ + return (parse_number(&num, NULL) && *num == '\0'); +} + +static bool expect_literal(const char **sp, const char *str) +{ + const char *s = *sp; + + while (*str != '\0') + if (*s++ != *str++) + return false; + + *sp = s; + return true; +} + +/* + * Parses exactly 4 hex characters (capital or lowercase). + * Fails if any input chars are not [0-9A-Fa-f]. + */ +static bool parse_hex16(const char **sp, uint16_t *out) +{ + const char *s = *sp; + uint16_t ret = 0; + uint16_t i; + uint16_t tmp; + char c; + + for (i = 0; i < 4; i++) { + c = *s++; + if (c >= '0' && c <= '9') + tmp = c - '0'; + else if (c >= 'A' && c <= 'F') + tmp = c - 'A' + 10; + else if (c >= 'a' && c <= 'f') + tmp = c - 'a' + 10; + else + return false; + + ret <<= 4; + ret += tmp; + } + + if (out) + *out = ret; + *sp = s; + return true; +} + +/* + * Encodes a 16-bit number into hexadecimal, + * writing exactly 4 hex chars. + */ +static int write_hex16(char *out, uint16_t val) +{ + const char *hex = "0123456789ABCDEF"; + + *out++ = hex[(val >> 12) & 0xF]; + *out++ = hex[(val >> 8) & 0xF]; + *out++ = hex[(val >> 4) & 0xF]; + *out++ = hex[ val & 0xF]; + + return 4; +} + +bool json_check(const JsonNode *node, char errmsg[256]) +{ + #define problem(...) do { \ + if (errmsg != NULL) \ + snprintf(errmsg, 256, __VA_ARGS__); \ + return false; \ + } while (0) + + if (node->key != NULL && !utf8_validate(node->key)) + problem("key contains invalid UTF-8"); + + if (!tag_is_valid(node->tag)) + problem("tag is invalid (%u)", node->tag); + + if (node->tag == JSON_BOOL) { + if (node->bool_ != false && node->bool_ != true) + problem("bool_ is neither false (%d) nor true (%d)", (int)false, (int)true); + } else if (node->tag == JSON_STRING) { + if (node->string_ == NULL) + problem("string_ is NULL"); + if (!utf8_validate(node->string_)) + problem("string_ contains invalid UTF-8"); + } else if (node->tag == JSON_ARRAY || node->tag == JSON_OBJECT) { + JsonNode *head = node->children.head; + JsonNode *tail = node->children.tail; + + if (head == NULL || tail == NULL) { + if (head != NULL) + problem("tail is NULL, but head is not"); + if (tail != NULL) + problem("head is NULL, but tail is not"); + } else { + JsonNode *child; + JsonNode *last = NULL; + + if (head->prev != NULL) + problem("First child's prev pointer is not NULL"); + + for (child = head; child != NULL; last = child, child = child->next) { + if (child == node) + problem("node is its own child"); + if (child->next == child) + problem("child->next == child (cycle)"); + if (child->next == head) + problem("child->next == head (cycle)"); + + if (child->parent != node) + problem("child does not point back to parent"); + if (child->next != NULL && child->next->prev != child) + problem("child->next does not point back to child"); + + if (node->tag == JSON_ARRAY && child->key != NULL) + problem("Array element's key is not NULL"); + if (node->tag == JSON_OBJECT && child->key == NULL) + problem("Object member's key is NULL"); + + if (!json_check(child, errmsg)) + return false; + } + + if (last != tail) + problem("tail does not match pointer found by starting at head and following next links"); + } + } + + return true; + + #undef problem +} diff --git a/contrib/ccan/json/json.h b/contrib/ccan/json/json.h new file mode 100644 index 0000000..ed5255e --- /dev/null +++ b/contrib/ccan/json/json.h @@ -0,0 +1,117 @@ +/* + Copyright (C) 2011 Joseph A. Adams (joeyadams3.14159@gmail.com) + All rights reserved. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#ifndef CCAN_JSON_H +#define CCAN_JSON_H + +#include <stdbool.h> +#include <stddef.h> + +typedef enum { + JSON_NULL, + JSON_BOOL, + JSON_STRING, + JSON_NUMBER, + JSON_ARRAY, + JSON_OBJECT, +} JsonTag; + +typedef struct JsonNode JsonNode; + +struct JsonNode +{ + /* only if parent is an object or array (NULL otherwise) */ + JsonNode *parent; + JsonNode *prev, *next; + + /* only if parent is an object (NULL otherwise) */ + char *key; /* Must be valid UTF-8. */ + + JsonTag tag; + union { + /* JSON_BOOL */ + bool bool_; + + /* JSON_STRING */ + char *string_; /* Must be valid UTF-8. */ + + /* JSON_NUMBER */ + double number_; + + /* JSON_ARRAY */ + /* JSON_OBJECT */ + struct { + JsonNode *head, *tail; + } children; + }; +}; + +/*** Encoding, decoding, and validation ***/ + +JsonNode *json_decode (const char *json); +char *json_encode (const JsonNode *node); +char *json_encode_string (const char *str); +char *json_stringify (const JsonNode *node, const char *space); +void json_delete (JsonNode *node); + +bool json_validate (const char *json); + +/*** Lookup and traversal ***/ + +JsonNode *json_find_element (JsonNode *array, int index); +JsonNode *json_find_member (JsonNode *object, const char *key); + +JsonNode *json_first_child (const JsonNode *node); + +#define json_foreach(i, object_or_array) \ + for ((i) = json_first_child(object_or_array); \ + (i) != NULL; \ + (i) = (i)->next) + +/*** Construction and manipulation ***/ + +JsonNode *json_mknull(void); +JsonNode *json_mkbool(bool b); +JsonNode *json_mkstring(const char *s); +JsonNode *json_mknumber(double n); +JsonNode *json_mkarray(void); +JsonNode *json_mkobject(void); + +void json_append_element(JsonNode *array, JsonNode *element); +void json_prepend_element(JsonNode *array, JsonNode *element); +void json_append_member(JsonNode *object, const char *key, JsonNode *value); +void json_prepend_member(JsonNode *object, const char *key, JsonNode *value); + +void json_remove_from_parent(JsonNode *node); + +/*** Debugging ***/ + +/* + * Look for structure and encoding problems in a JsonNode or its descendents. + * + * If a problem is detected, return false, writing a description of the problem + * to errmsg (unless errmsg is NULL). + */ +bool json_check(const JsonNode *node, char errmsg[256]); + +#endif diff --git a/contrib/ccan/json/test/common.h b/contrib/ccan/json/test/common.h new file mode 100644 index 0000000..328cb73 --- /dev/null +++ b/contrib/ccan/json/test/common.h @@ -0,0 +1,18 @@ +#include <ccan/json/json.c> +#include <ccan/tap/tap.h> + +#include <errno.h> +#include <string.h> + +static char *chomp(char *s) +{ + char *e; + + if (s == NULL || *s == 0) + return s; + + e = strchr(s, 0); + if (e[-1] == '\n') + *--e = 0; + return s; +} diff --git a/contrib/ccan/json/test/run-construction.c b/contrib/ccan/json/test/run-construction.c new file mode 100644 index 0000000..cc9a395 --- /dev/null +++ b/contrib/ccan/json/test/run-construction.c @@ -0,0 +1,191 @@ +/* Build a list of numbers with various appends and prepends, verify them by testing against their encoded value, do pointer consistency checks each time, do element lookups, and remove items as well. */ + +#include "common.h" + +#define should_be(var, expected) should_be_(var, #var, expected) + +static void should_be_(const JsonNode *node, const char *name, const char *expected) +{ + char errmsg[256]; + char *encoded; + + if (!json_check(node, errmsg)) { + fail("Invariants check failed: %s", errmsg); + return; + } + + encoded = json_encode(node); + + if (strcmp(encoded, expected) == 0) + pass("%s is %s", name, expected); + else + fail("%s should be %s, but is actually %s", name, expected, encoded); + + free(encoded); +} + +static void test_string(void) +{ + JsonNode *str; + + str = json_mkstring("Hello\tworld!\n\001"); + should_be(str, "\"Hello\\tworld!\\n\\u0001\""); + json_delete(str); + + str = json_mkstring("\"\\\b\f\n\r\t"); + should_be(str, "\"\\\"\\\\\\b\\f\\n\\r\\t\""); + json_delete(str); +} + +static void test_number(void) +{ + JsonNode *num; + + num = json_mknumber(5678901234.0); + should_be(num, "5678901234"); + json_delete(num); + + num = json_mknumber(-5678901234.0); + should_be(num, "-5678901234"); + json_delete(num); + + num = json_mknumber(0.0 / 0.0); + should_be(num, "null"); + json_delete(num); +} + +static void test_array(void) +{ + JsonNode *array; + JsonNode *children[5 + 1]; + + array = json_mkarray(); + should_be(array, "[]"); + + children[1] = json_mknumber(1); + children[2] = json_mknumber(2); + children[3] = json_mknumber(3); + children[4] = json_mknumber(4); + children[5] = json_mknumber(5); + + json_append_element(array, children[3]); + should_be(array, "[3]"); + + json_remove_from_parent(children[3]); + should_be(array, "[]"); + + json_prepend_element(array, children[3]); + should_be(array, "[3]"); + + json_prepend_element(array, children[2]); + should_be(array, "[2,3]"); + + json_append_element(array, children[4]); + should_be(array, "[2,3,4]"); + + json_delete(children[3]); + should_be(array, "[2,4]"); + + json_prepend_element(array, children[1]); + should_be(array, "[1,2,4]"); + + json_delete(children[1]); + should_be(array, "[2,4]"); + + json_delete(children[4]); + should_be(array, "[2]"); + + ok1(json_find_element(array, 0) == children[2]); + ok1(json_find_element(array, -1) == NULL); + ok1(json_find_element(array, 1) == NULL); + + json_append_element(array, children[5]); + should_be(array, "[2,5]"); + + ok1(json_find_element(array, 0) == children[2]); + ok1(json_find_element(array, 1) == children[5]); + ok1(json_find_element(array, -1) == NULL); + ok1(json_find_element(array, 2) == NULL); + + json_delete(children[2]); + json_delete(children[5]); + should_be(array, "[]"); + + ok1(json_find_element(array, -1) == NULL); + ok1(json_find_element(array, 0) == NULL); + ok1(json_find_element(array, 1) == NULL); + + json_delete(array); +} + +static void test_object(void) +{ + JsonNode *object; + JsonNode *children[5 + 1]; + + object = json_mkobject(); + should_be(object, "{}"); + + children[1] = json_mknumber(1); + children[2] = json_mknumber(2); + children[3] = json_mknumber(3); + + ok1(json_find_member(object, "one") == NULL); + ok1(json_find_member(object, "two") == NULL); + ok1(json_find_member(object, "three") == NULL); + + json_append_member(object, "one", children[1]); + should_be(object, "{\"one\":1}"); + + ok1(json_find_member(object, "one") == children[1]); + ok1(json_find_member(object, "two") == NULL); + ok1(json_find_member(object, "three") == NULL); + + json_prepend_member(object, "two", children[2]); + should_be(object, "{\"two\":2,\"one\":1}"); + + ok1(json_find_member(object, "one") == children[1]); + ok1(json_find_member(object, "two") == children[2]); + ok1(json_find_member(object, "three") == NULL); + + json_append_member(object, "three", children[3]); + should_be(object, "{\"two\":2,\"one\":1,\"three\":3}"); + + ok1(json_find_member(object, "one") == children[1]); + ok1(json_find_member(object, "two") == children[2]); + ok1(json_find_member(object, "three") == children[3]); + + json_delete(object); +} + +int main(void) +{ + JsonNode *node; + + (void) chomp; + + plan_tests(49); + + ok1(json_find_element(NULL, 0) == NULL); + ok1(json_find_member(NULL, "") == NULL); + ok1(json_first_child(NULL) == NULL); + + node = json_mknull(); + should_be(node, "null"); + json_delete(node); + + node = json_mkbool(false); + should_be(node, "false"); + json_delete(node); + + node = json_mkbool(true); + should_be(node, "true"); + json_delete(node); + + test_string(); + test_number(); + test_array(); + test_object(); + + return exit_status(); +} diff --git a/contrib/ccan/json/test/run-decode-encode.c b/contrib/ccan/json/test/run-decode-encode.c new file mode 100644 index 0000000..6bdf7c3 --- /dev/null +++ b/contrib/ccan/json/test/run-decode-encode.c @@ -0,0 +1,77 @@ +#include "common.h" + +int main(void) +{ + const char *strings_file = "test/test-strings"; + const char *strings_reencoded_file = "test/test-strings-reencoded"; + FILE *f, *f2; + char buffer[1024], buffer2[1024]; + + plan_tests(90); + + f = fopen(strings_file, "rb"); + if (f == NULL) { + diag("Could not open %s: %s", strings_file, strerror(errno)); + return 1; + } + f2 = fopen(strings_reencoded_file, "rb"); + if (f2 == NULL) { + diag("Could not open %s: %s", strings_reencoded_file, strerror(errno)); + return 1; + } + + while (fgets(buffer, sizeof(buffer), f)) { + const char *s = chomp(buffer); + bool valid; + JsonNode *node; + + if (expect_literal(&s, "valid ")) { + valid = true; + } else if (expect_literal(&s, "invalid ")) { + valid = false; + } else { + fail("Invalid line in test-strings: %s", buffer); + continue; + } + + node = json_decode(s); + + if (valid) { + char *reencoded; + char errmsg[256]; + + if (node == NULL) { + fail("%s is valid, but json_decode returned NULL", s); + continue; + } + + if (!json_check(node, errmsg)) { + fail("Corrupt tree produced by json_decode: %s", errmsg); + continue; + } + + reencoded = json_encode(node); + + if (!fgets(buffer2, sizeof(buffer2), f2)) { + fail("test-strings-reencoded is missing this line: %s", reencoded); + continue; + } + chomp(buffer2); + + ok(strcmp(reencoded, buffer2) == 0, "re-encode %s -> %s", s, reencoded); + + free(reencoded); + json_delete(node); + } else if (node != NULL) { + fail("%s is invalid, but json_decode returned non-NULL", s); + continue; + } + } + + if (ferror(f) || fclose(f) != 0 || ferror(f2) || fclose(f2) != 0) { + diag("I/O error reading test data."); + return 1; + } + + return exit_status(); +} diff --git a/contrib/ccan/json/test/run-stringify.c b/contrib/ccan/json/test/run-stringify.c new file mode 100644 index 0000000..3a4cb73 --- /dev/null +++ b/contrib/ccan/json/test/run-stringify.c @@ -0,0 +1,108 @@ +#include "common.h" + +static char buf1[256], buf2[256]; + +/* Used for pass and fail messages */ +static char *quote_string(const char *str, char buf[256]) +{ + char *out = buf; + + *out++ = '"'; + for (; *str != 0; str++) { + if (out - buf > 256 - 5) { + /* String is too long. End it with `...' */ + out = buf + 256 - 5; + *out++ = '.'; + *out++ = '.'; + *out++ = '.'; + break; + } + switch (*str) { + case '\t': + *out++ = '\\'; + *out++ = 't'; + break; + case '\n': + *out++ = '\\'; + *out++ = 'n'; + break; + case '"': + *out++ = '\\'; + *out++ = '"'; + break; + case '\\': + *out++ = '\\'; + *out++ = '\\'; + break; + default: + *out++ = *str; + break; + } + } + *out++ = '"'; + + *out = 0; + return buf; +} + +static void test_stringify(const char *input, const char *expected) +{ + JsonNode *node = NULL; + char *enc = NULL; + char *strn = NULL; + char *str = NULL; + + node = json_decode(input); + if (node == NULL) { + fail("Failed to decode %s", input); + goto end; + } + + enc = json_encode(node); + if (strcmp(enc, input) != 0) { + fail("%s re-encodes to %s. Either encode/decode is broken, or the input string needs to be normalized", input, enc); + goto end; + } + + strn = json_stringify(node, NULL); + if (strcmp(strn, enc) != 0) { + fail("json_stringify with NULL space produced a different string than json_encode"); + goto end; + } + + str = json_stringify(node, "\t"); + if (strcmp(str, expected) != 0) { + fail("Expected %s, but json_stringify produced %s", + quote_string(expected, buf1), quote_string(str, buf2)); + goto end; + } + + pass("stringify %s", input); + +end: + json_delete(node); + free(enc); + free(strn); + free(str); +} + +int main(void) +{ + (void) chomp; + + plan_tests(9); + + test_stringify("[]", "[]"); + test_stringify("[1]", "[\n\t1\n]"); + test_stringify("[1,2,3]", "[\n\t1,\n\t2,\n\t3\n]"); + test_stringify("[[]]", "[\n\t[]\n]"); + test_stringify("[[1,2],[3,4]]", "[\n\t[\n\t\t1,\n\t\t2\n\t],\n\t[\n\t\t3,\n\t\t4\n\t]\n]"); + + test_stringify("{}", "{}"); + test_stringify("{\"one\":1}", "{\n\t\"one\": 1\n}"); + test_stringify("{\"one\":1,\"t*\":[2,3,10]}", "{\n\t\"one\": 1,\n\t\"t*\": [\n\t\t2,\n\t\t3,\n\t\t10\n\t]\n}"); + test_stringify("{\"a\":{\"1\":1,\"2\":2},\"b\":{\"3\":[null,false,true,\"\\f\"]}}", + "{\n\t\"a\": {\n\t\t\"1\": 1,\n\t\t\"2\": 2\n\t},\n\t\"b\": {\n\t\t\"3\": [\n\t\t\tnull,\n\t\t\tfalse,\n\t\t\ttrue,\n\t\t\t\"\\f\"\n\t\t]\n\t}\n}"); + + return exit_status(); +} diff --git a/contrib/ccan/json/test/run-validate.c b/contrib/ccan/json/test/run-validate.c new file mode 100644 index 0000000..f7bb3b0 --- /dev/null +++ b/contrib/ccan/json/test/run-validate.c @@ -0,0 +1,49 @@ +#include "common.h" + +int main(void) +{ + const char *strings_file = "test/test-strings"; + FILE *f; + char buffer[1024]; + + plan_tests(224); + + f = fopen(strings_file, "rb"); + if (f == NULL) { + diag("Could not open %s: %s", strings_file, strerror(errno)); + return 1; + } + + while (fgets(buffer, sizeof(buffer), f)) { + const char *s = chomp(buffer); + bool valid; + + if (expect_literal(&s, "valid ")) { + valid = true; + } else if (expect_literal(&s, "invalid ")) { + valid = false; + } else { + fail("Invalid line in test-strings: %s", buffer); + continue; + } + + if (strcmp(s, "\"1\\u2\"") == 0) + puts("here"); + + if (json_validate(s) == valid) { + pass("%s %s", valid ? "valid" : "invalid", s); + } else { + fail("%s is %s, but json_validate returned %s", + s, + valid ? "valid" : "invalid", + valid ? "false" : "true"); + } + } + + if (ferror(f) || fclose(f) != 0) { + diag("I/O error reading test strings."); + return 1; + } + + return exit_status(); +} diff --git a/contrib/ccan/json/test/test-strings b/contrib/ccan/json/test/test-strings new file mode 100644 index 0000000..439be7d --- /dev/null +++ b/contrib/ccan/json/test/test-strings @@ -0,0 +1,224 @@ +invalid +invalid +invalid " +invalid [,] +invalid [) +invalid []] +invalid [} +invalid {,} +invalid {] +invalid ["1":2] +invalid [1,2,] +invalid [1:2} +invalid {"1":2,} +invalid {1:2} +invalid {"1":2, "2.5" : [3, 4, {}, {"5": ["6"], [7 ]}]} +invalid {"1":2, "2.5" : [3, 4, {}, {"5": ["6"], [7]}]} +invalid {"1":2, "2.5" : [3, 4, {}, {"5": ["6"], "7" :[8 ]}] +invalid {"1":2, "2.5" : [3, 4, {}, {"5": ["6"], "7" :[8 ]}]] +invalid {"1":2, "3":4 +invalid "1\u2" +invalid [,2] +invalid "3 +invalid "3" "4" +invalid [3[4] +invalid [3[4]] +invalid [3, [4, [5], 6] 7, 8 9] +invalid [3, [4, [5], 6] 7, 8, 9] +invalid [3, [4, [5], 6], 7, 8 9] +invalid {"hello":true, "bye":false, null} +invalid {"hello":true, "bye":false, null:null} +invalid "hi +invalid "hi""" +invalid {"hi": "bye"] +invalid "\uD800\uD800" +invalid "\uD800\uDBFF" +invalid "\UD834\UDD1E" +invalid "\uDB00" +invalid "\uDB00\uDBFF" +valid "\uFFFE" +valid "\uFFFF" +invalid . +valid "" +valid [] +valid {} +invalid +. +valid 0.5 +invalid 0.e1 +valid {"1":{}} +valid {"1":2} +valid {"1":2, "2.5" : [3, 4, {}, {"5": ["6"]}]} +valid {"1":2, "2.5" : [3, 4, {}, {"5": ["6"], "7" :[8 ]}]} +valid 1234 +valid -1234 +valid {"1":2, "3":4} +invalid +1234 +invalid ++1234 +valid 123.456e142 +valid 123.456e-142 +valid 123.456e+142 +invalid 123.e-142 +valid "1\u2000" +valid "1\u20001" +valid 2 +invalid .246e-142 +invalid .2e-142 +valid 3 +invalid .3 +valid "3" +valid [3] +invalid +3. +valid 3.2e+1 +valid [3, [4]] +valid [3, [4, [5]]] +valid [3, [4, [5], 6]] +valid [3, [4, [5], 6], 7] +valid [3, [4, [5], 6], 7, 8] +valid [3, [4, [5], 6], 7, 8, 9] +invalid +3.5 +invalid .3e +invalid .3e1 +invalid .3e-1 +invalid .3e+1 +invalid 3.e1 +invalid 3.e+1 +valid 3e+1 +invalid .5 +invalid +.5 +invalid .5e+1 +valid [ 7] +valid [7 ] +valid [7] +invalid .e-14234 +valid "hello" +valid ["hello"] +valid ["hello", "bye"] +valid ["hello", "bye\n"] +valid ["hello", "bye\n\r\t"] +valid ["hello", "bye\n\r\t\b"] +valid ["hello", "bye\n\r\t\b",true] +valid ["hello", "bye\n\r\t\b",true , false] +valid ["hello", "bye\n\r\t\b",true , false, null] +invalid ["hello", "bye\n\r\t\v"] +valid {"hello":true} +valid {"hello":true, "bye":false} +valid {"hello":true, "bye":false, "foo":["one","two","three"]} +valid "hi" +valid ["hi"] +valid ["hi", "bye"] +valid {"hi": "bye"} +valid ["hi", "bye", 3] +valid ["hi", "bye[", 3] +valid "\u0007" +valid "\u0008" +valid "\u0009" +valid "\u0010" +valid "\u0020" +valid "\u10000" +valid "\u1234" +valid "\u99999" +valid "\ud800\udc00" +valid "\uD800\uDC00" +valid "\uD834\uDD1E" +valid "\uDBFF\uDFFF" +valid "\uFFFD" +valid "\uFFFF" +invalid hello +valid [32, 1] +invalid [32, +valid "\uD800\uDC00" +valid "\n" +valid "hello" +valid "hello\u0009world" +valid "hello" +valid "hello\n" +valid "hello" +valid 3 +invalid 3. +invalid .3 +valid 0.3 +invalid 0.3e +invalid 0.3e+ +valid 0.3e+5 +valid 0.3e-5 +valid 0.3e5 +valid "hello" +invalid +3 +valid -3 +invalid -3. +valid -3.1 +invalid .5 +invalid 5. +invalid 5.e1 +valid 0.5 +invalid .3e1 +invalid .3e+1 +invalid .3e-1 +invalid .3e-1 .5 +invalid .3e-1.5 +invalid .3e+1.5 +invalid .3e+. +invalid .3e+.5 +invalid .3e+1.5 +invalid 9.3e+1.5 +invalid 9.e+1.5 +invalid 9.e+ +invalid 9.e+1 +valid "\"" +valid "\"3.5" +valid "\"." +invalid "\".". +valid "\"....." +invalid "\"\"\"\""" +invalid ["\"\"\"\"", .5] +invalid [.5] +valid ["\"\"\"\"", 0.5] +invalid ["\"\"\"\"", .5] +invalid ["\"\"\"\"",.5] +invalid ["\"",.5] +invalid ["\".5",.5] +invalid ["\".5",".5\"".5] +invalid ["\".5",".5\"", .5] +invalid ["\".5",".5\"",.5] +valid ["\".5",".5\"",0.5] +invalid {"key":/*comment*/"value"} +invalid {"key":/*comment"value"} +invalid {"key":"value"}/* +invalid {"key":"value"}/**/ +invalid {"key":"value"}/***/ +invalid {"key":"value"}/**// +invalid {"key":"value"}/**/// +invalid {"key":"value"}/**///---- +invalid {"key":"value"}# +invalid {"key":"value"}#{ +invalid {"key":"value"}#{} +invalid {"key":"value"}#, +invalid {"key":"value"/**/, "k2":"v2"} +valid "\u0027" +invalid "hello\'" +invalid 'hello\'' +invalid 'hello' +invalid 'hell\'o' +invalid '\'hello' +invalid '\'hello\'' +invalid \'hello\' +invalid 'hello\' +invalid ['hello\'] +invalid ['hello\''] +invalid ['hello"'] +invalid ['hello\"'] +invalid ['hello"o'] +invalid ['"'] +invalid '"' +invalid '"hello"' +invalid '"hello' +invalid '"hi"' +valid [ 1 , 2 , 3 ] +invalid nil +invalid fals +invalid falsify +invalid falsetto +invalid truism +invalid {"key" +invalid {"key","key2":value} +invalid "\u0000" diff --git a/contrib/ccan/json/test/test-strings-reencoded b/contrib/ccan/json/test/test-strings-reencoded new file mode 100644 index 0000000..97890c1 --- /dev/null +++ b/contrib/ccan/json/test/test-strings-reencoded @@ -0,0 +1,90 @@ +"" +"" +"" +[] +{} +0.5 +{"1":{}} +{"1":2} +{"1":2,"2.5":[3,4,{},{"5":["6"]}]} +{"1":2,"2.5":[3,4,{},{"5":["6"],"7":[8]}]} +1234 +-1234 +{"1":2,"3":4} +1.23456e+144 +1.23456e-140 +1.23456e+144 +"1 " +"1 1" +2 +3 +"3" +[3] +32 +[3,[4]] +[3,[4,[5]]] +[3,[4,[5],6]] +[3,[4,[5],6],7] +[3,[4,[5],6],7,8] +[3,[4,[5],6],7,8,9] +30 +[7] +[7] +[7] +"hello" +["hello"] +["hello","bye"] +["hello","bye\n"] +["hello","bye\n\r\t"] +["hello","bye\n\r\t\b"] +["hello","bye\n\r\t\b",true] +["hello","bye\n\r\t\b",true,false] +["hello","bye\n\r\t\b",true,false,null] +{"hello":true} +{"hello":true,"bye":false} +{"hello":true,"bye":false,"foo":["one","two","three"]} +"hi" +["hi"] +["hi","bye"] +{"hi":"bye"} +["hi","bye",3] +["hi","bye[",3] +"\u0007" +"\b" +"\t" +"\u0010" +" " +"က0" +"ሴ" +"香9" +"𐀀" +"𐀀" +"𝄞" +"" +"�" +"" +[32,1] +"𐀀" +"\n" +"hello" +"hello\tworld" +"hello" +"hello\n" +"hello" +3 +0.3 +30000 +3e-06 +30000 +"hello" +-3 +-3.1 +0.5 +"\"" +"\"3.5" +"\"." +"\"....." +["\"\"\"\"",0.5] +["\".5",".5\"",0.5] +"'" +[1,2,3] diff --git a/contrib/cleanup.h b/contrib/cleanup.h new file mode 100644 index 0000000..55843f1 --- /dev/null +++ b/contrib/cleanup.h @@ -0,0 +1,37 @@ +/* Copyright (C) 2015-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. +*/ +/** + * Cleanup attributes. + * @cond internal + */ +#pragma once +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> + +#define auto_free __attribute__((cleanup(_cleanup_free))) +static inline void _cleanup_free(char **p) { + free(*p); +} +#define auto_close __attribute__((cleanup(_cleanup_close))) +static inline void _cleanup_close(int *p) { + if (*p > 0) close(*p); +} +#define auto_fclose __attribute__((cleanup(_cleanup_fclose))) +static inline void _cleanup_fclose(FILE **p) { + if (*p) fclose(*p); +} +/* @endcond */ diff --git a/contrib/config.h b/contrib/config.h new file mode 100644 index 0000000..c655374 --- /dev/null +++ b/contrib/config.h @@ -0,0 +1,6 @@ +/* Dummy file, no real configuration here */ +#define HAVE_ATTRIBUTE_COLD 1 +#define HAVE_ATTRIBUTE_NORETURN 1 +#define HAVE_ATTRIBUTE_PURE 1 +#define HAVE_ATTRIBUTE_UNUSED 1 +#define HAVE_ATTRIBUTE_NONNULL 1 diff --git a/contrib/contrib.mk b/contrib/contrib.mk new file mode 100644 index 0000000..c8be024 --- /dev/null +++ b/contrib/contrib.mk @@ -0,0 +1,22 @@ +contrib_SOURCES := \ + contrib/ccan/asprintf/asprintf.c \ + contrib/ccan/ilog/ilog.c \ + contrib/ccan/json/json.c \ + contrib/ucw/mempool.c \ + contrib/ucw/mempool-fmt.c \ + contrib/murmurhash3/murmurhash3.c \ + contrib/base32hex.c \ + contrib/base64.c +contrib_CFLAGS := -fPIC +contrib_TARGET := $(abspath contrib)/contrib$(AREXT) + +# Use built-in LMDB if not found +ifneq ($(HAS_lmdb), yes) +contrib_SOURCES += contrib/lmdb/mdb.c \ + contrib/lmdb/midl.c +contrib_CFLAGS += -pthread +contrib_LIBS += -pthread +lmdb_CFLAGS += -I$(abspath contrib/lmdb) +endif + +$(eval $(call make_static,contrib,contrib)) diff --git a/contrib/licenses/BSD-MIT b/contrib/licenses/BSD-MIT new file mode 100644 index 0000000..89de354 --- /dev/null +++ b/contrib/licenses/BSD-MIT @@ -0,0 +1,17 @@ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/contrib/licenses/CC0 b/contrib/licenses/CC0 new file mode 100644 index 0000000..feb9b11 --- /dev/null +++ b/contrib/licenses/CC0 @@ -0,0 +1,28 @@ +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer exclusive Copyright and Related Rights (defined below) upon the creator and subsequent owner(s) (each and all, an "owner") of an original work of authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for the purpose of contributing to a commons of creative, cultural and scientific works ("Commons") that the public can reliably and without fear of later claims of infringement build upon, modify, incorporate in other works, reuse and redistribute as freely as possible in any form whatsoever and for any purposes, including without limitation commercial purposes. These owners may contribute to the Commons to promote the ideal of a free culture and the further production of creative, cultural and scientific works, or to gain reputation or greater distribution for their Work in part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any expectation of additional consideration or compensation, the person associating CC0 with a Work (the "Affirmer"), to the extent that he or she is an owner of Copyright and Related Rights in the Work, voluntarily elects to apply CC0 to the Work and publicly distribute the Work under its terms, with knowledge of his or her Copyright and Related Rights in the Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be protected by copyright and related or neighboring rights ("Copyright and Related Rights"). Copyright and Related Rights include, but are not limited to, the following: + + the right to reproduce, adapt, distribute, perform, display, communicate, and translate a Work; + moral rights retained by the original author(s) and/or performer(s); + publicity and privacy rights pertaining to a person's image or likeness depicted in a Work; + rights protecting against unfair competition in regards to a Work, subject to the limitations in paragraph 4(a), below; + rights protecting the extraction, dissemination, use and reuse of data in a Work; + database rights (such as those arising under Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, and under any national implementation thereof, including any amended or successor version of such directive); and + other similar, equivalent or corresponding rights throughout the world based on applicable law or treaty, and any national implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention of, applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and unconditionally waives, abandons, and surrenders all of Affirmer's Copyright and Related Rights and associated claims and causes of action, whether now known or unknown (including existing as well as future claims and causes of action), in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each member of the public at large and to the detriment of Affirmer's heirs and successors, fully intending that such Waiver shall not be subject to revocation, rescission, cancellation, termination, or any other legal or equitable action to disrupt the quiet enjoyment of the Work by the public as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason be judged legally invalid or ineffective under applicable law, then the Waiver shall be preserved to the maximum extent permitted taking into account Affirmer's express Statement of Purpose. In addition, to the extent the Waiver is so judged Affirmer hereby grants to each affected person a royalty-free, non transferable, non sublicensable, non exclusive, irrevocable and unconditional license to exercise Affirmer's Copyright and Related Rights in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "License"). The License shall be deemed effective as of the date CC0 was applied by Affirmer to the Work. Should any part of the License for any reason be judged legally invalid or ineffective under applicable law, such partial invalidity or ineffectiveness shall not invalidate the remainder of the License, and in such case Affirmer hereby affirms that he or she will not (i) exercise any of his or her remaining Copyright and Related Rights in the Work or (ii) assert any associated claims and causes of action with respect to the Work, in either case contrary to Affirmer's express Statement of Purpose. + +4. Limitations and Disclaimers. + + No trademark or patent rights held by Affirmer are waived, abandoned, surrendered, licensed or otherwise affected by this document. + Affirmer offers the Work as-is and makes no representations or warranties of any kind concerning the Work, express, implied, statutory or otherwise, including without limitation warranties of title, merchantability, fitness for a particular purpose, non infringement, or the absence of latent or other defects, accuracy, or the present or absence of errors, whether or not discoverable, all to the greatest extent permissible under applicable law. + Affirmer disclaims responsibility for clearing rights of other persons that may apply to the Work or any use thereof, including without limitation any person's Copyright and Related Rights in the Work. Further, Affirmer disclaims responsibility for obtaining any necessary consents, permissions or other rights required for any use of the Work. + Affirmer understands and acknowledges that Creative Commons is not a party to this document and has no duty or obligation with respect to this CC0 or use of the Work. diff --git a/contrib/licenses/LGPL2 b/contrib/licenses/LGPL2 new file mode 100644 index 0000000..4362b49 --- /dev/null +++ b/contrib/licenses/LGPL2 @@ -0,0 +1,502 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! diff --git a/contrib/lmdb/LICENSE b/contrib/lmdb/LICENSE new file mode 100644 index 0000000..05ad757 --- /dev/null +++ b/contrib/lmdb/LICENSE @@ -0,0 +1,47 @@ +The OpenLDAP Public License + Version 2.8, 17 August 2003 + +Redistribution and use of this software and associated documentation +("Software"), with or without modification, are permitted provided +that the following conditions are met: + +1. Redistributions in source form must retain copyright statements + and notices, + +2. Redistributions in binary form must reproduce applicable copyright + statements and notices, this list of conditions, and the following + disclaimer in the documentation and/or other materials provided + with the distribution, and + +3. Redistributions must contain a verbatim copy of this document. + +The OpenLDAP Foundation may revise this license from time to time. +Each revision is distinguished by a version number. You may use +this Software under terms of this license revision or under the +terms of any subsequent revision of the license. + +THIS SOFTWARE IS PROVIDED BY THE OPENLDAP FOUNDATION AND ITS +CONTRIBUTORS ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL THE OPENLDAP FOUNDATION, ITS CONTRIBUTORS, OR THE AUTHOR(S) +OR OWNER(S) OF THE SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +The names of the authors and copyright holders must not be used in +advertising or otherwise to promote the sale, use or other dealing +in this Software without specific, written prior permission. Title +to copyright in this Software shall at all times remain with copyright +holders. + +OpenLDAP is a registered trademark of the OpenLDAP Foundation. + +Copyright 1999-2003 The OpenLDAP Foundation, Redwood City, +California, USA. All Rights Reserved. Permission to copy and +distribute verbatim copies of this document is granted. diff --git a/contrib/lmdb/lmdb.h b/contrib/lmdb/lmdb.h new file mode 100644 index 0000000..29135b7 --- /dev/null +++ b/contrib/lmdb/lmdb.h @@ -0,0 +1,1604 @@ +/** @file lmdb.h + * @brief Lightning memory-mapped database library + * + * @mainpage Lightning Memory-Mapped Database Manager (LMDB) + * + * @section intro_sec Introduction + * LMDB is a Btree-based database management library modeled loosely on the + * BerkeleyDB API, but much simplified. The entire database is exposed + * in a memory map, and all data fetches return data directly + * from the mapped memory, so no malloc's or memcpy's occur during + * data fetches. As such, the library is extremely simple because it + * requires no page caching layer of its own, and it is extremely high + * performance and memory-efficient. It is also fully transactional with + * full ACID semantics, and when the memory map is read-only, the + * database integrity cannot be corrupted by stray pointer writes from + * application code. + * + * The library is fully thread-aware and supports concurrent read/write + * access from multiple processes and threads. Data pages use a copy-on- + * write strategy so no active data pages are ever overwritten, which + * also provides resistance to corruption and eliminates the need of any + * special recovery procedures after a system crash. Writes are fully + * serialized; only one write transaction may be active at a time, which + * guarantees that writers can never deadlock. The database structure is + * multi-versioned so readers run with no locks; writers cannot block + * readers, and readers don't block writers. + * + * Unlike other well-known database mechanisms which use either write-ahead + * transaction logs or append-only data writes, LMDB requires no maintenance + * during operation. Both write-ahead loggers and append-only databases + * require periodic checkpointing and/or compaction of their log or database + * files otherwise they grow without bound. LMDB tracks free pages within + * the database and re-uses them for new write operations, so the database + * size does not grow without bound in normal use. + * + * The memory map can be used as a read-only or read-write map. It is + * read-only by default as this provides total immunity to corruption. + * Using read-write mode offers much higher write performance, but adds + * the possibility for stray application writes thru pointers to silently + * corrupt the database. Of course if your application code is known to + * be bug-free (...) then this is not an issue. + * + * If this is your first time using a transactional embedded key/value + * store, you may find the \ref starting page to be helpful. + * + * @section caveats_sec Caveats + * Troubleshooting the lock file, plus semaphores on BSD systems: + * + * - A broken lockfile can cause sync issues. + * Stale reader transactions left behind by an aborted program + * cause further writes to grow the database quickly, and + * stale locks can block further operation. + * + * Fix: Check for stale readers periodically, using the + * #mdb_reader_check function or the \ref mdb_stat_1 "mdb_stat" tool. + * Stale writers will be cleared automatically on some systems: + * - Windows - automatic + * - Linux, systems using POSIX mutexes with Robust option - automatic + * - not on BSD, systems using POSIX semaphores. + * Otherwise just make all programs using the database close it; + * the lockfile is always reset on first open of the environment. + * + * - On BSD systems or others configured with MDB_USE_POSIX_SEM, + * startup can fail due to semaphores owned by another userid. + * + * Fix: Open and close the database as the user which owns the + * semaphores (likely last user) or as root, while no other + * process is using the database. + * + * Restrictions/caveats (in addition to those listed for some functions): + * + * - Only the database owner should normally use the database on + * BSD systems or when otherwise configured with MDB_USE_POSIX_SEM. + * Multiple users can cause startup to fail later, as noted above. + * + * - There is normally no pure read-only mode, since readers need write + * access to locks and lock file. Exceptions: On read-only filesystems + * or with the #MDB_NOLOCK flag described under #mdb_env_open(). + * + * - An LMDB configuration will often reserve considerable \b unused + * memory address space and maybe file size for future growth. + * This does not use actual memory or disk space, but users may need + * to understand the difference so they won't be scared off. + * + * - By default, in versions before 0.9.10, unused portions of the data + * file might receive garbage data from memory freed by other code. + * (This does not happen when using the #MDB_WRITEMAP flag.) As of + * 0.9.10 the default behavior is to initialize such memory before + * writing to the data file. Since there may be a slight performance + * cost due to this initialization, applications may disable it using + * the #MDB_NOMEMINIT flag. Applications handling sensitive data + * which must not be written should not use this flag. This flag is + * irrelevant when using #MDB_WRITEMAP. + * + * - A thread can only use one transaction at a time, plus any child + * transactions. Each transaction belongs to one thread. See below. + * The #MDB_NOTLS flag changes this for read-only transactions. + * + * - Use an MDB_env* in the process which opened it, not after fork(). + * + * - Do not have open an LMDB database twice in the same process at + * the same time. Not even from a plain open() call - close()ing it + * breaks fcntl() advisory locking. (It is OK to reopen it after + * fork() - exec*(), since the lockfile has FD_CLOEXEC set.) + * + * - Avoid long-lived transactions. Read transactions prevent + * reuse of pages freed by newer write transactions, thus the + * database can grow quickly. Write transactions prevent + * other write transactions, since writes are serialized. + * + * - Avoid suspending a process with active transactions. These + * would then be "long-lived" as above. Also read transactions + * suspended when writers commit could sometimes see wrong data. + * + * ...when several processes can use a database concurrently: + * + * - Avoid aborting a process with an active transaction. + * The transaction becomes "long-lived" as above until a check + * for stale readers is performed or the lockfile is reset, + * since the process may not remove it from the lockfile. + * + * This does not apply to write transactions if the system clears + * stale writers, see above. + * + * - If you do that anyway, do a periodic check for stale readers. Or + * close the environment once in a while, so the lockfile can get reset. + * + * - Do not use LMDB databases on remote filesystems, even between + * processes on the same host. This breaks flock() on some OSes, + * possibly memory map sync, and certainly sync between programs + * on different hosts. + * + * - Opening a database can fail if another process is opening or + * closing it at exactly the same time. + * + * @author Howard Chu, Symas Corporation. + * + * @copyright Copyright 2011-2017 Howard Chu, Symas Corp. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * <http://www.OpenLDAP.org/license.html>. + * + * @par Derived From: + * This code is derived from btree.c written by Martin Hedenfalk. + * + * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#ifndef _LMDB_H_ +#define _LMDB_H_ + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/** Unix permissions for creating files, or dummy definition for Windows */ +#ifdef _MSC_VER +typedef int mdb_mode_t; +#else +typedef mode_t mdb_mode_t; +#endif + +/** An abstraction for a file handle. + * On POSIX systems file handles are small integers. On Windows + * they're opaque pointers. + */ +#ifdef _WIN32 +typedef void *mdb_filehandle_t; +#else +typedef int mdb_filehandle_t; +#endif + +/** @defgroup mdb LMDB API + * @{ + * @brief OpenLDAP Lightning Memory-Mapped Database Manager + */ +/** @defgroup Version Version Macros + * @{ + */ +/** Library major version */ +#define MDB_VERSION_MAJOR 0 +/** Library minor version */ +#define MDB_VERSION_MINOR 9 +/** Library patch version */ +#define MDB_VERSION_PATCH 21 + +/** Combine args a,b,c into a single integer for easy version comparisons */ +#define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c)) + +/** The full library version as a single integer */ +#define MDB_VERSION_FULL \ + MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH) + +/** The release date of this library version */ +#define MDB_VERSION_DATE "June 1, 2017" + +/** A stringifier for the version info */ +#define MDB_VERSTR(a,b,c,d) "LMDB " #a "." #b "." #c ": (" d ")" + +/** A helper for the stringifier macro */ +#define MDB_VERFOO(a,b,c,d) MDB_VERSTR(a,b,c,d) + +/** The full library version as a C string */ +#define MDB_VERSION_STRING \ + MDB_VERFOO(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH,MDB_VERSION_DATE) +/** @} */ + +/** @brief Opaque structure for a database environment. + * + * A DB environment supports multiple databases, all residing in the same + * shared-memory map. + */ +typedef struct MDB_env MDB_env; + +/** @brief Opaque structure for a transaction handle. + * + * All database operations require a transaction handle. Transactions may be + * read-only or read-write. + */ +typedef struct MDB_txn MDB_txn; + +/** @brief A handle for an individual database in the DB environment. */ +typedef unsigned int MDB_dbi; + +/** @brief Opaque structure for navigating through a database */ +typedef struct MDB_cursor MDB_cursor; + +/** @brief Generic structure used for passing keys and data in and out + * of the database. + * + * Values returned from the database are valid only until a subsequent + * update operation, or the end of the transaction. Do not modify or + * free them, they commonly point into the database itself. + * + * Key sizes must be between 1 and #mdb_env_get_maxkeysize() inclusive. + * The same applies to data sizes in databases with the #MDB_DUPSORT flag. + * Other data items can in theory be from 0 to 0xffffffff bytes long. + */ +typedef struct MDB_val { + size_t mv_size; /**< size of the data item */ + void *mv_data; /**< address of the data item */ +} MDB_val; + +/** @brief A callback function used to compare two keys in a database */ +typedef int (MDB_cmp_func)(const MDB_val *a, const MDB_val *b); + +/** @brief A callback function used to relocate a position-dependent data item + * in a fixed-address database. + * + * The \b newptr gives the item's desired address in + * the memory map, and \b oldptr gives its previous address. The item's actual + * data resides at the address in \b item. This callback is expected to walk + * through the fields of the record in \b item and modify any + * values based at the \b oldptr address to be relative to the \b newptr address. + * @param[in,out] item The item that is to be relocated. + * @param[in] oldptr The previous address. + * @param[in] newptr The new address to relocate to. + * @param[in] relctx An application-provided context, set by #mdb_set_relctx(). + * @todo This feature is currently unimplemented. + */ +typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *relctx); + +/** @defgroup mdb_env Environment Flags + * @{ + */ + /** mmap at a fixed address (experimental) */ +#define MDB_FIXEDMAP 0x01 + /** no environment directory */ +#define MDB_NOSUBDIR 0x4000 + /** don't fsync after commit */ +#define MDB_NOSYNC 0x10000 + /** read only */ +#define MDB_RDONLY 0x20000 + /** don't fsync metapage after commit */ +#define MDB_NOMETASYNC 0x40000 + /** use writable mmap */ +#define MDB_WRITEMAP 0x80000 + /** use asynchronous msync when #MDB_WRITEMAP is used */ +#define MDB_MAPASYNC 0x100000 + /** tie reader locktable slots to #MDB_txn objects instead of to threads */ +#define MDB_NOTLS 0x200000 + /** don't do any locking, caller must manage their own locks */ +#define MDB_NOLOCK 0x400000 + /** don't do readahead (no effect on Windows) */ +#define MDB_NORDAHEAD 0x800000 + /** don't initialize malloc'd memory before writing to datafile */ +#define MDB_NOMEMINIT 0x1000000 +/** @} */ + +/** @defgroup mdb_dbi_open Database Flags + * @{ + */ + /** use reverse string keys */ +#define MDB_REVERSEKEY 0x02 + /** use sorted duplicates */ +#define MDB_DUPSORT 0x04 + /** numeric keys in native byte order: either unsigned int or size_t. + * The keys must all be of the same size. */ +#define MDB_INTEGERKEY 0x08 + /** with #MDB_DUPSORT, sorted dup items have fixed size */ +#define MDB_DUPFIXED 0x10 + /** with #MDB_DUPSORT, dups are #MDB_INTEGERKEY-style integers */ +#define MDB_INTEGERDUP 0x20 + /** with #MDB_DUPSORT, use reverse string dups */ +#define MDB_REVERSEDUP 0x40 + /** create DB if not already existing */ +#define MDB_CREATE 0x40000 +/** @} */ + +/** @defgroup mdb_put Write Flags + * @{ + */ +/** For put: Don't write if the key already exists. */ +#define MDB_NOOVERWRITE 0x10 +/** Only for #MDB_DUPSORT<br> + * For put: don't write if the key and data pair already exist.<br> + * For mdb_cursor_del: remove all duplicate data items. + */ +#define MDB_NODUPDATA 0x20 +/** For mdb_cursor_put: overwrite the current key/data pair */ +#define MDB_CURRENT 0x40 +/** For put: Just reserve space for data, don't copy it. Return a + * pointer to the reserved space. + */ +#define MDB_RESERVE 0x10000 +/** Data is being appended, don't split full pages. */ +#define MDB_APPEND 0x20000 +/** Duplicate data is being appended, don't split full pages. */ +#define MDB_APPENDDUP 0x40000 +/** Store multiple data items in one call. Only for #MDB_DUPFIXED. */ +#define MDB_MULTIPLE 0x80000 +/* @} */ + +/** @defgroup mdb_copy Copy Flags + * @{ + */ +/** Compacting copy: Omit free space from copy, and renumber all + * pages sequentially. + */ +#define MDB_CP_COMPACT 0x01 +/* @} */ + +/** @brief Cursor Get operations. + * + * This is the set of all operations for retrieving data + * using a cursor. + */ +typedef enum MDB_cursor_op { + MDB_FIRST, /**< Position at first key/data item */ + MDB_FIRST_DUP, /**< Position at first data item of current key. + Only for #MDB_DUPSORT */ + MDB_GET_BOTH, /**< Position at key/data pair. Only for #MDB_DUPSORT */ + MDB_GET_BOTH_RANGE, /**< position at key, nearest data. Only for #MDB_DUPSORT */ + MDB_GET_CURRENT, /**< Return key/data at current cursor position */ + MDB_GET_MULTIPLE, /**< Return key and up to a page of duplicate data items + from current cursor position. Move cursor to prepare + for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */ + MDB_LAST, /**< Position at last key/data item */ + MDB_LAST_DUP, /**< Position at last data item of current key. + Only for #MDB_DUPSORT */ + MDB_NEXT, /**< Position at next data item */ + MDB_NEXT_DUP, /**< Position at next data item of current key. + Only for #MDB_DUPSORT */ + MDB_NEXT_MULTIPLE, /**< Return key and up to a page of duplicate data items + from next cursor position. Move cursor to prepare + for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */ + MDB_NEXT_NODUP, /**< Position at first data item of next key */ + MDB_PREV, /**< Position at previous data item */ + MDB_PREV_DUP, /**< Position at previous data item of current key. + Only for #MDB_DUPSORT */ + MDB_PREV_NODUP, /**< Position at last data item of previous key */ + MDB_SET, /**< Position at specified key */ + MDB_SET_KEY, /**< Position at specified key, return key + data */ + MDB_SET_RANGE, /**< Position at first key greater than or equal to specified key. */ + MDB_PREV_MULTIPLE /**< Position at previous page and return key and up to + a page of duplicate data items. Only for #MDB_DUPFIXED */ +} MDB_cursor_op; + +/** @defgroup errors Return Codes + * + * BerkeleyDB uses -30800 to -30999, we'll go under them + * @{ + */ + /** Successful result */ +#define MDB_SUCCESS 0 + /** key/data pair already exists */ +#define MDB_KEYEXIST (-30799) + /** key/data pair not found (EOF) */ +#define MDB_NOTFOUND (-30798) + /** Requested page not found - this usually indicates corruption */ +#define MDB_PAGE_NOTFOUND (-30797) + /** Located page was wrong type */ +#define MDB_CORRUPTED (-30796) + /** Update of meta page failed or environment had fatal error */ +#define MDB_PANIC (-30795) + /** Environment version mismatch */ +#define MDB_VERSION_MISMATCH (-30794) + /** File is not a valid LMDB file */ +#define MDB_INVALID (-30793) + /** Environment mapsize reached */ +#define MDB_MAP_FULL (-30792) + /** Environment maxdbs reached */ +#define MDB_DBS_FULL (-30791) + /** Environment maxreaders reached */ +#define MDB_READERS_FULL (-30790) + /** Too many TLS keys in use - Windows only */ +#define MDB_TLS_FULL (-30789) + /** Txn has too many dirty pages */ +#define MDB_TXN_FULL (-30788) + /** Cursor stack too deep - internal error */ +#define MDB_CURSOR_FULL (-30787) + /** Page has not enough space - internal error */ +#define MDB_PAGE_FULL (-30786) + /** Database contents grew beyond environment mapsize */ +#define MDB_MAP_RESIZED (-30785) + /** Operation and DB incompatible, or DB type changed. This can mean: + * <ul> + * <li>The operation expects an #MDB_DUPSORT / #MDB_DUPFIXED database. + * <li>Opening a named DB when the unnamed DB has #MDB_DUPSORT / #MDB_INTEGERKEY. + * <li>Accessing a data record as a database, or vice versa. + * <li>The database was dropped and recreated with different flags. + * </ul> + */ +#define MDB_INCOMPATIBLE (-30784) + /** Invalid reuse of reader locktable slot */ +#define MDB_BAD_RSLOT (-30783) + /** Transaction must abort, has a child, or is invalid */ +#define MDB_BAD_TXN (-30782) + /** Unsupported size of key/DB name/data, or wrong DUPFIXED size */ +#define MDB_BAD_VALSIZE (-30781) + /** The specified DBI was changed unexpectedly */ +#define MDB_BAD_DBI (-30780) + /** The last defined error code */ +#define MDB_LAST_ERRCODE MDB_BAD_DBI +/** @} */ + +/** @brief Statistics for a database in the environment */ +typedef struct MDB_stat { + unsigned int ms_psize; /**< Size of a database page. + This is currently the same for all databases. */ + unsigned int ms_depth; /**< Depth (height) of the B-tree */ + size_t ms_branch_pages; /**< Number of internal (non-leaf) pages */ + size_t ms_leaf_pages; /**< Number of leaf pages */ + size_t ms_overflow_pages; /**< Number of overflow pages */ + size_t ms_entries; /**< Number of data items */ +} MDB_stat; + +/** @brief Information about the environment */ +typedef struct MDB_envinfo { + void *me_mapaddr; /**< Address of map, if fixed */ + size_t me_mapsize; /**< Size of the data memory map */ + size_t me_last_pgno; /**< ID of the last used page */ + size_t me_last_txnid; /**< ID of the last committed transaction */ + unsigned int me_maxreaders; /**< max reader slots in the environment */ + unsigned int me_numreaders; /**< max reader slots used in the environment */ +} MDB_envinfo; + + /** @brief Return the LMDB library version information. + * + * @param[out] major if non-NULL, the library major version number is copied here + * @param[out] minor if non-NULL, the library minor version number is copied here + * @param[out] patch if non-NULL, the library patch version number is copied here + * @retval "version string" The library version as a string + */ +char *mdb_version(int *major, int *minor, int *patch); + + /** @brief Return a string describing a given error code. + * + * This function is a superset of the ANSI C X3.159-1989 (ANSI C) strerror(3) + * function. If the error code is greater than or equal to 0, then the string + * returned by the system function strerror(3) is returned. If the error code + * is less than 0, an error string corresponding to the LMDB library error is + * returned. See @ref errors for a list of LMDB-specific error codes. + * @param[in] err The error code + * @retval "error message" The description of the error + */ +char *mdb_strerror(int err); + + /** @brief Create an LMDB environment handle. + * + * This function allocates memory for a #MDB_env structure. To release + * the allocated memory and discard the handle, call #mdb_env_close(). + * Before the handle may be used, it must be opened using #mdb_env_open(). + * Various other options may also need to be set before opening the handle, + * e.g. #mdb_env_set_mapsize(), #mdb_env_set_maxreaders(), #mdb_env_set_maxdbs(), + * depending on usage requirements. + * @param[out] env The address where the new handle will be stored + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_create(MDB_env **env); + + /** @brief Open an environment handle. + * + * If this function fails, #mdb_env_close() must be called to discard the #MDB_env handle. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] path The directory in which the database files reside. This + * directory must already exist and be writable. + * @param[in] flags Special options for this environment. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + * Flags set by mdb_env_set_flags() are also used. + * <ul> + * <li>#MDB_FIXEDMAP + * use a fixed address for the mmap region. This flag must be specified + * when creating the environment, and is stored persistently in the environment. + * If successful, the memory map will always reside at the same virtual address + * and pointers used to reference data items in the database will be constant + * across multiple invocations. This option may not always work, depending on + * how the operating system has allocated memory to shared libraries and other uses. + * The feature is highly experimental. + * <li>#MDB_NOSUBDIR + * By default, LMDB creates its environment in a directory whose + * pathname is given in \b path, and creates its data and lock files + * under that directory. With this option, \b path is used as-is for + * the database main data file. The database lock file is the \b path + * with "-lock" appended. + * <li>#MDB_RDONLY + * Open the environment in read-only mode. No write operations will be + * allowed. LMDB will still modify the lock file - except on read-only + * filesystems, where LMDB does not use locks. + * <li>#MDB_WRITEMAP + * Use a writeable memory map unless MDB_RDONLY is set. This uses + * fewer mallocs but loses protection from application bugs + * like wild pointer writes and other bad updates into the database. + * This may be slightly faster for DBs that fit entirely in RAM, but + * is slower for DBs larger than RAM. + * Incompatible with nested transactions. + * Do not mix processes with and without MDB_WRITEMAP on the same + * environment. This can defeat durability (#mdb_env_sync etc). + * <li>#MDB_NOMETASYNC + * Flush system buffers to disk only once per transaction, omit the + * metadata flush. Defer that until the system flushes files to disk, + * or next non-MDB_RDONLY commit or #mdb_env_sync(). This optimization + * maintains database integrity, but a system crash may undo the last + * committed transaction. I.e. it preserves the ACI (atomicity, + * consistency, isolation) but not D (durability) database property. + * This flag may be changed at any time using #mdb_env_set_flags(). + * <li>#MDB_NOSYNC + * Don't flush system buffers to disk when committing a transaction. + * This optimization means a system crash can corrupt the database or + * lose the last transactions if buffers are not yet flushed to disk. + * The risk is governed by how often the system flushes dirty buffers + * to disk and how often #mdb_env_sync() is called. However, if the + * filesystem preserves write order and the #MDB_WRITEMAP flag is not + * used, transactions exhibit ACI (atomicity, consistency, isolation) + * properties and only lose D (durability). I.e. database integrity + * is maintained, but a system crash may undo the final transactions. + * Note that (#MDB_NOSYNC | #MDB_WRITEMAP) leaves the system with no + * hint for when to write transactions to disk, unless #mdb_env_sync() + * is called. (#MDB_MAPASYNC | #MDB_WRITEMAP) may be preferable. + * This flag may be changed at any time using #mdb_env_set_flags(). + * <li>#MDB_MAPASYNC + * When using #MDB_WRITEMAP, use asynchronous flushes to disk. + * As with #MDB_NOSYNC, a system crash can then corrupt the + * database or lose the last transactions. Calling #mdb_env_sync() + * ensures on-disk database integrity until next commit. + * This flag may be changed at any time using #mdb_env_set_flags(). + * <li>#MDB_NOTLS + * Don't use Thread-Local Storage. Tie reader locktable slots to + * #MDB_txn objects instead of to threads. I.e. #mdb_txn_reset() keeps + * the slot reseved for the #MDB_txn object. A thread may use parallel + * read-only transactions. A read-only transaction may span threads if + * the user synchronizes its use. Applications that multiplex many + * user threads over individual OS threads need this option. Such an + * application must also serialize the write transactions in an OS + * thread, since LMDB's write locking is unaware of the user threads. + * <li>#MDB_NOLOCK + * Don't do any locking. If concurrent access is anticipated, the + * caller must manage all concurrency itself. For proper operation + * the caller must enforce single-writer semantics, and must ensure + * that no readers are using old transactions while a writer is + * active. The simplest approach is to use an exclusive lock so that + * no readers may be active at all when a writer begins. + * <li>#MDB_NORDAHEAD + * Turn off readahead. Most operating systems perform readahead on + * read requests by default. This option turns it off if the OS + * supports it. Turning it off may help random read performance + * when the DB is larger than RAM and system RAM is full. + * The option is not implemented on Windows. + * <li>#MDB_NOMEMINIT + * Don't initialize malloc'd memory before writing to unused spaces + * in the data file. By default, memory for pages written to the data + * file is obtained using malloc. While these pages may be reused in + * subsequent transactions, freshly malloc'd pages will be initialized + * to zeroes before use. This avoids persisting leftover data from other + * code (that used the heap and subsequently freed the memory) into the + * data file. Note that many other system libraries may allocate + * and free memory from the heap for arbitrary uses. E.g., stdio may + * use the heap for file I/O buffers. This initialization step has a + * modest performance cost so some applications may want to disable + * it using this flag. This option can be a problem for applications + * which handle sensitive data like passwords, and it makes memory + * checkers like Valgrind noisy. This flag is not needed with #MDB_WRITEMAP, + * which writes directly to the mmap instead of using malloc for pages. The + * initialization is also skipped if #MDB_RESERVE is used; the + * caller is expected to overwrite all of the memory that was + * reserved in that case. + * This flag may be changed at any time using #mdb_env_set_flags(). + * </ul> + * @param[in] mode The UNIX permissions to set on created files and semaphores. + * This parameter is ignored on Windows. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>#MDB_VERSION_MISMATCH - the version of the LMDB library doesn't match the + * version that created the database environment. + * <li>#MDB_INVALID - the environment file headers are corrupted. + * <li>ENOENT - the directory specified by the path parameter doesn't exist. + * <li>EACCES - the user didn't have permission to access the environment files. + * <li>EAGAIN - the environment was locked by another process. + * </ul> + */ +int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode); + + /** @brief Copy an LMDB environment to the specified path. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. + * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdb_env_create(). It + * must have already been opened successfully. + * @param[in] path The directory in which the copy will reside. This + * directory must already exist and be writable but must otherwise be + * empty. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_copy(MDB_env *env, const char *path); + + /** @brief Copy an LMDB environment to the specified file descriptor. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. + * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdb_env_create(). It + * must have already been opened successfully. + * @param[in] fd The filedescriptor to write the copy to. It must + * have already been opened for Write access. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_copyfd(MDB_env *env, mdb_filehandle_t fd); + + /** @brief Copy an LMDB environment to the specified path, with options. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. + * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdb_env_create(). It + * must have already been opened successfully. + * @param[in] path The directory in which the copy will reside. This + * directory must already exist and be writable but must otherwise be + * empty. + * @param[in] flags Special options for this operation. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + * <ul> + * <li>#MDB_CP_COMPACT - Perform compaction while copying: omit free + * pages and sequentially renumber all pages in output. This option + * consumes more CPU and runs more slowly than the default. + * Currently it fails if the environment has suffered a page leak. + * </ul> + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags); + + /** @brief Copy an LMDB environment to the specified file descriptor, + * with options. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. See + * #mdb_env_copy2() for further details. + * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdb_env_create(). It + * must have already been opened successfully. + * @param[in] fd The filedescriptor to write the copy to. It must + * have already been opened for Write access. + * @param[in] flags Special options for this operation. + * See #mdb_env_copy2() for options. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_copyfd2(MDB_env *env, mdb_filehandle_t fd, unsigned int flags); + + /** @brief Return statistics about the LMDB environment. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] stat The address of an #MDB_stat structure + * where the statistics will be copied + */ +int mdb_env_stat(MDB_env *env, MDB_stat *stat); + + /** @brief Return information about the LMDB environment. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] stat The address of an #MDB_envinfo structure + * where the information will be copied + */ +int mdb_env_info(MDB_env *env, MDB_envinfo *stat); + + /** @brief Flush the data buffers to disk. + * + * Data is always written to disk when #mdb_txn_commit() is called, + * but the operating system may keep it buffered. LMDB always flushes + * the OS buffers upon commit as well, unless the environment was + * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. This call is + * not valid if the environment was opened with #MDB_RDONLY. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] force If non-zero, force a synchronous flush. Otherwise + * if the environment has the #MDB_NOSYNC flag set the flushes + * will be omitted, and with #MDB_MAPASYNC they will be asynchronous. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>EACCES - the environment is read-only. + * <li>EINVAL - an invalid parameter was specified. + * <li>EIO - an error occurred during synchronization. + * </ul> + */ +int mdb_env_sync(MDB_env *env, int force); + + /** @brief Close the environment and release the memory map. + * + * Only a single thread may call this function. All transactions, databases, + * and cursors must already be closed before calling this function. Attempts to + * use any such handles after calling this function will cause a SIGSEGV. + * The environment handle will be freed and must not be used again after this call. + * @param[in] env An environment handle returned by #mdb_env_create() + */ +void mdb_env_close(MDB_env *env); + + /** @brief Set environment flags. + * + * This may be used to set some flags in addition to those from + * #mdb_env_open(), or to unset these flags. If several threads + * change the flags at the same time, the result is undefined. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] flags The flags to change, bitwise OR'ed together + * @param[in] onoff A non-zero value sets the flags, zero clears them. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>EINVAL - an invalid parameter was specified. + * </ul> + */ +int mdb_env_set_flags(MDB_env *env, unsigned int flags, int onoff); + + /** @brief Get environment flags. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] flags The address of an integer to store the flags + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>EINVAL - an invalid parameter was specified. + * </ul> + */ +int mdb_env_get_flags(MDB_env *env, unsigned int *flags); + + /** @brief Return the path that was used in #mdb_env_open(). + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] path Address of a string pointer to contain the path. This + * is the actual string in the environment, not a copy. It should not be + * altered in any way. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>EINVAL - an invalid parameter was specified. + * </ul> + */ +int mdb_env_get_path(MDB_env *env, const char **path); + + /** @brief Return the filedescriptor for the given environment. + * + * This function may be called after fork(), so the descriptor can be + * closed before exec*(). Other LMDB file descriptors have FD_CLOEXEC. + * (Until LMDB 0.9.18, only the lockfile had that.) + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] fd Address of a mdb_filehandle_t to contain the descriptor. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>EINVAL - an invalid parameter was specified. + * </ul> + */ +int mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *fd); + + /** @brief Set the size of the memory map to use for this environment. + * + * The size should be a multiple of the OS page size. The default is + * 10485760 bytes. The size of the memory map is also the maximum size + * of the database. The value should be chosen as large as possible, + * to accommodate future growth of the database. + * This function should be called after #mdb_env_create() and before #mdb_env_open(). + * It may be called at later times if no transactions are active in + * this process. Note that the library does not check for this condition, + * the caller must ensure it explicitly. + * + * The new size takes effect immediately for the current process but + * will not be persisted to any others until a write transaction has been + * committed by the current process. Also, only mapsize increases are + * persisted into the environment. + * + * If the mapsize is increased by another process, and data has grown + * beyond the range of the current mapsize, #mdb_txn_begin() will + * return #MDB_MAP_RESIZED. This function may be called with a size + * of zero to adopt the new size. + * + * Any attempt to set a size smaller than the space already consumed + * by the environment will be silently changed to the current size of the used space. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] size The size in bytes + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>EINVAL - an invalid parameter was specified, or the environment has + * an active write transaction. + * </ul> + */ +int mdb_env_set_mapsize(MDB_env *env, size_t size); + + /** @brief Set the maximum number of threads/reader slots for the environment. + * + * This defines the number of slots in the lock table that is used to track readers in the + * the environment. The default is 126. + * Starting a read-only transaction normally ties a lock table slot to the + * current thread until the environment closes or the thread exits. If + * MDB_NOTLS is in use, #mdb_txn_begin() instead ties the slot to the + * MDB_txn object until it or the #MDB_env object is destroyed. + * This function may only be called after #mdb_env_create() and before #mdb_env_open(). + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] readers The maximum number of reader lock table slots + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>EINVAL - an invalid parameter was specified, or the environment is already open. + * </ul> + */ +int mdb_env_set_maxreaders(MDB_env *env, unsigned int readers); + + /** @brief Get the maximum number of threads/reader slots for the environment. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] readers Address of an integer to store the number of readers + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>EINVAL - an invalid parameter was specified. + * </ul> + */ +int mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers); + + /** @brief Set the maximum number of named databases for the environment. + * + * This function is only needed if multiple databases will be used in the + * environment. Simpler applications that use the environment as a single + * unnamed database can ignore this option. + * This function may only be called after #mdb_env_create() and before #mdb_env_open(). + * + * Currently a moderate number of slots are cheap but a huge number gets + * expensive: 7-120 words per transaction, and every #mdb_dbi_open() + * does a linear search of the opened slots. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] dbs The maximum number of databases + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>EINVAL - an invalid parameter was specified, or the environment is already open. + * </ul> + */ +int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); + + /** @brief Get the maximum size of keys and #MDB_DUPSORT data we can write. + * + * Depends on the compile-time constant #MDB_MAXKEYSIZE. Default 511. + * See @ref MDB_val. + * @param[in] env An environment handle returned by #mdb_env_create() + * @return The maximum size of a key we can write + */ +int mdb_env_get_maxkeysize(MDB_env *env); + + /** @brief Set application information associated with the #MDB_env. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] ctx An arbitrary pointer for whatever the application needs. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_set_userctx(MDB_env *env, void *ctx); + + /** @brief Get the application information associated with the #MDB_env. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @return The pointer set by #mdb_env_set_userctx(). + */ +void *mdb_env_get_userctx(MDB_env *env); + + /** @brief A callback function for most LMDB assert() failures, + * called before printing the message and aborting. + * + * @param[in] env An environment handle returned by #mdb_env_create(). + * @param[in] msg The assertion message, not including newline. + */ +typedef void MDB_assert_func(MDB_env *env, const char *msg); + + /** Set or reset the assert() callback of the environment. + * Disabled if liblmdb is buillt with NDEBUG. + * @note This hack should become obsolete as lmdb's error handling matures. + * @param[in] env An environment handle returned by #mdb_env_create(). + * @param[in] func An #MDB_assert_func function, or 0. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_set_assert(MDB_env *env, MDB_assert_func *func); + + /** @brief Create a transaction for use with the environment. + * + * The transaction handle may be discarded using #mdb_txn_abort() or #mdb_txn_commit(). + * @note A transaction and its cursors must only be used by a single + * thread, and a thread may only have a single transaction at a time. + * If #MDB_NOTLS is in use, this does not apply to read-only transactions. + * @note Cursors may not span transactions. + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] parent If this parameter is non-NULL, the new transaction + * will be a nested transaction, with the transaction indicated by \b parent + * as its parent. Transactions may be nested to any level. A parent + * transaction and its cursors may not issue any other operations than + * mdb_txn_commit and mdb_txn_abort while it has active child transactions. + * @param[in] flags Special options for this transaction. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + * <ul> + * <li>#MDB_RDONLY + * This transaction will not perform any write operations. + * </ul> + * @param[out] txn Address where the new #MDB_txn handle will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>#MDB_PANIC - a fatal error occurred earlier and the environment + * must be shut down. + * <li>#MDB_MAP_RESIZED - another process wrote data beyond this MDB_env's + * mapsize and this environment's map must be resized as well. + * See #mdb_env_set_mapsize(). + * <li>#MDB_READERS_FULL - a read-only transaction was requested and + * the reader lock table is full. See #mdb_env_set_maxreaders(). + * <li>ENOMEM - out of memory. + * </ul> + */ +int mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **txn); + + /** @brief Returns the transaction's #MDB_env + * + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + */ +MDB_env *mdb_txn_env(MDB_txn *txn); + + /** @brief Return the transaction's ID. + * + * This returns the identifier associated with this transaction. For a + * read-only transaction, this corresponds to the snapshot being read; + * concurrent readers will frequently have the same transaction ID. + * + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @return A transaction ID, valid if input is an active transaction. + */ +size_t mdb_txn_id(MDB_txn *txn); + + /** @brief Commit all the operations of a transaction into the database. + * + * The transaction handle is freed. It and its cursors must not be used + * again after this call, except with #mdb_cursor_renew(). + * @note Earlier documentation incorrectly said all cursors would be freed. + * Only write-transactions free cursors. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>EINVAL - an invalid parameter was specified. + * <li>ENOSPC - no more disk space. + * <li>EIO - a low-level I/O error occurred while writing. + * <li>ENOMEM - out of memory. + * </ul> + */ +int mdb_txn_commit(MDB_txn *txn); + + /** @brief Abandon all the operations of the transaction instead of saving them. + * + * The transaction handle is freed. It and its cursors must not be used + * again after this call, except with #mdb_cursor_renew(). + * @note Earlier documentation incorrectly said all cursors would be freed. + * Only write-transactions free cursors. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + */ +void mdb_txn_abort(MDB_txn *txn); + + /** @brief Reset a read-only transaction. + * + * Abort the transaction like #mdb_txn_abort(), but keep the transaction + * handle. #mdb_txn_renew() may reuse the handle. This saves allocation + * overhead if the process will start a new read-only transaction soon, + * and also locking overhead if #MDB_NOTLS is in use. The reader table + * lock is released, but the table slot stays tied to its thread or + * #MDB_txn. Use mdb_txn_abort() to discard a reset handle, and to free + * its lock table slot if MDB_NOTLS is in use. + * Cursors opened within the transaction must not be used + * again after this call, except with #mdb_cursor_renew(). + * Reader locks generally don't interfere with writers, but they keep old + * versions of database pages allocated. Thus they prevent the old pages + * from being reused when writers commit new data, and so under heavy load + * the database size may grow much more rapidly than otherwise. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + */ +void mdb_txn_reset(MDB_txn *txn); + + /** @brief Renew a read-only transaction. + * + * This acquires a new reader lock for a transaction handle that had been + * released by #mdb_txn_reset(). It must be called before a reset transaction + * may be used again. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>#MDB_PANIC - a fatal error occurred earlier and the environment + * must be shut down. + * <li>EINVAL - an invalid parameter was specified. + * </ul> + */ +int mdb_txn_renew(MDB_txn *txn); + +/** Compat with version <= 0.9.4, avoid clash with libmdb from MDB Tools project */ +#define mdb_open(txn,name,flags,dbi) mdb_dbi_open(txn,name,flags,dbi) +/** Compat with version <= 0.9.4, avoid clash with libmdb from MDB Tools project */ +#define mdb_close(env,dbi) mdb_dbi_close(env,dbi) + + /** @brief Open a database in the environment. + * + * A database handle denotes the name and parameters of a database, + * independently of whether such a database exists. + * The database handle may be discarded by calling #mdb_dbi_close(). + * The old database handle is returned if the database was already open. + * The handle may only be closed once. + * + * The database handle will be private to the current transaction until + * the transaction is successfully committed. If the transaction is + * aborted the handle will be closed automatically. + * After a successful commit the handle will reside in the shared + * environment, and may be used by other transactions. + * + * This function must not be called from multiple concurrent + * transactions in the same process. A transaction that uses + * this function must finish (either commit or abort) before + * any other transaction in the process may use this function. + * + * To use named databases (with name != NULL), #mdb_env_set_maxdbs() + * must be called before opening the environment. Database names are + * keys in the unnamed database, and may be read but not written. + * + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] name The name of the database to open. If only a single + * database is needed in the environment, this value may be NULL. + * @param[in] flags Special options for this database. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + * <ul> + * <li>#MDB_REVERSEKEY + * Keys are strings to be compared in reverse order, from the end + * of the strings to the beginning. By default, Keys are treated as strings and + * compared from beginning to end. + * <li>#MDB_DUPSORT + * Duplicate keys may be used in the database. (Or, from another perspective, + * keys may have multiple data items, stored in sorted order.) By default + * keys must be unique and may have only a single data item. + * <li>#MDB_INTEGERKEY + * Keys are binary integers in native byte order, either unsigned int + * or size_t, and will be sorted as such. + * The keys must all be of the same size. + * <li>#MDB_DUPFIXED + * This flag may only be used in combination with #MDB_DUPSORT. This option + * tells the library that the data items for this database are all the same + * size, which allows further optimizations in storage and retrieval. When + * all data items are the same size, the #MDB_GET_MULTIPLE, #MDB_NEXT_MULTIPLE + * and #MDB_PREV_MULTIPLE cursor operations may be used to retrieve multiple + * items at once. + * <li>#MDB_INTEGERDUP + * This option specifies that duplicate data items are binary integers, + * similar to #MDB_INTEGERKEY keys. + * <li>#MDB_REVERSEDUP + * This option specifies that duplicate data items should be compared as + * strings in reverse order. + * <li>#MDB_CREATE + * Create the named database if it doesn't exist. This option is not + * allowed in a read-only transaction or a read-only environment. + * </ul> + * @param[out] dbi Address where the new #MDB_dbi handle will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>#MDB_NOTFOUND - the specified database doesn't exist in the environment + * and #MDB_CREATE was not specified. + * <li>#MDB_DBS_FULL - too many databases have been opened. See #mdb_env_set_maxdbs(). + * </ul> + */ +int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi); + + /** @brief Retrieve statistics for a database. + * + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[out] stat The address of an #MDB_stat structure + * where the statistics will be copied + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>EINVAL - an invalid parameter was specified. + * </ul> + */ +int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *stat); + + /** @brief Retrieve the DB flags for a database handle. + * + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[out] flags Address where the flags will be returned. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags); + + /** @brief Close a database handle. Normally unnecessary. Use with care: + * + * This call is not mutex protected. Handles should only be closed by + * a single thread, and only if no other threads are going to reference + * the database handle or one of its cursors any further. Do not close + * a handle if an existing transaction has modified its database. + * Doing so can cause misbehavior from database corruption to errors + * like MDB_BAD_VALSIZE (since the DB name is gone). + * + * Closing a database handle is not necessary, but lets #mdb_dbi_open() + * reuse the handle value. Usually it's better to set a bigger + * #mdb_env_set_maxdbs(), unless that value would be large. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + */ +void mdb_dbi_close(MDB_env *env, MDB_dbi dbi); + + /** @brief Empty or delete+close a database. + * + * See #mdb_dbi_close() for restrictions about closing the DB handle. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] del 0 to empty the DB, 1 to delete it from the + * environment and close the DB handle. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del); + + /** @brief Set a custom key comparison function for a database. + * + * The comparison function is called whenever it is necessary to compare a + * key specified by the application with a key currently stored in the database. + * If no comparison function is specified, and no special key flags were specified + * with #mdb_dbi_open(), the keys are compared lexically, with shorter keys collating + * before longer keys. + * @warning This function must be called before any data access functions are used, + * otherwise data corruption may occur. The same comparison function must be used by every + * program accessing the database, every time the database is used. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] cmp A #MDB_cmp_func function + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>EINVAL - an invalid parameter was specified. + * </ul> + */ +int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); + + /** @brief Set a custom data comparison function for a #MDB_DUPSORT database. + * + * This comparison function is called whenever it is necessary to compare a data + * item specified by the application with a data item currently stored in the database. + * This function only takes effect if the database was opened with the #MDB_DUPSORT + * flag. + * If no comparison function is specified, and no special key flags were specified + * with #mdb_dbi_open(), the data items are compared lexically, with shorter items collating + * before longer items. + * @warning This function must be called before any data access functions are used, + * otherwise data corruption may occur. The same comparison function must be used by every + * program accessing the database, every time the database is used. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] cmp A #MDB_cmp_func function + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>EINVAL - an invalid parameter was specified. + * </ul> + */ +int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); + + /** @brief Set a relocation function for a #MDB_FIXEDMAP database. + * + * @todo The relocation function is called whenever it is necessary to move the data + * of an item to a different position in the database (e.g. through tree + * balancing operations, shifts as a result of adds or deletes, etc.). It is + * intended to allow address/position-dependent data items to be stored in + * a database in an environment opened with the #MDB_FIXEDMAP option. + * Currently the relocation feature is unimplemented and setting + * this function has no effect. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] rel A #MDB_rel_func function + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>EINVAL - an invalid parameter was specified. + * </ul> + */ +int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel); + + /** @brief Set a context pointer for a #MDB_FIXEDMAP database's relocation function. + * + * See #mdb_set_relfunc and #MDB_rel_func for more details. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] ctx An arbitrary pointer for whatever the application needs. + * It will be passed to the callback function set by #mdb_set_relfunc + * as its \b relctx parameter whenever the callback is invoked. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>EINVAL - an invalid parameter was specified. + * </ul> + */ +int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx); + + /** @brief Get items from a database. + * + * This function retrieves key/data pairs from the database. The address + * and length of the data associated with the specified \b key are returned + * in the structure to which \b data refers. + * If the database supports duplicate keys (#MDB_DUPSORT) then the + * first data item for the key will be returned. Retrieval of other + * items requires the use of #mdb_cursor_get(). + * + * @note The memory pointed to by the returned values is owned by the + * database. The caller need not dispose of the memory, and may not + * modify it in any way. For values returned in a read-only transaction + * any modification attempts will cause a SIGSEGV. + * @note Values returned from the database are valid only until a + * subsequent update operation, or the end of the transaction. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] key The key to search for in the database + * @param[out] data The data corresponding to the key + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>#MDB_NOTFOUND - the key was not in the database. + * <li>EINVAL - an invalid parameter was specified. + * </ul> + */ +int mdb_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); + + /** @brief Store items into a database. + * + * This function stores key/data pairs in the database. The default behavior + * is to enter the new key/data pair, replacing any previously existing key + * if duplicates are disallowed, or adding a duplicate data item if + * duplicates are allowed (#MDB_DUPSORT). + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] key The key to store in the database + * @param[in,out] data The data to store + * @param[in] flags Special options for this operation. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + * <ul> + * <li>#MDB_NODUPDATA - enter the new key/data pair only if it does not + * already appear in the database. This flag may only be specified + * if the database was opened with #MDB_DUPSORT. The function will + * return #MDB_KEYEXIST if the key/data pair already appears in the + * database. + * <li>#MDB_NOOVERWRITE - enter the new key/data pair only if the key + * does not already appear in the database. The function will return + * #MDB_KEYEXIST if the key already appears in the database, even if + * the database supports duplicates (#MDB_DUPSORT). The \b data + * parameter will be set to point to the existing item. + * <li>#MDB_RESERVE - reserve space for data of the given size, but + * don't copy the given data. Instead, return a pointer to the + * reserved space, which the caller can fill in later - before + * the next update operation or the transaction ends. This saves + * an extra memcpy if the data is being generated later. + * LMDB does nothing else with this memory, the caller is expected + * to modify all of the space requested. This flag must not be + * specified if the database was opened with #MDB_DUPSORT. + * <li>#MDB_APPEND - append the given key/data pair to the end of the + * database. This option allows fast bulk loading when keys are + * already known to be in the correct order. Loading unsorted keys + * with this flag will cause a #MDB_KEYEXIST error. + * <li>#MDB_APPENDDUP - as above, but for sorted dup data. + * </ul> + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>#MDB_MAP_FULL - the database is full, see #mdb_env_set_mapsize(). + * <li>#MDB_TXN_FULL - the transaction has too many dirty pages. + * <li>EACCES - an attempt was made to write in a read-only transaction. + * <li>EINVAL - an invalid parameter was specified. + * </ul> + */ +int mdb_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, + unsigned int flags); + + /** @brief Delete items from a database. + * + * This function removes key/data pairs from the database. + * If the database does not support sorted duplicate data items + * (#MDB_DUPSORT) the data parameter is ignored. + * If the database supports sorted duplicates and the data parameter + * is NULL, all of the duplicate data items for the key will be + * deleted. Otherwise, if the data parameter is non-NULL + * only the matching data item will be deleted. + * This function will return #MDB_NOTFOUND if the specified key/data + * pair is not in the database. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] key The key to delete from the database + * @param[in] data The data to delete + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>EACCES - an attempt was made to write in a read-only transaction. + * <li>EINVAL - an invalid parameter was specified. + * </ul> + */ +int mdb_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); + + /** @brief Create a cursor handle. + * + * A cursor is associated with a specific transaction and database. + * A cursor cannot be used when its database handle is closed. Nor + * when its transaction has ended, except with #mdb_cursor_renew(). + * It can be discarded with #mdb_cursor_close(). + * A cursor in a write-transaction can be closed before its transaction + * ends, and will otherwise be closed when its transaction ends. + * A cursor in a read-only transaction must be closed explicitly, before + * or after its transaction ends. It can be reused with + * #mdb_cursor_renew() before finally closing it. + * @note Earlier documentation said that cursors in every transaction + * were closed when the transaction committed or aborted. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[out] cursor Address where the new #MDB_cursor handle will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>EINVAL - an invalid parameter was specified. + * </ul> + */ +int mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **cursor); + + /** @brief Close a cursor handle. + * + * The cursor handle will be freed and must not be used again after this call. + * Its transaction must still be live if it is a write-transaction. + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + */ +void mdb_cursor_close(MDB_cursor *cursor); + + /** @brief Renew a cursor handle. + * + * A cursor is associated with a specific transaction and database. + * Cursors that are only used in read-only + * transactions may be re-used, to avoid unnecessary malloc/free overhead. + * The cursor may be associated with a new read-only transaction, and + * referencing the same database handle as it was created with. + * This may be done whether the previous transaction is live or dead. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>EINVAL - an invalid parameter was specified. + * </ul> + */ +int mdb_cursor_renew(MDB_txn *txn, MDB_cursor *cursor); + + /** @brief Return the cursor's transaction handle. + * + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + */ +MDB_txn *mdb_cursor_txn(MDB_cursor *cursor); + + /** @brief Return the cursor's database handle. + * + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + */ +MDB_dbi mdb_cursor_dbi(MDB_cursor *cursor); + + /** @brief Retrieve by cursor. + * + * This function retrieves key/data pairs from the database. The address and length + * of the key are returned in the object to which \b key refers (except for the + * case of the #MDB_SET option, in which the \b key object is unchanged), and + * the address and length of the data are returned in the object to which \b data + * refers. + * See #mdb_get() for restrictions on using the output values. + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + * @param[in,out] key The key for a retrieved item + * @param[in,out] data The data of a retrieved item + * @param[in] op A cursor operation #MDB_cursor_op + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>#MDB_NOTFOUND - no matching key found. + * <li>EINVAL - an invalid parameter was specified. + * </ul> + */ +int mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, + MDB_cursor_op op); + + /** @brief Store by cursor. + * + * This function stores key/data pairs into the database. + * The cursor is positioned at the new item, or on failure usually near it. + * @note Earlier documentation incorrectly said errors would leave the + * state of the cursor unchanged. + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + * @param[in] key The key operated on. + * @param[in] data The data operated on. + * @param[in] flags Options for this operation. This parameter + * must be set to 0 or one of the values described here. + * <ul> + * <li>#MDB_CURRENT - replace the item at the current cursor position. + * The \b key parameter must still be provided, and must match it. + * If using sorted duplicates (#MDB_DUPSORT) the data item must still + * sort into the same place. This is intended to be used when the + * new data is the same size as the old. Otherwise it will simply + * perform a delete of the old record followed by an insert. + * <li>#MDB_NODUPDATA - enter the new key/data pair only if it does not + * already appear in the database. This flag may only be specified + * if the database was opened with #MDB_DUPSORT. The function will + * return #MDB_KEYEXIST if the key/data pair already appears in the + * database. + * <li>#MDB_NOOVERWRITE - enter the new key/data pair only if the key + * does not already appear in the database. The function will return + * #MDB_KEYEXIST if the key already appears in the database, even if + * the database supports duplicates (#MDB_DUPSORT). + * <li>#MDB_RESERVE - reserve space for data of the given size, but + * don't copy the given data. Instead, return a pointer to the + * reserved space, which the caller can fill in later - before + * the next update operation or the transaction ends. This saves + * an extra memcpy if the data is being generated later. This flag + * must not be specified if the database was opened with #MDB_DUPSORT. + * <li>#MDB_APPEND - append the given key/data pair to the end of the + * database. No key comparisons are performed. This option allows + * fast bulk loading when keys are already known to be in the + * correct order. Loading unsorted keys with this flag will cause + * a #MDB_KEYEXIST error. + * <li>#MDB_APPENDDUP - as above, but for sorted dup data. + * <li>#MDB_MULTIPLE - store multiple contiguous data elements in a + * single request. This flag may only be specified if the database + * was opened with #MDB_DUPFIXED. The \b data argument must be an + * array of two MDB_vals. The mv_size of the first MDB_val must be + * the size of a single data element. The mv_data of the first MDB_val + * must point to the beginning of the array of contiguous data elements. + * The mv_size of the second MDB_val must be the count of the number + * of data elements to store. On return this field will be set to + * the count of the number of elements actually written. The mv_data + * of the second MDB_val is unused. + * </ul> + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>#MDB_MAP_FULL - the database is full, see #mdb_env_set_mapsize(). + * <li>#MDB_TXN_FULL - the transaction has too many dirty pages. + * <li>EACCES - an attempt was made to write in a read-only transaction. + * <li>EINVAL - an invalid parameter was specified. + * </ul> + */ +int mdb_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, + unsigned int flags); + + /** @brief Delete current key/data pair + * + * This function deletes the key/data pair to which the cursor refers. + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + * @param[in] flags Options for this operation. This parameter + * must be set to 0 or one of the values described here. + * <ul> + * <li>#MDB_NODUPDATA - delete all of the data items for the current key. + * This flag may only be specified if the database was opened with #MDB_DUPSORT. + * </ul> + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>EACCES - an attempt was made to write in a read-only transaction. + * <li>EINVAL - an invalid parameter was specified. + * </ul> + */ +int mdb_cursor_del(MDB_cursor *cursor, unsigned int flags); + + /** @brief Return count of duplicates for current key. + * + * This call is only valid on databases that support sorted duplicate + * data items #MDB_DUPSORT. + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + * @param[out] countp Address where the count will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + * <ul> + * <li>EINVAL - cursor is not initialized, or an invalid parameter was specified. + * </ul> + */ +int mdb_cursor_count(MDB_cursor *cursor, size_t *countp); + + /** @brief Compare two data items according to a particular database. + * + * This returns a comparison as if the two data items were keys in the + * specified database. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] a The first item to compare + * @param[in] b The second item to compare + * @return < 0 if a < b, 0 if a == b, > 0 if a > b + */ +int mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); + + /** @brief Compare two data items according to a particular database. + * + * This returns a comparison as if the two items were data items of + * the specified database. The database must have the #MDB_DUPSORT flag. + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + * @param[in] a The first item to compare + * @param[in] b The second item to compare + * @return < 0 if a < b, 0 if a == b, > 0 if a > b + */ +int mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); + + /** @brief A callback function used to print a message from the library. + * + * @param[in] msg The string to be printed. + * @param[in] ctx An arbitrary context pointer for the callback. + * @return < 0 on failure, >= 0 on success. + */ +typedef int (MDB_msg_func)(const char *msg, void *ctx); + + /** @brief Dump the entries in the reader lock table. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] func A #MDB_msg_func function + * @param[in] ctx Anything the message function needs + * @return < 0 on failure, >= 0 on success. + */ +int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx); + + /** @brief Check for stale entries in the reader lock table. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @param[out] dead Number of stale slots that were cleared + * @return 0 on success, non-zero on failure. + */ +int mdb_reader_check(MDB_env *env, int *dead); +/** @} */ + +#ifdef __cplusplus +} +#endif +/** @page tools LMDB Command Line Tools + The following describes the command line tools that are available for LMDB. + \li \ref mdb_copy_1 + \li \ref mdb_dump_1 + \li \ref mdb_load_1 + \li \ref mdb_stat_1 +*/ + +#endif /* _LMDB_H_ */ diff --git a/contrib/lmdb/mdb.c b/contrib/lmdb/mdb.c new file mode 100644 index 0000000..f225a19 --- /dev/null +++ b/contrib/lmdb/mdb.c @@ -0,0 +1,10258 @@ +#pragma GCC diagnostic ignored "-Wshadow" +/** @file mdb.c + * @brief Lightning memory-mapped database library + * + * A Btree-based database management library modeled loosely on the + * BerkeleyDB API, but much simplified. + */ +/* + * Copyright 2011-2017 Howard Chu, Symas Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * <http://www.OpenLDAP.org/license.html>. + * + * This code is derived from btree.c written by Martin Hedenfalk. + * + * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif +#if defined(__WIN64__) +#define _FILE_OFFSET_BITS 64 +#endif +#ifdef _WIN32 +#include <malloc.h> +#include <windows.h> +#include <wchar.h> /* get wcscpy() */ + +/** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it + * as int64 which is wrong. MSVC doesn't define it at all, so just + * don't use it. + */ +#define MDB_PID_T int +#define MDB_THR_T DWORD +#include <sys/types.h> +#include <sys/stat.h> +#ifdef __GNUC__ +# include <sys/param.h> +#else +# define LITTLE_ENDIAN 1234 +# define BIG_ENDIAN 4321 +# define BYTE_ORDER LITTLE_ENDIAN +# ifndef SSIZE_MAX +# define SSIZE_MAX INT_MAX +# endif +#endif +#else +#include <sys/types.h> +#include <sys/stat.h> +#define MDB_PID_T pid_t +#define MDB_THR_T pthread_t +#include <sys/param.h> +#include <sys/uio.h> +#include <sys/mman.h> +#ifdef HAVE_SYS_FILE_H +#include <sys/file.h> +#endif +#include <fcntl.h> +#endif + +#if defined(__mips) && defined(__linux) +/* MIPS has cache coherency issues, requires explicit cache control */ +#include <asm/cachectl.h> +extern int cacheflush(char *addr, int nbytes, int cache); +#define CACHEFLUSH(addr, bytes, cache) cacheflush(addr, bytes, cache) +#else +#define CACHEFLUSH(addr, bytes, cache) +#endif + +#if defined(__linux) && !defined(MDB_FDATASYNC_WORKS) +/** fdatasync is broken on ext3/ext4fs on older kernels, see + * description in #mdb_env_open2 comments. You can safely + * define MDB_FDATASYNC_WORKS if this code will only be run + * on kernels 3.6 and newer. + */ +#define BROKEN_FDATASYNC +#endif + +#include <errno.h> +#include <limits.h> +#include <stddef.h> +#include <inttypes.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +#ifdef _MSC_VER +#include <io.h> +typedef SSIZE_T ssize_t; +#else +#include <unistd.h> +#endif + +#if defined(__sun) || defined(ANDROID) +/* Most platforms have posix_memalign, older may only have memalign */ +#define HAVE_MEMALIGN 1 +#include <malloc.h> +#endif + +#if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) +#include <netinet/in.h> +#include <resolv.h> /* defines BYTE_ORDER on HPUX and Solaris */ +#endif + +#if defined(__APPLE__) || defined (BSD) || defined(__FreeBSD_kernel__) +# define MDB_USE_POSIX_SEM 1 +# define MDB_FDATASYNC fsync +#elif defined(ANDROID) +# define MDB_FDATASYNC fsync +#endif + +#ifndef _WIN32 +#include <pthread.h> +#include <signal.h> +#ifdef MDB_USE_POSIX_SEM +# define MDB_USE_HASH 1 +#include <semaphore.h> +#else +#define MDB_USE_POSIX_MUTEX 1 +#endif +#endif + +#if defined(_WIN32) + defined(MDB_USE_POSIX_SEM) \ + + defined(MDB_USE_POSIX_MUTEX) != 1 +# error "Ambiguous shared-lock implementation" +#endif + +#ifdef USE_VALGRIND +#include <valgrind/memcheck.h> +#define VGMEMP_CREATE(h,r,z) VALGRIND_CREATE_MEMPOOL(h,r,z) +#define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s) +#define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a) +#define VGMEMP_DESTROY(h) VALGRIND_DESTROY_MEMPOOL(h) +#define VGMEMP_DEFINED(a,s) VALGRIND_MAKE_MEM_DEFINED(a,s) +#else +#define VGMEMP_CREATE(h,r,z) +#define VGMEMP_ALLOC(h,a,s) +#define VGMEMP_FREE(h,a) +#define VGMEMP_DESTROY(h) +#define VGMEMP_DEFINED(a,s) +#endif + +#ifndef BYTE_ORDER +# if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)) +/* Solaris just defines one or the other */ +# define LITTLE_ENDIAN 1234 +# define BIG_ENDIAN 4321 +# ifdef _LITTLE_ENDIAN +# define BYTE_ORDER LITTLE_ENDIAN +# else +# define BYTE_ORDER BIG_ENDIAN +# endif +# else +# define BYTE_ORDER __BYTE_ORDER +# endif +#endif + +#ifndef LITTLE_ENDIAN +#define LITTLE_ENDIAN __LITTLE_ENDIAN +#endif +#ifndef BIG_ENDIAN +#define BIG_ENDIAN __BIG_ENDIAN +#endif + +#if defined(__i386) || defined(__x86_64) || defined(_M_IX86) +#define MISALIGNED_OK 1 +#endif + +#include "lmdb.h" +#include "midl.h" + +#if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN) +# error "Unknown or unsupported endianness (BYTE_ORDER)" +#elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF +# error "Two's complement, reasonably sized integer types, please" +#endif + +#ifdef __GNUC__ +/** Put infrequently used env functions in separate section */ +# ifdef __APPLE__ +# define ESECT __attribute__ ((section("__TEXT,text_env"))) +# else +# define ESECT __attribute__ ((section("text_env"))) +# endif +#else +#define ESECT +#endif + +#ifdef _WIN32 +#define CALL_CONV WINAPI +#else +#define CALL_CONV +#endif + +/** @defgroup internal LMDB Internals + * @{ + */ +/** @defgroup compat Compatibility Macros + * A bunch of macros to minimize the amount of platform-specific ifdefs + * needed throughout the rest of the code. When the features this library + * needs are similar enough to POSIX to be hidden in a one-or-two line + * replacement, this macro approach is used. + * @{ + */ + + /** Features under development */ +#ifndef MDB_DEVEL +#define MDB_DEVEL 0 +#endif + + /** Wrapper around __func__, which is a C99 feature */ +#if __STDC_VERSION__ >= 199901L +# define mdb_func_ __func__ +#elif __GNUC__ >= 2 || _MSC_VER >= 1300 +# define mdb_func_ __FUNCTION__ +#else +/* If a debug message says <mdb_unknown>(), update the #if statements above */ +# define mdb_func_ "<mdb_unknown>" +#endif + +/* Internal error codes, not exposed outside liblmdb */ +#define MDB_NO_ROOT (MDB_LAST_ERRCODE + 10) +#ifdef _WIN32 +#define MDB_OWNERDEAD ((int) WAIT_ABANDONED) +#elif defined(MDB_USE_POSIX_MUTEX) && defined(EOWNERDEAD) +#define MDB_OWNERDEAD EOWNERDEAD /**< #LOCK_MUTEX0() result if dead owner */ +#endif + +#ifdef __GLIBC__ +#define GLIBC_VER ((__GLIBC__ << 16 )| __GLIBC_MINOR__) +#endif +/** Some platforms define the EOWNERDEAD error code + * even though they don't support Robust Mutexes. + * Compile with -DMDB_USE_ROBUST=0, or use some other + * mechanism like -DMDB_USE_POSIX_SEM instead of + * -DMDB_USE_POSIX_MUTEX. + * (Posix semaphores are not robust.) + */ +#ifndef MDB_USE_ROBUST +/* Android currently lacks Robust Mutex support. So does glibc < 2.4. */ +# if defined(MDB_USE_POSIX_MUTEX) && (defined(ANDROID) || \ + (defined(__GLIBC__) && GLIBC_VER < 0x020004)) +# define MDB_USE_ROBUST 0 +# else +# define MDB_USE_ROBUST 1 +# endif +#endif /* !MDB_USE_ROBUST */ + +#if defined(MDB_USE_POSIX_MUTEX) && (MDB_USE_ROBUST) +/* glibc < 2.12 only provided _np API */ +# if (defined(__GLIBC__) && GLIBC_VER < 0x02000c) || \ + (defined(PTHREAD_MUTEX_ROBUST_NP) && !defined(PTHREAD_MUTEX_ROBUST)) +# define PTHREAD_MUTEX_ROBUST PTHREAD_MUTEX_ROBUST_NP +# define pthread_mutexattr_setrobust(attr, flag) pthread_mutexattr_setrobust_np(attr, flag) +# define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex) +# endif +#endif /* MDB_USE_POSIX_MUTEX && MDB_USE_ROBUST */ + +#if defined(MDB_OWNERDEAD) && (MDB_USE_ROBUST) +#define MDB_ROBUST_SUPPORTED 1 +#endif + +#ifdef _WIN32 +#define MDB_USE_HASH 1 +#define MDB_PIDLOCK 0 +#define THREAD_RET DWORD +#define pthread_t HANDLE +#define pthread_mutex_t HANDLE +#define pthread_cond_t HANDLE +typedef HANDLE mdb_mutex_t, mdb_mutexref_t; +#define pthread_key_t DWORD +#define pthread_self() GetCurrentThreadId() +#define pthread_key_create(x,y) \ + ((*(x) = TlsAlloc()) == TLS_OUT_OF_INDEXES ? ErrCode() : 0) +#define pthread_key_delete(x) TlsFree(x) +#define pthread_getspecific(x) TlsGetValue(x) +#define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode()) +#define pthread_mutex_unlock(x) ReleaseMutex(*x) +#define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE) +#define pthread_cond_signal(x) SetEvent(*x) +#define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0) +#define THREAD_CREATE(thr,start,arg) \ + (((thr) = CreateThread(NULL, 0, start, arg, 0, NULL)) ? 0 : ErrCode()) +#define THREAD_FINISH(thr) \ + (WaitForSingleObject(thr, INFINITE) ? ErrCode() : 0) +#define LOCK_MUTEX0(mutex) WaitForSingleObject(mutex, INFINITE) +#define UNLOCK_MUTEX(mutex) ReleaseMutex(mutex) +#define mdb_mutex_consistent(mutex) 0 +#define getpid() GetCurrentProcessId() +#define MDB_FDATASYNC(fd) (!FlushFileBuffers(fd)) +#define MDB_MSYNC(addr,len,flags) (!FlushViewOfFile(addr,len)) +#define ErrCode() GetLastError() +#define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;} +#define close(fd) (CloseHandle(fd) ? 0 : -1) +#define munmap(ptr,len) UnmapViewOfFile(ptr) +#ifdef PROCESS_QUERY_LIMITED_INFORMATION +#define MDB_PROCESS_QUERY_LIMITED_INFORMATION PROCESS_QUERY_LIMITED_INFORMATION +#else +#define MDB_PROCESS_QUERY_LIMITED_INFORMATION 0x1000 +#endif +#define Z "I" +#else +#define THREAD_RET void * +#define THREAD_CREATE(thr,start,arg) pthread_create(&thr,NULL,start,arg) +#define THREAD_FINISH(thr) pthread_join(thr,NULL) +#define Z "z" /**< printf format modifier for size_t */ + + /** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */ +#define MDB_PIDLOCK 1 + +#ifdef MDB_USE_POSIX_SEM + +typedef sem_t *mdb_mutex_t, *mdb_mutexref_t; +#define LOCK_MUTEX0(mutex) mdb_sem_wait(mutex) +#define UNLOCK_MUTEX(mutex) sem_post(mutex) + +static int +mdb_sem_wait(sem_t *sem) +{ + int rc; + while ((rc = sem_wait(sem)) && (rc = errno) == EINTR) ; + return rc; +} + +#else /* MDB_USE_POSIX_MUTEX: */ + /** Shared mutex/semaphore as the original is stored. + * + * Not for copies. Instead it can be assigned to an #mdb_mutexref_t. + * When mdb_mutexref_t is a pointer and mdb_mutex_t is not, then it + * is array[size 1] so it can be assigned to the pointer. + */ +typedef pthread_mutex_t mdb_mutex_t[1]; + /** Reference to an #mdb_mutex_t */ +typedef pthread_mutex_t *mdb_mutexref_t; + /** Lock the reader or writer mutex. + * Returns 0 or a code to give #mdb_mutex_failed(), as in #LOCK_MUTEX(). + */ +#define LOCK_MUTEX0(mutex) pthread_mutex_lock(mutex) + /** Unlock the reader or writer mutex. + */ +#define UNLOCK_MUTEX(mutex) pthread_mutex_unlock(mutex) + /** Mark mutex-protected data as repaired, after death of previous owner. + */ +#define mdb_mutex_consistent(mutex) pthread_mutex_consistent(mutex) +#endif /* MDB_USE_POSIX_SEM */ + + /** Get the error code for the last failed system function. + */ +#define ErrCode() errno + + /** An abstraction for a file handle. + * On POSIX systems file handles are small integers. On Windows + * they're opaque pointers. + */ +#define HANDLE int + + /** A value for an invalid file handle. + * Mainly used to initialize file variables and signify that they are + * unused. + */ +#define INVALID_HANDLE_VALUE (-1) + + /** Get the size of a memory page for the system. + * This is the basic size that the platform's memory manager uses, and is + * fundamental to the use of memory-mapped files. + */ +#define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE)) +#endif + +#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) +#define MNAME_LEN 32 +#else +#define MNAME_LEN (sizeof(pthread_mutex_t)) +#endif + +/** @} */ + +#ifdef MDB_ROBUST_SUPPORTED + /** Lock mutex, handle any error, set rc = result. + * Return 0 on success, nonzero (not rc) on error. + */ +#define LOCK_MUTEX(rc, env, mutex) \ + (((rc) = LOCK_MUTEX0(mutex)) && \ + ((rc) = mdb_mutex_failed(env, mutex, rc))) +static int mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc); +#else +#define LOCK_MUTEX(rc, env, mutex) ((rc) = LOCK_MUTEX0(mutex)) +#define mdb_mutex_failed(env, mutex, rc) (rc) +#endif + +#ifndef _WIN32 +/** A flag for opening a file and requesting synchronous data writes. + * This is only used when writing a meta page. It's not strictly needed; + * we could just do a normal write and then immediately perform a flush. + * But if this flag is available it saves us an extra system call. + * + * @note If O_DSYNC is undefined but exists in /usr/include, + * preferably set some compiler flag to get the definition. + */ +#ifndef MDB_DSYNC +# ifdef O_DSYNC +# define MDB_DSYNC O_DSYNC +# else +# define MDB_DSYNC O_SYNC +# endif +#endif +#endif + +/** Function for flushing the data of a file. Define this to fsync + * if fdatasync() is not supported. + */ +#ifndef MDB_FDATASYNC +# define MDB_FDATASYNC fdatasync +#endif + +#ifndef MDB_MSYNC +# define MDB_MSYNC(addr,len,flags) msync(addr,len,flags) +#endif + +#ifndef MS_SYNC +#define MS_SYNC 1 +#endif + +#ifndef MS_ASYNC +#define MS_ASYNC 0 +#endif + + /** A page number in the database. + * Note that 64 bit page numbers are overkill, since pages themselves + * already represent 12-13 bits of addressable memory, and the OS will + * always limit applications to a maximum of 63 bits of address space. + * + * @note In the #MDB_node structure, we only store 48 bits of this value, + * which thus limits us to only 60 bits of addressable data. + */ +typedef MDB_ID pgno_t; + + /** A transaction ID. + * See struct MDB_txn.mt_txnid for details. + */ +typedef MDB_ID txnid_t; + +/** @defgroup debug Debug Macros + * @{ + */ +#ifndef MDB_DEBUG + /** Enable debug output. Needs variable argument macros (a C99 feature). + * Set this to 1 for copious tracing. Set to 2 to add dumps of all IDLs + * read from and written to the database (used for free space management). + */ +#define MDB_DEBUG 0 +#endif + +#if MDB_DEBUG +static int mdb_debug; +static txnid_t mdb_debug_start; + + /** Print a debug message with printf formatting. + * Requires double parenthesis around 2 or more args. + */ +# define DPRINTF(args) ((void) ((mdb_debug) && DPRINTF0 args)) +# define DPRINTF0(fmt, ...) \ + fprintf(stderr, "%s:%d " fmt "\n", mdb_func_, __LINE__, __VA_ARGS__) +#else +# define DPRINTF(args) ((void) 0) +#endif + /** Print a debug string. + * The string is printed literally, with no format processing. + */ +#define DPUTS(arg) DPRINTF(("%s", arg)) + /** Debuging output value of a cursor DBI: Negative in a sub-cursor. */ +#define DDBI(mc) \ + (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) +/** @} */ + + /** @brief The maximum size of a database page. + * + * It is 32k or 64k, since value-PAGEBASE must fit in + * #MDB_page.%mp_upper. + * + * LMDB will use database pages < OS pages if needed. + * That causes more I/O in write transactions: The OS must + * know (read) the whole page before writing a partial page. + * + * Note that we don't currently support Huge pages. On Linux, + * regular data files cannot use Huge pages, and in general + * Huge pages aren't actually pageable. We rely on the OS + * demand-pager to read our data and page it out when memory + * pressure from other processes is high. So until OSs have + * actual paging support for Huge pages, they're not viable. + */ +#define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000) + + /** The minimum number of keys required in a database page. + * Setting this to a larger value will place a smaller bound on the + * maximum size of a data item. Data items larger than this size will + * be pushed into overflow pages instead of being stored directly in + * the B-tree node. This value used to default to 4. With a page size + * of 4096 bytes that meant that any item larger than 1024 bytes would + * go into an overflow page. That also meant that on average 2-3KB of + * each overflow page was wasted space. The value cannot be lower than + * 2 because then there would no longer be a tree structure. With this + * value, items larger than 2KB will go into overflow pages, and on + * average only 1KB will be wasted. + */ +#define MDB_MINKEYS 2 + + /** A stamp that identifies a file as an LMDB file. + * There's nothing special about this value other than that it is easily + * recognizable, and it will reflect any byte order mismatches. + */ +#define MDB_MAGIC 0xBEEFC0DE + + /** The version number for a database's datafile format. */ +#define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1) + /** The version number for a database's lockfile format. */ +#define MDB_LOCK_VERSION 1 + + /** @brief The max size of a key we can write, or 0 for computed max. + * + * This macro should normally be left alone or set to 0. + * Note that a database with big keys or dupsort data cannot be + * reliably modified by a liblmdb which uses a smaller max. + * The default is 511 for backwards compat, or 0 when #MDB_DEVEL. + * + * Other values are allowed, for backwards compat. However: + * A value bigger than the computed max can break if you do not + * know what you are doing, and liblmdb <= 0.9.10 can break when + * modifying a DB with keys/dupsort data bigger than its max. + * + * Data items in an #MDB_DUPSORT database are also limited to + * this size, since they're actually keys of a sub-DB. Keys and + * #MDB_DUPSORT data items must fit on a node in a regular page. + */ +#ifndef MDB_MAXKEYSIZE +#define MDB_MAXKEYSIZE ((MDB_DEVEL) ? 0 : 511) +#endif + + /** The maximum size of a key we can write to the environment. */ +#if MDB_MAXKEYSIZE +#define ENV_MAXKEY(env) (MDB_MAXKEYSIZE) +#else +#define ENV_MAXKEY(env) ((env)->me_maxkey) +#endif + + /** @brief The maximum size of a data item. + * + * We only store a 32 bit value for node sizes. + */ +#define MAXDATASIZE 0xffffffffUL + +#if MDB_DEBUG + /** Key size which fits in a #DKBUF. + * @ingroup debug + */ +#define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511) + /** A key buffer. + * @ingroup debug + * This is used for printing a hex dump of a key's contents. + */ +#define DKBUF char kbuf[DKBUF_MAXKEYSIZE*2+1] + /** Display a key in hex. + * @ingroup debug + * Invoke a function to display a key in hex. + */ +#define DKEY(x) mdb_dkey(x, kbuf) +#else +#define DKBUF +#define DKEY(x) 0 +#endif + + /** An invalid page number. + * Mainly used to denote an empty tree. + */ +#define P_INVALID (~(pgno_t)0) + + /** Test if the flags \b f are set in a flag word \b w. */ +#define F_ISSET(w, f) (((w) & (f)) == (f)) + + /** Round \b n up to an even number. */ +#define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ + + /** Used for offsets within a single page. + * Since memory pages are typically 4 or 8KB in size, 12-13 bits, + * this is plenty. + */ +typedef uint16_t indx_t; + + /** Default size of memory map. + * This is certainly too small for any actual applications. Apps should always set + * the size explicitly using #mdb_env_set_mapsize(). + */ +#define DEFAULT_MAPSIZE 1048576 + +/** @defgroup readers Reader Lock Table + * Readers don't acquire any locks for their data access. Instead, they + * simply record their transaction ID in the reader table. The reader + * mutex is needed just to find an empty slot in the reader table. The + * slot's address is saved in thread-specific data so that subsequent read + * transactions started by the same thread need no further locking to proceed. + * + * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data. + * + * No reader table is used if the database is on a read-only filesystem, or + * if #MDB_NOLOCK is set. + * + * Since the database uses multi-version concurrency control, readers don't + * actually need any locking. This table is used to keep track of which + * readers are using data from which old transactions, so that we'll know + * when a particular old transaction is no longer in use. Old transactions + * that have discarded any data pages can then have those pages reclaimed + * for use by a later write transaction. + * + * The lock table is constructed such that reader slots are aligned with the + * processor's cache line size. Any slot is only ever used by one thread. + * This alignment guarantees that there will be no contention or cache + * thrashing as threads update their own slot info, and also eliminates + * any need for locking when accessing a slot. + * + * A writer thread will scan every slot in the table to determine the oldest + * outstanding reader transaction. Any freed pages older than this will be + * reclaimed by the writer. The writer doesn't use any locks when scanning + * this table. This means that there's no guarantee that the writer will + * see the most up-to-date reader info, but that's not required for correct + * operation - all we need is to know the upper bound on the oldest reader, + * we don't care at all about the newest reader. So the only consequence of + * reading stale information here is that old pages might hang around a + * while longer before being reclaimed. That's actually good anyway, because + * the longer we delay reclaiming old pages, the more likely it is that a + * string of contiguous pages can be found after coalescing old pages from + * many old transactions together. + * @{ + */ + /** Number of slots in the reader table. + * This value was chosen somewhat arbitrarily. 126 readers plus a + * couple mutexes fit exactly into 8KB on my development machine. + * Applications should set the table size using #mdb_env_set_maxreaders(). + */ +#define DEFAULT_READERS 126 + + /** The size of a CPU cache line in bytes. We want our lock structures + * aligned to this size to avoid false cache line sharing in the + * lock table. + * This value works for most CPUs. For Itanium this should be 128. + */ +#ifndef CACHELINE +#define CACHELINE 64 +#endif + + /** The information we store in a single slot of the reader table. + * In addition to a transaction ID, we also record the process and + * thread ID that owns a slot, so that we can detect stale information, + * e.g. threads or processes that went away without cleaning up. + * @note We currently don't check for stale records. We simply re-init + * the table when we know that we're the only process opening the + * lock file. + */ +typedef struct MDB_rxbody { + /** Current Transaction ID when this transaction began, or (txnid_t)-1. + * Multiple readers that start at the same time will probably have the + * same ID here. Again, it's not important to exclude them from + * anything; all we need to know is which version of the DB they + * started from so we can avoid overwriting any data used in that + * particular version. + */ + volatile txnid_t mrb_txnid; + /** The process ID of the process owning this reader txn. */ + volatile MDB_PID_T mrb_pid; + /** The thread ID of the thread owning this txn. */ + volatile MDB_THR_T mrb_tid; +} MDB_rxbody; + + /** The actual reader record, with cacheline padding. */ +typedef struct MDB_reader { + union { + MDB_rxbody mrx; + /** shorthand for mrb_txnid */ +#define mr_txnid mru.mrx.mrb_txnid +#define mr_pid mru.mrx.mrb_pid +#define mr_tid mru.mrx.mrb_tid + /** cache line alignment */ + char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)]; + } mru; +} MDB_reader; + + /** The header for the reader table. + * The table resides in a memory-mapped file. (This is a different file + * than is used for the main database.) + * + * For POSIX the actual mutexes reside in the shared memory of this + * mapped file. On Windows, mutexes are named objects allocated by the + * kernel; we store the mutex names in this mapped file so that other + * processes can grab them. This same approach is also used on + * MacOSX/Darwin (using named semaphores) since MacOSX doesn't support + * process-shared POSIX mutexes. For these cases where a named object + * is used, the object name is derived from a 64 bit FNV hash of the + * environment pathname. As such, naming collisions are extremely + * unlikely. If a collision occurs, the results are unpredictable. + */ +typedef struct MDB_txbody { + /** Stamp identifying this as an LMDB file. It must be set + * to #MDB_MAGIC. */ + uint32_t mtb_magic; + /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */ + uint32_t mtb_format; +#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) + char mtb_rmname[MNAME_LEN]; +#else + /** Mutex protecting access to this table. + * This is the reader table lock used with LOCK_MUTEX(). + */ + mdb_mutex_t mtb_rmutex; +#endif + /** The ID of the last transaction committed to the database. + * This is recorded here only for convenience; the value can always + * be determined by reading the main database meta pages. + */ + volatile txnid_t mtb_txnid; + /** The number of slots that have been used in the reader table. + * This always records the maximum count, it is not decremented + * when readers release their slots. + */ + volatile unsigned mtb_numreaders; +} MDB_txbody; + + /** The actual reader table definition. */ +typedef struct MDB_txninfo { + union { + MDB_txbody mtb; +#define mti_magic mt1.mtb.mtb_magic +#define mti_format mt1.mtb.mtb_format +#define mti_rmutex mt1.mtb.mtb_rmutex +#define mti_rmname mt1.mtb.mtb_rmname +#define mti_txnid mt1.mtb.mtb_txnid +#define mti_numreaders mt1.mtb.mtb_numreaders + char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)]; + } mt1; + union { +#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) + char mt2_wmname[MNAME_LEN]; +#define mti_wmname mt2.mt2_wmname +#else + mdb_mutex_t mt2_wmutex; +#define mti_wmutex mt2.mt2_wmutex +#endif + char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)]; + } mt2; + MDB_reader mti_readers[1]; +} MDB_txninfo; + + /** Lockfile format signature: version, features and field layout */ +#define MDB_LOCK_FORMAT \ + ((uint32_t) \ + ((MDB_LOCK_VERSION) \ + /* Flags which describe functionality */ \ + + (((MDB_PIDLOCK) != 0) << 16))) +/** @} */ + +/** Common header for all page types. The page type depends on #mp_flags. + * + * #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with + * sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages + * omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header. + * + * #P_OVERFLOW records occupy one or more contiguous pages where only the + * first has a page header. They hold the real data of #F_BIGDATA nodes. + * + * #P_SUBP sub-pages are small leaf "pages" with duplicate data. + * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page. + * (Duplicate data can also go in sub-databases, which use normal pages.) + * + * #P_META pages contain #MDB_meta, the start point of an LMDB snapshot. + * + * Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once + * in the snapshot: Either used by a database or listed in a freeDB record. + */ +typedef struct MDB_page { +#define mp_pgno mp_p.p_pgno +#define mp_next mp_p.p_next + union { + pgno_t p_pgno; /**< page number */ + struct MDB_page *p_next; /**< for in-memory list of freed pages */ + } mp_p; + uint16_t mp_pad; /**< key size if this is a LEAF2 page */ +/** @defgroup mdb_page Page Flags + * @ingroup internal + * Flags for the page headers. + * @{ + */ +#define P_BRANCH 0x01 /**< branch page */ +#define P_LEAF 0x02 /**< leaf page */ +#define P_OVERFLOW 0x04 /**< overflow page */ +#define P_META 0x08 /**< meta page */ +#define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */ +#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */ +#define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */ +#define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */ +#define P_KEEP 0x8000 /**< leave this page alone during spill */ +/** @} */ + uint16_t mp_flags; /**< @ref mdb_page */ +#define mp_lower mp_pb.pb.pb_lower +#define mp_upper mp_pb.pb.pb_upper +#define mp_pages mp_pb.pb_pages + union { + struct { + indx_t pb_lower; /**< lower bound of free space */ + indx_t pb_upper; /**< upper bound of free space */ + } pb; + uint32_t pb_pages; /**< number of overflow pages */ + } mp_pb; + indx_t mp_ptrs[1]; /**< dynamic size */ +} MDB_page; + + /** Size of the page header, excluding dynamic data at the end */ +#define PAGEHDRSZ ((unsigned) offsetof(MDB_page, mp_ptrs)) + + /** Address of first usable data byte in a page, after the header */ +#define METADATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) + + /** ITS#7713, change PAGEBASE to handle 65536 byte pages */ +#define PAGEBASE ((MDB_DEVEL) ? PAGEHDRSZ : 0) + + /** Number of nodes on a page */ +#define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ-PAGEBASE)) >> 1) + + /** The amount of space remaining in the page */ +#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower) + + /** The percentage of space used in the page, in tenths of a percent. */ +#define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \ + ((env)->me_psize - PAGEHDRSZ)) + /** The minimum page fill factor, in tenths of a percent. + * Pages emptier than this are candidates for merging. + */ +#define FILL_THRESHOLD 250 + + /** Test if a page is a leaf page */ +#define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF) + /** Test if a page is a LEAF2 page */ +#define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2) + /** Test if a page is a branch page */ +#define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH) + /** Test if a page is an overflow page */ +#define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW) + /** Test if a page is a sub page */ +#define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP) + + /** The number of overflow pages needed to store the given size. */ +#define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1) + + /** Link in #MDB_txn.%mt_loose_pgs list. + * Kept outside the page header, which is needed when reusing the page. + */ +#define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2)) + + /** Header for a single key/data pair within a page. + * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2. + * We guarantee 2-byte alignment for 'MDB_node's. + * + * #mn_lo and #mn_hi are used for data size on leaf nodes, and for child + * pgno on branch nodes. On 64 bit platforms, #mn_flags is also used + * for pgno. (Branch nodes have no flags). Lo and hi are in host byte + * order in case some accesses can be optimized to 32-bit word access. + * + * Leaf node flags describe node contents. #F_BIGDATA says the node's + * data part is the page number of an overflow page with actual data. + * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in + * a sub-page/sub-database, and named databases (just #F_SUBDATA). + */ +typedef struct MDB_node { + /** part of data size or pgno + * @{ */ +#if BYTE_ORDER == LITTLE_ENDIAN + unsigned short mn_lo, mn_hi; +#else + unsigned short mn_hi, mn_lo; +#endif + /** @} */ +/** @defgroup mdb_node Node Flags + * @ingroup internal + * Flags for node headers. + * @{ + */ +#define F_BIGDATA 0x01 /**< data put on overflow page */ +#define F_SUBDATA 0x02 /**< data is a sub-database */ +#define F_DUPDATA 0x04 /**< data has duplicates */ + +/** valid flags for #mdb_node_add() */ +#define NODE_ADD_FLAGS (F_DUPDATA|F_SUBDATA|MDB_RESERVE|MDB_APPEND) + +/** @} */ + unsigned short mn_flags; /**< @ref mdb_node */ + unsigned short mn_ksize; /**< key size */ + char mn_data[1]; /**< key and data are appended here */ +} MDB_node; + + /** Size of the node header, excluding dynamic data at the end */ +#define NODESIZE offsetof(MDB_node, mn_data) + + /** Bit position of top word in page number, for shifting mn_flags */ +#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0) + + /** Size of a node in a branch page with a given key. + * This is just the node header plus the key, there is no data. + */ +#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size)) + + /** Size of a node in a leaf page with a given key and data. + * This is node header plus key plus data size. + */ +#define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size) + + /** Address of node \b i in page \b p */ +#define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE)) + + /** Address of the key for the node */ +#define NODEKEY(node) (void *)((node)->mn_data) + + /** Address of the data for a node */ +#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) + + /** Get the page number pointed to by a branch node */ +#define NODEPGNO(node) \ + ((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \ + (PGNO_TOPWORD ? ((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0)) + /** Set the page number in a branch node */ +#define SETPGNO(node,pgno) do { \ + (node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \ + if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0) + + /** Get the size of the data in a leaf node */ +#define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16)) + /** Set the size of the data for a leaf node */ +#define SETDSZ(node,size) do { \ + (node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0) + /** The size of a key in a node */ +#define NODEKSZ(node) ((node)->mn_ksize) + + /** Copy a page number from src to dst */ +#ifdef MISALIGNED_OK +#define COPY_PGNO(dst,src) dst = src +#else +#if SIZE_MAX > 4294967295UL +#define COPY_PGNO(dst,src) do { \ + unsigned short *s, *d; \ + s = (unsigned short *)&(src); \ + d = (unsigned short *)&(dst); \ + *d++ = *s++; \ + *d++ = *s++; \ + *d++ = *s++; \ + *d = *s; \ +} while (0) +#else +#define COPY_PGNO(dst,src) do { \ + unsigned short *s, *d; \ + s = (unsigned short *)&(src); \ + d = (unsigned short *)&(dst); \ + *d++ = *s++; \ + *d = *s; \ +} while (0) +#endif +#endif + /** The address of a key in a LEAF2 page. + * LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate sub-DBs. + * There are no node headers, keys are stored contiguously. + */ +#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i)*(ks))) + + /** Set the \b node's key into \b keyptr, if requested. */ +#define MDB_GET_KEY(node, keyptr) { if ((keyptr) != NULL) { \ + (keyptr)->mv_size = NODEKSZ(node); (keyptr)->mv_data = NODEKEY(node); } } + + /** Set the \b node's key into \b key. */ +#define MDB_GET_KEY2(node, key) { key.mv_size = NODEKSZ(node); key.mv_data = NODEKEY(node); } + + /** Information about a single database in the environment. */ +typedef struct MDB_db { + uint32_t md_pad; /**< also ksize for LEAF2 pages */ + uint16_t md_flags; /**< @ref mdb_dbi_open */ + uint16_t md_depth; /**< depth of this tree */ + pgno_t md_branch_pages; /**< number of internal pages */ + pgno_t md_leaf_pages; /**< number of leaf pages */ + pgno_t md_overflow_pages; /**< number of overflow pages */ + size_t md_entries; /**< number of data items */ + pgno_t md_root; /**< the root page of this tree */ +} MDB_db; + +#define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ +#define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) + /** #mdb_dbi_open() flags */ +#define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\ + MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE) + + /** Handle for the DB used to track free pages. */ +#define FREE_DBI 0 + /** Handle for the default DB. */ +#define MAIN_DBI 1 + /** Number of DBs in metapage (free and main) - also hardcoded elsewhere */ +#define CORE_DBS 2 + + /** Number of meta pages - also hardcoded elsewhere */ +#define NUM_METAS 2 + + /** Meta page content. + * A meta page is the start point for accessing a database snapshot. + * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2). + */ +typedef struct MDB_meta { + /** Stamp identifying this as an LMDB file. It must be set + * to #MDB_MAGIC. */ + uint32_t mm_magic; + /** Version number of this file. Must be set to #MDB_DATA_VERSION. */ + uint32_t mm_version; + void *mm_address; /**< address for fixed mapping */ + size_t mm_mapsize; /**< size of mmap region */ + MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */ + /** The size of pages used in this DB */ +#define mm_psize mm_dbs[FREE_DBI].md_pad + /** Any persistent environment flags. @ref mdb_env */ +#define mm_flags mm_dbs[FREE_DBI].md_flags + /** Last used page in the datafile. + * Actually the file may be shorter if the freeDB lists the final pages. + */ + pgno_t mm_last_pg; + volatile txnid_t mm_txnid; /**< txnid that committed this page */ +} MDB_meta; + + /** Buffer for a stack-allocated meta page. + * The members define size and alignment, and silence type + * aliasing warnings. They are not used directly; that could + * mean incorrectly using several union members in parallel. + */ +typedef union MDB_metabuf { + MDB_page mb_page; + struct { + char mm_pad[PAGEHDRSZ]; + MDB_meta mm_meta; + } mb_metabuf; +} MDB_metabuf; + + /** Auxiliary DB info. + * The information here is mostly static/read-only. There is + * only a single copy of this record in the environment. + */ +typedef struct MDB_dbx { + MDB_val md_name; /**< name of the database */ + MDB_cmp_func *md_cmp; /**< function for comparing keys */ + MDB_cmp_func *md_dcmp; /**< function for comparing data items */ + MDB_rel_func *md_rel; /**< user relocate function */ + void *md_relctx; /**< user-provided context for md_rel */ +} MDB_dbx; + + /** A database transaction. + * Every operation requires a transaction handle. + */ +struct MDB_txn { + MDB_txn *mt_parent; /**< parent of a nested txn */ + /** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */ + MDB_txn *mt_child; + pgno_t mt_next_pgno; /**< next unallocated page */ + /** The ID of this transaction. IDs are integers incrementing from 1. + * Only committed write transactions increment the ID. If a transaction + * aborts, the ID may be re-used by the next writer. + */ + txnid_t mt_txnid; + MDB_env *mt_env; /**< the DB environment */ + /** The list of pages that became unused during this transaction. + */ + MDB_IDL mt_free_pgs; + /** The list of loose pages that became unused and may be reused + * in this transaction, linked through #NEXT_LOOSE_PAGE(page). + */ + MDB_page *mt_loose_pgs; + /** Number of loose pages (#mt_loose_pgs) */ + int mt_loose_count; + /** The sorted list of dirty pages we temporarily wrote to disk + * because the dirty list was full. page numbers in here are + * shifted left by 1, deleted slots have the LSB set. + */ + MDB_IDL mt_spill_pgs; + union { + /** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */ + MDB_ID2L dirty_list; + /** For read txns: This thread/txn's reader table slot, or NULL. */ + MDB_reader *reader; + } mt_u; + /** Array of records for each DB known in the environment. */ + MDB_dbx *mt_dbxs; + /** Array of MDB_db records for each known DB */ + MDB_db *mt_dbs; + /** Array of sequence numbers for each DB handle */ + unsigned int *mt_dbiseqs; +/** @defgroup mt_dbflag Transaction DB Flags + * @ingroup internal + * @{ + */ +#define DB_DIRTY 0x01 /**< DB was written in this txn */ +#define DB_STALE 0x02 /**< Named-DB record is older than txnID */ +#define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ +#define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ +#define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */ +#define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */ +/** @} */ + /** In write txns, array of cursors for each DB */ + MDB_cursor **mt_cursors; + /** Array of flags for each DB */ + unsigned char *mt_dbflags; + /** Number of DB records in use, or 0 when the txn is finished. + * This number only ever increments until the txn finishes; we + * don't decrement it when individual DB handles are closed. + */ + MDB_dbi mt_numdbs; + +/** @defgroup mdb_txn Transaction Flags + * @ingroup internal + * @{ + */ + /** #mdb_txn_begin() flags */ +#define MDB_TXN_BEGIN_FLAGS MDB_RDONLY +#define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */ + /* internal txn flags */ +#define MDB_TXN_WRITEMAP MDB_WRITEMAP /**< copy of #MDB_env flag in writers */ +#define MDB_TXN_FINISHED 0x01 /**< txn is finished or never began */ +#define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */ +#define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ +#define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ +#define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */ + /** most operations on the txn are currently illegal */ +#define MDB_TXN_BLOCKED (MDB_TXN_FINISHED|MDB_TXN_ERROR|MDB_TXN_HAS_CHILD) +/** @} */ + unsigned int mt_flags; /**< @ref mdb_txn */ + /** #dirty_list room: Array size - \#dirty pages visible to this txn. + * Includes ancestor txns' dirty pages not hidden by other txns' + * dirty/spilled pages. Thus commit(nested txn) has room to merge + * dirty_list into mt_parent after freeing hidden mt_parent pages. + */ + unsigned int mt_dirty_room; +}; + +/** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. + * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to + * raise this on a 64 bit machine. + */ +#define CURSOR_STACK 32 + +struct MDB_xcursor; + + /** Cursors are used for all DB operations. + * A cursor holds a path of (page pointer, key index) from the DB + * root to a position in the DB, plus other state. #MDB_DUPSORT + * cursors include an xcursor to the current data item. Write txns + * track their cursors and keep them up to date when data moves. + * Exception: An xcursor's pointer to a #P_SUBP page can be stale. + * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage). + */ +struct MDB_cursor { + /** Next cursor on this DB in this txn */ + MDB_cursor *mc_next; + /** Backup of the original cursor if this cursor is a shadow */ + MDB_cursor *mc_backup; + /** Context used for databases with #MDB_DUPSORT, otherwise NULL */ + struct MDB_xcursor *mc_xcursor; + /** The transaction that owns this cursor */ + MDB_txn *mc_txn; + /** The database handle this cursor operates on */ + MDB_dbi mc_dbi; + /** The database record for this cursor */ + MDB_db *mc_db; + /** The database auxiliary record for this cursor */ + MDB_dbx *mc_dbx; + /** The @ref mt_dbflag for this database */ + unsigned char *mc_dbflag; + unsigned short mc_snum; /**< number of pushed pages */ + unsigned short mc_top; /**< index of top page, normally mc_snum-1 */ +/** @defgroup mdb_cursor Cursor Flags + * @ingroup internal + * Cursor state flags. + * @{ + */ +#define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */ +#define C_EOF 0x02 /**< No more data */ +#define C_SUB 0x04 /**< Cursor is a sub-cursor */ +#define C_DEL 0x08 /**< last op was a cursor_del */ +#define C_UNTRACK 0x40 /**< Un-track cursor when closing */ +/** @} */ + unsigned int mc_flags; /**< @ref mdb_cursor */ + MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */ + indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */ +}; + + /** Context for sorted-dup records. + * We could have gone to a fully recursive design, with arbitrarily + * deep nesting of sub-databases. But for now we only handle these + * levels - main DB, optional sub-DB, sorted-duplicate DB. + */ +typedef struct MDB_xcursor { + /** A sub-cursor for traversing the Dup DB */ + MDB_cursor mx_cursor; + /** The database record for this Dup DB */ + MDB_db mx_db; + /** The auxiliary DB record for this Dup DB */ + MDB_dbx mx_dbx; + /** The @ref mt_dbflag for this Dup DB */ + unsigned char mx_dbflag; +} MDB_xcursor; + + /** Check if there is an inited xcursor, so #XCURSOR_REFRESH() is proper */ +#define XCURSOR_INITED(mc) \ + ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + + /** Update sub-page pointer, if any, in \b mc->mc_xcursor. Needed + * when the node which contains the sub-page may have moved. Called + * with \b mp = mc->mc_pg[mc->mc_top], \b ki = mc->mc_ki[mc->mc_top]. + */ +#define XCURSOR_REFRESH(mc, mp, ki) do { \ + MDB_page *xr_pg = (mp); \ + MDB_node *xr_node = NODEPTR(xr_pg, ki); \ + if ((xr_node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) \ + (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ +} while (0) + + /** State of FreeDB old pages, stored in the MDB_env */ +typedef struct MDB_pgstate { + pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ + txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */ +} MDB_pgstate; + + /** The database environment. */ +struct MDB_env { + HANDLE me_fd; /**< The main data file */ + HANDLE me_lfd; /**< The lock file */ + HANDLE me_mfd; /**< For writing and syncing the meta pages */ + /** Failed to update the meta page. Probably an I/O error. */ +#define MDB_FATAL_ERROR 0x80000000U + /** Some fields are initialized. */ +#define MDB_ENV_ACTIVE 0x20000000U + /** me_txkey is set */ +#define MDB_ENV_TXKEY 0x10000000U + /** fdatasync is unreliable */ +#define MDB_FSYNCONLY 0x08000000U + uint32_t me_flags; /**< @ref mdb_env */ + unsigned int me_psize; /**< DB page size, inited from me_os_psize */ + unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */ + unsigned int me_maxreaders; /**< size of the reader table */ + /** Max #MDB_txninfo.%mti_numreaders of interest to #mdb_env_close() */ + volatile int me_close_readers; + MDB_dbi me_numdbs; /**< number of DBs opened */ + MDB_dbi me_maxdbs; /**< size of the DB table */ + MDB_PID_T me_pid; /**< process ID of this env */ + char *me_path; /**< path to the DB files */ + char *me_map; /**< the memory map of the data file */ + MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */ + MDB_meta *me_metas[NUM_METAS]; /**< pointers to the two meta pages */ + void *me_pbuf; /**< scratch area for DUPSORT put() */ + MDB_txn *me_txn; /**< current write transaction */ + MDB_txn *me_txn0; /**< prealloc'd write transaction */ + size_t me_mapsize; /**< size of the data memory map */ + off_t me_size; /**< current file size */ + pgno_t me_maxpg; /**< me_mapsize / me_psize */ + MDB_dbx *me_dbxs; /**< array of static DB info */ + uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ + unsigned int *me_dbiseqs; /**< array of dbi sequence numbers */ + pthread_key_t me_txkey; /**< thread-key for readers */ + txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */ + MDB_pgstate me_pgstate; /**< state of old pages from freeDB */ +# define me_pglast me_pgstate.mf_pglast +# define me_pghead me_pgstate.mf_pghead + MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ + /** IDL of pages that became unused in a write txn */ + MDB_IDL me_free_pgs; + /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */ + MDB_ID2L me_dirty_list; + /** Max number of freelist items that can fit in a single overflow page */ + int me_maxfree_1pg; + /** Max size of a node on a page */ + unsigned int me_nodemax; +#if !(MDB_MAXKEYSIZE) + unsigned int me_maxkey; /**< max size of a key */ +#endif + int me_live_reader; /**< have liveness lock in reader table */ +#ifdef _WIN32 + int me_pidquery; /**< Used in OpenProcess */ +#endif +#ifdef MDB_USE_POSIX_MUTEX /* Posix mutexes reside in shared mem */ +# define me_rmutex me_txns->mti_rmutex /**< Shared reader lock */ +# define me_wmutex me_txns->mti_wmutex /**< Shared writer lock */ +#else + mdb_mutex_t me_rmutex; + mdb_mutex_t me_wmutex; +#endif + void *me_userctx; /**< User-settable context */ + MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ +}; + + /** Nested transaction */ +typedef struct MDB_ntxn { + MDB_txn mnt_txn; /**< the transaction */ + MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */ +} MDB_ntxn; + + /** max number of pages to commit in one writev() call */ +#define MDB_COMMIT_PAGES 64 +#if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES +#undef MDB_COMMIT_PAGES +#define MDB_COMMIT_PAGES IOV_MAX +#endif + + /** max bytes to write in one call */ +#define MAX_WRITE (0x40000000U >> (sizeof(ssize_t) == 4)) + + /** Check \b txn and \b dbi arguments to a function */ +#define TXN_DBI_EXIST(txn, dbi, validity) \ + ((txn) && (dbi)<(txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity))) + + /** Check for misused \b dbi handles */ +#define TXN_DBI_CHANGED(txn, dbi) \ + ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) + +static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp); +static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp); +static int mdb_page_touch(MDB_cursor *mc); + +#define MDB_END_NAMES {"committed", "empty-commit", "abort", "reset", \ + "reset-tmp", "fail-begin", "fail-beginchild"} +enum { + /* mdb_txn_end operation number, for logging */ + MDB_END_COMMITTED, MDB_END_EMPTY_COMMIT, MDB_END_ABORT, MDB_END_RESET, + MDB_END_RESET_TMP, MDB_END_FAIL_BEGIN, MDB_END_FAIL_BEGINCHILD +}; +#define MDB_END_OPMASK 0x0F /**< mask for #mdb_txn_end() operation number */ +#define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */ +#define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */ +#define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */ +static void mdb_txn_end(MDB_txn *txn, unsigned mode); + +static int mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **mp, int *lvl); +static int mdb_page_search_root(MDB_cursor *mc, + MDB_val *key, int modify); +#define MDB_PS_MODIFY 1 +#define MDB_PS_ROOTONLY 2 +#define MDB_PS_FIRST 4 +#define MDB_PS_LAST 8 +static int mdb_page_search(MDB_cursor *mc, + MDB_val *key, int flags); +static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); + +#define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */ +static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, + pgno_t newpgno, unsigned int nflags); + +static int mdb_env_read_header(MDB_env *env, MDB_meta *meta); +static MDB_meta *mdb_env_pick_meta(const MDB_env *env); +static int mdb_env_write_meta(MDB_txn *txn); +#ifdef MDB_USE_POSIX_MUTEX /* Drop unused excl arg */ +# define mdb_env_close0(env, excl) mdb_env_close1(env) +#endif +static void mdb_env_close0(MDB_env *env, int excl); + +static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); +static int mdb_node_add(MDB_cursor *mc, indx_t indx, + MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags); +static void mdb_node_del(MDB_cursor *mc, int ksize); +static void mdb_node_shrink(MDB_page *mp, indx_t indx); +static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft); +static int mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data); +static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); +static size_t mdb_branch_size(MDB_env *env, MDB_val *key); + +static int mdb_rebalance(MDB_cursor *mc); +static int mdb_update_key(MDB_cursor *mc, MDB_val *key); + +static void mdb_cursor_pop(MDB_cursor *mc); +static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp); + +static int mdb_cursor_del0(MDB_cursor *mc); +static int mdb_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags); +static int mdb_cursor_sibling(MDB_cursor *mc, int move_right); +static int mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); +static int mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); +static int mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op, + int *exactp); +static int mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data); +static int mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data); + +static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx); +static void mdb_xcursor_init0(MDB_cursor *mc); +static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node); +static void mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int force); + +static int mdb_drop0(MDB_cursor *mc, int subs); +static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi); +static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead); + +/** @cond */ +static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long; +/** @endcond */ + +/** Compare two items pointing at size_t's of unknown alignment. */ +#ifdef MISALIGNED_OK +# define mdb_cmp_clong mdb_cmp_long +#else +# define mdb_cmp_clong mdb_cmp_cint +#endif + +#ifdef _WIN32 +static SECURITY_DESCRIPTOR mdb_null_sd; +static SECURITY_ATTRIBUTES mdb_all_sa; +static int mdb_sec_inited; + +struct MDB_name; +static int utf8_to_utf16(const char *src, struct MDB_name *dst, int xtra); +#endif + +/** Return the library version info. */ +char * ESECT +mdb_version(int *major, int *minor, int *patch) +{ + if (major) *major = MDB_VERSION_MAJOR; + if (minor) *minor = MDB_VERSION_MINOR; + if (patch) *patch = MDB_VERSION_PATCH; + return MDB_VERSION_STRING; +} + +/** Table of descriptions for LMDB @ref errors */ +static char *const mdb_errstr[] = { + "MDB_KEYEXIST: Key/data pair already exists", + "MDB_NOTFOUND: No matching key/data pair found", + "MDB_PAGE_NOTFOUND: Requested page not found", + "MDB_CORRUPTED: Located page was wrong type", + "MDB_PANIC: Update of meta page failed or environment had fatal error", + "MDB_VERSION_MISMATCH: Database environment version mismatch", + "MDB_INVALID: File is not an LMDB file", + "MDB_MAP_FULL: Environment mapsize limit reached", + "MDB_DBS_FULL: Environment maxdbs limit reached", + "MDB_READERS_FULL: Environment maxreaders limit reached", + "MDB_TLS_FULL: Thread-local storage keys full - too many environments open", + "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too big", + "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", + "MDB_PAGE_FULL: Internal error - page has no more space", + "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", + "MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed", + "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", + "MDB_BAD_TXN: Transaction must abort, has a child, or is invalid", + "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size", + "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", +}; + +char * +mdb_strerror(int err) +{ +#ifdef _WIN32 + /** HACK: pad 4KB on stack over the buf. Return system msgs in buf. + * This works as long as no function between the call to mdb_strerror + * and the actual use of the message uses more than 4K of stack. + */ +#define MSGSIZE 1024 +#define PADSIZE 4096 + char buf[MSGSIZE+PADSIZE], *ptr = buf; +#endif + int i; + if (!err) + return ("Successful return: 0"); + + if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) { + i = err - MDB_KEYEXIST; + return mdb_errstr[i]; + } + +#ifdef _WIN32 + /* These are the C-runtime error codes we use. The comment indicates + * their numeric value, and the Win32 error they would correspond to + * if the error actually came from a Win32 API. A major mess, we should + * have used LMDB-specific error codes for everything. + */ + switch(err) { + case ENOENT: /* 2, FILE_NOT_FOUND */ + case EIO: /* 5, ACCESS_DENIED */ + case ENOMEM: /* 12, INVALID_ACCESS */ + case EACCES: /* 13, INVALID_DATA */ + case EBUSY: /* 16, CURRENT_DIRECTORY */ + case EINVAL: /* 22, BAD_COMMAND */ + case ENOSPC: /* 28, OUT_OF_PAPER */ + return strerror(err); + default: + ; + } + buf[0] = 0; + FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, 0, ptr, MSGSIZE, (va_list *)buf+MSGSIZE); + return ptr; +#else + return strerror(err); +#endif +} + +/** assert(3) variant in cursor context */ +#define mdb_cassert(mc, expr) mdb_assert0((mc)->mc_txn->mt_env, expr, #expr) +/** assert(3) variant in transaction context */ +#define mdb_tassert(txn, expr) mdb_assert0((txn)->mt_env, expr, #expr) +/** assert(3) variant in environment context */ +#define mdb_eassert(env, expr) mdb_assert0(env, expr, #expr) + +#ifndef NDEBUG +# define mdb_assert0(env, expr, expr_txt) ((expr) ? (void)0 : \ + mdb_assert_fail(env, expr_txt, mdb_func_, __FILE__, __LINE__)) + +static void ESECT +mdb_assert_fail(MDB_env *env, const char *expr_txt, + const char *func, const char *file, int line) +{ + char buf[400]; + sprintf(buf, "%.100s:%d: Assertion '%.200s' failed in %.40s()", + file, line, expr_txt, func); + if (env->me_assert_func) + env->me_assert_func(env, buf); + fprintf(stderr, "%s\n", buf); + abort(); +} +#else +# define mdb_assert0(env, expr, expr_txt) ((void) 0) +#endif /* NDEBUG */ + +#if MDB_DEBUG +/** Return the page number of \b mp which may be sub-page, for debug output */ +static pgno_t +mdb_dbg_pgno(MDB_page *mp) +{ + pgno_t ret; + COPY_PGNO(ret, mp->mp_pgno); + return ret; +} + +/** Display a key in hexadecimal and return the address of the result. + * @param[in] key the key to display + * @param[in] buf the buffer to write into. Should always be #DKBUF. + * @return The key in hexadecimal form. + */ +char * +mdb_dkey(MDB_val *key, char *buf) +{ + char *ptr = buf; + unsigned char *c = key->mv_data; + unsigned int i; + + if (!key) + return ""; + + if (key->mv_size > DKBUF_MAXKEYSIZE) + return "MDB_MAXKEYSIZE"; + /* may want to make this a dynamic check: if the key is mostly + * printable characters, print it as-is instead of converting to hex. + */ +#if 1 + buf[0] = '\0'; + for (i=0; i<key->mv_size; i++) + ptr += sprintf(ptr, "%02x", *c++); +#else + sprintf(buf, "%.*s", key->mv_size, key->mv_data); +#endif + return buf; +} + +static const char * +mdb_leafnode_type(MDB_node *n) +{ + static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}}; + return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" : + tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)]; +} + +/** Display all the keys in the page. */ +void +mdb_page_list(MDB_page *mp) +{ + pgno_t pgno = mdb_dbg_pgno(mp); + const char *type, *state = (mp->mp_flags & P_DIRTY) ? ", dirty" : ""; + MDB_node *node; + unsigned int i, nkeys, nsize, total = 0; + MDB_val key; + DKBUF; + + switch (mp->mp_flags & (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP)) { + case P_BRANCH: type = "Branch page"; break; + case P_LEAF: type = "Leaf page"; break; + case P_LEAF|P_SUBP: type = "Sub-page"; break; + case P_LEAF|P_LEAF2: type = "LEAF2 page"; break; + case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break; + case P_OVERFLOW: + fprintf(stderr, "Overflow page %"Z"u pages %u%s\n", + pgno, mp->mp_pages, state); + return; + case P_META: + fprintf(stderr, "Meta-page %"Z"u txnid %"Z"u\n", + pgno, ((MDB_meta *)METADATA(mp))->mm_txnid); + return; + default: + fprintf(stderr, "Bad page %"Z"u flags 0x%X\n", pgno, mp->mp_flags); + return; + } + + nkeys = NUMKEYS(mp); + fprintf(stderr, "%s %"Z"u numkeys %d%s\n", type, pgno, nkeys, state); + + for (i=0; i<nkeys; i++) { + if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ + key.mv_size = nsize = mp->mp_pad; + key.mv_data = LEAF2KEY(mp, i, nsize); + total += nsize; + fprintf(stderr, "key %d: nsize %d, %s\n", i, nsize, DKEY(&key)); + continue; + } + node = NODEPTR(mp, i); + key.mv_size = node->mn_ksize; + key.mv_data = node->mn_data; + nsize = NODESIZE + key.mv_size; + if (IS_BRANCH(mp)) { + fprintf(stderr, "key %d: page %"Z"u, %s\n", i, NODEPGNO(node), + DKEY(&key)); + total += nsize; + } else { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + nsize += sizeof(pgno_t); + else + nsize += NODEDSZ(node); + total += nsize; + nsize += sizeof(indx_t); + fprintf(stderr, "key %d: nsize %d, %s%s\n", + i, nsize, DKEY(&key), mdb_leafnode_type(node)); + } + total = EVEN(total); + } + fprintf(stderr, "Total: header %d + contents %d + unused %d\n", + IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, SIZELEFT(mp)); +} + +void +mdb_cursor_chk(MDB_cursor *mc) +{ + unsigned int i; + MDB_node *node; + MDB_page *mp; + + if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return; + for (i=0; i<mc->mc_top; i++) { + mp = mc->mc_pg[i]; + node = NODEPTR(mp, mc->mc_ki[i]); + if (NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno) + printf("oops!\n"); + } + if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i])) + printf("ack!\n"); + if (XCURSOR_INITED(mc)) { + node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) && + mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { + printf("blah!\n"); + } + } +} +#endif + +#if (MDB_DEBUG) > 2 +/** Count all the pages in each DB and in the freelist + * and make sure it matches the actual number of pages + * being used. + * All named DBs must be open for a correct count. + */ +static void mdb_audit(MDB_txn *txn) +{ + MDB_cursor mc; + MDB_val key, data; + MDB_ID freecount, count; + MDB_dbi i; + int rc; + + freecount = 0; + mdb_cursor_init(&mc, txn, FREE_DBI, NULL); + while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) + freecount += *(MDB_ID *)data.mv_data; + mdb_tassert(txn, rc == MDB_NOTFOUND); + + count = 0; + for (i = 0; i<txn->mt_numdbs; i++) { + MDB_xcursor mx; + if (!(txn->mt_dbflags[i] & DB_VALID)) + continue; + mdb_cursor_init(&mc, txn, i, &mx); + if (txn->mt_dbs[i].md_root == P_INVALID) + continue; + count += txn->mt_dbs[i].md_branch_pages + + txn->mt_dbs[i].md_leaf_pages + + txn->mt_dbs[i].md_overflow_pages; + if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) { + rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST); + for (; rc == MDB_SUCCESS; rc = mdb_cursor_sibling(&mc, 1)) { + unsigned j; + MDB_page *mp; + mp = mc.mc_pg[mc.mc_top]; + for (j=0; j<NUMKEYS(mp); j++) { + MDB_node *leaf = NODEPTR(mp, j); + if (leaf->mn_flags & F_SUBDATA) { + MDB_db db; + memcpy(&db, NODEDATA(leaf), sizeof(db)); + count += db.md_branch_pages + db.md_leaf_pages + + db.md_overflow_pages; + } + } + } + mdb_tassert(txn, rc == MDB_NOTFOUND); + } + } + if (freecount + count + NUM_METAS != txn->mt_next_pgno) { + fprintf(stderr, "audit: %"Z"u freecount: %"Z"u count: %"Z"u total: %"Z"u next_pgno: %"Z"u\n", + txn->mt_txnid, freecount, count+NUM_METAS, + freecount+count+NUM_METAS, txn->mt_next_pgno); + } +} +#endif + +int +mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) +{ + return txn->mt_dbxs[dbi].md_cmp(a, b); +} + +int +mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) +{ + MDB_cmp_func *dcmp = txn->mt_dbxs[dbi].md_dcmp; +#if UINT_MAX < SIZE_MAX + if (dcmp == mdb_cmp_int && a->mv_size == sizeof(size_t)) + dcmp = mdb_cmp_clong; +#endif + return dcmp(a, b); +} + +/** Allocate memory for a page. + * Re-use old malloc'd pages first for singletons, otherwise just malloc. + * Set #MDB_TXN_ERROR on failure. + */ +static MDB_page * +mdb_page_malloc(MDB_txn *txn, unsigned num) +{ + MDB_env *env = txn->mt_env; + MDB_page *ret = env->me_dpages; + size_t psize = env->me_psize, sz = psize, off; + /* For ! #MDB_NOMEMINIT, psize counts how much to init. + * For a single page alloc, we init everything after the page header. + * For multi-page, we init the final page; if the caller needed that + * many pages they will be filling in at least up to the last page. + */ + if (num == 1) { + if (ret) { + VGMEMP_ALLOC(env, ret, sz); + VGMEMP_DEFINED(ret, sizeof(ret->mp_next)); + env->me_dpages = ret->mp_next; + return ret; + } + psize -= off = PAGEHDRSZ; + } else { + sz *= num; + off = sz - psize; + } + if ((ret = malloc(sz)) != NULL) { + VGMEMP_ALLOC(env, ret, sz); + if (!(env->me_flags & MDB_NOMEMINIT)) { + memset((char *)ret + off, 0, psize); + ret->mp_pad = 0; + } + } else { + txn->mt_flags |= MDB_TXN_ERROR; + } + return ret; +} +/** Free a single page. + * Saves single pages to a list, for future reuse. + * (This is not used for multi-page overflow pages.) + */ +static void +mdb_page_free(MDB_env *env, MDB_page *mp) +{ + mp->mp_next = env->me_dpages; + VGMEMP_FREE(env, mp); + env->me_dpages = mp; +} + +/** Free a dirty page */ +static void +mdb_dpage_free(MDB_env *env, MDB_page *dp) +{ + if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { + mdb_page_free(env, dp); + } else { + /* large pages just get freed directly */ + VGMEMP_FREE(env, dp); + free(dp); + } +} + +/** Return all dirty pages to dpage list */ +static void +mdb_dlist_free(MDB_txn *txn) +{ + MDB_env *env = txn->mt_env; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned i, n = dl[0].mid; + + for (i = 1; i <= n; i++) { + mdb_dpage_free(env, dl[i].mptr); + } + dl[0].mid = 0; +} + +/** Loosen or free a single page. + * Saves single pages to a list for future reuse + * in this same txn. It has been pulled from the freeDB + * and already resides on the dirty list, but has been + * deleted. Use these pages first before pulling again + * from the freeDB. + * + * If the page wasn't dirtied in this txn, just add it + * to this txn's free list. + */ +static int +mdb_page_loose(MDB_cursor *mc, MDB_page *mp) +{ + int loose = 0; + pgno_t pgno = mp->mp_pgno; + MDB_txn *txn = mc->mc_txn; + + if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) { + if (txn->mt_parent) { + MDB_ID2 *dl = txn->mt_u.dirty_list; + /* If txn has a parent, make sure the page is in our + * dirty list. + */ + if (dl[0].mid) { + unsigned x = mdb_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + if (mp != dl[x].mptr) { /* bad cursor? */ + mc->mc_flags &= ~(C_INITIALIZED|C_EOF); + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CORRUPTED; + } + /* ok, it's ours */ + loose = 1; + } + } + } else { + /* no parent txn, so it's just ours */ + loose = 1; + } + } + if (loose) { + DPRINTF(("loosen db %d page %"Z"u", DDBI(mc), + mp->mp_pgno)); + NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs; + txn->mt_loose_pgs = mp; + txn->mt_loose_count++; + mp->mp_flags |= P_LOOSE; + } else { + int rc = mdb_midl_append(&txn->mt_free_pgs, pgno); + if (rc) + return rc; + } + + return MDB_SUCCESS; +} + +/** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. + * @param[in] mc A cursor handle for the current operation. + * @param[in] pflags Flags of the pages to update: + * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. + * @param[in] all No shortcuts. Needed except after a full #mdb_page_flush(). + * @return 0 on success, non-zero on failure. + */ +static int +mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) +{ + enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP }; + MDB_txn *txn = mc->mc_txn; + MDB_cursor *m3, *m0 = mc; + MDB_xcursor *mx; + MDB_page *dp, *mp; + MDB_node *leaf; + unsigned i, j; + int rc = MDB_SUCCESS, level; + + /* Mark pages seen by cursors */ + if (mc->mc_flags & C_UNTRACK) + mc = NULL; /* will find mc in mt_cursors */ + for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) { + for (; mc; mc=mc->mc_next) { + if (!(mc->mc_flags & C_INITIALIZED)) + continue; + for (m3 = mc;; m3 = &mx->mx_cursor) { + mp = NULL; + for (j=0; j<m3->mc_snum; j++) { + mp = m3->mc_pg[j]; + if ((mp->mp_flags & Mask) == pflags) + mp->mp_flags ^= P_KEEP; + } + mx = m3->mc_xcursor; + /* Proceed to mx if it is at a sub-database */ + if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) + break; + if (! (mp && (mp->mp_flags & P_LEAF))) + break; + leaf = NODEPTR(mp, m3->mc_ki[j-1]); + if (!(leaf->mn_flags & F_SUBDATA)) + break; + } + } + if (i == 0) + break; + } + + if (all) { + /* Mark dirty root pages */ + for (i=0; i<txn->mt_numdbs; i++) { + if (txn->mt_dbflags[i] & DB_DIRTY) { + pgno_t pgno = txn->mt_dbs[i].md_root; + if (pgno == P_INVALID) + continue; + if ((rc = mdb_page_get(m0, pgno, &dp, &level)) != MDB_SUCCESS) + break; + if ((dp->mp_flags & Mask) == pflags && level <= 1) + dp->mp_flags ^= P_KEEP; + } + } + } + + return rc; +} + +static int mdb_page_flush(MDB_txn *txn, int keep); + +/** Spill pages from the dirty list back to disk. + * This is intended to prevent running into #MDB_TXN_FULL situations, + * but note that they may still occur in a few cases: + * 1) our estimate of the txn size could be too small. Currently this + * seems unlikely, except with a large number of #MDB_MULTIPLE items. + * 2) child txns may run out of space if their parents dirtied a + * lot of pages and never spilled them. TODO: we probably should do + * a preemptive spill during #mdb_txn_begin() of a child txn, if + * the parent's dirty_room is below a given threshold. + * + * Otherwise, if not using nested txns, it is expected that apps will + * not run into #MDB_TXN_FULL any more. The pages are flushed to disk + * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared. + * If the txn never references them again, they can be left alone. + * If the txn only reads them, they can be used without any fuss. + * If the txn writes them again, they can be dirtied immediately without + * going thru all of the work of #mdb_page_touch(). Such references are + * handled by #mdb_page_unspill(). + * + * Also note, we never spill DB root pages, nor pages of active cursors, + * because we'll need these back again soon anyway. And in nested txns, + * we can't spill a page in a child txn if it was already spilled in a + * parent txn. That would alter the parent txns' data even though + * the child hasn't committed yet, and we'd have no way to undo it if + * the child aborted. + * + * @param[in] m0 cursor A cursor handle identifying the transaction and + * database for which we are checking space. + * @param[in] key For a put operation, the key being stored. + * @param[in] data For a put operation, the data being stored. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) +{ + MDB_txn *txn = m0->mc_txn; + MDB_page *dp; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned int i, j, need; + int rc; + + if (m0->mc_flags & C_SUB) + return MDB_SUCCESS; + + /* Estimate how much space this op will take */ + i = m0->mc_db->md_depth; + /* Named DBs also dirty the main DB */ + if (m0->mc_dbi >= CORE_DBS) + i += txn->mt_dbs[MAIN_DBI].md_depth; + /* For puts, roughly factor in the key+data size */ + if (key) + i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize; + i += i; /* double it for good measure */ + need = i; + + if (txn->mt_dirty_room > i) + return MDB_SUCCESS; + + if (!txn->mt_spill_pgs) { + txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX); + if (!txn->mt_spill_pgs) + return ENOMEM; + } else { + /* purge deleted slots */ + MDB_IDL sl = txn->mt_spill_pgs; + unsigned int num = sl[0]; + j=0; + for (i=1; i<=num; i++) { + if (!(sl[i] & 1)) + sl[++j] = sl[i]; + } + sl[0] = j; + } + + /* Preserve pages which may soon be dirtied again */ + if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS) + goto done; + + /* Less aggressive spill - we originally spilled the entire dirty list, + * with a few exceptions for cursor pages and DB root pages. But this + * turns out to be a lot of wasted effort because in a large txn many + * of those pages will need to be used again. So now we spill only 1/8th + * of the dirty pages. Testing revealed this to be a good tradeoff, + * better than 1/2, 1/4, or 1/10. + */ + if (need < MDB_IDL_UM_MAX / 8) + need = MDB_IDL_UM_MAX / 8; + + /* Save the page IDs of all the pages we're flushing */ + /* flush from the tail forward, this saves a lot of shifting later on. */ + for (i=dl[0].mid; i && need; i--) { + MDB_ID pn = dl[i].mid << 1; + dp = dl[i].mptr; + if (dp->mp_flags & (P_LOOSE|P_KEEP)) + continue; + /* Can't spill twice, make sure it's not already in a parent's + * spill list. + */ + if (txn->mt_parent) { + MDB_txn *tx2; + for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { + if (tx2->mt_spill_pgs) { + j = mdb_midl_search(tx2->mt_spill_pgs, pn); + if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) { + dp->mp_flags |= P_KEEP; + break; + } + } + } + if (tx2) + continue; + } + if ((rc = mdb_midl_append(&txn->mt_spill_pgs, pn))) + goto done; + need--; + } + mdb_midl_sort(txn->mt_spill_pgs); + + /* Flush the spilled part of dirty list */ + if ((rc = mdb_page_flush(txn, i)) != MDB_SUCCESS) + goto done; + + /* Reset any dirty pages we kept that page_flush didn't see */ + rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i); + +done: + txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS; + return rc; +} + +/** Find oldest txnid still referenced. Expects txn->mt_txnid > 0. */ +static txnid_t +mdb_find_oldest(MDB_txn *txn) +{ + int i; + txnid_t mr, oldest = txn->mt_txnid - 1; + if (txn->mt_env->me_txns) { + MDB_reader *r = txn->mt_env->me_txns->mti_readers; + for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { + if (r[i].mr_pid) { + mr = r[i].mr_txnid; + if (oldest > mr) + oldest = mr; + } + } + } + return oldest; +} + +/** Add a page to the txn's dirty list */ +static void +mdb_page_dirty(MDB_txn *txn, MDB_page *mp) +{ + MDB_ID2 mid; + int rc, (*insert)(MDB_ID2L, MDB_ID2 *); + + if (txn->mt_flags & MDB_TXN_WRITEMAP) { + insert = mdb_mid2l_append; + } else { + insert = mdb_mid2l_insert; + } + mid.mid = mp->mp_pgno; + mid.mptr = mp; + rc = insert(txn->mt_u.dirty_list, &mid); + mdb_tassert(txn, rc == 0); + txn->mt_dirty_room--; +} + +/** Allocate page numbers and memory for writing. Maintain me_pglast, + * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure. + * + * If there are free pages available from older transactions, they + * are re-used first. Otherwise allocate a new page at mt_next_pgno. + * Do not modify the freedB, just merge freeDB records into me_pghead[] + * and move me_pglast to say which records were consumed. Only this + * function can create me_pghead and move me_pglast/mt_next_pgno. + * @param[in] mc cursor A cursor handle identifying the transaction and + * database for which we are allocating. + * @param[in] num the number of pages to allocate. + * @param[out] mp Address of the allocated page(s). Requests for multiple pages + * will always be satisfied by a single contiguous chunk of memory. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) +{ +#ifdef MDB_PARANOID /* Seems like we can ignore this now */ + /* Get at most <Max_retries> more freeDB records once me_pghead + * has enough pages. If not enough, use new pages from the map. + * If <Paranoid> and mc is updating the freeDB, only get new + * records if me_pghead is empty. Then the freelist cannot play + * catch-up with itself by growing while trying to save it. + */ + enum { Paranoid = 1, Max_retries = 500 }; +#else + enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ }; +#endif + int rc, retry = num * 60; + MDB_txn *txn = mc->mc_txn; + MDB_env *env = txn->mt_env; + pgno_t pgno, *mop = env->me_pghead; + unsigned i, j, mop_len = mop ? mop[0] : 0, n2 = num-1; + MDB_page *np; + txnid_t oldest = 0, last; + MDB_cursor_op op; + MDB_cursor m2; + int found_old = 0; + + /* If there are any loose pages, just use them */ + if (num == 1 && txn->mt_loose_pgs) { + np = txn->mt_loose_pgs; + txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np); + txn->mt_loose_count--; + DPRINTF(("db %d use loose page %"Z"u", DDBI(mc), + np->mp_pgno)); + *mp = np; + return MDB_SUCCESS; + } + + *mp = NULL; + + /* If our dirty list is already full, we can't do anything */ + if (txn->mt_dirty_room == 0) { + rc = MDB_TXN_FULL; + goto fail; + } + + for (op = MDB_FIRST;; op = MDB_NEXT) { + MDB_val key, data; + MDB_node *leaf; + pgno_t *idl; + + /* Seek a big enough contiguous page range. Prefer + * pages at the tail, just truncating the list. + */ + if (mop_len > n2) { + i = mop_len; + do { + pgno = mop[i]; + if (mop[i-n2] == pgno+n2) + goto search_done; + } while (--i > n2); + if (--retry < 0) + break; + } + + if (op == MDB_FIRST) { /* 1st iteration */ + /* Prepare to fetch more and coalesce */ + last = env->me_pglast; + oldest = env->me_pgoldest; + mdb_cursor_init(&m2, txn, FREE_DBI, NULL); + if (last) { + op = MDB_SET_RANGE; + key.mv_data = &last; /* will look up last+1 */ + key.mv_size = sizeof(last); + } + if (Paranoid && mc->mc_dbi == FREE_DBI) + retry = -1; + } + if (Paranoid && retry < 0 && mop_len) + break; + + last++; + /* Do not fetch more if the record will be too recent */ + if (oldest <= last) { + if (!found_old) { + oldest = mdb_find_oldest(txn); + env->me_pgoldest = oldest; + found_old = 1; + } + if (oldest <= last) + break; + } + rc = mdb_cursor_get(&m2, &key, NULL, op); + if (rc) { + if (rc == MDB_NOTFOUND) + break; + goto fail; + } + last = *(txnid_t*)key.mv_data; + if (oldest <= last) { + if (!found_old) { + oldest = mdb_find_oldest(txn); + env->me_pgoldest = oldest; + found_old = 1; + } + if (oldest <= last) + break; + } + np = m2.mc_pg[m2.mc_top]; + leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); + if ((rc = mdb_node_read(&m2, leaf, &data)) != MDB_SUCCESS) + goto fail; + + idl = (MDB_ID *) data.mv_data; + i = idl[0]; + if (!mop) { + if (!(env->me_pghead = mop = mdb_midl_alloc(i))) { + rc = ENOMEM; + goto fail; + } + } else { + if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0) + goto fail; + mop = env->me_pghead; + } + env->me_pglast = last; +#if (MDB_DEBUG) > 1 + DPRINTF(("IDL read txn %"Z"u root %"Z"u num %u", + last, txn->mt_dbs[FREE_DBI].md_root, i)); + for (j = i; j; j--) + DPRINTF(("IDL %"Z"u", idl[j])); +#endif + /* Merge in descending sorted order */ + mdb_midl_xmerge(mop, idl); + mop_len = mop[0]; + } + + /* Use new pages from the map when nothing suitable in the freeDB */ + i = 0; + pgno = txn->mt_next_pgno; + if (pgno + num >= env->me_maxpg) { + DPUTS("DB size maxed out"); + rc = MDB_MAP_FULL; + goto fail; + } + +search_done: + if (env->me_flags & MDB_WRITEMAP) { + np = (MDB_page *)(env->me_map + env->me_psize * pgno); + } else { + if (!(np = mdb_page_malloc(txn, num))) { + rc = ENOMEM; + goto fail; + } + } + if (i) { + mop[0] = mop_len -= num; + /* Move any stragglers down */ + for (j = i-num; j < mop_len; ) + mop[++j] = mop[++i]; + } else { + txn->mt_next_pgno = pgno + num; + } + np->mp_pgno = pgno; + mdb_page_dirty(txn, np); + *mp = np; + + return MDB_SUCCESS; + +fail: + txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +/** Copy the used portions of a non-overflow page. + * @param[in] dst page to copy into + * @param[in] src page to copy from + * @param[in] psize size of a page + */ +static void +mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize) +{ + enum { Align = sizeof(pgno_t) }; + indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower; + + /* If page isn't full, just copy the used portion. Adjust + * alignment so memcpy may copy words instead of bytes. + */ + if ((unused &= -Align) && !IS_LEAF2(src)) { + upper = (upper + PAGEBASE) & -Align; + memcpy(dst, src, (lower + PAGEBASE + (Align-1)) & -Align); + memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper), + psize - upper); + } else { + memcpy(dst, src, psize - unused); + } +} + +/** Pull a page off the txn's spill list, if present. + * If a page being referenced was spilled to disk in this txn, bring + * it back and make it dirty/writable again. + * @param[in] txn the transaction handle. + * @param[in] mp the page being referenced. It must not be dirty. + * @param[out] ret the writable page, if any. ret is unchanged if + * mp wasn't spilled. + */ +static int +mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) +{ + MDB_env *env = txn->mt_env; + const MDB_txn *tx2; + unsigned x; + pgno_t pgno = mp->mp_pgno, pn = pgno << 1; + + for (tx2 = txn; tx2; tx2=tx2->mt_parent) { + if (!tx2->mt_spill_pgs) + continue; + x = mdb_midl_search(tx2->mt_spill_pgs, pn); + if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { + MDB_page *np; + int num; + if (txn->mt_dirty_room == 0) + return MDB_TXN_FULL; + if (IS_OVERFLOW(mp)) + num = mp->mp_pages; + else + num = 1; + if (env->me_flags & MDB_WRITEMAP) { + np = mp; + } else { + np = mdb_page_malloc(txn, num); + if (!np) + return ENOMEM; + if (num > 1) + memcpy(np, mp, num * env->me_psize); + else + mdb_page_copy(np, mp, env->me_psize); + } + if (tx2 == txn) { + /* If in current txn, this page is no longer spilled. + * If it happens to be the last page, truncate the spill list. + * Otherwise mark it as deleted by setting the LSB. + */ + if (x == txn->mt_spill_pgs[0]) + txn->mt_spill_pgs[0]--; + else + txn->mt_spill_pgs[x] |= 1; + } /* otherwise, if belonging to a parent txn, the + * page remains spilled until child commits + */ + + mdb_page_dirty(txn, np); + np->mp_flags |= P_DIRTY; + *ret = np; + break; + } + } + return MDB_SUCCESS; +} + +/** Touch a page: make it dirty and re-insert into tree with updated pgno. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc cursor pointing to the page to be touched + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_touch(MDB_cursor *mc) +{ + MDB_page *mp = mc->mc_pg[mc->mc_top], *np; + MDB_txn *txn = mc->mc_txn; + MDB_cursor *m2, *m3; + pgno_t pgno; + int rc; + + if (!F_ISSET(mp->mp_flags, P_DIRTY)) { + if (txn->mt_flags & MDB_TXN_SPILLS) { + np = NULL; + rc = mdb_page_unspill(txn, mp, &np); + if (rc) + goto fail; + if (np) + goto done; + } + if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) || + (rc = mdb_page_alloc(mc, 1, &np))) + goto fail; + pgno = np->mp_pgno; + DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc), + mp->mp_pgno, pgno)); + mdb_cassert(mc, mp->mp_pgno != pgno); + mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); + /* Update the parent page, if any, to point to the new page */ + if (mc->mc_top) { + MDB_page *parent = mc->mc_pg[mc->mc_top-1]; + MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]); + SETPGNO(node, pgno); + } else { + mc->mc_db->md_root = pgno; + } + } else if (txn->mt_parent && !IS_SUBP(mp)) { + MDB_ID2 mid, *dl = txn->mt_u.dirty_list; + pgno = mp->mp_pgno; + /* If txn has a parent, make sure the page is in our + * dirty list. + */ + if (dl[0].mid) { + unsigned x = mdb_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + if (mp != dl[x].mptr) { /* bad cursor? */ + mc->mc_flags &= ~(C_INITIALIZED|C_EOF); + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CORRUPTED; + } + return 0; + } + } + mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX); + /* No - copy it */ + np = mdb_page_malloc(txn, 1); + if (!np) + return ENOMEM; + mid.mid = pgno; + mid.mptr = np; + rc = mdb_mid2l_insert(dl, &mid); + mdb_cassert(mc, rc == 0); + } else { + return 0; + } + + mdb_page_copy(np, mp, txn->mt_env->me_psize); + np->mp_pgno = pgno; + np->mp_flags |= P_DIRTY; + +done: + /* Adjust cursors pointing to mp */ + mc->mc_pg[mc->mc_top] = np; + m2 = txn->mt_cursors[mc->mc_dbi]; + if (mc->mc_flags & C_SUB) { + for (; m2; m2=m2->mc_next) { + m3 = &m2->mc_xcursor->mx_cursor; + if (m3->mc_snum < mc->mc_snum) continue; + if (m3->mc_pg[mc->mc_top] == mp) + m3->mc_pg[mc->mc_top] = np; + } + } else { + for (; m2; m2=m2->mc_next) { + if (m2->mc_snum < mc->mc_snum) continue; + if (m2 == mc) continue; + if (m2->mc_pg[mc->mc_top] == mp) { + m2->mc_pg[mc->mc_top] = np; + if (XCURSOR_INITED(m2) && IS_LEAF(np)) + XCURSOR_REFRESH(m2, np, m2->mc_ki[mc->mc_top]); + } + } + } + return 0; + +fail: + txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +int +mdb_env_sync(MDB_env *env, int force) +{ + int rc = 0; + if (env->me_flags & MDB_RDONLY) + return EACCES; + if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { + if (env->me_flags & MDB_WRITEMAP) { + int flags = ((env->me_flags & MDB_MAPASYNC) && !force) + ? MS_ASYNC : MS_SYNC; + if (MDB_MSYNC(env->me_map, env->me_mapsize, flags)) + rc = ErrCode(); +#ifdef _WIN32 + else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd)) + rc = ErrCode(); +#endif + } else { +#ifdef BROKEN_FDATASYNC + if (env->me_flags & MDB_FSYNCONLY) { + if (fsync(env->me_fd)) + rc = ErrCode(); + } else +#endif + if (MDB_FDATASYNC(env->me_fd)) + rc = ErrCode(); + } + } + return rc; +} + +/** Back up parent txn's cursors, then grab the originals for tracking */ +static int +mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) +{ + MDB_cursor *mc, *bk; + MDB_xcursor *mx; + size_t size; + int i; + + for (i = src->mt_numdbs; --i >= 0; ) { + if ((mc = src->mt_cursors[i]) != NULL) { + size = sizeof(MDB_cursor); + if (mc->mc_xcursor) + size += sizeof(MDB_xcursor); + for (; mc; mc = bk->mc_next) { + bk = malloc(size); + if (!bk) + return ENOMEM; + *bk = *mc; + mc->mc_backup = bk; + mc->mc_db = &dst->mt_dbs[i]; + /* Kill pointers into src to reduce abuse: The + * user may not use mc until dst ends. But we need a valid + * txn pointer here for cursor fixups to keep working. + */ + mc->mc_txn = dst; + mc->mc_dbflag = &dst->mt_dbflags[i]; + if ((mx = mc->mc_xcursor) != NULL) { + *(MDB_xcursor *)(bk+1) = *mx; + mx->mx_cursor.mc_txn = dst; + } + mc->mc_next = dst->mt_cursors[i]; + dst->mt_cursors[i] = mc; + } + } + } + return MDB_SUCCESS; +} + +/** Close this write txn's cursors, give parent txn's cursors back to parent. + * @param[in] txn the transaction handle. + * @param[in] merge true to keep changes to parent cursors, false to revert. + * @return 0 on success, non-zero on failure. + */ +static void +mdb_cursors_close(MDB_txn *txn, unsigned merge) +{ + MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; + MDB_xcursor *mx; + int i; + + for (i = txn->mt_numdbs; --i >= 0; ) { + for (mc = cursors[i]; mc; mc = next) { + next = mc->mc_next; + if ((bk = mc->mc_backup) != NULL) { + if (merge) { + /* Commit changes to parent txn */ + mc->mc_next = bk->mc_next; + mc->mc_backup = bk->mc_backup; + mc->mc_txn = bk->mc_txn; + mc->mc_db = bk->mc_db; + mc->mc_dbflag = bk->mc_dbflag; + if ((mx = mc->mc_xcursor) != NULL) + mx->mx_cursor.mc_txn = bk->mc_txn; + } else { + /* Abort nested txn */ + *mc = *bk; + if ((mx = mc->mc_xcursor) != NULL) + *mx = *(MDB_xcursor *)(bk+1); + } + mc = bk; + } + /* Only malloced cursors are permanently tracked. */ + free(mc); + } + cursors[i] = NULL; + } +} + +#if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ +enum Pidlock_op { + Pidset, Pidcheck +}; +#else +enum Pidlock_op { + Pidset = F_SETLK, Pidcheck = F_GETLK +}; +#endif + +/** Set or check a pid lock. Set returns 0 on success. + * Check returns 0 if the process is certainly dead, nonzero if it may + * be alive (the lock exists or an error happened so we do not know). + * + * On Windows Pidset is a no-op, we merely check for the existence + * of the process with the given pid. On POSIX we use a single byte + * lock on the lockfile, set at an offset equal to the pid. + */ +static int +mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid) +{ +#if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ + int ret = 0; + HANDLE h; + if (op == Pidcheck) { + h = OpenProcess(env->me_pidquery, FALSE, pid); + /* No documented "no such process" code, but other program use this: */ + if (!h) + return ErrCode() != ERROR_INVALID_PARAMETER; + /* A process exists until all handles to it close. Has it exited? */ + ret = WaitForSingleObject(h, 0) != 0; + CloseHandle(h); + } + return ret; +#else + for (;;) { + int rc; + struct flock lock_info; + memset(&lock_info, 0, sizeof(lock_info)); + lock_info.l_type = F_WRLCK; + lock_info.l_whence = SEEK_SET; + lock_info.l_start = pid; + lock_info.l_len = 1; + if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) { + if (op == F_GETLK && lock_info.l_type != F_UNLCK) + rc = -1; + } else if ((rc = ErrCode()) == EINTR) { + continue; + } + return rc; + } +#endif +} + +/** Common code for #mdb_txn_begin() and #mdb_txn_renew(). + * @param[in] txn the transaction handle to initialize + * @return 0 on success, non-zero on failure. + */ +static int +mdb_txn_renew0(MDB_txn *txn) +{ + MDB_env *env = txn->mt_env; + MDB_txninfo *ti = env->me_txns; + MDB_meta *meta; + unsigned int i, nr, flags = txn->mt_flags; + uint16_t x; + int rc, new_notls = 0; + + if ((flags &= MDB_TXN_RDONLY) != 0) { + if (!ti) { + meta = mdb_env_pick_meta(env); + txn->mt_txnid = meta->mm_txnid; + txn->mt_u.reader = NULL; + } else { + MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader : + pthread_getspecific(env->me_txkey); + if (r) { + if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1) + return MDB_BAD_RSLOT; + } else { + MDB_PID_T pid = env->me_pid; + MDB_THR_T tid = pthread_self(); + mdb_mutexref_t rmutex = env->me_rmutex; + + if (!env->me_live_reader) { + rc = mdb_reader_pid(env, Pidset, pid); + if (rc) + return rc; + env->me_live_reader = 1; + } + + if (LOCK_MUTEX(rc, env, rmutex)) + return rc; + nr = ti->mti_numreaders; + for (i=0; i<nr; i++) + if (ti->mti_readers[i].mr_pid == 0) + break; + if (i == env->me_maxreaders) { + UNLOCK_MUTEX(rmutex); + return MDB_READERS_FULL; + } + r = &ti->mti_readers[i]; + /* Claim the reader slot, carefully since other code + * uses the reader table un-mutexed: First reset the + * slot, next publish it in mti_numreaders. After + * that, it is safe for mdb_env_close() to touch it. + * When it will be closed, we can finally claim it. + */ + r->mr_pid = 0; + r->mr_txnid = (txnid_t)-1; + r->mr_tid = tid; + if (i == nr) + ti->mti_numreaders = ++nr; + env->me_close_readers = nr; + r->mr_pid = pid; + UNLOCK_MUTEX(rmutex); + + new_notls = (env->me_flags & MDB_NOTLS); + if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) { + r->mr_pid = 0; + return rc; + } + } + do /* LY: Retry on a race, ITS#7970. */ + r->mr_txnid = ti->mti_txnid; + while(r->mr_txnid != ti->mti_txnid); + txn->mt_txnid = r->mr_txnid; + txn->mt_u.reader = r; + meta = env->me_metas[txn->mt_txnid & 1]; + } + + } else { + /* Not yet touching txn == env->me_txn0, it may be active */ + if (ti) { + if (LOCK_MUTEX(rc, env, env->me_wmutex)) + return rc; + txn->mt_txnid = ti->mti_txnid; + meta = env->me_metas[txn->mt_txnid & 1]; + } else { + meta = mdb_env_pick_meta(env); + txn->mt_txnid = meta->mm_txnid; + } + txn->mt_txnid++; +#if MDB_DEBUG + if (txn->mt_txnid == mdb_debug_start) + mdb_debug = 1; +#endif + txn->mt_child = NULL; + txn->mt_loose_pgs = NULL; + txn->mt_loose_count = 0; + txn->mt_dirty_room = MDB_IDL_UM_MAX; + txn->mt_u.dirty_list = env->me_dirty_list; + txn->mt_u.dirty_list[0].mid = 0; + txn->mt_free_pgs = env->me_free_pgs; + txn->mt_free_pgs[0] = 0; + txn->mt_spill_pgs = NULL; + env->me_txn = txn; + memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned int)); + } + + /* Copy the DB info and flags */ + memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); + + /* Moved to here to avoid a data race in read TXNs */ + txn->mt_next_pgno = meta->mm_last_pg+1; + + txn->mt_flags = flags; + + /* Setup db info */ + txn->mt_numdbs = env->me_numdbs; + for (i=CORE_DBS; i<txn->mt_numdbs; i++) { + x = env->me_dbflags[i]; + txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; + txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_USRVALID|DB_STALE : 0; + } + txn->mt_dbflags[MAIN_DBI] = DB_VALID|DB_USRVALID; + txn->mt_dbflags[FREE_DBI] = DB_VALID; + + if (env->me_flags & MDB_FATAL_ERROR) { + DPUTS("environment had fatal error, must shutdown!"); + rc = MDB_PANIC; + } else if (env->me_maxpg < txn->mt_next_pgno) { + rc = MDB_MAP_RESIZED; + } else { + return MDB_SUCCESS; + } + mdb_txn_end(txn, new_notls /*0 or MDB_END_SLOT*/ | MDB_END_FAIL_BEGIN); + return rc; +} + +int +mdb_txn_renew(MDB_txn *txn) +{ + int rc; + + if (!txn || !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY|MDB_TXN_FINISHED)) + return EINVAL; + + rc = mdb_txn_renew0(txn); + if (rc == MDB_SUCCESS) { + DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root)); + } + return rc; +} + +int +mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) +{ + MDB_txn *txn; + MDB_ntxn *ntxn; + int rc, size, tsize; + + flags &= MDB_TXN_BEGIN_FLAGS; + flags |= env->me_flags & MDB_WRITEMAP; + + if (env->me_flags & MDB_RDONLY & ~flags) /* write txn in RDONLY env */ + return EACCES; + + if (parent) { + /* Nested transactions: Max 1 child, write txns only, no writemap */ + flags |= parent->mt_flags; + if (flags & (MDB_RDONLY|MDB_WRITEMAP|MDB_TXN_BLOCKED)) { + return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN; + } + /* Child txns save MDB_pgstate and use own copy of cursors */ + size = env->me_maxdbs * (sizeof(MDB_db)+sizeof(MDB_cursor *)+1); + size += tsize = sizeof(MDB_ntxn); + } else if (flags & MDB_RDONLY) { + size = env->me_maxdbs * (sizeof(MDB_db)+1); + size += tsize = sizeof(MDB_txn); + } else { + /* Reuse preallocated write txn. However, do not touch it until + * mdb_txn_renew0() succeeds, since it currently may be active. + */ + txn = env->me_txn0; + goto renew; + } + if ((txn = calloc(1, size)) == NULL) { + DPRINTF(("calloc: %s", strerror(errno))); + return ENOMEM; + } + txn->mt_dbxs = env->me_dbxs; /* static */ + txn->mt_dbs = (MDB_db *) ((char *)txn + tsize); + txn->mt_dbflags = (unsigned char *)txn + size - env->me_maxdbs; + txn->mt_flags = flags; + txn->mt_env = env; + + if (parent) { + unsigned int i; + txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbiseqs = parent->mt_dbiseqs; + txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE); + if (!txn->mt_u.dirty_list || + !(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX))) + { + free(txn->mt_u.dirty_list); + free(txn); + return ENOMEM; + } + txn->mt_txnid = parent->mt_txnid; + txn->mt_dirty_room = parent->mt_dirty_room; + txn->mt_u.dirty_list[0].mid = 0; + txn->mt_spill_pgs = NULL; + txn->mt_next_pgno = parent->mt_next_pgno; + parent->mt_flags |= MDB_TXN_HAS_CHILD; + parent->mt_child = txn; + txn->mt_parent = parent; + txn->mt_numdbs = parent->mt_numdbs; + memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); + /* Copy parent's mt_dbflags, but clear DB_NEW */ + for (i=0; i<txn->mt_numdbs; i++) + txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW; + rc = 0; + ntxn = (MDB_ntxn *)txn; + ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */ + if (env->me_pghead) { + size = MDB_IDL_SIZEOF(env->me_pghead); + env->me_pghead = mdb_midl_alloc(env->me_pghead[0]); + if (env->me_pghead) + memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size); + else + rc = ENOMEM; + } + if (!rc) + rc = mdb_cursor_shadow(parent, txn); + if (rc) + mdb_txn_end(txn, MDB_END_FAIL_BEGINCHILD); + } else { /* MDB_RDONLY */ + txn->mt_dbiseqs = env->me_dbiseqs; +renew: + rc = mdb_txn_renew0(txn); + } + if (rc) { + if (txn != env->me_txn0) + free(txn); + } else { + txn->mt_flags |= flags; /* could not change txn=me_txn0 earlier */ + *ret = txn; + DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + txn->mt_txnid, (flags & MDB_RDONLY) ? 'r' : 'w', + (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root)); + } + + return rc; +} + +MDB_env * +mdb_txn_env(MDB_txn *txn) +{ + if(!txn) return NULL; + return txn->mt_env; +} + +size_t +mdb_txn_id(MDB_txn *txn) +{ + if(!txn) return 0; + return txn->mt_txnid; +} + +/** Export or close DBI handles opened in this txn. */ +static void +mdb_dbis_update(MDB_txn *txn, int keep) +{ + int i; + MDB_dbi n = txn->mt_numdbs; + MDB_env *env = txn->mt_env; + unsigned char *tdbflags = txn->mt_dbflags; + + for (i = n; --i >= CORE_DBS;) { + if (tdbflags[i] & DB_NEW) { + if (keep) { + env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; + } else { + char *ptr = env->me_dbxs[i].md_name.mv_data; + if (ptr) { + env->me_dbxs[i].md_name.mv_data = NULL; + env->me_dbxs[i].md_name.mv_size = 0; + env->me_dbflags[i] = 0; + env->me_dbiseqs[i]++; + free(ptr); + } + } + } + } + if (keep && env->me_numdbs < n) + env->me_numdbs = n; +} + +/** End a transaction, except successful commit of a nested transaction. + * May be called twice for readonly txns: First reset it, then abort. + * @param[in] txn the transaction handle to end + * @param[in] mode why and how to end the transaction + */ +static void +mdb_txn_end(MDB_txn *txn, unsigned mode) +{ + MDB_env *env = txn->mt_env; +#if MDB_DEBUG + static const char *const names[] = MDB_END_NAMES; +#endif + + /* Export or close DBI handles opened in this txn */ + mdb_dbis_update(txn, mode & MDB_END_UPDATE); + + DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + names[mode & MDB_END_OPMASK], + txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root)); + + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { + if (txn->mt_u.reader) { + txn->mt_u.reader->mr_txnid = (txnid_t)-1; + if (!(env->me_flags & MDB_NOTLS)) { + txn->mt_u.reader = NULL; /* txn does not own reader */ + } else if (mode & MDB_END_SLOT) { + txn->mt_u.reader->mr_pid = 0; + txn->mt_u.reader = NULL; + } /* else txn owns the slot until it does MDB_END_SLOT */ + } + txn->mt_numdbs = 0; /* prevent further DBI activity */ + txn->mt_flags |= MDB_TXN_FINISHED; + + } else if (!F_ISSET(txn->mt_flags, MDB_TXN_FINISHED)) { + pgno_t *pghead = env->me_pghead; + + if (!(mode & MDB_END_UPDATE)) /* !(already closed cursors) */ + mdb_cursors_close(txn, 0); + if (!(env->me_flags & MDB_WRITEMAP)) { + mdb_dlist_free(txn); + } + + txn->mt_numdbs = 0; + txn->mt_flags = MDB_TXN_FINISHED; + + if (!txn->mt_parent) { + mdb_midl_shrink(&txn->mt_free_pgs); + env->me_free_pgs = txn->mt_free_pgs; + /* me_pgstate: */ + env->me_pghead = NULL; + env->me_pglast = 0; + + env->me_txn = NULL; + mode = 0; /* txn == env->me_txn0, do not free() it */ + + /* The writer mutex was locked in mdb_txn_begin. */ + if (env->me_txns) + UNLOCK_MUTEX(env->me_wmutex); + } else { + txn->mt_parent->mt_child = NULL; + txn->mt_parent->mt_flags &= ~MDB_TXN_HAS_CHILD; + env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; + mdb_midl_free(txn->mt_free_pgs); + mdb_midl_free(txn->mt_spill_pgs); + free(txn->mt_u.dirty_list); + } + + mdb_midl_free(pghead); + } + + if (mode & MDB_END_FREE) + free(txn); +} + +void +mdb_txn_reset(MDB_txn *txn) +{ + if (txn == NULL) + return; + + /* This call is only valid for read-only txns */ + if (!(txn->mt_flags & MDB_TXN_RDONLY)) + return; + + mdb_txn_end(txn, MDB_END_RESET); +} + +void +mdb_txn_abort(MDB_txn *txn) +{ + if (txn == NULL) + return; + + if (txn->mt_child) + mdb_txn_abort(txn->mt_child); + + mdb_txn_end(txn, MDB_END_ABORT|MDB_END_SLOT|MDB_END_FREE); +} + +/** Save the freelist as of this transaction to the freeDB. + * This changes the freelist. Keep trying until it stabilizes. + */ +static int +mdb_freelist_save(MDB_txn *txn) +{ + /* env->me_pghead[] can grow and shrink during this call. + * env->me_pglast and txn->mt_free_pgs[] can only grow. + * Page numbers cannot disappear from txn->mt_free_pgs[]. + */ + MDB_cursor mc; + MDB_env *env = txn->mt_env; + int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; + txnid_t pglast = 0, head_id = 0; + pgno_t freecnt = 0, *free_pgs, *mop; + ssize_t head_room = 0, total_room = 0, mop_len, clean_limit; + + mdb_cursor_init(&mc, txn, FREE_DBI, NULL); + + if (env->me_pghead) { + /* Make sure first page of freeDB is touched and on freelist */ + rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY); + if (rc && rc != MDB_NOTFOUND) + return rc; + } + + if (!env->me_pghead && txn->mt_loose_pgs) { + /* Put loose page numbers in mt_free_pgs, since + * we may be unable to return them to me_pghead. + */ + MDB_page *mp = txn->mt_loose_pgs; + if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0) + return rc; + for (; mp; mp = NEXT_LOOSE_PAGE(mp)) + mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); + txn->mt_loose_pgs = NULL; + txn->mt_loose_count = 0; + } + + /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */ + clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP)) + ? SSIZE_MAX : maxfree_1pg; + + for (;;) { + /* Come back here after each Put() in case freelist changed */ + MDB_val key, data; + pgno_t *pgs; + ssize_t j; + + /* If using records from freeDB which we have not yet + * deleted, delete them and any we reserved for me_pghead. + */ + while (pglast < env->me_pglast) { + rc = mdb_cursor_first(&mc, &key, NULL); + if (rc) + return rc; + pglast = head_id = *(txnid_t *)key.mv_data; + total_room = head_room = 0; + mdb_tassert(txn, pglast <= env->me_pglast); + rc = mdb_cursor_del(&mc, 0); + if (rc) + return rc; + } + + /* Save the IDL of pages freed by this txn, to a single record */ + if (freecnt < txn->mt_free_pgs[0]) { + if (!freecnt) { + /* Make sure last page of freeDB is touched and on freelist */ + rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY); + if (rc && rc != MDB_NOTFOUND) + return rc; + } + free_pgs = txn->mt_free_pgs; + /* Write to last page of freeDB */ + key.mv_size = sizeof(txn->mt_txnid); + key.mv_data = &txn->mt_txnid; + do { + freecnt = free_pgs[0]; + data.mv_size = MDB_IDL_SIZEOF(free_pgs); + rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); + if (rc) + return rc; + /* Retry if mt_free_pgs[] grew during the Put() */ + free_pgs = txn->mt_free_pgs; + } while (freecnt < free_pgs[0]); + mdb_midl_sort(free_pgs); + memcpy(data.mv_data, free_pgs, data.mv_size); +#if (MDB_DEBUG) > 1 + { + unsigned int i = free_pgs[0]; + DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u", + txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i)); + for (; i; i--) + DPRINTF(("IDL %"Z"u", free_pgs[i])); + } +#endif + continue; + } + + mop = env->me_pghead; + mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count; + + /* Reserve records for me_pghead[]. Split it if multi-page, + * to avoid searching freeDB for a page range. Use keys in + * range [1,me_pglast]: Smaller than txnid of oldest reader. + */ + if (total_room >= mop_len) { + if (total_room == mop_len || --more < 0) + break; + } else if (head_room >= maxfree_1pg && head_id > 1) { + /* Keep current record (overflow page), add a new one */ + head_id--; + head_room = 0; + } + /* (Re)write {key = head_id, IDL length = head_room} */ + total_room -= head_room; + head_room = mop_len - total_room; + if (head_room > maxfree_1pg && head_id > 1) { + /* Overflow multi-page for part of me_pghead */ + head_room /= head_id; /* amortize page sizes */ + head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); + } else if (head_room < 0) { + /* Rare case, not bothering to delete this record */ + head_room = 0; + } + key.mv_size = sizeof(head_id); + key.mv_data = &head_id; + data.mv_size = (head_room + 1) * sizeof(pgno_t); + rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); + if (rc) + return rc; + /* IDL is initially empty, zero out at least the length */ + pgs = (pgno_t *)data.mv_data; + j = head_room > clean_limit ? head_room : 0; + do { + pgs[j] = 0; + } while (--j >= 0); + total_room += head_room; + } + + /* Return loose page numbers to me_pghead, though usually none are + * left at this point. The pages themselves remain in dirty_list. + */ + if (txn->mt_loose_pgs) { + MDB_page *mp = txn->mt_loose_pgs; + unsigned count = txn->mt_loose_count; + MDB_IDL loose; + /* Room for loose pages + temp IDL with same */ + if ((rc = mdb_midl_need(&env->me_pghead, 2*count+1)) != 0) + return rc; + mop = env->me_pghead; + loose = mop + MDB_IDL_ALLOCLEN(mop) - count; + for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp)) + loose[ ++count ] = mp->mp_pgno; + loose[0] = count; + mdb_midl_sort(loose); + mdb_midl_xmerge(mop, loose); + txn->mt_loose_pgs = NULL; + txn->mt_loose_count = 0; + mop_len = mop[0]; + } + + /* Fill in the reserved me_pghead records */ + rc = MDB_SUCCESS; + if (mop_len) { + MDB_val key, data; + + mop += mop_len; + rc = mdb_cursor_first(&mc, &key, &data); + for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) { + txnid_t id = *(txnid_t *)key.mv_data; + ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; + MDB_ID save; + + mdb_tassert(txn, len >= 0 && id <= env->me_pglast); + key.mv_data = &id; + if (len > mop_len) { + len = mop_len; + data.mv_size = (len + 1) * sizeof(MDB_ID); + } + data.mv_data = mop -= len; + save = mop[0]; + mop[0] = len; + rc = mdb_cursor_put(&mc, &key, &data, MDB_CURRENT); + mop[0] = save; + if (rc || !(mop_len -= len)) + break; + } + } + return rc; +} + +/** Flush (some) dirty pages to the map, after clearing their dirty flag. + * @param[in] txn the transaction that's being committed + * @param[in] keep number of initial pages in dirty_list to keep dirty. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_flush(MDB_txn *txn, int keep) +{ + MDB_env *env = txn->mt_env; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned psize = env->me_psize, j; + int i, pagecount = dl[0].mid, rc; + size_t size = 0, pos = 0; + pgno_t pgno = 0; + MDB_page *dp = NULL; +#ifdef _WIN32 + OVERLAPPED ov; +#else + struct iovec iov[MDB_COMMIT_PAGES]; + ssize_t wpos = 0, wsize = 0, wres; + size_t next_pos = 1; /* impossible pos, so pos != next_pos */ + int n = 0; +#endif + + j = i = keep; + + if (env->me_flags & MDB_WRITEMAP) { + /* Clear dirty flags */ + while (++i <= pagecount) { + dp = dl[i].mptr; + /* Don't flush this page yet */ + if (dp->mp_flags & (P_LOOSE|P_KEEP)) { + dp->mp_flags &= ~P_KEEP; + dl[++j] = dl[i]; + continue; + } + dp->mp_flags &= ~P_DIRTY; + } + goto done; + } + + /* Write the pages */ + for (;;) { + if (++i <= pagecount) { + dp = dl[i].mptr; + /* Don't flush this page yet */ + if (dp->mp_flags & (P_LOOSE|P_KEEP)) { + dp->mp_flags &= ~P_KEEP; + dl[i].mid = 0; + continue; + } + pgno = dl[i].mid; + /* clear dirty flag */ + dp->mp_flags &= ~P_DIRTY; + pos = pgno * psize; + size = psize; + if (IS_OVERFLOW(dp)) size *= dp->mp_pages; + } +#ifdef _WIN32 + else break; + + /* Windows actually supports scatter/gather I/O, but only on + * unbuffered file handles. Since we're relying on the OS page + * cache for all our data, that's self-defeating. So we just + * write pages one at a time. We use the ov structure to set + * the write offset, to at least save the overhead of a Seek + * system call. + */ + DPRINTF(("committing page %"Z"u", pgno)); + memset(&ov, 0, sizeof(ov)); + ov.Offset = pos & 0xffffffff; + ov.OffsetHigh = pos >> 16 >> 16; + if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) { + rc = ErrCode(); + DPRINTF(("WriteFile: %d", rc)); + return rc; + } +#else + /* Write up to MDB_COMMIT_PAGES dirty pages at a time. */ + if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) { + if (n) { +retry_write: + /* Write previous page(s) */ +#ifdef MDB_USE_PWRITEV + wres = pwritev(env->me_fd, iov, n, wpos); +#else + if (n == 1) { + wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos); + } else { +retry_seek: + if (lseek(env->me_fd, wpos, SEEK_SET) == -1) { + rc = ErrCode(); + if (rc == EINTR) + goto retry_seek; + DPRINTF(("lseek: %s", strerror(rc))); + return rc; + } + wres = writev(env->me_fd, iov, n); + } +#endif + if (wres != wsize) { + if (wres < 0) { + rc = ErrCode(); + if (rc == EINTR) + goto retry_write; + DPRINTF(("Write error: %s", strerror(rc))); + } else { + rc = EIO; /* TODO: Use which error code? */ + DPUTS("short write, filesystem full?"); + } + return rc; + } + n = 0; + } + if (i > pagecount) + break; + wpos = pos; + wsize = 0; + } + DPRINTF(("committing page %"Z"u", pgno)); + next_pos = pos + size; + iov[n].iov_len = size; + iov[n].iov_base = (char *)dp; + wsize += size; + n++; +#endif /* _WIN32 */ + } + + /* MIPS has cache coherency issues, this is a no-op everywhere else + * Note: for any size >= on-chip cache size, entire on-chip cache is + * flushed. + */ + CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE); + + for (i = keep; ++i <= pagecount; ) { + dp = dl[i].mptr; + /* This is a page we skipped above */ + if (!dl[i].mid) { + dl[++j] = dl[i]; + dl[j].mid = dp->mp_pgno; + continue; + } + mdb_dpage_free(env, dp); + } + +done: + i--; + txn->mt_dirty_room += i - j; + dl[0].mid = j; + return MDB_SUCCESS; +} + +int +mdb_txn_commit(MDB_txn *txn) +{ + int rc; + unsigned int i, end_mode; + MDB_env *env; + + if (txn == NULL) + return EINVAL; + + /* mdb_txn_end() mode for a commit which writes nothing */ + end_mode = MDB_END_EMPTY_COMMIT|MDB_END_UPDATE|MDB_END_SLOT|MDB_END_FREE; + + if (txn->mt_child) { + rc = mdb_txn_commit(txn->mt_child); + if (rc) + goto fail; + } + + env = txn->mt_env; + + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { + goto done; + } + + if (txn->mt_flags & (MDB_TXN_FINISHED|MDB_TXN_ERROR)) { + DPUTS("txn has failed/finished, can't commit"); + if (txn->mt_parent) + txn->mt_parent->mt_flags |= MDB_TXN_ERROR; + rc = MDB_BAD_TXN; + goto fail; + } + + if (txn->mt_parent) { + MDB_txn *parent = txn->mt_parent; + MDB_page **lp; + MDB_ID2L dst, src; + MDB_IDL pspill; + unsigned x, y, len, ps_len; + + /* Append our free list to parent's */ + rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs); + if (rc) + goto fail; + mdb_midl_free(txn->mt_free_pgs); + /* Failures after this must either undo the changes + * to the parent or set MDB_TXN_ERROR in the parent. + */ + + parent->mt_next_pgno = txn->mt_next_pgno; + parent->mt_flags = txn->mt_flags; + + /* Merge our cursors into parent's and close them */ + mdb_cursors_close(txn, 1); + + /* Update parent's DB table. */ + memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); + parent->mt_numdbs = txn->mt_numdbs; + parent->mt_dbflags[FREE_DBI] = txn->mt_dbflags[FREE_DBI]; + parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI]; + for (i=CORE_DBS; i<txn->mt_numdbs; i++) { + /* preserve parent's DB_NEW status */ + x = parent->mt_dbflags[i] & DB_NEW; + parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; + } + + dst = parent->mt_u.dirty_list; + src = txn->mt_u.dirty_list; + /* Remove anything in our dirty list from parent's spill list */ + if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) { + x = y = ps_len; + pspill[0] = (pgno_t)-1; + /* Mark our dirty pages as deleted in parent spill list */ + for (i=0, len=src[0].mid; ++i <= len; ) { + MDB_ID pn = src[i].mid << 1; + while (pn > pspill[x]) + x--; + if (pn == pspill[x]) { + pspill[x] = 1; + y = --x; + } + } + /* Squash deleted pagenums if we deleted any */ + for (x=y; ++x <= ps_len; ) + if (!(pspill[x] & 1)) + pspill[++y] = pspill[x]; + pspill[0] = y; + } + + /* Remove anything in our spill list from parent's dirty list */ + if (txn->mt_spill_pgs && txn->mt_spill_pgs[0]) { + for (i=1; i<=txn->mt_spill_pgs[0]; i++) { + MDB_ID pn = txn->mt_spill_pgs[i]; + if (pn & 1) + continue; /* deleted spillpg */ + pn >>= 1; + y = mdb_mid2l_search(dst, pn); + if (y <= dst[0].mid && dst[y].mid == pn) { + free(dst[y].mptr); + while (y < dst[0].mid) { + dst[y] = dst[y+1]; + y++; + } + dst[0].mid--; + } + } + } + + /* Find len = length of merging our dirty list with parent's */ + x = dst[0].mid; + dst[0].mid = 0; /* simplify loops */ + if (parent->mt_parent) { + len = x + src[0].mid; + y = mdb_mid2l_search(src, dst[x].mid + 1) - 1; + for (i = x; y && i; y--) { + pgno_t yp = src[y].mid; + while (yp < dst[i].mid) + i--; + if (yp == dst[i].mid) { + i--; + len--; + } + } + } else { /* Simplify the above for single-ancestor case */ + len = MDB_IDL_UM_MAX - txn->mt_dirty_room; + } + /* Merge our dirty list with parent's */ + y = src[0].mid; + for (i = len; y; dst[i--] = src[y--]) { + pgno_t yp = src[y].mid; + while (yp < dst[x].mid) + dst[i--] = dst[x--]; + if (yp == dst[x].mid) + free(dst[x--].mptr); + } + mdb_tassert(txn, i == x); + dst[0].mid = len; + free(txn->mt_u.dirty_list); + parent->mt_dirty_room = txn->mt_dirty_room; + if (txn->mt_spill_pgs) { + if (parent->mt_spill_pgs) { + /* TODO: Prevent failure here, so parent does not fail */ + rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs); + if (rc) + parent->mt_flags |= MDB_TXN_ERROR; + mdb_midl_free(txn->mt_spill_pgs); + mdb_midl_sort(parent->mt_spill_pgs); + } else { + parent->mt_spill_pgs = txn->mt_spill_pgs; + } + } + + /* Append our loose page list to parent's */ + for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp)) + ; + *lp = txn->mt_loose_pgs; + parent->mt_loose_count += txn->mt_loose_count; + + parent->mt_child = NULL; + mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); + free(txn); + return rc; + } + + if (txn != env->me_txn) { + DPUTS("attempt to commit unknown transaction"); + rc = EINVAL; + goto fail; + } + + mdb_cursors_close(txn, 0); + + if (!txn->mt_u.dirty_list[0].mid && + !(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS))) + goto done; + + DPRINTF(("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u", + txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root)); + + /* Update DB root pointers */ + if (txn->mt_numdbs > CORE_DBS) { + MDB_cursor mc; + MDB_dbi i; + MDB_val data; + data.mv_size = sizeof(MDB_db); + + mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); + for (i = CORE_DBS; i < txn->mt_numdbs; i++) { + if (txn->mt_dbflags[i] & DB_DIRTY) { + if (TXN_DBI_CHANGED(txn, i)) { + rc = MDB_BAD_DBI; + goto fail; + } + data.mv_data = &txn->mt_dbs[i]; + rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, + F_SUBDATA); + if (rc) + goto fail; + } + } + } + + rc = mdb_freelist_save(txn); + if (rc) + goto fail; + + mdb_midl_free(env->me_pghead); + env->me_pghead = NULL; + mdb_midl_shrink(&txn->mt_free_pgs); + +#if (MDB_DEBUG) > 2 + mdb_audit(txn); +#endif + + if ((rc = mdb_page_flush(txn, 0)) || + (rc = mdb_env_sync(env, 0)) || + (rc = mdb_env_write_meta(txn))) + goto fail; + end_mode = MDB_END_COMMITTED|MDB_END_UPDATE; + +done: + mdb_txn_end(txn, end_mode); + return MDB_SUCCESS; + +fail: + mdb_txn_abort(txn); + return rc; +} + +/** Read the environment parameters of a DB environment before + * mapping it into memory. + * @param[in] env the environment handle + * @param[out] meta address of where to store the meta information + * @return 0 on success, non-zero on failure. + */ +static int ESECT +mdb_env_read_header(MDB_env *env, MDB_meta *meta) +{ + MDB_metabuf pbuf; + MDB_page *p; + MDB_meta *m; + int i, rc, off; + enum { Size = sizeof(pbuf) }; + + /* We don't know the page size yet, so use a minimum value. + * Read both meta pages so we can use the latest one. + */ + + for (i=off=0; i<NUM_METAS; i++, off += meta->mm_psize) { +#ifdef _WIN32 + DWORD len; + OVERLAPPED ov; + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; + rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1; + if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF) + rc = 0; +#else + rc = pread(env->me_fd, &pbuf, Size, off); +#endif + if (rc != Size) { + if (rc == 0 && off == 0) + return ENOENT; + rc = rc < 0 ? (int) ErrCode() : MDB_INVALID; + DPRINTF(("read: %s", mdb_strerror(rc))); + return rc; + } + + p = (MDB_page *)&pbuf; + + if (!F_ISSET(p->mp_flags, P_META)) { + DPRINTF(("page %"Z"u not a meta page", p->mp_pgno)); + return MDB_INVALID; + } + + m = METADATA(p); + if (m->mm_magic != MDB_MAGIC) { + DPUTS("meta has invalid magic"); + return MDB_INVALID; + } + + if (m->mm_version != MDB_DATA_VERSION) { + DPRINTF(("database is version %u, expected version %u", + m->mm_version, MDB_DATA_VERSION)); + return MDB_VERSION_MISMATCH; + } + + if (off == 0 || m->mm_txnid > meta->mm_txnid) + *meta = *m; + } + return 0; +} + +/** Fill in most of the zeroed #MDB_meta for an empty database environment */ +static void ESECT +mdb_env_init_meta0(MDB_env *env, MDB_meta *meta) +{ + meta->mm_magic = MDB_MAGIC; + meta->mm_version = MDB_DATA_VERSION; + meta->mm_mapsize = env->me_mapsize; + meta->mm_psize = env->me_psize; + meta->mm_last_pg = NUM_METAS-1; + meta->mm_flags = env->me_flags & 0xffff; + meta->mm_flags |= MDB_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ + meta->mm_dbs[FREE_DBI].md_root = P_INVALID; + meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; +} + +/** Write the environment parameters of a freshly created DB environment. + * @param[in] env the environment handle + * @param[in] meta the #MDB_meta to write + * @return 0 on success, non-zero on failure. + */ +static int ESECT +mdb_env_init_meta(MDB_env *env, MDB_meta *meta) +{ + MDB_page *p, *q; + int rc; + unsigned int psize; +#ifdef _WIN32 + DWORD len; + OVERLAPPED ov; + memset(&ov, 0, sizeof(ov)); +#define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ + ov.Offset = pos; \ + rc = WriteFile(fd, ptr, size, &len, &ov); } while(0) +#else + int len; +#define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ + len = pwrite(fd, ptr, size, pos); \ + if (len == -1 && ErrCode() == EINTR) continue; \ + rc = (len >= 0); break; } while(1) +#endif + + DPUTS("writing new meta page"); + + psize = env->me_psize; + + p = calloc(NUM_METAS, psize); + if (!p) + return ENOMEM; + + p->mp_pgno = 0; + p->mp_flags = P_META; + *(MDB_meta *)METADATA(p) = *meta; + + q = (MDB_page *)((char *)p + psize); + q->mp_pgno = 1; + q->mp_flags = P_META; + *(MDB_meta *)METADATA(q) = *meta; + + DO_PWRITE(rc, env->me_fd, p, psize * NUM_METAS, len, 0); + if (!rc) + rc = ErrCode(); + else if ((unsigned) len == psize * NUM_METAS) + rc = MDB_SUCCESS; + else + rc = ENOSPC; + free(p); + return rc; +} + +/** Update the environment info to commit a transaction. + * @param[in] txn the transaction that's being committed + * @return 0 on success, non-zero on failure. + */ +static int +mdb_env_write_meta(MDB_txn *txn) +{ + MDB_env *env; + MDB_meta meta, metab, *mp; + unsigned flags; + size_t mapsize; + off_t off; + int rc, len, toggle; + char *ptr; + HANDLE mfd; +#ifdef _WIN32 + OVERLAPPED ov; +#else + int r2; +#endif + + toggle = txn->mt_txnid & 1; + DPRINTF(("writing meta page %d for root page %"Z"u", + toggle, txn->mt_dbs[MAIN_DBI].md_root)); + + env = txn->mt_env; + flags = env->me_flags; + mp = env->me_metas[toggle]; + mapsize = env->me_metas[toggle ^ 1]->mm_mapsize; + /* Persist any increases of mapsize config */ + if (mapsize < env->me_mapsize) + mapsize = env->me_mapsize; + + if (flags & MDB_WRITEMAP) { + mp->mm_mapsize = mapsize; + mp->mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; + mp->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + mp->mm_last_pg = txn->mt_next_pgno - 1; +#if (__GNUC__ * 100 + __GNUC_MINOR__ >= 404) && /* TODO: portability */ \ + !(defined(__i386__) || defined(__x86_64__)) + /* LY: issue a memory barrier, if not x86. ITS#7969 */ + __sync_synchronize(); +#endif + mp->mm_txnid = txn->mt_txnid; + if (!(flags & (MDB_NOMETASYNC|MDB_NOSYNC))) { + unsigned meta_size = env->me_psize; + rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; + ptr = (char *)mp - PAGEHDRSZ; +#ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */ + r2 = (ptr - env->me_map) & (env->me_os_psize - 1); + ptr -= r2; + meta_size += r2; +#endif + if (MDB_MSYNC(ptr, meta_size, rc)) { + rc = ErrCode(); + goto fail; + } + } + goto done; + } + metab.mm_txnid = mp->mm_txnid; + metab.mm_last_pg = mp->mm_last_pg; + + meta.mm_mapsize = mapsize; + meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; + meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + meta.mm_last_pg = txn->mt_next_pgno - 1; + meta.mm_txnid = txn->mt_txnid; + + off = offsetof(MDB_meta, mm_mapsize); + ptr = (char *)&meta + off; + len = sizeof(MDB_meta) - off; + off += (char *)mp - env->me_map; + + /* Write to the SYNC fd unless MDB_NOSYNC/MDB_NOMETASYNC. + * (me_mfd goes to the same file as me_fd, but writing to it + * also syncs to disk. Avoids a separate fdatasync() call.) + */ + mfd = (flags & (MDB_NOSYNC|MDB_NOMETASYNC)) ? env->me_fd : env->me_mfd; +#ifdef _WIN32 + { + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; + if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov)) + rc = -1; + } +#else +retry_write: + rc = pwrite(mfd, ptr, len, off); +#endif + if (rc != len) { + rc = rc < 0 ? ErrCode() : EIO; +#ifndef _WIN32 + if (rc == EINTR) + goto retry_write; +#endif + DPUTS("write failed, disk error?"); + /* On a failure, the pagecache still contains the new data. + * Write some old data back, to prevent it from being used. + * Use the non-SYNC fd; we know it will fail anyway. + */ + meta.mm_last_pg = metab.mm_last_pg; + meta.mm_txnid = metab.mm_txnid; +#ifdef _WIN32 + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; + WriteFile(env->me_fd, ptr, len, NULL, &ov); +#else + r2 = pwrite(env->me_fd, ptr, len, off); + (void)r2; /* Silence warnings. We don't care about pwrite's return value */ +#endif +fail: + env->me_flags |= MDB_FATAL_ERROR; + return rc; + } + /* MIPS has cache coherency issues, this is a no-op everywhere else */ + CACHEFLUSH(env->me_map + off, len, DCACHE); +done: + /* Memory ordering issues are irrelevant; since the entire writer + * is wrapped by wmutex, all of these changes will become visible + * after the wmutex is unlocked. Since the DB is multi-version, + * readers will get consistent data regardless of how fresh or + * how stale their view of these values is. + */ + if (env->me_txns) + env->me_txns->mti_txnid = txn->mt_txnid; + + return MDB_SUCCESS; +} + +/** Check both meta pages to see which one is newer. + * @param[in] env the environment handle + * @return newest #MDB_meta. + */ +static MDB_meta * +mdb_env_pick_meta(const MDB_env *env) +{ + MDB_meta *const *metas = env->me_metas; + return metas[ metas[0]->mm_txnid < metas[1]->mm_txnid ]; +} + +int ESECT +mdb_env_create(MDB_env **env) +{ + MDB_env *e; + + e = calloc(1, sizeof(MDB_env)); + if (!e) + return ENOMEM; + + e->me_maxreaders = DEFAULT_READERS; + e->me_maxdbs = e->me_numdbs = CORE_DBS; + e->me_fd = INVALID_HANDLE_VALUE; + e->me_lfd = INVALID_HANDLE_VALUE; + e->me_mfd = INVALID_HANDLE_VALUE; +#ifdef MDB_USE_POSIX_SEM + e->me_rmutex = SEM_FAILED; + e->me_wmutex = SEM_FAILED; +#endif + e->me_pid = getpid(); + GET_PAGESIZE(e->me_os_psize); + VGMEMP_CREATE(e,0,0); + *env = e; + return MDB_SUCCESS; +} + +static int ESECT +mdb_env_map(MDB_env *env, void *addr) +{ + MDB_page *p; + unsigned int flags = env->me_flags; +#ifdef _WIN32 + int rc; + HANDLE mh; + LONG sizelo, sizehi; + size_t msize; + + if (flags & MDB_RDONLY) { + /* Don't set explicit map size, use whatever exists */ + msize = 0; + sizelo = 0; + sizehi = 0; + } else { + msize = env->me_mapsize; + sizelo = msize & 0xffffffff; + sizehi = msize >> 16 >> 16; /* only needed on Win64 */ + + /* Windows won't create mappings for zero length files. + * and won't map more than the file size. + * Just set the maxsize right now. + */ + if (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo + || !SetEndOfFile(env->me_fd) + || SetFilePointer(env->me_fd, 0, NULL, 0) != 0) + return ErrCode(); + } + + mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ? + PAGE_READWRITE : PAGE_READONLY, + sizehi, sizelo, NULL); + if (!mh) + return ErrCode(); + env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ? + FILE_MAP_WRITE : FILE_MAP_READ, + 0, 0, msize, addr); + rc = env->me_map ? 0 : ErrCode(); + CloseHandle(mh); + if (rc) + return rc; +#else + int prot = PROT_READ; + if (flags & MDB_WRITEMAP) { + prot |= PROT_WRITE; + if (ftruncate(env->me_fd, env->me_mapsize) < 0) + return ErrCode(); + } + env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED, + env->me_fd, 0); + if (env->me_map == MAP_FAILED) { + env->me_map = NULL; + return ErrCode(); + } + + if (flags & MDB_NORDAHEAD) { + /* Turn off readahead. It's harmful when the DB is larger than RAM. */ +#ifdef MADV_RANDOM + madvise(env->me_map, env->me_mapsize, MADV_RANDOM); +#else +#ifdef POSIX_MADV_RANDOM + posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM); +#endif /* POSIX_MADV_RANDOM */ +#endif /* MADV_RANDOM */ + } +#endif /* _WIN32 */ + + /* Can happen because the address argument to mmap() is just a + * hint. mmap() can pick another, e.g. if the range is in use. + * The MAP_FIXED flag would prevent that, but then mmap could + * instead unmap existing pages to make room for the new map. + */ + if (addr && env->me_map != addr) + return EBUSY; /* TODO: Make a new MDB_* error code? */ + + p = (MDB_page *)env->me_map; + env->me_metas[0] = METADATA(p); + env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + env->me_psize); + + return MDB_SUCCESS; +} + +int ESECT +mdb_env_set_mapsize(MDB_env *env, size_t size) +{ + /* If env is already open, caller is responsible for making + * sure there are no active txns. + */ + if (env->me_map) { + int rc; + MDB_meta *meta; + void *old; + if (env->me_txn) + return EINVAL; + meta = mdb_env_pick_meta(env); + if (!size) + size = meta->mm_mapsize; + { + /* Silently round up to minimum if the size is too small */ + size_t minsize = (meta->mm_last_pg + 1) * env->me_psize; + if (size < minsize) + size = minsize; + } + munmap(env->me_map, env->me_mapsize); + env->me_mapsize = size; + old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL; + rc = mdb_env_map(env, old); + if (rc) + return rc; + } + env->me_mapsize = size; + if (env->me_psize) + env->me_maxpg = env->me_mapsize / env->me_psize; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) +{ + if (env->me_map) + return EINVAL; + env->me_maxdbs = dbs + CORE_DBS; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_set_maxreaders(MDB_env *env, unsigned int readers) +{ + if (env->me_map || readers < 1) + return EINVAL; + env->me_maxreaders = readers; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers) +{ + if (!env || !readers) + return EINVAL; + *readers = env->me_maxreaders; + return MDB_SUCCESS; +} + +static int ESECT +mdb_fsize(HANDLE fd, size_t *size) +{ +#ifdef _WIN32 + LARGE_INTEGER fsize; + + if (!GetFileSizeEx(fd, &fsize)) + return ErrCode(); + + *size = fsize.QuadPart; +#else + struct stat st; + + if (fstat(fd, &st)) + return ErrCode(); + + *size = st.st_size; +#endif + return MDB_SUCCESS; +} + + +#ifdef _WIN32 +typedef wchar_t mdb_nchar_t; +# define MDB_NAME(str) L##str +# define mdb_name_cpy wcscpy +#else +/** Character type for file names: char on Unix, wchar_t on Windows */ +typedef char mdb_nchar_t; +# define MDB_NAME(str) str /**< #mdb_nchar_t[] string literal */ +# define mdb_name_cpy strcpy /**< Copy name (#mdb_nchar_t string) */ +#endif + +/** Filename - string of #mdb_nchar_t[] */ +typedef struct MDB_name { + int mn_len; /**< Length */ + int mn_alloced; /**< True if #mn_val was malloced */ + mdb_nchar_t *mn_val; /**< Contents */ +} MDB_name; + +/** Filename suffixes [datafile,lockfile][without,with MDB_NOSUBDIR] */ +static const mdb_nchar_t *const mdb_suffixes[2][2] = { + { MDB_NAME("/data.mdb"), MDB_NAME("") }, + { MDB_NAME("/lock.mdb"), MDB_NAME("-lock") } +}; + +#define MDB_SUFFLEN 9 /**< Max string length in #mdb_suffixes[] */ + +/** Set up filename + scratch area for filename suffix, for opening files. + * It should be freed with #mdb_fname_destroy(). + * On Windows, paths are converted from char *UTF-8 to wchar_t *UTF-16. + * + * @param[in] path Pathname for #mdb_env_open(). + * @param[in] envflags Whether a subdir and/or lockfile will be used. + * @param[out] fname Resulting filename, with room for a suffix if necessary. + */ +static int ESECT +mdb_fname_init(const char *path, unsigned envflags, MDB_name *fname) +{ + int no_suffix = F_ISSET(envflags, MDB_NOSUBDIR|MDB_NOLOCK); + fname->mn_alloced = 0; +#ifdef _WIN32 + return utf8_to_utf16(path, fname, no_suffix ? 0 : MDB_SUFFLEN); +#else + fname->mn_len = strlen(path); + if (no_suffix) + fname->mn_val = (char *) path; + else if ((fname->mn_val = malloc(fname->mn_len + MDB_SUFFLEN+1)) != NULL) { + fname->mn_alloced = 1; + strcpy(fname->mn_val, path); + } + else + return ENOMEM; + return MDB_SUCCESS; +#endif +} + +/** Destroy \b fname from #mdb_fname_init() */ +#define mdb_fname_destroy(fname) \ + do { if ((fname).mn_alloced) free((fname).mn_val); } while (0) + +#ifdef O_CLOEXEC /* POSIX.1-2008: Set FD_CLOEXEC atomically at open() */ +# define MDB_CLOEXEC O_CLOEXEC +#else +# define MDB_CLOEXEC 0 +#endif + +/** File type, access mode etc. for #mdb_fopen() */ +enum mdb_fopen_type { +#ifdef _WIN32 + MDB_O_RDONLY, MDB_O_RDWR, MDB_O_META, MDB_O_COPY, MDB_O_LOCKS +#else + /* A comment in mdb_fopen() explains some O_* flag choices. */ + MDB_O_RDONLY= O_RDONLY, /**< for RDONLY me_fd */ + MDB_O_RDWR = O_RDWR |O_CREAT, /**< for me_fd */ + MDB_O_META = O_WRONLY|MDB_DSYNC |MDB_CLOEXEC, /**< for me_mfd */ + MDB_O_COPY = O_WRONLY|O_CREAT|O_EXCL|MDB_CLOEXEC, /**< for #mdb_env_copy() */ + /** Bitmask for open() flags in enum #mdb_fopen_type. The other bits + * distinguish otherwise-equal MDB_O_* constants from each other. + */ + MDB_O_MASK = MDB_O_RDWR|MDB_CLOEXEC | MDB_O_RDONLY|MDB_O_META|MDB_O_COPY, + MDB_O_LOCKS = MDB_O_RDWR|MDB_CLOEXEC | ((MDB_O_MASK+1) & ~MDB_O_MASK) /**< for me_lfd */ +#endif +}; + +/** Open an LMDB file. + * @param[in] env The LMDB environment. + * @param[in,out] fname Path from from #mdb_fname_init(). A suffix is + * appended if necessary to create the filename, without changing mn_len. + * @param[in] which Determines file type, access mode, etc. + * @param[in] mode The Unix permissions for the file, if we create it. + * @param[out] res Resulting file handle. + * @return 0 on success, non-zero on failure. + */ +static int ESECT +mdb_fopen(const MDB_env *env, MDB_name *fname, + enum mdb_fopen_type which, mdb_mode_t mode, + HANDLE *res) +{ + int rc = MDB_SUCCESS; + HANDLE fd; +#ifdef _WIN32 + DWORD acc, share, disp, attrs; +#else + int flags; +#endif + + if (fname->mn_alloced) /* modifiable copy */ + mdb_name_cpy(fname->mn_val + fname->mn_len, + mdb_suffixes[which==MDB_O_LOCKS][F_ISSET(env->me_flags, MDB_NOSUBDIR)]); + + /* The directory must already exist. Usually the file need not. + * MDB_O_META requires the file because we already created it using + * MDB_O_RDWR. MDB_O_COPY must not overwrite an existing file. + * + * With MDB_O_COPY we do not want the OS to cache the writes, since + * the source data is already in the OS cache. + * + * The lockfile needs FD_CLOEXEC (close file descriptor on exec*()) + * to avoid the flock() issues noted under Caveats in lmdb.h. + * Also set it for other filehandles which the user cannot get at + * and close himself, which he may need after fork(). I.e. all but + * me_fd, which programs do use via mdb_env_get_fd(). + */ + +#ifdef _WIN32 + acc = GENERIC_READ|GENERIC_WRITE; + share = FILE_SHARE_READ|FILE_SHARE_WRITE; + disp = OPEN_ALWAYS; + attrs = FILE_ATTRIBUTE_NORMAL; + switch (which) { + case MDB_O_RDONLY: /* read-only datafile */ + acc = GENERIC_READ; + disp = OPEN_EXISTING; + break; + case MDB_O_META: /* for writing metapages */ + acc = GENERIC_WRITE; + disp = OPEN_EXISTING; + attrs = FILE_ATTRIBUTE_NORMAL|FILE_FLAG_WRITE_THROUGH; + break; + case MDB_O_COPY: /* mdb_env_copy() & co */ + acc = GENERIC_WRITE; + share = 0; + disp = CREATE_NEW; + attrs = FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH; + break; + default: break; /* silence gcc -Wswitch (not all enum values handled) */ + } + fd = CreateFileW(fname->mn_val, acc, share, NULL, disp, attrs, NULL); +#else + fd = open(fname->mn_val, which & MDB_O_MASK, mode); +#endif + + if (fd == INVALID_HANDLE_VALUE) + rc = ErrCode(); +#ifndef _WIN32 + else { + if (which != MDB_O_RDONLY && which != MDB_O_RDWR) { + /* Set CLOEXEC if we could not pass it to open() */ + if (!MDB_CLOEXEC && (flags = fcntl(fd, F_GETFD)) != -1) + (void) fcntl(fd, F_SETFD, flags | FD_CLOEXEC); + } + if (which == MDB_O_COPY && env->me_psize >= env->me_os_psize) { + /* This may require buffer alignment. There is no portable + * way to ask how much, so we require OS pagesize alignment. + */ +# ifdef F_NOCACHE /* __APPLE__ */ + (void) fcntl(fd, F_NOCACHE, 1); +# elif defined O_DIRECT + /* open(...O_DIRECT...) would break on filesystems without + * O_DIRECT support (ITS#7682). Try to set it here instead. + */ + if ((flags = fcntl(fd, F_GETFL)) != -1) + (void) fcntl(fd, F_SETFL, flags | O_DIRECT); +# endif + } + } +#endif /* !_WIN32 */ + + *res = fd; + return rc; +} + + +#ifdef BROKEN_FDATASYNC +#include <sys/utsname.h> +#include <sys/vfs.h> +#endif + +/** Further setup required for opening an LMDB environment + */ +static int ESECT +mdb_env_open2(MDB_env *env) +{ + unsigned int flags = env->me_flags; + int i, newenv = 0, rc; + MDB_meta meta; + +#ifdef _WIN32 + /* See if we should use QueryLimited */ + rc = GetVersion(); + if ((rc & 0xff) > 5) + env->me_pidquery = MDB_PROCESS_QUERY_LIMITED_INFORMATION; + else + env->me_pidquery = PROCESS_QUERY_INFORMATION; +#endif /* _WIN32 */ + +#ifdef BROKEN_FDATASYNC + /* ext3/ext4 fdatasync is broken on some older Linux kernels. + * https://lkml.org/lkml/2012/9/3/83 + * Kernels after 3.6-rc6 are known good. + * https://lkml.org/lkml/2012/9/10/556 + * See if the DB is on ext3/ext4, then check for new enough kernel + * Kernels 2.6.32.60, 2.6.34.15, 3.2.30, and 3.5.4 are also known + * to be patched. + */ + { + struct statfs st; + fstatfs(env->me_fd, &st); + while (st.f_type == 0xEF53) { + struct utsname uts; + int i; + uname(&uts); + if (uts.release[0] < '3') { + if (!strncmp(uts.release, "2.6.32.", 7)) { + i = atoi(uts.release+7); + if (i >= 60) + break; /* 2.6.32.60 and newer is OK */ + } else if (!strncmp(uts.release, "2.6.34.", 7)) { + i = atoi(uts.release+7); + if (i >= 15) + break; /* 2.6.34.15 and newer is OK */ + } + } else if (uts.release[0] == '3') { + i = atoi(uts.release+2); + if (i > 5) + break; /* 3.6 and newer is OK */ + if (i == 5) { + i = atoi(uts.release+4); + if (i >= 4) + break; /* 3.5.4 and newer is OK */ + } else if (i == 2) { + i = atoi(uts.release+4); + if (i >= 30) + break; /* 3.2.30 and newer is OK */ + } + } else { /* 4.x and newer is OK */ + break; + } + env->me_flags |= MDB_FSYNCONLY; + break; + } + } +#endif + + if ((i = mdb_env_read_header(env, &meta)) != 0) { + if (i != ENOENT) + return i; + DPUTS("new mdbenv"); + newenv = 1; + env->me_psize = env->me_os_psize; + if (env->me_psize > MAX_PAGESIZE) + env->me_psize = MAX_PAGESIZE; + memset(&meta, 0, sizeof(meta)); + mdb_env_init_meta0(env, &meta); + meta.mm_mapsize = DEFAULT_MAPSIZE; + } else { + env->me_psize = meta.mm_psize; + } + + /* Was a mapsize configured? */ + if (!env->me_mapsize) { + env->me_mapsize = meta.mm_mapsize; + } + { + /* Make sure mapsize >= committed data size. Even when using + * mm_mapsize, which could be broken in old files (ITS#7789). + */ + size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize; + if (env->me_mapsize < minsize) + env->me_mapsize = minsize; + } + meta.mm_mapsize = env->me_mapsize; + + if (newenv && !(flags & MDB_FIXEDMAP)) { + /* mdb_env_map() may grow the datafile. Write the metapages + * first, so the file will be valid if initialization fails. + * Except with FIXEDMAP, since we do not yet know mm_address. + * We could fill in mm_address later, but then a different + * program might end up doing that - one with a memory layout + * and map address which does not suit the main program. + */ + rc = mdb_env_init_meta(env, &meta); + if (rc) + return rc; + newenv = 0; + } + + rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL); + if (rc) + return rc; + + if (newenv) { + if (flags & MDB_FIXEDMAP) + meta.mm_address = env->me_map; + i = mdb_env_init_meta(env, &meta); + if (i != MDB_SUCCESS) { + return i; + } + } + + env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; + env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2) + - sizeof(indx_t); +#if !(MDB_MAXKEYSIZE) + env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); +#endif + env->me_maxpg = env->me_mapsize / env->me_psize; + +#if MDB_DEBUG + { + MDB_meta *meta = mdb_env_pick_meta(env); + MDB_db *db = &meta->mm_dbs[MAIN_DBI]; + + DPRINTF(("opened database version %u, pagesize %u", + meta->mm_version, env->me_psize)); + DPRINTF(("using meta page %d", (int) (meta->mm_txnid & 1))); + DPRINTF(("depth: %u", db->md_depth)); + DPRINTF(("entries: %"Z"u", db->md_entries)); + DPRINTF(("branch pages: %"Z"u", db->md_branch_pages)); + DPRINTF(("leaf pages: %"Z"u", db->md_leaf_pages)); + DPRINTF(("overflow pages: %"Z"u", db->md_overflow_pages)); + DPRINTF(("root: %"Z"u", db->md_root)); + } +#endif + + return MDB_SUCCESS; +} + + +/** Release a reader thread's slot in the reader lock table. + * This function is called automatically when a thread exits. + * @param[in] ptr This points to the slot in the reader lock table. + */ +static void +mdb_env_reader_dest(void *ptr) +{ + MDB_reader *reader = ptr; + +#ifndef _WIN32 + if (reader->mr_pid == getpid()) /* catch pthread_exit() in child process */ +#endif + /* We omit the mutex, so do this atomically (i.e. skip mr_txnid) */ + reader->mr_pid = 0; +} + +#ifdef _WIN32 +/** Junk for arranging thread-specific callbacks on Windows. This is + * necessarily platform and compiler-specific. Windows supports up + * to 1088 keys. Let's assume nobody opens more than 64 environments + * in a single process, for now. They can override this if needed. + */ +#ifndef MAX_TLS_KEYS +#define MAX_TLS_KEYS 64 +#endif +static pthread_key_t mdb_tls_keys[MAX_TLS_KEYS]; +static int mdb_tls_nkeys; + +static void NTAPI mdb_tls_callback(PVOID module, DWORD reason, PVOID ptr) +{ + int i; + switch(reason) { + case DLL_PROCESS_ATTACH: break; + case DLL_THREAD_ATTACH: break; + case DLL_THREAD_DETACH: + for (i=0; i<mdb_tls_nkeys; i++) { + MDB_reader *r = pthread_getspecific(mdb_tls_keys[i]); + if (r) { + mdb_env_reader_dest(r); + } + } + break; + case DLL_PROCESS_DETACH: break; + } +} +#ifdef __GNUC__ +#ifdef _WIN64 +const PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback; +#else +PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback; +#endif +#else +#ifdef _WIN64 +/* Force some symbol references. + * _tls_used forces the linker to create the TLS directory if not already done + * mdb_tls_cbp prevents whole-program-optimizer from dropping the symbol. + */ +#pragma comment(linker, "/INCLUDE:_tls_used") +#pragma comment(linker, "/INCLUDE:mdb_tls_cbp") +#pragma const_seg(".CRT$XLB") +extern const PIMAGE_TLS_CALLBACK mdb_tls_cbp; +const PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback; +#pragma const_seg() +#else /* _WIN32 */ +#pragma comment(linker, "/INCLUDE:__tls_used") +#pragma comment(linker, "/INCLUDE:_mdb_tls_cbp") +#pragma data_seg(".CRT$XLB") +PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback; +#pragma data_seg() +#endif /* WIN 32/64 */ +#endif /* !__GNUC__ */ +#endif + +/** Downgrade the exclusive lock on the region back to shared */ +static int ESECT +mdb_env_share_locks(MDB_env *env, int *excl) +{ + int rc = 0; + MDB_meta *meta = mdb_env_pick_meta(env); + + env->me_txns->mti_txnid = meta->mm_txnid; + +#ifdef _WIN32 + { + OVERLAPPED ov; + /* First acquire a shared lock. The Unlock will + * then release the existing exclusive lock. + */ + memset(&ov, 0, sizeof(ov)); + if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { + rc = ErrCode(); + } else { + UnlockFile(env->me_lfd, 0, 0, 1, 0); + *excl = 0; + } + } +#else + { + struct flock lock_info; + /* The shared lock replaces the existing lock */ + memset((void *)&lock_info, 0, sizeof(lock_info)); + lock_info.l_type = F_RDLCK; + lock_info.l_whence = SEEK_SET; + lock_info.l_start = 0; + lock_info.l_len = 1; + while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && + (rc = ErrCode()) == EINTR) ; + *excl = rc ? -1 : 0; /* error may mean we lost the lock */ + } +#endif + + return rc; +} + +/** Try to get exclusive lock, otherwise shared. + * Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive. + */ +static int ESECT +mdb_env_excl_lock(MDB_env *env, int *excl) +{ + int rc = 0; +#ifdef _WIN32 + if (LockFile(env->me_lfd, 0, 0, 1, 0)) { + *excl = 1; + } else { + OVERLAPPED ov; + memset(&ov, 0, sizeof(ov)); + if (LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { + *excl = 0; + } else { + rc = ErrCode(); + } + } +#else + struct flock lock_info; + memset((void *)&lock_info, 0, sizeof(lock_info)); + lock_info.l_type = F_WRLCK; + lock_info.l_whence = SEEK_SET; + lock_info.l_start = 0; + lock_info.l_len = 1; + while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && + (rc = ErrCode()) == EINTR) ; + if (!rc) { + *excl = 1; + } else +# ifndef MDB_USE_POSIX_MUTEX + if (*excl < 0) /* always true when MDB_USE_POSIX_MUTEX */ +# endif + { + lock_info.l_type = F_RDLCK; + while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) && + (rc = ErrCode()) == EINTR) ; + if (rc == 0) + *excl = 0; + } +#endif + return rc; +} + +#ifdef MDB_USE_HASH +/* + * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code + * + * @(#) $Revision: 5.1 $ + * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $ + * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $ + * + * http://www.isthe.com/chongo/tech/comp/fnv/index.html + * + *** + * + * Please do not copyright this code. This code is in the public domain. + * + * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO + * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + * + * By: + * chongo <Landon Curt Noll> /\oo/\ + * http://www.isthe.com/chongo/ + * + * Share and Enjoy! :-) + */ + +typedef unsigned long long mdb_hash_t; +#define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL) + +/** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer + * @param[in] val value to hash + * @param[in] hval initial value for hash + * @return 64 bit hash + * + * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the + * hval arg on the first call. + */ +static mdb_hash_t +mdb_hash_val(MDB_val *val, mdb_hash_t hval) +{ + unsigned char *s = (unsigned char *)val->mv_data; /* unsigned string */ + unsigned char *end = s + val->mv_size; + /* + * FNV-1a hash each octet of the string + */ + while (s < end) { + /* xor the bottom with the current octet */ + hval ^= (mdb_hash_t)*s++; + + /* multiply by the 64 bit FNV magic prime mod 2^64 */ + hval += (hval << 1) + (hval << 4) + (hval << 5) + + (hval << 7) + (hval << 8) + (hval << 40); + } + /* return our new hash value */ + return hval; +} + +/** Hash the string and output the encoded hash. + * This uses modified RFC1924 Ascii85 encoding to accommodate systems with + * very short name limits. We don't care about the encoding being reversible, + * we just want to preserve as many bits of the input as possible in a + * small printable string. + * @param[in] str string to hash + * @param[out] encbuf an array of 11 chars to hold the hash + */ +static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"; + +static void ESECT +mdb_pack85(unsigned long l, char *out) +{ + int i; + + for (i=0; i<5; i++) { + *out++ = mdb_a85[l % 85]; + l /= 85; + } +} + +static void ESECT +mdb_hash_enc(MDB_val *val, char *encbuf) +{ + mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT); + + mdb_pack85(h, encbuf); + mdb_pack85(h>>32, encbuf+5); + encbuf[10] = '\0'; +} +#endif + +/** Open and/or initialize the lock region for the environment. + * @param[in] env The LMDB environment. + * @param[in] fname Filename + scratch area, from #mdb_fname_init(). + * @param[in] mode The Unix permissions for the file, if we create it. + * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive + * @return 0 on success, non-zero on failure. + */ +static int ESECT +mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) +{ +#ifdef _WIN32 +# define MDB_ERRCODE_ROFS ERROR_WRITE_PROTECT +#else +# define MDB_ERRCODE_ROFS EROFS +#endif + int rc; + off_t size, rsize; + + rc = mdb_fopen(env, fname, MDB_O_LOCKS, mode, &env->me_lfd); + if (rc) { + /* Omit lockfile if read-only env on read-only filesystem */ + if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) { + return MDB_SUCCESS; + } + goto fail; + } + + if (!(env->me_flags & MDB_NOTLS)) { + rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest); + if (rc) + goto fail; + env->me_flags |= MDB_ENV_TXKEY; +#ifdef _WIN32 + /* Windows TLS callbacks need help finding their TLS info. */ + if (mdb_tls_nkeys >= MAX_TLS_KEYS) { + rc = MDB_TLS_FULL; + goto fail; + } + mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey; +#endif + } + + /* Try to get exclusive lock. If we succeed, then + * nobody is using the lock region and we should initialize it. + */ + if ((rc = mdb_env_excl_lock(env, excl))) goto fail; + +#ifdef _WIN32 + size = GetFileSize(env->me_lfd, NULL); +#else + size = lseek(env->me_lfd, 0, SEEK_END); + if (size == -1) goto fail_errno; +#endif + rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); + if (size < rsize && *excl > 0) { +#ifdef _WIN32 + if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize + || !SetEndOfFile(env->me_lfd)) + goto fail_errno; +#else + if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno; +#endif + } else { + rsize = size; + size = rsize - sizeof(MDB_txninfo); + env->me_maxreaders = size/sizeof(MDB_reader) + 1; + } + { +#ifdef _WIN32 + HANDLE mh; + mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE, + 0, 0, NULL); + if (!mh) goto fail_errno; + env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL); + CloseHandle(mh); + if (!env->me_txns) goto fail_errno; +#else + void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED, + env->me_lfd, 0); + if (m == MAP_FAILED) goto fail_errno; + env->me_txns = m; +#endif + } + if (*excl > 0) { +#ifdef _WIN32 + BY_HANDLE_FILE_INFORMATION stbuf; + struct { + DWORD volume; + DWORD nhigh; + DWORD nlow; + } idbuf; + MDB_val val; + char encbuf[11]; + + if (!mdb_sec_inited) { + InitializeSecurityDescriptor(&mdb_null_sd, + SECURITY_DESCRIPTOR_REVISION); + SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE); + mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES); + mdb_all_sa.bInheritHandle = FALSE; + mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd; + mdb_sec_inited = 1; + } + if (!GetFileInformationByHandle(env->me_lfd, &stbuf)) goto fail_errno; + idbuf.volume = stbuf.dwVolumeSerialNumber; + idbuf.nhigh = stbuf.nFileIndexHigh; + idbuf.nlow = stbuf.nFileIndexLow; + val.mv_data = &idbuf; + val.mv_size = sizeof(idbuf); + mdb_hash_enc(&val, encbuf); + sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", encbuf); + sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", encbuf); + env->me_rmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_rmname); + if (!env->me_rmutex) goto fail_errno; + env->me_wmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_wmname); + if (!env->me_wmutex) goto fail_errno; +#elif defined(MDB_USE_POSIX_SEM) + struct stat stbuf; + struct { + dev_t dev; + ino_t ino; + } idbuf; + MDB_val val; + char encbuf[11]; + +#if defined(__NetBSD__) +#define MDB_SHORT_SEMNAMES 1 /* limited to 14 chars */ +#endif + if (fstat(env->me_lfd, &stbuf)) goto fail_errno; + idbuf.dev = stbuf.st_dev; + idbuf.ino = stbuf.st_ino; + val.mv_data = &idbuf; + val.mv_size = sizeof(idbuf); + mdb_hash_enc(&val, encbuf); +#ifdef MDB_SHORT_SEMNAMES + encbuf[9] = '\0'; /* drop name from 15 chars to 14 chars */ +#endif + sprintf(env->me_txns->mti_rmname, "/MDBr%s", encbuf); + sprintf(env->me_txns->mti_wmname, "/MDBw%s", encbuf); + /* Clean up after a previous run, if needed: Try to + * remove both semaphores before doing anything else. + */ + sem_unlink(env->me_txns->mti_rmname); + sem_unlink(env->me_txns->mti_wmname); + env->me_rmutex = sem_open(env->me_txns->mti_rmname, + O_CREAT|O_EXCL, mode, 1); + if (env->me_rmutex == SEM_FAILED) goto fail_errno; + env->me_wmutex = sem_open(env->me_txns->mti_wmname, + O_CREAT|O_EXCL, mode, 1); + if (env->me_wmutex == SEM_FAILED) goto fail_errno; +#else /* MDB_USE_POSIX_MUTEX: */ + pthread_mutexattr_t mattr; + + /* Solaris needs this before initing a robust mutex. Otherwise + * it may skip the init and return EBUSY "seems someone already + * inited" or EINVAL "it was inited differently". + */ + memset(env->me_txns->mti_rmutex, 0, sizeof(*env->me_txns->mti_rmutex)); + memset(env->me_txns->mti_wmutex, 0, sizeof(*env->me_txns->mti_wmutex)); + + if ((rc = pthread_mutexattr_init(&mattr))) + goto fail; + + rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED); +#ifdef MDB_ROBUST_SUPPORTED + if (!rc) rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST); +#endif + if (!rc) rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr); + if (!rc) rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr); + pthread_mutexattr_destroy(&mattr); + if (rc) + goto fail; +#endif /* _WIN32 || MDB_USE_POSIX_SEM */ + + env->me_txns->mti_magic = MDB_MAGIC; + env->me_txns->mti_format = MDB_LOCK_FORMAT; + env->me_txns->mti_txnid = 0; + env->me_txns->mti_numreaders = 0; + + } else { + if (env->me_txns->mti_magic != MDB_MAGIC) { + DPUTS("lock region has invalid magic"); + rc = MDB_INVALID; + goto fail; + } + if (env->me_txns->mti_format != MDB_LOCK_FORMAT) { + DPRINTF(("lock region has format+version 0x%x, expected 0x%x", + env->me_txns->mti_format, MDB_LOCK_FORMAT)); + rc = MDB_VERSION_MISMATCH; + goto fail; + } + rc = ErrCode(); + if (rc && rc != EACCES && rc != EAGAIN) { + goto fail; + } +#ifdef _WIN32 + env->me_rmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname); + if (!env->me_rmutex) goto fail_errno; + env->me_wmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname); + if (!env->me_wmutex) goto fail_errno; +#elif defined(MDB_USE_POSIX_SEM) + env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0); + if (env->me_rmutex == SEM_FAILED) goto fail_errno; + env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0); + if (env->me_wmutex == SEM_FAILED) goto fail_errno; +#endif + } + return MDB_SUCCESS; + +fail_errno: + rc = ErrCode(); +fail: + return rc; +} + + /** Only a subset of the @ref mdb_env flags can be changed + * at runtime. Changing other flags requires closing the + * environment and re-opening it with the new flags. + */ +#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT) +#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY| \ + MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD) + +#if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS) +# error "Persistent DB flags & env flags overlap, but both go in mm_flags" +#endif + +int ESECT +mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode) +{ + int rc, excl = -1; + MDB_name fname; + + if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) + return EINVAL; + + flags |= env->me_flags; + + rc = mdb_fname_init(path, flags, &fname); + if (rc) + return rc; + + if (flags & MDB_RDONLY) { + /* silently ignore WRITEMAP when we're only getting read access */ + flags &= ~MDB_WRITEMAP; + } else { + if (!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) && + (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) + rc = ENOMEM; + } + env->me_flags = flags |= MDB_ENV_ACTIVE; + if (rc) + goto leave; + + env->me_path = strdup(path); + env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); + env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); + env->me_dbiseqs = calloc(env->me_maxdbs, sizeof(unsigned int)); + if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) { + rc = ENOMEM; + goto leave; + } + env->me_dbxs[FREE_DBI].md_cmp = mdb_cmp_long; /* aligned MDB_INTEGERKEY */ + + /* For RDONLY, get lockfile after we know datafile exists */ + if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) { + rc = mdb_env_setup_locks(env, &fname, mode, &excl); + if (rc) + goto leave; + } + + rc = mdb_fopen(env, &fname, + (flags & MDB_RDONLY) ? MDB_O_RDONLY : MDB_O_RDWR, + mode, &env->me_fd); + if (rc) + goto leave; + + if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { + rc = mdb_env_setup_locks(env, &fname, mode, &excl); + if (rc) + goto leave; + } + + if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) { + if (!(flags & (MDB_RDONLY|MDB_WRITEMAP))) { + /* Synchronous fd for meta writes. Needed even with + * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. + */ + rc = mdb_fopen(env, &fname, MDB_O_META, mode, &env->me_mfd); + if (rc) + goto leave; + } + DPRINTF(("opened dbenv %p", (void *) env)); + if (excl > 0) { + rc = mdb_env_share_locks(env, &excl); + if (rc) + goto leave; + } + if (!(flags & MDB_RDONLY)) { + MDB_txn *txn; + int tsize = sizeof(MDB_txn), size = tsize + env->me_maxdbs * + (sizeof(MDB_db)+sizeof(MDB_cursor *)+sizeof(unsigned int)+1); + if ((env->me_pbuf = calloc(1, env->me_psize)) && + (txn = calloc(1, size))) + { + txn->mt_dbs = (MDB_db *)((char *)txn + tsize); + txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs); + txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); + txn->mt_env = env; + txn->mt_dbxs = env->me_dbxs; + txn->mt_flags = MDB_TXN_FINISHED; + env->me_txn0 = txn; + } else { + rc = ENOMEM; + } + } + } + +leave: + if (rc) { + mdb_env_close0(env, excl); + } + mdb_fname_destroy(fname); + return rc; +} + +/** Destroy resources from mdb_env_open(), clear our readers & DBIs */ +static void ESECT +mdb_env_close0(MDB_env *env, int excl) +{ + int i; + + if (!(env->me_flags & MDB_ENV_ACTIVE)) + return; + + /* Doing this here since me_dbxs may not exist during mdb_env_close */ + if (env->me_dbxs) { + for (i = env->me_maxdbs; --i >= CORE_DBS; ) + free(env->me_dbxs[i].md_name.mv_data); + free(env->me_dbxs); + } + + free(env->me_pbuf); + free(env->me_dbiseqs); + free(env->me_dbflags); + free(env->me_path); + free(env->me_dirty_list); + free(env->me_txn0); + mdb_midl_free(env->me_free_pgs); + + if (env->me_flags & MDB_ENV_TXKEY) { + pthread_key_delete(env->me_txkey); +#ifdef _WIN32 + /* Delete our key from the global list */ + for (i=0; i<mdb_tls_nkeys; i++) + if (mdb_tls_keys[i] == env->me_txkey) { + mdb_tls_keys[i] = mdb_tls_keys[mdb_tls_nkeys-1]; + mdb_tls_nkeys--; + break; + } +#endif + } + + if (env->me_map) { + munmap(env->me_map, env->me_mapsize); + } + if (env->me_mfd != INVALID_HANDLE_VALUE) + (void) close(env->me_mfd); + if (env->me_fd != INVALID_HANDLE_VALUE) + (void) close(env->me_fd); + if (env->me_txns) { + MDB_PID_T pid = env->me_pid; + /* Clearing readers is done in this function because + * me_txkey with its destructor must be disabled first. + * + * We skip the the reader mutex, so we touch only + * data owned by this process (me_close_readers and + * our readers), and clear each reader atomically. + */ + for (i = env->me_close_readers; --i >= 0; ) + if (env->me_txns->mti_readers[i].mr_pid == pid) + env->me_txns->mti_readers[i].mr_pid = 0; +#ifdef _WIN32 + if (env->me_rmutex) { + CloseHandle(env->me_rmutex); + if (env->me_wmutex) CloseHandle(env->me_wmutex); + } + /* Windows automatically destroys the mutexes when + * the last handle closes. + */ +#elif defined(MDB_USE_POSIX_SEM) + if (env->me_rmutex != SEM_FAILED) { + sem_close(env->me_rmutex); + if (env->me_wmutex != SEM_FAILED) + sem_close(env->me_wmutex); + /* If we have the filelock: If we are the + * only remaining user, clean up semaphores. + */ + if (excl == 0) + mdb_env_excl_lock(env, &excl); + if (excl > 0) { + sem_unlink(env->me_txns->mti_rmname); + sem_unlink(env->me_txns->mti_wmname); + } + } +#endif + munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); + } + if (env->me_lfd != INVALID_HANDLE_VALUE) { +#ifdef _WIN32 + if (excl >= 0) { + /* Unlock the lockfile. Windows would have unlocked it + * after closing anyway, but not necessarily at once. + */ + UnlockFile(env->me_lfd, 0, 0, 1, 0); + } +#endif + (void) close(env->me_lfd); + } + + env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY); +} + +void ESECT +mdb_env_close(MDB_env *env) +{ + MDB_page *dp; + + if (env == NULL) + return; + + VGMEMP_DESTROY(env); + while ((dp = env->me_dpages) != NULL) { + VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); + env->me_dpages = dp->mp_next; + free(dp); + } + + mdb_env_close0(env, 0); + free(env); +} + +/** Compare two items pointing at aligned size_t's */ +static int +mdb_cmp_long(const MDB_val *a, const MDB_val *b) +{ + return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 : + *(size_t *)a->mv_data > *(size_t *)b->mv_data; +} + +/** Compare two items pointing at aligned unsigned int's. + * + * This is also set as #MDB_INTEGERDUP|#MDB_DUPFIXED's #MDB_dbx.%md_dcmp, + * but #mdb_cmp_clong() is called instead if the data type is size_t. + */ +static int +mdb_cmp_int(const MDB_val *a, const MDB_val *b) +{ + return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 : + *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data; +} + +/** Compare two items pointing at unsigned ints of unknown alignment. + * Nodes and keys are guaranteed to be 2-byte aligned. + */ +static int +mdb_cmp_cint(const MDB_val *a, const MDB_val *b) +{ +#if BYTE_ORDER == LITTLE_ENDIAN + unsigned short *u, *c; + int x; + + u = (unsigned short *) ((char *) a->mv_data + a->mv_size); + c = (unsigned short *) ((char *) b->mv_data + a->mv_size); + do { + x = *--u - *--c; + } while(!x && u > (unsigned short *)a->mv_data); + return x; +#else + unsigned short *u, *c, *end; + int x; + + end = (unsigned short *) ((char *) a->mv_data + a->mv_size); + u = (unsigned short *)a->mv_data; + c = (unsigned short *)b->mv_data; + do { + x = *u++ - *c++; + } while(!x && u < end); + return x; +#endif +} + +/** Compare two items lexically */ +static int +mdb_cmp_memn(const MDB_val *a, const MDB_val *b) +{ + int diff; + ssize_t len_diff; + unsigned int len; + + len = a->mv_size; + len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; + if (len_diff > 0) { + len = b->mv_size; + len_diff = 1; + } + + diff = memcmp(a->mv_data, b->mv_data, len); + return diff ? diff : len_diff<0 ? -1 : len_diff; +} + +/** Compare two items in reverse byte order */ +static int +mdb_cmp_memnr(const MDB_val *a, const MDB_val *b) +{ + const unsigned char *p1, *p2, *p1_lim; + ssize_t len_diff; + int diff; + + p1_lim = (const unsigned char *)a->mv_data; + p1 = (const unsigned char *)a->mv_data + a->mv_size; + p2 = (const unsigned char *)b->mv_data + b->mv_size; + + len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; + if (len_diff > 0) { + p1_lim += len_diff; + len_diff = 1; + } + + while (p1 > p1_lim) { + diff = *--p1 - *--p2; + if (diff) + return diff; + } + return len_diff<0 ? -1 : len_diff; +} + +/** Search for key within a page, using binary search. + * Returns the smallest entry larger or equal to the key. + * If exactp is non-null, stores whether the found entry was an exact match + * in *exactp (1 or 0). + * Updates the cursor index with the index of the found entry. + * If no entry larger or equal to the key is found, returns NULL. + */ +static MDB_node * +mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) +{ + unsigned int i = 0, nkeys; + int low, high; + int rc = 0; + MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDB_node *node = NULL; + MDB_val nodekey; + MDB_cmp_func *cmp; + DKBUF; + + nkeys = NUMKEYS(mp); + + DPRINTF(("searching %u keys in %s %spage %"Z"u", + nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", + mdb_dbg_pgno(mp))); + + low = IS_LEAF(mp) ? 0 : 1; + high = nkeys - 1; + cmp = mc->mc_dbx->md_cmp; + + /* Branch pages have no data, so if using integer keys, + * alignment is guaranteed. Use faster mdb_cmp_int. + */ + if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) { + if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t)) + cmp = mdb_cmp_long; + else + cmp = mdb_cmp_int; + } + + if (IS_LEAF2(mp)) { + nodekey.mv_size = mc->mc_db->md_pad; + node = NODEPTR(mp, 0); /* fake */ + while (low <= high) { + i = (low + high) >> 1; + nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size); + rc = cmp(key, &nodekey); + DPRINTF(("found leaf index %u [%s], rc = %i", + i, DKEY(&nodekey), rc)); + if (rc == 0) + break; + if (rc > 0) + low = i + 1; + else + high = i - 1; + } + } else { + while (low <= high) { + i = (low + high) >> 1; + + node = NODEPTR(mp, i); + nodekey.mv_size = NODEKSZ(node); + nodekey.mv_data = NODEKEY(node); + + rc = cmp(key, &nodekey); +#if MDB_DEBUG + if (IS_LEAF(mp)) + DPRINTF(("found leaf index %u [%s], rc = %i", + i, DKEY(&nodekey), rc)); + else + DPRINTF(("found branch index %u [%s -> %"Z"u], rc = %i", + i, DKEY(&nodekey), NODEPGNO(node), rc)); +#endif + if (rc == 0) + break; + if (rc > 0) + low = i + 1; + else + high = i - 1; + } + } + + if (rc > 0) { /* Found entry is less than the key. */ + i++; /* Skip to get the smallest entry larger than key. */ + if (!IS_LEAF2(mp)) + node = NODEPTR(mp, i); + } + if (exactp) + *exactp = (rc == 0 && nkeys > 0); + /* store the key index */ + mc->mc_ki[mc->mc_top] = i; + if (i >= nkeys) + /* There is no entry larger or equal to the key. */ + return NULL; + + /* nodeptr is fake for LEAF2 */ + return node; +} + +#if 0 +static void +mdb_cursor_adjust(MDB_cursor *mc, func) +{ + MDB_cursor *m2; + + for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { + if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) { + func(mc, m2); + } + } +} +#endif + +/** Pop a page off the top of the cursor's stack. */ +static void +mdb_cursor_pop(MDB_cursor *mc) +{ + if (mc->mc_snum) { + DPRINTF(("popping page %"Z"u off db %d cursor %p", + mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *) mc)); + + mc->mc_snum--; + if (mc->mc_snum) { + mc->mc_top--; + } else { + mc->mc_flags &= ~C_INITIALIZED; + } + } +} + +/** Push a page onto the top of the cursor's stack. + * Set #MDB_TXN_ERROR on failure. + */ +static int +mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) +{ + DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno, + DDBI(mc), (void *) mc)); + + if (mc->mc_snum >= CURSOR_STACK) { + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CURSOR_FULL; + } + + mc->mc_top = mc->mc_snum++; + mc->mc_pg[mc->mc_top] = mp; + mc->mc_ki[mc->mc_top] = 0; + + return MDB_SUCCESS; +} + +/** Find the address of the page corresponding to a given page number. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc the cursor accessing the page. + * @param[in] pgno the page number for the page to retrieve. + * @param[out] ret address of a pointer where the page's address will be stored. + * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) +{ + MDB_txn *txn = mc->mc_txn; + MDB_env *env = txn->mt_env; + MDB_page *p = NULL; + int level; + + if (! (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_WRITEMAP))) { + MDB_txn *tx2 = txn; + level = 1; + do { + MDB_ID2L dl = tx2->mt_u.dirty_list; + unsigned x; + /* Spilled pages were dirtied in this txn and flushed + * because the dirty list got full. Bring this page + * back in from the map (but don't unspill it here, + * leave that unless page_touch happens again). + */ + if (tx2->mt_spill_pgs) { + MDB_ID pn = pgno << 1; + x = mdb_midl_search(tx2->mt_spill_pgs, pn); + if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { + p = (MDB_page *)(env->me_map + env->me_psize * pgno); + goto done; + } + } + if (dl[0].mid) { + unsigned x = mdb_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + p = dl[x].mptr; + goto done; + } + } + level++; + } while ((tx2 = tx2->mt_parent) != NULL); + } + + if (pgno < txn->mt_next_pgno) { + level = 0; + p = (MDB_page *)(env->me_map + env->me_psize * pgno); + } else { + DPRINTF(("page %"Z"u not found", pgno)); + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_PAGE_NOTFOUND; + } + +done: + *ret = p; + if (lvl) + *lvl = level; + return MDB_SUCCESS; +} + +/** Finish #mdb_page_search() / #mdb_page_search_lowest(). + * The cursor is at the root page, set up the rest of it. + */ +static int +mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) +{ + MDB_page *mp = mc->mc_pg[mc->mc_top]; + int rc; + DKBUF; + + while (IS_BRANCH(mp)) { + MDB_node *node; + indx_t i; + + DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp))); + /* Don't assert on branch pages in the FreeDB. We can get here + * while in the process of rebalancing a FreeDB branch page; we must + * let that proceed. ITS#8336 + */ + mdb_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1); + DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0)))); + + if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) { + i = 0; + if (flags & MDB_PS_LAST) { + i = NUMKEYS(mp) - 1; + /* if already init'd, see if we're already in right place */ + if (mc->mc_flags & C_INITIALIZED) { + if (mc->mc_ki[mc->mc_top] == i) { + mc->mc_top = mc->mc_snum++; + mp = mc->mc_pg[mc->mc_top]; + goto ready; + } + } + } + } else { + int exact; + node = mdb_node_search(mc, key, &exact); + if (node == NULL) + i = NUMKEYS(mp) - 1; + else { + i = mc->mc_ki[mc->mc_top]; + if (!exact) { + mdb_cassert(mc, i > 0); + i--; + } + } + DPRINTF(("following index %u for key [%s]", i, DKEY(key))); + } + + mdb_cassert(mc, i < NUMKEYS(mp)); + node = NODEPTR(mp, i); + + if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0) + return rc; + + mc->mc_ki[mc->mc_top] = i; + if ((rc = mdb_cursor_push(mc, mp))) + return rc; + +ready: + if (flags & MDB_PS_MODIFY) { + if ((rc = mdb_page_touch(mc)) != 0) + return rc; + mp = mc->mc_pg[mc->mc_top]; + } + } + + if (!IS_LEAF(mp)) { + DPRINTF(("internal error, index points to a %02X page!?", + mp->mp_flags)); + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CORRUPTED; + } + + DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno, + key ? DKEY(key) : "null")); + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + return MDB_SUCCESS; +} + +/** Search for the lowest key under the current branch page. + * This just bypasses a NUMKEYS check in the current page + * before calling mdb_page_search_root(), because the callers + * are all in situations where the current page is known to + * be underfilled. + */ +static int +mdb_page_search_lowest(MDB_cursor *mc) +{ + MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDB_node *node = NODEPTR(mp, 0); + int rc; + + if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0) + return rc; + + mc->mc_ki[mc->mc_top] = 0; + if ((rc = mdb_cursor_push(mc, mp))) + return rc; + return mdb_page_search_root(mc, NULL, MDB_PS_FIRST); +} + +/** Search for the page a given key should be in. + * Push it and its parent pages on the cursor stack. + * @param[in,out] mc the cursor for this operation. + * @param[in] key the key to search for, or NULL for first/last page. + * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB + * are touched (updated with new page numbers). + * If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf. + * This is used by #mdb_cursor_first() and #mdb_cursor_last(). + * If MDB_PS_ROOTONLY set, just fetch root node, no further lookups. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) +{ + int rc; + pgno_t root; + + /* Make sure the txn is still viable, then find the root from + * the txn's db table and set it as the root of the cursor's stack. + */ + if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) { + DPUTS("transaction may not be used now"); + return MDB_BAD_TXN; + } else { + /* Make sure we're using an up-to-date root */ + if (*mc->mc_dbflag & DB_STALE) { + MDB_cursor mc2; + if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) + return MDB_BAD_DBI; + mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); + rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0); + if (rc) + return rc; + { + MDB_val data; + int exact = 0; + uint16_t flags; + MDB_node *leaf = mdb_node_search(&mc2, + &mc->mc_dbx->md_name, &exact); + if (!exact) + return MDB_NOTFOUND; + if ((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA) + return MDB_INCOMPATIBLE; /* not a named DB */ + rc = mdb_node_read(&mc2, leaf, &data); + if (rc) + return rc; + memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)), + sizeof(uint16_t)); + /* The txn may not know this DBI, or another process may + * have dropped and recreated the DB with other flags. + */ + if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags) + return MDB_INCOMPATIBLE; + memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); + } + *mc->mc_dbflag &= ~DB_STALE; + } + root = mc->mc_db->md_root; + + if (root == P_INVALID) { /* Tree is empty. */ + DPUTS("tree is empty"); + return MDB_NOTFOUND; + } + } + + mdb_cassert(mc, root > 1); + if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) + if ((rc = mdb_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0) + return rc; + + mc->mc_snum = 1; + mc->mc_top = 0; + + DPRINTF(("db %d root page %"Z"u has flags 0x%X", + DDBI(mc), root, mc->mc_pg[0]->mp_flags)); + + if (flags & MDB_PS_MODIFY) { + if ((rc = mdb_page_touch(mc))) + return rc; + } + + if (flags & MDB_PS_ROOTONLY) + return MDB_SUCCESS; + + return mdb_page_search_root(mc, key, flags); +} + +static int +mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) +{ + MDB_txn *txn = mc->mc_txn; + pgno_t pg = mp->mp_pgno; + unsigned x = 0, ovpages = mp->mp_pages; + MDB_env *env = txn->mt_env; + MDB_IDL sl = txn->mt_spill_pgs; + MDB_ID pn = pg << 1; + int rc; + + DPRINTF(("free ov page %"Z"u (%d)", pg, ovpages)); + /* If the page is dirty or on the spill list we just acquired it, + * so we should give it back to our current free list, if any. + * Otherwise put it onto the list of pages we freed in this txn. + * + * Won't create me_pghead: me_pglast must be inited along with it. + * Unsupported in nested txns: They would need to hide the page + * range in ancestor txns' dirty and spilled lists. + */ + if (env->me_pghead && + !txn->mt_parent && + ((mp->mp_flags & P_DIRTY) || + (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn))) + { + unsigned i, j; + pgno_t *mop; + MDB_ID2 *dl, ix, iy; + rc = mdb_midl_need(&env->me_pghead, ovpages); + if (rc) + return rc; + if (!(mp->mp_flags & P_DIRTY)) { + /* This page is no longer spilled */ + if (x == sl[0]) + sl[0]--; + else + sl[x] |= 1; + goto release; + } + /* Remove from dirty list */ + dl = txn->mt_u.dirty_list; + x = dl[0].mid--; + for (ix = dl[x]; ix.mptr != mp; ix = iy) { + if (x > 1) { + x--; + iy = dl[x]; + dl[x] = ix; + } else { + mdb_cassert(mc, x > 1); + j = ++(dl[0].mid); + dl[j] = ix; /* Unsorted. OK when MDB_TXN_ERROR. */ + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CORRUPTED; + } + } + txn->mt_dirty_room++; + if (!(env->me_flags & MDB_WRITEMAP)) + mdb_dpage_free(env, mp); +release: + /* Insert in me_pghead */ + mop = env->me_pghead; + j = mop[0] + ovpages; + for (i = mop[0]; i && mop[i] < pg; i--) + mop[j--] = mop[i]; + while (j>i) + mop[j--] = pg++; + mop[0] += ovpages; + } else { + rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages); + if (rc) + return rc; + } + mc->mc_db->md_overflow_pages -= ovpages; + return 0; +} + +/** Return the data associated with a given node. + * @param[in] mc The cursor for this operation. + * @param[in] leaf The node being read. + * @param[out] data Updated to point to the node's data. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data) +{ + MDB_page *omp; /* overflow page */ + pgno_t pgno; + int rc; + + if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { + data->mv_size = NODEDSZ(leaf); + data->mv_data = NODEDATA(leaf); + return MDB_SUCCESS; + } + + /* Read overflow data. + */ + data->mv_size = NODEDSZ(leaf); + memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); + if ((rc = mdb_page_get(mc, pgno, &omp, NULL)) != 0) { + DPRINTF(("read overflow page %"Z"u failed", pgno)); + return rc; + } + data->mv_data = METADATA(omp); + + return MDB_SUCCESS; +} + +int +mdb_get(MDB_txn *txn, MDB_dbi dbi, + MDB_val *key, MDB_val *data) +{ + MDB_cursor mc; + MDB_xcursor mx; + int exact = 0; + DKBUF; + + DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key))); + + if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + if (txn->mt_flags & MDB_TXN_BLOCKED) + return MDB_BAD_TXN; + + mdb_cursor_init(&mc, txn, dbi, &mx); + return mdb_cursor_set(&mc, key, data, MDB_SET, &exact); +} + +/** Find a sibling for a page. + * Replaces the page at the top of the cursor's stack with the + * specified sibling, if one exists. + * @param[in] mc The cursor for this operation. + * @param[in] move_right Non-zero if the right sibling is requested, + * otherwise the left sibling. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_cursor_sibling(MDB_cursor *mc, int move_right) +{ + int rc; + MDB_node *indx; + MDB_page *mp; + + if (mc->mc_snum < 2) { + return MDB_NOTFOUND; /* root has no siblings */ + } + + mdb_cursor_pop(mc); + DPRINTF(("parent page is page %"Z"u, index %u", + mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top])); + + if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) + : (mc->mc_ki[mc->mc_top] == 0)) { + DPRINTF(("no more keys left, moving to %s sibling", + move_right ? "right" : "left")); + if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) { + /* undo cursor_pop before returning */ + mc->mc_top++; + mc->mc_snum++; + return rc; + } + } else { + if (move_right) + mc->mc_ki[mc->mc_top]++; + else + mc->mc_ki[mc->mc_top]--; + DPRINTF(("just moving to %s index key %u", + move_right ? "right" : "left", mc->mc_ki[mc->mc_top])); + } + mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); + + indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if ((rc = mdb_page_get(mc, NODEPGNO(indx), &mp, NULL)) != 0) { + /* mc will be inconsistent if caller does mc_snum++ as above */ + mc->mc_flags &= ~(C_INITIALIZED|C_EOF); + return rc; + } + + mdb_cursor_push(mc, mp); + if (!move_right) + mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1; + + return MDB_SUCCESS; +} + +/** Move the cursor to the next data item. */ +static int +mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) +{ + MDB_page *mp; + MDB_node *leaf; + int rc; + + if ((mc->mc_flags & C_DEL && op == MDB_NEXT_DUP)) + return MDB_NOTFOUND; + + if (!(mc->mc_flags & C_INITIALIZED)) + return mdb_cursor_first(mc, key, data); + + mp = mc->mc_pg[mc->mc_top]; + + if (mc->mc_flags & C_EOF) { + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)-1) + return MDB_NOTFOUND; + mc->mc_flags ^= C_EOF; + } + + if (mc->mc_db->md_flags & MDB_DUPSORT) { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_NEXT || op == MDB_NEXT_DUP) { + rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT); + if (op != MDB_NEXT || rc != MDB_NOTFOUND) { + if (rc == MDB_SUCCESS) + MDB_GET_KEY(leaf, key); + return rc; + } + } + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + if (op == MDB_NEXT_DUP) + return MDB_NOTFOUND; + } + } + + DPRINTF(("cursor_next: top page is %"Z"u in cursor %p", + mdb_dbg_pgno(mp), (void *) mc)); + if (mc->mc_flags & C_DEL) { + mc->mc_flags ^= C_DEL; + goto skip; + } + + if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { + DPUTS("=====> move to next sibling page"); + if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { + mc->mc_flags |= C_EOF; + return rc; + } + mp = mc->mc_pg[mc->mc_top]; + DPRINTF(("next page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); + } else + mc->mc_ki[mc->mc_top]++; + +skip: + DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", + mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); + + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } + + mdb_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + } + if (data) { + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) + return rc; + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc != MDB_SUCCESS) + return rc; + } + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +/** Move the cursor to the previous data item. */ +static int +mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) +{ + MDB_page *mp; + MDB_node *leaf; + int rc; + + if (!(mc->mc_flags & C_INITIALIZED)) { + rc = mdb_cursor_last(mc, key, data); + if (rc) + return rc; + mc->mc_ki[mc->mc_top]++; + } + + mp = mc->mc_pg[mc->mc_top]; + + if (mc->mc_db->md_flags & MDB_DUPSORT) { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_PREV || op == MDB_PREV_DUP) { + rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV); + if (op != MDB_PREV || rc != MDB_NOTFOUND) { + if (rc == MDB_SUCCESS) { + MDB_GET_KEY(leaf, key); + mc->mc_flags &= ~C_EOF; + } + return rc; + } + } + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + if (op == MDB_PREV_DUP) + return MDB_NOTFOUND; + } + } + + DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p", + mdb_dbg_pgno(mp), (void *) mc)); + + mc->mc_flags &= ~(C_EOF|C_DEL); + + if (mc->mc_ki[mc->mc_top] == 0) { + DPUTS("=====> move to prev sibling page"); + if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) { + return rc; + } + mp = mc->mc_pg[mc->mc_top]; + mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; + DPRINTF(("prev page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); + } else + mc->mc_ki[mc->mc_top]--; + + DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", + mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); + + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } + + mdb_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + } + if (data) { + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) + return rc; + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc != MDB_SUCCESS) + return rc; + } + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +/** Set the cursor on a specific data item. */ +static int +mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op, int *exactp) +{ + int rc; + MDB_page *mp; + MDB_node *leaf = NULL; + DKBUF; + + if (key->mv_size == 0) + return MDB_BAD_VALSIZE; + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + + /* See if we're already on the right page */ + if (mc->mc_flags & C_INITIALIZED) { + MDB_val nodekey; + + mp = mc->mc_pg[mc->mc_top]; + if (!NUMKEYS(mp)) { + mc->mc_ki[mc->mc_top] = 0; + return MDB_NOTFOUND; + } + if (mp->mp_flags & P_LEAF2) { + nodekey.mv_size = mc->mc_db->md_pad; + nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size); + } else { + leaf = NODEPTR(mp, 0); + MDB_GET_KEY2(leaf, nodekey); + } + rc = mc->mc_dbx->md_cmp(key, &nodekey); + if (rc == 0) { + /* Probably happens rarely, but first node on the page + * was the one we wanted. + */ + mc->mc_ki[mc->mc_top] = 0; + if (exactp) + *exactp = 1; + goto set1; + } + if (rc > 0) { + unsigned int i; + unsigned int nkeys = NUMKEYS(mp); + if (nkeys > 1) { + if (mp->mp_flags & P_LEAF2) { + nodekey.mv_data = LEAF2KEY(mp, + nkeys-1, nodekey.mv_size); + } else { + leaf = NODEPTR(mp, nkeys-1); + MDB_GET_KEY2(leaf, nodekey); + } + rc = mc->mc_dbx->md_cmp(key, &nodekey); + if (rc == 0) { + /* last node was the one we wanted */ + mc->mc_ki[mc->mc_top] = nkeys-1; + if (exactp) + *exactp = 1; + goto set1; + } + if (rc < 0) { + if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { + /* This is definitely the right page, skip search_page */ + if (mp->mp_flags & P_LEAF2) { + nodekey.mv_data = LEAF2KEY(mp, + mc->mc_ki[mc->mc_top], nodekey.mv_size); + } else { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + MDB_GET_KEY2(leaf, nodekey); + } + rc = mc->mc_dbx->md_cmp(key, &nodekey); + if (rc == 0) { + /* current node was the one we wanted */ + if (exactp) + *exactp = 1; + goto set1; + } + } + rc = 0; + mc->mc_flags &= ~C_EOF; + goto set2; + } + } + /* If any parents have right-sibs, search. + * Otherwise, there's nothing further. + */ + for (i=0; i<mc->mc_top; i++) + if (mc->mc_ki[i] < + NUMKEYS(mc->mc_pg[i])-1) + break; + if (i == mc->mc_top) { + /* There are no other pages */ + mc->mc_ki[mc->mc_top] = nkeys; + return MDB_NOTFOUND; + } + } + if (!mc->mc_top) { + /* There are no other pages */ + mc->mc_ki[mc->mc_top] = 0; + if (op == MDB_SET_RANGE && !exactp) { + rc = 0; + goto set1; + } else + return MDB_NOTFOUND; + } + } else { + mc->mc_pg[0] = 0; + } + + rc = mdb_page_search(mc, key, 0); + if (rc != MDB_SUCCESS) + return rc; + + mp = mc->mc_pg[mc->mc_top]; + mdb_cassert(mc, IS_LEAF(mp)); + +set2: + leaf = mdb_node_search(mc, key, exactp); + if (exactp != NULL && !*exactp) { + /* MDB_SET specified and not an exact match. */ + return MDB_NOTFOUND; + } + + if (leaf == NULL) { + DPUTS("===> inexact leaf not found, goto sibling"); + if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { + mc->mc_flags |= C_EOF; + return rc; /* no entries matched */ + } + mp = mc->mc_pg[mc->mc_top]; + mdb_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, 0); + } + +set1: + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + if (IS_LEAF2(mp)) { + if (op == MDB_SET_RANGE || op == MDB_SET_KEY) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + } + return MDB_SUCCESS; + } + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + } + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { + rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + } else { + int ex2, *ex2p; + if (op == MDB_GET_BOTH) { + ex2p = &ex2; + ex2 = 0; + } else { + ex2p = NULL; + } + rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p); + if (rc != MDB_SUCCESS) + return rc; + } + } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { + MDB_val olddata; + MDB_cmp_func *dcmp; + if ((rc = mdb_node_read(mc, leaf, &olddata)) != MDB_SUCCESS) + return rc; + dcmp = mc->mc_dbx->md_dcmp; +#if UINT_MAX < SIZE_MAX + if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) + dcmp = mdb_cmp_clong; +#endif + rc = dcmp(data, &olddata); + if (rc) { + if (op == MDB_GET_BOTH || rc > 0) + return MDB_NOTFOUND; + rc = 0; + } + *data = olddata; + + } else { + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) + return rc; + } + } + + /* The key already matches in all other cases */ + if (op == MDB_SET_RANGE || op == MDB_SET_KEY) + MDB_GET_KEY(leaf, key); + DPRINTF(("==> cursor placed on key [%s]", DKEY(key))); + + return rc; +} + +/** Move the cursor to the first item in the database. */ +static int +mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) +{ + int rc; + MDB_node *leaf; + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); + if (rc != MDB_SUCCESS) + return rc; + } + mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + + leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0); + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + mc->mc_ki[mc->mc_top] = 0; + + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size); + return MDB_SUCCESS; + } + + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc) + return rc; + } else { + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) + return rc; + } + } + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +/** Move the cursor to the last item in the database. */ +static int +mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) +{ + int rc; + MDB_node *leaf; + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdb_page_search(mc, NULL, MDB_PS_LAST); + if (rc != MDB_SUCCESS) + return rc; + } + mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; + mc->mc_flags |= C_INITIALIZED|C_EOF; + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } + + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc) + return rc; + } else { + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) + return rc; + } + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +int +mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op) +{ + int rc; + int exact = 0; + int (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data); + + if (mc == NULL) + return EINVAL; + + if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) + return MDB_BAD_TXN; + + switch (op) { + case MDB_GET_CURRENT: + if (!(mc->mc_flags & C_INITIALIZED)) { + rc = EINVAL; + } else { + MDB_page *mp = mc->mc_pg[mc->mc_top]; + int nkeys = NUMKEYS(mp); + if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { + mc->mc_ki[mc->mc_top] = nkeys; + rc = MDB_NOTFOUND; + break; + } + rc = MDB_SUCCESS; + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + } else { + MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + MDB_GET_KEY(leaf, key); + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT); + } else { + rc = mdb_node_read(mc, leaf, data); + } + } + } + } + break; + case MDB_GET_BOTH: + case MDB_GET_BOTH_RANGE: + if (data == NULL) { + rc = EINVAL; + break; + } + if (mc->mc_xcursor == NULL) { + rc = MDB_INCOMPATIBLE; + break; + } + /* FALLTHRU */ + case MDB_SET: + case MDB_SET_KEY: + case MDB_SET_RANGE: + if (key == NULL) { + rc = EINVAL; + } else { + rc = mdb_cursor_set(mc, key, data, op, + op == MDB_SET_RANGE ? NULL : &exact); + } + break; + case MDB_GET_MULTIPLE: + if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { + rc = EINVAL; + break; + } + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + rc = MDB_INCOMPATIBLE; + break; + } + rc = MDB_SUCCESS; + if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || + (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) + break; + goto fetchm; + case MDB_NEXT_MULTIPLE: + if (data == NULL) { + rc = EINVAL; + break; + } + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + rc = MDB_INCOMPATIBLE; + break; + } + rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); + if (rc == MDB_SUCCESS) { + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + MDB_cursor *mx; +fetchm: + mx = &mc->mc_xcursor->mx_cursor; + data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) * + mx->mc_db->md_pad; + data->mv_data = METADATA(mx->mc_pg[mx->mc_top]); + mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1; + } else { + rc = MDB_NOTFOUND; + } + } + break; + case MDB_PREV_MULTIPLE: + if (data == NULL) { + rc = EINVAL; + break; + } + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + rc = MDB_INCOMPATIBLE; + break; + } + if (!(mc->mc_flags & C_INITIALIZED)) + rc = mdb_cursor_last(mc, key, data); + else + rc = MDB_SUCCESS; + if (rc == MDB_SUCCESS) { + MDB_cursor *mx = &mc->mc_xcursor->mx_cursor; + if (mx->mc_flags & C_INITIALIZED) { + rc = mdb_cursor_sibling(mx, 0); + if (rc == MDB_SUCCESS) + goto fetchm; + } else { + rc = MDB_NOTFOUND; + } + } + break; + case MDB_NEXT: + case MDB_NEXT_DUP: + case MDB_NEXT_NODUP: + rc = mdb_cursor_next(mc, key, data, op); + break; + case MDB_PREV: + case MDB_PREV_DUP: + case MDB_PREV_NODUP: + rc = mdb_cursor_prev(mc, key, data, op); + break; + case MDB_FIRST: + rc = mdb_cursor_first(mc, key, data); + break; + case MDB_FIRST_DUP: + mfunc = mdb_cursor_first; + mmove: + if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { + rc = EINVAL; + break; + } + if (mc->mc_xcursor == NULL) { + rc = MDB_INCOMPATIBLE; + break; + } + { + MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { + MDB_GET_KEY(leaf, key); + rc = mdb_node_read(mc, leaf, data); + break; + } + } + if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { + rc = EINVAL; + break; + } + rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); + break; + case MDB_LAST: + rc = mdb_cursor_last(mc, key, data); + break; + case MDB_LAST_DUP: + mfunc = mdb_cursor_last; + goto mmove; + default: + DPRINTF(("unhandled/unimplemented cursor operation %u", op)); + rc = EINVAL; + break; + } + + if (mc->mc_flags & C_DEL) + mc->mc_flags ^= C_DEL; + + return rc; +} + +/** Touch all the pages in the cursor stack. Set mc_top. + * Makes sure all the pages are writable, before attempting a write operation. + * @param[in] mc The cursor to operate on. + */ +static int +mdb_cursor_touch(MDB_cursor *mc) +{ + int rc = MDB_SUCCESS; + + if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY|DB_DUPDATA))) { + /* Touch DB record of named DB */ + MDB_cursor mc2; + MDB_xcursor mcx; + if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) + return MDB_BAD_DBI; + mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx); + rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); + if (rc) + return rc; + *mc->mc_dbflag |= DB_DIRTY; + } + mc->mc_top = 0; + if (mc->mc_snum) { + do { + rc = mdb_page_touch(mc); + } while (!rc && ++(mc->mc_top) < mc->mc_snum); + mc->mc_top = mc->mc_snum-1; + } + return rc; +} + +/** Do not spill pages to disk if txn is getting full, may fail instead */ +#define MDB_NOSPILL 0x8000 + +int +mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, + unsigned int flags) +{ + MDB_env *env; + MDB_node *leaf = NULL; + MDB_page *fp, *mp, *sub_root = NULL; + uint16_t fp_flags; + MDB_val xdata, *rdata, dkey, olddata; + MDB_db dummy; + int do_sub = 0, insert_key, insert_data; + unsigned int mcount = 0, dcount = 0, nospill; + size_t nsize; + int rc, rc2; + unsigned int nflags; + DKBUF; + + if (mc == NULL || key == NULL) + return EINVAL; + + env = mc->mc_txn->mt_env; + + /* Check this first so counter will always be zero on any + * early failures. + */ + if (flags & MDB_MULTIPLE) { + dcount = data[1].mv_size; + data[1].mv_size = 0; + if (!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED)) + return MDB_INCOMPATIBLE; + } + + nospill = flags & MDB_NOSPILL; + flags &= ~MDB_NOSPILL; + + if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) + return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + + if (key->mv_size-1 >= ENV_MAXKEY(env)) + return MDB_BAD_VALSIZE; + +#if SIZE_MAX > MAXDATASIZE + if (data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE)) + return MDB_BAD_VALSIZE; +#else + if ((mc->mc_db->md_flags & MDB_DUPSORT) && data->mv_size > ENV_MAXKEY(env)) + return MDB_BAD_VALSIZE; +#endif + + DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u", + DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size)); + + dkey.mv_size = 0; + + if (flags == MDB_CURRENT) { + if (!(mc->mc_flags & C_INITIALIZED)) + return EINVAL; + rc = MDB_SUCCESS; + } else if (mc->mc_db->md_root == P_INVALID) { + /* new database, cursor has nothing to point to */ + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_flags &= ~C_INITIALIZED; + rc = MDB_NO_ROOT; + } else { + int exact = 0; + MDB_val d2; + if (flags & MDB_APPEND) { + MDB_val k2; + rc = mdb_cursor_last(mc, &k2, &d2); + if (rc == 0) { + rc = mc->mc_dbx->md_cmp(key, &k2); + if (rc > 0) { + rc = MDB_NOTFOUND; + mc->mc_ki[mc->mc_top]++; + } else { + /* new key is <= last key */ + rc = MDB_KEYEXIST; + } + } + } else { + rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact); + } + if ((flags & MDB_NOOVERWRITE) && rc == 0) { + DPRINTF(("duplicate key [%s]", DKEY(key))); + *data = d2; + return MDB_KEYEXIST; + } + if (rc && rc != MDB_NOTFOUND) + return rc; + } + + if (mc->mc_flags & C_DEL) + mc->mc_flags ^= C_DEL; + + /* Cursor is positioned, check for room in the dirty list */ + if (!nospill) { + if (flags & MDB_MULTIPLE) { + rdata = &xdata; + xdata.mv_size = data->mv_size * dcount; + } else { + rdata = data; + } + if ((rc2 = mdb_page_spill(mc, key, rdata))) + return rc2; + } + + if (rc == MDB_NO_ROOT) { + MDB_page *np; + /* new database, write a root leaf page */ + DPUTS("allocating new root leaf page"); + if ((rc2 = mdb_page_new(mc, P_LEAF, 1, &np))) { + return rc2; + } + mdb_cursor_push(mc, np); + mc->mc_db->md_root = np->mp_pgno; + mc->mc_db->md_depth++; + *mc->mc_dbflag |= DB_DIRTY; + if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED)) + == MDB_DUPFIXED) + np->mp_flags |= P_LEAF2; + mc->mc_flags |= C_INITIALIZED; + } else { + /* make sure all cursor pages are writable */ + rc2 = mdb_cursor_touch(mc); + if (rc2) + return rc2; + } + + insert_key = insert_data = rc; + if (insert_key) { + /* The key does not exist */ + DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top])); + if ((mc->mc_db->md_flags & MDB_DUPSORT) && + LEAFSIZE(key, data) > env->me_nodemax) + { + /* Too big for a node, insert in sub-DB. Set up an empty + * "old sub-page" for prep_subDB to expand to a full page. + */ + fp_flags = P_LEAF|P_DIRTY; + fp = env->me_pbuf; + fp->mp_pad = data->mv_size; /* used if MDB_DUPFIXED */ + fp->mp_lower = fp->mp_upper = (PAGEHDRSZ-PAGEBASE); + olddata.mv_size = PAGEHDRSZ; + goto prep_subDB; + } + } else { + /* there's only a key anyway, so this is a no-op */ + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + char *ptr; + unsigned int ksize = mc->mc_db->md_pad; + if (key->mv_size != ksize) + return MDB_BAD_VALSIZE; + ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); + memcpy(ptr, key->mv_data, ksize); +fix_parent: + /* if overwriting slot 0 of leaf, need to + * update branch key if there is a parent page + */ + if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { + unsigned short dtop = 1; + mc->mc_top--; + /* slot 0 is always an empty key, find real slot */ + while (mc->mc_top && !mc->mc_ki[mc->mc_top]) { + mc->mc_top--; + dtop++; + } + if (mc->mc_ki[mc->mc_top]) + rc2 = mdb_update_key(mc, key); + else + rc2 = MDB_SUCCESS; + mc->mc_top += dtop; + if (rc2) + return rc2; + } + return MDB_SUCCESS; + } + +more: + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + olddata.mv_size = NODEDSZ(leaf); + olddata.mv_data = NODEDATA(leaf); + + /* DB has dups? */ + if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { + /* Prepare (sub-)page/sub-DB to accept the new item, + * if needed. fp: old sub-page or a header faking + * it. mp: new (sub-)page. offset: growth in page + * size. xdata: node data with new page or DB. + */ + unsigned i, offset = 0; + mp = fp = xdata.mv_data = env->me_pbuf; + mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; + + /* Was a single item before, must convert now */ + if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { + MDB_cmp_func *dcmp; + /* Just overwrite the current item */ + if (flags == MDB_CURRENT) + goto current; + dcmp = mc->mc_dbx->md_dcmp; +#if UINT_MAX < SIZE_MAX + if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) + dcmp = mdb_cmp_clong; +#endif + /* does data match? */ + if (!dcmp(data, &olddata)) { + if (flags & (MDB_NODUPDATA|MDB_APPENDDUP)) + return MDB_KEYEXIST; + /* overwrite it */ + goto current; + } + + /* Back up original data item */ + dkey.mv_size = olddata.mv_size; + dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size); + + /* Make sub-page header for the dup items, with dummy body */ + fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP; + fp->mp_lower = (PAGEHDRSZ-PAGEBASE); + xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; + if (mc->mc_db->md_flags & MDB_DUPFIXED) { + fp->mp_flags |= P_LEAF2; + fp->mp_pad = data->mv_size; + xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */ + } else { + xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) + + (dkey.mv_size & 1) + (data->mv_size & 1); + } + fp->mp_upper = xdata.mv_size - PAGEBASE; + olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */ + } else if (leaf->mn_flags & F_SUBDATA) { + /* Data is on sub-DB, just store it */ + flags |= F_DUPDATA|F_SUBDATA; + goto put_sub; + } else { + /* Data is on sub-page */ + fp = olddata.mv_data; + switch (flags) { + default: + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + offset = EVEN(NODESIZE + sizeof(indx_t) + + data->mv_size); + break; + } + offset = fp->mp_pad; + if (SIZELEFT(fp) < offset) { + offset *= 4; /* space for 4 more */ + break; + } + /* FALLTHRU: Big enough MDB_DUPFIXED sub-page */ + case MDB_CURRENT: + fp->mp_flags |= P_DIRTY; + COPY_PGNO(fp->mp_pgno, mp->mp_pgno); + mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; + flags |= F_DUPDATA; + goto put_sub; + } + xdata.mv_size = olddata.mv_size + offset; + } + + fp_flags = fp->mp_flags; + if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) { + /* Too big for a sub-page, convert to sub-DB */ + fp_flags &= ~P_SUBP; +prep_subDB: + if (mc->mc_db->md_flags & MDB_DUPFIXED) { + fp_flags |= P_LEAF2; + dummy.md_pad = fp->mp_pad; + dummy.md_flags = MDB_DUPFIXED; + if (mc->mc_db->md_flags & MDB_INTEGERDUP) + dummy.md_flags |= MDB_INTEGERKEY; + } else { + dummy.md_pad = 0; + dummy.md_flags = 0; + } + dummy.md_depth = 1; + dummy.md_branch_pages = 0; + dummy.md_leaf_pages = 1; + dummy.md_overflow_pages = 0; + dummy.md_entries = NUMKEYS(fp); + xdata.mv_size = sizeof(MDB_db); + xdata.mv_data = &dummy; + if ((rc = mdb_page_alloc(mc, 1, &mp))) + return rc; + offset = env->me_psize - olddata.mv_size; + flags |= F_DUPDATA|F_SUBDATA; + dummy.md_root = mp->mp_pgno; + sub_root = mp; + } + if (mp != fp) { + mp->mp_flags = fp_flags | P_DIRTY; + mp->mp_pad = fp->mp_pad; + mp->mp_lower = fp->mp_lower; + mp->mp_upper = fp->mp_upper + offset; + if (fp_flags & P_LEAF2) { + memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad); + } else { + memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE, + olddata.mv_size - fp->mp_upper - PAGEBASE); + for (i=0; i<NUMKEYS(fp); i++) + mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset; + } + } + + rdata = &xdata; + flags |= F_DUPDATA; + do_sub = 1; + if (!insert_key) + mdb_node_del(mc, 0); + goto new_sub; + } +current: + /* LMDB passes F_SUBDATA in 'flags' to write a DB record */ + if ((leaf->mn_flags ^ flags) & F_SUBDATA) + return MDB_INCOMPATIBLE; + /* overflow page overwrites need special handling */ + if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { + MDB_page *omp; + pgno_t pg; + int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); + + memcpy(&pg, olddata.mv_data, sizeof(pg)); + if ((rc2 = mdb_page_get(mc, pg, &omp, &level)) != 0) + return rc2; + ovpages = omp->mp_pages; + + /* Is the ov page large enough? */ + if (ovpages >= dpages) { + if (!(omp->mp_flags & P_DIRTY) && + (level || (env->me_flags & MDB_WRITEMAP))) + { + rc = mdb_page_unspill(mc->mc_txn, omp, &omp); + if (rc) + return rc; + level = 0; /* dirty in this txn or clean */ + } + /* Is it dirty? */ + if (omp->mp_flags & P_DIRTY) { + /* yes, overwrite it. Note in this case we don't + * bother to try shrinking the page if the new data + * is smaller than the overflow threshold. + */ + if (level > 1) { + /* It is writable only in a parent txn */ + size_t sz = (size_t) env->me_psize * ovpages, off; + MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages); + MDB_ID2 id2; + if (!np) + return ENOMEM; + id2.mid = pg; + id2.mptr = np; + /* Note - this page is already counted in parent's dirty_room */ + rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); + mdb_cassert(mc, rc2 == 0); + /* Currently we make the page look as with put() in the + * parent txn, in case the user peeks at MDB_RESERVEd + * or unused parts. Some users treat ovpages specially. + */ + if (!(flags & MDB_RESERVE)) { + /* Skip the part where LMDB will put *data. + * Copy end of page, adjusting alignment so + * compiler may copy words instead of bytes. + */ + off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t); + memcpy((size_t *)((char *)np + off), + (size_t *)((char *)omp + off), sz - off); + sz = PAGEHDRSZ; + } + memcpy(np, omp, sz); /* Copy beginning of page */ + omp = np; + } + SETDSZ(leaf, data->mv_size); + if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = METADATA(omp); + else + memcpy(METADATA(omp), data->mv_data, data->mv_size); + return MDB_SUCCESS; + } + } + if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS) + return rc2; + } else if (data->mv_size == olddata.mv_size) { + /* same size, just replace it. Note that we could + * also reuse this node if the new data is smaller, + * but instead we opt to shrink the node in that case. + */ + if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = olddata.mv_data; + else if (!(mc->mc_flags & C_SUB)) + memcpy(olddata.mv_data, data->mv_data, data->mv_size); + else { + memcpy(NODEKEY(leaf), key->mv_data, key->mv_size); + goto fix_parent; + } + return MDB_SUCCESS; + } + mdb_node_del(mc, 0); + } + + rdata = data; + +new_sub: + nflags = flags & NODE_ADD_FLAGS; + nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata); + if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) { + if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA ) + nflags &= ~MDB_APPEND; /* sub-page may need room to grow */ + if (!insert_key) + nflags |= MDB_SPLIT_REPLACE; + rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags); + } else { + /* There is room already in this leaf page. */ + rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags); + if (rc == 0) { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + unsigned i = mc->mc_top; + MDB_page *mp = mc->mc_pg[i]; + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) continue; + if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) { + m3->mc_ki[i]++; + } + if (XCURSOR_INITED(m3)) + XCURSOR_REFRESH(m3, mp, m3->mc_ki[i]); + } + } + } + + if (rc == MDB_SUCCESS) { + /* Now store the actual data in the child DB. Note that we're + * storing the user data in the keys field, so there are strict + * size limits on dupdata. The actual data fields of the child + * DB are all zero size. + */ + if (do_sub) { + int xflags, new_dupdata; + size_t ecount; +put_sub: + xdata.mv_size = 0; + xdata.mv_data = ""; + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (flags & MDB_CURRENT) { + xflags = MDB_CURRENT|MDB_NOSPILL; + } else { + mdb_xcursor_init1(mc, leaf); + xflags = (flags & MDB_NODUPDATA) ? + MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL; + } + if (sub_root) + mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root; + new_dupdata = (int)dkey.mv_size; + /* converted, write the original data first */ + if (dkey.mv_size) { + rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); + if (rc) + goto bad_sub; + /* we've done our job */ + dkey.mv_size = 0; + } + if (!(leaf->mn_flags & F_SUBDATA) || sub_root) { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2; + MDB_xcursor *mx = mc->mc_xcursor; + unsigned i = mc->mc_top; + MDB_page *mp = mc->mc_pg[i]; + int nkeys = NUMKEYS(mp); + + for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { + if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; + if (!(m2->mc_flags & C_INITIALIZED)) continue; + if (m2->mc_pg[i] == mp) { + if (m2->mc_ki[i] == mc->mc_ki[i]) { + mdb_xcursor_init2(m2, mx, new_dupdata); + } else if (!insert_key && m2->mc_ki[i] < nkeys) { + XCURSOR_REFRESH(m2, mp, m2->mc_ki[i]); + } + } + } + } + ecount = mc->mc_xcursor->mx_db.md_entries; + if (flags & MDB_APPENDDUP) + xflags |= MDB_APPEND; + rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); + if (flags & F_SUBDATA) { + void *db = NODEDATA(leaf); + memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); + } + insert_data = mc->mc_xcursor->mx_db.md_entries - ecount; + } + /* Increment count unless we just replaced an existing item. */ + if (insert_data) + mc->mc_db->md_entries++; + if (insert_key) { + /* Invalidate txn if we created an empty sub-DB */ + if (rc) + goto bad_sub; + /* If we succeeded and the key didn't exist before, + * make sure the cursor is marked valid. + */ + mc->mc_flags |= C_INITIALIZED; + } + if (flags & MDB_MULTIPLE) { + if (!rc) { + mcount++; + /* let caller know how many succeeded, if any */ + data[1].mv_size = mcount; + if (mcount < dcount) { + data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size; + insert_key = insert_data = 0; + goto more; + } + } + } + return rc; +bad_sub: + if (rc == MDB_KEYEXIST) /* should not happen, we deleted that item */ + rc = MDB_CORRUPTED; + } + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +int +mdb_cursor_del(MDB_cursor *mc, unsigned int flags) +{ + MDB_node *leaf; + MDB_page *mp; + int rc; + + if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) + return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + + if (!(mc->mc_flags & C_INITIALIZED)) + return EINVAL; + + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) + return MDB_NOTFOUND; + + if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL))) + return rc; + + rc = mdb_cursor_touch(mc); + if (rc) + return rc; + + mp = mc->mc_pg[mc->mc_top]; + if (IS_LEAF2(mp)) + goto del_key; + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (flags & MDB_NODUPDATA) { + /* mdb_cursor_del0() will subtract the final entry */ + mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; + mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; + } else { + if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { + mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + } + rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL); + if (rc) + return rc; + /* If sub-DB still has entries, we're done */ + if (mc->mc_xcursor->mx_db.md_entries) { + if (leaf->mn_flags & F_SUBDATA) { + /* update subDB info */ + void *db = NODEDATA(leaf); + memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); + } else { + MDB_cursor *m2; + /* shrink fake page */ + mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]); + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + /* fix other sub-DB cursors pointed at fake pages on this page */ + for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { + if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; + if (!(m2->mc_flags & C_INITIALIZED)) continue; + if (m2->mc_pg[mc->mc_top] == mp) { + MDB_node *n2 = leaf; + if (m2->mc_ki[mc->mc_top] != mc->mc_ki[mc->mc_top]) { + n2 = NODEPTR(mp, m2->mc_ki[mc->mc_top]); + if (n2->mn_flags & F_SUBDATA) continue; + } + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); + } + } + } + mc->mc_db->md_entries--; + return rc; + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; + } + /* otherwise fall thru and delete the sub-DB */ + } + + if (leaf->mn_flags & F_SUBDATA) { + /* add all the child DB's pages to the free list */ + rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); + if (rc) + goto fail; + } + } + /* LMDB passes F_SUBDATA in 'flags' to delete a DB record */ + else if ((leaf->mn_flags ^ flags) & F_SUBDATA) { + rc = MDB_INCOMPATIBLE; + goto fail; + } + + /* add overflow pages to free list */ + if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { + MDB_page *omp; + pgno_t pg; + + memcpy(&pg, NODEDATA(leaf), sizeof(pg)); + if ((rc = mdb_page_get(mc, pg, &omp, NULL)) || + (rc = mdb_ovpage_free(mc, omp))) + goto fail; + } + +del_key: + return mdb_cursor_del0(mc); + +fail: + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +/** Allocate and initialize new pages for a database. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc a cursor on the database being added to. + * @param[in] flags flags defining what type of page is being allocated. + * @param[in] num the number of pages to allocate. This is usually 1, + * unless allocating overflow pages for a large record. + * @param[out] mp Address of a page, or NULL on failure. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp) +{ + MDB_page *np; + int rc; + + if ((rc = mdb_page_alloc(mc, num, &np))) + return rc; + DPRINTF(("allocated new mpage %"Z"u, page size %u", + np->mp_pgno, mc->mc_txn->mt_env->me_psize)); + np->mp_flags = flags | P_DIRTY; + np->mp_lower = (PAGEHDRSZ-PAGEBASE); + np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE; + + if (IS_BRANCH(np)) + mc->mc_db->md_branch_pages++; + else if (IS_LEAF(np)) + mc->mc_db->md_leaf_pages++; + else if (IS_OVERFLOW(np)) { + mc->mc_db->md_overflow_pages += num; + np->mp_pages = num; + } + *mp = np; + + return 0; +} + +/** Calculate the size of a leaf node. + * The size depends on the environment's page size; if a data item + * is too large it will be put onto an overflow page and the node + * size will only include the key and not the data. Sizes are always + * rounded up to an even number of bytes, to guarantee 2-byte alignment + * of the #MDB_node headers. + * @param[in] env The environment handle. + * @param[in] key The key for the node. + * @param[in] data The data for the node. + * @return The number of bytes needed to store the node. + */ +static size_t +mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data) +{ + size_t sz; + + sz = LEAFSIZE(key, data); + if (sz > env->me_nodemax) { + /* put on overflow page */ + sz -= data->mv_size - sizeof(pgno_t); + } + + return EVEN(sz + sizeof(indx_t)); +} + +/** Calculate the size of a branch node. + * The size should depend on the environment's page size but since + * we currently don't support spilling large keys onto overflow + * pages, it's simply the size of the #MDB_node header plus the + * size of the key. Sizes are always rounded up to an even number + * of bytes, to guarantee 2-byte alignment of the #MDB_node headers. + * @param[in] env The environment handle. + * @param[in] key The key for the node. + * @return The number of bytes needed to store the node. + */ +static size_t +mdb_branch_size(MDB_env *env, MDB_val *key) +{ + size_t sz; + + sz = INDXSIZE(key); + if (sz > env->me_nodemax) { + /* put on overflow page */ + /* not implemented */ + /* sz -= key->size - sizeof(pgno_t); */ + } + + return sz + sizeof(indx_t); +} + +/** Add a node to the page pointed to by the cursor. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc The cursor for this operation. + * @param[in] indx The index on the page where the new node should be added. + * @param[in] key The key for the new node. + * @param[in] data The data for the new node, if any. + * @param[in] pgno The page number, if adding a branch node. + * @param[in] flags Flags for the node. + * @return 0 on success, non-zero on failure. Possible errors are: + * <ul> + * <li>ENOMEM - failed to allocate overflow pages for the node. + * <li>MDB_PAGE_FULL - there is insufficient room in the page. This error + * should never happen since all callers already calculate the + * page's free space before calling this function. + * </ul> + */ +static int +mdb_node_add(MDB_cursor *mc, indx_t indx, + MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags) +{ + unsigned int i; + size_t node_size = NODESIZE; + ssize_t room; + indx_t ofs; + MDB_node *node; + MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDB_page *ofp = NULL; /* overflow page */ + void *ndata; + DKBUF; + + mdb_cassert(mc, mp->mp_upper >= mp->mp_lower); + + DPRINTF(("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]", + IS_LEAF(mp) ? "leaf" : "branch", + IS_SUBP(mp) ? "sub-" : "", + mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0, + key ? key->mv_size : 0, key ? DKEY(key) : "null")); + + if (IS_LEAF2(mp)) { + /* Move higher keys up one slot. */ + int ksize = mc->mc_db->md_pad, dif; + char *ptr = LEAF2KEY(mp, indx, ksize); + dif = NUMKEYS(mp) - indx; + if (dif > 0) + memmove(ptr+ksize, ptr, dif*ksize); + /* insert new key */ + memcpy(ptr, key->mv_data, ksize); + + /* Just using these for counting */ + mp->mp_lower += sizeof(indx_t); + mp->mp_upper -= ksize - sizeof(indx_t); + return MDB_SUCCESS; + } + + room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t); + if (key != NULL) + node_size += key->mv_size; + if (IS_LEAF(mp)) { + mdb_cassert(mc, key && data); + if (F_ISSET(flags, F_BIGDATA)) { + /* Data already on overflow page. */ + node_size += sizeof(pgno_t); + } else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) { + int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); + int rc; + /* Put data on overflow page. */ + DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page", + data->mv_size, node_size+data->mv_size)); + node_size = EVEN(node_size + sizeof(pgno_t)); + if ((ssize_t)node_size > room) + goto full; + if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp))) + return rc; + DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno)); + flags |= F_BIGDATA; + goto update; + } else { + node_size += data->mv_size; + } + } + node_size = EVEN(node_size); + if ((ssize_t)node_size > room) + goto full; + +update: + /* Move higher pointers up one slot. */ + for (i = NUMKEYS(mp); i > indx; i--) + mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; + + /* Adjust free space offsets. */ + ofs = mp->mp_upper - node_size; + mdb_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t)); + mp->mp_ptrs[indx] = ofs; + mp->mp_upper = ofs; + mp->mp_lower += sizeof(indx_t); + + /* Write the node data. */ + node = NODEPTR(mp, indx); + node->mn_ksize = (key == NULL) ? 0 : key->mv_size; + node->mn_flags = flags; + if (IS_LEAF(mp)) + SETDSZ(node,data->mv_size); + else + SETPGNO(node,pgno); + + if (key) + memcpy(NODEKEY(node), key->mv_data, key->mv_size); + + if (IS_LEAF(mp)) { + ndata = NODEDATA(node); + if (ofp == NULL) { + if (F_ISSET(flags, F_BIGDATA)) + memcpy(ndata, data->mv_data, sizeof(pgno_t)); + else if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = ndata; + else + memcpy(ndata, data->mv_data, data->mv_size); + } else { + memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t)); + ndata = METADATA(ofp); + if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = ndata; + else + memcpy(ndata, data->mv_data, data->mv_size); + } + } + + return MDB_SUCCESS; + +full: + DPRINTF(("not enough room in page %"Z"u, got %u ptrs", + mdb_dbg_pgno(mp), NUMKEYS(mp))); + DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room)); + DPRINTF(("node size = %"Z"u", node_size)); + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return MDB_PAGE_FULL; +} + +/** Delete the specified node from a page. + * @param[in] mc Cursor pointing to the node to delete. + * @param[in] ksize The size of a node. Only used if the page is + * part of a #MDB_DUPFIXED database. + */ +static void +mdb_node_del(MDB_cursor *mc, int ksize) +{ + MDB_page *mp = mc->mc_pg[mc->mc_top]; + indx_t indx = mc->mc_ki[mc->mc_top]; + unsigned int sz; + indx_t i, j, numkeys, ptr; + MDB_node *node; + char *base; + + DPRINTF(("delete node %u on %s page %"Z"u", indx, + IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp))); + numkeys = NUMKEYS(mp); + mdb_cassert(mc, indx < numkeys); + + if (IS_LEAF2(mp)) { + int x = numkeys - 1 - indx; + base = LEAF2KEY(mp, indx, ksize); + if (x) + memmove(base, base + ksize, x * ksize); + mp->mp_lower -= sizeof(indx_t); + mp->mp_upper += ksize - sizeof(indx_t); + return; + } + + node = NODEPTR(mp, indx); + sz = NODESIZE + node->mn_ksize; + if (IS_LEAF(mp)) { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + sz += sizeof(pgno_t); + else + sz += NODEDSZ(node); + } + sz = EVEN(sz); + + ptr = mp->mp_ptrs[indx]; + for (i = j = 0; i < numkeys; i++) { + if (i != indx) { + mp->mp_ptrs[j] = mp->mp_ptrs[i]; + if (mp->mp_ptrs[i] < ptr) + mp->mp_ptrs[j] += sz; + j++; + } + } + + base = (char *)mp + mp->mp_upper + PAGEBASE; + memmove(base + sz, base, ptr - mp->mp_upper); + + mp->mp_lower -= sizeof(indx_t); + mp->mp_upper += sz; +} + +/** Compact the main page after deleting a node on a subpage. + * @param[in] mp The main page to operate on. + * @param[in] indx The index of the subpage on the main page. + */ +static void +mdb_node_shrink(MDB_page *mp, indx_t indx) +{ + MDB_node *node; + MDB_page *sp, *xp; + char *base; + indx_t delta, nsize, len, ptr; + int i; + + node = NODEPTR(mp, indx); + sp = (MDB_page *)NODEDATA(node); + delta = SIZELEFT(sp); + nsize = NODEDSZ(node) - delta; + + /* Prepare to shift upward, set len = length(subpage part to shift) */ + if (IS_LEAF2(sp)) { + len = nsize; + if (nsize & 1) + return; /* do not make the node uneven-sized */ + } else { + xp = (MDB_page *)((char *)sp + delta); /* destination subpage */ + for (i = NUMKEYS(sp); --i >= 0; ) + xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta; + len = PAGEHDRSZ; + } + sp->mp_upper = sp->mp_lower; + COPY_PGNO(sp->mp_pgno, mp->mp_pgno); + SETDSZ(node, nsize); + + /* Shift <lower nodes...initial part of subpage> upward */ + base = (char *)mp + mp->mp_upper + PAGEBASE; + memmove(base + delta, base, (char *)sp + len - base); + + ptr = mp->mp_ptrs[indx]; + for (i = NUMKEYS(mp); --i >= 0; ) { + if (mp->mp_ptrs[i] <= ptr) + mp->mp_ptrs[i] += delta; + } + mp->mp_upper += delta; +} + +/** Initial setup of a sorted-dups cursor. + * Sorted duplicates are implemented as a sub-database for the given key. + * The duplicate data items are actually keys of the sub-database. + * Operations on the duplicate data items are performed using a sub-cursor + * initialized when the sub-database is first accessed. This function does + * the preliminary setup of the sub-cursor, filling in the fields that + * depend only on the parent DB. + * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. + */ +static void +mdb_xcursor_init0(MDB_cursor *mc) +{ + MDB_xcursor *mx = mc->mc_xcursor; + + mx->mx_cursor.mc_xcursor = NULL; + mx->mx_cursor.mc_txn = mc->mc_txn; + mx->mx_cursor.mc_db = &mx->mx_db; + mx->mx_cursor.mc_dbx = &mx->mx_dbx; + mx->mx_cursor.mc_dbi = mc->mc_dbi; + mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; + mx->mx_cursor.mc_snum = 0; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_SUB; + mx->mx_dbx.md_name.mv_size = 0; + mx->mx_dbx.md_name.mv_data = NULL; + mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; + mx->mx_dbx.md_dcmp = NULL; + mx->mx_dbx.md_rel = mc->mc_dbx->md_rel; +} + +/** Final setup of a sorted-dups cursor. + * Sets up the fields that depend on the data from the main cursor. + * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. + * @param[in] node The data containing the #MDB_db record for the + * sorted-dup database. + */ +static void +mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) +{ + MDB_xcursor *mx = mc->mc_xcursor; + + if (node->mn_flags & F_SUBDATA) { + memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); + mx->mx_cursor.mc_pg[0] = 0; + mx->mx_cursor.mc_snum = 0; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_SUB; + } else { + MDB_page *fp = NODEDATA(node); + mx->mx_db.md_pad = 0; + mx->mx_db.md_flags = 0; + mx->mx_db.md_depth = 1; + mx->mx_db.md_branch_pages = 0; + mx->mx_db.md_leaf_pages = 1; + mx->mx_db.md_overflow_pages = 0; + mx->mx_db.md_entries = NUMKEYS(fp); + COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno); + mx->mx_cursor.mc_snum = 1; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; + mx->mx_cursor.mc_pg[0] = fp; + mx->mx_cursor.mc_ki[0] = 0; + if (mc->mc_db->md_flags & MDB_DUPFIXED) { + mx->mx_db.md_flags = MDB_DUPFIXED; + mx->mx_db.md_pad = fp->mp_pad; + if (mc->mc_db->md_flags & MDB_INTEGERDUP) + mx->mx_db.md_flags |= MDB_INTEGERKEY; + } + } + DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, + mx->mx_db.md_root)); + mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; +#if UINT_MAX < SIZE_MAX + if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) + mx->mx_dbx.md_cmp = mdb_cmp_clong; +#endif +} + + +/** Fixup a sorted-dups cursor due to underlying update. + * Sets up some fields that depend on the data from the main cursor. + * Almost the same as init1, but skips initialization steps if the + * xcursor had already been used. + * @param[in] mc The main cursor whose sorted-dups cursor is to be fixed up. + * @param[in] src_mx The xcursor of an up-to-date cursor. + * @param[in] new_dupdata True if converting from a non-#F_DUPDATA item. + */ +static void +mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) +{ + MDB_xcursor *mx = mc->mc_xcursor; + + if (new_dupdata) { + mx->mx_cursor.mc_snum = 1; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags |= C_INITIALIZED; + mx->mx_cursor.mc_ki[0] = 0; + mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; +#if UINT_MAX < SIZE_MAX + mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp; +#endif + } else if (!(mx->mx_cursor.mc_flags & C_INITIALIZED)) { + return; + } + mx->mx_db = src_mx->mx_db; + mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; + DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, + mx->mx_db.md_root)); +} + +/** Initialize a cursor for a given transaction and database. */ +static void +mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) +{ + mc->mc_next = NULL; + mc->mc_backup = NULL; + mc->mc_dbi = dbi; + mc->mc_txn = txn; + mc->mc_db = &txn->mt_dbs[dbi]; + mc->mc_dbx = &txn->mt_dbxs[dbi]; + mc->mc_dbflag = &txn->mt_dbflags[dbi]; + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_pg[0] = 0; + mc->mc_ki[0] = 0; + mc->mc_flags = 0; + if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { + mdb_tassert(txn, mx != NULL); + mc->mc_xcursor = mx; + mdb_xcursor_init0(mc); + } else { + mc->mc_xcursor = NULL; + } + if (*mc->mc_dbflag & DB_STALE) { + mdb_page_search(mc, NULL, MDB_PS_ROOTONLY); + } +} + +int +mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) +{ + MDB_cursor *mc; + size_t size = sizeof(MDB_cursor); + + if (!ret || !TXN_DBI_EXIST(txn, dbi, DB_VALID)) + return EINVAL; + + if (txn->mt_flags & MDB_TXN_BLOCKED) + return MDB_BAD_TXN; + + if (dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + return EINVAL; + + if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) + size += sizeof(MDB_xcursor); + + if ((mc = malloc(size)) != NULL) { + mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); + if (txn->mt_cursors) { + mc->mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = mc; + mc->mc_flags |= C_UNTRACK; + } + } else { + return ENOMEM; + } + + *ret = mc; + + return MDB_SUCCESS; +} + +int +mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc) +{ + if (!mc || !TXN_DBI_EXIST(txn, mc->mc_dbi, DB_VALID)) + return EINVAL; + + if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors) + return EINVAL; + + if (txn->mt_flags & MDB_TXN_BLOCKED) + return MDB_BAD_TXN; + + mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); + return MDB_SUCCESS; +} + +/* Return the count of duplicate data items for the current key */ +int +mdb_cursor_count(MDB_cursor *mc, size_t *countp) +{ + MDB_node *leaf; + + if (mc == NULL || countp == NULL) + return EINVAL; + + if (mc->mc_xcursor == NULL) + return MDB_INCOMPATIBLE; + + if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) + return MDB_BAD_TXN; + + if (!(mc->mc_flags & C_INITIALIZED)) + return EINVAL; + + if (!mc->mc_snum) + return MDB_NOTFOUND; + + if (mc->mc_flags & C_EOF) { + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) + return MDB_NOTFOUND; + mc->mc_flags ^= C_EOF; + } + + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { + *countp = 1; + } else { + if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + return EINVAL; + + *countp = mc->mc_xcursor->mx_db.md_entries; + } + return MDB_SUCCESS; +} + +void +mdb_cursor_close(MDB_cursor *mc) +{ + if (mc && !mc->mc_backup) { + /* remove from txn, if tracked */ + if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { + MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; + while (*prev && *prev != mc) prev = &(*prev)->mc_next; + if (*prev == mc) + *prev = mc->mc_next; + } + free(mc); + } +} + +MDB_txn * +mdb_cursor_txn(MDB_cursor *mc) +{ + if (!mc) return NULL; + return mc->mc_txn; +} + +MDB_dbi +mdb_cursor_dbi(MDB_cursor *mc) +{ + return mc->mc_dbi; +} + +/** Replace the key for a branch node with a new key. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc Cursor pointing to the node to operate on. + * @param[in] key The new key to use. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_update_key(MDB_cursor *mc, MDB_val *key) +{ + MDB_page *mp; + MDB_node *node; + char *base; + size_t len; + int delta, ksize, oksize; + indx_t ptr, i, numkeys, indx; + DKBUF; + + indx = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + node = NODEPTR(mp, indx); + ptr = mp->mp_ptrs[indx]; +#if MDB_DEBUG + { + MDB_val k2; + char kbuf2[DKBUF_MAXKEYSIZE*2+1]; + k2.mv_data = NODEKEY(node); + k2.mv_size = node->mn_ksize; + DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Z"u", + indx, ptr, + mdb_dkey(&k2, kbuf2), + DKEY(key), + mp->mp_pgno)); + } +#endif + + /* Sizes must be 2-byte aligned. */ + ksize = EVEN(key->mv_size); + oksize = EVEN(node->mn_ksize); + delta = ksize - oksize; + + /* Shift node contents if EVEN(key length) changed. */ + if (delta) { + if (delta > 0 && SIZELEFT(mp) < delta) { + pgno_t pgno; + /* not enough space left, do a delete and split */ + DPRINTF(("Not enough room, delta = %d, splitting...", delta)); + pgno = NODEPGNO(node); + mdb_node_del(mc, 0); + return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE); + } + + numkeys = NUMKEYS(mp); + for (i = 0; i < numkeys; i++) { + if (mp->mp_ptrs[i] <= ptr) + mp->mp_ptrs[i] -= delta; + } + + base = (char *)mp + mp->mp_upper + PAGEBASE; + len = ptr - mp->mp_upper + NODESIZE; + memmove(base - delta, base, len); + mp->mp_upper -= delta; + + node = NODEPTR(mp, indx); + } + + /* But even if no shift was needed, update ksize */ + if (node->mn_ksize != key->mv_size) + node->mn_ksize = key->mv_size; + + if (key->mv_size) + memcpy(NODEKEY(node), key->mv_data, key->mv_size); + + return MDB_SUCCESS; +} + +static void +mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); + +/** Perform \b act while tracking temporary cursor \b mn */ +#define WITH_CURSOR_TRACKING(mn, act) do { \ + MDB_cursor dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ + if ((mn).mc_flags & C_SUB) { \ + dummy.mc_flags = C_INITIALIZED; \ + dummy.mc_xcursor = (MDB_xcursor *)&(mn); \ + tracked = &dummy; \ + } else { \ + tracked = &(mn); \ + } \ + tracked->mc_next = *tp; \ + *tp = tracked; \ + { act; } \ + *tp = tracked->mc_next; \ +} while (0) + +/** Move a node from csrc to cdst. + */ +static int +mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) +{ + MDB_node *srcnode; + MDB_val key, data; + pgno_t srcpg; + MDB_cursor mn; + int rc; + unsigned short flags; + + DKBUF; + + /* Mark src and dst as dirty. */ + if ((rc = mdb_page_touch(csrc)) || + (rc = mdb_page_touch(cdst))) + return rc; + + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_size = csrc->mc_db->md_pad; + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size); + data.mv_size = 0; + data.mv_data = NULL; + srcpg = 0; + flags = 0; + } else { + srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]); + mdb_cassert(csrc, !((size_t)srcnode & 1)); + srcpg = NODEPGNO(srcnode); + flags = srcnode->mn_flags; + if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { + unsigned int snum = csrc->mc_snum; + MDB_node *s2; + /* must find the lowest key below src */ + rc = mdb_page_search_lowest(csrc); + if (rc) + return rc; + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_size = csrc->mc_db->md_pad; + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); + } else { + s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); + key.mv_size = NODEKSZ(s2); + key.mv_data = NODEKEY(s2); + } + csrc->mc_snum = snum--; + csrc->mc_top = snum; + } else { + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); + } + data.mv_size = NODEDSZ(srcnode); + data.mv_data = NODEDATA(srcnode); + } + mn.mc_xcursor = NULL; + if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) { + unsigned int snum = cdst->mc_snum; + MDB_node *s2; + MDB_val bkey; + /* must find the lowest key below dst */ + mdb_cursor_copy(cdst, &mn); + rc = mdb_page_search_lowest(&mn); + if (rc) + return rc; + if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { + bkey.mv_size = mn.mc_db->md_pad; + bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size); + } else { + s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); + bkey.mv_size = NODEKSZ(s2); + bkey.mv_data = NODEKEY(s2); + } + mn.mc_snum = snum--; + mn.mc_top = snum; + mn.mc_ki[snum] = 0; + rc = mdb_update_key(&mn, &bkey); + if (rc) + return rc; + } + + DPRINTF(("moving %s node %u [%s] on page %"Z"u to node %u on page %"Z"u", + IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch", + csrc->mc_ki[csrc->mc_top], + DKEY(&key), + csrc->mc_pg[csrc->mc_top]->mp_pgno, + cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno)); + + /* Add the node to the destination page. + */ + rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags); + if (rc != MDB_SUCCESS) + return rc; + + /* Delete the node from the source page. + */ + mdb_node_del(csrc, key.mv_size); + + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = csrc->mc_dbi; + MDB_page *mpd, *mps; + + mps = csrc->mc_pg[csrc->mc_top]; + /* If we're adding on the left, bump others up */ + if (fromleft) { + mpd = cdst->mc_pg[csrc->mc_top]; + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) + continue; + if (m3 != cdst && + m3->mc_pg[csrc->mc_top] == mpd && + m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) { + m3->mc_ki[csrc->mc_top]++; + } + if (m3 !=csrc && + m3->mc_pg[csrc->mc_top] == mps && + m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) { + m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; + m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; + m3->mc_ki[csrc->mc_top-1]++; + } + if (XCURSOR_INITED(m3) && IS_LEAF(mps)) + XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); + } + } else + /* Adding on the right, bump others down */ + { + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == csrc) continue; + if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) + continue; + if (m3->mc_pg[csrc->mc_top] == mps) { + if (!m3->mc_ki[csrc->mc_top]) { + m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; + m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; + m3->mc_ki[csrc->mc_top-1]--; + } else { + m3->mc_ki[csrc->mc_top]--; + } + if (XCURSOR_INITED(m3) && IS_LEAF(mps)) + XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); + } + } + } + } + + /* Update the parent separators. + */ + if (csrc->mc_ki[csrc->mc_top] == 0) { + if (csrc->mc_ki[csrc->mc_top-1] != 0) { + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); + } else { + srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); + } + DPRINTF(("update separator for source page %"Z"u to [%s]", + csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key))); + mdb_cursor_copy(csrc, &mn); + mn.mc_snum--; + mn.mc_top--; + /* We want mdb_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, + rc = mdb_update_key(&mn, &key)); + if (rc) + return rc; + } + if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { + MDB_val nullkey; + indx_t ix = csrc->mc_ki[csrc->mc_top]; + nullkey.mv_size = 0; + csrc->mc_ki[csrc->mc_top] = 0; + rc = mdb_update_key(csrc, &nullkey); + csrc->mc_ki[csrc->mc_top] = ix; + mdb_cassert(csrc, rc == MDB_SUCCESS); + } + } + + if (cdst->mc_ki[cdst->mc_top] == 0) { + if (cdst->mc_ki[cdst->mc_top-1] != 0) { + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size); + } else { + srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); + } + DPRINTF(("update separator for destination page %"Z"u to [%s]", + cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key))); + mdb_cursor_copy(cdst, &mn); + mn.mc_snum--; + mn.mc_top--; + /* We want mdb_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, + rc = mdb_update_key(&mn, &key)); + if (rc) + return rc; + } + if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { + MDB_val nullkey; + indx_t ix = cdst->mc_ki[cdst->mc_top]; + nullkey.mv_size = 0; + cdst->mc_ki[cdst->mc_top] = 0; + rc = mdb_update_key(cdst, &nullkey); + cdst->mc_ki[cdst->mc_top] = ix; + mdb_cassert(cdst, rc == MDB_SUCCESS); + } + } + + return MDB_SUCCESS; +} + +/** Merge one page into another. + * The nodes from the page pointed to by \b csrc will + * be copied to the page pointed to by \b cdst and then + * the \b csrc page will be freed. + * @param[in] csrc Cursor pointing to the source page. + * @param[in] cdst Cursor pointing to the destination page. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) +{ + MDB_page *psrc, *pdst; + MDB_node *srcnode; + MDB_val key, data; + unsigned nkeys; + int rc; + indx_t i, j; + + psrc = csrc->mc_pg[csrc->mc_top]; + pdst = cdst->mc_pg[cdst->mc_top]; + + DPRINTF(("merging page %"Z"u into %"Z"u", psrc->mp_pgno, pdst->mp_pgno)); + + mdb_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ + mdb_cassert(csrc, cdst->mc_snum > 1); + + /* Mark dst as dirty. */ + if ((rc = mdb_page_touch(cdst))) + return rc; + + /* get dst page again now that we've touched it. */ + pdst = cdst->mc_pg[cdst->mc_top]; + + /* Move all nodes from src to dst. + */ + j = nkeys = NUMKEYS(pdst); + if (IS_LEAF2(psrc)) { + key.mv_size = csrc->mc_db->md_pad; + key.mv_data = METADATA(psrc); + for (i = 0; i < NUMKEYS(psrc); i++, j++) { + rc = mdb_node_add(cdst, j, &key, NULL, 0, 0); + if (rc != MDB_SUCCESS) + return rc; + key.mv_data = (char *)key.mv_data + key.mv_size; + } + } else { + for (i = 0; i < NUMKEYS(psrc); i++, j++) { + srcnode = NODEPTR(psrc, i); + if (i == 0 && IS_BRANCH(psrc)) { + MDB_cursor mn; + MDB_node *s2; + mdb_cursor_copy(csrc, &mn); + mn.mc_xcursor = NULL; + /* must find the lowest key below src */ + rc = mdb_page_search_lowest(&mn); + if (rc) + return rc; + if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { + key.mv_size = mn.mc_db->md_pad; + key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size); + } else { + s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); + key.mv_size = NODEKSZ(s2); + key.mv_data = NODEKEY(s2); + } + } else { + key.mv_size = srcnode->mn_ksize; + key.mv_data = NODEKEY(srcnode); + } + + data.mv_size = NODEDSZ(srcnode); + data.mv_data = NODEDATA(srcnode); + rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags); + if (rc != MDB_SUCCESS) + return rc; + } + } + + DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)", + pdst->mp_pgno, NUMKEYS(pdst), + (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10)); + + /* Unlink the src page from parent and add to free list. + */ + csrc->mc_top--; + mdb_node_del(csrc, 0); + if (csrc->mc_ki[csrc->mc_top] == 0) { + key.mv_size = 0; + rc = mdb_update_key(csrc, &key); + if (rc) { + csrc->mc_top++; + return rc; + } + } + csrc->mc_top++; + + psrc = csrc->mc_pg[csrc->mc_top]; + /* If not operating on FreeDB, allow this page to be reused + * in this txn. Otherwise just add to free list. + */ + rc = mdb_page_loose(csrc, psrc); + if (rc) + return rc; + if (IS_LEAF(psrc)) + csrc->mc_db->md_leaf_pages--; + else + csrc->mc_db->md_branch_pages--; + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = csrc->mc_dbi; + unsigned int top = csrc->mc_top; + + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == csrc) continue; + if (m3->mc_snum < csrc->mc_snum) continue; + if (m3->mc_pg[top] == psrc) { + m3->mc_pg[top] = pdst; + m3->mc_ki[top] += nkeys; + m3->mc_ki[top-1] = cdst->mc_ki[top-1]; + } else if (m3->mc_pg[top-1] == csrc->mc_pg[top-1] && + m3->mc_ki[top-1] > csrc->mc_ki[top-1]) { + m3->mc_ki[top-1]--; + } + if (XCURSOR_INITED(m3) && IS_LEAF(psrc)) + XCURSOR_REFRESH(m3, m3->mc_pg[top], m3->mc_ki[top]); + } + } + { + unsigned int snum = cdst->mc_snum; + uint16_t depth = cdst->mc_db->md_depth; + mdb_cursor_pop(cdst); + rc = mdb_rebalance(cdst); + /* Did the tree height change? */ + if (depth != cdst->mc_db->md_depth) + snum += cdst->mc_db->md_depth - depth; + cdst->mc_snum = snum; + cdst->mc_top = snum-1; + } + return rc; +} + +/** Copy the contents of a cursor. + * @param[in] csrc The cursor to copy from. + * @param[out] cdst The cursor to copy to. + */ +static void +mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) +{ + unsigned int i; + + cdst->mc_txn = csrc->mc_txn; + cdst->mc_dbi = csrc->mc_dbi; + cdst->mc_db = csrc->mc_db; + cdst->mc_dbx = csrc->mc_dbx; + cdst->mc_snum = csrc->mc_snum; + cdst->mc_top = csrc->mc_top; + cdst->mc_flags = csrc->mc_flags; + + for (i=0; i<csrc->mc_snum; i++) { + cdst->mc_pg[i] = csrc->mc_pg[i]; + cdst->mc_ki[i] = csrc->mc_ki[i]; + } +} + +/** Rebalance the tree after a delete operation. + * @param[in] mc Cursor pointing to the page where rebalancing + * should begin. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_rebalance(MDB_cursor *mc) +{ + MDB_node *node; + int rc, fromleft; + unsigned int ptop, minkeys, thresh; + MDB_cursor mn; + indx_t oldki; + + if (IS_BRANCH(mc->mc_pg[mc->mc_top])) { + minkeys = 2; + thresh = 1; + } else { + minkeys = 1; + thresh = FILL_THRESHOLD; + } + DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)", + IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", + mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), + (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10)); + + if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh && + NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { + DPRINTF(("no need to rebalance page %"Z"u, above fill threshold", + mdb_dbg_pgno(mc->mc_pg[mc->mc_top]))); + return MDB_SUCCESS; + } + + if (mc->mc_snum < 2) { + MDB_page *mp = mc->mc_pg[0]; + if (IS_SUBP(mp)) { + DPUTS("Can't rebalance a subpage, ignoring"); + return MDB_SUCCESS; + } + if (NUMKEYS(mp) == 0) { + DPUTS("tree is completely empty"); + mc->mc_db->md_root = P_INVALID; + mc->mc_db->md_depth = 0; + mc->mc_db->md_leaf_pages = 0; + rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + if (rc) + return rc; + /* Adjust cursors pointing to mp */ + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_flags &= ~C_INITIALIZED; + { + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum)) + continue; + if (m3->mc_pg[0] == mp) { + m3->mc_snum = 0; + m3->mc_top = 0; + m3->mc_flags &= ~C_INITIALIZED; + } + } + } + } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { + int i; + DPUTS("collapsing root page!"); + rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + if (rc) + return rc; + mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); + rc = mdb_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], NULL); + if (rc) + return rc; + mc->mc_db->md_depth--; + mc->mc_db->md_branch_pages--; + mc->mc_ki[0] = mc->mc_ki[1]; + for (i = 1; i<mc->mc_db->md_depth; i++) { + mc->mc_pg[i] = mc->mc_pg[i+1]; + mc->mc_ki[i] = mc->mc_ki[i+1]; + } + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc) continue; + if (!(m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_pg[0] == mp) { + for (i=0; i<mc->mc_db->md_depth; i++) { + m3->mc_pg[i] = m3->mc_pg[i+1]; + m3->mc_ki[i] = m3->mc_ki[i+1]; + } + m3->mc_snum--; + m3->mc_top--; + } + } + } + } else + DPUTS("root page doesn't need rebalancing"); + return MDB_SUCCESS; + } + + /* The parent (branch page) must have at least 2 pointers, + * otherwise the tree is invalid. + */ + ptop = mc->mc_top-1; + mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1); + + /* Leaf page fill factor is below the threshold. + * Try to move keys from left or right neighbor, or + * merge with a neighbor page. + */ + + /* Find neighbors. + */ + mdb_cursor_copy(mc, &mn); + mn.mc_xcursor = NULL; + + oldki = mc->mc_ki[mc->mc_top]; + if (mc->mc_ki[ptop] == 0) { + /* We're the leftmost leaf in our parent. + */ + DPUTS("reading right neighbor"); + mn.mc_ki[ptop]++; + node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); + rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); + if (rc) + return rc; + mn.mc_ki[mn.mc_top] = 0; + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); + fromleft = 0; + } else { + /* There is at least one neighbor to the left. + */ + DPUTS("reading left neighbor"); + mn.mc_ki[ptop]--; + node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); + rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); + if (rc) + return rc; + mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; + mc->mc_ki[mc->mc_top] = 0; + fromleft = 1; + } + + DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)", + mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), + (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10)); + + /* If the neighbor page is above threshold and has enough keys, + * move one key from it. Otherwise we should try to merge them. + * (A branch page must never have less than 2 keys.) + */ + if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { + rc = mdb_node_move(&mn, mc, fromleft); + if (fromleft) { + /* if we inserted on left, bump position up */ + oldki++; + } + } else { + if (!fromleft) { + rc = mdb_page_merge(&mn, mc); + } else { + oldki += NUMKEYS(mn.mc_pg[mn.mc_top]); + mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; + /* We want mdb_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, + rc = mdb_page_merge(mc, &mn)); + mdb_cursor_copy(&mn, mc); + } + mc->mc_flags &= ~C_EOF; + } + mc->mc_ki[mc->mc_top] = oldki; + return rc; +} + +/** Complete a delete operation started by #mdb_cursor_del(). */ +static int +mdb_cursor_del0(MDB_cursor *mc) +{ + int rc; + MDB_page *mp; + indx_t ki; + unsigned int nkeys; + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + + ki = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + mdb_node_del(mc, mc->mc_db->md_pad); + mc->mc_db->md_entries--; + { + /* Adjust other cursors pointing to mp */ + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3 == mc || m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] == ki) { + m3->mc_flags |= C_DEL; + if (mc->mc_db->md_flags & MDB_DUPSORT) { + /* Sub-cursor referred into dataset which is gone */ + m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + } + continue; + } else if (m3->mc_ki[mc->mc_top] > ki) { + m3->mc_ki[mc->mc_top]--; + } + if (XCURSOR_INITED(m3)) + XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); + } + } + } + rc = mdb_rebalance(mc); + + if (rc == MDB_SUCCESS) { + /* DB is totally empty now, just bail out. + * Other cursors adjustments were already done + * by mdb_rebalance and aren't needed here. + */ + if (!mc->mc_snum) + return rc; + + mp = mc->mc_pg[mc->mc_top]; + nkeys = NUMKEYS(mp); + + /* Adjust other cursors pointing to mp */ + for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2=m2->mc_next) { + m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + /* if m3 points past last node in page, find next sibling */ + if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) { + if (m3->mc_ki[mc->mc_top] >= nkeys) { + rc = mdb_cursor_sibling(m3, 1); + if (rc == MDB_NOTFOUND) { + m3->mc_flags |= C_EOF; + rc = MDB_SUCCESS; + continue; + } + } + if (mc->mc_db->md_flags & MDB_DUPSORT) { + MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); + /* If this node has dupdata, it may need to be reinited + * because its data has moved. + * If the xcursor was not initd it must be reinited. + * Else if node points to a subDB, nothing is needed. + * Else (xcursor was initd, not a subDB) needs mc_pg[0] reset. + */ + if (node->mn_flags & F_DUPDATA) { + if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + if (!(node->mn_flags & F_SUBDATA)) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + } else + mdb_xcursor_init1(m3, node); + } + } + } + } + } + mc->mc_flags |= C_DEL; + } + + if (rc) + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +int +mdb_del(MDB_txn *txn, MDB_dbi dbi, + MDB_val *key, MDB_val *data) +{ + if (!key || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) + return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + + if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { + /* must ignore any data */ + data = NULL; + } + + return mdb_del0(txn, dbi, key, data, 0); +} + +static int +mdb_del0(MDB_txn *txn, MDB_dbi dbi, + MDB_val *key, MDB_val *data, unsigned flags) +{ + MDB_cursor mc; + MDB_xcursor mx; + MDB_cursor_op op; + MDB_val rdata, *xdata; + int rc, exact = 0; + DKBUF; + + DPRINTF(("====> delete db %u key [%s]", dbi, DKEY(key))); + + mdb_cursor_init(&mc, txn, dbi, &mx); + + if (data) { + op = MDB_GET_BOTH; + rdata = *data; + xdata = &rdata; + } else { + op = MDB_SET; + xdata = NULL; + flags |= MDB_NODUPDATA; + } + rc = mdb_cursor_set(&mc, key, xdata, op, &exact); + if (rc == 0) { + /* let mdb_page_split know about this cursor if needed: + * delete will trigger a rebalance; if it needs to move + * a node from one page to another, it will have to + * update the parent's separator key(s). If the new sepkey + * is larger than the current one, the parent page may + * run out of space, triggering a split. We need this + * cursor to be consistent until the end of the rebalance. + */ + mc.mc_flags |= C_UNTRACK; + mc.mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = &mc; + rc = mdb_cursor_del(&mc, flags); + txn->mt_cursors[dbi] = mc.mc_next; + } + return rc; +} + +/** Split a page and insert a new node. + * Set #MDB_TXN_ERROR on failure. + * @param[in,out] mc Cursor pointing to the page and desired insertion index. + * The cursor will be updated to point to the actual page and index where + * the node got inserted after the split. + * @param[in] newkey The key for the newly inserted node. + * @param[in] newdata The data for the newly inserted node. + * @param[in] newpgno The page number, if the new node is a branch node. + * @param[in] nflags The #NODE_ADD_FLAGS for the new node. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, + unsigned int nflags) +{ + unsigned int flags; + int rc = MDB_SUCCESS, new_root = 0, did_split = 0; + indx_t newindx; + pgno_t pgno = 0; + int i, j, split_indx, nkeys, pmax; + MDB_env *env = mc->mc_txn->mt_env; + MDB_node *node; + MDB_val sepkey, rkey, xdata, *rdata = &xdata; + MDB_page *copy = NULL; + MDB_page *mp, *rp, *pp; + int ptop; + MDB_cursor mn; + DKBUF; + + mp = mc->mc_pg[mc->mc_top]; + newindx = mc->mc_ki[mc->mc_top]; + nkeys = NUMKEYS(mp); + + DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i", + IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, + DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys)); + + /* Create a right sibling. */ + if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) + return rc; + rp->mp_pad = mp->mp_pad; + DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno)); + + /* Usually when splitting the root page, the cursor + * height is 1. But when called from mdb_update_key, + * the cursor height may be greater because it walks + * up the stack while finding the branch slot to update. + */ + if (mc->mc_top < 1) { + if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) + goto done; + /* shift current top to make room for new parent */ + for (i=mc->mc_snum; i>0; i--) { + mc->mc_pg[i] = mc->mc_pg[i-1]; + mc->mc_ki[i] = mc->mc_ki[i-1]; + } + mc->mc_pg[0] = pp; + mc->mc_ki[0] = 0; + mc->mc_db->md_root = pp->mp_pgno; + DPRINTF(("root split! new root = %"Z"u", pp->mp_pgno)); + new_root = mc->mc_db->md_depth++; + + /* Add left (implicit) pointer. */ + if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) { + /* undo the pre-push */ + mc->mc_pg[0] = mc->mc_pg[1]; + mc->mc_ki[0] = mc->mc_ki[1]; + mc->mc_db->md_root = mp->mp_pgno; + mc->mc_db->md_depth--; + goto done; + } + mc->mc_snum++; + mc->mc_top++; + ptop = 0; + } else { + ptop = mc->mc_top-1; + DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno)); + } + + mdb_cursor_copy(mc, &mn); + mn.mc_xcursor = NULL; + mn.mc_pg[mn.mc_top] = rp; + mn.mc_ki[ptop] = mc->mc_ki[ptop]+1; + + if (nflags & MDB_APPEND) { + mn.mc_ki[mn.mc_top] = 0; + sepkey = *newkey; + split_indx = newindx; + nkeys = 0; + } else { + + split_indx = (nkeys+1) / 2; + + if (IS_LEAF2(rp)) { + char *split, *ins; + int x; + unsigned int lsize, rsize, ksize; + /* Move half of the keys to the right sibling */ + x = mc->mc_ki[mc->mc_top] - split_indx; + ksize = mc->mc_db->md_pad; + split = LEAF2KEY(mp, split_indx, ksize); + rsize = (nkeys - split_indx) * ksize; + lsize = (nkeys - split_indx) * sizeof(indx_t); + mp->mp_lower -= lsize; + rp->mp_lower += lsize; + mp->mp_upper += rsize - lsize; + rp->mp_upper -= rsize - lsize; + sepkey.mv_size = ksize; + if (newindx == split_indx) { + sepkey.mv_data = newkey->mv_data; + } else { + sepkey.mv_data = split; + } + if (x<0) { + ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); + memcpy(rp->mp_ptrs, split, rsize); + sepkey.mv_data = rp->mp_ptrs; + memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); + memcpy(ins, newkey->mv_data, ksize); + mp->mp_lower += sizeof(indx_t); + mp->mp_upper -= ksize - sizeof(indx_t); + } else { + if (x) + memcpy(rp->mp_ptrs, split, x * ksize); + ins = LEAF2KEY(rp, x, ksize); + memcpy(ins, newkey->mv_data, ksize); + memcpy(ins+ksize, split + x * ksize, rsize - x * ksize); + rp->mp_lower += sizeof(indx_t); + rp->mp_upper -= ksize - sizeof(indx_t); + mc->mc_ki[mc->mc_top] = x; + } + } else { + int psize, nsize, k; + /* Maximum free space in an empty page */ + pmax = env->me_psize - PAGEHDRSZ; + if (IS_LEAF(mp)) + nsize = mdb_leaf_size(env, newkey, newdata); + else + nsize = mdb_branch_size(env, newkey); + nsize = EVEN(nsize); + + /* grab a page to hold a temporary copy */ + copy = mdb_page_malloc(mc->mc_txn, 1); + if (copy == NULL) { + rc = ENOMEM; + goto done; + } + copy->mp_pgno = mp->mp_pgno; + copy->mp_flags = mp->mp_flags; + copy->mp_lower = (PAGEHDRSZ-PAGEBASE); + copy->mp_upper = env->me_psize - PAGEBASE; + + /* prepare to insert */ + for (i=0, j=0; i<nkeys; i++) { + if (i == newindx) { + copy->mp_ptrs[j++] = 0; + } + copy->mp_ptrs[j++] = mp->mp_ptrs[i]; + } + + /* When items are relatively large the split point needs + * to be checked, because being off-by-one will make the + * difference between success or failure in mdb_node_add. + * + * It's also relevant if a page happens to be laid out + * such that one half of its nodes are all "small" and + * the other half of its nodes are "large." If the new + * item is also "large" and falls on the half with + * "large" nodes, it also may not fit. + * + * As a final tweak, if the new item goes on the last + * spot on the page (and thus, onto the new page), bias + * the split so the new page is emptier than the old page. + * This yields better packing during sequential inserts. + */ + if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) { + /* Find split point */ + psize = 0; + if (newindx <= split_indx || newindx >= nkeys) { + i = 0; j = 1; + k = newindx >= nkeys ? nkeys : split_indx+1+IS_LEAF(mp); + } else { + i = nkeys; j = -1; + k = split_indx-1; + } + for (; i!=k; i+=j) { + if (i == newindx) { + psize += nsize; + node = NULL; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); + psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); + if (IS_LEAF(mp)) { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + psize += sizeof(pgno_t); + else + psize += NODEDSZ(node); + } + psize = EVEN(psize); + } + if (psize > pmax || i == k-j) { + split_indx = i + (j<0); + break; + } + } + } + if (split_indx == newindx) { + sepkey.mv_size = newkey->mv_size; + sepkey.mv_data = newkey->mv_data; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE); + sepkey.mv_size = node->mn_ksize; + sepkey.mv_data = NODEKEY(node); + } + } + } + + DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey))); + + /* Copy separator key to the parent. + */ + if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { + int snum = mc->mc_snum; + mn.mc_snum--; + mn.mc_top--; + did_split = 1; + /* We want other splits to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, + rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0)); + if (rc) + goto done; + + /* root split? */ + if (mc->mc_snum > snum) { + ptop++; + } + /* Right page might now have changed parent. + * Check if left page also changed parent. + */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i=0; i<ptop; i++) { + mc->mc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + mc->mc_pg[ptop] = mn.mc_pg[ptop]; + if (mn.mc_ki[ptop]) { + mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; + } else { + /* find right page's left sibling */ + mc->mc_ki[ptop] = mn.mc_ki[ptop]; + mdb_cursor_sibling(mc, 0); + } + } + } else { + mn.mc_top--; + rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); + mn.mc_top++; + } + if (rc != MDB_SUCCESS) { + goto done; + } + if (nflags & MDB_APPEND) { + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[mc->mc_top] = 0; + rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags); + if (rc) + goto done; + for (i=0; i<mc->mc_top; i++) + mc->mc_ki[i] = mn.mc_ki[i]; + } else if (!IS_LEAF2(mp)) { + /* Move nodes */ + mc->mc_pg[mc->mc_top] = rp; + i = split_indx; + j = 0; + do { + if (i == newindx) { + rkey.mv_data = newkey->mv_data; + rkey.mv_size = newkey->mv_size; + if (IS_LEAF(mp)) { + rdata = newdata; + } else + pgno = newpgno; + flags = nflags; + /* Update index for the new key. */ + mc->mc_ki[mc->mc_top] = j; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); + rkey.mv_data = NODEKEY(node); + rkey.mv_size = node->mn_ksize; + if (IS_LEAF(mp)) { + xdata.mv_data = NODEDATA(node); + xdata.mv_size = NODEDSZ(node); + rdata = &xdata; + } else + pgno = NODEPGNO(node); + flags = node->mn_flags; + } + + if (!IS_LEAF(mp) && j == 0) { + /* First branch index doesn't need key data. */ + rkey.mv_size = 0; + } + + rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags); + if (rc) + goto done; + if (i == nkeys) { + i = 0; + j = 0; + mc->mc_pg[mc->mc_top] = copy; + } else { + i++; + j++; + } + } while (i != split_indx); + + nkeys = NUMKEYS(copy); + for (i=0; i<nkeys; i++) + mp->mp_ptrs[i] = copy->mp_ptrs[i]; + mp->mp_lower = copy->mp_lower; + mp->mp_upper = copy->mp_upper; + memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1), + env->me_psize - copy->mp_upper - PAGEBASE); + + /* reset back to original page */ + if (newindx < split_indx) { + mc->mc_pg[mc->mc_top] = mp; + } else { + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[ptop]++; + /* Make sure mc_ki is still valid. + */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i=0; i<=ptop; i++) { + mc->mc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + } + } + if (nflags & MDB_RESERVE) { + node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!(node->mn_flags & F_BIGDATA)) + newdata->mv_data = NODEDATA(node); + } + } else { + if (newindx >= split_indx) { + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[ptop]++; + /* Make sure mc_ki is still valid. + */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i=0; i<=ptop; i++) { + mc->mc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + } + } + } + + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + nkeys = NUMKEYS(mp); + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc) + continue; + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (new_root) { + int k; + /* sub cursors may be on different DB */ + if (m3->mc_pg[0] != mp) + continue; + /* root split */ + for (k=new_root; k>=0; k--) { + m3->mc_ki[k+1] = m3->mc_ki[k]; + m3->mc_pg[k+1] = m3->mc_pg[k]; + } + if (m3->mc_ki[0] >= nkeys) { + m3->mc_ki[0] = 1; + } else { + m3->mc_ki[0] = 0; + } + m3->mc_pg[0] = mc->mc_pg[0]; + m3->mc_snum++; + m3->mc_top++; + } + if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) + m3->mc_ki[mc->mc_top]++; + if (m3->mc_ki[mc->mc_top] >= nkeys) { + m3->mc_pg[mc->mc_top] = rp; + m3->mc_ki[mc->mc_top] -= nkeys; + for (i=0; i<mc->mc_top; i++) { + m3->mc_ki[i] = mn.mc_ki[i]; + m3->mc_pg[i] = mn.mc_pg[i]; + } + } + } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] && + m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { + m3->mc_ki[ptop]++; + } + if (XCURSOR_INITED(m3) && IS_LEAF(mp)) + XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); + } + } + DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp))); + +done: + if (copy) /* tmp page */ + mdb_page_free(env, copy); + if (rc) + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +int +mdb_put(MDB_txn *txn, MDB_dbi dbi, + MDB_val *key, MDB_val *data, unsigned int flags) +{ + MDB_cursor mc; + MDB_xcursor mx; + int rc; + + if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + if (flags & ~(MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) + return EINVAL; + + if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) + return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + + mdb_cursor_init(&mc, txn, dbi, &mx); + mc.mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = &mc; + rc = mdb_cursor_put(&mc, key, data, flags); + txn->mt_cursors[dbi] = mc.mc_next; + return rc; +} + +#ifndef MDB_WBUF +#define MDB_WBUF (1024*1024) +#endif +#define MDB_EOF 0x10 /**< #mdb_env_copyfd1() is done reading */ + + /** State needed for a double-buffering compacting copy. */ +typedef struct mdb_copy { + MDB_env *mc_env; + MDB_txn *mc_txn; + pthread_mutex_t mc_mutex; + pthread_cond_t mc_cond; /**< Condition variable for #mc_new */ + char *mc_wbuf[2]; + char *mc_over[2]; + int mc_wlen[2]; + int mc_olen[2]; + pgno_t mc_next_pgno; + HANDLE mc_fd; + int mc_toggle; /**< Buffer number in provider */ + int mc_new; /**< (0-2 buffers to write) | (#MDB_EOF at end) */ + /** Error code. Never cleared if set. Both threads can set nonzero + * to fail the copy. Not mutex-protected, LMDB expects atomic int. + */ + volatile int mc_error; +} mdb_copy; + + /** Dedicated writer thread for compacting copy. */ +static THREAD_RET ESECT CALL_CONV +mdb_env_copythr(void *arg) +{ + mdb_copy *my = arg; + char *ptr; + int toggle = 0, wsize, rc; +#ifdef _WIN32 + DWORD len; +#define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) +#else + int len; +#define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) +#ifdef SIGPIPE + sigset_t set; + sigemptyset(&set); + sigaddset(&set, SIGPIPE); + if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0) + my->mc_error = rc; +#endif +#endif + + pthread_mutex_lock(&my->mc_mutex); + for(;;) { + while (!my->mc_new) + pthread_cond_wait(&my->mc_cond, &my->mc_mutex); + if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */ + break; + wsize = my->mc_wlen[toggle]; + ptr = my->mc_wbuf[toggle]; +again: + rc = MDB_SUCCESS; + while (wsize > 0 && !my->mc_error) { + DO_WRITE(rc, my->mc_fd, ptr, wsize, len); + if (!rc) { + rc = ErrCode(); +#if defined(SIGPIPE) && !defined(_WIN32) + if (rc == EPIPE) { + /* Collect the pending SIGPIPE, otherwise at least OS X + * gives it to the process on thread-exit (ITS#8504). + */ + int tmp; + sigwait(&set, &tmp); + } +#endif + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + wsize -= len; + continue; + } else { + rc = EIO; + break; + } + } + if (rc) { + my->mc_error = rc; + } + /* If there's an overflow page tail, write it too */ + if (my->mc_olen[toggle]) { + wsize = my->mc_olen[toggle]; + ptr = my->mc_over[toggle]; + my->mc_olen[toggle] = 0; + goto again; + } + my->mc_wlen[toggle] = 0; + toggle ^= 1; + /* Return the empty buffer to provider */ + my->mc_new--; + pthread_cond_signal(&my->mc_cond); + } + pthread_mutex_unlock(&my->mc_mutex); + return (THREAD_RET)0; +#undef DO_WRITE +} + + /** Give buffer and/or #MDB_EOF to writer thread, await unused buffer. + * + * @param[in] my control structure. + * @param[in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending). + */ +static int ESECT +mdb_env_cthr_toggle(mdb_copy *my, int adjust) +{ + pthread_mutex_lock(&my->mc_mutex); + my->mc_new += adjust; + pthread_cond_signal(&my->mc_cond); + while (my->mc_new & 2) /* both buffers in use */ + pthread_cond_wait(&my->mc_cond, &my->mc_mutex); + pthread_mutex_unlock(&my->mc_mutex); + + my->mc_toggle ^= (adjust & 1); + /* Both threads reset mc_wlen, to be safe from threading errors */ + my->mc_wlen[my->mc_toggle] = 0; + return my->mc_error; +} + + /** Depth-first tree traversal for compacting copy. + * @param[in] my control structure. + * @param[in,out] pg database root. + * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB. + */ +static int ESECT +mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) +{ + MDB_cursor mc = {0}; + MDB_node *ni; + MDB_page *mo, *mp, *leaf; + char *buf, *ptr; + int rc, toggle; + unsigned int i; + + /* Empty DB, nothing to do */ + if (*pg == P_INVALID) + return MDB_SUCCESS; + + mc.mc_snum = 1; + mc.mc_txn = my->mc_txn; + + rc = mdb_page_get(&mc, *pg, &mc.mc_pg[0], NULL); + if (rc) + return rc; + rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST); + if (rc) + return rc; + + /* Make cursor pages writable */ + buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum); + if (buf == NULL) + return ENOMEM; + + for (i=0; i<mc.mc_top; i++) { + mdb_page_copy((MDB_page *)ptr, mc.mc_pg[i], my->mc_env->me_psize); + mc.mc_pg[i] = (MDB_page *)ptr; + ptr += my->mc_env->me_psize; + } + + /* This is writable space for a leaf page. Usually not needed. */ + leaf = (MDB_page *)ptr; + + toggle = my->mc_toggle; + while (mc.mc_snum > 0) { + unsigned n; + mp = mc.mc_pg[mc.mc_top]; + n = NUMKEYS(mp); + + if (IS_LEAF(mp)) { + if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) { + for (i=0; i<n; i++) { + ni = NODEPTR(mp, i); + if (ni->mn_flags & F_BIGDATA) { + MDB_page *omp; + pgno_t pg; + + /* Need writable leaf */ + if (mp != leaf) { + mc.mc_pg[mc.mc_top] = leaf; + mdb_page_copy(leaf, mp, my->mc_env->me_psize); + mp = leaf; + ni = NODEPTR(mp, i); + } + + memcpy(&pg, NODEDATA(ni), sizeof(pg)); + memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t)); + rc = mdb_page_get(&mc, pg, &omp, NULL); + if (rc) + goto done; + if (my->mc_wlen[toggle] >= MDB_WBUF) { + rc = mdb_env_cthr_toggle(my, 1); + if (rc) + goto done; + toggle = my->mc_toggle; + } + mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); + memcpy(mo, omp, my->mc_env->me_psize); + mo->mp_pgno = my->mc_next_pgno; + my->mc_next_pgno += omp->mp_pages; + my->mc_wlen[toggle] += my->mc_env->me_psize; + if (omp->mp_pages > 1) { + my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1); + my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize; + rc = mdb_env_cthr_toggle(my, 1); + if (rc) + goto done; + toggle = my->mc_toggle; + } + } else if (ni->mn_flags & F_SUBDATA) { + MDB_db db; + + /* Need writable leaf */ + if (mp != leaf) { + mc.mc_pg[mc.mc_top] = leaf; + mdb_page_copy(leaf, mp, my->mc_env->me_psize); + mp = leaf; + ni = NODEPTR(mp, i); + } + + memcpy(&db, NODEDATA(ni), sizeof(db)); + my->mc_toggle = toggle; + rc = mdb_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA); + if (rc) + goto done; + toggle = my->mc_toggle; + memcpy(NODEDATA(ni), &db, sizeof(db)); + } + } + } + } else { + mc.mc_ki[mc.mc_top]++; + if (mc.mc_ki[mc.mc_top] < n) { + pgno_t pg; +again: + ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]); + pg = NODEPGNO(ni); + rc = mdb_page_get(&mc, pg, &mp, NULL); + if (rc) + goto done; + mc.mc_top++; + mc.mc_snum++; + mc.mc_ki[mc.mc_top] = 0; + if (IS_BRANCH(mp)) { + /* Whenever we advance to a sibling branch page, + * we must proceed all the way down to its first leaf. + */ + mdb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize); + goto again; + } else + mc.mc_pg[mc.mc_top] = mp; + continue; + } + } + if (my->mc_wlen[toggle] >= MDB_WBUF) { + rc = mdb_env_cthr_toggle(my, 1); + if (rc) + goto done; + toggle = my->mc_toggle; + } + mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); + mdb_page_copy(mo, mp, my->mc_env->me_psize); + mo->mp_pgno = my->mc_next_pgno++; + my->mc_wlen[toggle] += my->mc_env->me_psize; + if (mc.mc_top) { + /* Update parent if there is one */ + ni = NODEPTR(mc.mc_pg[mc.mc_top-1], mc.mc_ki[mc.mc_top-1]); + SETPGNO(ni, mo->mp_pgno); + mdb_cursor_pop(&mc); + } else { + /* Otherwise we're done */ + *pg = mo->mp_pgno; + break; + } + } +done: + free(buf); + return rc; +} + + /** Copy environment with compaction. */ +static int ESECT +mdb_env_copyfd1(MDB_env *env, HANDLE fd) +{ + MDB_meta *mm; + MDB_page *mp; + mdb_copy my = {0}; + MDB_txn *txn = NULL; + pthread_t thr; + pgno_t root, new_root; + int rc = MDB_SUCCESS; + +#ifdef _WIN32 + if (!(my.mc_mutex = CreateMutex(NULL, FALSE, NULL)) || + !(my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL))) { + rc = ErrCode(); + goto done; + } + my.mc_wbuf[0] = _aligned_malloc(MDB_WBUF*2, env->me_os_psize); + if (my.mc_wbuf[0] == NULL) { + /* _aligned_malloc() sets errno, but we use Windows error codes */ + rc = ERROR_NOT_ENOUGH_MEMORY; + goto done; + } +#else + if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0) + return rc; + if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0) + goto done2; +#ifdef HAVE_MEMALIGN + my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2); + if (my.mc_wbuf[0] == NULL) { + rc = errno; + goto done; + } +#else + { + void *p; + if ((rc = posix_memalign(&p, env->me_os_psize, MDB_WBUF*2)) != 0) + goto done; + my.mc_wbuf[0] = p; + } +#endif +#endif + memset(my.mc_wbuf[0], 0, MDB_WBUF*2); + my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF; + my.mc_next_pgno = NUM_METAS; + my.mc_env = env; + my.mc_fd = fd; + rc = THREAD_CREATE(thr, mdb_env_copythr, &my); + if (rc) + goto done; + + rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc) + goto finish; + + mp = (MDB_page *)my.mc_wbuf[0]; + memset(mp, 0, NUM_METAS * env->me_psize); + mp->mp_pgno = 0; + mp->mp_flags = P_META; + mm = (MDB_meta *)METADATA(mp); + mdb_env_init_meta0(env, mm); + mm->mm_address = env->me_metas[0]->mm_address; + + mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize); + mp->mp_pgno = 1; + mp->mp_flags = P_META; + *(MDB_meta *)METADATA(mp) = *mm; + mm = (MDB_meta *)METADATA(mp); + + /* Set metapage 1 with current main DB */ + root = new_root = txn->mt_dbs[MAIN_DBI].md_root; + if (root != P_INVALID) { + /* Count free pages + freeDB pages. Subtract from last_pg + * to find the new last_pg, which also becomes the new root. + */ + MDB_ID freecount = 0; + MDB_cursor mc; + MDB_val key, data; + mdb_cursor_init(&mc, txn, FREE_DBI, NULL); + while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) + freecount += *(MDB_ID *)data.mv_data; + if (rc != MDB_NOTFOUND) + goto finish; + freecount += txn->mt_dbs[FREE_DBI].md_branch_pages + + txn->mt_dbs[FREE_DBI].md_leaf_pages + + txn->mt_dbs[FREE_DBI].md_overflow_pages; + + new_root = txn->mt_next_pgno - 1 - freecount; + mm->mm_last_pg = new_root; + mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + mm->mm_dbs[MAIN_DBI].md_root = new_root; + } else { + /* When the DB is empty, handle it specially to + * fix any breakage like page leaks from ITS#8174. + */ + mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags; + } + if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) { + mm->mm_txnid = 1; /* use metapage 1 */ + } + + my.mc_wlen[0] = env->me_psize * NUM_METAS; + my.mc_txn = txn; + rc = mdb_env_cwalk(&my, &root, 0); + if (rc == MDB_SUCCESS && root != new_root) { + rc = MDB_INCOMPATIBLE; /* page leak or corrupt DB */ + } + +finish: + if (rc) + my.mc_error = rc; + mdb_env_cthr_toggle(&my, 1 | MDB_EOF); + rc = THREAD_FINISH(thr); + mdb_txn_abort(txn); + +done: +#ifdef _WIN32 + if (my.mc_wbuf[0]) _aligned_free(my.mc_wbuf[0]); + if (my.mc_cond) CloseHandle(my.mc_cond); + if (my.mc_mutex) CloseHandle(my.mc_mutex); +#else + free(my.mc_wbuf[0]); + pthread_cond_destroy(&my.mc_cond); +done2: + pthread_mutex_destroy(&my.mc_mutex); +#endif + return rc ? rc : my.mc_error; +} + + /** Copy environment as-is. */ +static int ESECT +mdb_env_copyfd0(MDB_env *env, HANDLE fd) +{ + MDB_txn *txn = NULL; + mdb_mutexref_t wmutex = NULL; + int rc; + size_t wsize, w3; + char *ptr; +#ifdef _WIN32 + DWORD len, w2; +#define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) +#else + ssize_t len; + size_t w2; +#define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) +#endif + + /* Do the lock/unlock of the reader mutex before starting the + * write txn. Otherwise other read txns could block writers. + */ + rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc) + return rc; + + if (env->me_txns) { + /* We must start the actual read txn after blocking writers */ + mdb_txn_end(txn, MDB_END_RESET_TMP); + + /* Temporarily block writers until we snapshot the meta pages */ + wmutex = env->me_wmutex; + if (LOCK_MUTEX(rc, env, wmutex)) + goto leave; + + rc = mdb_txn_renew0(txn); + if (rc) { + UNLOCK_MUTEX(wmutex); + goto leave; + } + } + + wsize = env->me_psize * NUM_METAS; + ptr = env->me_map; + w2 = wsize; + while (w2 > 0) { + DO_WRITE(rc, fd, ptr, w2, len); + if (!rc) { + rc = ErrCode(); + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + w2 -= len; + continue; + } else { + /* Non-blocking or async handles are not supported */ + rc = EIO; + break; + } + } + if (wmutex) + UNLOCK_MUTEX(wmutex); + + if (rc) + goto leave; + + w3 = txn->mt_next_pgno * env->me_psize; + { + size_t fsize = 0; + if ((rc = mdb_fsize(env->me_fd, &fsize))) + goto leave; + if (w3 > fsize) + w3 = fsize; + } + wsize = w3 - wsize; + while (wsize > 0) { + if (wsize > MAX_WRITE) + w2 = MAX_WRITE; + else + w2 = wsize; + DO_WRITE(rc, fd, ptr, w2, len); + if (!rc) { + rc = ErrCode(); + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + wsize -= len; + continue; + } else { + rc = EIO; + break; + } + } + +leave: + mdb_txn_abort(txn); + return rc; +} + +int ESECT +mdb_env_copyfd2(MDB_env *env, HANDLE fd, unsigned int flags) +{ + if (flags & MDB_CP_COMPACT) + return mdb_env_copyfd1(env, fd); + else + return mdb_env_copyfd0(env, fd); +} + +int ESECT +mdb_env_copyfd(MDB_env *env, HANDLE fd) +{ + return mdb_env_copyfd2(env, fd, 0); +} + +int ESECT +mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags) +{ + int rc; + MDB_name fname; + HANDLE newfd = INVALID_HANDLE_VALUE; + + rc = mdb_fname_init(path, env->me_flags | MDB_NOLOCK, &fname); + if (rc == MDB_SUCCESS) { + rc = mdb_fopen(env, &fname, MDB_O_COPY, 0666, &newfd); + mdb_fname_destroy(fname); + } + if (rc == MDB_SUCCESS) { + rc = mdb_env_copyfd2(env, newfd, flags); + if (close(newfd) < 0 && rc == MDB_SUCCESS) + rc = ErrCode(); + } + return rc; +} + +int ESECT +mdb_env_copy(MDB_env *env, const char *path) +{ + return mdb_env_copy2(env, path, 0); +} + +int ESECT +mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff) +{ + if (flag & ~CHANGEABLE) + return EINVAL; + if (onoff) + env->me_flags |= flag; + else + env->me_flags &= ~flag; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_get_flags(MDB_env *env, unsigned int *arg) +{ + if (!env || !arg) + return EINVAL; + + *arg = env->me_flags & (CHANGEABLE|CHANGELESS); + return MDB_SUCCESS; +} + +int ESECT +mdb_env_set_userctx(MDB_env *env, void *ctx) +{ + if (!env) + return EINVAL; + env->me_userctx = ctx; + return MDB_SUCCESS; +} + +void * ESECT +mdb_env_get_userctx(MDB_env *env) +{ + return env ? env->me_userctx : NULL; +} + +int ESECT +mdb_env_set_assert(MDB_env *env, MDB_assert_func *func) +{ + if (!env) + return EINVAL; +#ifndef NDEBUG + env->me_assert_func = func; +#endif + return MDB_SUCCESS; +} + +int ESECT +mdb_env_get_path(MDB_env *env, const char **arg) +{ + if (!env || !arg) + return EINVAL; + + *arg = env->me_path; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg) +{ + if (!env || !arg) + return EINVAL; + + *arg = env->me_fd; + return MDB_SUCCESS; +} + +/** Common code for #mdb_stat() and #mdb_env_stat(). + * @param[in] env the environment to operate in. + * @param[in] db the #MDB_db record containing the stats to return. + * @param[out] arg the address of an #MDB_stat structure to receive the stats. + * @return 0, this function always succeeds. + */ +static int ESECT +mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg) +{ + arg->ms_psize = env->me_psize; + arg->ms_depth = db->md_depth; + arg->ms_branch_pages = db->md_branch_pages; + arg->ms_leaf_pages = db->md_leaf_pages; + arg->ms_overflow_pages = db->md_overflow_pages; + arg->ms_entries = db->md_entries; + + return MDB_SUCCESS; +} + +int ESECT +mdb_env_stat(MDB_env *env, MDB_stat *arg) +{ + MDB_meta *meta; + + if (env == NULL || arg == NULL) + return EINVAL; + + meta = mdb_env_pick_meta(env); + + return mdb_stat0(env, &meta->mm_dbs[MAIN_DBI], arg); +} + +int ESECT +mdb_env_info(MDB_env *env, MDB_envinfo *arg) +{ + MDB_meta *meta; + + if (env == NULL || arg == NULL) + return EINVAL; + + meta = mdb_env_pick_meta(env); + arg->me_mapaddr = meta->mm_address; + arg->me_last_pgno = meta->mm_last_pg; + arg->me_last_txnid = meta->mm_txnid; + + arg->me_mapsize = env->me_mapsize; + arg->me_maxreaders = env->me_maxreaders; + arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : 0; + return MDB_SUCCESS; +} + +/** Set the default comparison functions for a database. + * Called immediately after a database is opened to set the defaults. + * The user can then override them with #mdb_set_compare() or + * #mdb_set_dupsort(). + * @param[in] txn A transaction handle returned by #mdb_txn_begin() + * @param[in] dbi A database handle returned by #mdb_dbi_open() + */ +static void +mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi) +{ + uint16_t f = txn->mt_dbs[dbi].md_flags; + + txn->mt_dbxs[dbi].md_cmp = + (f & MDB_REVERSEKEY) ? mdb_cmp_memnr : + (f & MDB_INTEGERKEY) ? mdb_cmp_cint : mdb_cmp_memn; + + txn->mt_dbxs[dbi].md_dcmp = + !(f & MDB_DUPSORT) ? 0 : + ((f & MDB_INTEGERDUP) + ? ((f & MDB_DUPFIXED) ? mdb_cmp_int : mdb_cmp_cint) + : ((f & MDB_REVERSEDUP) ? mdb_cmp_memnr : mdb_cmp_memn)); +} + +int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi) +{ + MDB_val key, data; + MDB_dbi i; + MDB_cursor mc; + MDB_db dummy; + int rc, dbflag, exact; + unsigned int unused = 0, seq; + char *namedup; + size_t len; + + if (flags & ~VALID_FLAGS) + return EINVAL; + if (txn->mt_flags & MDB_TXN_BLOCKED) + return MDB_BAD_TXN; + + /* main DB? */ + if (!name) { + *dbi = MAIN_DBI; + if (flags & PERSISTENT_FLAGS) { + uint16_t f2 = flags & PERSISTENT_FLAGS; + /* make sure flag changes get committed */ + if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) { + txn->mt_dbs[MAIN_DBI].md_flags |= f2; + txn->mt_flags |= MDB_TXN_DIRTY; + } + } + mdb_default_cmp(txn, MAIN_DBI); + return MDB_SUCCESS; + } + + if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { + mdb_default_cmp(txn, MAIN_DBI); + } + + /* Is the DB already open? */ + len = strlen(name); + for (i=CORE_DBS; i<txn->mt_numdbs; i++) { + if (!txn->mt_dbxs[i].md_name.mv_size) { + /* Remember this free slot */ + if (!unused) unused = i; + continue; + } + if (len == txn->mt_dbxs[i].md_name.mv_size && + !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) { + *dbi = i; + return MDB_SUCCESS; + } + } + + /* If no free slot and max hit, fail */ + if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs) + return MDB_DBS_FULL; + + /* Cannot mix named databases with some mainDB flags */ + if (txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY)) + return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND; + + /* Find the DB info */ + dbflag = DB_NEW|DB_VALID|DB_USRVALID; + exact = 0; + key.mv_size = len; + key.mv_data = (void *)name; + mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); + rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact); + if (rc == MDB_SUCCESS) { + /* make sure this is actually a DB */ + MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); + if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA) + return MDB_INCOMPATIBLE; + } else if (! (rc == MDB_NOTFOUND && (flags & MDB_CREATE))) { + return rc; + } + + /* Done here so we cannot fail after creating a new DB */ + if ((namedup = strdup(name)) == NULL) + return ENOMEM; + + if (rc) { + /* MDB_NOTFOUND and MDB_CREATE: Create new DB */ + data.mv_size = sizeof(MDB_db); + data.mv_data = &dummy; + memset(&dummy, 0, sizeof(dummy)); + dummy.md_root = P_INVALID; + dummy.md_flags = flags & PERSISTENT_FLAGS; + WITH_CURSOR_TRACKING(mc, + rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA)); + dbflag |= DB_DIRTY; + } + + if (rc) { + free(namedup); + } else { + /* Got info, register DBI in this txn */ + unsigned int slot = unused ? unused : txn->mt_numdbs; + txn->mt_dbxs[slot].md_name.mv_data = namedup; + txn->mt_dbxs[slot].md_name.mv_size = len; + txn->mt_dbxs[slot].md_rel = NULL; + txn->mt_dbflags[slot] = dbflag; + /* txn-> and env-> are the same in read txns, use + * tmp variable to avoid undefined assignment + */ + seq = ++txn->mt_env->me_dbiseqs[slot]; + txn->mt_dbiseqs[slot] = seq; + + memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); + *dbi = slot; + mdb_default_cmp(txn, slot); + if (!unused) { + txn->mt_numdbs++; + } + } + + return rc; +} + +int ESECT +mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg) +{ + if (!arg || !TXN_DBI_EXIST(txn, dbi, DB_VALID)) + return EINVAL; + + if (txn->mt_flags & MDB_TXN_BLOCKED) + return MDB_BAD_TXN; + + if (txn->mt_dbflags[dbi] & DB_STALE) { + MDB_cursor mc; + MDB_xcursor mx; + /* Stale, must read the DB's root. cursor_init does it for us. */ + mdb_cursor_init(&mc, txn, dbi, &mx); + } + return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg); +} + +void mdb_dbi_close(MDB_env *env, MDB_dbi dbi) +{ + char *ptr; + if (dbi < CORE_DBS || dbi >= env->me_maxdbs) + return; + ptr = env->me_dbxs[dbi].md_name.mv_data; + /* If there was no name, this was already closed */ + if (ptr) { + env->me_dbxs[dbi].md_name.mv_data = NULL; + env->me_dbxs[dbi].md_name.mv_size = 0; + env->me_dbflags[dbi] = 0; + env->me_dbiseqs[dbi]++; + free(ptr); + } +} + +int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags) +{ + /* We could return the flags for the FREE_DBI too but what's the point? */ + if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; + return MDB_SUCCESS; +} + +/** Add all the DB's pages to the free list. + * @param[in] mc Cursor on the DB to free. + * @param[in] subs non-Zero to check for sub-DBs in this DB. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_drop0(MDB_cursor *mc, int subs) +{ + int rc; + + rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); + if (rc == MDB_SUCCESS) { + MDB_txn *txn = mc->mc_txn; + MDB_node *ni; + MDB_cursor mx; + unsigned int i; + + /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves. + * This also avoids any P_LEAF2 pages, which have no nodes. + * Also if the DB doesn't have sub-DBs and has no overflow + * pages, omit scanning leaves. + */ + if ((mc->mc_flags & C_SUB) || + (!subs && !mc->mc_db->md_overflow_pages)) + mdb_cursor_pop(mc); + + mdb_cursor_copy(mc, &mx); + while (mc->mc_snum > 0) { + MDB_page *mp = mc->mc_pg[mc->mc_top]; + unsigned n = NUMKEYS(mp); + if (IS_LEAF(mp)) { + for (i=0; i<n; i++) { + ni = NODEPTR(mp, i); + if (ni->mn_flags & F_BIGDATA) { + MDB_page *omp; + pgno_t pg; + memcpy(&pg, NODEDATA(ni), sizeof(pg)); + rc = mdb_page_get(mc, pg, &omp, NULL); + if (rc != 0) + goto done; + mdb_cassert(mc, IS_OVERFLOW(omp)); + rc = mdb_midl_append_range(&txn->mt_free_pgs, + pg, omp->mp_pages); + if (rc) + goto done; + mc->mc_db->md_overflow_pages -= omp->mp_pages; + if (!mc->mc_db->md_overflow_pages && !subs) + break; + } else if (subs && (ni->mn_flags & F_SUBDATA)) { + mdb_xcursor_init1(mc, ni); + rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); + if (rc) + goto done; + } + } + if (!subs && !mc->mc_db->md_overflow_pages) + goto pop; + } else { + if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0) + goto done; + for (i=0; i<n; i++) { + pgno_t pg; + ni = NODEPTR(mp, i); + pg = NODEPGNO(ni); + /* free it */ + mdb_midl_xappend(txn->mt_free_pgs, pg); + } + } + if (!mc->mc_top) + break; + mc->mc_ki[mc->mc_top] = i; + rc = mdb_cursor_sibling(mc, 1); + if (rc) { + if (rc != MDB_NOTFOUND) + goto done; + /* no more siblings, go back to beginning + * of previous level. + */ +pop: + mdb_cursor_pop(mc); + mc->mc_ki[0] = 0; + for (i=1; i<mc->mc_snum; i++) { + mc->mc_ki[i] = 0; + mc->mc_pg[i] = mx.mc_pg[i]; + } + } + } + /* free it */ + rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root); +done: + if (rc) + txn->mt_flags |= MDB_TXN_ERROR; + } else if (rc == MDB_NOTFOUND) { + rc = MDB_SUCCESS; + } + mc->mc_flags &= ~C_INITIALIZED; + return rc; +} + +int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) +{ + MDB_cursor *mc, *m2; + int rc; + + if ((unsigned)del > 1 || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + return EACCES; + + if (TXN_DBI_CHANGED(txn, dbi)) + return MDB_BAD_DBI; + + rc = mdb_cursor_open(txn, dbi, &mc); + if (rc) + return rc; + + rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT); + /* Invalidate the dropped DB's cursors */ + for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) + m2->mc_flags &= ~(C_INITIALIZED|C_EOF); + if (rc) + goto leave; + + /* Can't delete the main DB */ + if (del && dbi >= CORE_DBS) { + rc = mdb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); + if (!rc) { + txn->mt_dbflags[dbi] = DB_STALE; + mdb_dbi_close(txn->mt_env, dbi); + } else { + txn->mt_flags |= MDB_TXN_ERROR; + } + } else { + /* reset the DB record, mark it dirty */ + txn->mt_dbflags[dbi] |= DB_DIRTY; + txn->mt_dbs[dbi].md_depth = 0; + txn->mt_dbs[dbi].md_branch_pages = 0; + txn->mt_dbs[dbi].md_leaf_pages = 0; + txn->mt_dbs[dbi].md_overflow_pages = 0; + txn->mt_dbs[dbi].md_entries = 0; + txn->mt_dbs[dbi].md_root = P_INVALID; + + txn->mt_flags |= MDB_TXN_DIRTY; + } +leave: + mdb_cursor_close(mc); + return rc; +} + +int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) +{ + if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + txn->mt_dbxs[dbi].md_cmp = cmp; + return MDB_SUCCESS; +} + +int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) +{ + if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + txn->mt_dbxs[dbi].md_dcmp = cmp; + return MDB_SUCCESS; +} + +int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) +{ + if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + txn->mt_dbxs[dbi].md_rel = rel; + return MDB_SUCCESS; +} + +int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) +{ + if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + return EINVAL; + + txn->mt_dbxs[dbi].md_relctx = ctx; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_get_maxkeysize(MDB_env *env) +{ + return ENV_MAXKEY(env); +} + +int ESECT +mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) +{ + unsigned int i, rdrs; + MDB_reader *mr; + char buf[64]; + int rc = 0, first = 1; + + if (!env || !func) + return -1; + if (!env->me_txns) { + return func("(no reader locks)\n", ctx); + } + rdrs = env->me_txns->mti_numreaders; + mr = env->me_txns->mti_readers; + for (i=0; i<rdrs; i++) { + if (mr[i].mr_pid) { + txnid_t txnid = mr[i].mr_txnid; + sprintf(buf, txnid == (txnid_t)-1 ? + "%10d %"Z"x -\n" : "%10d %"Z"x %"Z"u\n", + (int)mr[i].mr_pid, (size_t)mr[i].mr_tid, txnid); + if (first) { + first = 0; + rc = func(" pid thread txnid\n", ctx); + if (rc < 0) + break; + } + rc = func(buf, ctx); + if (rc < 0) + break; + } + } + if (first) { + rc = func("(no active readers)\n", ctx); + } + return rc; +} + +/** Insert pid into list if not already present. + * return -1 if already present. + */ +static int ESECT +mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid) +{ + /* binary search of pid in list */ + unsigned base = 0; + unsigned cursor = 1; + int val = 0; + unsigned n = ids[0]; + + while( 0 < n ) { + unsigned pivot = n >> 1; + cursor = base + pivot + 1; + val = pid - ids[cursor]; + + if( val < 0 ) { + n = pivot; + + } else if ( val > 0 ) { + base = cursor; + n -= pivot + 1; + + } else { + /* found, so it's a duplicate */ + return -1; + } + } + + if( val > 0 ) { + ++cursor; + } + ids[0]++; + for (n = ids[0]; n > cursor; n--) + ids[n] = ids[n-1]; + ids[n] = pid; + return 0; +} + +int ESECT +mdb_reader_check(MDB_env *env, int *dead) +{ + if (!env) + return EINVAL; + if (dead) + *dead = 0; + return env->me_txns ? mdb_reader_check0(env, 0, dead) : MDB_SUCCESS; +} + +/** As #mdb_reader_check(). \b rlocked is set if caller locked #me_rmutex. */ +static int ESECT +mdb_reader_check0(MDB_env *env, int rlocked, int *dead) +{ + mdb_mutexref_t rmutex = rlocked ? NULL : env->me_rmutex; + unsigned int i, j, rdrs; + MDB_reader *mr; + MDB_PID_T *pids, pid; + int rc = MDB_SUCCESS, count = 0; + + rdrs = env->me_txns->mti_numreaders; + pids = malloc((rdrs+1) * sizeof(MDB_PID_T)); + if (!pids) + return ENOMEM; + pids[0] = 0; + mr = env->me_txns->mti_readers; + for (i=0; i<rdrs; i++) { + pid = mr[i].mr_pid; + if (pid && pid != env->me_pid) { + if (mdb_pid_insert(pids, pid) == 0) { + if (!mdb_reader_pid(env, Pidcheck, pid)) { + /* Stale reader found */ + j = i; + if (rmutex) { + if ((rc = LOCK_MUTEX0(rmutex)) != 0) { + if ((rc = mdb_mutex_failed(env, rmutex, rc))) + break; + rdrs = 0; /* the above checked all readers */ + } else { + /* Recheck, a new process may have reused pid */ + if (mdb_reader_pid(env, Pidcheck, pid)) + j = rdrs; + } + } + for (; j<rdrs; j++) + if (mr[j].mr_pid == pid) { + DPRINTF(("clear stale reader pid %u txn %"Z"d", + (unsigned) pid, mr[j].mr_txnid)); + mr[j].mr_pid = 0; + count++; + } + if (rmutex) + UNLOCK_MUTEX(rmutex); + } + } + } + } + free(pids); + if (dead) + *dead = count; + return rc; +} + +#ifdef MDB_ROBUST_SUPPORTED +/** Handle #LOCK_MUTEX0() failure. + * Try to repair the lock file if the mutex owner died. + * @param[in] env the environment handle + * @param[in] mutex LOCK_MUTEX0() mutex + * @param[in] rc LOCK_MUTEX0() error (nonzero) + * @return 0 on success with the mutex locked, or an error code on failure. + */ +static int ESECT +mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc) +{ + int rlocked, rc2; + MDB_meta *meta; + + if (rc == MDB_OWNERDEAD) { + /* We own the mutex. Clean up after dead previous owner. */ + rc = MDB_SUCCESS; + rlocked = (mutex == env->me_rmutex); + if (!rlocked) { + /* Keep mti_txnid updated, otherwise next writer can + * overwrite data which latest meta page refers to. + */ + meta = mdb_env_pick_meta(env); + env->me_txns->mti_txnid = meta->mm_txnid; + /* env is hosed if the dead thread was ours */ + if (env->me_txn) { + env->me_flags |= MDB_FATAL_ERROR; + env->me_txn = NULL; + rc = MDB_PANIC; + } + } + DPRINTF(("%cmutex owner died, %s", (rlocked ? 'r' : 'w'), + (rc ? "this process' env is hosed" : "recovering"))); + rc2 = mdb_reader_check0(env, rlocked, NULL); + if (rc2 == 0) + rc2 = mdb_mutex_consistent(mutex); + if (rc || (rc = rc2)) { + DPRINTF(("LOCK_MUTEX recovery failed, %s", mdb_strerror(rc))); + UNLOCK_MUTEX(mutex); + } + } else { +#ifdef _WIN32 + rc = ErrCode(); +#endif + DPRINTF(("LOCK_MUTEX failed, %s", mdb_strerror(rc))); + } + + return rc; +} +#endif /* MDB_ROBUST_SUPPORTED */ + +#if defined(_WIN32) +/** Convert \b src to new wchar_t[] string with room for \b xtra extra chars */ +static int ESECT +utf8_to_utf16(const char *src, MDB_name *dst, int xtra) +{ + int rc, need = 0; + wchar_t *result = NULL; + for (;;) { /* malloc result, then fill it in */ + need = MultiByteToWideChar(CP_UTF8, 0, src, -1, result, need); + if (!need) { + rc = ErrCode(); + free(result); + return rc; + } + if (!result) { + result = malloc(sizeof(wchar_t) * (need + xtra)); + if (!result) + return ENOMEM; + continue; + } + dst->mn_alloced = 1; + dst->mn_len = need - 1; + dst->mn_val = result; + return MDB_SUCCESS; + } +} +#endif /* defined(_WIN32) */ +/** @} */ diff --git a/contrib/lmdb/midl.c b/contrib/lmdb/midl.c new file mode 100644 index 0000000..1689c3a --- /dev/null +++ b/contrib/lmdb/midl.c @@ -0,0 +1,359 @@ +/** @file midl.c + * @brief ldap bdb back-end ID List functions */ +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software <http://www.openldap.org/>. + * + * Copyright 2000-2016 The OpenLDAP Foundation. + * Portions Copyright 2001-2017 Howard Chu, Symas Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * <http://www.OpenLDAP.org/license.html>. + */ + +#include <limits.h> +#include <string.h> +#include <stdlib.h> +#include <errno.h> +#include <sys/types.h> +#include "midl.h" + +/** @defgroup internal LMDB Internals + * @{ + */ +/** @defgroup idls ID List Management + * @{ + */ +#define CMP(x,y) ( (x) < (y) ? -1 : (x) > (y) ) + +unsigned mdb_midl_search( MDB_IDL ids, MDB_ID id ) +{ + /* + * binary search of id in ids + * if found, returns position of id + * if not found, returns first position greater than id + */ + unsigned base = 0; + unsigned cursor = 1; + int val = 0; + unsigned n = ids[0]; + + while( 0 < n ) { + unsigned pivot = n >> 1; + cursor = base + pivot + 1; + val = CMP( ids[cursor], id ); + + if( val < 0 ) { + n = pivot; + + } else if ( val > 0 ) { + base = cursor; + n -= pivot + 1; + + } else { + return cursor; + } + } + + if( val > 0 ) { + ++cursor; + } + return cursor; +} + +#if 0 /* superseded by append/sort */ +int mdb_midl_insert( MDB_IDL ids, MDB_ID id ) +{ + unsigned x, i; + + x = mdb_midl_search( ids, id ); + assert( x > 0 ); + + if( x < 1 ) { + /* internal error */ + return -2; + } + + if ( x <= ids[0] && ids[x] == id ) { + /* duplicate */ + assert(0); + return -1; + } + + if ( ++ids[0] >= MDB_IDL_DB_MAX ) { + /* no room */ + --ids[0]; + return -2; + + } else { + /* insert id */ + for (i=ids[0]; i>x; i--) + ids[i] = ids[i-1]; + ids[x] = id; + } + + return 0; +} +#endif + +MDB_IDL mdb_midl_alloc(int num) +{ + MDB_IDL ids = malloc((num+2) * sizeof(MDB_ID)); + if (ids) { + *ids++ = num; + *ids = 0; + } + return ids; +} + +void mdb_midl_free(MDB_IDL ids) +{ + if (ids) + free(ids-1); +} + +void mdb_midl_shrink( MDB_IDL *idp ) +{ + MDB_IDL ids = *idp; + if (*(--ids) > MDB_IDL_UM_MAX && + (ids = realloc(ids, (MDB_IDL_UM_MAX+2) * sizeof(MDB_ID)))) + { + *ids++ = MDB_IDL_UM_MAX; + *idp = ids; + } +} + +static int mdb_midl_grow( MDB_IDL *idp, int num ) +{ + MDB_IDL idn = *idp-1; + /* grow it */ + idn = realloc(idn, (*idn + num + 2) * sizeof(MDB_ID)); + if (!idn) + return ENOMEM; + *idn++ += num; + *idp = idn; + return 0; +} + +int mdb_midl_need( MDB_IDL *idp, unsigned num ) +{ + MDB_IDL ids = *idp; + num += ids[0]; + if (num > ids[-1]) { + num = (num + num/4 + (256 + 2)) & -256; + if (!(ids = realloc(ids-1, num * sizeof(MDB_ID)))) + return ENOMEM; + *ids++ = num - 2; + *idp = ids; + } + return 0; +} + +int mdb_midl_append( MDB_IDL *idp, MDB_ID id ) +{ + MDB_IDL ids = *idp; + /* Too big? */ + if (ids[0] >= ids[-1]) { + if (mdb_midl_grow(idp, MDB_IDL_UM_MAX)) + return ENOMEM; + ids = *idp; + } + ids[0]++; + ids[ids[0]] = id; + return 0; +} + +int mdb_midl_append_list( MDB_IDL *idp, MDB_IDL app ) +{ + MDB_IDL ids = *idp; + /* Too big? */ + if (ids[0] + app[0] >= ids[-1]) { + if (mdb_midl_grow(idp, app[0])) + return ENOMEM; + ids = *idp; + } + memcpy(&ids[ids[0]+1], &app[1], app[0] * sizeof(MDB_ID)); + ids[0] += app[0]; + return 0; +} + +int mdb_midl_append_range( MDB_IDL *idp, MDB_ID id, unsigned n ) +{ + MDB_ID *ids = *idp, len = ids[0]; + /* Too big? */ + if (len + n > ids[-1]) { + if (mdb_midl_grow(idp, n | MDB_IDL_UM_MAX)) + return ENOMEM; + ids = *idp; + } + ids[0] = len + n; + ids += len; + while (n) + ids[n--] = id++; + return 0; +} + +void mdb_midl_xmerge( MDB_IDL idl, MDB_IDL merge ) +{ + MDB_ID old_id, merge_id, i = merge[0], j = idl[0], k = i+j, total = k; + idl[0] = (MDB_ID)-1; /* delimiter for idl scan below */ + old_id = idl[j]; + while (i) { + merge_id = merge[i--]; + for (; old_id < merge_id; old_id = idl[--j]) + idl[k--] = old_id; + idl[k--] = merge_id; + } + idl[0] = total; +} + +/* Quicksort + Insertion sort for small arrays */ + +#define SMALL 8 +#define MIDL_SWAP(a,b) { itmp=(a); (a)=(b); (b)=itmp; } + +void +mdb_midl_sort( MDB_IDL ids ) +{ + /* Max possible depth of int-indexed tree * 2 items/level */ + int istack[sizeof(int)*CHAR_BIT * 2]; + int i,j,k,l,ir,jstack; + MDB_ID a, itmp; + + ir = (int)ids[0]; + l = 1; + jstack = 0; + for(;;) { + if (ir - l < SMALL) { /* Insertion sort */ + for (j=l+1;j<=ir;j++) { + a = ids[j]; + for (i=j-1;i>=1;i--) { + if (ids[i] >= a) break; + ids[i+1] = ids[i]; + } + ids[i+1] = a; + } + if (jstack == 0) break; + ir = istack[jstack--]; + l = istack[jstack--]; + } else { + k = (l + ir) >> 1; /* Choose median of left, center, right */ + MIDL_SWAP(ids[k], ids[l+1]); + if (ids[l] < ids[ir]) { + MIDL_SWAP(ids[l], ids[ir]); + } + if (ids[l+1] < ids[ir]) { + MIDL_SWAP(ids[l+1], ids[ir]); + } + if (ids[l] < ids[l+1]) { + MIDL_SWAP(ids[l], ids[l+1]); + } + i = l+1; + j = ir; + a = ids[l+1]; + for(;;) { + do i++; while(ids[i] > a); + do j--; while(ids[j] < a); + if (j < i) break; + MIDL_SWAP(ids[i],ids[j]); + } + ids[l+1] = ids[j]; + ids[j] = a; + jstack += 2; + if (ir-i+1 >= j-l) { + istack[jstack] = ir; + istack[jstack-1] = i; + ir = j-1; + } else { + istack[jstack] = j-1; + istack[jstack-1] = l; + l = i; + } + } + } +} + +unsigned mdb_mid2l_search( MDB_ID2L ids, MDB_ID id ) +{ + /* + * binary search of id in ids + * if found, returns position of id + * if not found, returns first position greater than id + */ + unsigned base = 0; + unsigned cursor = 1; + int val = 0; + unsigned n = (unsigned)ids[0].mid; + + while( 0 < n ) { + unsigned pivot = n >> 1; + cursor = base + pivot + 1; + val = CMP( id, ids[cursor].mid ); + + if( val < 0 ) { + n = pivot; + + } else if ( val > 0 ) { + base = cursor; + n -= pivot + 1; + + } else { + return cursor; + } + } + + if( val > 0 ) { + ++cursor; + } + return cursor; +} + +int mdb_mid2l_insert( MDB_ID2L ids, MDB_ID2 *id ) +{ + unsigned x, i; + + x = mdb_mid2l_search( ids, id->mid ); + + if( x < 1 ) { + /* internal error */ + return -2; + } + + if ( x <= ids[0].mid && ids[x].mid == id->mid ) { + /* duplicate */ + return -1; + } + + if ( ids[0].mid >= MDB_IDL_UM_MAX ) { + /* too big */ + return -2; + + } else { + /* insert id */ + ids[0].mid++; + for (i=(unsigned)ids[0].mid; i>x; i--) + ids[i] = ids[i-1]; + ids[x] = *id; + } + + return 0; +} + +int mdb_mid2l_append( MDB_ID2L ids, MDB_ID2 *id ) +{ + /* Too big? */ + if (ids[0].mid >= MDB_IDL_UM_MAX) { + return -2; + } + ids[0].mid++; + ids[ids[0].mid] = *id; + return 0; +} + +/** @} */ +/** @} */ diff --git a/contrib/lmdb/midl.h b/contrib/lmdb/midl.h new file mode 100644 index 0000000..df9414d --- /dev/null +++ b/contrib/lmdb/midl.h @@ -0,0 +1,186 @@ +/** @file midl.h + * @brief LMDB ID List header file. + * + * This file was originally part of back-bdb but has been + * modified for use in libmdb. Most of the macros defined + * in this file are unused, just left over from the original. + * + * This file is only used internally in libmdb and its definitions + * are not exposed publicly. + */ +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software <http://www.openldap.org/>. + * + * Copyright 2000-2016 The OpenLDAP Foundation. + * Portions Copyright 2001-2017 Howard Chu, Symas Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * <http://www.OpenLDAP.org/license.html>. + */ + +#ifndef _MDB_MIDL_H_ +#define _MDB_MIDL_H_ + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/** @defgroup internal LMDB Internals + * @{ + */ + +/** @defgroup idls ID List Management + * @{ + */ + /** A generic unsigned ID number. These were entryIDs in back-bdb. + * Preferably it should have the same size as a pointer. + */ +typedef size_t MDB_ID; + + /** An IDL is an ID List, a sorted array of IDs. The first + * element of the array is a counter for how many actual + * IDs are in the list. In the original back-bdb code, IDLs are + * sorted in ascending order. For libmdb IDLs are sorted in + * descending order. + */ +typedef MDB_ID *MDB_IDL; + +/* IDL sizes - likely should be even bigger + * limiting factors: sizeof(ID), thread stack size + */ +#define MDB_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */ +#define MDB_IDL_DB_SIZE (1<<MDB_IDL_LOGN) +#define MDB_IDL_UM_SIZE (1<<(MDB_IDL_LOGN+1)) + +#define MDB_IDL_DB_MAX (MDB_IDL_DB_SIZE-1) +#define MDB_IDL_UM_MAX (MDB_IDL_UM_SIZE-1) + +#define MDB_IDL_SIZEOF(ids) (((ids)[0]+1) * sizeof(MDB_ID)) +#define MDB_IDL_IS_ZERO(ids) ( (ids)[0] == 0 ) +#define MDB_IDL_CPY( dst, src ) (memcpy( dst, src, MDB_IDL_SIZEOF( src ) )) +#define MDB_IDL_FIRST( ids ) ( (ids)[1] ) +#define MDB_IDL_LAST( ids ) ( (ids)[(ids)[0]] ) + + /** Current max length of an #mdb_midl_alloc()ed IDL */ +#define MDB_IDL_ALLOCLEN( ids ) ( (ids)[-1] ) + + /** Append ID to IDL. The IDL must be big enough. */ +#define mdb_midl_xappend(idl, id) do { \ + MDB_ID *xidl = (idl), xlen = ++(xidl[0]); \ + xidl[xlen] = (id); \ + } while (0) + + /** Search for an ID in an IDL. + * @param[in] ids The IDL to search. + * @param[in] id The ID to search for. + * @return The index of the first ID greater than or equal to \b id. + */ +unsigned mdb_midl_search( MDB_IDL ids, MDB_ID id ); + + /** Allocate an IDL. + * Allocates memory for an IDL of the given size. + * @return IDL on success, NULL on failure. + */ +MDB_IDL mdb_midl_alloc(int num); + + /** Free an IDL. + * @param[in] ids The IDL to free. + */ +void mdb_midl_free(MDB_IDL ids); + + /** Shrink an IDL. + * Return the IDL to the default size if it has grown larger. + * @param[in,out] idp Address of the IDL to shrink. + */ +void mdb_midl_shrink(MDB_IDL *idp); + + /** Make room for num additional elements in an IDL. + * @param[in,out] idp Address of the IDL. + * @param[in] num Number of elements to make room for. + * @return 0 on success, ENOMEM on failure. + */ +int mdb_midl_need(MDB_IDL *idp, unsigned num); + + /** Append an ID onto an IDL. + * @param[in,out] idp Address of the IDL to append to. + * @param[in] id The ID to append. + * @return 0 on success, ENOMEM if the IDL is too large. + */ +int mdb_midl_append( MDB_IDL *idp, MDB_ID id ); + + /** Append an IDL onto an IDL. + * @param[in,out] idp Address of the IDL to append to. + * @param[in] app The IDL to append. + * @return 0 on success, ENOMEM if the IDL is too large. + */ +int mdb_midl_append_list( MDB_IDL *idp, MDB_IDL app ); + + /** Append an ID range onto an IDL. + * @param[in,out] idp Address of the IDL to append to. + * @param[in] id The lowest ID to append. + * @param[in] n Number of IDs to append. + * @return 0 on success, ENOMEM if the IDL is too large. + */ +int mdb_midl_append_range( MDB_IDL *idp, MDB_ID id, unsigned n ); + + /** Merge an IDL onto an IDL. The destination IDL must be big enough. + * @param[in] idl The IDL to merge into. + * @param[in] merge The IDL to merge. + */ +void mdb_midl_xmerge( MDB_IDL idl, MDB_IDL merge ); + + /** Sort an IDL. + * @param[in,out] ids The IDL to sort. + */ +void mdb_midl_sort( MDB_IDL ids ); + + /** An ID2 is an ID/pointer pair. + */ +typedef struct MDB_ID2 { + MDB_ID mid; /**< The ID */ + void *mptr; /**< The pointer */ +} MDB_ID2; + + /** An ID2L is an ID2 List, a sorted array of ID2s. + * The first element's \b mid member is a count of how many actual + * elements are in the array. The \b mptr member of the first element is unused. + * The array is sorted in ascending order by \b mid. + */ +typedef MDB_ID2 *MDB_ID2L; + + /** Search for an ID in an ID2L. + * @param[in] ids The ID2L to search. + * @param[in] id The ID to search for. + * @return The index of the first ID2 whose \b mid member is greater than or equal to \b id. + */ +unsigned mdb_mid2l_search( MDB_ID2L ids, MDB_ID id ); + + + /** Insert an ID2 into a ID2L. + * @param[in,out] ids The ID2L to insert into. + * @param[in] id The ID2 to insert. + * @return 0 on success, -1 if the ID was already present in the ID2L. + */ +int mdb_mid2l_insert( MDB_ID2L ids, MDB_ID2 *id ); + + /** Append an ID2 into a ID2L. + * @param[in,out] ids The ID2L to append into. + * @param[in] id The ID2 to append. + * @return 0 on success, -2 if the ID2L is too big. + */ +int mdb_mid2l_append( MDB_ID2L ids, MDB_ID2 *id ); + +/** @} */ +/** @} */ +#ifdef __cplusplus +} +#endif +#endif /* _MDB_MIDL_H_ */ diff --git a/contrib/murmurhash3/LICENSE b/contrib/murmurhash3/LICENSE new file mode 100644 index 0000000..feb9b11 --- /dev/null +++ b/contrib/murmurhash3/LICENSE @@ -0,0 +1,28 @@ +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer exclusive Copyright and Related Rights (defined below) upon the creator and subsequent owner(s) (each and all, an "owner") of an original work of authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for the purpose of contributing to a commons of creative, cultural and scientific works ("Commons") that the public can reliably and without fear of later claims of infringement build upon, modify, incorporate in other works, reuse and redistribute as freely as possible in any form whatsoever and for any purposes, including without limitation commercial purposes. These owners may contribute to the Commons to promote the ideal of a free culture and the further production of creative, cultural and scientific works, or to gain reputation or greater distribution for their Work in part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any expectation of additional consideration or compensation, the person associating CC0 with a Work (the "Affirmer"), to the extent that he or she is an owner of Copyright and Related Rights in the Work, voluntarily elects to apply CC0 to the Work and publicly distribute the Work under its terms, with knowledge of his or her Copyright and Related Rights in the Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be protected by copyright and related or neighboring rights ("Copyright and Related Rights"). Copyright and Related Rights include, but are not limited to, the following: + + the right to reproduce, adapt, distribute, perform, display, communicate, and translate a Work; + moral rights retained by the original author(s) and/or performer(s); + publicity and privacy rights pertaining to a person's image or likeness depicted in a Work; + rights protecting against unfair competition in regards to a Work, subject to the limitations in paragraph 4(a), below; + rights protecting the extraction, dissemination, use and reuse of data in a Work; + database rights (such as those arising under Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, and under any national implementation thereof, including any amended or successor version of such directive); and + other similar, equivalent or corresponding rights throughout the world based on applicable law or treaty, and any national implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention of, applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and unconditionally waives, abandons, and surrenders all of Affirmer's Copyright and Related Rights and associated claims and causes of action, whether now known or unknown (including existing as well as future claims and causes of action), in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each member of the public at large and to the detriment of Affirmer's heirs and successors, fully intending that such Waiver shall not be subject to revocation, rescission, cancellation, termination, or any other legal or equitable action to disrupt the quiet enjoyment of the Work by the public as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason be judged legally invalid or ineffective under applicable law, then the Waiver shall be preserved to the maximum extent permitted taking into account Affirmer's express Statement of Purpose. In addition, to the extent the Waiver is so judged Affirmer hereby grants to each affected person a royalty-free, non transferable, non sublicensable, non exclusive, irrevocable and unconditional license to exercise Affirmer's Copyright and Related Rights in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "License"). The License shall be deemed effective as of the date CC0 was applied by Affirmer to the Work. Should any part of the License for any reason be judged legally invalid or ineffective under applicable law, such partial invalidity or ineffectiveness shall not invalidate the remainder of the License, and in such case Affirmer hereby affirms that he or she will not (i) exercise any of his or her remaining Copyright and Related Rights in the Work or (ii) assert any associated claims and causes of action with respect to the Work, in either case contrary to Affirmer's express Statement of Purpose. + +4. Limitations and Disclaimers. + + No trademark or patent rights held by Affirmer are waived, abandoned, surrendered, licensed or otherwise affected by this document. + Affirmer offers the Work as-is and makes no representations or warranties of any kind concerning the Work, express, implied, statutory or otherwise, including without limitation warranties of title, merchantability, fitness for a particular purpose, non infringement, or the absence of latent or other defects, accuracy, or the present or absence of errors, whether or not discoverable, all to the greatest extent permissible under applicable law. + Affirmer disclaims responsibility for clearing rights of other persons that may apply to the Work or any use thereof, including without limitation any person's Copyright and Related Rights in the Work. Further, Affirmer disclaims responsibility for obtaining any necessary consents, permissions or other rights required for any use of the Work. + Affirmer understands and acknowledges that Creative Commons is not a party to this document and has no duty or obligation with respect to this CC0 or use of the Work. diff --git a/contrib/murmurhash3/murmurhash3.c b/contrib/murmurhash3/murmurhash3.c new file mode 100644 index 0000000..06ed7d8 --- /dev/null +++ b/contrib/murmurhash3/murmurhash3.c @@ -0,0 +1,74 @@ +/* This is MurmurHash3. The original C++ code was placed in the public domain + * by its author, Austin Appleby. */ + +#include "murmurhash3.h" + +static inline uint32_t fmix(uint32_t h) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +static inline uint32_t rotl32(uint32_t x, int8_t r) +{ + return (x << r) | (x >> (32 - r)); +} + +uint32_t hash(const char* data, size_t len_) +{ + const int len = (int) len_; + const int nblocks = len / 4; + + uint32_t h1 = 0xc062fb4a; + + uint32_t c1 = 0xcc9e2d51; + uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t*) (data + nblocks * 4); + + int i; + for(i = -nblocks; i; i++) + { + uint32_t k1 = blocks[i]; + + k1 *= c1; + k1 = rotl32(k1, 15); + k1 *= c2; + + h1 ^= k1; + h1 = rotl32(h1, 13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = rotl32(k1,15); k1 *= c2; h1 ^= k1; + } + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix(h1); + + return h1; +} diff --git a/contrib/murmurhash3/murmurhash3.h b/contrib/murmurhash3/murmurhash3.h new file mode 100644 index 0000000..b4507b3 --- /dev/null +++ b/contrib/murmurhash3/murmurhash3.h @@ -0,0 +1,6 @@ +#pragma once + +#include <stdlib.h> +#include <stdint.h> + +uint32_t hash(const char* data, size_t len); diff --git a/contrib/ucw/LICENSE b/contrib/ucw/LICENSE new file mode 100644 index 0000000..4362b49 --- /dev/null +++ b/contrib/ucw/LICENSE @@ -0,0 +1,502 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! diff --git a/contrib/ucw/alloc.h b/contrib/ucw/alloc.h new file mode 100644 index 0000000..668c707 --- /dev/null +++ b/contrib/ucw/alloc.h @@ -0,0 +1,36 @@ +/* + * UCW Library -- Generic allocators + * + * (c) 2014 Martin Mares <mj@ucw.cz> + */ + +#ifndef _UCW_ALLOC_H +#define _UCW_ALLOC_H + +/** + * This structure describes a generic allocator. It provides pointers + * to three functions, which handle the actual (re)allocations. + **/ +struct ucw_allocator { + void * (*alloc)(struct ucw_allocator *alloc, size_t size); + void * (*realloc)(struct ucw_allocator *alloc, void *ptr, size_t old_size, size_t new_size); + void (*free)(struct ucw_allocator *alloc, void *ptr); +}; + +/* alloc-std.c */ + +/** + * [[std]] + * This allocator uses <<basics:xmalloc()>>, <<basics:xrealloc()>> and <<basics:xfree()>>. The memory + * it allocates is left unitialized. + **/ +extern struct ucw_allocator ucw_allocator_std; + +/** + * [[zeroing]] + * This allocator uses <<basics:xmalloc()>>, <<basics:xrealloc()>> and <<basics:xfree()>>. All memory + * is zeroed upon allocation. + **/ +extern struct ucw_allocator ucw_allocator_zeroed; + +#endif diff --git a/contrib/ucw/config.h b/contrib/ucw/config.h new file mode 100644 index 0000000..5d3cb4f --- /dev/null +++ b/contrib/ucw/config.h @@ -0,0 +1,58 @@ +/* + * UCW Library -- Configuration-Dependent Definitions + * + * (c) 1997--2012 Martin Mares <mj@ucw.cz> + * (c) 2006 Robert Spalek <robert@ucw.cz> + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#ifndef _UCW_CONFIG_H +#define _UCW_CONFIG_H + +/* Default page size and pointer alignment */ +#ifndef CPU_PAGE_SIZE +#define CPU_PAGE_SIZE 4096 +#endif +#define CPU_STRUCT_ALIGN sizeof(void *) + +/* Tell libc we're going to use all extensions available */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +/* Types (based on standard C99 integers) */ + +#include <stddef.h> +#include <stdint.h> + +typedef uint8_t byte; /** Exactly 8 bits, unsigned **/ +typedef uint8_t u8; /** Exactly 8 bits, unsigned **/ +typedef int8_t s8; /** Exactly 8 bits, signed **/ +typedef uint16_t u16; /** Exactly 16 bits, unsigned **/ +typedef int16_t s16; /** Exactly 16 bits, signed **/ +typedef uint32_t u32; /** Exactly 32 bits, unsigned **/ +typedef int32_t s32; /** Exactly 32 bits, signed **/ +typedef uint64_t u64; /** Exactly 64 bits, unsigned **/ +typedef int64_t s64; /** Exactly 64 bits, signed **/ + + +#ifndef uint /* Redefining typedef is a C11 feature. */ +typedef unsigned int uint; /** A better pronounceable alias for `unsigned int` **/ +#define uint uint +#endif + +typedef s64 timestamp_t; /** Milliseconds since an unknown epoch **/ + +// FIXME: This should be removed soon +typedef uint uns; /** Backwards compatible alias for `uint' ***/ + +#ifdef CONFIG_UCW_LARGE_FILES +typedef s64 ucw_off_t; /** File position (either 32- or 64-bit, depending on `CONFIG_UCW_LARGE_FILES`). **/ +#else +typedef s32 ucw_off_t; +#endif + +#endif diff --git a/contrib/ucw/lib.h b/contrib/ucw/lib.h new file mode 100644 index 0000000..506f09b --- /dev/null +++ b/contrib/ucw/lib.h @@ -0,0 +1,125 @@ +/* + * The UCW Library -- Miscellaneous Functions + * + * (c) 1997--2014 Martin Mares <mj@ucw.cz> + * (c) 2005--2014 Tomas Valla <tom@ucw.cz> + * (c) 2006 Robert Spalek <robert@ucw.cz> + * (c) 2007 Pavel Charvat <pchar@ucw.cz> + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#ifndef _UCW_LIB_H +#define _UCW_LIB_H + +#include <stdarg.h> +#include <stdbool.h> +#include <stdlib.h> + +#ifdef CONFIG_UCW_CLEAN_ABI +#define assert_failed ucw_assert_failed +#define assert_failed_msg ucw_assert_failed_msg +#define assert_failed_noinfo ucw_assert_failed_noinfo +#define big_alloc ucw_big_alloc +#define big_alloc_zero ucw_big_alloc_zero +#define big_free ucw_big_free +#define die ucw_die +#define log_die_hook ucw_log_die_hook +#define log_file ucw_log_file +#define log_fork ucw_log_fork +#define log_init ucw_log_init +#define log_pid ucw_log_pid +#define log_title ucw_log_title +#define msg ucw_msg +#define page_alloc ucw_page_alloc +#define page_alloc_zero ucw_page_alloc_zero +#define page_free ucw_page_free +#define page_realloc ucw_page_realloc +#define random_max ucw_random_max +#define random_max_u64 ucw_random_max_u64 +#define random_u32 ucw_random_u32 +#define random_u64 ucw_random_u64 +#define vdie ucw_vdie +#define vmsg ucw_vmsg +#define xfree ucw_xfree +#define xmalloc ucw_xmalloc +#define xmalloc_zero ucw_xmalloc_zero +#define xrealloc ucw_xrealloc +#define xstrdup ucw_xstrdup +#endif + +/*** === Macros for handling structures, offsets and alignment ***/ + +#define CHECK_PTR_TYPE(x, type) ((x)-(type)(x) + (type)(x)) /** Check that a pointer @x is of type @type. Fail compilation if not. **/ +#define PTR_TO(s, i) &((s*)0)->i /** Return OFFSETOF() in form of a pointer. **/ +#define OFFSETOF(s, i) ((uint)offsetof(s, i)) /** Offset of item @i from the start of structure @s **/ +#define SKIP_BACK(s, i, p) ((s *)((char *)p - OFFSETOF(s, i))) /** Given a pointer @p to item @i of structure @s, return a pointer to the start of the struct. **/ + +/** Align an integer @s to the nearest higher multiple of @a (which should be a power of two) **/ +#define ALIGN_TO(s, a) (((s)+a-1)&~(a-1)) + +/** Align a pointer @p to the nearest higher multiple of @s. **/ +#define ALIGN_PTR(p, s) ((uintptr_t)(p) % (s) ? (typeof(p))((uintptr_t)(p) + (s) - (uintptr_t)(p) % (s)) : (p)) + +#define UNALIGNED_PART(ptr, type) (((uintptr_t) (ptr)) % sizeof(type)) + +/*** === Other utility macros ***/ + +#define MIN(a,b) (((a)<(b))?(a):(b)) /** Minimum of two numbers **/ +#define MAX(a,b) (((a)>(b))?(a):(b)) /** Maximum of two numbers **/ +#define CLAMP(x,min,max) ({ typeof(x) _t=x; (_t < min) ? min : (_t > max) ? max : _t; }) /** Clip a number @x to interval [@min,@max] **/ +#define ABS(x) ((x) < 0 ? -(x) : (x)) /** Absolute value **/ +#define ARRAY_SIZE(a) (sizeof(a)/sizeof(*(a))) /** The number of elements of an array **/ +#define STRINGIFY(x) #x /** Convert macro parameter to a string **/ +#define STRINGIFY_EXPANDED(x) STRINGIFY(x) /** Convert an expanded macro parameter to a string **/ +#define GLUE(x,y) x##y /** Glue two tokens together **/ +#define GLUE_(x,y) x##_##y /** Glue two tokens together, separating them by an underscore **/ + +#define COMPARE(x,y) do { if ((x)<(y)) return -1; if ((x)>(y)) return 1; } while(0) /** Numeric comparison function for qsort() **/ +#define REV_COMPARE(x,y) COMPARE(y,x) /** Reverse numeric comparison **/ +#define COMPARE_LT(x,y) do { if ((x)<(y)) return 1; if ((x)>(y)) return 0; } while(0) +#define COMPARE_GT(x,y) COMPARE_LT(y,x) + +#define ROL(x, bits) (((x) << (bits)) | ((uint)(x) >> (sizeof(uint)*8 - (bits)))) /** Bitwise rotation of an unsigned int to the left **/ +#define ROR(x, bits) (((uint)(x) >> (bits)) | ((x) << (sizeof(uint)*8 - (bits)))) /** Bitwise rotation of an unsigned int to the right **/ + +/*** === Shortcuts for GCC Extensions ***/ + +#ifdef __GNUC__ + +#include "ccan/compiler/compiler.h" +#define FORMAT_CHECK(x,y,z) __attribute__((format(x,y,z))) /** Checking of printf-like format strings **/ +#define likely(x) __builtin_expect((x),1) /** Use `if (likely(@x))` if @x is almost always true **/ +#define unlikely(x) __builtin_expect((x),0) /** Use `if (unlikely(@x))` to hint that @x is almost always false **/ + +#if __GNUC__ >= 4 || __GNUC__ == 3 && __GNUC_MINOR__ >= 3 +#define ALWAYS_INLINE inline __attribute__((always_inline)) /** Forcibly inline **/ +#define NO_INLINE __attribute__((noinline)) /** Forcibly uninline **/ +#else +#define ALWAYS_INLINE inline +#endif + +#if __GNUC__ >= 4 +#define LIKE_MALLOC __attribute__((malloc)) /** Function returns a "new" pointer **/ +#define SENTINEL_CHECK __attribute__((sentinel)) /** The last argument must be NULL **/ +#else +#define LIKE_MALLOC +#define SENTINEL_CHECK +#endif + +#else +#error This program requires the GNU C compiler. +#endif + +/*** + * [[logging]] + * + * === Basic logging functions (see <<log:,Logging>> and <ucw/log.h> for more) + ***/ + +#define DBG(x, ...) do { } while(0) +#define DBG_SPOT do { } while(0) +#define ASSERT(x) + +#endif diff --git a/contrib/ucw/mempool-fmt.c b/contrib/ucw/mempool-fmt.c new file mode 100644 index 0000000..6c93e1e --- /dev/null +++ b/contrib/ucw/mempool-fmt.c @@ -0,0 +1,100 @@ +/* + * UCW Library -- Memory Pools (Formatting) + * + * (c) 2005 Martin Mares <mj@ucw.cz> + * (c) 2007 Pavel Charvat <pchar@ucw.cz> + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#include <ucw/lib.h> +#include <ucw/mempool.h> + +#include <stdio.h> +#include <string.h> + +/* FIXME: migrate to Knot DNS version of mempools. */ +#pragma GCC diagnostic ignored "-Wpointer-arith" + +static char * +mp_vprintf_at(struct mempool *mp, size_t ofs, const char *fmt, va_list args) +{ + char *ret = mp_grow(mp, ofs + 1) + ofs; + va_list args2; + va_copy(args2, args); + int cnt = vsnprintf(ret, mp_avail(mp) - ofs, fmt, args2); + va_end(args2); + if (cnt < 0) + { + /* Our C library doesn't support C99 return value of vsnprintf, so we need to iterate */ + do + { + ret = mp_expand(mp) + ofs; + va_copy(args2, args); + cnt = vsnprintf(ret, mp_avail(mp) - ofs, fmt, args2); + va_end(args2); + } + while (cnt < 0); + } + else if ((uint)cnt >= mp_avail(mp) - ofs) + { + ret = mp_grow(mp, ofs + cnt + 1) + ofs; + va_copy(args2, args); + vsnprintf(ret, cnt + 1, fmt, args2); + va_end(args2); + } + mp_end(mp, ret + cnt + 1); + return ret - ofs; +} + +char * +mp_vprintf(struct mempool *mp, const char *fmt, va_list args) +{ + mp_start(mp, 1); + return mp_vprintf_at(mp, 0, fmt, args); +} + +char * +mp_printf(struct mempool *p, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + char *res = mp_vprintf(p, fmt, args); + va_end(args); + return res; +} + +char * +mp_vprintf_append(struct mempool *mp, char *ptr, const char *fmt, va_list args) +{ + size_t ofs = mp_open(mp, ptr); + ASSERT(ofs && !ptr[ofs - 1]); + return mp_vprintf_at(mp, ofs - 1, fmt, args); +} + +char * +mp_printf_append(struct mempool *mp, char *ptr, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + char *res = mp_vprintf_append(mp, ptr, fmt, args); + va_end(args); + return res; +} + +#ifdef TEST + +int main(void) +{ + struct mempool *mp = mp_new(64); + char *x = mp_printf(mp, "<Hello, %s!>", "World"); + fputs(x, stdout); + x = mp_printf_append(mp, x, "<Appended>"); + fputs(x, stdout); + x = mp_printf(mp, "<Hello, %50s!>\n", "World"); + fputs(x, stdout); + return 0; +} + +#endif diff --git a/contrib/ucw/mempool.c b/contrib/ucw/mempool.c new file mode 100644 index 0000000..129b733 --- /dev/null +++ b/contrib/ucw/mempool.c @@ -0,0 +1,601 @@ +/* + * UCW Library -- Memory Pools (One-Time Allocation) + * + * (c) 1997--2014 Martin Mares <mj@ucw.cz> + * (c) 2007--2015 Pavel Charvat <pchar@ucw.cz> + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#undef LOCAL_DEBUG + +#include <ucw/config.h> +#include <ucw/lib.h> +#include <ucw/alloc.h> +#include <ucw/mempool.h> + +#include <string.h> +#include <stdlib.h> + +/* FIXME: migrate to Knot DNS version of mempools. */ +#pragma GCC diagnostic ignored "-Wpointer-arith" + +#define MP_CHUNK_TAIL ALIGN_TO(sizeof(struct mempool_chunk), CPU_STRUCT_ALIGN) +#define MP_SIZE_MAX (SIZE_MAX - MP_CHUNK_TAIL - CPU_PAGE_SIZE) + +struct mempool_chunk { +#ifdef CONFIG_DEBUG + struct mempool *pool; // Can be useful when analysing coredump for memory leaks +#endif + struct mempool_chunk *next; + size_t size; +}; + +static size_t +mp_align_size(size_t size) +{ +#ifdef CONFIG_UCW_POOL_IS_MMAP + size = MAX(size, 64 + MP_CHUNK_TAIL); + return ALIGN_TO(size, CPU_PAGE_SIZE) - MP_CHUNK_TAIL; +#else + return ALIGN_TO(size, CPU_STRUCT_ALIGN); +#endif +} + +static void *mp_allocator_alloc(struct ucw_allocator *a, size_t size) +{ + struct mempool *mp = (struct mempool *) a; + return mp_alloc_fast(mp, size); +} + +static void *mp_allocator_realloc(struct ucw_allocator *a, void *ptr, size_t old_size, size_t new_size) +{ + if (new_size <= old_size) + return ptr; + + /* + * In the future, we might want to do something like mp_realloc(), + * but we have to check that it is indeed the last block in the pool. + */ + struct mempool *mp = (struct mempool *) a; + void *new = mp_alloc_fast(mp, new_size); + memcpy(new, ptr, old_size); + return new; +} + +static void mp_allocator_free(struct ucw_allocator *a UNUSED, void *ptr UNUSED) +{ + // Does nothing +} + +void +mp_init(struct mempool *pool, size_t chunk_size) +{ + chunk_size = mp_align_size(MAX(sizeof(struct mempool), chunk_size)); + *pool = (struct mempool) { + .allocator = { + .alloc = mp_allocator_alloc, + .realloc = mp_allocator_realloc, + .free = mp_allocator_free, + }, + .chunk_size = chunk_size, + .threshold = chunk_size >> 1, + .last_big = &pool->last_big + }; +} + +static void * +mp_new_big_chunk(struct mempool *pool, size_t size) +{ + struct mempool_chunk *chunk; + chunk = malloc(size + MP_CHUNK_TAIL); + if (!chunk) + return NULL; + chunk = (struct mempool_chunk *)((char *)chunk + size); + chunk->size = size; + if (pool) + pool->total_size += size + MP_CHUNK_TAIL; + return chunk; +} + +static void +mp_free_big_chunk(struct mempool *pool, struct mempool_chunk *chunk) +{ + pool->total_size -= chunk->size + MP_CHUNK_TAIL; + free((void *)chunk - chunk->size); +} + +static void * +mp_new_chunk(struct mempool *pool, size_t size) +{ +#ifdef CONFIG_UCW_POOL_IS_MMAP + struct mempool_chunk *chunk; + chunk = page_alloc(size + MP_CHUNK_TAIL) + size; + chunk->size = size; + if (pool) + pool->total_size += size + MP_CHUNK_TAIL; + return chunk; +#else + return mp_new_big_chunk(pool, size); +#endif +} + +static void +mp_free_chunk(struct mempool *pool, struct mempool_chunk *chunk) +{ +#ifdef CONFIG_UCW_POOL_IS_MMAP + pool->total_size -= chunk->size + MP_CHUNK_TAIL; + page_free((void *)chunk - chunk->size, chunk->size + MP_CHUNK_TAIL); +#else + mp_free_big_chunk(pool, chunk); +#endif +} + +struct mempool * +mp_new(size_t chunk_size) +{ + chunk_size = mp_align_size(MAX(sizeof(struct mempool), chunk_size)); + struct mempool_chunk *chunk = mp_new_chunk(NULL, chunk_size); + struct mempool *pool = (void *)chunk - chunk_size; + DBG("Creating mempool %p with %u bytes long chunks", pool, chunk_size); + chunk->next = NULL; +#ifdef CONFIG_DEBUG + chunk->pool = pool; +#endif + *pool = (struct mempool) { + .allocator = { + .alloc = mp_allocator_alloc, + .realloc = mp_allocator_realloc, + .free = mp_allocator_free, + }, + .state = { .free = { chunk_size - sizeof(*pool) }, .last = { chunk } }, + .chunk_size = chunk_size, + .threshold = chunk_size >> 1, + .last_big = &pool->last_big, + .total_size = chunk->size + MP_CHUNK_TAIL, + }; + return pool; +} + +static void +mp_free_chain(struct mempool *pool, struct mempool_chunk *chunk) +{ + while (chunk) + { + struct mempool_chunk *next = chunk->next; + mp_free_chunk(pool, chunk); + chunk = next; + } +} + +static void +mp_free_big_chain(struct mempool *pool, struct mempool_chunk *chunk) +{ + while (chunk) + { + struct mempool_chunk *next = chunk->next; + mp_free_big_chunk(pool, chunk); + chunk = next; + } +} + +void +mp_delete(struct mempool *pool) +{ + DBG("Deleting mempool %p", pool); + mp_free_big_chain(pool, pool->state.last[1]); + mp_free_chain(pool, pool->unused); + mp_free_chain(pool, pool->state.last[0]); // can contain the mempool structure +} + +void +mp_flush(struct mempool *pool) +{ + mp_free_big_chain(pool, pool->state.last[1]); + struct mempool_chunk *chunk, *next; + for (chunk = pool->state.last[0]; chunk && (void *)chunk - chunk->size != pool; chunk = next) + { + next = chunk->next; + chunk->next = pool->unused; + pool->unused = chunk; + } + pool->state.last[0] = chunk; + pool->state.free[0] = chunk ? chunk->size - sizeof(*pool) : 0; + pool->state.last[1] = NULL; + pool->state.free[1] = 0; + pool->state.next = NULL; + pool->last_big = &pool->last_big; +} + +static void +mp_stats_chain(struct mempool *pool, struct mempool_chunk *chunk, struct mempool_stats *stats, uint idx) +{ + while (chunk) + { + stats->chain_size[idx] += chunk->size + MP_CHUNK_TAIL; + stats->chain_count[idx]++; + if (idx < 2) + { + stats->used_size += chunk->size; + if ((byte *)pool == (byte *)chunk - chunk->size) + stats->used_size -= sizeof(*pool); + } + chunk = chunk->next; + } + stats->total_size += stats->chain_size[idx]; +} + +void +mp_stats(struct mempool *pool, struct mempool_stats *stats) +{ + bzero(stats, sizeof(*stats)); + mp_stats_chain(pool, pool->state.last[0], stats, 0); + mp_stats_chain(pool, pool->state.last[1], stats, 1); + mp_stats_chain(pool, pool->unused, stats, 2); + stats->used_size -= pool->state.free[0] + pool->state.free[1]; + ASSERT(stats->total_size == pool->total_size); + ASSERT(stats->used_size <= stats->total_size); +} + +u64 +mp_total_size(struct mempool *pool) +{ + return pool->total_size; +} + +void +mp_shrink(struct mempool *pool, u64 min_total_size) +{ + while (1) + { + struct mempool_chunk *chunk = pool->unused; + if (!chunk || pool->total_size - (chunk->size + MP_CHUNK_TAIL) < min_total_size) + break; + pool->unused = chunk->next; + mp_free_chunk(pool, chunk); + } +} + +void * +mp_alloc_internal(struct mempool *pool, size_t size) +{ + struct mempool_chunk *chunk; + if (size <= pool->threshold) + { + pool->idx = 0; + if (pool->unused) + { + chunk = pool->unused; + pool->unused = chunk->next; + } + else + { + chunk = mp_new_chunk(pool, pool->chunk_size); +#ifdef CONFIG_DEBUG + chunk->pool = pool; +#endif + } + chunk->next = pool->state.last[0]; + pool->state.last[0] = chunk; + pool->state.free[0] = pool->chunk_size - size; + return (void *)chunk - pool->chunk_size; + } + else if (likely(size <= MP_SIZE_MAX)) + { + pool->idx = 1; + size_t aligned = ALIGN_TO(size, CPU_STRUCT_ALIGN); + chunk = mp_new_big_chunk(pool, aligned); + chunk->next = pool->state.last[1]; +#ifdef CONFIG_DEBUG + chunk->pool = pool; +#endif + pool->state.last[1] = chunk; + pool->state.free[1] = aligned - size; + return pool->last_big = (void *)chunk - aligned; + } + else + return NULL; +} + +void * +mp_alloc(struct mempool *pool, size_t size) +{ + return mp_alloc_fast(pool, size); +} + +void * +mp_alloc_noalign(struct mempool *pool, size_t size) +{ + return mp_alloc_fast_noalign(pool, size); +} + +void * +mp_alloc_zero(struct mempool *pool, size_t size) +{ + void *ptr = mp_alloc_fast(pool, size); + bzero(ptr, size); + return ptr; +} + +void * +mp_start_internal(struct mempool *pool, size_t size) +{ + void *ptr = mp_alloc_internal(pool, size); + if (!ptr) + return NULL; + pool->state.free[pool->idx] += size; + return ptr; +} + +void * +mp_start(struct mempool *pool, size_t size) +{ + return mp_start_fast(pool, size); +} + +void * +mp_start_noalign(struct mempool *pool, size_t size) +{ + return mp_start_fast_noalign(pool, size); +} + +void * +mp_grow_internal(struct mempool *pool, size_t size) +{ + if (unlikely(size > MP_SIZE_MAX)) + return NULL; + size_t avail = mp_avail(pool); + void *ptr = mp_ptr(pool); + if (pool->idx) + { + size_t amortized = likely(avail <= MP_SIZE_MAX / 2) ? avail * 2 : MP_SIZE_MAX; + amortized = MAX(amortized, size); + amortized = ALIGN_TO(amortized, CPU_STRUCT_ALIGN); + struct mempool_chunk *chunk = pool->state.last[1], *next = chunk->next; + pool->total_size = pool->total_size - chunk->size + amortized; + void *nptr = realloc(ptr, amortized + MP_CHUNK_TAIL); + if (!nptr) + return NULL; + ptr = nptr; + chunk = ptr + amortized; + chunk->next = next; + chunk->size = amortized; + pool->state.last[1] = chunk; + pool->state.free[1] = amortized; + pool->last_big = ptr; + return ptr; + } + else + { + void *p = mp_start_internal(pool, size); + memcpy(p, ptr, avail); + return p; + } +} + +size_t +mp_open(struct mempool *pool, void *ptr) +{ + return mp_open_fast(pool, ptr); +} + +void * +mp_realloc(struct mempool *pool, void *ptr, size_t size) +{ + return mp_realloc_fast(pool, ptr, size); +} + +void * +mp_realloc_zero(struct mempool *pool, void *ptr, size_t size) +{ + size_t old_size = mp_open_fast(pool, ptr); + ptr = mp_grow(pool, size); + if (size > old_size) + bzero(ptr + old_size, size - old_size); + mp_end(pool, ptr + size); + return ptr; +} + +void * +mp_spread_internal(struct mempool *pool, void *p, size_t size) +{ + void *old = mp_ptr(pool); + void *new = mp_grow_internal(pool, p-old+size); + if (!new) { + return NULL; + } + return p-old+new; +} + +void +mp_restore(struct mempool *pool, struct mempool_state *state) +{ + struct mempool_chunk *chunk, *next; + struct mempool_state s = *state; + for (chunk = pool->state.last[0]; chunk != s.last[0]; chunk = next) + { + next = chunk->next; + chunk->next = pool->unused; + pool->unused = chunk; + } + for (chunk = pool->state.last[1]; chunk != s.last[1]; chunk = next) + { + next = chunk->next; + mp_free_big_chunk(pool, chunk); + } + pool->state = s; + pool->last_big = &pool->last_big; +} + +struct mempool_state * +mp_push(struct mempool *pool) +{ + struct mempool_state state = pool->state; + struct mempool_state *p = mp_alloc_fast(pool, sizeof(*p)); + *p = state; + pool->state.next = p; + return p; +} + +void +mp_pop(struct mempool *pool) +{ + ASSERT(pool->state.next); + mp_restore(pool, pool->state.next); +} + +#ifdef TEST + +#include <ucw/getopt.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +static void +fill(byte *ptr, uint len, uint magic) +{ + while (len--) + *ptr++ = (magic++ & 255); +} + +static void +check(byte *ptr, uint len, uint magic, uint align) +{ + ASSERT(!((uintptr_t)ptr & (align - 1))); + while (len--) + if (*ptr++ != (magic++ & 255)) + ASSERT(0); +} + +int main(int argc, char **argv) +{ + srand(time(NULL)); + log_init(argv[0]); + cf_def_file = NULL; + if (cf_getopt(argc, argv, CF_SHORT_OPTS, CF_NO_LONG_OPTS, NULL) >= 0 || argc != optind) + die("Invalid usage"); + + uint max = 1000, n = 0, m = 0, can_realloc = 0; + void *ptr[max]; + struct mempool_state *state[max]; + uint len[max], num[max], align[max]; + struct mempool *mp = mp_new(128), mp_static; + + for (uint i = 0; i < 5000; i++) + { + for (uint j = 0; j < n; j++) + check(ptr[j], len[j], j, align[j]); +#if 0 + DBG("free_small=%u free_big=%u idx=%u chunk_size=%u last_big=%p", mp->state.free[0], mp->state.free[1], mp->idx, mp->chunk_size, mp->last_big); + for (struct mempool_chunk *ch = mp->state.last[0]; ch; ch = ch->next) + DBG("small %p %p %p %d", (byte *)ch - ch->size, ch, ch + 1, ch->size); + for (struct mempool_chunk *ch = mp->state.last[1]; ch; ch = ch->next) + DBG("big %p %p %p %d", (byte *)ch - ch->size, ch, ch + 1, ch->size); +#endif + int r = random_max(100); + if ((r -= 1) < 0) + { + DBG("flush"); + mp_flush(mp); + n = m = 0; + } + else if ((r -= 1) < 0) + { + DBG("delete & new"); + mp_delete(mp); + if (random_max(2)) + mp = mp_new(random_max(0x1000) + 1); + else + mp = &mp_static, mp_init(mp, random_max(512) + 1); + n = m = 0; + } + else if (n < max && (r -= 30) < 0) + { + len[n] = random_max(0x2000); + DBG("alloc(%u)", len[n]); + align[n] = random_max(2) ? CPU_STRUCT_ALIGN : 1; + ptr[n] = (align[n] == 1) ? mp_alloc_fast_noalign(mp, len[n]) : mp_alloc_fast(mp, len[n]); + DBG(" -> (%p)", ptr[n]); + fill(ptr[n], len[n], n); + n++; + can_realloc = 1; + } + else if (n < max && (r -= 20) < 0) + { + len[n] = random_max(0x2000); + DBG("start(%u)", len[n]); + align[n] = random_max(2) ? CPU_STRUCT_ALIGN : 1; + ptr[n] = (align[n] == 1) ? mp_start_fast_noalign(mp, len[n]) : mp_start_fast(mp, len[n]); + DBG(" -> (%p)", ptr[n]); + fill(ptr[n], len[n], n); + n++; + can_realloc = 1; + goto grow; + } + else if (can_realloc && n && (r -= 10) < 0) + { + if (mp_open(mp, ptr[n - 1]) != len[n - 1]) + ASSERT(0); +grow: + { + uint k = n - 1; + for (uint i = random_max(4); i--; ) + { + uint l = len[k]; + len[k] = random_max(0x2000); + DBG("grow(%u)", len[k]); + ptr[k] = mp_grow(mp, len[k]); + DBG(" -> (%p)", ptr[k]); + check(ptr[k], MIN(l, len[k]), k, align[k]); + fill(ptr[k], len[k], k); + } + mp_end(mp, ptr[k] + len[k]); + } + } + else if (can_realloc && n && (r -= 20) < 0) + { + uint i = n - 1, l = len[i]; + DBG("realloc(%p, %u)", ptr[i], len[i]); + ptr[i] = mp_realloc(mp, ptr[i], len[i] = random_max(0x2000)); + DBG(" -> (%p, %u)", ptr[i], len[i]); + check(ptr[i], MIN(len[i], l), i, align[i]); + fill(ptr[i], len[i], i); + } + else if (m < max && (r -= 5) < 0) + { + DBG("push(%u)", m); + num[m] = n; + state[m++] = mp_push(mp); + can_realloc = 0; + } + else if (m && (r -= 2) < 0) + { + m--; + DBG("pop(%u)", m); + mp_pop(mp); + n = num[m]; + can_realloc = 0; + } + else if (m && (r -= 1) < 0) + { + uint i = random_max(m); + DBG("restore(%u)", i); + mp_restore(mp, state[i]); + n = num[m = i]; + can_realloc = 0; + } + else if (can_realloc && n && (r -= 5) < 0) + ASSERT(mp_size(mp, ptr[n - 1]) == len[n - 1]); + else + { + struct mempool_stats stats; + mp_stats(mp, &stats); + } + } + + mp_delete(mp); + return 0; +} + +#endif diff --git a/contrib/ucw/mempool.h b/contrib/ucw/mempool.h new file mode 100644 index 0000000..8cb1d57 --- /dev/null +++ b/contrib/ucw/mempool.h @@ -0,0 +1,565 @@ +/* + * UCW Library -- Memory Pools + * + * (c) 1997--2015 Martin Mares <mj@ucw.cz> + * (c) 2007 Pavel Charvat <pchar@ucw.cz> + * + * This software may be freely distributed and used according to the terms + * of the GNU Lesser General Public License. + */ + +#ifndef _UCW_POOLS_H +#define _UCW_POOLS_H + +#include <ucw/alloc.h> +#include <ucw/config.h> +#include <ucw/lib.h> +#include <string.h> + +#ifdef CONFIG_UCW_CLEAN_ABI +#define mp_alloc ucw_mp_alloc +#define mp_alloc_internal ucw_mp_alloc_internal +#define mp_alloc_noalign ucw_mp_alloc_noalign +#define mp_alloc_zero ucw_mp_alloc_zero +#define mp_delete ucw_mp_delete +#define mp_flush ucw_mp_flush +#define mp_grow_internal ucw_mp_grow_internal +#define mp_init ucw_mp_init +#define mp_memdup ucw_mp_memdup +#define mp_multicat ucw_mp_multicat +#define mp_new ucw_mp_new +#define mp_open ucw_mp_open +#define mp_pop ucw_mp_pop +#define mp_printf ucw_mp_printf +#define mp_printf_append ucw_mp_printf_append +#define mp_push ucw_mp_push +#define mp_realloc ucw_mp_realloc +#define mp_realloc_zero ucw_mp_realloc_zero +#define mp_restore ucw_mp_restore +#define mp_shrink ucw_mp_shrink +#define mp_spread_internal ucw_mp_spread_internal +#define mp_start ucw_mp_start +#define mp_start_internal ucw_mp_start_internal +#define mp_start_noalign ucw_mp_start_noalign +#define mp_stats ucw_mp_stats +#define mp_str_from_mem ucw_mp_str_from_mem +#define mp_strdup ucw_mp_strdup +#define mp_strjoin ucw_mp_strjoin +#define mp_total_size ucw_mp_total_size +#define mp_vprintf ucw_mp_vprintf +#define mp_vprintf_append ucw_mp_vprintf_append +#endif + +/*** + * [[defs]] + * Definitions + * ----------- + ***/ + +/** + * Memory pool state (see @mp_push(), ...). + * You should use this one as an opaque handle only, the insides are internal. + **/ +struct mempool_state { + size_t free[2]; + void *last[2]; + struct mempool_state *next; +}; + +/** + * Memory pool. + * You should use this one as an opaque handle only, the insides are internal. + **/ +struct mempool { + struct ucw_allocator allocator; // This must be the first element + struct mempool_state state; + void *unused, *last_big; + size_t chunk_size, threshold; + uint idx; + u64 total_size; +}; + +struct mempool_stats { /** Mempool statistics. See @mp_stats(). **/ + u64 total_size; /* Real allocated size in bytes */ + u64 used_size; /* Estimated size allocated from mempool to application */ + uint chain_count[3]; /* Number of allocated chunks in small/big/unused chains */ + u64 chain_size[3]; /* Size of allocated chunks in small/big/unused chains */ +}; + +/*** + * [[basic]] + * Basic manipulation + * ------------------ + ***/ + +/** + * Initialize a given mempool structure. + * @chunk_size must be in the interval `[1, SIZE_MAX / 2]`. + * It will allocate memory by this large chunks and take + * memory to satisfy requests from them. + * + * Memory pools can be treated as <<trans:respools,resources>>, see <<trans:res_mempool()>>. + **/ +void mp_init(struct mempool *pool, size_t chunk_size); + +/** + * Allocate and initialize a new memory pool. + * See @mp_init() for @chunk_size limitations. + * + * The new mempool structure is allocated on the new mempool. + * + * Memory pools can be treated as <<trans:respools,resources>>, see <<trans:res_mempool()>>. + **/ +struct mempool *mp_new(size_t chunk_size); + +/** + * Cleanup mempool initialized by mp_init or mp_new. + * Frees all the memory allocated by this mempool and, + * if created by @mp_new(), the @pool itself. + **/ +void mp_delete(struct mempool *pool); + +/** + * Frees all data on a memory pool, but leaves it working. + * It can keep some of the chunks allocated to serve + * further allocation requests. Leaves the @pool alive, + * even if it was created with @mp_new(). + **/ +void mp_flush(struct mempool *pool); + +/** + * Compute some statistics for debug purposes. + * See the definition of the <<struct_mempool_stats,mempool_stats structure>>. + * This function scans the chunk list, so it can be slow. If you are interested + * in total memory consumption only, mp_total_size() is faster. + **/ +void mp_stats(struct mempool *pool, struct mempool_stats *stats); + +/** + * Return how many bytes were allocated by the pool, including unused parts + * of chunks. This function runs in constant time. + **/ +u64 mp_total_size(struct mempool *pool); + +/** + * Release unused chunks of memory reserved for further allocation + * requests, but stop if mp_total_size() would drop below @min_total_size. + **/ +void mp_shrink(struct mempool *pool, u64 min_total_size); + +/*** + * [[alloc]] + * Allocation routines + * ------------------- + ***/ + +/* For internal use only, do not call directly */ +void *mp_alloc_internal(struct mempool *pool, size_t size) LIKE_MALLOC; + +/** + * The function allocates new @size bytes on a given memory pool. + * If the @size is zero, the resulting pointer is undefined, + * but it may be safely reallocated or used as the parameter + * to other functions below. + * + * The resulting pointer is always aligned to a multiple of + * `CPU_STRUCT_ALIGN` bytes and this condition remains true also + * after future reallocations. + **/ +void *mp_alloc(struct mempool *pool, size_t size); + +/** + * The same as @mp_alloc(), but the result may be unaligned. + **/ +void *mp_alloc_noalign(struct mempool *pool, size_t size); + +/** + * The same as @mp_alloc(), but fills the newly allocated memory with zeroes. + **/ +void *mp_alloc_zero(struct mempool *pool, size_t size); + +/** + * Inlined version of @mp_alloc(). + **/ +static inline void *mp_alloc_fast(struct mempool *pool, size_t size) +{ + size_t avail = pool->state.free[0] & ~(size_t)(CPU_STRUCT_ALIGN - 1); + if (size <= avail) + { + pool->state.free[0] = avail - size; + return (byte *)pool->state.last[0] - avail; + } + else + return mp_alloc_internal(pool, size); +} + +/** + * Inlined version of @mp_alloc_noalign(). + **/ +static inline void *mp_alloc_fast_noalign(struct mempool *pool, size_t size) +{ + if (size <= pool->state.free[0]) + { + void *ptr = (byte *)pool->state.last[0] - pool->state.free[0]; + pool->state.free[0] -= size; + return ptr; + } + else + return mp_alloc_internal(pool, size); +} + +/** + * Return a generic allocator representing the given mempool. + **/ +static inline struct ucw_allocator *mp_get_allocator(struct mempool *mp) +{ + return &mp->allocator; +} + +/*** + * [[gbuf]] + * Growing buffers + * --------------- + * + * You do not need to know, how a buffer will need to be large, + * you can grow it incrementally to needed size. You can grow only + * one buffer at a time on a given mempool. + * + * Similar functionality is provided by <<growbuf:,growing buffes>> module. + ***/ + +/* For internal use only, do not call directly */ +void *mp_start_internal(struct mempool *pool, size_t size) LIKE_MALLOC; +void *mp_grow_internal(struct mempool *pool, size_t size); +void *mp_spread_internal(struct mempool *pool, void *p, size_t size); + +static inline uint mp_idx(struct mempool *pool, void *ptr) +{ + return ptr == pool->last_big; +} + +/** + * Open a new growing buffer (at least @size bytes long). + * If the @size is zero, the resulting pointer is undefined, + * but it may be safely reallocated or used as the parameter + * to other functions below. + * + * The resulting pointer is always aligned to a multiple of + * `CPU_STRUCT_ALIGN` bytes and this condition remains true also + * after future reallocations. There is an unaligned version as well. + * + * Keep in mind that you can't make any other pool allocations + * before you "close" the growing buffer with @mp_end(). + */ +void *mp_start(struct mempool *pool, size_t size); +void *mp_start_noalign(struct mempool *pool, size_t size); + +/** + * Inlined version of @mp_start(). + **/ +static inline void *mp_start_fast(struct mempool *pool, size_t size) +{ + size_t avail = pool->state.free[0] & ~(size_t)(CPU_STRUCT_ALIGN - 1); + if (size <= avail) + { + pool->idx = 0; + pool->state.free[0] = avail; + return (byte *)pool->state.last[0] - avail; + } + else + return mp_start_internal(pool, size); +} + +/** + * Inlined version of @mp_start_noalign(). + **/ +static inline void *mp_start_fast_noalign(struct mempool *pool, size_t size) +{ + if (size <= pool->state.free[0]) + { + pool->idx = 0; + return (byte *)pool->state.last[0] - pool->state.free[0]; + } + else + return mp_start_internal(pool, size); +} + +/** + * Return start pointer of the growing buffer allocated by latest @mp_start() or a similar function. + **/ +static inline void *mp_ptr(struct mempool *pool) +{ + return (byte *)pool->state.last[pool->idx] - pool->state.free[pool->idx]; +} + +/** + * Return the number of bytes available for extending the growing buffer. + * (Before a reallocation will be needed). + **/ +static inline size_t mp_avail(struct mempool *pool) +{ + return pool->state.free[pool->idx]; +} + +/** + * Grow the buffer allocated by @mp_start() to be at least @size bytes long + * (@size may be less than @mp_avail(), even zero). Reallocated buffer may + * change its starting position. The content will be unchanged to the minimum + * of the old and new sizes; newly allocated memory will be uninitialized. + * Multiple calls to mp_grow() have amortized linear cost wrt. the maximum value of @size. */ +static inline void *mp_grow(struct mempool *pool, size_t size) +{ + return (size <= mp_avail(pool)) ? mp_ptr(pool) : mp_grow_internal(pool, size); +} + +/** + * Grow the buffer by at least one byte -- equivalent to <<mp_grow(),`mp_grow`>>`(@pool, @mp_avail(pool) + 1)`. + **/ +static inline void *mp_expand(struct mempool *pool) +{ + return mp_grow_internal(pool, mp_avail(pool) + 1); +} + +/** + * Ensure that there is at least @size bytes free after @p, + * if not, reallocate and adjust @p. + **/ +static inline void *mp_spread(struct mempool *pool, void *p, size_t size) +{ + return (((size_t)((byte *)pool->state.last[pool->idx] - (byte *)p) >= size) ? p : mp_spread_internal(pool, p, size)); +} + +/** + * Append a character to the growing buffer. Called with @p pointing after + * the last byte in the buffer, returns a pointer after the last byte + * of the new (possibly reallocated) buffer. + **/ +static inline char *mp_append_char(struct mempool *pool, char *p, uint c) +{ + p = mp_spread(pool, p, 1); + *p++ = c; + return p; +} + +/** + * Append a memory block to the growing buffer. Called with @p pointing after + * the last byte in the buffer, returns a pointer after the last byte + * of the new (possibly reallocated) buffer. + **/ +static inline void *mp_append_block(struct mempool *pool, void *p, const void *block, size_t size) +{ + char *q = mp_spread(pool, p, size); + memcpy(q, block, size); + return q + size; +} + +/** + * Append a string to the growing buffer. Called with @p pointing after + * the last byte in the buffer, returns a pointer after the last byte + * of the new (possibly reallocated) buffer. + **/ +static inline void *mp_append_string(struct mempool *pool, void *p, const char *str) +{ + return mp_append_block(pool, p, str, strlen(str)); +} + +/** + * Close the growing buffer. The @end must point just behind the data, you want to keep + * allocated (so it can be in the interval `[@mp_ptr(@pool), @mp_ptr(@pool) + @mp_avail(@pool)]`). + * Returns a pointer to the beginning of the just closed block. + **/ +static inline void *mp_end(struct mempool *pool, void *end) +{ + void *p = mp_ptr(pool); + pool->state.free[pool->idx] = (byte *)pool->state.last[pool->idx] - (byte *)end; + return p; +} + +/** + * Close the growing buffer as a string. That is, append a zero byte and call mp_end(). + **/ +static inline char *mp_end_string(struct mempool *pool, void *end) +{ + end = mp_append_char(pool, end, 0); + return mp_end(pool, end); +} + +/** + * Return size in bytes of the last allocated memory block (with @mp_alloc() or @mp_end()). + **/ +static inline size_t mp_size(struct mempool *pool, void *ptr) +{ + uint idx = mp_idx(pool, ptr); + return ((byte *)pool->state.last[idx] - (byte *)ptr) - pool->state.free[idx]; +} + +/** + * Open the last memory block (allocated with @mp_alloc() or @mp_end()) + * for growing and return its size in bytes. The contents and the start pointer + * remain unchanged. Do not forget to call @mp_end() to close it. + **/ +size_t mp_open(struct mempool *pool, void *ptr); + +/** + * Inlined version of @mp_open(). + **/ +static inline size_t mp_open_fast(struct mempool *pool, void *ptr) +{ + pool->idx = mp_idx(pool, ptr); + size_t size = ((byte *)pool->state.last[pool->idx] - (byte *)ptr) - pool->state.free[pool->idx]; + pool->state.free[pool->idx] += size; + return size; +} + +/** + * Reallocate the last memory block (allocated with @mp_alloc() or @mp_end()) + * to the new @size. Behavior is similar to @mp_grow(), but the resulting + * block is closed. + **/ +void *mp_realloc(struct mempool *pool, void *ptr, size_t size); + +/** + * The same as @mp_realloc(), but fills the additional bytes (if any) with zeroes. + **/ +void *mp_realloc_zero(struct mempool *pool, void *ptr, size_t size); + +/** + * Inlined version of @mp_realloc(). + **/ +static inline void *mp_realloc_fast(struct mempool *pool, void *ptr, size_t size) +{ + mp_open_fast(pool, ptr); + ptr = mp_grow(pool, size); + mp_end(pool, (byte *)ptr + size); + return ptr; +} + +/*** + * [[store]] + * Storing and restoring state + * --------------------------- + * + * Mempools can remember history of what was allocated and return back + * in time. + ***/ + +/** + * Save the current state of a memory pool. + * Do not call this function with an opened growing buffer. + **/ +static inline void mp_save(struct mempool *pool, struct mempool_state *state) +{ + *state = pool->state; + pool->state.next = state; +} + +/** + * Save the current state to a newly allocated mempool_state structure. + * Do not call this function with an opened growing buffer. + **/ +struct mempool_state *mp_push(struct mempool *pool); + +/** + * Restore the state saved by @mp_save() or @mp_push() and free all + * data allocated after that point (including the state structure itself). + * You can't reallocate the last memory block from the saved state. + **/ +void mp_restore(struct mempool *pool, struct mempool_state *state); + +/** + * Inlined version of @mp_restore(). + **/ +static inline void mp_restore_fast(struct mempool *pool, struct mempool_state *state) +{ + if (pool->state.last[0] != state->last[0] || pool->state.last[1] != state->last[1]) + mp_restore(pool, state); + else + { + pool->state = *state; + pool->last_big = &pool->last_big; + } +} + +/** + * Restore the state saved by the last call to @mp_push(). + * @mp_pop() and @mp_push() works as a stack so you can push more states safely. + **/ +void mp_pop(struct mempool *pool); + + +/*** + * [[string]] + * String operations + * ----------------- + ***/ + +char *mp_strdup(struct mempool *, const char *) LIKE_MALLOC; /** Makes a copy of a string on a mempool. Returns NULL for NULL string. **/ +void *mp_memdup(struct mempool *, const void *, size_t) LIKE_MALLOC; /** Makes a copy of a memory block on a mempool. **/ +/** + * Concatenates all passed strings. The last parameter must be NULL. + * This will concatenate two strings: + * + * char *message = mp_multicat(pool, "hello ", "world", NULL); + **/ +char *mp_multicat(struct mempool *, ...) LIKE_MALLOC SENTINEL_CHECK; +/** + * Concatenates two strings and stores result on @mp. + */ +static inline char *LIKE_MALLOC mp_strcat(struct mempool *mp, const char *x, const char *y) +{ + return mp_multicat(mp, x, y, NULL); +} +/** + * Join strings and place @sep between each two neighboring. + * @p is the mempool to provide memory, @a is array of strings and @n + * tells how many there is of them. + **/ +char *mp_strjoin(struct mempool *p, char **a, uint n, uint sep) LIKE_MALLOC; +/** + * Convert memory block to a string. Makes a copy of the given memory block + * in the mempool @p, adding an extra terminating zero byte at the end. + **/ +char *mp_str_from_mem(struct mempool *p, const void *mem, size_t len) LIKE_MALLOC; + + +/*** + * [[format]] + * Formatted output + * --------------- + ***/ + +/** + * printf() into a in-memory string, allocated on the memory pool. + **/ +char *mp_printf(struct mempool *mp, const char *fmt, ...) FORMAT_CHECK(printf,2,3) LIKE_MALLOC; +/** + * Like @mp_printf(), but uses `va_list` for parameters. + **/ +char *mp_vprintf(struct mempool *mp, const char *fmt, va_list args) LIKE_MALLOC; +/** + * Like @mp_printf(), but it appends the data at the end of string + * pointed to by @ptr. The string is @mp_open()ed, so you have to + * provide something that can be. + * + * Returns pointer to the beginning of the string (the pointer may have + * changed due to reallocation). + * + * In some versions of LibUCW, this function was called mp_append_printf(). However, + * this name turned out to be confusing -- unlike other appending functions, this one is + * not called on an opened growing buffer. The old name will be preserved for backward + * compatibility for the time being. + **/ +char *mp_printf_append(struct mempool *mp, char *ptr, const char *fmt, ...) FORMAT_CHECK(printf,3,4); +#define mp_append_printf mp_printf_append +/** + * Like @mp_printf_append(), but uses `va_list` for parameters. + * + * In some versions of LibUCW, this function was called mp_append_vprintf(). However, + * this name turned out to be confusing -- unlike other appending functions, this one is + * not called on an opened growing buffer. The old name will be preserved for backward + * compatibility for the time being. + **/ +char *mp_vprintf_append(struct mempool *mp, char *ptr, const char *fmt, va_list args); +#define mp_append_vprintf mp_vprintf_append + +#endif diff --git a/contrib/wire.h b/contrib/wire.h new file mode 100644 index 0000000..b02296e --- /dev/null +++ b/contrib/wire.h @@ -0,0 +1,175 @@ +/* Copyright (C) 2011-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ +/*! + * \file + * + * \brief Wire integer operations. + * + * \addtogroup contrib + * @{ + */ + +#pragma once + +#include <stdint.h> +#include <string.h> + +#if defined(__linux__) || defined(__gnu_hurd__) || \ + (defined(__FreeBSD_kernel__) && defined(__GLIBC__)) +# include <endian.h> +# ifndef be64toh +# include <arpa/inet.h> +# include <byteswap.h> +# if BYTE_ORDER == LITTLE_ENDIAN +# define be16toh(x) ntohs(x) +# define be32toh(x) ntohl(x) +# define be64toh(x) bswap_64 (x) +# define le16toh(x) (x) +# define le32toh(x) (x) +# define le64toh(x) (x) +# else +# define be16toh(x) (x) +# define be32toh(x) (x) +# define be64toh(x) (x) +# define le16toh(x) ntohs(x) +# define le32toh(x) ntohl(x) +# define le64toh(x) bswap_64 (x) +# endif +# endif +#elif defined(__FreeBSD__) || defined(__NetBSD__) +# include <sys/endian.h> +#elif defined(__OpenBSD__) +# include <endian.h> +#elif defined(__APPLE__) +# include <libkern/OSByteOrder.h> +# define be16toh(x) OSSwapBigToHostInt16(x) +# define be32toh(x) OSSwapBigToHostInt32(x) +# define be64toh(x) OSSwapBigToHostInt64(x) +# define htobe16(x) OSSwapHostToBigInt16(x) +# define htobe32(x) OSSwapHostToBigInt32(x) +# define htobe64(x) OSSwapHostToBigInt64(x) +# define le16toh(x) OSSwapLittleToHostInt16(x) +# define le32toh(x) OSSwapLittleToHostInt32(x) +# define le64toh(x) OSSwapLittleToHostInt64(x) +# define htole16(x) OSSwapHostToLittleInt16(x) +# define htole32(x) OSSwapHostToLittleInt32(x) +# define htole64(x) OSSwapHostToLittleInt64(x) +#endif + +/*! + * \brief Reads 2 bytes from the wireformat data. + * + * \param pos Data to read the 2 bytes from. + * + * \return The 2 bytes read, in host byte order. + */ +inline static uint16_t wire_read_u16(const uint8_t *pos) +{ + return be16toh(*(uint16_t *)pos); +} + +/*! + * \brief Reads 4 bytes from the wireformat data. + * + * \param pos Data to read the 4 bytes from. + * + * \return The 4 bytes read, in host byte order. + */ +inline static uint32_t wire_read_u32(const uint8_t *pos) +{ + return be32toh(*(uint32_t *)pos); +} + +/*! + * \brief Reads 6 bytes from the wireformat data. + * + * \param pos Data to read the 6 bytes from. + * + * \return The 6 bytes read, in host byte order. + */ +inline static uint64_t wire_read_u48(const uint8_t *pos) +{ + uint64_t input = 0; + memcpy((uint8_t *)&input + 1, pos, 6); + return be64toh(input) >> 8; +} + +/*! + * \brief Read 8 bytes from the wireformat data. + * + * \param pos Data to read the 8 bytes from. + * + * \return The 8 bytes read, in host byte order. + */ +inline static uint64_t wire_read_u64(const uint8_t *pos) +{ + return be64toh(*(uint64_t *)pos); +} + +/*! + * \brief Writes 2 bytes in wireformat. + * + * The data are stored in network byte order (big endian). + * + * \param pos Position where to put the 2 bytes. + * \param data Data to put. + */ +inline static void wire_write_u16(uint8_t *pos, uint16_t data) +{ + *(uint16_t *)pos = htobe16(data); +} + +/*! + * \brief Writes 4 bytes in wireformat. + * + * The data are stored in network byte order (big endian). + * + * \param pos Position where to put the 4 bytes. + * \param data Data to put. + */ +inline static void wire_write_u32(uint8_t *pos, uint32_t data) +{ + *(uint32_t *)pos = htobe32(data); +} + +/*! + * \brief Writes 6 bytes in wireformat. + * + * The data are stored in network byte order (big endian). + * + * \param pos Position where to put the 4 bytes. + * \param data Data to put. + */ +inline static void wire_write_u48(uint8_t *pos, uint64_t data) +{ + uint64_t swapped = htobe64(data << 8); + memcpy(pos, (uint8_t *)&swapped + 1, 6); +} + +/*! + * \brief Writes 8 bytes in wireformat. + * + * The data are stored in network byte order (big endian). + * + * \param pos Position where to put the 8 bytes. + * \param data Data to put. + */ +inline static void wire_write_u64(uint8_t *pos, uint64_t data) +{ + *(uint64_t *)pos = htobe64(data); +} + +/*! @} */ |