diff options
Diffstat (limited to 'storage/mroonga/vendor/groonga/lib/ii.c')
-rw-r--r-- | storage/mroonga/vendor/groonga/lib/ii.c | 12816 |
1 files changed, 12816 insertions, 0 deletions
diff --git a/storage/mroonga/vendor/groonga/lib/ii.c b/storage/mroonga/vendor/groonga/lib/ii.c new file mode 100644 index 00000000..2abd0747 --- /dev/null +++ b/storage/mroonga/vendor/groonga/lib/ii.c @@ -0,0 +1,12816 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2009-2017 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA +*/ +#include "grn.h" +#include <stdio.h> +#include <fcntl.h> +#include <string.h> +#include <sys/stat.h> + +#ifdef WIN32 +# include <io.h> +# include <share.h> +#endif /* WIN32 */ + +#include "grn_ii.h" +#include "grn_ctx_impl.h" +#include "grn_token_cursor.h" +#include "grn_pat.h" +#include "grn_db.h" +#include "grn_output.h" +#include "grn_scorer.h" +#include "grn_util.h" + +#ifdef GRN_WITH_ONIGMO +# define GRN_II_SELECT_ENABLE_SEQUENTIAL_SEARCH +#endif + +#ifdef GRN_II_SELECT_ENABLE_SEQUENTIAL_SEARCH +# include "grn_string.h" +# include <onigmo.h> +#endif + +#define MAX_PSEG 0x20000 +#define MAX_PSEG_SMALL 0x00200 +/* MAX_PSEG_MEDIUM has enough space for the following source: + * * Single source. + * * Source is a fixed size column or _key of a table. + * * Source column is a scalar column. + * * Lexicon doesn't have tokenizer. 
+ */ +#define MAX_PSEG_MEDIUM 0x10000 +#define S_CHUNK (1 << GRN_II_W_CHUNK) +#define W_SEGMENT 18 +#define S_SEGMENT (1 << W_SEGMENT) +#define W_ARRAY_ELEMENT 3 +#define S_ARRAY_ELEMENT (1 << W_ARRAY_ELEMENT) +#define W_ARRAY (W_SEGMENT - W_ARRAY_ELEMENT) +#define ARRAY_MASK_IN_A_SEGMENT ((1 << W_ARRAY) - 1) + +#define S_GARBAGE (1<<12) + +#define CHUNK_SPLIT 0x80000000 +#define CHUNK_SPLIT_THRESHOLD 0x60000 + +#define MAX_N_ELEMENTS 5 + +#define DEFINE_NAME(ii) \ + const char *name; \ + char name_buffer[GRN_TABLE_MAX_KEY_SIZE]; \ + int name_size; \ + do { \ + if (DB_OBJ(ii)->id == GRN_ID_NIL) { \ + name = "(temporary)"; \ + name_size = strlen(name); \ + } else { \ + name_size = grn_obj_name(ctx, (grn_obj *)ii, \ + name_buffer, GRN_TABLE_MAX_KEY_SIZE); \ + name = name_buffer; \ + } \ + } while (GRN_FALSE) + +#define LSEG(pos) ((pos) >> 16) +#define LPOS(pos) (((pos) & 0xffff) << 2) +#define SEG2POS(seg,pos) ((((uint32_t)(seg)) << 16) + (((uint32_t)(pos)) >> 2)) + +#ifndef S_IRUSR +# define S_IRUSR 0400 +#endif /* S_IRUSR */ +#ifndef S_IWUSR +# define S_IWUSR 0200 +#endif /* S_IWUSR */ + +static grn_bool grn_ii_cursor_set_min_enable = GRN_TRUE; +static double grn_ii_select_too_many_index_match_ratio = -1; +static double grn_ii_estimate_size_for_query_reduce_ratio = 0.9; +static grn_bool grn_ii_overlap_token_skip_enable = GRN_FALSE; +static uint32_t grn_ii_builder_block_threshold_force = 0; +static uint32_t grn_ii_max_n_segments_small = MAX_PSEG_SMALL; +static uint32_t grn_ii_max_n_chunks_small = GRN_II_MAX_CHUNK_SMALL; + +void +grn_ii_init_from_env(void) +{ + { + char grn_ii_cursor_set_min_enable_env[GRN_ENV_BUFFER_SIZE]; + grn_getenv("GRN_II_CURSOR_SET_MIN_ENABLE", + grn_ii_cursor_set_min_enable_env, + GRN_ENV_BUFFER_SIZE); + if (strcmp(grn_ii_cursor_set_min_enable_env, "no") == 0) { + grn_ii_cursor_set_min_enable = GRN_FALSE; + } else { + grn_ii_cursor_set_min_enable = GRN_TRUE; + } + } + + { + char 
grn_ii_select_too_many_index_match_ratio_env[GRN_ENV_BUFFER_SIZE]; + grn_getenv("GRN_II_SELECT_TOO_MANY_INDEX_MATCH_RATIO", + grn_ii_select_too_many_index_match_ratio_env, + GRN_ENV_BUFFER_SIZE); + if (grn_ii_select_too_many_index_match_ratio_env[0]) { + grn_ii_select_too_many_index_match_ratio = + atof(grn_ii_select_too_many_index_match_ratio_env); + } + } + + { + char grn_ii_estimate_size_for_query_reduce_ratio_env[GRN_ENV_BUFFER_SIZE]; + grn_getenv("GRN_II_ESTIMATE_SIZE_FOR_QUERY_REDUCE_RATIO", + grn_ii_estimate_size_for_query_reduce_ratio_env, + GRN_ENV_BUFFER_SIZE); + if (grn_ii_estimate_size_for_query_reduce_ratio_env[0]) { + grn_ii_estimate_size_for_query_reduce_ratio = + atof(grn_ii_estimate_size_for_query_reduce_ratio_env); + } + } + + { + char grn_ii_overlap_token_skip_enable_env[GRN_ENV_BUFFER_SIZE]; + grn_getenv("GRN_II_OVERLAP_TOKEN_SKIP_ENABLE", + grn_ii_overlap_token_skip_enable_env, + GRN_ENV_BUFFER_SIZE); + if (grn_ii_overlap_token_skip_enable_env[0]) { + grn_ii_overlap_token_skip_enable = GRN_TRUE; + } else { + grn_ii_overlap_token_skip_enable = GRN_FALSE; + } + } + + { + char grn_ii_builder_block_threshold_env[GRN_ENV_BUFFER_SIZE]; + grn_getenv("GRN_II_BUILDER_BLOCK_THRESHOLD", + grn_ii_builder_block_threshold_env, + GRN_ENV_BUFFER_SIZE); + if (grn_ii_builder_block_threshold_env[0]) { + grn_ii_builder_block_threshold_force = + grn_atoui(grn_ii_builder_block_threshold_env, + grn_ii_builder_block_threshold_env + + strlen(grn_ii_builder_block_threshold_env), + NULL); + } else { + grn_ii_builder_block_threshold_force = 0; + } + } + + { + char grn_ii_max_n_segments_small_env[GRN_ENV_BUFFER_SIZE]; + grn_getenv("GRN_II_MAX_N_SEGMENTS_SMALL", + grn_ii_max_n_segments_small_env, + GRN_ENV_BUFFER_SIZE); + if (grn_ii_max_n_segments_small_env[0]) { + grn_ii_max_n_segments_small = + grn_atoui(grn_ii_max_n_segments_small_env, + grn_ii_max_n_segments_small_env + + strlen(grn_ii_max_n_segments_small_env), + NULL); + if (grn_ii_max_n_segments_small > MAX_PSEG) { + 
grn_ii_max_n_segments_small = MAX_PSEG; + } + } + } + + { + char grn_ii_max_n_chunks_small_env[GRN_ENV_BUFFER_SIZE]; + grn_getenv("GRN_II_MAX_N_CHUNKS_SMALL", + grn_ii_max_n_chunks_small_env, + GRN_ENV_BUFFER_SIZE); + if (grn_ii_max_n_chunks_small_env[0]) { + grn_ii_max_n_chunks_small = + grn_atoui(grn_ii_max_n_chunks_small_env, + grn_ii_max_n_chunks_small_env + + strlen(grn_ii_max_n_chunks_small_env), + NULL); + if (grn_ii_max_n_chunks_small > GRN_II_MAX_CHUNK) { + grn_ii_max_n_chunks_small = GRN_II_MAX_CHUNK; + } + } + } +} + +void +grn_ii_cursor_set_min_enable_set(grn_bool enable) +{ + grn_ii_cursor_set_min_enable = enable; +} + +grn_bool +grn_ii_cursor_set_min_enable_get(void) +{ + return grn_ii_cursor_set_min_enable; +} + +/* segment */ + +inline static uint32_t +segment_get(grn_ctx *ctx, grn_ii *ii) +{ + uint32_t pseg; + if (ii->header->bgqtail == ((ii->header->bgqhead + 1) & (GRN_II_BGQSIZE - 1))) { + pseg = ii->header->bgqbody[ii->header->bgqtail]; + ii->header->bgqtail = (ii->header->bgqtail + 1) & (GRN_II_BGQSIZE - 1); + } else { + pseg = ii->header->pnext; +#ifndef CUT_OFF_COMPATIBILITY + if (!pseg) { + int i; + uint32_t pmax = 0; + char *used; + uint32_t max_segment = ii->seg->header->max_segment; + used = GRN_CALLOC(max_segment); + if (!used) { return max_segment; } + for (i = 0; i < GRN_II_MAX_LSEG && i < max_segment; i++) { + if ((pseg = ii->header->ainfo[i]) != GRN_II_PSEG_NOT_ASSIGNED) { + if (pseg > pmax) { pmax = pseg; } + used[pseg] = 1; + } + if ((pseg = ii->header->binfo[i]) != GRN_II_PSEG_NOT_ASSIGNED) { + if (pseg > pmax) { pmax = pseg; } + used[pseg] = 1; + } + } + for (pseg = 0; pseg < max_segment && used[pseg]; pseg++) ; + GRN_FREE(used); + ii->header->pnext = pmax + 1; + } else +#endif /* CUT_OFF_COMPATIBILITY */ + if (ii->header->pnext < ii->seg->header->max_segment) { + ii->header->pnext++; + } + } + return pseg; +} + +inline static grn_rc +segment_get_clear(grn_ctx *ctx, grn_ii *ii, uint32_t *pseg) +{ + uint32_t seg = segment_get(ctx, 
ii); + if (seg < ii->seg->header->max_segment) { + void *p = NULL; + GRN_IO_SEG_REF(ii->seg, seg, p); + if (!p) { return GRN_NO_MEMORY_AVAILABLE; } + memset(p, 0, S_SEGMENT); + GRN_IO_SEG_UNREF(ii->seg, seg); + *pseg = seg; + return GRN_SUCCESS; + } else { + return GRN_NO_MEMORY_AVAILABLE; + } +} + +inline static grn_rc +buffer_segment_new(grn_ctx *ctx, grn_ii *ii, uint32_t *segno) +{ + uint32_t lseg, pseg; + if (*segno < GRN_II_MAX_LSEG) { + if (ii->header->binfo[*segno] != GRN_II_PSEG_NOT_ASSIGNED) { + return GRN_INVALID_ARGUMENT; + } + lseg = *segno; + } else { + for (lseg = 0; lseg < GRN_II_MAX_LSEG; lseg++) { + if (ii->header->binfo[lseg] == GRN_II_PSEG_NOT_ASSIGNED) { break; } + } + if (lseg == GRN_II_MAX_LSEG) { return GRN_NO_MEMORY_AVAILABLE; } + *segno = lseg; + } + pseg = segment_get(ctx, ii); + if (pseg < ii->seg->header->max_segment) { + ii->header->binfo[lseg] = pseg; + if (lseg >= ii->header->bmax) { ii->header->bmax = lseg + 1; } + return GRN_SUCCESS; + } else { + return GRN_NO_MEMORY_AVAILABLE; + } +} + +static grn_rc +buffer_segment_reserve(grn_ctx *ctx, grn_ii *ii, + uint32_t *lseg0, uint32_t *pseg0, + uint32_t *lseg1, uint32_t *pseg1) +{ + uint32_t i = 0; + for (;; i++) { + if (i == GRN_II_MAX_LSEG) { + DEFINE_NAME(ii); + MERR("[ii][buffer][segment][reserve] " + "couldn't find a free buffer: <%.*s>: max:<%u>", + name_size, name, + GRN_II_MAX_LSEG); + return ctx->rc; + } + if (ii->header->binfo[i] == GRN_II_PSEG_NOT_ASSIGNED) { break; } + } + *lseg0 = i++; + for (;; i++) { + if (i == GRN_II_MAX_LSEG) { + DEFINE_NAME(ii); + MERR("[ii][buffer][segment][reserve] " + "couldn't find two free buffers: " + "<%.*s>: " + "found:<%u>, max:<%u>", + name_size, name, + *lseg0, GRN_II_MAX_LSEG); + return ctx->rc; + } + if (ii->header->binfo[i] == GRN_II_PSEG_NOT_ASSIGNED) { break; } + } + *lseg1 = i; + if ((*pseg0 = segment_get(ctx, ii)) == ii->seg->header->max_segment) { + DEFINE_NAME(ii); + MERR("[ii][buffer][segment][reserve] " + "couldn't allocate a free 
segment: <%.*s>: " + "buffer:<%u>, max:<%u>", + name_size, name, + *lseg0, ii->seg->header->max_segment); + return ctx->rc; + } + if ((*pseg1 = segment_get(ctx, ii)) == ii->seg->header->max_segment) { + DEFINE_NAME(ii); + MERR("[ii][buffer][segment][reserve] " + "couldn't allocate two free segments: " + "<%.*s>: " + "found:<%u>, not-found:<%u>, max:<%u>", + name_size, name, + *lseg0, *lseg1, ii->seg->header->max_segment); + return ctx->rc; + } + /* + { + uint32_t pseg; + char *used = GRN_CALLOC(ii->seg->header->max_segment); + if (!used) { return GRN_NO_MEMORY_AVAILABLE; } + for (i = 0; i < GRN_II_MAX_LSEG; i++) { + if ((pseg = ii->header->ainfo[i]) != GRN_II_PSEG_NOT_ASSIGNED) { + used[pseg] = 1; + } + if ((pseg = ii->header->binfo[i]) != GRN_II_PSEG_NOT_ASSIGNED) { + used[pseg] = 1; + } + } + for (pseg = 0;; pseg++) { + if (pseg == ii->seg->header->max_segment) { + GRN_FREE(used); + return GRN_NO_MEMORY_AVAILABLE; + } + if (!used[pseg]) { break; } + } + *pseg0 = pseg++; + for (;; pseg++) { + if (pseg == ii->seg->header->max_segment) { + GRN_FREE(used); + return GRN_NO_MEMORY_AVAILABLE; + } + if (!used[pseg]) { break; } + } + *pseg1 = pseg; + GRN_FREE(used); + } + */ + return ctx->rc; +} + +#define BGQENQUE(lseg) do {\ + if (ii->header->binfo[lseg] != GRN_II_PSEG_NOT_ASSIGNED) {\ + ii->header->bgqbody[ii->header->bgqhead] = ii->header->binfo[lseg];\ + ii->header->bgqhead = (ii->header->bgqhead + 1) & (GRN_II_BGQSIZE - 1);\ + GRN_ASSERT(ii->header->bgqhead != ii->header->bgqtail);\ + }\ +} while (0) + +inline static void +buffer_segment_update(grn_ii *ii, uint32_t lseg, uint32_t pseg) +{ + BGQENQUE(lseg); + // smb_wmb(); + ii->header->binfo[lseg] = pseg; + if (lseg >= ii->header->bmax) { ii->header->bmax = lseg + 1; } +} + +inline static void +buffer_segment_clear(grn_ii *ii, uint32_t lseg) +{ + BGQENQUE(lseg); + // smb_wmb(); + ii->header->binfo[lseg] = GRN_II_PSEG_NOT_ASSIGNED; +} + +/* chunk */ + +#define HEADER_CHUNK_AT(ii,offset) \ + 
((((ii)->header->chunks[((offset) >> 3)]) >> ((offset) & 7)) & 1) + +#define HEADER_CHUNK_ON(ii,offset) \ + (((ii)->header->chunks[((offset) >> 3)]) |= (1 << ((offset) & 7))) + +#define HEADER_CHUNK_OFF(ii,offset) \ + (((ii)->header->chunks[((offset) >> 3)]) &= ~(1 << ((offset) & 7))) + +#define N_GARBAGES_TH 1 + +#define N_GARBAGES ((S_GARBAGE - (sizeof(uint32_t) * 4))/(sizeof(uint32_t))) + +typedef struct { + uint32_t head; + uint32_t tail; + uint32_t nrecs; + uint32_t next; + uint32_t recs[N_GARBAGES]; +} grn_ii_ginfo; + +#define WIN_MAP(chunk,ctx,iw,seg,pos,size,mode)\ + grn_io_win_map(chunk, ctx, iw,\ + ((seg) >> GRN_II_N_CHUNK_VARIATION),\ + (((seg) & ((1 << GRN_II_N_CHUNK_VARIATION) - 1)) << GRN_II_W_LEAST_CHUNK) + (pos),\ + size, mode) +/* +static int new_histogram[32]; +static int free_histogram[32]; +*/ +static grn_rc +chunk_new(grn_ctx *ctx, grn_ii *ii, uint32_t *res, uint32_t size) +{ + uint32_t n_chunks; + + n_chunks = ii->chunk->header->max_segment; + + /* + if (size) { + int m, es = size - 1; + GRN_BIT_SCAN_REV(es, m); + m++; + new_histogram[m]++; + } + */ + if (size > S_CHUNK) { + int i, j; + uint32_t n = (size + S_CHUNK - 1) >> GRN_II_W_CHUNK; + for (i = 0, j = -1; i < n_chunks; i++) { + if (HEADER_CHUNK_AT(ii, i)) { + j = i; + } else { + if (i == j + n) { + j++; + *res = j << GRN_II_N_CHUNK_VARIATION; + for (; j <= i; j++) { HEADER_CHUNK_ON(ii, j); } + return GRN_SUCCESS; + } + } + } + { + DEFINE_NAME(ii); + MERR("[ii][chunk][new] index is full: " + "<%.*s>: " + "size:<%u>, n-chunks:<%u>", + name_size, name, + size, n_chunks); + } + return ctx->rc; + } else { + uint32_t *vp; + int m, aligned_size; + if (size > (1 << GRN_II_W_LEAST_CHUNK)) { + int es = size - 1; + GRN_BIT_SCAN_REV(es, m); + m++; + } else { + m = GRN_II_W_LEAST_CHUNK; + } + aligned_size = 1 << (m - GRN_II_W_LEAST_CHUNK); + if (ii->header->ngarbages[m - GRN_II_W_LEAST_CHUNK] > N_GARBAGES_TH) { + grn_ii_ginfo *ginfo; + uint32_t *gseg; + grn_io_win iw, iw_; + iw_.addr = NULL; + gseg = 
&ii->header->garbages[m - GRN_II_W_LEAST_CHUNK]; + while (*gseg != GRN_II_PSEG_NOT_ASSIGNED) { + ginfo = WIN_MAP(ii->chunk, ctx, &iw, *gseg, 0, S_GARBAGE, grn_io_rdwr); + //GRN_IO_SEG_MAP2(ii->chunk, *gseg, ginfo); + if (!ginfo) { + if (iw_.addr) { grn_io_win_unmap(&iw_); } + { + DEFINE_NAME(ii); + MERR("[ii][chunk][new] failed to allocate garbage segment: " + "<%.*s>: " + "n-garbages:<%u>, size:<%u>, n-chunks:<%u>", + name_size, name, + ii->header->ngarbages[m - GRN_II_W_LEAST_CHUNK], + size, + n_chunks); + } + return ctx->rc; + } + if (ginfo->next != GRN_II_PSEG_NOT_ASSIGNED || + ginfo->nrecs > N_GARBAGES_TH) { + *res = ginfo->recs[ginfo->tail]; + if (++ginfo->tail == N_GARBAGES) { ginfo->tail = 0; } + ginfo->nrecs--; + ii->header->ngarbages[m - GRN_II_W_LEAST_CHUNK]--; + if (!ginfo->nrecs) { + HEADER_CHUNK_OFF(ii, *gseg); + *gseg = ginfo->next; + } + if (iw_.addr) { grn_io_win_unmap(&iw_); } + grn_io_win_unmap(&iw); + return GRN_SUCCESS; + } + if (iw_.addr) { grn_io_win_unmap(&iw_); } + iw_ = iw; + gseg = &ginfo->next; + } + if (iw_.addr) { grn_io_win_unmap(&iw_); } + } + vp = &ii->header->free_chunks[m - GRN_II_W_LEAST_CHUNK]; + if (*vp == GRN_II_PSEG_NOT_ASSIGNED) { + int i = 0; + while (HEADER_CHUNK_AT(ii, i)) { + if (++i >= n_chunks) { + DEFINE_NAME(ii); + MERR("[ii][chunk][new] failed to find a free chunk: " + "<%.*s>: " + "index:<%u>, size:<%u>, n-chunks:<%u>", + name_size, name, + m - GRN_II_W_LEAST_CHUNK, + size, + n_chunks); + return ctx->rc; + } + } + HEADER_CHUNK_ON(ii, i); + *vp = i << GRN_II_N_CHUNK_VARIATION; + } + *res = *vp; + *vp += 1 << (m - GRN_II_W_LEAST_CHUNK); + if (!(*vp & ((1 << GRN_II_N_CHUNK_VARIATION) - 1))) { + *vp = GRN_II_PSEG_NOT_ASSIGNED; + } + return GRN_SUCCESS; + } +} + +static grn_rc +chunk_free(grn_ctx *ctx, grn_ii *ii, + uint32_t offset, uint32_t dummy, uint32_t size) +{ + /* + if (size) { + int m, es = size - 1; + GRN_BIT_SCAN_REV(es, m); + m++; + free_histogram[m]++; + } + */ + grn_io_win iw, iw_; + grn_ii_ginfo *ginfo= 0; 
+ uint32_t seg, m, *gseg; + seg = offset >> GRN_II_N_CHUNK_VARIATION; + if (size > S_CHUNK) { + int n = (size + S_CHUNK - 1) >> GRN_II_W_CHUNK; + for (; n--; seg++) { HEADER_CHUNK_OFF(ii, seg); } + return GRN_SUCCESS; + } + if (size > (1 << GRN_II_W_LEAST_CHUNK)) { + int es = size - 1; + GRN_BIT_SCAN_REV(es, m); + m++; + } else { + m = GRN_II_W_LEAST_CHUNK; + } + gseg = &ii->header->garbages[m - GRN_II_W_LEAST_CHUNK]; + iw_.addr = NULL; + while (*gseg != GRN_II_PSEG_NOT_ASSIGNED) { + ginfo = WIN_MAP(ii->chunk, ctx, &iw, *gseg, 0, S_GARBAGE, grn_io_rdwr); + // GRN_IO_SEG_MAP2(ii->chunk, *gseg, ginfo); + if (!ginfo) { + if (iw_.addr) { grn_io_win_unmap(&iw_); } + return GRN_NO_MEMORY_AVAILABLE; + } + if (ginfo->nrecs < N_GARBAGES) { break; } + if (iw_.addr) { grn_io_win_unmap(&iw_); } + iw_ = iw; + gseg = &ginfo->next; + } + if (*gseg == GRN_II_PSEG_NOT_ASSIGNED) { + grn_rc rc; + if ((rc = chunk_new(ctx, ii, gseg, S_GARBAGE))) { + if (iw_.addr) { grn_io_win_unmap(&iw_); } + return rc; + } + ginfo = WIN_MAP(ii->chunk, ctx, &iw, *gseg, 0, S_GARBAGE, grn_io_rdwr); + /* + uint32_t i = 0; + while (HEADER_CHUNK_AT(ii, i)) { + if (++i >= ii->chunk->header->max_segment) { + return GRN_NO_MEMORY_AVAILABLE; + } + } + HEADER_CHUNK_ON(ii, i); + *gseg = i; + GRN_IO_SEG_MAP2(ii->chunk, *gseg, ginfo); + */ + if (!ginfo) { + if (iw_.addr) { grn_io_win_unmap(&iw_); } + return GRN_NO_MEMORY_AVAILABLE; + } + ginfo->head = 0; + ginfo->tail = 0; + ginfo->nrecs = 0; + ginfo->next = GRN_II_PSEG_NOT_ASSIGNED; + } + if (iw_.addr) { grn_io_win_unmap(&iw_); } + ginfo->recs[ginfo->head] = offset; + if (++ginfo->head == N_GARBAGES) { ginfo->head = 0; } + ginfo->nrecs++; + grn_io_win_unmap(&iw); + ii->header->ngarbages[m - GRN_II_W_LEAST_CHUNK]++; + return GRN_SUCCESS; +} + +#define UNIT_SIZE 0x80 +#define UNIT_MASK (UNIT_SIZE - 1) + +/* <generated> */ +static uint8_t * +pack_1(uint32_t *p, uint8_t *rp) +{ + uint8_t v; + v = *p++ << 7; + v += *p++ << 6; + v += *p++ << 5; + v += *p++ << 4; + v += 
*p++ << 3; + v += *p++ << 2; + v += *p++ << 1; + *rp++ = v + *p++; + return rp; +} +static uint8_t * +unpack_1(uint32_t *p, uint8_t *dp) +{ + *p++ = (*dp >> 7); + *p++ = ((*dp >> 6) & 0x1); + *p++ = ((*dp >> 5) & 0x1); + *p++ = ((*dp >> 4) & 0x1); + *p++ = ((*dp >> 3) & 0x1); + *p++ = ((*dp >> 2) & 0x1); + *p++ = ((*dp >> 1) & 0x1); + *p++ = (*dp++ & 0x1); + return dp; +} +static uint8_t * +pack_2(uint32_t *p, uint8_t *rp) +{ + uint8_t v; + v = *p++ << 6; + v += *p++ << 4; + v += *p++ << 2; + *rp++ = v + *p++; + v = *p++ << 6; + v += *p++ << 4; + v += *p++ << 2; + *rp++ = v + *p++; + return rp; +} +static uint8_t * +unpack_2(uint32_t *p, uint8_t *dp) +{ + *p++ = (*dp >> 6); + *p++ = ((*dp >> 4) & 0x3); + *p++ = ((*dp >> 2) & 0x3); + *p++ = (*dp++ & 0x3); + *p++ = (*dp >> 6); + *p++ = ((*dp >> 4) & 0x3); + *p++ = ((*dp >> 2) & 0x3); + *p++ = (*dp++ & 0x3); + return dp; +} +static uint8_t * +pack_3(uint32_t *p, uint8_t *rp) +{ + uint8_t v; + v = *p++ << 5; + v += *p++ << 2; + *rp++ = v + (*p >> 1); v = *p++ << 7; + v += *p++ << 4; + v += *p++ << 1; + *rp++ = v + (*p >> 2); v = *p++ << 6; + v += *p++ << 3; + *rp++ = v + *p++; + return rp; +} +static uint8_t * +unpack_3(uint32_t *p, uint8_t *dp) +{ + uint32_t v; + *p++ = (*dp >> 5); + *p++ = ((*dp >> 2) & 0x7); + v = ((*dp++ << 1) & 0x7); *p++ = v + (*dp >> 7); + *p++ = ((*dp >> 4) & 0x7); + *p++ = ((*dp >> 1) & 0x7); + v = ((*dp++ << 2) & 0x7); *p++ = v + (*dp >> 6); + *p++ = ((*dp >> 3) & 0x7); + *p++ = (*dp++ & 0x7); + return dp; +} +static uint8_t * +pack_4(uint32_t *p, uint8_t *rp) +{ + uint8_t v; + v = *p++ << 4; + *rp++ = v + *p++; + v = *p++ << 4; + *rp++ = v + *p++; + v = *p++ << 4; + *rp++ = v + *p++; + v = *p++ << 4; + *rp++ = v + *p++; + return rp; +} +static uint8_t * +unpack_4(uint32_t *p, uint8_t *dp) +{ + *p++ = (*dp >> 4); + *p++ = (*dp++ & 0xf); + *p++ = (*dp >> 4); + *p++ = (*dp++ & 0xf); + *p++ = (*dp >> 4); + *p++ = (*dp++ & 0xf); + *p++ = (*dp >> 4); + *p++ = (*dp++ & 0xf); + return dp; +} +static 
uint8_t * +pack_5(uint32_t *p, uint8_t *rp) +{ + uint8_t v; + v = *p++ << 3; + *rp++ = v + (*p >> 2); v = *p++ << 6; + v += *p++ << 1; + *rp++ = v + (*p >> 4); v = *p++ << 4; + *rp++ = v + (*p >> 1); v = *p++ << 7; + v += *p++ << 2; + *rp++ = v + (*p >> 3); v = *p++ << 5; + *rp++ = v + *p++; + return rp; +} +static uint8_t * +unpack_5(uint32_t *p, uint8_t *dp) +{ + uint32_t v; + *p++ = (*dp >> 3); + v = ((*dp++ << 2) & 0x1f); *p++ = v + (*dp >> 6); + *p++ = ((*dp >> 1) & 0x1f); + v = ((*dp++ << 4) & 0x1f); *p++ = v + (*dp >> 4); + v = ((*dp++ << 1) & 0x1f); *p++ = v + (*dp >> 7); + *p++ = ((*dp >> 2) & 0x1f); + v = ((*dp++ << 3) & 0x1f); *p++ = v + (*dp >> 5); + *p++ = (*dp++ & 0x1f); + return dp; +} +static uint8_t * +pack_6(uint32_t *p, uint8_t *rp) +{ + uint8_t v; + v = *p++ << 2; + *rp++ = v + (*p >> 4); v = *p++ << 4; + *rp++ = v + (*p >> 2); v = *p++ << 6; + *rp++ = v + *p++; + v = *p++ << 2; + *rp++ = v + (*p >> 4); v = *p++ << 4; + *rp++ = v + (*p >> 2); v = *p++ << 6; + *rp++ = v + *p++; + return rp; +} +static uint8_t * +unpack_6(uint32_t *p, uint8_t *dp) +{ + uint32_t v; + *p++ = (*dp >> 2); + v = ((*dp++ << 4) & 0x3f); *p++ = v + (*dp >> 4); + v = ((*dp++ << 2) & 0x3f); *p++ = v + (*dp >> 6); + *p++ = (*dp++ & 0x3f); + *p++ = (*dp >> 2); + v = ((*dp++ << 4) & 0x3f); *p++ = v + (*dp >> 4); + v = ((*dp++ << 2) & 0x3f); *p++ = v + (*dp >> 6); + *p++ = (*dp++ & 0x3f); + return dp; +} +static uint8_t * +pack_7(uint32_t *p, uint8_t *rp) +{ + uint8_t v; + v = *p++ << 1; + *rp++ = v + (*p >> 6); v = *p++ << 2; + *rp++ = v + (*p >> 5); v = *p++ << 3; + *rp++ = v + (*p >> 4); v = *p++ << 4; + *rp++ = v + (*p >> 3); v = *p++ << 5; + *rp++ = v + (*p >> 2); v = *p++ << 6; + *rp++ = v + (*p >> 1); v = *p++ << 7; + *rp++ = v + *p++; + return rp; +} +static uint8_t * +unpack_7(uint32_t *p, uint8_t *dp) +{ + uint32_t v; + *p++ = (*dp >> 1); + v = ((*dp++ << 6) & 0x7f); *p++ = v + (*dp >> 2); + v = ((*dp++ << 5) & 0x7f); *p++ = v + (*dp >> 3); + v = ((*dp++ << 4) & 
0x7f); *p++ = v + (*dp >> 4); + v = ((*dp++ << 3) & 0x7f); *p++ = v + (*dp >> 5); + v = ((*dp++ << 2) & 0x7f); *p++ = v + (*dp >> 6); + v = ((*dp++ << 1) & 0x7f); *p++ = v + (*dp >> 7); + *p++ = (*dp++ & 0x7f); + return dp; +} +static uint8_t * +pack_8(uint32_t *p, uint8_t *rp) +{ + *rp++ = *p++; + *rp++ = *p++; + *rp++ = *p++; + *rp++ = *p++; + *rp++ = *p++; + *rp++ = *p++; + *rp++ = *p++; + *rp++ = *p++; + return rp; +} +static uint8_t * +unpack_8(uint32_t *p, uint8_t *dp) +{ + *p++ = *dp++; + *p++ = *dp++; + *p++ = *dp++; + *p++ = *dp++; + *p++ = *dp++; + *p++ = *dp++; + *p++ = *dp++; + *p++ = *dp++; + return dp; +} +static uint8_t * +pack_9(uint32_t *p, uint8_t *rp) +{ + uint8_t v; + *rp++ = (*p >> 1); v = *p++ << 7; + *rp++ = v + (*p >> 2); v = *p++ << 6; + *rp++ = v + (*p >> 3); v = *p++ << 5; + *rp++ = v + (*p >> 4); v = *p++ << 4; + *rp++ = v + (*p >> 5); v = *p++ << 3; + *rp++ = v + (*p >> 6); v = *p++ << 2; + *rp++ = v + (*p >> 7); v = *p++ << 1; + *rp++ = v + (*p >> 8); *rp++ = *p++; + return rp; +} +static uint8_t * +unpack_9(uint32_t *p, uint8_t *dp) +{ + uint32_t v; + v = *dp++ << 1; *p++ = v + (*dp >> 7); + v = ((*dp++ << 2) & 0x1ff); *p++ = v + (*dp >> 6); + v = ((*dp++ << 3) & 0x1ff); *p++ = v + (*dp >> 5); + v = ((*dp++ << 4) & 0x1ff); *p++ = v + (*dp >> 4); + v = ((*dp++ << 5) & 0x1ff); *p++ = v + (*dp >> 3); + v = ((*dp++ << 6) & 0x1ff); *p++ = v + (*dp >> 2); + v = ((*dp++ << 7) & 0x1ff); *p++ = v + (*dp >> 1); + v = ((*dp++ << 8) & 0x1ff); *p++ = v + *dp++; + return dp; +} +static uint8_t * +pack_10(uint32_t *p, uint8_t *rp) +{ + uint8_t v; + *rp++ = (*p >> 2); v = *p++ << 6; + *rp++ = v + (*p >> 4); v = *p++ << 4; + *rp++ = v + (*p >> 6); v = *p++ << 2; + *rp++ = v + (*p >> 8); *rp++ = *p++; + *rp++ = (*p >> 2); v = *p++ << 6; + *rp++ = v + (*p >> 4); v = *p++ << 4; + *rp++ = v + (*p >> 6); v = *p++ << 2; + *rp++ = v + (*p >> 8); *rp++ = *p++; + return rp; +} +static uint8_t * +unpack_10(uint32_t *p, uint8_t *dp) +{ + uint32_t v; + v = *dp++ 
<< 2; *p++ = v + (*dp >> 6); + v = ((*dp++ << 4) & 0x3ff); *p++ = v + (*dp >> 4); + v = ((*dp++ << 6) & 0x3ff); *p++ = v + (*dp >> 2); + v = ((*dp++ << 8) & 0x3ff); *p++ = v + *dp++; + v = *dp++ << 2; *p++ = v + (*dp >> 6); + v = ((*dp++ << 4) & 0x3ff); *p++ = v + (*dp >> 4); + v = ((*dp++ << 6) & 0x3ff); *p++ = v + (*dp >> 2); + v = ((*dp++ << 8) & 0x3ff); *p++ = v + *dp++; + return dp; +} +static uint8_t * +pack_11(uint32_t *p, uint8_t *rp) +{ + uint8_t v; + *rp++ = (*p >> 3); v = *p++ << 5; + *rp++ = v + (*p >> 6); v = *p++ << 2; + *rp++ = v + (*p >> 9); *rp++ = (*p >> 1); v = *p++ << 7; + *rp++ = v + (*p >> 4); v = *p++ << 4; + *rp++ = v + (*p >> 7); v = *p++ << 1; + *rp++ = v + (*p >> 10); *rp++ = (*p >> 2); v = *p++ << 6; + *rp++ = v + (*p >> 5); v = *p++ << 3; + *rp++ = v + (*p >> 8); *rp++ = *p++; + return rp; +} +static uint8_t * +unpack_11(uint32_t *p, uint8_t *dp) +{ + uint32_t v; + v = *dp++ << 3; *p++ = v + (*dp >> 5); + v = ((*dp++ << 6) & 0x7ff); *p++ = v + (*dp >> 2); + v = ((*dp++ << 9) & 0x7ff); v += *dp++ << 1; *p++ = v + (*dp >> 7); + v = ((*dp++ << 4) & 0x7ff); *p++ = v + (*dp >> 4); + v = ((*dp++ << 7) & 0x7ff); *p++ = v + (*dp >> 1); + v = ((*dp++ << 10) & 0x7ff); v += *dp++ << 2; *p++ = v + (*dp >> 6); + v = ((*dp++ << 5) & 0x7ff); *p++ = v + (*dp >> 3); + v = ((*dp++ << 8) & 0x7ff); *p++ = v + *dp++; + return dp; +} +static uint8_t * +pack_12(uint32_t *p, uint8_t *rp) +{ + uint8_t v; + *rp++ = (*p >> 4); v = *p++ << 4; + *rp++ = v + (*p >> 8); *rp++ = *p++; + *rp++ = (*p >> 4); v = *p++ << 4; + *rp++ = v + (*p >> 8); *rp++ = *p++; + *rp++ = (*p >> 4); v = *p++ << 4; + *rp++ = v + (*p >> 8); *rp++ = *p++; + *rp++ = (*p >> 4); v = *p++ << 4; + *rp++ = v + (*p >> 8); *rp++ = *p++; + return rp; +} +static uint8_t * +unpack_12(uint32_t *p, uint8_t *dp) +{ + uint32_t v; + v = *dp++ << 4; *p++ = v + (*dp >> 4); + v = ((*dp++ << 8) & 0xfff); *p++ = v + *dp++; + v = *dp++ << 4; *p++ = v + (*dp >> 4); + v = ((*dp++ << 8) & 0xfff); *p++ = v + *dp++; + 
v = *dp++ << 4; *p++ = v + (*dp >> 4); + v = ((*dp++ << 8) & 0xfff); *p++ = v + *dp++; + v = *dp++ << 4; *p++ = v + (*dp >> 4); + v = ((*dp++ << 8) & 0xfff); *p++ = v + *dp++; + return dp; +} +static uint8_t * +pack_13(uint32_t *p, uint8_t *rp) +{ + uint8_t v; + *rp++ = (*p >> 5); v = *p++ << 3; + *rp++ = v + (*p >> 10); *rp++ = (*p >> 2); v = *p++ << 6; + *rp++ = v + (*p >> 7); v = *p++ << 1; + *rp++ = v + (*p >> 12); *rp++ = (*p >> 4); v = *p++ << 4; + *rp++ = v + (*p >> 9); *rp++ = (*p >> 1); v = *p++ << 7; + *rp++ = v + (*p >> 6); v = *p++ << 2; + *rp++ = v + (*p >> 11); *rp++ = (*p >> 3); v = *p++ << 5; + *rp++ = v + (*p >> 8); *rp++ = *p++; + return rp; +} +static uint8_t * +unpack_13(uint32_t *p, uint8_t *dp) +{ + uint32_t v; + v = *dp++ << 5; *p++ = v + (*dp >> 3); + v = ((*dp++ << 10) & 0x1fff); v += *dp++ << 2; *p++ = v + (*dp >> 6); + v = ((*dp++ << 7) & 0x1fff); *p++ = v + (*dp >> 1); + v = ((*dp++ << 12) & 0x1fff); v += *dp++ << 4; *p++ = v + (*dp >> 4); + v = ((*dp++ << 9) & 0x1fff); v += *dp++ << 1; *p++ = v + (*dp >> 7); + v = ((*dp++ << 6) & 0x1fff); *p++ = v + (*dp >> 2); + v = ((*dp++ << 11) & 0x1fff); v += *dp++ << 3; *p++ = v + (*dp >> 5); + v = ((*dp++ << 8) & 0x1fff); *p++ = v + *dp++; + return dp; +} +static uint8_t * +pack_14(uint32_t *p, uint8_t *rp) +{ + uint8_t v; + *rp++ = (*p >> 6); v = *p++ << 2; + *rp++ = v + (*p >> 12); *rp++ = (*p >> 4); v = *p++ << 4; + *rp++ = v + (*p >> 10); *rp++ = (*p >> 2); v = *p++ << 6; + *rp++ = v + (*p >> 8); *rp++ = *p++; + *rp++ = (*p >> 6); v = *p++ << 2; + *rp++ = v + (*p >> 12); *rp++ = (*p >> 4); v = *p++ << 4; + *rp++ = v + (*p >> 10); *rp++ = (*p >> 2); v = *p++ << 6; + *rp++ = v + (*p >> 8); *rp++ = *p++; + return rp; +} +static uint8_t * +unpack_14(uint32_t *p, uint8_t *dp) +{ + uint32_t v; + v = *dp++ << 6; *p++ = v + (*dp >> 2); + v = ((*dp++ << 12) & 0x3fff); v += *dp++ << 4; *p++ = v + (*dp >> 4); + v = ((*dp++ << 10) & 0x3fff); v += *dp++ << 2; *p++ = v + (*dp >> 6); + v = ((*dp++ << 8) & 
0x3fff); *p++ = v + *dp++; + v = *dp++ << 6; *p++ = v + (*dp >> 2); + v = ((*dp++ << 12) & 0x3fff); v += *dp++ << 4; *p++ = v + (*dp >> 4); + v = ((*dp++ << 10) & 0x3fff); v += *dp++ << 2; *p++ = v + (*dp >> 6); + v = ((*dp++ << 8) & 0x3fff); *p++ = v + *dp++; + return dp; +} +static uint8_t * +pack_15(uint32_t *p, uint8_t *rp) +{ + uint8_t v; + *rp++ = (*p >> 7); v = *p++ << 1; + *rp++ = v + (*p >> 14); *rp++ = (*p >> 6); v = *p++ << 2; + *rp++ = v + (*p >> 13); *rp++ = (*p >> 5); v = *p++ << 3; + *rp++ = v + (*p >> 12); *rp++ = (*p >> 4); v = *p++ << 4; + *rp++ = v + (*p >> 11); *rp++ = (*p >> 3); v = *p++ << 5; + *rp++ = v + (*p >> 10); *rp++ = (*p >> 2); v = *p++ << 6; + *rp++ = v + (*p >> 9); *rp++ = (*p >> 1); v = *p++ << 7; + *rp++ = v + (*p >> 8); *rp++ = *p++; + return rp; +} +static uint8_t * +unpack_15(uint32_t *p, uint8_t *dp) +{ + uint32_t v; + v = *dp++ << 7; *p++ = v + (*dp >> 1); + v = ((*dp++ << 14) & 0x7fff); v += *dp++ << 6; *p++ = v + (*dp >> 2); + v = ((*dp++ << 13) & 0x7fff); v += *dp++ << 5; *p++ = v + (*dp >> 3); + v = ((*dp++ << 12) & 0x7fff); v += *dp++ << 4; *p++ = v + (*dp >> 4); + v = ((*dp++ << 11) & 0x7fff); v += *dp++ << 3; *p++ = v + (*dp >> 5); + v = ((*dp++ << 10) & 0x7fff); v += *dp++ << 2; *p++ = v + (*dp >> 6); + v = ((*dp++ << 9) & 0x7fff); v += *dp++ << 1; *p++ = v + (*dp >> 7); + v = ((*dp++ << 8) & 0x7fff); *p++ = v + *dp++; + return dp; +} +static uint8_t * +pack_16(uint32_t *p, uint8_t *rp) +{ + *rp++ = (*p >> 8); *rp++ = *p++; + *rp++ = (*p >> 8); *rp++ = *p++; + *rp++ = (*p >> 8); *rp++ = *p++; + *rp++ = (*p >> 8); *rp++ = *p++; + *rp++ = (*p >> 8); *rp++ = *p++; + *rp++ = (*p >> 8); *rp++ = *p++; + *rp++ = (*p >> 8); *rp++ = *p++; + *rp++ = (*p >> 8); *rp++ = *p++; + return rp; +} +static uint8_t * +unpack_16(uint32_t *p, uint8_t *dp) +{ + uint32_t v; + v = *dp++ << 8; *p++ = v + *dp++; + v = *dp++ << 8; *p++ = v + *dp++; + v = *dp++ << 8; *p++ = v + *dp++; + v = *dp++ << 8; *p++ = v + *dp++; + v = *dp++ << 8; *p++ = v 
+ *dp++; + v = *dp++ << 8; *p++ = v + *dp++; + v = *dp++ << 8; *p++ = v + *dp++; + v = *dp++ << 8; *p++ = v + *dp++; + return dp; +} +static uint8_t * +pack_17(uint32_t *p, uint8_t *rp) +{ + uint8_t v; + *rp++ = (*p >> 9); *rp++ = (*p >> 1); v = *p++ << 7; + *rp++ = v + (*p >> 10); *rp++ = (*p >> 2); v = *p++ << 6; + *rp++ = v + (*p >> 11); *rp++ = (*p >> 3); v = *p++ << 5; + *rp++ = v + (*p >> 12); *rp++ = (*p >> 4); v = *p++ << 4; + *rp++ = v + (*p >> 13); *rp++ = (*p >> 5); v = *p++ << 3; + *rp++ = v + (*p >> 14); *rp++ = (*p >> 6); v = *p++ << 2; + *rp++ = v + (*p >> 15); *rp++ = (*p >> 7); v = *p++ << 1; + *rp++ = v + (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; + return rp; +} +static uint8_t * +unpack_17(uint32_t *p, uint8_t *dp) +{ + uint32_t v; + v = *dp++ << 9; v += *dp++ << 1; *p++ = v + (*dp >> 7); + v = ((*dp++ << 10) & 0x1ffff); v += *dp++ << 2; *p++ = v + (*dp >> 6); + v = ((*dp++ << 11) & 0x1ffff); v += *dp++ << 3; *p++ = v + (*dp >> 5); + v = ((*dp++ << 12) & 0x1ffff); v += *dp++ << 4; *p++ = v + (*dp >> 4); + v = ((*dp++ << 13) & 0x1ffff); v += *dp++ << 5; *p++ = v + (*dp >> 3); + v = ((*dp++ << 14) & 0x1ffff); v += *dp++ << 6; *p++ = v + (*dp >> 2); + v = ((*dp++ << 15) & 0x1ffff); v += *dp++ << 7; *p++ = v + (*dp >> 1); + v = ((*dp++ << 16) & 0x1ffff); v += *dp++ << 8; *p++ = v + *dp++; + return dp; +} +static uint8_t * +pack_18(uint32_t *p, uint8_t *rp) +{ + uint8_t v; + *rp++ = (*p >> 10); *rp++ = (*p >> 2); v = *p++ << 6; + *rp++ = v + (*p >> 12); *rp++ = (*p >> 4); v = *p++ << 4; + *rp++ = v + (*p >> 14); *rp++ = (*p >> 6); v = *p++ << 2; + *rp++ = v + (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; + *rp++ = (*p >> 10); *rp++ = (*p >> 2); v = *p++ << 6; + *rp++ = v + (*p >> 12); *rp++ = (*p >> 4); v = *p++ << 4; + *rp++ = v + (*p >> 14); *rp++ = (*p >> 6); v = *p++ << 2; + *rp++ = v + (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; + return rp; +} +static uint8_t * +unpack_18(uint32_t *p, uint8_t *dp) +{ + uint32_t v; + v = *dp++ << 10; v += 
*dp++ << 2; *p++ = v + (*dp >> 6); + v = ((*dp++ << 12) & 0x3ffff); v += *dp++ << 4; *p++ = v + (*dp >> 4); + v = ((*dp++ << 14) & 0x3ffff); v += *dp++ << 6; *p++ = v + (*dp >> 2); + v = ((*dp++ << 16) & 0x3ffff); v += *dp++ << 8; *p++ = v + *dp++; + v = *dp++ << 10; v += *dp++ << 2; *p++ = v + (*dp >> 6); + v = ((*dp++ << 12) & 0x3ffff); v += *dp++ << 4; *p++ = v + (*dp >> 4); + v = ((*dp++ << 14) & 0x3ffff); v += *dp++ << 6; *p++ = v + (*dp >> 2); + v = ((*dp++ << 16) & 0x3ffff); v += *dp++ << 8; *p++ = v + *dp++; + return dp; +} +static uint8_t * +pack_19(uint32_t *p, uint8_t *rp) +{ + uint8_t v; + *rp++ = (*p >> 11); *rp++ = (*p >> 3); v = *p++ << 5; + *rp++ = v + (*p >> 14); *rp++ = (*p >> 6); v = *p++ << 2; + *rp++ = v + (*p >> 17); *rp++ = (*p >> 9); *rp++ = (*p >> 1); v = *p++ << 7; + *rp++ = v + (*p >> 12); *rp++ = (*p >> 4); v = *p++ << 4; + *rp++ = v + (*p >> 15); *rp++ = (*p >> 7); v = *p++ << 1; + *rp++ = v + (*p >> 18); *rp++ = (*p >> 10); *rp++ = (*p >> 2); v = *p++ << 6; + *rp++ = v + (*p >> 13); *rp++ = (*p >> 5); v = *p++ << 3; + *rp++ = v + (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; + return rp; +} +static uint8_t * +unpack_19(uint32_t *p, uint8_t *dp) +{ + uint32_t v; + v = *dp++ << 11; v += *dp++ << 3; *p++ = v + (*dp >> 5); + v = ((*dp++ << 14) & 0x7ffff); v += *dp++ << 6; *p++ = v + (*dp >> 2); + v = ((*dp++ << 17) & 0x7ffff); v += *dp++ << 9; v += *dp++ << 1; + *p++ = v + (*dp >> 7); + v = ((*dp++ << 12) & 0x7ffff); v += *dp++ << 4; *p++ = v + (*dp >> 4); + v = ((*dp++ << 15) & 0x7ffff); v += *dp++ << 7; *p++ = v + (*dp >> 1); + v = ((*dp++ << 18) & 0x7ffff); v += *dp++ << 10; v += *dp++ << 2; + *p++ = v + (*dp >> 6); + v = ((*dp++ << 13) & 0x7ffff); v += *dp++ << 5; *p++ = v + (*dp >> 3); + v = ((*dp++ << 16) & 0x7ffff); v += *dp++ << 8; *p++ = v + *dp++; + return dp; +} +static uint8_t * +pack_20(uint32_t *p, uint8_t *rp) +{ + uint8_t v; + *rp++ = (*p >> 12); *rp++ = (*p >> 4); v = *p++ << 4; + *rp++ = v + (*p >> 16); *rp++ = (*p >> 8); 
/*
 * Decode eight 20-bit values from 20 input bytes.
 *
 * Two 20-bit values occupy exactly five bytes, so the stream is handled as
 * four identical 5-byte groups. Returns the address past the consumed input.
 */
static uint8_t *
unpack_20(uint32_t *p, uint8_t *dp)
{
  int group;
  for (group = 0; group < 4; group++, p += 2, dp += 5) {
    /* first value: bytes 0 and 1 plus the high nibble of byte 2 */
    p[0] = ((uint32_t)dp[0] << 12) + ((uint32_t)dp[1] << 4) + (dp[2] >> 4);
    /* second value: the low nibble of byte 2 plus bytes 3 and 4 */
    p[1] = (((uint32_t)dp[2] << 16) & 0xfffff) +
           ((uint32_t)dp[3] << 8) + dp[4];
  }
  return dp;
}
/*
 * Pack eight 22-bit values from p[0..7] into 22 output bytes, most
 * significant bit first.
 *
 * Four 22-bit values fill exactly eleven bytes, so the work is done as two
 * identical 11-byte groups. Returns the address past the written output.
 */
static uint8_t *
pack_22(uint32_t *p, uint8_t *rp)
{
  int group;
  for (group = 0; group < 2; group++, p += 4, rp += 11) {
    rp[0]  = (uint8_t)(p[0] >> 14);
    rp[1]  = (uint8_t)(p[0] >> 6);
    /* low 6 bits of value 0 share a byte with the top 2 bits of value 1 */
    rp[2]  = (uint8_t)((uint8_t)(p[0] << 2) + (p[1] >> 20));
    rp[3]  = (uint8_t)(p[1] >> 12);
    rp[4]  = (uint8_t)(p[1] >> 4);
    /* low 4 bits of value 1 share a byte with the top 4 bits of value 2 */
    rp[5]  = (uint8_t)((uint8_t)(p[1] << 4) + (p[2] >> 18));
    rp[6]  = (uint8_t)(p[2] >> 10);
    rp[7]  = (uint8_t)(p[2] >> 2);
    /* low 2 bits of value 2 share a byte with the top 6 bits of value 3 */
    rp[8]  = (uint8_t)((uint8_t)(p[2] << 6) + (p[3] >> 16));
    rp[9]  = (uint8_t)(p[3] >> 8);
    rp[10] = (uint8_t)p[3];
  }
  return rp;
}
/*
 * Pack eight 24-bit values from p[0..7] into 24 output bytes; each value is
 * written as three bytes, most significant byte first.
 *
 * Returns the address past the written output.
 */
static uint8_t *
pack_24(uint32_t *p, uint8_t *rp)
{
  int n;
  for (n = 0; n < 8; n++) {
    uint32_t value = p[n];
    *rp++ = (uint8_t)(value >> 16);
    *rp++ = (uint8_t)(value >> 8);
    *rp++ = (uint8_t)value;
  }
  return rp;
}
/*
 * Decode eight 25-bit values from the 25-byte, most-significant-bit-first
 * stream at dp into p[0..7].
 *
 * All arithmetic is unsigned: the unrolled form shifted an int-promoted byte
 * left by 24, which overflows a signed int (undefined behaviour) whenever
 * the byte is >= 0x80.
 *
 * Returns the address just past the 25 bytes that were consumed.
 */
static uint8_t *
unpack_25(uint32_t *p, uint8_t *dp)
{
  uint64_t acc = 0;        /* fetched bits that have not been emitted yet */
  unsigned int nbits = 0;  /* number of valid low bits in acc */
  int n;
  for (n = 0; n < 8; n++) {
    while (nbits < 25) {
      acc = (acc << 8) | *dp++;
      nbits += 8;
    }
    nbits -= 25;
    *p++ = (uint32_t)(acc >> nbits) & 0x1ffffff;
    /* Drop the emitted value; keep the leftover bits of the last byte. */
    acc &= (((uint64_t)1 << nbits) - 1);
  }
  return dp;
}
/*
 * Decode eight 26-bit values from 26 input bytes (most significant bit
 * first) into p[0..7].
 *
 * Four 26-bit values span exactly thirteen bytes, so the input is handled
 * as two identical 13-byte groups. Bytes are widened to uint32_t before
 * shifting: shifting the int-promoted byte left by 24 overflows a signed
 * int (undefined behaviour) when the byte is >= 0x80.
 *
 * Returns the address just past the 26 bytes that were consumed.
 */
static uint8_t *
unpack_26(uint32_t *p, uint8_t *dp)
{
  const uint32_t mask = 0x3ffffff;
  int group;
  for (group = 0; group < 2; group++, p += 4, dp += 13) {
    p[0] = ((uint32_t)dp[0] << 18) + ((uint32_t)dp[1] << 10) +
           ((uint32_t)dp[2] << 2) + (dp[3] >> 6);
    p[1] = (((uint32_t)dp[3] << 20) & mask) + ((uint32_t)dp[4] << 12) +
           ((uint32_t)dp[5] << 4) + (dp[6] >> 4);
    p[2] = (((uint32_t)dp[6] << 22) & mask) + ((uint32_t)dp[7] << 14) +
           ((uint32_t)dp[8] << 6) + (dp[9] >> 2);
    p[3] = (((uint32_t)dp[9] << 24) & mask) + ((uint32_t)dp[10] << 16) +
           ((uint32_t)dp[11] << 8) + dp[12];
  }
  return dp;
}
/*
 * Decode eight 27-bit values from the 27-byte, most-significant-bit-first
 * stream at dp into p[0..7].
 *
 * All arithmetic is unsigned: the unrolled form shifted int-promoted bytes
 * left by 24, 25 and 26 bits, which overflows a signed int (undefined
 * behaviour) for large byte values.
 *
 * Returns the address just past the 27 bytes that were consumed.
 */
static uint8_t *
unpack_27(uint32_t *p, uint8_t *dp)
{
  uint64_t acc = 0;        /* fetched bits that have not been emitted yet */
  unsigned int nbits = 0;  /* number of valid low bits in acc */
  int n;
  for (n = 0; n < 8; n++) {
    while (nbits < 27) {
      acc = (acc << 8) | *dp++;
      nbits += 8;
    }
    nbits -= 27;
    *p++ = (uint32_t)(acc >> nbits) & 0x7ffffff;
    /* Drop the emitted value; keep the leftover bits of the last byte. */
    acc &= (((uint64_t)1 << nbits) - 1);
  }
  return dp;
}
/*
 * Pack eight 29-bit values from p[0..7] into 29 output bytes, most
 * significant bit first.
 *
 * `shift` tracks how the current value lines up with the byte being built:
 *   shift < 0 : emit bits [-shift .. -shift+7] of the value, same value next
 *   shift > 0 : the value's remaining low bits start a new byte (held in
 *               `carry`), then move on to the next value
 *   shift == 0: the value's low byte completes the output byte
 *
 * Returns the address past the written output.
 */
static uint8_t *
pack_29(uint32_t *p, uint8_t *rp)
{
  uint32_t *end = p + 8;
  uint8_t carry = 0;
  int shift = 8 - 29;
  while (p < end) {
    if (shift < 0) {
      *rp++ = carry + (*p >> -shift);
      carry = 0;
      shift += 8;
    } else if (shift > 0) {
      carry = *p++ << shift;
      shift -= 29;
    } else {
      *rp++ = carry + *p++;
      carry = 0;
      shift = 8 - 29;
    }
  }
  return rp;
}
/*
 * Decode eight 30-bit values from the 30-byte, most-significant-bit-first
 * stream at dp into p[0..7].
 *
 * All arithmetic is unsigned: the unrolled form shifted int-promoted bytes
 * left by 24, 26 and 28 bits, which overflows a signed int (undefined
 * behaviour) for large byte values.
 *
 * Returns the address just past the 30 bytes that were consumed.
 */
static uint8_t *
unpack_30(uint32_t *p, uint8_t *dp)
{
  uint64_t acc = 0;        /* fetched bits that have not been emitted yet */
  unsigned int nbits = 0;  /* number of valid low bits in acc */
  int n;
  for (n = 0; n < 8; n++) {
    while (nbits < 30) {
      acc = (acc << 8) | *dp++;
      nbits += 8;
    }
    nbits -= 30;
    *p++ = (uint32_t)(acc >> nbits) & 0x3fffffff;
    /* Drop the emitted value; keep the leftover bits of the last byte. */
    acc &= (((uint64_t)1 << nbits) - 1);
  }
  return dp;
}
/*
 * Decode eight 31-bit values from the 31-byte, most-significant-bit-first
 * stream at dp into p[0..7].
 *
 * All arithmetic is unsigned: the unrolled form shifted int-promoted bytes
 * left by 24 to 30 bits, which overflows a signed int (undefined behaviour)
 * for most byte values.
 *
 * Returns the address just past the 31 bytes that were consumed.
 */
static uint8_t *
unpack_31(uint32_t *p, uint8_t *dp)
{
  uint64_t acc = 0;        /* fetched bits that have not been emitted yet */
  unsigned int nbits = 0;  /* number of valid low bits in acc */
  int n;
  for (n = 0; n < 8; n++) {
    while (nbits < 31) {
      acc = (acc << 8) | *dp++;
      nbits += 8;
    }
    nbits -= 31;
    *p++ = (uint32_t)(acc >> nbits) & 0x7fffffff;
    /* Drop the emitted value; keep the leftover bits of the last byte. */
    acc &= (((uint64_t)1 << nbits) - 1);
  }
  return dp;
}
/*
 * Decode eight 32-bit values, each stored as four bytes with the most
 * significant byte first, from dp into p[0..7].
 *
 * Each byte is widened to uint32_t before shifting: shifting the
 * int-promoted byte left by 24 overflows a signed int (undefined behaviour)
 * when the byte is >= 0x80.
 *
 * Returns the address just past the 32 bytes that were consumed.
 */
static uint8_t *
unpack_32(uint32_t *p, uint8_t *dp)
{
  int n;
  for (n = 0; n < 8; n++, dp += 4) {
    p[n] = ((uint32_t)dp[0] << 24) + ((uint32_t)dp[1] << 16) +
           ((uint32_t)dp[2] << 8) + dp[3];
  }
  return dp;
}
+ case 26 : rp = pack_26(p, rp); break; + case 27 : rp = pack_27(p, rp); break; + case 28 : rp = pack_28(p, rp); break; + case 29 : rp = pack_29(p, rp); break; + case 30 : rp = pack_30(p, rp); break; + case 31 : rp = pack_31(p, rp); break; + case 32 : rp = pack_32(p, rp); break; + } + p += 8; + i -= 8; + } + { + int b; + uint8_t v; + uint32_t *pe = p + i; + for (b = 8 - w, v = 0; p < pe;) { + if (b > 0) { + v += *p++ << b; + b -= w; + } else if (b < 0) { + *rp++ = v + (*p >> -b); + b += 8; + v = 0; + } else { + *rp++ = v + *p++; + b = 8 - w; + v = 0; + } + } + if (b + w != 8) { *rp++ = v; } + return rp; + } +} + +static uint8_t * +pack(uint32_t *p, uint32_t i, uint8_t *freq, uint8_t *rp) +{ + int32_t k, w; + uint8_t ebuf[UNIT_SIZE], *ep = ebuf; + uint32_t s, *pe = p + i, r, th = i - (i >> 3); + for (w = 0, s = 0; w <= 32; w++) { + if ((s += freq[w]) >= th) { break; } + } + if (i == s) { + *rp++ = w; + return pack_(p, i, w, rp); + } + r = 1 << w; + *rp++ = w + 0x80; + *rp++ = i - s; + if (r >= UNIT_SIZE) { + uint32_t first, *last = &first; + for (k = 0; p < pe; p++, k++) { + if (*p >= r) { + GRN_B_ENC(*p - r, ep); + *last = k; + last = p; + } + } + *last = 0; + *rp++ = (uint8_t) first; + } else { + for (k = 0; p < pe; p++, k++) { + if (*p >= r) { + *ep++ = k; + GRN_B_ENC(*p - r, ep); + *p = 0; + } + } + } + rp = pack_(p - i, i, w, rp); + grn_memcpy(rp, ebuf, ep - ebuf); + return rp + (ep - ebuf); +} + +int +grn_p_enc(grn_ctx *ctx, uint32_t *data, uint32_t data_size, uint8_t **res) +{ + uint8_t *rp, freq[33]; + uint32_t j, *dp, *dpe, d, w, buf[UNIT_SIZE]; + *res = rp = GRN_MALLOC(data_size * sizeof(uint32_t) * 2); + GRN_B_ENC(data_size, rp); + memset(freq, 0, 33); + for (j = 0, dp = data, dpe = dp + data_size; dp < dpe; j++, dp++) { + if (j == UNIT_SIZE) { + rp = pack(buf, j, freq, rp); + memset(freq, 0, 33); + j = 0; + } + if ((d = buf[j] = *dp)) { + GRN_BIT_SCAN_REV(d, w); + freq[w + 1]++; + } else { + freq[0]++; + } + } + if (j) { rp = pack(buf, j, freq, rp); } + 
return rp - *res; +} + +#define USE_P_ENC (1<<0) /* Use PForDelta */ +#define CUT_OFF (1<<1) /* Deprecated */ +#define ODD (1<<2) /* Variable size data */ + +typedef struct { + uint32_t *data; + uint32_t data_size; + uint32_t flags; +} datavec; + +static grn_rc +datavec_reset(grn_ctx *ctx, datavec *dv, uint32_t dvlen, + size_t unitsize, size_t totalsize) +{ + int i; + if (!dv[0].data || dv[dvlen].data < dv[0].data + totalsize) { + if (dv[0].data) { GRN_FREE(dv[0].data); } + if (!(dv[0].data = GRN_MALLOC(totalsize * sizeof(uint32_t)))) { + MERR("[ii][data-vector][reset] failed to allocate data: " + "length:<%u>, " + "unit-size:<%" GRN_FMT_SIZE ">, " + "total-size:<%" GRN_FMT_SIZE ">", + dvlen, + unitsize, + totalsize); + return ctx->rc; + } + dv[dvlen].data = dv[0].data + totalsize; + } + for (i = 1; i < dvlen; i++) { + dv[i].data = dv[i - 1].data + unitsize; + } + return GRN_SUCCESS; +} + +static grn_rc +datavec_init(grn_ctx *ctx, datavec *dv, uint32_t dvlen, + size_t unitsize, size_t totalsize) +{ + int i; + if (!totalsize) { + memset(dv, 0, sizeof(datavec) * (dvlen + 1)); + return GRN_SUCCESS; + } + if (!(dv[0].data = GRN_MALLOC(totalsize * sizeof(uint32_t)))) { + MERR("[ii][data-vector][init] failed to allocate data: " + "length:<%u>, " + "unit-size:<%" GRN_FMT_SIZE ">, " + "total-size:<%" GRN_FMT_SIZE ">", + dvlen, + unitsize, + totalsize); + return ctx->rc; + } + dv[dvlen].data = dv[0].data + totalsize; + for (i = 1; i < dvlen; i++) { + dv[i].data = dv[i - 1].data + unitsize; + } + return GRN_SUCCESS; +} + +static void +datavec_fin(grn_ctx *ctx, datavec *dv) +{ + if (dv[0].data) { GRN_FREE(dv[0].data); } +} + +size_t +grn_p_encv(grn_ctx *ctx, datavec *dv, uint32_t dvlen, uint8_t *res) +{ + uint8_t *rp = res, freq[33]; + uint32_t pgap, usep, l, df, data_size, *dp, *dpe; + if (!dvlen || !(df = dv[0].data_size)) { return 0; } + for (usep = 0, data_size = 0, l = 0; l < dvlen; l++) { + uint32_t dl = dv[l].data_size; + if (dl < df || ((dl > df) && (l != dvlen - 1))) 
{ + /* invalid argument */ + return 0; + } + usep += (dv[l].flags & USE_P_ENC) << l; + data_size += dl; + } + pgap = data_size - df * dvlen; + if (!usep) { + GRN_B_ENC((df << 1) + 1, rp); + for (l = 0; l < dvlen; l++) { + for (dp = dv[l].data, dpe = dp + dv[l].data_size; dp < dpe; dp++) { + GRN_B_ENC(*dp, rp); + } + } + } else { + uint32_t buf[UNIT_SIZE]; + GRN_B_ENC((usep << 1), rp); + GRN_B_ENC(df, rp); + if (dv[dvlen - 1].flags & ODD) { + GRN_B_ENC(pgap, rp); + } else { + GRN_ASSERT(!pgap); + } + for (l = 0; l < dvlen; l++) { + dp = dv[l].data; + dpe = dp + dv[l].data_size; + if ((dv[l].flags & USE_P_ENC)) { + uint32_t j = 0, d; + memset(freq, 0, 33); + while (dp < dpe) { + if (j == UNIT_SIZE) { + rp = pack(buf, j, freq, rp); + memset(freq, 0, 33); + j = 0; + } + if ((d = buf[j++] = *dp++)) { + uint32_t w; + GRN_BIT_SCAN_REV(d, w); + freq[w + 1]++; + } else { + freq[0]++; + } + } + if (j) { rp = pack(buf, j, freq, rp); } + } else { + while (dp < dpe) { GRN_B_ENC(*dp++, rp); } + } + } + } + return rp - res; +} + +#define GRN_B_DEC_CHECK(v,p,pe) do { \ + uint8_t *_p = (uint8_t *)p; \ + uint32_t _v; \ + if (_p >= pe) { return 0; } \ + _v = *_p++; \ + switch (_v >> 4) { \ + case 0x08 : \ + if (_v == 0x8f) { \ + if (_p + sizeof(uint32_t) > pe) { return 0; } \ + grn_memcpy(&_v, _p, sizeof(uint32_t)); \ + _p += sizeof(uint32_t); \ + } \ + break; \ + case 0x09 : \ + if (_p + 3 > pe) { return 0; } \ + _v = (_v - 0x90) * 0x100 + *_p++; \ + _v = _v * 0x100 + *_p++; \ + _v = _v * 0x100 + *_p++ + 0x20408f; \ + break; \ + case 0x0a : \ + case 0x0b : \ + if (_p + 2 > pe) { return 0; } \ + _v = (_v - 0xa0) * 0x100 + *_p++; \ + _v = _v * 0x100 + *_p++ + 0x408f; \ + break; \ + case 0x0c : \ + case 0x0d : \ + case 0x0e : \ + case 0x0f : \ + if (_p + 1 > pe) { return 0; } \ + _v = (_v - 0xc0) * 0x100 + *_p++ + 0x8f; \ + break; \ + } \ + v = _v; \ + p = _p; \ +} while (0) + +static uint8_t * +unpack(uint8_t *dp, uint8_t *dpe, int i, uint32_t *rp) +{ + uint8_t ne = 0, k = 0, w = 
*dp++; + uint32_t m, *p = rp; + if (w & 0x80) { + ne = *dp++; + w -= 0x80; + m = (1 << w) - 1; + if (m >= UNIT_MASK) { k = *dp++; } + } else { + m = (1 << w) - 1; + } + if (w) { + while (i >= 8) { + if (dp + w > dpe) { return NULL; } + switch (w) { + case 1 : dp = unpack_1(p, dp); break; + case 2 : dp = unpack_2(p, dp); break; + case 3 : dp = unpack_3(p, dp); break; + case 4 : dp = unpack_4(p, dp); break; + case 5 : dp = unpack_5(p, dp); break; + case 6 : dp = unpack_6(p, dp); break; + case 7 : dp = unpack_7(p, dp); break; + case 8 : dp = unpack_8(p, dp); break; + case 9 : dp = unpack_9(p, dp); break; + case 10 : dp = unpack_10(p, dp); break; + case 11 : dp = unpack_11(p, dp); break; + case 12 : dp = unpack_12(p, dp); break; + case 13 : dp = unpack_13(p, dp); break; + case 14 : dp = unpack_14(p, dp); break; + case 15 : dp = unpack_15(p, dp); break; + case 16 : dp = unpack_16(p, dp); break; + case 17 : dp = unpack_17(p, dp); break; + case 18 : dp = unpack_18(p, dp); break; + case 19 : dp = unpack_19(p, dp); break; + case 20 : dp = unpack_20(p, dp); break; + case 21 : dp = unpack_21(p, dp); break; + case 22 : dp = unpack_22(p, dp); break; + case 23 : dp = unpack_23(p, dp); break; + case 24 : dp = unpack_24(p, dp); break; + case 25 : dp = unpack_25(p, dp); break; + case 26 : dp = unpack_26(p, dp); break; + case 27 : dp = unpack_27(p, dp); break; + case 28 : dp = unpack_28(p, dp); break; + case 29 : dp = unpack_29(p, dp); break; + case 30 : dp = unpack_30(p, dp); break; + case 31 : dp = unpack_31(p, dp); break; + case 32 : dp = unpack_32(p, dp); break; + } + i -= 8; + p += 8; + } + { + int b; + uint32_t v, *pe; + for (b = 8 - w, v = 0, pe = p + i; p < pe && dp < dpe;) { + if (b > 0) { + *p++ = v + ((*dp >> b) & m); + b -= w; + v = 0; + } else if (b < 0) { + v += (*dp++ << -b) & m; + b += 8; + } else { + *p++ = v + (*dp++ & m); + b = 8 - w; + v = 0; + } + } + if (b + w != 8) { dp++; } + } + } else { + memset(p, 0, sizeof(uint32_t) * i); + } + if (ne) { + if (m >= 
UNIT_MASK) { + uint32_t *pp; + while (ne--) { + pp = &rp[k]; + k = *pp; + GRN_B_DEC_CHECK(*pp, dp, dpe); + *pp += (m + 1); + } + } else { + while (ne--) { + k = *dp++; + GRN_B_DEC_CHECK(rp[k], dp, dpe); + rp[k] += (m + 1); + } + } + } + return dp; +} + +int +grn_p_dec(grn_ctx *ctx, uint8_t *data, uint32_t data_size, uint32_t nreq, uint32_t **res) +{ + uint8_t *dp = data, *dpe = data + data_size; + uint32_t rest, orig_size, *rp, *rpe; + GRN_B_DEC(orig_size, dp); + if (!orig_size) { + if (!nreq || nreq > data_size) { nreq = data_size; } + if ((*res = rp = GRN_MALLOC(nreq * 4))) { + for (rpe = rp + nreq; dp < data + data_size && rp < rpe; rp++) { + GRN_B_DEC(*rp, dp); + } + } + return rp - *res; + } else { + if (!(*res = rp = GRN_MALLOC(orig_size * sizeof(uint32_t)))) { + return 0; + } + if (!nreq || nreq > orig_size) { nreq = orig_size; } + for (rest = nreq; rest >= UNIT_SIZE; rest -= UNIT_SIZE) { + if (!(dp = unpack(dp, dpe, UNIT_SIZE, rp))) { return 0; } + rp += UNIT_SIZE; + } + if (rest) { if (!(dp = unpack(dp, dpe, rest, rp))) { return 0; } } + GRN_ASSERT(data + data_size == dp); + return nreq; + } +} + +int +grn_p_decv(grn_ctx *ctx, uint8_t *data, uint32_t data_size, datavec *dv, uint32_t dvlen) +{ + size_t size; + uint32_t df, l, i, *rp, nreq; + uint8_t *dp = data, *dpe = data + data_size; + if (!data_size) { + dv[0].data_size = 0; + return 0; + } + for (nreq = 0; nreq < dvlen; nreq++) { + if (dv[nreq].flags & CUT_OFF) { break; } + } + if (!nreq) { return 0; } + GRN_B_DEC_CHECK(df, dp, dpe); + if ((df & 1)) { + df >>= 1; + size = nreq == dvlen ? 
data_size : df * nreq; + if (dv[dvlen].data < dv[0].data + size) { + if (dv[0].data) { GRN_FREE(dv[0].data); } + if (!(rp = GRN_MALLOC(size * sizeof(uint32_t)))) { return 0; } + dv[dvlen].data = rp + size; + } else { + rp = dv[0].data; + } + for (l = 0; l < dvlen; l++) { + if (dv[l].flags & CUT_OFF) { break; } + dv[l].data = rp; + if (l < dvlen - 1) { + for (i = 0; i < df; i++, rp++) { GRN_B_DEC_CHECK(*rp, dp, dpe); } + } else { + for (i = 0; dp < dpe; i++, rp++) { GRN_B_DEC_CHECK(*rp, dp, dpe); } + } + dv[l].data_size = i; + } + } else { + uint32_t n, rest, usep = df >> 1; + GRN_B_DEC_CHECK(df, dp, dpe); + if (dv[dvlen -1].flags & ODD) { + GRN_B_DEC_CHECK(rest, dp, dpe); + } else { + rest = 0; + } + size = df * nreq + (nreq == dvlen ? rest : 0); + if (dv[dvlen].data < dv[0].data + size) { + if (dv[0].data) { GRN_FREE(dv[0].data); } + if (!(rp = GRN_MALLOC(size * sizeof(uint32_t)))) { return 0; } + dv[dvlen].data = rp + size; + } else { + rp = dv[0].data; + } + for (l = 0; l < dvlen; l++) { + if (dv[l].flags & CUT_OFF) { break; } + dv[l].data = rp; + dv[l].data_size = n = (l < dvlen - 1) ? 
df : df + rest; + if (usep & (1 << l)) { + for (; n >= UNIT_SIZE; n -= UNIT_SIZE) { + if (!(dp = unpack(dp, dpe, UNIT_SIZE, rp))) { return 0; } + rp += UNIT_SIZE; + } + if (n) { + if (!(dp = unpack(dp, dpe, n, rp))) { return 0; } + rp += n; + } + dv[l].flags |= USE_P_ENC; + } else { + for (; n; n--, rp++) { + GRN_B_DEC_CHECK(*rp, dp, dpe); + } + } + } + GRN_ASSERT(dp == dpe); + if (dp != dpe) { + GRN_LOG(ctx, GRN_LOG_DEBUG, "data_size=%d, %" GRN_FMT_LLD, + data_size, (long long int)(dpe - dp)); + } + } + return rp - dv[0].data; +} + +int +grn_b_enc(grn_ctx *ctx, uint32_t *data, uint32_t data_size, uint8_t **res) +{ + uint8_t *rp; + uint32_t *dp, i; + *res = rp = GRN_MALLOC(data_size * sizeof(uint32_t) * 2); + GRN_B_ENC(data_size, rp); + for (i = data_size, dp = data; i; i--, dp++) { + GRN_B_ENC(*dp, rp); + } + return rp - *res; +} + +int +grn_b_dec(grn_ctx *ctx, uint8_t *data, uint32_t data_size, uint32_t **res) +{ + uint32_t i, *rp, orig_size; + uint8_t *dp = data; + GRN_B_DEC(orig_size, dp); + *res = rp = GRN_MALLOC(orig_size * sizeof(uint32_t)); + for (i = orig_size; i; i--, rp++) { + GRN_B_DEC(*rp, dp); + } + return orig_size; +} + +/* buffer */ + +typedef struct { + uint32_t tid; + uint32_t size_in_chunk; + uint32_t pos_in_chunk; + uint16_t size_in_buffer; + uint16_t pos_in_buffer; +} buffer_term; + +typedef struct { + uint16_t step; + uint16_t jump; +} buffer_rec; + +typedef struct { + uint32_t chunk; + uint32_t chunk_size; + uint32_t buffer_free; + uint16_t nterms; + uint16_t nterms_void; +} buffer_header; + +struct grn_ii_buffer { + buffer_header header; + buffer_term terms[(S_SEGMENT - sizeof(buffer_header))/sizeof(buffer_term)]; +}; + +typedef struct grn_ii_buffer buffer; + +inline static uint32_t +buffer_open(grn_ctx *ctx, grn_ii *ii, uint32_t pos, buffer_term **bt, buffer **b) +{ + byte *p = NULL; + uint16_t lseg = (uint16_t) (LSEG(pos)); + uint32_t pseg = ii->header->binfo[lseg]; + if (pseg != GRN_II_PSEG_NOT_ASSIGNED) { + GRN_IO_SEG_REF(ii->seg, pseg, 
p);
    if (!p) { return GRN_II_PSEG_NOT_ASSIGNED; }
    if (b) { *b = (buffer *)p; }
    if (bt) { *bt = (buffer_term *)(p + LPOS(pos)); }
  }
  return pseg;
}

/*
 * Release the segment reference taken by buffer_open().
 * Returns GRN_INVALID_ARGUMENT (and logs a notice) when `pseg` is not a
 * valid physical segment number.
 */
inline static grn_rc
buffer_close(grn_ctx *ctx, grn_ii *ii, uint32_t pseg)
{
  if (pseg >= ii->seg->header->max_segment) {
    GRN_LOG(ctx, GRN_LOG_NOTICE, "invalid pseg buffer_close(%d)", pseg);
    return GRN_INVALID_ARGUMENT;
  }
  GRN_IO_SEG_UNREF(ii->seg, pseg);
  return GRN_SUCCESS;
}

/* (record id, section id) pair identifying one posting. */
typedef struct {
  uint32_t rid;
  uint32_t sid;
} docid;

/* A record is marked deleted by storing 1 in its jump field. */
#define BUFFER_REC_DEL(r)  ((r)->jump = 1)
#define BUFFER_REC_DELETED(r) ((r)->jump == 1)

/* Convert between a record position (in buffer_rec units from the start of
 * the buffer) and a record pointer. */
#define BUFFER_REC_AT(b,pos)  ((buffer_rec *)(b) + (pos))
#define BUFFER_REC_POS(b,rec) ((uint16_t)((rec) - (buffer_rec *)(b)))

/*
 * Log (at debug level) the buffer header, the term entry and every record
 * reachable from the term by following `step` links.
 * Does nothing when debug logging is disabled.
 */
inline static void
buffer_term_dump(grn_ctx *ctx, grn_ii *ii, buffer *b, buffer_term *bt)
{
  int pos, rid, sid;
  uint8_t *p;
  buffer_rec *r;

  if (!grn_logger_pass(ctx, GRN_LOG_DEBUG)) {
    return;
  }

  GRN_LOG(ctx, GRN_LOG_DEBUG,
          "b=(%x %u %u %u)", b->header.chunk, b->header.chunk_size,
          b->header.buffer_free, b->header.nterms);
  GRN_LOG(ctx, GRN_LOG_DEBUG,
          "bt=(%u %u %u %u %u)", bt->tid, bt->size_in_chunk, bt->pos_in_chunk,
          bt->size_in_buffer, bt->pos_in_buffer);
  for (pos = bt->pos_in_buffer; pos; pos = r->step) {
    r = BUFFER_REC_AT(b, pos);
    p = GRN_NEXT_ADDR(r);
    GRN_B_DEC(rid, p);
    if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {
      GRN_B_DEC(sid, p);
    } else {
      sid = 1;
    }
    GRN_LOG(ctx, GRN_LOG_DEBUG,
            "%d=(%d:%d),(%d:%d)", pos, r->jump, r->step, rid, sid);
  }
}

/*
 * Validate a prospective jump link from record `r` to position `j`.
 *
 * j == 0 (no jump) and j == 1 (deleted marker) are accepted as is.
 * Otherwise the target must not step straight back to `r` and must hold a
 * strictly larger (rid, sid) than `r`; a violation is logged and reported
 * as GRN_FILE_CORRUPT.
 */
inline static grn_rc
check_jump(grn_ctx *ctx, grn_ii *ii, buffer *b, buffer_rec *r, int j)
{
  uint16_t i = BUFFER_REC_POS(b, r);
  uint8_t *p;
  buffer_rec *r2;
  docid id, id2;
  if (!j) { return GRN_SUCCESS; }
  p = GRN_NEXT_ADDR(r);
  GRN_B_DEC(id.rid, p);
  if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {
    GRN_B_DEC(id.sid, p);
  } else {
    id.sid = 1;
  }
  if (j == 1) {
    GRN_LOG(ctx, GRN_LOG_DEBUG, "deleting! %d(%d:%d)", i, id.rid, id.sid);
    return GRN_SUCCESS;
  }
  r2 = BUFFER_REC_AT(b, j);
  p = GRN_NEXT_ADDR(r2);
  GRN_B_DEC(id2.rid, p);
  if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {
    GRN_B_DEC(id2.sid, p);
  } else {
    id2.sid = 1;
  }
  if (r2->step == i) {
    GRN_LOG(ctx, GRN_LOG_EMERG, "cycle! %d(%d:%d)<->%d(%d:%d)",
            i, id.rid, id.sid, j, id2.rid, id2.sid);
    return GRN_FILE_CORRUPT;
  }
  if (id2.rid < id.rid || (id2.rid == id.rid && id2.sid <= id.sid)) {
    GRN_LOG(ctx, GRN_LOG_CRIT,
            "invalid jump! %d(%d:%d)(%d:%d)->%d(%d:%d)(%d:%d)",
            i, r->jump, r->step, id.rid, id.sid, j, r2->jump, r2->step,
            id2.rid, id2.sid);
    return GRN_FILE_CORRUPT;
  }
  return GRN_SUCCESS;
}

/*
 * Starting at `from`, install `to` as the jump target and push the jump
 * that was displaced onto the following record (reached through `step`),
 * repeating for at most 100 records.  The walk stops early when the target
 * is deleted, already in place, or equal to the record's own step link.
 * Returns GRN_FILE_CORRUPT when check_jump() rejects a link or the step
 * chain ends unexpectedly.
 */
inline static grn_rc
set_jump_r(grn_ctx *ctx, grn_ii *ii, buffer *b, buffer_rec *from, int to)
{
  int i, j, max_jump = 100;
  buffer_rec *r, *r2;
  for (r = from, j = to; j > 1 && max_jump--; r = BUFFER_REC_AT(b, r->step)) {
    r2 = BUFFER_REC_AT(b, j);
    if (r == r2) { break; }
    if (BUFFER_REC_DELETED(r2)) { break; }
    if (j == (i = r->jump)) { break; }
    if (j == r->step) { break; }
    if (check_jump(ctx, ii, b, r, j)) {
      ERR(GRN_FILE_CORRUPT, "check_jump failed");
      return ctx->rc;
    }
    r->jump = j;
    j = i;
    if (!r->step) { return GRN_FILE_CORRUPT; }
  }
  return GRN_SUCCESS;
}

/* Population count: store the number of set bits of the 32-bit value `x`
 * in `n`. */
#define GET_NUM_BITS(x,n) do {\
  n = x;\
  n = (n & 0x55555555) + ((n >> 1) & 0x55555555);\
  n = (n & 0x33333333) + ((n >> 2) & 0x33333333);\
  n = (n & 0x0F0F0F0F) + ((n >> 4) & 0x0F0F0F0F);\
  n = (n & 0x00FF00FF) + ((n >> 8) & 0x00FF00FF);\
  n = (n & 0x0000FFFF) + ((n >>16) & 0x0000FFFF);\
} while (0)

/*
 * Insert the encoded record `bs` (stored at `rnew`) into the record list of
 * term `bt`, keeping the list ordered by (rid, sid).
 */
inline static grn_rc
buffer_put(grn_ctx *ctx, grn_ii *ii, buffer *b, buffer_term *bt,
           buffer_rec *rnew, uint8_t *bs, grn_ii_updspec *u, int size)
{
  uint8_t *p;
  docid id_curr = {0, 0}, id_start = {0, 0}, id_post = {0, 0};
  buffer_rec *r_curr, *r_start = NULL;
  uint16_t last = 0, *lastp = &bt->pos_in_buffer, pos = BUFFER_REC_POS(b, rnew);
  int vdelta = 0, delta, delta0 = 0, vhops
= 0, nhops = 0, reset = 1;
  grn_memcpy(GRN_NEXT_ADDR(rnew), bs, size - sizeof(buffer_rec));
  for (;;) {
    /* End of the list reached: append the new record. */
    if (!*lastp) {
      rnew->step = 0;
      rnew->jump = 0;
      // smb_wmb();
      *lastp = pos;
      if (bt->size_in_buffer++ > 1) {
        buffer_rec *rhead = BUFFER_REC_AT(b, bt->pos_in_buffer);
        rhead->jump = pos;
        if (!(bt->size_in_buffer & 1)) {
          int n;
          buffer_rec *r = BUFFER_REC_AT(b, rhead->step), *r2;
          GET_NUM_BITS(bt->size_in_buffer, n);
          while (n-- && (r->jump > 1)) {
            r2 = BUFFER_REC_AT(b, r->jump);
            if (BUFFER_REC_DELETED(r2)) { break; }
            r = r2;
          }
          if (r != rnew) { set_jump_r(ctx, ii, b, r, last); }
        }
      }
      break;
    }
    r_curr = BUFFER_REC_AT(b, *lastp);
    p = GRN_NEXT_ADDR(r_curr);
    GRN_B_DEC(id_curr.rid, p);
    if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {
      GRN_B_DEC(id_curr.sid, p);
    } else {
      id_curr.sid = 1;
    }
    /* Ids must never decrease while walking the list.  If they do, the
     * list is reported as corrupt, dumped, and reset to empty before the
     * insertion is retried. */
    if (id_curr.rid < id_post.rid ||
        (id_curr.rid == id_post.rid && id_curr.sid < id_post.sid)) {
      {
        DEFINE_NAME(ii);
        CRIT(GRN_FILE_CORRUPT,
             "[ii][buffer][put] loop is found: "
             "<%.*s>: "
             "(%d:%d)->(%d:%d)",
             name_size, name,
             id_post.rid, id_post.sid, id_curr.rid, id_curr.sid);
      }
      buffer_term_dump(ctx, ii, b, bt);
      bt->pos_in_buffer = 0;
      bt->size_in_buffer = 0;
      lastp = &bt->pos_in_buffer;
      continue;
    }
    id_post.rid = id_curr.rid;
    id_post.sid = id_curr.sid;
    /* Insertion point found: the current record is >= the new one. */
    if (u->rid < id_curr.rid || (u->rid == id_curr.rid && u->sid <= id_curr.sid)) {
      uint16_t step = *lastp, jump = r_curr->jump;
      if (u->rid == id_curr.rid) {
        if (u->sid == 0) {
          /* sid 0: mark every record of this rid as deleted. */
          while (id_curr.rid == u->rid) {
            BUFFER_REC_DEL(r_curr);
            if (!(step = r_curr->step)) { break; }
            r_curr = BUFFER_REC_AT(b, step);
            p = GRN_NEXT_ADDR(r_curr);
            GRN_B_DEC(id_curr.rid, p);
            if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {
              GRN_B_DEC(id_curr.sid, p);
            } else {
              id_curr.sid = 1;
            }
          }
        } else if (u->sid == id_curr.sid) {
          /* Same (rid, sid): the new record supersedes the old one. */
          BUFFER_REC_DEL(r_curr);
          step = r_curr->step;
        }
      }
      rnew->step = step;
      rnew->jump = check_jump(ctx, ii, b, rnew, jump) ? 0 : jump;
      // smb_wmb();
      *lastp = pos;
      break;
    }

    if (reset) {
      r_start = r_curr;
      id_start.rid = id_curr.rid;
      id_start.sid = id_curr.sid;
      if (!(delta0 = u->rid - id_start.rid)) { delta0 = u->sid - id_start.sid; }
      nhops = 0;
      vhops = 1;
      vdelta = delta0 >> 1;
    } else {
      if (!(delta = id_curr.rid - id_start.rid)) {
        delta = id_curr.sid - id_start.sid;
      }
      if (vdelta < delta) {
        vdelta += (delta0 >> ++vhops);
        r_start = r_curr;
      }
      if (nhops > vhops) {
        set_jump_r(ctx, ii, b, r_start, *lastp);
      } else {
        nhops++;
      }
    }

    last = *lastp;
    lastp = &r_curr->step;
    reset = 0;
    /* Follow the jump link when its target is live and still smaller than
     * the new record; otherwise advance by one step. */
    {
      uint16_t posj = r_curr->jump;
      if (posj > 1) {
        buffer_rec *rj = BUFFER_REC_AT(b, posj);
        if (!BUFFER_REC_DELETED(rj)) {
          docid idj;
          p = GRN_NEXT_ADDR(rj);
          GRN_B_DEC(idj.rid, p);
          if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {
            GRN_B_DEC(idj.sid, p);
          } else {
            idj.sid = 1;
          }
          if (idj.rid < u->rid || (idj.rid == u->rid && idj.sid < u->sid)) {
            last = posj;
            lastp = &rj->step;
          } else {
            reset = 1;
          }
        }
      }
    }
  }
  return ctx->rc;
}

/* array */

/*
 * Return a pointer to the array element of term `id`, or NULL when the id
 * is out of range, its segment is not assigned, or the segment cannot be
 * referenced.  On success a segment reference is held; callers in this file
 * release it with array_unref().
 */
inline static uint32_t *
array_at(grn_ctx *ctx, grn_ii *ii, uint32_t id)
{
  byte *p = NULL;
  uint32_t seg, pseg;
  if (id > GRN_ID_MAX) { return NULL; }
  seg = id >> W_ARRAY;
  if ((pseg = ii->header->ainfo[seg]) == GRN_II_PSEG_NOT_ASSIGNED) {
    return NULL;
  }
  GRN_IO_SEG_REF(ii->seg, pseg, p);
  if (!p) { return NULL; }
  return (uint32_t *)(p + (id & ARRAY_MASK_IN_A_SEGMENT) * S_ARRAY_ELEMENT);
}

/*
 * Like array_at(), but assigns a segment via segment_get_clear() when the
 * array segment for `id` does not exist yet, and raises header->amax to
 * cover it.
 */
inline static uint32_t *
array_get(grn_ctx *ctx, grn_ii *ii, uint32_t id)
{
  byte *p = NULL;
  uint16_t seg;
  uint32_t pseg;
  if (id > GRN_ID_MAX) { return NULL; }
  seg = id >> W_ARRAY;
  if ((pseg = ii->header->ainfo[seg]) == GRN_II_PSEG_NOT_ASSIGNED) {
    if (segment_get_clear(ctx, ii, &pseg)) { return NULL; }
    ii->header->ainfo[seg] = pseg;
    if (seg >= ii->header->amax) { ii->header->amax = seg + 1; }
  }
  GRN_IO_SEG_REF(ii->seg, pseg, p);
  if (!p) { return NULL; }
return (uint32_t *)(p + (id & ARRAY_MASK_IN_A_SEGMENT) * S_ARRAY_ELEMENT); +} + +inline static void +array_unref(grn_ii *ii, uint32_t id) +{ + GRN_IO_SEG_UNREF(ii->seg, ii->header->ainfo[id >> W_ARRAY]); +} + +/* updspec */ + +grn_ii_updspec * +grn_ii_updspec_open(grn_ctx *ctx, uint32_t rid, uint32_t sid) +{ + grn_ii_updspec *u; + if (!(u = GRN_MALLOC(sizeof(grn_ii_updspec)))) { return NULL; } + u->rid = rid; + u->sid = sid; + u->weight = 0; + u->tf = 0; + u->atf = 0; + u->pos = NULL; + u->tail = NULL; + // u->vnodes = NULL; + return u; +} + +#define GRN_II_MAX_TF 0x1ffff + +grn_rc +grn_ii_updspec_add(grn_ctx *ctx, grn_ii_updspec *u, int pos, int32_t weight) +{ + struct _grn_ii_pos *p; + u->atf++; + if (u->tf >= GRN_II_MAX_TF) { return GRN_SUCCESS; } + if (!(p = GRN_MALLOC(sizeof(struct _grn_ii_pos)))) { + return GRN_NO_MEMORY_AVAILABLE; + } + u->weight += weight; + p->pos = pos; + p->next = NULL; + if (u->tail) { + u->tail->next = p; + } else { + u->pos = p; + } + u->tail = p; + u->tf++; + return GRN_SUCCESS; +} + +int +grn_ii_updspec_cmp(grn_ii_updspec *a, grn_ii_updspec *b) +{ + struct _grn_ii_pos *pa, *pb; + if (a->rid != b->rid) { return a->rid - b->rid; } + if (a->sid != b->sid) { return a->sid - b->sid; } + if (a->weight != b->weight) { return a->weight - b->weight; } + if (a->tf != b->tf) { return a->tf - b->tf; } + for (pa = a->pos, pb = b->pos; pa && pb; pa = pa->next, pb = pb->next) { + if (pa->pos != pb->pos) { return pa->pos - pb->pos; } + } + if (pa) { return 1; } + if (pb) { return -1; } + return 0; +} + +grn_rc +grn_ii_updspec_close(grn_ctx *ctx, grn_ii_updspec *u) +{ + struct _grn_ii_pos *p = u->pos, *q; + while (p) { + q = p->next; + GRN_FREE(p); + p = q; + } + GRN_FREE(u); + return GRN_SUCCESS; +} + +inline static uint8_t * +encode_rec(grn_ctx *ctx, grn_ii *ii, grn_ii_updspec *u, unsigned int *size, int deletep) +{ + uint8_t *br, *p; + struct _grn_ii_pos *pp; + uint32_t lpos, tf, weight; + if (deletep) { + tf = 0; + weight = 0; + } else { + tf = 
u->tf; + weight = u->weight; + } + if (!(br = GRN_MALLOC((tf + 4) * 5))) { + return NULL; + } + p = br; + GRN_B_ENC(u->rid, p); + if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { + GRN_B_ENC(u->sid, p); + } else { + u->sid = 1; + } + GRN_B_ENC(tf, p); + if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { GRN_B_ENC(weight, p); } + if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) { + for (lpos = 0, pp = u->pos; pp && tf--; lpos = pp->pos, pp = pp->next) { + GRN_B_ENC(pp->pos - lpos, p); + } + } + while (((intptr_t)p & 0x03)) { *p++ = 0; } + *size = (unsigned int) ((p - br) + sizeof(buffer_rec)); + return br; +} + +typedef struct { + grn_ii *ii; + grn_hash *h; +} lexicon_deletable_arg; + +#ifdef CASCADE_DELETE_LEXICON +static int +lexicon_deletable(grn_ctx *ctx, grn_obj *lexicon, grn_id tid, void *arg) +{ + uint32_t *a; + grn_hash *h = ((lexicon_deletable_arg *)arg)->h; + grn_ii *ii = ((lexicon_deletable_arg *)arg)->ii; + if (!h) { return 0; } + if ((a = array_at(ctx, ii, tid))) { + if (a[0]) { + array_unref(ii, tid); + return 0; + } + array_unref(ii, tid); + } + { + grn_ii_updspec **u; + if (!grn_hash_get(ctx, h, &tid, sizeof(grn_id), (void **) &u)) { + return (ERRP(ctx, GRN_ERROR)) ? 
0 : 1;
    }
    if (!(*u)->tf || !(*u)->sid) { return 1; }
    return 0;
  }
}
#endif /* CASCADE_DELETE_LEXICON */

/*
 * Delete term `tid` from the lexicon if lexicon_deletable() allows it.
 * Compiles to a no-op unless CASCADE_DELETE_LEXICON is defined.
 */
inline static void
lexicon_delete(grn_ctx *ctx, grn_ii *ii, uint32_t tid, grn_hash *h)
{
#ifdef CASCADE_DELETE_LEXICON
  lexicon_deletable_arg arg = {ii, h};
  grn_table_delete_optarg optarg = {0, lexicon_deletable, &arg};
  _grn_table_delete_by_id(ctx, ii->lexicon, tid, &optarg);
#endif /* CASCADE_DELETE_LEXICON */
}

/* One decoded posting as used by the merge macros below. */
typedef struct {
  grn_id rid;
  uint32_t sid;
  uint32_t tf;
  uint32_t weight;
  uint32_t flags;
} docinfo;

/*
 * The merge macros below are not self contained: they read and write locals
 * of the expanding function (ii, ctx, bt, sb, cid, bid, lid, sdf, srp, ssp,
 * stp, sop, snp, nextb, sbp, ridp, sidp, tfp, weightp, posp, spos).
 */

/* Load the next posting of the decoded source chunk into `cid`;
 * cid.rid becomes 0 when the chunk is exhausted. */
#define GETNEXTC() do {\
  if (sdf) {\
    uint32_t dgap = *srp++;\
    cid.rid += dgap;\
    if (dgap) { cid.sid = 0; }\
    snp += cid.tf;\
    cid.tf = 1 + *stp++;\
    if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { cid.weight = *sop++; }\
    if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {\
      cid.sid += 1 + *ssp++;\
    } else {\
      cid.sid = 1;\
    }\
    sdf--;\
  } else {\
    cid.rid = 0;\
  }\
} while (0)

/* Append posting `id` to the output vectors as gaps relative to the last
 * emitted posting `lid`, then make it the new `lid`. */
#define PUTNEXT_(id) do {\
  uint32_t dgap = id.rid - lid.rid;\
  uint32_t sgap = (dgap ? id.sid : id.sid - lid.sid) - 1;\
  *ridp++ = dgap;\
  if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {\
    *sidp++ = sgap;\
  }\
  *tfp++ = id.tf - 1;\
  if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { *weightp++ = id.weight; }\
  lid.rid = id.rid;\
  lid.sid = id.sid;\
} while (0)

/* Emit the current chunk posting (with its positions) and fetch the next.
 * An out-of-order or tf == 0 posting is reported via CRIT; the `break`
 * then leaves only this macro's own do/while, skipping GETNEXTC(). */
#define PUTNEXTC() do {\
  if (cid.rid) {\
    if (cid.tf) {\
      if (lid.rid > cid.rid || (lid.rid == cid.rid && lid.sid >= cid.sid)) {\
        DEFINE_NAME(ii);\
        CRIT(GRN_FILE_CORRUPT,\
             "[ii][broken] posting in list is larger than posting in chunk: "\
             "<%.*s>: (%d:%d) -> (%d:%d)",\
             name_size, name, lid.rid, lid.sid, cid.rid, cid.sid);\
        break;\
      }\
      PUTNEXT_(cid);\
      if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) {\
        uint32_t i;\
        for (i = 0; i < cid.tf; i++) {\
          *posp++ = snp[i];\
          spos += snp[i];\
        }\
      }\
    } else {\
      DEFINE_NAME(ii);\
      CRIT(GRN_FILE_CORRUPT,\
           "[ii][broken] invalid posting in chunk: <%.*s>: (%d,%d)",\
           name_size, name, bt->tid, cid.rid);\
      break;\
    }\
  }\
  GETNEXTC();\
} while (0)

/* Decode the id of the next buffer record into `bid` and advance `nextb`
 * along the step chain; bid.rid becomes 0 at the end of the chain.
 * Records that are not strictly increasing are reported via CRIT. */
#define GETNEXTB() do {\
  if (nextb) {\
    uint32_t lrid = bid.rid, lsid = bid.sid;\
    buffer_rec *br = BUFFER_REC_AT(sb, nextb);\
    sbp = GRN_NEXT_ADDR(br);\
    GRN_B_DEC(bid.rid, sbp);\
    if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {\
      GRN_B_DEC(bid.sid, sbp);\
    } else {\
      bid.sid = 1;\
    }\
    if (lrid > bid.rid || (lrid == bid.rid && lsid >= bid.sid)) {\
      DEFINE_NAME(ii);\
      CRIT(GRN_FILE_CORRUPT,\
           "[ii][broken] postings in block aren't sorted: "\
           "<%.*s>: (%d:%d) -> (%d:%d)",\
           name_size, name, lrid, lsid, bid.rid, bid.sid);\
      break;\
    }\
    nextb = br->step;\
  } else {\
    bid.rid = 0;\
  }\
} while (0)

/* Emit the current buffer posting (only when its tf is > 0) and fetch the
 * next buffer record. */
#define PUTNEXTB() do {\
  if (bid.rid && bid.sid) {\
    GRN_B_DEC(bid.tf, sbp);\
    if (bid.tf > 0) {\
      if (lid.rid > bid.rid || (lid.rid == bid.rid && lid.sid >= bid.sid)) {\
        DEFINE_NAME(ii);\
        CRIT(GRN_FILE_CORRUPT,\
             "[ii][broken] posting in list is larger than posting in buffer: "\
             "<%.*s>: (%d:%d) -> (%d:%d)",\
             name_size, name, lid.rid, lid.sid, bid.rid, bid.sid);\
        break;\
      }\
      if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) {\
        GRN_B_DEC(bid.weight, sbp);\
      }\
      PUTNEXT_(bid);\
      if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) {\
        while (bid.tf--) { GRN_B_DEC(*posp, sbp); spos += *posp++; }\
      }\
    }\
  }\
  GETNEXTB();\
} while (0)

/* Merge the buffer stream (bid) and the chunk stream (cid) in (rid, sid)
 * order while `cond` holds.  When both streams carry the same (rid, sid)
 * the chunk posting is dropped in favour of the buffer posting; a buffer
 * posting with sid == 0 drops the chunk posting of that rid.  The loop
 * ends when both streams are exhausted or ctx->rc reports an error. */
#define MERGE_BC(cond) do {\
  if (bid.rid) {\
    if (cid.rid) {\
      if (cid.rid < bid.rid) {\
        PUTNEXTC();\
        if (ctx->rc != GRN_SUCCESS) { break; }\
      } else {\
        if (bid.rid < cid.rid) {\
          PUTNEXTB();\
          if (ctx->rc != GRN_SUCCESS) { break; }\
        } else {\
          if (bid.sid) {\
            if (cid.sid < bid.sid) {\
              PUTNEXTC();\
              if (ctx->rc != GRN_SUCCESS) { break; }\
            } else {\
              if (bid.sid == cid.sid) { GETNEXTC(); }\
              PUTNEXTB();\
              if (ctx->rc != GRN_SUCCESS) { break; }\
            }\
          } else {\
            GETNEXTC();\
          }\
        }\
      }\
    } else {\
      PUTNEXTB();\
      if (ctx->rc != GRN_SUCCESS) { break; }\
    }\
  } else {\
    if (cid.rid) {\
      PUTNEXTC();\
      if (ctx->rc != GRN_SUCCESS) { break; }\
    } else {\
      break;\
    }\
  }\
} while (cond)

/* Descriptor of one split chunk: its segment number, encoded size and the
 * record-id gap from the previous chunk. */
typedef struct {
  uint32_t segno;
  uint32_t size;
  uint32_t dgap;
} chunk_info;

/*
 * Write `encsize` bytes from `enc` into a newly allocated chunk and record
 * its segment number and size in `cinfo`.  With encsize == 0 no chunk is
 * allocated and `cinfo` is set to (0, 0).  If the new chunk cannot be
 * mapped it is freed again and an error is raised.
 * Returns ctx->rc.
 */
static grn_rc
chunk_flush(grn_ctx *ctx, grn_ii *ii, chunk_info *cinfo, uint8_t *enc, uint32_t encsize)
{
  uint8_t *dc;
  uint32_t dcn;
  grn_io_win dw;
  if (encsize) {
    chunk_new(ctx, ii, &dcn, encsize);
    if (ctx->rc == GRN_SUCCESS) {
      if ((dc = WIN_MAP(ii->chunk, ctx, &dw, dcn, 0, encsize, grn_io_wronly))) {
        grn_memcpy(dc, enc, encsize);
        grn_io_win_unmap(&dw);
        cinfo->segno = dcn;
        cinfo->size = encsize;
      } else {
        chunk_free(ctx, ii, dcn, 0, encsize);
        {
          DEFINE_NAME(ii);
          MERR("[ii][chunk][flush] failed to allocate a destination chunk: "
               "<%.*s> :"
               "segment:<%u>, size:<%u>",
               name_size, name,
               dcn, encsize);
        }
      }
    }
  } else {
    cinfo->segno = 0;
    cinfo->size = 0;
  }
  return ctx->rc;
}

/*
 * Merge the buffer postings with id up to `rid` into the split chunk
 * described by `cinfo`, replacing that chunk with the re-encoded result.
 */
static grn_rc
chunk_merge(grn_ctx *ctx, grn_ii *ii, buffer *sb, buffer_term *bt,
            chunk_info *cinfo, grn_id rid, datavec
*dv,
            uint16_t *nextbp, uint8_t **sbpp, docinfo *bidp, int32_t *balance)
{
  grn_io_win sw;
  uint64_t spos = 0;
  uint32_t segno = cinfo->segno, size = cinfo->size, sdf = 0, ndf = 0;
  uint32_t *ridp = NULL, *sidp = NULL, *tfp, *weightp = NULL, *posp = NULL;
  docinfo cid = {0, 0, 0, 0, 0}, lid = {0, 0, 0, 0, 0}, bid = *bidp;
  uint8_t *scp = WIN_MAP(ii->chunk, ctx, &sw, segno, 0, size, grn_io_rdonly);

  if (scp) {
    /* Working copies of the caller's buffer cursor; written back only when
     * the merge succeeds. */
    uint16_t nextb = *nextbp;
    uint32_t snn = 0, *srp, *ssp = NULL, *stp, *sop = NULL, *snp;
    uint8_t *sbp = *sbpp;
    datavec rdv[MAX_N_ELEMENTS + 1];
    size_t bufsize = S_SEGMENT * ii->n_elements;
    datavec_init(ctx, rdv, ii->n_elements, 0, 0);
    if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) {
      rdv[ii->n_elements - 1].flags = ODD;
    }
    bufsize += grn_p_decv(ctx, scp, cinfo->size, rdv, ii->n_elements);
    // (df in chunk list) = a[1] - sdf;
    /* Bind the source cursors to the decoded vectors, in the order
     * rid, [sid], tf, [weight], positions. */
    {
      int j = 0;
      sdf = rdv[j].data_size;
      srp = rdv[j++].data;
      if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { ssp = rdv[j++].data; }
      stp = rdv[j++].data;
      if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { sop = rdv[j++].data; }
      snn = rdv[j].data_size;
      snp = rdv[j].data;
    }
    datavec_reset(ctx, dv, ii->n_elements, sdf + S_SEGMENT, bufsize);
    if (ctx->rc == GRN_SUCCESS) {
      {
        int j = 0;
        ridp = dv[j++].data;
        if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { sidp = dv[j++].data; }
        tfp = dv[j++].data;
        if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { weightp = dv[j++].data; }
        posp = dv[j].data;
      }
      GETNEXTC();
      /* Keep merging while buffer postings up to `rid` or chunk postings
       * remain. */
      MERGE_BC(bid.rid <= rid || cid.rid);
      if (ctx->rc == GRN_SUCCESS) {
        *sbpp = sbp;
        *nextbp = nextb;
        *bidp = bid;
        GRN_ASSERT(posp < dv[ii->n_elements].data);
        ndf = ridp - dv[0].data;
      }
    }
    datavec_fin(ctx, rdv);
    grn_io_win_unmap(&sw);
  } else {
    DEFINE_NAME(ii);
    MERR("[ii][chunk][merge] failed to allocate a source chunk: "
         "<%.*s> :"
         "record:<%u>, segment:<%u>, size:<%u>",
         name_size, name,
         rid,
         segno,
         size);
  }
  if (ctx->rc == GRN_SUCCESS) {
    int j = 0;
    uint8_t *enc;
    uint32_t encsize;
    uint32_t np = posp - dv[ii->n_elements - 1].data;
    /* Choose per vector whether USE_P_ENC is requested, based on its size. */
    uint32_t f_s = (ndf < 3) ? 0 : USE_P_ENC;
    uint32_t f_d = ((ndf < 16) || (ndf <= (lid.rid >> 8))) ? 0 : USE_P_ENC;
    dv[j].data_size = ndf; dv[j++].flags = f_d;
    if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {
      dv[j].data_size = ndf; dv[j++].flags = f_s;
    }
    dv[j].data_size = ndf; dv[j++].flags = f_s;
    if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) {
      dv[j].data_size = ndf; dv[j++].flags = f_s;
    }
    if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) {
      uint32_t f_p = ((np < 32) || (np <= (spos >> 13))) ? 0 : USE_P_ENC;
      dv[j].data_size = np; dv[j].flags = f_p|ODD;
    }
    if ((enc = GRN_MALLOC((ndf * 4 + np) * 2))) {
      encsize = grn_p_encv(ctx, dv, ii->n_elements, enc);
      chunk_flush(ctx, ii, cinfo, enc, encsize);
      /* The old chunk is released only after the new one was written. */
      if (ctx->rc == GRN_SUCCESS) {
        chunk_free(ctx, ii, segno, 0, size);
      }
      GRN_FREE(enc);
    } else {
      DEFINE_NAME(ii);
      MERR("[ii][chunk][merge] failed to allocate a encode buffer: "
           "<%.*s> :"
           "record:<%u>, segment:<%u>, size:<%u>",
           name_size, name,
           rid,
           segno,
           size);
    }
  }
  /* Report the change in posting count (output minus source) to the caller. */
  *balance += (ndf - sdf);
  return ctx->rc;
}

/*
 * Debug helper: log size, flags and contents (32 values per log line) of
 * the source vectors `rdv` and then of the destination vectors `dv`.
 */
static void
buffer_merge_dump_datavec(grn_ctx *ctx,
                          grn_ii *ii,
                          datavec *dv,
                          datavec *rdv)
{
  int i, j;
  grn_obj buffer;

  GRN_TEXT_INIT(&buffer, 0);
  for (i = 0; i < ii->n_elements; i++) {
    GRN_LOG(ctx, GRN_LOG_DEBUG, "rdv[%d] data_size=%d, flags=%d",
            i, rdv[i].data_size, rdv[i].flags);
    GRN_BULK_REWIND(&buffer);
    for (j = 0; j < rdv[i].data_size;) {
      grn_text_printf(ctx, &buffer, " %d", rdv[i].data[j]);
      j++;
      if (!(j % 32) || j == rdv[i].data_size) {
        GRN_LOG(ctx, GRN_LOG_DEBUG,
                "rdv[%d].data[%d]%.*s",
                i, j,
                (int)GRN_TEXT_LEN(&buffer),
                GRN_TEXT_VALUE(&buffer));
        GRN_BULK_REWIND(&buffer);
      }
    }
  }

  for (i = 0; i < ii->n_elements; i++) {
    GRN_LOG(ctx, GRN_LOG_DEBUG, "dv[%d] data_size=%d, flags=%d",
            i, dv[i].data_size, dv[i].flags);
    GRN_BULK_REWIND(&buffer);
    for (j =
0; j < dv[i].data_size;) {
      grn_text_printf(ctx, &buffer, " %d", dv[i].data[j]);
      j++;
      if (!(j % 32) || j == dv[i].data_size) {
        GRN_LOG(ctx, GRN_LOG_DEBUG,
                "dv[%d].data[%d]%.*s",
                i, j,
                (int)GRN_TEXT_LEN(&buffer),
                GRN_TEXT_VALUE(&buffer));
        GRN_BULK_REWIND(&buffer);
      }
    }
  }

  GRN_OBJ_FIN(ctx, &buffer);
}

/*
 * Merge every term of source buffer `sb` (with its source chunk `sc`) into
 * destination buffer `db`, writing the re-encoded postings to `dc`.
 *
 * For each term, the buffer records and the chunk postings are merged in
 * (rid, sid) order.  Depending on the result the term is removed, stored
 * inline in its array element, written to `dc`, or flushed to a separate
 * chunk when the encoded size exceeds CHUNK_SPLIT_THRESHOLD.
 * Returns ctx->rc.
 *
 * If dc doesn't have enough space, the program may crash.
 * TODO: Support auto space extension or max size check.
 *
 * NOTE(review): `unitsize` divides by sb->header.nterms; this assumes the
 * source buffer has at least one term -- confirm against callers.
 */
static grn_rc
buffer_merge(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h,
             buffer *sb, uint8_t *sc, buffer *db, uint8_t *dc)
{
  buffer_term *bt;
  uint8_t *sbp = NULL, *dcp = dc;
  datavec dv[MAX_N_ELEMENTS + 1];
  datavec rdv[MAX_N_ELEMENTS + 1];
  uint16_t n = db->header.nterms, nterms_void = 0;
  size_t unitsize = (S_SEGMENT + sb->header.chunk_size / sb->header.nterms) * 2;
  // size_t unitsize = (S_SEGMENT + sb->header.chunk_size) * 2 + (1<<24);
  size_t totalsize = unitsize * ii->n_elements;
  //todo : realloc
  datavec_init(ctx, dv, ii->n_elements, unitsize, totalsize);
  if (ctx->rc != GRN_SUCCESS) {
    DEFINE_NAME(ii);
    ERR(ctx->rc,
        "[ii][buffer][merge] failed to initialize data vector: "
        "<%.*s>: "
        "unit-size:<%" GRN_FMT_SIZE ">, "
        "total-size:<%" GRN_FMT_SIZE ">",
        name_size, name,
        unitsize,
        totalsize);
    return ctx->rc;
  }
  datavec_init(ctx, rdv, ii->n_elements, 0, 0);
  if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) {
    rdv[ii->n_elements - 1].flags = ODD;
  }
  for (bt = db->terms; n; n--, bt++) {
    uint16_t nextb;
    uint64_t spos = 0;
    int32_t balance = 0;
    uint32_t *ridp, *sidp = NULL, *tfp, *weightp = NULL, *posp, nchunks = 0;
    uint32_t nvchunks = 0;
    chunk_info *cinfo = NULL;
    grn_id crid = GRN_ID_NIL;
    docinfo cid = {0, 0, 0, 0, 0}, lid = {0, 0, 0, 0, 0}, bid = {0, 0};
    uint32_t sdf = 0, snn = 0, ndf;
    uint32_t *srp = NULL, *ssp = NULL, *stp = NULL, *sop = NULL, *snp = NULL;
    /* Unused term slot. */
    if (!bt->tid) {
      nterms_void++;
      continue;
    }
    /* Nothing buffered for this term: copy its chunk data unchanged. */
    if (!bt->pos_in_buffer) {
      GRN_ASSERT(!bt->size_in_buffer);
      if (bt->size_in_chunk) {
        grn_memcpy(dcp, sc + bt->pos_in_chunk, bt->size_in_chunk);
        bt->pos_in_chunk = (uint32_t)(dcp - dc);
        dcp += bt->size_in_chunk;
      }
      continue;
    }
    nextb = bt->pos_in_buffer;
    GETNEXTB();
    if (sc && bt->size_in_chunk) {
      uint8_t *scp = sc + bt->pos_in_chunk;
      uint8_t *sce = scp + bt->size_in_chunk;
      size_t size = S_SEGMENT * ii->n_elements;
      /* Term with split chunks: the chunk data starts with a list of
       * chunk_info entries, each merged separately by chunk_merge(). */
      if ((bt->tid & CHUNK_SPLIT)) {
        int i;
        GRN_B_DEC(nchunks, scp);
        if (!(cinfo = GRN_MALLOCN(chunk_info, nchunks + 1))) {
          datavec_fin(ctx, dv);
          datavec_fin(ctx, rdv);
          {
            DEFINE_NAME(ii);
            MERR("[ii][buffer][merge] failed to allocate chunk info: "
                 "<%.*s> :"
                 "segment:<%u>, "
                 "n-chunks:<%u>, "
                 "unit-size:<%" GRN_FMT_SIZE ">, "
                 "total-size:<%" GRN_FMT_SIZE ">",
                 name_size, name,
                 seg,
                 nchunks,
                 unitsize,
                 totalsize);
          }
          return ctx->rc;
        }
        for (i = 0; i < nchunks; i++) {
          GRN_B_DEC(cinfo[i].segno, scp);
          GRN_B_DEC(cinfo[i].size, scp);
          GRN_B_DEC(cinfo[i].dgap, scp);
          crid += cinfo[i].dgap;
          if (bid.rid <= crid) {
            chunk_merge(ctx, ii, sb, bt, &cinfo[i], crid, dv,
                        &nextb, &sbp, &bid, &balance);
            if (ctx->rc != GRN_SUCCESS) {
              if (cinfo) { GRN_FREE(cinfo); }
              datavec_fin(ctx, dv);
              datavec_fin(ctx, rdv);
              {
                DEFINE_NAME(ii);
                ERR(ctx->rc,
                    "[ii][buffer][merge] failed to merge chunk: "
                    "<%.*s>: "
                    "chunk:<%u>, "
                    "n-chunks:<%u>",
                    name_size, name,
                    i,
                    nchunks);
              }
              return ctx->rc;
            }
          }
          if (cinfo[i].size) {
            nvchunks++;
          } else {
            /* The chunk became empty: fold its gap into the next entry. */
            crid -= cinfo[i].dgap;
            cinfo[i + 1].dgap += cinfo[i].dgap;
          }
        }
      }
      /* Decode the postings that follow the chunk list (if any). */
      if (sce > scp) {
        size += grn_p_decv(ctx, scp, sce - scp, rdv, ii->n_elements);
        {
          int j = 0;
          sdf = rdv[j].data_size;
          srp = rdv[j++].data;
          if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { ssp = rdv[j++].data; }
          stp = rdv[j++].data;
          if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { sop = rdv[j++].data; }
          snn = rdv[j].data_size;
          snp = rdv[j].data;
        }
        datavec_reset(ctx, dv, ii->n_elements, sdf + S_SEGMENT, size);
        if (ctx->rc != GRN_SUCCESS) {
          if (cinfo) { GRN_FREE(cinfo); }
          datavec_fin(ctx, dv);
          datavec_fin(ctx, rdv);
          {
            DEFINE_NAME(ii);
            ERR(ctx->rc,
                "[ii][buffer][merge] failed to reset data vector: "
                "<%.*s>: "
                "unit-size:<%" GRN_FMT_SIZE ">, "
                "total-size:<%" GRN_FMT_SIZE ">",
                name_size, name,
                (size_t)(sdf + S_SEGMENT),
                size);
          }
          return ctx->rc;
        }
      }
    }
    /* Bind the output cursors: rid, [sid], tf, [weight], positions. */
    {
      int j = 0;
      ridp = dv[j++].data;
      if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { sidp = dv[j++].data; }
      tfp = dv[j++].data;
      if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { weightp = dv[j++].data; }
      posp = dv[j].data;
    }
    GETNEXTC();
    MERGE_BC(1);
    if (ctx->rc != GRN_SUCCESS) {
      if (cinfo) { GRN_FREE(cinfo); }
      datavec_fin(ctx, dv);
      datavec_fin(ctx, rdv);
      {
        DEFINE_NAME(ii);
        ERR(ctx->rc,
            "[ii][buffer][merge] failed to merge chunk: <%.*s>",
            name_size, name);
      }
      return ctx->rc;
    }
    GRN_ASSERT(posp < dv[ii->n_elements].data);
    ndf = ridp - dv[0].data;
    /*
    {
      grn_obj buf;
      uint32_t rid, sid, tf, i, pos, *pp;
      GRN_TEXT_INIT(&buf, 0);
      rid = 0;
      pp = dv[3].data;
      for (i = 0; i < ndf; i++) {
        GRN_BULK_REWIND(&buf);
        rid += dv[0].data[i];
        if (dv[0].data[i]) { sid = 0; }
        sid += dv[1].data[i] + 1;
        tf = dv[2].data[i] + 1;
        pos = 0;
        grn_text_itoa(ctx, &buf, rid);
        GRN_TEXT_PUTC(ctx, &buf, ':');
        grn_text_itoa(ctx, &buf, sid);
        GRN_TEXT_PUTC(ctx, &buf, ':');
        grn_text_itoa(ctx, &buf, tf);
        GRN_TEXT_PUTC(ctx, &buf, ':');
        while (tf--) {
          pos += *pp++;
          grn_text_itoa(ctx, &buf, pos);
          if (tf) { GRN_TEXT_PUTC(ctx, &buf, ','); }
        }
        GRN_TEXT_PUTC(ctx, &buf, '\0');
        GRN_LOG(ctx, GRN_LOG_DEBUG, "Posting:%s", GRN_TEXT_VALUE(&buf));
      }
      GRN_OBJ_FIN(ctx, &buf);
    }
    */
    {
      grn_id tid = bt->tid & GRN_ID_MAX;
      uint32_t *a = array_at(ctx, ii, tid);
      if (!a) {
        GRN_LOG(ctx, GRN_LOG_DEBUG, "array_entry not found tid=%d", tid);
        memset(bt, 0, sizeof(buffer_term));
        nterms_void++;
      } else {
        if (!ndf && !nvchunks) {
          /* No posting left: clear the array element and the term slot. */
          a[0] = 0;
          a[1] = 0;
          lexicon_delete(ctx, ii, tid, h);
          memset(bt, 0, sizeof(buffer_term));
          nterms_void++;
        } else if ((ii->header->flags & GRN_OBJ_WITH_SECTION)
                   && !nvchunks && ndf == 1 && lid.rid < 0x100000 &&
                   lid.sid < 0x800 && lid.tf == 1 && lid.weight == 0) {
          /* A single small posting is stored inline in the array element
           * (lowest bit of a[0] set) instead of in the chunk. */
          a[0] = (lid.rid << 12) + (lid.sid << 1) + 1;
          a[1] = (ii->header->flags & GRN_OBJ_WITH_POSITION) ? posp[-1] : 0;
          memset(bt, 0, sizeof(buffer_term));
          nterms_void++;
        } else if (!(ii->header->flags & GRN_OBJ_WITH_SECTION)
                   && !nvchunks && ndf == 1 && lid.tf == 1 && lid.weight == 0) {
          a[0] = (lid.rid << 1) + 1;
          a[1] = (ii->header->flags & GRN_OBJ_WITH_POSITION) ? posp[-1] : 0;
          memset(bt, 0, sizeof(buffer_term));
          nterms_void++;
        } else {
          int j = 0;
          uint8_t *dcp0;
          uint32_t encsize;
          uint32_t f_s = (ndf < 3) ? 0 : USE_P_ENC;
          uint32_t f_d = ((ndf < 16) || (ndf <= (lid.rid >> 8))) ? 0 : USE_P_ENC;
          dv[j].data_size = ndf; dv[j++].flags = f_d;
          if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {
            dv[j].data_size = ndf; dv[j++].flags = f_s;
          }
          dv[j].data_size = ndf; dv[j++].flags = f_s;
          if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) {
            dv[j].data_size = ndf; dv[j++].flags = f_s;
          }
          if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) {
            uint32_t np = posp - dv[ii->n_elements - 1].data;
            uint32_t f_p = ((np < 32) || (np <= (spos >> 13))) ? 0 : USE_P_ENC;
            dv[j].data_size = np; dv[j].flags = f_p|ODD;
          }
          dcp0 = dcp;
          a[1] = (bt->size_in_chunk ? a[1] : 0) + (ndf - sdf) + balance;
          if (nvchunks) {
            int i;
            GRN_B_ENC(nvchunks, dcp);
            for (i = 0; i < nchunks; i++) {
              if (cinfo[i].size) {
                GRN_B_ENC(cinfo[i].segno, dcp);
                GRN_B_ENC(cinfo[i].size, dcp);
                GRN_B_ENC(cinfo[i].dgap, dcp);
              }
            }
          }
          encsize = grn_p_encv(ctx, dv, ii->n_elements, dcp);

          /* Debug aid: dump the vectors when the encoded data has reached
           * chunk_size + S_SEGMENT. */
          if (grn_logger_pass(ctx, GRN_LOG_DEBUG)) {
            if (sb->header.chunk_size + S_SEGMENT <= (dcp - dc) + encsize) {
              GRN_LOG(ctx, GRN_LOG_DEBUG,
                      "cs(%d)+(%d)=(%d)"
                      "<=(%" GRN_FMT_LLD ")+(%d)="
                      "(%" GRN_FMT_LLD ")",
                      sb->header.chunk_size,
                      S_SEGMENT,
                      sb->header.chunk_size + S_SEGMENT,
                      (long long int)(dcp - dc),
                      encsize,
                      (long long int)((dcp - dc) + encsize));
              buffer_merge_dump_datavec(ctx, ii, dv, rdv);
            }
          }

          /* Large postings are flushed to their own chunk; the term then
           * stores only the rewritten chunk list and is marked
           * CHUNK_SPLIT. */
          if (encsize > CHUNK_SPLIT_THRESHOLD &&
              (cinfo || (cinfo = GRN_MALLOCN(chunk_info, nchunks + 1))) &&
              !chunk_flush(ctx, ii, &cinfo[nchunks], dcp, encsize)) {
            int i;
            cinfo[nchunks].dgap = lid.rid - crid;
            nvchunks++;
            dcp = dcp0;
            GRN_B_ENC(nvchunks, dcp);
            for (i = 0; i <= nchunks; i++) {
              if (cinfo[i].size) {
                GRN_B_ENC(cinfo[i].segno, dcp);
                GRN_B_ENC(cinfo[i].size, dcp);
                GRN_B_ENC(cinfo[i].dgap, dcp);
              }
            }
            GRN_LOG(ctx, GRN_LOG_DEBUG, "split (%d) encsize=%d", tid, encsize);
            bt->tid |= CHUNK_SPLIT;
          } else {
            dcp += encsize;
            if (!nvchunks) {
              bt->tid &= ~CHUNK_SPLIT;
            }
          }
          bt->pos_in_chunk = (uint32_t)(dcp0 - dc);
          bt->size_in_chunk = (uint32_t)(dcp - dcp0);
          bt->size_in_buffer = 0;
          bt->pos_in_buffer = 0;
        }
        array_unref(ii, tid);
      }
    }
    if (cinfo) { GRN_FREE(cinfo); }
  }
  datavec_fin(ctx, rdv);
  datavec_fin(ctx, dv);
  db->header.chunk_size = (uint32_t)(dcp - dc);
  db->header.buffer_free =
    S_SEGMENT - sizeof(buffer_header) - db->header.nterms * sizeof(buffer_term);
  db->header.nterms_void = nterms_void;
  return ctx->rc;
}

/*
 * Fill in `iw` by hand so that it describes chunk `seg` of `io` with `addr`
 * as its data, in write-only, non-cached mode, without mapping anything.
 */
static void
fake_map(grn_ctx *ctx, grn_io *io, grn_io_win *iw, void *addr, uint32_t seg, uint32_t size)
{
  iw->ctx = ctx;
  iw->diff = 0;
  iw->io = io;
iw->mode = grn_io_wronly;
  iw->segment = ((seg) >> GRN_II_N_CHUNK_VARIATION);
  iw->offset = (((seg) & ((1 << GRN_II_N_CHUNK_VARIATION) - 1)) << GRN_II_W_LEAST_CHUNK);
  iw->size = size;
  iw->cached = 0;
  iw->addr = addr;
}

/*
 * Rewrite buffer segment `seg`: merge its buffered records and its chunk
 * into a fresh buffer segment plus a fresh chunk (see buffer_merge()), then
 * switch `seg` over to the new segment and free the old chunk.
 *
 * Returns ctx->rc; failures are reported through CRIT/MERR/ERR.
 */
static grn_rc
buffer_flush(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h)
{
  grn_io_win sw, dw;
  buffer *sb, *db = NULL;
  uint8_t *dc, *sc = NULL;
  uint32_t ds, pseg, scn, dcn = 0;
  if (ii->header->binfo[seg] == GRN_II_PSEG_NOT_ASSIGNED) {
    DEFINE_NAME(ii);
    CRIT(GRN_FILE_CORRUPT,
         "[ii][buffer][flush] invalid segment: "
         "<%.*s> :"
         "request:<%u>, max:<%u>",
         name_size, name,
         seg, ii->seg->header->max_segment);
    return ctx->rc;
  }
  /* segment_get() returning max_segment means no segment was available. */
  if ((ds = segment_get(ctx, ii)) == ii->seg->header->max_segment) {
    DEFINE_NAME(ii);
    MERR("[ii][buffer][flush] segment is full: "
         "<%.*s> :"
         "request:<%u>, max:<%u>",
         name_size, name,
         seg, ii->seg->header->max_segment);
    return ctx->rc;
  }
  pseg = buffer_open(ctx, ii, SEG2POS(seg, 0), NULL, &sb);
  if (pseg == GRN_II_PSEG_NOT_ASSIGNED) {
    DEFINE_NAME(ii);
    MERR("[ii][buffer][flush] failed to open buffer: "
         "<%.*s> :"
         "segment:<%u>, position:<%u>, max:<%u>",
         name_size, name,
         seg, SEG2POS(seg, 0), ii->seg->header->max_segment);
    return ctx->rc;
  }
  {
    GRN_IO_SEG_REF(ii->seg, ds, db);
    if (db) {
      uint32_t actual_chunk_size = 0;
      uint32_t max_dest_chunk_size = sb->header.chunk_size + S_SEGMENT;
      if ((dc = GRN_MALLOC(max_dest_chunk_size * 2))) {
        /* A source chunk is mapped only when the buffer has one. */
        if ((scn = sb->header.chunk) == GRN_II_PSEG_NOT_ASSIGNED ||
            (sc = WIN_MAP(ii->chunk, ctx, &sw, scn, 0,
                          sb->header.chunk_size, grn_io_rdonly))) {
          uint16_t n = sb->header.nterms;
          memset(db, 0, S_SEGMENT);
          grn_memcpy(db->terms, sb->terms, n * sizeof(buffer_term));
          db->header.nterms = n;
          buffer_merge(ctx, ii, seg, h, sb, sc, db, dc);
          if (ctx->rc == GRN_SUCCESS) {
            actual_chunk_size = db->header.chunk_size;
            if (actual_chunk_size > 0) {
              chunk_new(ctx, ii, &dcn, actual_chunk_size);
            }
            if (ctx->rc == GRN_SUCCESS) {
              grn_rc rc;
              db->header.chunk =
                actual_chunk_size ? dcn : GRN_II_PSEG_NOT_ASSIGNED;
              /* Hand `dc` to the io layer as the contents of chunk dcn.
               * NOTE(review): `dc` is not freed on the success path below,
               * so grn_io_win_unmap() presumably takes ownership of it --
               * confirm. */
              fake_map(ctx, ii->chunk, &dw, dc, dcn, actual_chunk_size);
              rc = grn_io_win_unmap(&dw);
              if (rc == GRN_SUCCESS) {
                buffer_segment_update(ii, seg, ds);
                ii->header->total_chunk_size += actual_chunk_size;
                if (scn != GRN_II_PSEG_NOT_ASSIGNED) {
                  grn_io_win_unmap(&sw);
                  chunk_free(ctx, ii, scn, 0, sb->header.chunk_size);
                  ii->header->total_chunk_size -= sb->header.chunk_size;
                }
              } else {
                GRN_FREE(dc);
                if (actual_chunk_size) {
                  chunk_free(ctx, ii, dcn, 0, actual_chunk_size);
                }
                if (scn != GRN_II_PSEG_NOT_ASSIGNED) { grn_io_win_unmap(&sw); }
                {
                  DEFINE_NAME(ii);
                  ERR(rc,
                      "[ii][buffer][flush] failed to unmap a destination chunk: "
                      "<%.*s> : "
                      "segment:<%u>, destination-segment:<%u>, actual-size:<%u>",
                      name_size, name,
                      seg,
                      dcn,
                      actual_chunk_size);
                }
              }
            } else {
              GRN_FREE(dc);
              if (scn != GRN_II_PSEG_NOT_ASSIGNED) { grn_io_win_unmap(&sw); }
            }
          } else {
            GRN_FREE(dc);
            if (scn != GRN_II_PSEG_NOT_ASSIGNED) { grn_io_win_unmap(&sw); }
          }
        } else {
          GRN_FREE(dc);
          {
            DEFINE_NAME(ii);
            MERR("[ii][buffer][flush] failed to map a source chunk: "
                 "<%.*s> :"
                 "segment:<%u>, source-segment:<%u>, chunk-size:<%u>",
                 name_size, name,
                 seg,
                 scn,
                 sb->header.chunk_size);
          }
        }
      } else {
        DEFINE_NAME(ii);
        MERR("[ii][buffer][flush] failed to allocate a destination chunk: "
             "<%.*s> :"
             "segment:<%u>, destination-segment:<%u>",
             name_size, name,
             seg,
             ds);
      }
      GRN_IO_SEG_UNREF(ii->seg, ds);
    } else {
      DEFINE_NAME(ii);
      MERR("[ii][buffer][flush] failed to allocate a destination segment: "
           "<%.*s> :"
           "segment:<%u>, destination-segment:<%u>",
           name_size, name,
           seg,
           ds);
    }
    buffer_close(ctx, ii, pseg);
  }
  return ctx->rc;
}

void
grn_ii_buffer_check(grn_ctx *ctx, grn_ii *ii, uint32_t seg)
{
  grn_io_win sw;
  buffer *sb;
  uint8_t *sc = NULL;
  uint32_t pseg, scn, nterms_with_corrupt_chunk = 0,
nterm_with_chunk = 0; + uint32_t ndeleted_terms_with_value = 0; + buffer_term *bt; + uint8_t *sbp = NULL; + datavec rdv[MAX_N_ELEMENTS + 1]; + uint16_t n; + int nterms_void = 0; + int size_in_buffer = 0; + grn_obj buf; + size_t lower_bound; + int64_t nloops = 0, nviolations = 0; + if (ii->header->binfo[seg] == GRN_II_PSEG_NOT_ASSIGNED) { + GRN_OUTPUT_BOOL(GRN_FALSE); + return; + } + pseg = buffer_open(ctx, ii, SEG2POS(seg, 0), NULL, &sb); + if (pseg == GRN_II_PSEG_NOT_ASSIGNED) { + GRN_OUTPUT_BOOL(GRN_FALSE); + return; + } + lower_bound = + (sb->header.buffer_free + sizeof(buffer_term) * sb->header.nterms) + / sizeof(buffer_rec); + datavec_init(ctx, rdv, ii->n_elements, 0, 0); + if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) { + rdv[ii->n_elements - 1].flags = ODD; + } + GRN_OUTPUT_MAP_OPEN("BUFFER", -1); + GRN_OUTPUT_CSTR("buffer id"); + GRN_OUTPUT_INT64(seg); + if ((scn = sb->header.chunk) == GRN_II_PSEG_NOT_ASSIGNED) { + GRN_OUTPUT_CSTR("void chunk size"); + GRN_OUTPUT_INT64(sb->header.chunk_size); + } else { + if ((sc = WIN_MAP(ii->chunk, ctx, &sw, scn, 0, sb->header.chunk_size, + grn_io_rdonly))) { + GRN_OUTPUT_CSTR("chunk size"); + GRN_OUTPUT_INT64(sb->header.chunk_size); + } else { + GRN_OUTPUT_CSTR("unmappable chunk size"); + GRN_OUTPUT_INT64(sb->header.chunk_size); + } + } + GRN_OUTPUT_CSTR("buffer term"); + GRN_OUTPUT_ARRAY_OPEN("TERMS", sb->header.nterms); + + GRN_OBJ_INIT(&buf, GRN_BULK, 0, ii->lexicon->header.domain); + for (bt = sb->terms, n = sb->header.nterms; n; n--, bt++) { + grn_id tid, tid_; + char key[GRN_TABLE_MAX_KEY_SIZE]; + int key_size; + uint16_t nextb; + uint32_t nchunks = 0; + chunk_info *cinfo = NULL; + grn_id crid = GRN_ID_NIL; + docinfo bid = {0, 0}; + uint32_t sdf = 0, snn = 0; + uint32_t *srp = NULL, *ssp = NULL, *stp = NULL, *sop = NULL, *snp = NULL; + if (!bt->tid && !bt->pos_in_buffer && !bt->size_in_buffer) { + nterms_void++; + continue; + } + GRN_OUTPUT_ARRAY_OPEN("TERM", -1); + tid = (bt->tid & GRN_ID_MAX); + key_size = 
grn_table_get_key(ctx, ii->lexicon, tid, key, + GRN_TABLE_MAX_KEY_SIZE); + tid_ = grn_table_get(ctx, ii->lexicon, key, key_size); + GRN_TEXT_SET(ctx, &buf, key, key_size); + GRN_OUTPUT_OBJ(&buf, NULL); + GRN_OUTPUT_INT64(bt->tid); + GRN_OUTPUT_INT64(tid_); + nextb = bt->pos_in_buffer; + size_in_buffer += bt->size_in_buffer; + if (tid != tid_ && (bt->size_in_buffer || bt->size_in_chunk)) { + ndeleted_terms_with_value++; + } + GETNEXTB(); + GRN_OUTPUT_INT64(bt->size_in_buffer); + GRN_OUTPUT_INT64(bt->size_in_chunk); + if (sc && bt->size_in_chunk) { + uint8_t *scp = sc + bt->pos_in_chunk; + uint8_t *sce = scp + bt->size_in_chunk; + size_t size = S_SEGMENT * ii->n_elements; + if ((bt->tid & CHUNK_SPLIT)) { + int i; + GRN_B_DEC(nchunks, scp); + if (!(cinfo = GRN_MALLOCN(chunk_info, nchunks + 1))) { + datavec_fin(ctx, rdv); + GRN_OBJ_FIN(ctx, &buf); + /* sc is non-NULL in this branch, so the source chunk window is mapped; release it and the opened buffer just as the normal exit path does. */ + grn_io_win_unmap(&sw); + buffer_close(ctx, ii, pseg); + return; + } + for (i = 0; i < nchunks; i++) { + GRN_B_DEC(cinfo[i].segno, scp); + GRN_B_DEC(cinfo[i].size, scp); + GRN_B_DEC(cinfo[i].dgap, scp); + crid += cinfo[i].dgap; + } + } + if (sce > scp) { + size += grn_p_decv(ctx, scp, sce - scp, rdv, ii->n_elements); + { + int j = 0; + sdf = rdv[j].data_size; + GRN_OUTPUT_INT64(sdf); + srp = rdv[j++].data; + if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { ssp = rdv[j++].data; } + if (sdf != rdv[j].data_size) { + nterms_with_corrupt_chunk++; + } + stp = rdv[j++].data; + if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { sop = rdv[j++].data; } + GRN_OUTPUT_INT64(rdv[j].data_size); + snn = rdv[j].data_size; + snp = rdv[j].data; + } + nterm_with_chunk++; + } + } + { + uint16_t pos; + grn_id rid, sid, rid_ = 0, sid_ = 0; + uint8_t *p; + buffer_rec *r; + /* Walk the record chain of this term; count positions below lower_bound and records that are not in ascending (rid, sid) order. */ + for (pos = bt->pos_in_buffer; pos; pos = r->step) { + if (pos < lower_bound) { + nviolations++; + } + r = BUFFER_REC_AT(sb, pos); + p = GRN_NEXT_ADDR(r); + GRN_B_DEC(rid, p); + if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { + GRN_B_DEC(sid, p); + } else { + sid = 1; + } + if (rid < rid_ || (rid == rid_ && sid < sid_)) { + nloops++; 
+ } + rid_ = rid; + sid_ = sid; + } + } + GRN_OUTPUT_ARRAY_CLOSE(); + if (cinfo) { GRN_FREE(cinfo); } + } + GRN_OBJ_FIN(ctx, &buf); + + GRN_OUTPUT_ARRAY_CLOSE(); + GRN_OUTPUT_CSTR("buffer free"); + GRN_OUTPUT_INT64(sb->header.buffer_free); + GRN_OUTPUT_CSTR("size in buffer"); + GRN_OUTPUT_INT64(size_in_buffer); + GRN_OUTPUT_CSTR("nterms"); + GRN_OUTPUT_INT64(sb->header.nterms); + if (nterms_void != sb->header.nterms_void) { + GRN_OUTPUT_CSTR("nterms void gap"); + GRN_OUTPUT_INT64(nterms_void - sb->header.nterms_void); + } + GRN_OUTPUT_CSTR("nterms with chunk"); + GRN_OUTPUT_INT64(nterm_with_chunk); + if (nterms_with_corrupt_chunk) { + GRN_OUTPUT_CSTR("nterms with corrupt chunk"); + GRN_OUTPUT_INT64(nterms_with_corrupt_chunk); + } + if (ndeleted_terms_with_value) { + GRN_OUTPUT_CSTR("number of deleted terms with value"); + GRN_OUTPUT_INT64(ndeleted_terms_with_value); + } + if (nloops) { + GRN_OUTPUT_CSTR("number of loops"); + GRN_OUTPUT_INT64(nloops); + } + if (nviolations) { + GRN_OUTPUT_CSTR("number of violations"); + GRN_OUTPUT_INT64(nviolations); + } + GRN_OUTPUT_MAP_CLOSE(); + datavec_fin(ctx, rdv); + if (sc) { grn_io_win_unmap(&sw); } + buffer_close(ctx, ii, pseg); +} + +typedef struct { + buffer_term *bt; + const char *key; + uint32_t key_size; +} term_sort; + +static int +term_compar(const void *t1, const void *t2) +{ + int r; + const term_sort *x = (term_sort *)t1, *y = (term_sort *)t2; + if (x->key_size > y->key_size) { + r = memcmp(x->key, y->key, y->key_size); + return r ? r : x->key_size - y->key_size; + } else { + r = memcmp(x->key, y->key, x->key_size); + return r ? 
r : x->key_size - y->key_size; + } +} + +static grn_rc +term_split(grn_ctx *ctx, grn_obj *lexicon, buffer *sb, buffer *db0, buffer *db1) +{ + uint16_t i, n, *nt; + buffer_term *bt; + uint32_t s, th = (sb->header.chunk_size + sb->header.nterms) >> 1; + term_sort *ts = GRN_MALLOC(sb->header.nterms * sizeof(term_sort)); + if (!ts) { return GRN_NO_MEMORY_AVAILABLE; } + for (i = 0, n = sb->header.nterms, bt = sb->terms; n; bt++, n--) { + if (bt->tid) { + grn_id tid = bt->tid & GRN_ID_MAX; + ts[i].key = _grn_table_key(ctx, lexicon, tid, &ts[i].key_size); + ts[i].bt = bt; + i++; + } + } + qsort(ts, i, sizeof(term_sort), term_compar); + memset(db0, 0, S_SEGMENT); + bt = db0->terms; + nt = &db0->header.nterms; + for (s = 0; n + 1 < i && s <= th; n++, bt++) { + grn_memcpy(bt, ts[n].bt, sizeof(buffer_term)); + (*nt)++; + s += ts[n].bt->size_in_chunk + 1; + } + memset(db1, 0, S_SEGMENT); + bt = db1->terms; + nt = &db1->header.nterms; + for (; n < i; n++, bt++) { + grn_memcpy(bt, ts[n].bt, sizeof(buffer_term)); + (*nt)++; + } + GRN_FREE(ts); + GRN_LOG(ctx, GRN_LOG_DEBUG, "d0=%d d1=%d", + db0->header.nterms, db1->header.nterms); + return GRN_SUCCESS; +} + +static void +array_update(grn_ctx *ctx, grn_ii *ii, uint32_t dls, buffer *db) +{ + uint16_t n; + buffer_term *bt; + uint32_t *a, pos = SEG2POS(dls, sizeof(buffer_header)); + for (n = db->header.nterms, bt = db->terms; n; n--, bt++) { + if (bt->tid) { + grn_id tid = bt->tid & GRN_ID_MAX; + if ((a = array_at(ctx, ii, tid))) { + a[0] = pos; + array_unref(ii, tid); + } else { + GRN_LOG(ctx, GRN_LOG_WARNING, "array_at failed (%d)", tid); + } + } + pos += sizeof(buffer_term) >> 2; + } +} + +static grn_rc +buffer_split(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h) +{ + grn_io_win sw, dw0, dw1; + buffer *sb, *db0 = NULL, *db1 = NULL; + uint8_t *sc = NULL, *dc0, *dc1; + uint32_t dps0 = 0, dps1 = 0, dls0 = 0, dls1 = 0, sps, scn, dcn0 = 0, dcn1 = 0; + if (ii->header->binfo[seg] == GRN_II_PSEG_NOT_ASSIGNED) { + DEFINE_NAME(ii); + 
CRIT(GRN_FILE_CORRUPT, + "[ii][buffer][split] invalid segment: " + "<%.*s> :" + "request:<%u>, max:<%u>", + name_size, name, + seg, ii->seg->header->max_segment); + return ctx->rc; + } + buffer_segment_reserve(ctx, ii, &dls0, &dps0, &dls1, &dps1); + if (ctx->rc != GRN_SUCCESS) { + DEFINE_NAME(ii); + ERR(ctx->rc, + "[ii][buffer][split] failed to reserve buffer segments: " + "<%.*s> :" + "request:<%u>, max:<%u>", + name_size, name, + seg, ii->seg->header->max_segment); + return ctx->rc; + } + sps = buffer_open(ctx, ii, SEG2POS(seg, 0), NULL, &sb); + if (sps == GRN_II_PSEG_NOT_ASSIGNED) { + DEFINE_NAME(ii); + MERR("[ii][buffer][split] failed to open buffer: " + "<%.*s> :" + "segment:<%u>, position:<%u>, max-segment:<%u>", + name_size, name, + seg, SEG2POS(seg, 0), ii->seg->header->max_segment); + } else { + GRN_IO_SEG_REF(ii->seg, dps0, db0); + if (db0) { + GRN_IO_SEG_REF(ii->seg, dps1, db1); + if (db1) { + uint32_t actual_db0_chunk_size = 0; + uint32_t actual_db1_chunk_size = 0; + uint32_t max_dest_chunk_size = sb->header.chunk_size + S_SEGMENT; + if ((dc0 = GRN_MALLOC(max_dest_chunk_size * 2))) { + if ((dc1 = GRN_MALLOC(max_dest_chunk_size * 2))) { + if ((scn = sb->header.chunk) == GRN_II_PSEG_NOT_ASSIGNED || + (sc = WIN_MAP(ii->chunk, ctx, &sw, scn, 0, + sb->header.chunk_size, grn_io_rdonly))) { + term_split(ctx, ii->lexicon, sb, db0, db1); + buffer_merge(ctx, ii, seg, h, sb, sc, db0, dc0); + if (ctx->rc == GRN_SUCCESS) { + actual_db0_chunk_size = db0->header.chunk_size; + if (actual_db0_chunk_size > 0) { + chunk_new(ctx, ii, &dcn0, actual_db0_chunk_size); + } + if (ctx->rc == GRN_SUCCESS) { + grn_rc rc; + db0->header.chunk = + actual_db0_chunk_size ? 
dcn0 : GRN_II_PSEG_NOT_ASSIGNED; + fake_map(ctx, ii->chunk, &dw0, dc0, dcn0, actual_db0_chunk_size); + rc = grn_io_win_unmap(&dw0); + if (rc == GRN_SUCCESS) { + buffer_merge(ctx, ii, seg, h, sb, sc, db1, dc1); + if (ctx->rc == GRN_SUCCESS) { + actual_db1_chunk_size = db1->header.chunk_size; + if (actual_db1_chunk_size > 0) { + chunk_new(ctx, ii, &dcn1, actual_db1_chunk_size); + } + if (ctx->rc == GRN_SUCCESS) { + fake_map(ctx, ii->chunk, &dw1, dc1, dcn1, + actual_db1_chunk_size); + rc = grn_io_win_unmap(&dw1); + if (rc == GRN_SUCCESS) { + db1->header.chunk = + actual_db1_chunk_size ? dcn1 : GRN_II_PSEG_NOT_ASSIGNED; + buffer_segment_update(ii, dls0, dps0); + buffer_segment_update(ii, dls1, dps1); + array_update(ctx, ii, dls0, db0); + array_update(ctx, ii, dls1, db1); + buffer_segment_clear(ii, seg); + ii->header->total_chunk_size += actual_db0_chunk_size; + ii->header->total_chunk_size += actual_db1_chunk_size; + if (scn != GRN_II_PSEG_NOT_ASSIGNED) { + grn_io_win_unmap(&sw); + chunk_free(ctx, ii, scn, 0, sb->header.chunk_size); + ii->header->total_chunk_size -= sb->header.chunk_size; + } + } else { + if (actual_db1_chunk_size) { + chunk_free(ctx, ii, dcn1, 0, actual_db1_chunk_size); + } + if (actual_db0_chunk_size) { + chunk_free(ctx, ii, dcn0, 0, actual_db0_chunk_size); + } + GRN_FREE(dc1); + if (scn != GRN_II_PSEG_NOT_ASSIGNED) { + grn_io_win_unmap(&sw); + } + { + DEFINE_NAME(ii); + ERR(rc, + "[ii][buffer[merge] " + "failed to unmap a destination chunk2: " + "<%.*s> :" + "segment:<%u>, " + "destination-chunk1:<%u>, " + "destination-chunk2:<%u>, " + "actual-size1:<%u>, " + "actual-size2:<%u>", + name_size, name, + seg, + dcn0, + dcn1, + actual_db0_chunk_size, + actual_db1_chunk_size); + } + } + } else { + if (actual_db0_chunk_size) { + chunk_free(ctx, ii, dcn0, 0, actual_db0_chunk_size); + } + GRN_FREE(dc1); + if (scn != GRN_II_PSEG_NOT_ASSIGNED) { + grn_io_win_unmap(&sw); + } + } + } else { + if (actual_db0_chunk_size) { + chunk_free(ctx, ii, dcn0, 0, 
actual_db0_chunk_size); + } + GRN_FREE(dc1); + if (scn != GRN_II_PSEG_NOT_ASSIGNED) { + grn_io_win_unmap(&sw); + } + } + } else { + if (actual_db0_chunk_size) { + chunk_free(ctx, ii, dcn0, 0, actual_db0_chunk_size); + } + GRN_FREE(dc1); + GRN_FREE(dc0); + if (scn != GRN_II_PSEG_NOT_ASSIGNED) { + grn_io_win_unmap(&sw); + } + { + DEFINE_NAME(ii); + ERR(rc, + "[ii][buffer[merge] " + "failed to unmap a destination chunk1: " + "<%.*s> :" + "segment:<%u>, " + "destination-chunk1:<%u>, " + "actual-size1:<%u>", + name_size, name, + seg, + dcn0, + actual_db0_chunk_size); + } + } + } else { + GRN_FREE(dc1); + GRN_FREE(dc0); + if (scn != GRN_II_PSEG_NOT_ASSIGNED) { grn_io_win_unmap(&sw); } + } + } else { + GRN_FREE(dc1); + GRN_FREE(dc0); + if (scn != GRN_II_PSEG_NOT_ASSIGNED) { grn_io_win_unmap(&sw); } + } + } else { + GRN_FREE(dc1); + GRN_FREE(dc0); + { + DEFINE_NAME(ii); + MERR("[ii][buffer][split] failed to map a source chunk: " + "<%.*s> :" + "segment:<%u>, " + "source-segment:<%u>, " + "chunk-size:<%u>", + name_size, name, + seg, + scn, + sb->header.chunk_size); + } + } + } else { + GRN_FREE(dc0); + { + DEFINE_NAME(ii); + MERR("[ii][buffer][split] " + "failed to allocate a destination chunk2: " + "<%.*s> :" + "segment:<%u>, " + "destination-segment1:<%u>, " + "destination-segment2:<%u>", + name_size, name, + seg, + dps0, + dps1); + } + } + } else { + DEFINE_NAME(ii); + MERR("[ii][buffer][split] failed to allocate a destination chunk1: " + "<%.*s>: " + "segment:<%u>, " + "destination-segment1:<%u>, " + "destination-segment2:<%u>", + name_size, name, + seg, + dps0, + dps1); + } + GRN_IO_SEG_UNREF(ii->seg, dps1); + } else { + DEFINE_NAME(ii); + MERR("[ii][buffer][split] failed to allocate a destination segment2: " + "<%.*s>: " + "segment:<%u>, " + "destination-segment1:<%u>, " + "destination-segment2:<%u>", + name_size, name, + seg, + dps0, + dps1); + } + GRN_IO_SEG_UNREF(ii->seg, dps0); + } else { + DEFINE_NAME(ii); + MERR("[ii][buffer][split] failed to allocate a 
destination segment1: " + "<%.*s>: " + "segment:<%u>, " + "destination-segment1:<%u>, " + "destination-segment2:<%u>", + name_size, name, + seg, + dps0, + dps1); + } + buffer_close(ctx, ii, sps); + } + return ctx->rc; +} + +#define SCALE_FACTOR 2048 +#define MAX_NTERMS 8192 +#define SPLIT_COND(ii, buffer)\ + ((buffer)->header.nterms > 1024 ||\ + ((buffer)->header.nterms > 1 &&\ + (buffer)->header.chunk_size * 100 > (ii)->header->total_chunk_size)) + +inline static void +buffer_new_find_segment(grn_ctx *ctx, + grn_ii *ii, + int size, + grn_id tid, + grn_hash *h, + buffer **b, + uint32_t *lseg, + uint32_t *pseg) +{ + uint32_t *a; + + a = array_at(ctx, ii, tid); + if (!a) { + return; + } + + for (;;) { + uint32_t pos = a[0]; + if (!pos || (pos & 1)) { break; } + *pseg = buffer_open(ctx, ii, pos, NULL, b); + if (*pseg == GRN_II_PSEG_NOT_ASSIGNED) { break; } + if ((*b)->header.buffer_free >= size + sizeof(buffer_term)) { + *lseg = LSEG(pos); + break; + } + buffer_close(ctx, ii, *pseg); + if (SPLIT_COND(ii, (*b))) { + /* ((S_SEGMENT - sizeof(buffer_header) + ii->header->bmax - + (*b)->header.nterms * sizeof(buffer_term)) * 4 < + (*b)->header.chunk_size) */ + GRN_LOG(ctx, GRN_LOG_DEBUG, + "nterms=%d chunk=%d total=%" GRN_FMT_INT64U, + (*b)->header.nterms, + (*b)->header.chunk_size, + ii->header->total_chunk_size >> 10); + if (buffer_split(ctx, ii, LSEG(pos), h)) { break; } + } else { + if (S_SEGMENT - sizeof(buffer_header) + - (*b)->header.nterms * sizeof(buffer_term) + < size + sizeof(buffer_term)) { + break; + } + if (buffer_flush(ctx, ii, LSEG(pos), h)) { break; } + } + } + + array_unref(ii, tid); +} + +inline static void +buffer_new_lexicon_pat(grn_ctx *ctx, + grn_ii *ii, + int size, + grn_id id, + grn_hash *h, + buffer **b, + uint32_t *lseg, + uint32_t *pseg) +{ + grn_pat_cursor *cursor; + char key[GRN_TABLE_MAX_KEY_SIZE]; + int key_size; + + key_size = grn_table_get_key(ctx, ii->lexicon, id, key, + GRN_TABLE_MAX_KEY_SIZE); + if (ii->lexicon->header.flags & 
GRN_OBJ_KEY_VAR_SIZE) { + grn_obj *tokenizer = NULL; + + grn_table_get_info(ctx, ii->lexicon, NULL, NULL, &tokenizer, NULL, NULL); + if (tokenizer) { + /* For natural language */ + cursor = grn_pat_cursor_open(ctx, + (grn_pat *)(ii->lexicon), + key, + key_size, + NULL, + 0, + 0, + -1, + GRN_CURSOR_ASCENDING|GRN_CURSOR_GT); + if (cursor) { + grn_id tid; + while (ctx->rc == GRN_SUCCESS && + *lseg == GRN_II_PSEG_NOT_ASSIGNED && + (tid = grn_pat_cursor_next(ctx, cursor))) { + buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg); + } + grn_pat_cursor_close(ctx, cursor); + } + } else { + /* For text data */ + /* Search by prefix, shortening the prefix by 1, 2, 4, ... bytes per round until a segment is found or the prefix is empty. */ + int target_key_size = key_size; + int reduced_key_size = 0; + + while (*lseg == GRN_II_PSEG_NOT_ASSIGNED && target_key_size > 0) { + grn_id tid; + + cursor = grn_pat_cursor_open(ctx, + (grn_pat *)(ii->lexicon), + key, target_key_size, + NULL, 0, 0, -1, + GRN_CURSOR_PREFIX); + if (!cursor) { + break; + } + + if (reduced_key_size == 0) { + while (ctx->rc == GRN_SUCCESS && + *lseg == GRN_II_PSEG_NOT_ASSIGNED && + (tid = grn_pat_cursor_next(ctx, cursor))) { + buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg); + } + } else { + while (ctx->rc == GRN_SUCCESS && + *lseg == GRN_II_PSEG_NOT_ASSIGNED && + (tid = grn_pat_cursor_next(ctx, cursor))) { + void *current_key; + int current_key_size; + + current_key_size = grn_pat_cursor_get_key(ctx, cursor, &current_key); + /* Skip terms that also match the previous, longer prefix. Compare only when the cursor key is long enough, so memcmp never reads past its end (assumes grn_pat_cursor_get_key returns the key length in bytes -- TODO confirm). */ + if (current_key_size >= target_key_size + reduced_key_size && + memcmp(((char *)current_key) + target_key_size, + key + target_key_size, + reduced_key_size) == 0) { + continue; + } + buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg); + } + } + grn_pat_cursor_close(ctx, cursor); + + if (reduced_key_size == 0) { + reduced_key_size = 1; + } else { + reduced_key_size *= 2; + } + target_key_size -= reduced_key_size; + } + } + } else { + /* For other data */ + cursor = grn_pat_cursor_open(ctx, + (grn_pat *)(ii->lexicon), + NULL, 0, key, key_size, 0, -1, + GRN_CURSOR_PREFIX); + if (cursor) { + grn_id tid; + while (ctx->rc == GRN_SUCCESS && + *lseg == 
GRN_II_PSEG_NOT_ASSIGNED && + (tid = grn_pat_cursor_next(ctx, cursor))) { + buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg); + } + grn_pat_cursor_close(ctx, cursor); + } + } +} + +inline static void +buffer_new_lexicon_other(grn_ctx *ctx, + grn_ii *ii, + int size, + grn_id id, + grn_hash *h, + buffer **b, + uint32_t *lseg, + uint32_t *pseg) +{ + GRN_TABLE_EACH_BEGIN(ctx, ii->lexicon, cursor, tid) { + if (ctx->rc != GRN_SUCCESS || *lseg != GRN_II_PSEG_NOT_ASSIGNED) { + break; + } + buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg); + } GRN_TABLE_EACH_END(ctx, cursor); +} + + +inline static uint32_t +buffer_new(grn_ctx *ctx, grn_ii *ii, int size, uint32_t *pos, + buffer_term **bt, buffer_rec **br, buffer **bp, grn_id id, grn_hash *h) +{ + buffer *b = NULL; + uint16_t offset; + uint32_t lseg = GRN_II_PSEG_NOT_ASSIGNED, pseg = GRN_II_PSEG_NOT_ASSIGNED; + if (S_SEGMENT - sizeof(buffer_header) < size + sizeof(buffer_term)) { + DEFINE_NAME(ii); + MERR("[ii][buffer][new] requested size is too large: " + "<%.*s> :" + "requested:<%" GRN_FMT_SIZE ">, max:<%" GRN_FMT_SIZE ">", + name_size, name, + (size_t)(size + sizeof(buffer_term)), + (size_t)(S_SEGMENT - sizeof(buffer_header))); + return GRN_II_PSEG_NOT_ASSIGNED; + } + if (ii->lexicon->header.type == GRN_TABLE_PAT_KEY) { + buffer_new_lexicon_pat(ctx, ii, size, id, h, &b, &lseg, &pseg); + } else { + buffer_new_lexicon_other(ctx, ii, size, id, h, &b, &lseg, &pseg); + } + if (lseg == GRN_II_PSEG_NOT_ASSIGNED) { + if (buffer_segment_new(ctx, ii, &lseg) || + (pseg = buffer_open(ctx, ii, SEG2POS(lseg, 0), NULL, &b)) == GRN_II_PSEG_NOT_ASSIGNED) { + return GRN_II_PSEG_NOT_ASSIGNED; + } + memset(b, 0, S_SEGMENT); + b->header.buffer_free = S_SEGMENT - sizeof(buffer_header); + b->header.chunk = GRN_II_PSEG_NOT_ASSIGNED; + } + if (b->header.nterms_void) { + for (offset = 0; offset < b->header.nterms; offset++) { + if (!b->terms[offset].tid) { break; } + } + if (offset == b->header.nterms) { + GRN_LOG(ctx, 
GRN_LOG_DEBUG, "inconsistent buffer(%d)", lseg); + b->header.nterms_void = 0; + b->header.nterms++; + b->header.buffer_free -= size + sizeof(buffer_term); + } else { + b->header.nterms_void--; + b->header.buffer_free -= size; + } + } else { + offset = b->header.nterms++; + b->header.buffer_free -= size + sizeof(buffer_term); + } + *pos = SEG2POS(lseg, (sizeof(buffer_header) + sizeof(buffer_term) * offset)); + *bt = &b->terms[offset]; + *br = (buffer_rec *)(((byte *)&b->terms[b->header.nterms]) + b->header.buffer_free); + *bp = b; + return pseg; +} + +/* ii */ + +static grn_ii * +_grn_ii_create(grn_ctx *ctx, grn_ii *ii, const char *path, grn_obj *lexicon, uint32_t flags) +{ + int i; + uint32_t max_n_segments; + uint32_t max_n_chunks; + grn_io *seg, *chunk; + char path2[PATH_MAX]; + struct grn_ii_header *header; + grn_table_flags lflags; + grn_encoding encoding; + grn_obj *tokenizer; + /* + for (i = 0; i < 32; i++) { + new_histogram[i] = 0; + free_histogram[i] = 0; + } + */ + if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer, + NULL, NULL)) { + return NULL; + } + if (path && strlen(path) + 6 >= PATH_MAX) { return NULL; } + + if (flags & GRN_OBJ_INDEX_SMALL) { + max_n_segments = grn_ii_max_n_segments_small; + max_n_chunks = grn_ii_max_n_chunks_small; + } else if (flags & GRN_OBJ_INDEX_MEDIUM) { + max_n_segments = MAX_PSEG_MEDIUM; + max_n_chunks = GRN_II_MAX_CHUNK_MEDIUM; + } else { + max_n_segments = MAX_PSEG; + max_n_chunks = GRN_II_MAX_CHUNK; + } + + seg = grn_io_create(ctx, + path, + sizeof(struct grn_ii_header), + S_SEGMENT, + max_n_segments, + grn_io_auto, + GRN_IO_EXPIRE_SEGMENT); + if (!seg) { return NULL; } + if (path) { + grn_strcpy(path2, PATH_MAX, path); + grn_strcat(path2, PATH_MAX, ".c"); + chunk = grn_io_create(ctx, path2, 0, S_CHUNK, max_n_chunks, grn_io_auto, + GRN_IO_EXPIRE_SEGMENT); + } else { + chunk = grn_io_create(ctx, NULL, 0, S_CHUNK, max_n_chunks, grn_io_auto, 0); + } + if (!chunk) { + grn_io_close(ctx, seg); + 
/* path may be NULL here; only try to remove the segment file when there is a path. */ + if (path) { grn_io_remove(ctx, path); } + return NULL; + } + header = grn_io_header(seg); + grn_io_set_type(seg, GRN_COLUMN_INDEX); + for (i = 0; i < GRN_II_MAX_LSEG; i++) { + header->ainfo[i] = GRN_II_PSEG_NOT_ASSIGNED; + header->binfo[i] = GRN_II_PSEG_NOT_ASSIGNED; + } + for (i = 0; i <= GRN_II_N_CHUNK_VARIATION; i++) { + header->free_chunks[i] = GRN_II_PSEG_NOT_ASSIGNED; + header->garbages[i] = GRN_II_PSEG_NOT_ASSIGNED; + } + header->flags = flags; + ii->seg = seg; + ii->chunk = chunk; + ii->lexicon = lexicon; + ii->lflags = lflags; + ii->encoding = encoding; + ii->header = header; + /* Two elements always; one more for each of section, weight and position. */ + ii->n_elements = 2; + if ((flags & GRN_OBJ_WITH_SECTION)) { ii->n_elements++; } + if ((flags & GRN_OBJ_WITH_WEIGHT)) { ii->n_elements++; } + if ((flags & GRN_OBJ_WITH_POSITION)) { ii->n_elements++; } + return ii; +} + +/* Allocates a grn_ii and initializes it with _grn_ii_create(). Returns NULL, freeing the allocation, when either step fails. */ +grn_ii * +grn_ii_create(grn_ctx *ctx, const char *path, grn_obj *lexicon, uint32_t flags) +{ + grn_ii *ii = NULL; + if (!(ii = GRN_MALLOCN(grn_ii, 1))) { + return NULL; + } + GRN_DB_OBJ_SET_TYPE(ii, GRN_COLUMN_INDEX); + if (!_grn_ii_create(ctx, ii, path, lexicon, flags)) { + GRN_FREE(ii); + return NULL; + } + return ii; +} + +/* Removes the file at path and then the file at path + ".c". Returns GRN_INVALID_ARGUMENT when path is NULL or too long, otherwise the result of the first failing grn_io_remove() or of the last one. */ +grn_rc +grn_ii_remove(grn_ctx *ctx, const char *path) +{ + grn_rc rc; + char buffer[PATH_MAX]; + if (!path || strlen(path) > PATH_MAX - 4) { return GRN_INVALID_ARGUMENT; } + if ((rc = grn_io_remove(ctx, path))) { goto exit; } + /* The length check above guarantees that path plus ".c" fits in buffer; the path must not be truncated or a different file name would be removed. */ + grn_snprintf(buffer, PATH_MAX, PATH_MAX, + "%s.c", path); + rc = grn_io_remove(ctx, buffer); +exit : + return rc; +} + +grn_rc +grn_ii_truncate(grn_ctx *ctx, grn_ii *ii) +{ + grn_rc rc; + const char *io_segpath, *io_chunkpath; + char *segpath, *chunkpath = NULL; + grn_obj *lexicon; + uint32_t flags; + if ((io_segpath = grn_io_path(ii->seg)) && *io_segpath != '\0') { + if (!(segpath = GRN_STRDUP(io_segpath))) { + ERR(GRN_NO_MEMORY_AVAILABLE, "cannot duplicate path: <%-.256s>", io_segpath); + return GRN_NO_MEMORY_AVAILABLE; + } + if ((io_chunkpath = grn_io_path(ii->chunk)) && *io_chunkpath != '\0') { + if (!(chunkpath = 
GRN_STRDUP(io_chunkpath))) { + ERR(GRN_NO_MEMORY_AVAILABLE, "cannot duplicate path: <%-.256s>", io_chunkpath); + /* segpath was duplicated successfully just above; free it before leaving. */ + GRN_FREE(segpath); + return GRN_NO_MEMORY_AVAILABLE; + } + } else { + chunkpath = NULL; + } + } else { + segpath = NULL; + } + lexicon = ii->lexicon; + flags = ii->header->flags; + /* Close both io objects, remove their files when they have paths, then re-create the index in place with the same lexicon and flags. */ + if ((rc = grn_io_close(ctx, ii->seg))) { goto exit; } + if ((rc = grn_io_close(ctx, ii->chunk))) { goto exit; } + ii->seg = NULL; + ii->chunk = NULL; + if (segpath && (rc = grn_io_remove(ctx, segpath))) { goto exit; } + if (chunkpath && (rc = grn_io_remove(ctx, chunkpath))) { goto exit; } + if (!_grn_ii_create(ctx, ii, segpath, lexicon, flags)) { + rc = GRN_UNKNOWN_ERROR; + } +exit: + if (segpath) { GRN_FREE(segpath); } + if (chunkpath) { GRN_FREE(chunkpath); } + return rc; +} + +/* Opens an existing index: the segment file at path and the chunk file at path + ".c". Returns NULL on any failure, closing whatever was opened. */ +grn_ii * +grn_ii_open(grn_ctx *ctx, const char *path, grn_obj *lexicon) +{ + grn_io *seg, *chunk; + grn_ii *ii; + char path2[PATH_MAX]; + struct grn_ii_header *header; + uint32_t io_type; + grn_table_flags lflags; + grn_encoding encoding; + grn_obj *tokenizer; + if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer, + NULL, NULL)) { + return NULL; + } + if (strlen(path) + 6 >= PATH_MAX) { return NULL; } + grn_strcpy(path2, PATH_MAX, path); + grn_strcat(path2, PATH_MAX, ".c"); + seg = grn_io_open(ctx, path, grn_io_auto); + if (!seg) { return NULL; } + chunk = grn_io_open(ctx, path2, grn_io_auto); + if (!chunk) { + grn_io_close(ctx, seg); + return NULL; + } + header = grn_io_header(seg); + io_type = grn_io_get_type(seg); + if (io_type != GRN_COLUMN_INDEX) { + ERR(GRN_INVALID_FORMAT, + "[column][index] file type must be %#04x: <%#04x>", + GRN_COLUMN_INDEX, io_type); + grn_io_close(ctx, seg); + grn_io_close(ctx, chunk); + return NULL; + } + if (!(ii = GRN_MALLOCN(grn_ii, 1))) { + grn_io_close(ctx, seg); + grn_io_close(ctx, chunk); + return NULL; + } + GRN_DB_OBJ_SET_TYPE(ii, GRN_COLUMN_INDEX); + ii->seg = seg; + ii->chunk = chunk; + ii->lexicon = lexicon; + ii->lflags = lflags; + ii->encoding = encoding; + 
ii->header = header; + ii->n_elements = 2; + if ((header->flags & GRN_OBJ_WITH_SECTION)) { ii->n_elements++; } + if ((header->flags & GRN_OBJ_WITH_WEIGHT)) { ii->n_elements++; } + if ((header->flags & GRN_OBJ_WITH_POSITION)) { ii->n_elements++; } + return ii; +} + +grn_rc +grn_ii_close(grn_ctx *ctx, grn_ii *ii) +{ + grn_rc rc; + if (!ii) { return GRN_INVALID_ARGUMENT; } + if ((rc = grn_io_close(ctx, ii->seg))) { return rc; } + if ((rc = grn_io_close(ctx, ii->chunk))) { return rc; } + GRN_FREE(ii); + /* + { + int i; + for (i = 0; i < 32; i++) { + GRN_LOG(ctx, GRN_LOG_DEBUG, "new[%d]=%d free[%d]=%d", + i, new_histogram[i], + i, free_histogram[i]); + } + } + */ + return rc; +} + +grn_rc +grn_ii_info(grn_ctx *ctx, grn_ii *ii, uint64_t *seg_size, uint64_t *chunk_size) +{ + grn_rc rc; + + if (seg_size) { + if ((rc = grn_io_size(ctx, ii->seg, seg_size))) { + return rc; + } + } + + if (chunk_size) { + if ((rc = grn_io_size(ctx, ii->chunk, chunk_size))) { + return rc; + } + } + + return GRN_SUCCESS; +} + +grn_column_flags +grn_ii_get_flags(grn_ctx *ctx, grn_ii *ii) +{ + if (!ii) { + return 0; + } + + return ii->header->flags; +} + +uint32_t +grn_ii_get_n_elements(grn_ctx *ctx, grn_ii *ii) +{ + if (!ii) { + return 0; + } + + return ii->n_elements; +} + +void +grn_ii_expire(grn_ctx *ctx, grn_ii *ii) +{ + /* + grn_io_expire(ctx, ii->seg, 128, 1000000); + */ + grn_io_expire(ctx, ii->chunk, 0, 1000000); +} + +grn_rc +grn_ii_flush(grn_ctx *ctx, grn_ii *ii) +{ + grn_rc rc; + + rc = grn_io_flush(ctx, ii->seg); + if (rc == GRN_SUCCESS) { + rc = grn_io_flush(ctx, ii->chunk); + } + + return rc; +} + +size_t +grn_ii_get_disk_usage(grn_ctx *ctx, grn_ii *ii) +{ + size_t usage; + + usage = grn_io_get_disk_usage(ctx, ii->seg); + usage += grn_io_get_disk_usage(ctx, ii->chunk); + + return usage; +} + +#define BIT11_01(x) ((x >> 1) & 0x7ff) +#define BIT31_12(x) (x >> 12) + +grn_rc +grn_ii_update_one(grn_ctx *ctx, grn_ii *ii, grn_id tid, grn_ii_updspec *u, grn_hash *h) +{ + buffer *b; + 
uint8_t *bs; + buffer_rec *br = NULL; + buffer_term *bt; + uint32_t pseg = 0, pos = 0, size, *a; + if (!tid) { return ctx->rc; } + if (!u->tf || !u->sid) { return grn_ii_delete_one(ctx, ii, tid, u, h); } + if (u->sid > ii->header->smax) { ii->header->smax = u->sid; } + if (!(a = array_get(ctx, ii, tid))) { + DEFINE_NAME(ii); + MERR("[ii][update][one] failed to allocate an array: " + "<%.*s>: " + "<%u>:<%u>:<%u>", + name_size, name, + u->rid, u->sid, tid); + return ctx->rc; + } + if (!(bs = encode_rec(ctx, ii, u, &size, 0))) { + DEFINE_NAME(ii); + MERR("[ii][update][one] failed to encode a record: " + "<%.*s>: " + "<%u>:<%u>:<%u>", + name_size, name, + u->rid, u->sid, tid); + goto exit; + } + for (;;) { + if (a[0]) { + if (!(a[0] & 1)) { + pos = a[0]; + if ((pseg = buffer_open(ctx, ii, pos, &bt, &b)) == GRN_II_PSEG_NOT_ASSIGNED) { + DEFINE_NAME(ii); + MERR("[ii][update][one] failed to allocate a buffer: " + "<%.*s>: " + "<%u>:<%u>:<%u>: " + "segment:<%u>", + name_size, name, + u->rid, u->sid, tid, + pos); + goto exit; + } + if (b->header.buffer_free < size) { + int bfb = b->header.buffer_free; + GRN_LOG(ctx, GRN_LOG_DEBUG, "flushing a[0]=%d seg=%d(%p) free=%d", + a[0], LSEG(a[0]), b, b->header.buffer_free); + buffer_close(ctx, ii, pseg); + if (SPLIT_COND(ii, b)) { + /*((S_SEGMENT - sizeof(buffer_header) + ii->header->bmax - + b->header.nterms * sizeof(buffer_term)) * 4 < + b->header.chunk_size)*/ + GRN_LOG(ctx, GRN_LOG_DEBUG, + "nterms=%d chunk=%d total=%" GRN_FMT_INT64U, + b->header.nterms, + b->header.chunk_size, + ii->header->total_chunk_size >> 10); + buffer_split(ctx, ii, LSEG(pos), h); + if (ctx->rc != GRN_SUCCESS) { + DEFINE_NAME(ii); + ERR(ctx->rc, + "[ii][update][one] failed to split a buffer: " + "<%.*s>: " + "<%u>:<%u><%u>: " + "segment:<%u>", + name_size, name, + u->rid, u->sid, tid, + pos); + goto exit; + } + continue; + } + buffer_flush(ctx, ii, LSEG(pos), h); + if (ctx->rc != GRN_SUCCESS) { + DEFINE_NAME(ii); + ERR(ctx->rc, + "[ii][update][one] failed 
to flush a buffer: " + "<%.*s>: " + "<%u>:<%u><%u>: " + "segment:<%u>", + name_size, name, + u->rid, u->sid, tid, + pos); + goto exit; + } + if (a[0] != pos) { + GRN_LOG(ctx, GRN_LOG_DEBUG, + "grn_ii_update_one: a[0] changed %d->%d", a[0], pos); + continue; + } + if ((pseg = buffer_open(ctx, ii, pos, &bt, &b)) == GRN_II_PSEG_NOT_ASSIGNED) { + GRN_LOG(ctx, GRN_LOG_CRIT, "buffer not found a[0]=%d", a[0]); + { + DEFINE_NAME(ii); + MERR("[ii][update][one] failed to reallocate a buffer: " + "<%.*s>: " + "<%u>:<%u>:<%u>: " + "segment:<%u>, new-segment:<%u>", + name_size, name, + u->rid, u->sid, tid, + pos, a[0]); + } + goto exit; + } + GRN_LOG(ctx, GRN_LOG_DEBUG, + "flushed a[0]=%d seg=%d(%p) free=%d->%d nterms=%d v=%d", + a[0], LSEG(a[0]), b, bfb, b->header.buffer_free, + b->header.nterms, b->header.nterms_void); + if (b->header.buffer_free < size) { + DEFINE_NAME(ii); + MERR("[ii][update][one] buffer is full: " + "<%.*s>: " + "<%u>:<%u><%u>: " + "segment:<%u>, new-segment:<%u>, free:<%u>, required:<%u>", + name_size, name, + u->rid, u->sid, tid, + pos, a[0], b->header.buffer_free, size); + buffer_close(ctx, ii, pseg); + /* todo: direct merge */ + goto exit; + } + } + b->header.buffer_free -= size; + br = (buffer_rec *)(((byte *)&b->terms[b->header.nterms]) + + b->header.buffer_free); + } else { + grn_ii_updspec u2; + uint32_t size2 = 0, v = a[0]; + struct _grn_ii_pos pos2; + pos2.pos = a[1]; + pos2.next = NULL; + u2.pos = &pos2; + if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { + u2.rid = BIT31_12(v); + u2.sid = BIT11_01(v); + } else { + u2.rid = v >> 1; + u2.sid = 1; + } + u2.tf = 1; + u2.weight = 0; + if (u2.rid != u->rid || u2.sid != u->sid) { + uint8_t *bs2 = encode_rec(ctx, ii, &u2, &size2, 0); + if (!bs2) { + DEFINE_NAME(ii); + MERR("[ii][update][one] failed to encode a record2: " + "<%.*s>: " + "<%u>:<%u>:<%u>", + name_size, name, + u2.rid, u2.sid, tid); + goto exit; + } + pseg = buffer_new(ctx, ii, size + size2, &pos, &bt, &br, &b, tid, h); + if (pseg == 
GRN_II_PSEG_NOT_ASSIGNED) { + GRN_FREE(bs2); + { + DEFINE_NAME(ii); + MERR("[ii][update][one] failed to create a buffer2: " + "<%.*s>: " + "<%u>:<%u>:<%u>: " + "size:<%u>", + name_size, name, + u2.rid, u2.sid, tid, + size + size2); + } + goto exit; + } + bt->tid = tid; + bt->size_in_chunk = 0; + bt->pos_in_chunk = 0; + bt->size_in_buffer = 0; + bt->pos_in_buffer = 0; + buffer_put(ctx, ii, b, bt, br, bs2, &u2, size2); + if (ctx->rc != GRN_SUCCESS) { + GRN_FREE(bs2); + buffer_close(ctx, ii, pseg); + { + DEFINE_NAME(ii); + MERR("[ii][update][one] failed to put to buffer: " + "<%.*s>: " + "<%u>:<%u>:<%u>", + name_size, name, + u2.rid, u2.sid, tid); + } + goto exit; + } + br = (buffer_rec *)(((byte *)br) + size2); + GRN_FREE(bs2); + } + } + } + break; + } + if (!br) { + if (u->tf == 1 && u->weight == 0) { + if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { + if (u->rid < 0x100000 && u->sid < 0x800) { + a[0] = (u->rid << 12) + (u->sid << 1) + 1; + a[1] = u->pos->pos; + goto exit; + } + } else { + a[0] = (u->rid << 1) + 1; + a[1] = u->pos->pos; + goto exit; + } + } + pseg = buffer_new(ctx, ii, size, &pos, &bt, &br, &b, tid, h); + if (pseg == GRN_II_PSEG_NOT_ASSIGNED) { + DEFINE_NAME(ii); + MERR("[ii][update][one] failed to create a buffer: " + "<%.*s>: " + "<%u>:<%u>:<%u>: " + "size:<%u>", + name_size, name, + u->rid, u->sid, tid, + size); + goto exit; + } + bt->tid = tid; + bt->size_in_chunk = 0; + bt->pos_in_chunk = 0; + bt->size_in_buffer = 0; + bt->pos_in_buffer = 0; + } + buffer_put(ctx, ii, b, bt, br, bs, u, size); + buffer_close(ctx, ii, pseg); + if (!a[0] || (a[0] & 1)) { a[0] = pos; } +exit : + array_unref(ii, tid); + if (bs) { GRN_FREE(bs); } + if (u->tf != u->atf) { + grn_obj *source_table; + char source_table_name[GRN_TABLE_MAX_KEY_SIZE]; + int source_table_name_size; + char term[GRN_TABLE_MAX_KEY_SIZE]; + int term_size; + + source_table = grn_ctx_at(ctx, DB_OBJ(ii)->range); + if (source_table) { + source_table_name_size = grn_obj_name(ctx, + source_table, + 
source_table_name, + GRN_TABLE_MAX_KEY_SIZE); + } else { + grn_strcpy(source_table_name, GRN_TABLE_MAX_KEY_SIZE, "(null)"); + source_table_name_size = strlen(source_table_name); + } + term_size = grn_table_get_key(ctx, ii->lexicon, tid, + term, GRN_TABLE_MAX_KEY_SIZE); + { + DEFINE_NAME(ii); + GRN_LOG(ctx, GRN_LOG_WARNING, + "[ii][update][one] too many postings: " + "<%.*s>: " + "record:<%.*s>(%d), " + "n-postings:<%d>, " + "n-discarded-postings:<%d>, " + "term:<%d>(<%.*s>)", + name_size, name, + source_table_name_size, source_table_name, + u->rid, + u->atf, + u->atf - u->tf, + tid, term_size, term); + } + } + grn_ii_expire(ctx, ii); + return ctx->rc; +} + +grn_rc +grn_ii_delete_one(grn_ctx *ctx, grn_ii *ii, grn_id tid, grn_ii_updspec *u, grn_hash *h) +{ + buffer *b; + uint8_t *bs = NULL; + buffer_rec *br; + buffer_term *bt; + uint32_t pseg, size, *a; + if (!tid) { return ctx->rc; } + if (!(a = array_at(ctx, ii, tid))) { + return ctx->rc; + } + for (;;) { + if (!a[0]) { goto exit; } + if (a[0] & 1) { + if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { + uint32_t rid = BIT31_12(a[0]); + uint32_t sid = BIT11_01(a[0]); + if (u->rid == rid && (!u->sid || u->sid == sid)) { + a[0] = 0; + lexicon_delete(ctx, ii, tid, h); + } + } else { + uint32_t rid = a[0] >> 1; + if (u->rid == rid) { + a[0] = 0; + lexicon_delete(ctx, ii, tid, h); + } + } + goto exit; + } + if (!(bs = encode_rec(ctx, ii, u, &size, 1))) { + DEFINE_NAME(ii); + MERR("[ii][delete][one] failed to encode a record: " + "<%.*s>: " + "<%u>:<%u>:<%u>", + name_size, name, + u->rid, u->sid, tid); + goto exit; + } + if ((pseg = buffer_open(ctx, ii, a[0], &bt, &b)) == GRN_II_PSEG_NOT_ASSIGNED) { + DEFINE_NAME(ii); + MERR("[ii][delete][one] failed to allocate a buffer: " + "<%.*s>: " + "<%u>:<%u><%u>: " + "position:<%u>", + name_size, name, + u->rid, u->sid, tid, + a[0]); + goto exit; + } + if (b->header.buffer_free < size) { + uint32_t _a = a[0]; + GRN_LOG(ctx, GRN_LOG_DEBUG, "flushing! 
b=%p free=%d, seg(%d)", + b, b->header.buffer_free, LSEG(a[0])); + buffer_close(ctx, ii, pseg); + buffer_flush(ctx, ii, LSEG(a[0]), h); + if (ctx->rc != GRN_SUCCESS) { + DEFINE_NAME(ii); + ERR(ctx->rc, + "[ii][delete][one] failed to flush a buffer: " + "<%.*s>: " + "<%u>:<%u><%u>: " + "position:<%u>", + name_size, name, + u->rid, u->sid, tid, + a[0]); + goto exit; + } + if (a[0] != _a) { + GRN_LOG(ctx, GRN_LOG_DEBUG, "grn_ii_delete_one: a[0] changed %d->%d)", + a[0], _a); + continue; + } + if ((pseg = buffer_open(ctx, ii, a[0], &bt, &b)) == GRN_II_PSEG_NOT_ASSIGNED) { + DEFINE_NAME(ii); + MERR("[ii][delete][one] failed to reallocate a buffer: " + "<%.*s>: " + "<%u>:<%u><%u>: " + "position:<%u>", + name_size, name, + u->rid, u->sid, tid, + a[0]); + goto exit; + } + GRN_LOG(ctx, GRN_LOG_DEBUG, "flushed! b=%p free=%d, seg(%d)", + b, b->header.buffer_free, LSEG(a[0])); + if (b->header.buffer_free < size) { + DEFINE_NAME(ii); + MERR("[ii][delete][one] buffer is full: " + "<%.*s>: " + "<%u>:<%u><%u>: " + "segment:<%u>, free:<%u>, required:<%u>", + name_size, name, + u->rid, u->sid, tid, + a[0], b->header.buffer_free, size); + buffer_close(ctx, ii, pseg); + goto exit; + } + } + + b->header.buffer_free -= size; + br = (buffer_rec *)(((byte *)&b->terms[b->header.nterms]) + b->header.buffer_free); + buffer_put(ctx, ii, b, bt, br, bs, u, size); + buffer_close(ctx, ii, pseg); + break; + } +exit : + array_unref(ii, tid); + if (bs) { GRN_FREE(bs); } + return ctx->rc; +} + +#define CHUNK_USED 1 +#define BUFFER_USED 2 +#define SOLE_DOC_USED 4 +#define SOLE_POS_USED 8 + +struct _grn_ii_cursor { + grn_db_obj obj; + grn_ctx *ctx; + grn_ii *ii; + grn_id id; + grn_posting *post; + + grn_id min; /* Minimum record ID */ + grn_id max; + grn_posting pc; + grn_posting pb; + + uint32_t cdf; /* Document frequency */ + uint32_t *cdp; + uint32_t *crp; /* Record ID */ + uint32_t *csp; /* Section ID */ + uint32_t *ctp; /* Term frequency */ + uint32_t *cwp; /* Weight */ + uint32_t *cpp; /* 
Position */ + + uint8_t *bp; + + int nelements; + uint32_t nchunks; + uint32_t curr_chunk; + chunk_info *cinfo; + grn_io_win iw; + uint8_t *cp; + uint8_t *cpe; + datavec rdv[MAX_N_ELEMENTS + 1]; + + struct grn_ii_buffer *buf; + uint16_t stat; + uint16_t nextb; + uint32_t buffer_pseg; + int flags; + uint32_t *ppseg; + + int weight; + + uint32_t prev_chunk_rid; +}; + +static grn_bool +buffer_is_reused(grn_ctx *ctx, grn_ii *ii, grn_ii_cursor *c) +{ + if (*c->ppseg != c->buffer_pseg) { + uint32_t i; + for (i = ii->header->bgqtail; i != ii->header->bgqhead; + i = (i + 1) & (GRN_II_BGQSIZE - 1)) { + if (ii->header->bgqbody[i] == c->buffer_pseg) { return GRN_FALSE; } + } + return GRN_TRUE; + } + return GRN_FALSE; +} + +static int +chunk_is_reused(grn_ctx *ctx, grn_ii *ii, grn_ii_cursor *c, uint32_t offset, uint32_t size) +{ + if (*c->ppseg != c->buffer_pseg) { + uint32_t i, m, gseg; + if (size > S_CHUNK) { return 1; } + if (size > (1 << GRN_II_W_LEAST_CHUNK)) { + int es = size - 1; + GRN_BIT_SCAN_REV(es, m); + m++; + } else { + m = GRN_II_W_LEAST_CHUNK; + } + gseg = ii->header->garbages[m - GRN_II_W_LEAST_CHUNK]; + while (gseg != GRN_II_PSEG_NOT_ASSIGNED) { + grn_io_win iw; + grn_ii_ginfo *ginfo = WIN_MAP(ii->chunk, ctx, &iw, gseg, 0, S_GARBAGE, + grn_io_rdwr); + if (!ginfo) { break; } + for (i = 0; i < ginfo->nrecs; i++) { + if (ginfo->recs[i] == offset) { + grn_io_win_unmap(&iw); + return 0; + } + } + gseg = ginfo->next; + grn_io_win_unmap(&iw); + } + return 1; + } + return 0; +} + +#define GRN_II_CURSOR_CMP(c1,c2) \ + (((c1)->post->rid > (c2)->post->rid) || \ + (((c1)->post->rid == (c2)->post->rid) && \ + (((c1)->post->sid > (c2)->post->sid) || \ + (((c1)->post->sid == (c2)->post->sid) && \ + ((c1)->post->pos > (c2)->post->pos))))) + +grn_ii_cursor * +grn_ii_cursor_open(grn_ctx *ctx, grn_ii *ii, grn_id tid, + grn_id min, grn_id max, int nelements, int flags) +{ + grn_ii_cursor *c = NULL; + uint32_t pos, *a; + if (!(a = array_at(ctx, ii, tid))) { return NULL; } + for 
(;;) { + c = NULL; + if (!(pos = a[0])) { goto exit; } + if (!(c = GRN_MALLOC(sizeof(grn_ii_cursor)))) { goto exit; } + memset(c, 0, sizeof(grn_ii_cursor)); + c->ctx = ctx; + c->ii = ii; + c->id = tid; + c->min = min; + c->max = max; + c->nelements = nelements; + c->flags = flags; + c->weight = 0; + if (pos & 1) { + c->stat = 0; + if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { + c->pb.rid = BIT31_12(pos); + c->pb.sid = BIT11_01(pos); + } else { + c->pb.rid = pos >> 1; + c->pb.sid = 1; + } + c->pb.tf = 1; + c->pb.weight = 0; + c->pb.pos = a[1]; + } else { + uint32_t chunk; + buffer_term *bt; + c->buffer_pseg = buffer_open(ctx, ii, pos, &bt, &c->buf); + if (c->buffer_pseg == GRN_II_PSEG_NOT_ASSIGNED) { + GRN_FREE(c); + c = NULL; + goto exit; + } + c->ppseg = &ii->header->binfo[LSEG(pos)]; + if (bt->size_in_chunk && (chunk = c->buf->header.chunk) != GRN_II_PSEG_NOT_ASSIGNED) { + if (!(c->cp = WIN_MAP(ii->chunk, ctx, &c->iw, chunk, bt->pos_in_chunk, + bt->size_in_chunk, grn_io_rdonly))) { + buffer_close(ctx, ii, c->buffer_pseg); + GRN_FREE(c); + c = NULL; + goto exit; + } + if (buffer_is_reused(ctx, ii, c)) { + grn_ii_cursor_close(ctx, c); + continue; + } + c->cpe = c->cp + bt->size_in_chunk; + if ((bt->tid & CHUNK_SPLIT)) { + int i; + grn_id crid; + GRN_B_DEC(c->nchunks, c->cp); + if (chunk_is_reused(ctx, ii, c, chunk, c->buf->header.chunk_size)) { + grn_ii_cursor_close(ctx, c); + continue; + } + if (!(c->cinfo = GRN_MALLOCN(chunk_info, c->nchunks))) { + buffer_close(ctx, ii, c->buffer_pseg); + grn_io_win_unmap(&c->iw); + GRN_FREE(c); + c = NULL; + goto exit; + } + for (i = 0, crid = GRN_ID_NIL; i < c->nchunks; i++) { + GRN_B_DEC(c->cinfo[i].segno, c->cp); + GRN_B_DEC(c->cinfo[i].size, c->cp); + GRN_B_DEC(c->cinfo[i].dgap, c->cp); + crid += c->cinfo[i].dgap; + if (crid < min) { + c->pc.rid = crid; + c->curr_chunk = i + 1; + } + } + if (chunk_is_reused(ctx, ii, c, chunk, c->buf->header.chunk_size)) { + grn_ii_cursor_close(ctx, c); + continue; + } + } + if 
((ii->header->flags & GRN_OBJ_WITH_POSITION)) { + c->rdv[ii->n_elements - 1].flags = ODD; + } + } + c->nextb = bt->pos_in_buffer; + c->stat = CHUNK_USED|BUFFER_USED; + } + if (pos == a[0]) { break; } + grn_ii_cursor_close(ctx, c); + } +exit : + array_unref(ii, tid); + return c; +} + +static inline void +grn_ii_cursor_set_min(grn_ctx *ctx, grn_ii_cursor *c, grn_id min) +{ + if (c->min >= min) { + return; + } + + if (grn_ii_cursor_set_min_enable) { + grn_id old_min = c->min; + c->min = min; + if (c->buf && + c->pc.rid != GRN_ID_NIL && + c->pc.rid < c->min && + c->prev_chunk_rid < c->min && + c->curr_chunk < c->nchunks) { + uint32_t i; + uint32_t skip_chunk = 0; + grn_id rid = c->prev_chunk_rid; + + if (c->curr_chunk > 0) { + i = c->curr_chunk - 1; + } else { + i = 0; + } + for (; i < c->nchunks; i++) { + rid += c->cinfo[i].dgap; + if (rid < c->min) { + skip_chunk = i + 1; + } else { + rid -= c->cinfo[i].dgap; + break; + } + } + if (skip_chunk > c->curr_chunk) { + uint32_t old_chunk = c->curr_chunk; + grn_bool old_chunk_used = (c->stat & CHUNK_USED); + c->pc.rid = rid; + c->pc.rest = 0; + c->prev_chunk_rid = rid - c->cinfo[skip_chunk - 1].dgap; + c->curr_chunk = skip_chunk; + c->crp = c->cdp + c->cdf; + c->stat |= CHUNK_USED; + GRN_LOG(ctx, GRN_LOG_DEBUG, + "[ii][cursor][min] skip: %p: min(%u->%u): chunk(%u->%u): " + "chunk-used(%-.256s->%-.256s)", + c, + old_min, min, + old_chunk, c->curr_chunk, + old_chunk_used ? "true" : "false", + (c->stat & CHUNK_USED) ? 
"true" : "false"); + } + } + } +} + +typedef struct { + grn_bool include_garbage; +} grn_ii_cursor_next_options; + +static inline grn_posting * +grn_ii_cursor_next_internal(grn_ctx *ctx, grn_ii_cursor *c, + grn_ii_cursor_next_options *options) +{ + const grn_bool include_garbage = options->include_garbage; + if (c->buf) { + for (;;) { + if (c->stat & CHUNK_USED) { + for (;;) { + if (c->crp < c->cdp + c->cdf) { + uint32_t dgap = *c->crp++; + c->pc.rid += dgap; + if (dgap) { c->pc.sid = 0; } + if ((c->ii->header->flags & GRN_OBJ_WITH_SECTION)) { + c->pc.sid += 1 + *c->csp++; + } else { + c->pc.sid = 1; + } + c->cpp += c->pc.rest; + c->pc.rest = c->pc.tf = 1 + *c->ctp++; + if ((c->ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { + c->pc.weight = *c->cwp++; + } else { + c->pc.weight = 0; + } + c->pc.pos = 0; + /* + { + static int count = 0; + int tf = c->pc.tf, pos = 0, *pp = (int *)c->cpp; + grn_obj buf; + GRN_TEXT_INIT(&buf, 0); + grn_text_itoa(ctx, &buf, c->pc.rid); + GRN_TEXT_PUTC(ctx, &buf, ':'); + grn_text_itoa(ctx, &buf, c->pc.sid); + GRN_TEXT_PUTC(ctx, &buf, ':'); + grn_text_itoa(ctx, &buf, c->pc.tf); + GRN_TEXT_PUTC(ctx, &buf, '('); + while (tf--) { + pos += *pp++; + count++; + grn_text_itoa(ctx, &buf, pos); + if (tf) { GRN_TEXT_PUTC(ctx, &buf, ':'); } + } + GRN_TEXT_PUTC(ctx, &buf, ')'); + GRN_TEXT_PUTC(ctx, &buf, '\0'); + GRN_LOG(ctx, GRN_LOG_DEBUG, "posting(%d):%-.256s", count, GRN_TEXT_VALUE(&buf)); + GRN_OBJ_FIN(ctx, &buf); + } + */ + } else { + if (c->curr_chunk <= c->nchunks) { + if (c->curr_chunk == c->nchunks) { + if (c->cp < c->cpe) { + int decoded_size; + decoded_size = + grn_p_decv(ctx, c->cp, c->cpe - c->cp, + c->rdv, c->ii->n_elements); + if (decoded_size == 0) { + GRN_LOG(ctx, GRN_LOG_WARNING, + "[ii][cursor][next][chunk][last] " + "chunk(%d) is changed by another thread " + "while decoding: %p", + c->cinfo[c->curr_chunk].segno, + c); + c->pc.rid = GRN_ID_NIL; + break; + } + if (buffer_is_reused(ctx, c->ii, c)) { + GRN_LOG(ctx, GRN_LOG_WARNING, + 
"[ii][cursor][next][chunk][last] " + "buffer is reused by another thread: %p", + c); + c->pc.rid = GRN_ID_NIL; + break; + } + if (chunk_is_reused(ctx, c->ii, c, + c->buf->header.chunk, + c->buf->header.chunk_size)) { + GRN_LOG(ctx, GRN_LOG_WARNING, + "[ii][cursor][next][chunk][last] " + "chunk(%d) is reused by another thread: %p", + c->buf->header.chunk, + c); + c->pc.rid = GRN_ID_NIL; + break; + } + } else { + c->pc.rid = GRN_ID_NIL; + break; + } + } else { + uint8_t *cp; + grn_io_win iw; + uint32_t size = c->cinfo[c->curr_chunk].size; + if (size && (cp = WIN_MAP(c->ii->chunk, ctx, &iw, + c->cinfo[c->curr_chunk].segno, 0, + size, grn_io_rdonly))) { + int decoded_size; + decoded_size = + grn_p_decv(ctx, cp, size, c->rdv, c->ii->n_elements); + grn_io_win_unmap(&iw); + if (decoded_size == 0) { + GRN_LOG(ctx, GRN_LOG_WARNING, + "[ii][cursor][next][chunk] " + "chunk(%d) is changed by another thread " + "while decoding: %p", + c->cinfo[c->curr_chunk].segno, + c); + c->pc.rid = GRN_ID_NIL; + break; + } + if (chunk_is_reused(ctx, c->ii, c, + c->cinfo[c->curr_chunk].segno, size)) { + GRN_LOG(ctx, GRN_LOG_WARNING, + "[ii][cursor][next][chunk] " + "chunk(%d) is reused by another thread: %p", + c->cinfo[c->curr_chunk].segno, + c); + c->pc.rid = GRN_ID_NIL; + break; + } + } else { + c->pc.rid = GRN_ID_NIL; + break; + } + } + { + int j = 0; + c->cdf = c->rdv[j].data_size; + c->crp = c->cdp = c->rdv[j++].data; + if ((c->ii->header->flags & GRN_OBJ_WITH_SECTION)) { + c->csp = c->rdv[j++].data; + } + c->ctp = c->rdv[j++].data; + if ((c->ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { + c->cwp = c->rdv[j++].data; + } + if ((c->ii->header->flags & GRN_OBJ_WITH_POSITION)) { + c->cpp = c->rdv[j].data; + } + } + c->prev_chunk_rid = c->pc.rid; + c->pc.rid = GRN_ID_NIL; + c->pc.sid = 0; + c->pc.rest = 0; + c->curr_chunk++; + continue; + } else { + c->pc.rid = GRN_ID_NIL; + } + } + break; + } + } + if (c->stat & BUFFER_USED) { + for (;;) { + if (c->nextb) { + uint32_t lrid = c->pb.rid, lsid = 
c->pb.sid; /* for check */ + buffer_rec *br = BUFFER_REC_AT(c->buf, c->nextb); + if (buffer_is_reused(ctx, c->ii, c)) { + GRN_LOG(ctx, GRN_LOG_WARNING, + "[ii][cursor][next][buffer] " + "buffer(%d,%d) is reused by another thread: %p", + c->buffer_pseg, *c->ppseg, + c); + c->pb.rid = GRN_ID_NIL; + break; + } + c->bp = GRN_NEXT_ADDR(br); + GRN_B_DEC(c->pb.rid, c->bp); + if ((c->ii->header->flags & GRN_OBJ_WITH_SECTION)) { + GRN_B_DEC(c->pb.sid, c->bp); + } else { + c->pb.sid = 1; + } + if (lrid > c->pb.rid || (lrid == c->pb.rid && lsid >= c->pb.sid)) { + DEFINE_NAME(c->ii); + ERR(GRN_FILE_CORRUPT, + "[ii][broken][cursor][next][buffer] " + "posting in list in buffer isn't sorted: " + "<%.*s>: (%d:%d) -> (%d:%d) (%d->%d)", + name_size, name, + lrid, lsid, + c->pb.rid, c->pb.sid, + c->buffer_pseg, *c->ppseg); + c->pb.rid = GRN_ID_NIL; + break; + } + if (c->pb.rid < c->min) { + c->pb.rid = 0; + if (br->jump > 0 && !BUFFER_REC_DELETED(br)) { + buffer_rec *jump_br = BUFFER_REC_AT(c->buf, br->jump); + if (BUFFER_REC_DELETED(jump_br)) { + c->nextb = br->step; + } else { + uint8_t *jump_bp; + uint32_t jump_rid; + jump_bp = GRN_NEXT_ADDR(jump_br); + GRN_B_DEC(jump_rid, jump_bp); + if (jump_rid < c->min) { + c->nextb = br->jump; + } else { + c->nextb = br->step; + } + } + } else { + c->nextb = br->step; + } + continue; + } + c->nextb = br->step; + GRN_B_DEC(c->pb.tf, c->bp); + if ((c->ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { + GRN_B_DEC(c->pb.weight, c->bp); + } else { + c->pb.weight = 0; + } + c->pb.rest = c->pb.tf; + c->pb.pos = 0; + } else { + c->pb.rid = 0; + } + break; + } + } + if (c->pb.rid) { + if (c->pc.rid) { + if (c->pc.rid < c->pb.rid) { + c->stat = CHUNK_USED; + if (include_garbage || (c->pc.tf && c->pc.sid)) { + c->post = &c->pc; + break; + } + } else { + if (c->pb.rid < c->pc.rid) { + c->stat = BUFFER_USED; + if (include_garbage || (c->pb.tf && c->pb.sid)) { + c->post = &c->pb; + break; + } + } else { + if (c->pb.sid) { + if (c->pc.sid < c->pb.sid) { + c->stat 
= CHUNK_USED; + if (include_garbage || (c->pc.tf && c->pc.sid)) { + c->post = &c->pc; + break; + } + } else { + c->stat = BUFFER_USED; + if (c->pb.sid == c->pc.sid) { c->stat |= CHUNK_USED; } + if (include_garbage || (c->pb.tf)) { + c->post = &c->pb; + break; + } + } + } else { + c->stat = CHUNK_USED; + } + } + } + } else { + c->stat = BUFFER_USED; + if (include_garbage || (c->pb.tf && c->pb.sid)) { + c->post = &c->pb; + break; + } + } + } else { + if (c->pc.rid) { + c->stat = CHUNK_USED; + if (include_garbage || (c->pc.tf && c->pc.sid)) { + c->post = &c->pc; + break; + } + } else { + c->post = NULL; + return NULL; + } + } + } + } else { + if (c->stat & SOLE_DOC_USED) { + c->post = NULL; + return NULL; + } else { + c->post = &c->pb; + c->stat |= SOLE_DOC_USED; + if (c->post->rid < c->min) { + c->post = NULL; + return NULL; + } + } + } + return c->post; +} + +grn_posting * +grn_ii_cursor_next(grn_ctx *ctx, grn_ii_cursor *c) +{ + grn_ii_cursor_next_options options = { + .include_garbage = GRN_FALSE + }; + return grn_ii_cursor_next_internal(ctx, c, &options); +} + +grn_posting * +grn_ii_cursor_next_pos(grn_ctx *ctx, grn_ii_cursor *c) +{ + uint32_t gap; + if ((c->ii->header->flags & GRN_OBJ_WITH_POSITION)) { + if (c->nelements == c->ii->n_elements) { + if (c->buf) { + if (c->post == &c->pc) { + if (c->pc.rest) { + c->pc.rest--; + c->pc.pos += *c->cpp++; + } else { + return NULL; + } + } else if (c->post == &c->pb) { + if (buffer_is_reused(ctx, c->ii, c)) { + GRN_LOG(ctx, GRN_LOG_WARNING, + "[ii][cursor][next][pos][buffer] " + "buffer(%d,%d) is reused by another thread: %p", + c->buffer_pseg, *c->ppseg, + c); + return NULL; + } + if (c->pb.rest) { + c->pb.rest--; + GRN_B_DEC(gap, c->bp); + c->pb.pos += gap; + } else { + return NULL; + } + } else { + return NULL; + } + } else { + if (c->stat & SOLE_POS_USED) { + return NULL; + } else { + c->stat |= SOLE_POS_USED; + } + } + } + } else { + if (c->stat & SOLE_POS_USED) { + return NULL; + } else { + c->stat |= SOLE_POS_USED; 
+ } + } + return c->post; +} + +grn_rc +grn_ii_cursor_close(grn_ctx *ctx, grn_ii_cursor *c) +{ + if (!c) { return GRN_INVALID_ARGUMENT; } + datavec_fin(ctx, c->rdv); + if (c->cinfo) { GRN_FREE(c->cinfo); } + if (c->buf) { buffer_close(ctx, c->ii, c->buffer_pseg); } + if (c->cp) { grn_io_win_unmap(&c->iw); } + GRN_FREE(c); + return GRN_SUCCESS; +} + +uint32_t +grn_ii_get_chunksize(grn_ctx *ctx, grn_ii *ii, grn_id tid) +{ + uint32_t res, pos, *a; + a = array_at(ctx, ii, tid); + if (!a) { return 0; } + if ((pos = a[0])) { + if (pos & 1) { + res = 0; + } else { + buffer *buf; + uint32_t pseg; + buffer_term *bt; + if ((pseg = buffer_open(ctx, ii, pos, &bt, &buf)) == GRN_II_PSEG_NOT_ASSIGNED) { + res = 0; + } else { + res = bt->size_in_chunk; + buffer_close(ctx, ii, pseg); + } + } + } else { + res = 0; + } + array_unref(ii, tid); + return res; +} + +uint32_t +grn_ii_estimate_size(grn_ctx *ctx, grn_ii *ii, grn_id tid) +{ + uint32_t res, pos, *a; + a = array_at(ctx, ii, tid); + if (!a) { return 0; } + if ((pos = a[0])) { + if (pos & 1) { + res = 1; + } else { + buffer *buf; + uint32_t pseg; + buffer_term *bt; + if ((pseg = buffer_open(ctx, ii, pos, &bt, &buf)) == GRN_II_PSEG_NOT_ASSIGNED) { + res = 0; + } else { + res = a[1] + bt->size_in_buffer + 2; + buffer_close(ctx, ii, pseg); + } + } + } else { + res = 0; + } + array_unref(ii, tid); + return res; +} + +int +grn_ii_entry_info(grn_ctx *ctx, grn_ii *ii, grn_id tid, unsigned int *a, + unsigned int *chunk, unsigned int *chunk_size, + unsigned int *buffer_free, + unsigned int *nterms, unsigned int *nterms_void, + unsigned int *bt_tid, + unsigned int *size_in_chunk, unsigned int *pos_in_chunk, + unsigned int *size_in_buffer, unsigned int *pos_in_buffer) +{ + buffer *b; + buffer_term *bt; + uint32_t pseg, *ap; + ERRCLR(NULL); + ap = array_at(ctx, ii, tid); + if (!ap) { return 0; } + a[0] = *ap; + array_unref(ii, tid); + if (!a[0]) { return 1; } + if (a[0] & 1) { return 2; } + if ((pseg = buffer_open(ctx, ii, a[0], &bt, &b)) 
== GRN_II_PSEG_NOT_ASSIGNED) { return 3; } + *chunk = b->header.chunk; + *chunk_size = b->header.chunk_size; + *buffer_free = b->header.buffer_free; + *nterms = b->header.nterms; + *bt_tid = bt->tid; + *size_in_chunk = bt->size_in_chunk; + *pos_in_chunk = bt->pos_in_chunk; + *size_in_buffer = bt->size_in_buffer; + *pos_in_buffer = bt->pos_in_buffer; + buffer_close(ctx, ii, pseg); + return 4; +} + +const char * +grn_ii_path(grn_ii *ii) +{ + return grn_io_path(ii->seg); +} + +uint32_t +grn_ii_max_section(grn_ii *ii) +{ + return ii->header->smax; +} + +grn_obj * +grn_ii_lexicon(grn_ii *ii) +{ + return ii->lexicon; +} + +/* private classes */ + +/* b-heap */ + +typedef struct { + int n_entries; + int n_bins; + grn_ii_cursor **bins; +} cursor_heap; + +static inline cursor_heap * +cursor_heap_open(grn_ctx *ctx, int max) +{ + cursor_heap *h = GRN_MALLOC(sizeof(cursor_heap)); + if (!h) { return NULL; } + h->bins = GRN_MALLOC(sizeof(grn_ii_cursor *) * max); + if (!h->bins) { + GRN_FREE(h); + return NULL; + } + h->n_entries = 0; + h->n_bins = max; + return h; +} + +static inline grn_rc +cursor_heap_push(grn_ctx *ctx, cursor_heap *h, grn_ii *ii, grn_id tid, uint32_t offset2, + int weight, grn_id min) +{ + int n, n2; + grn_ii_cursor *c, *c2; + if (h->n_entries >= h->n_bins) { + int max = h->n_bins * 2; + grn_ii_cursor **bins = GRN_REALLOC(h->bins, sizeof(grn_ii_cursor *) * max); + GRN_LOG(ctx, GRN_LOG_DEBUG, "expanded cursor_heap to %d,%p", max, bins); + if (!bins) { return GRN_NO_MEMORY_AVAILABLE; } + h->n_bins = max; + h->bins = bins; + } + { + if (!(c = grn_ii_cursor_open(ctx, ii, tid, min, GRN_ID_MAX, + ii->n_elements, 0))) { + GRN_LOG(ctx, GRN_LOG_ERROR, "cursor open failed"); + return ctx->rc; + } + if (!grn_ii_cursor_next(ctx, c)) { + grn_ii_cursor_close(ctx, c); + return GRN_END_OF_DATA; + } + if (!grn_ii_cursor_next_pos(ctx, c)) { + if (grn_logger_pass(ctx, GRN_LOG_ERROR)) { + char token[GRN_TABLE_MAX_KEY_SIZE]; + int token_size; + token_size = grn_table_get_key(ctx, 
+ c->ii->lexicon, + c->id, + &token, + GRN_TABLE_MAX_KEY_SIZE); + GRN_LOG(ctx, GRN_LOG_ERROR, + "[ii][cursor][heap][push] invalid cursor: " + "%p: token:<%.*s>(%u)", + c, token_size, token, c->id); + } + grn_ii_cursor_close(ctx, c); + return GRN_END_OF_DATA; + } + if (weight) { + c->weight = weight; + } + n = h->n_entries++; + while (n) { + n2 = (n - 1) >> 1; + c2 = h->bins[n2]; + if (GRN_II_CURSOR_CMP(c, c2)) { break; } + h->bins[n] = c2; + n = n2; + } + h->bins[n] = c; + } + return GRN_SUCCESS; +} + +static inline grn_rc +cursor_heap_push2(cursor_heap *h) +{ + grn_rc rc = GRN_SUCCESS; + return rc; +} + +static inline grn_ii_cursor * +cursor_heap_min(cursor_heap *h) +{ + return h->n_entries ? h->bins[0] : NULL; +} + +static inline void +cursor_heap_recalc_min(cursor_heap *h) +{ + int n = 0, n1, n2, m; + if ((m = h->n_entries) > 1) { + grn_ii_cursor *c = h->bins[0], *c1, *c2; + for (;;) { + n1 = n * 2 + 1; + n2 = n1 + 1; + c1 = n1 < m ? h->bins[n1] : NULL; + c2 = n2 < m ? h->bins[n2] : NULL; + if (c1 && GRN_II_CURSOR_CMP(c, c1)) { + if (c2 && GRN_II_CURSOR_CMP(c, c2) && GRN_II_CURSOR_CMP(c1, c2)) { + h->bins[n] = c2; + n = n2; + } else { + h->bins[n] = c1; + n = n1; + } + } else { + if (c2 && GRN_II_CURSOR_CMP(c, c2)) { + h->bins[n] = c2; + n = n2; + } else { + h->bins[n] = c; + break; + } + } + } + } +} + +static inline void +cursor_heap_pop(grn_ctx *ctx, cursor_heap *h, grn_id min) +{ + if (h->n_entries) { + grn_ii_cursor *c = h->bins[0]; + grn_ii_cursor_set_min(ctx, c, min); + if (!grn_ii_cursor_next(ctx, c)) { + grn_ii_cursor_close(ctx, c); + h->bins[0] = h->bins[--h->n_entries]; + } else if (!grn_ii_cursor_next_pos(ctx, c)) { + if (grn_logger_pass(ctx, GRN_LOG_ERROR)) { + char token[GRN_TABLE_MAX_KEY_SIZE]; + int token_size; + token_size = grn_table_get_key(ctx, + c->ii->lexicon, + c->id, + &token, + GRN_TABLE_MAX_KEY_SIZE); + GRN_LOG(ctx, GRN_LOG_ERROR, + "[ii][cursor][heap][pop] invalid cursor: " + "%p: token:<%.*s>(%u)", + c, token_size, token, c->id); + } 
+ grn_ii_cursor_close(ctx, c); + h->bins[0] = h->bins[--h->n_entries]; + } + if (h->n_entries > 1) { cursor_heap_recalc_min(h); } + } +} + +static inline void +cursor_heap_pop_pos(grn_ctx *ctx, cursor_heap *h) +{ + if (h->n_entries) { + grn_ii_cursor *c = h->bins[0]; + if (!grn_ii_cursor_next_pos(ctx, c)) { + if (!grn_ii_cursor_next(ctx, c)) { + grn_ii_cursor_close(ctx, c); + h->bins[0] = h->bins[--h->n_entries]; + } else if (!grn_ii_cursor_next_pos(ctx, c)) { + if (grn_logger_pass(ctx, GRN_LOG_ERROR)) { + char token[GRN_TABLE_MAX_KEY_SIZE]; + int token_size; + token_size = grn_table_get_key(ctx, + c->ii->lexicon, + c->id, + &token, + GRN_TABLE_MAX_KEY_SIZE); + GRN_LOG(ctx, GRN_LOG_ERROR, + "[ii][cursor][heap][pop][position] invalid cursor: " + "%p: token:<%.*s>(%u)", + c, token_size, token, c->id); + } + grn_ii_cursor_close(ctx, c); + h->bins[0] = h->bins[--h->n_entries]; + } + } + if (h->n_entries > 1) { cursor_heap_recalc_min(h); } + } +} + +static inline void +cursor_heap_close(grn_ctx *ctx, cursor_heap *h) +{ + int i; + if (!h) { return; } + for (i = h->n_entries; i--;) { grn_ii_cursor_close(ctx, h->bins[i]); } + GRN_FREE(h->bins); + GRN_FREE(h); +} + +/* update */ +#ifdef USE_VGRAM + +inline static grn_rc +index_add(grn_ctx *ctx, grn_id rid, grn_obj *lexicon, grn_ii *ii, grn_vgram *vgram, + const char *value, size_t value_len) +{ + grn_hash *h; + unsigned int token_flags = 0; + grn_token_cursor *token_cursor; + grn_ii_updspec **u; + grn_id tid, *tp; + grn_rc r, rc = GRN_SUCCESS; + grn_vgram_buf *sbuf = NULL; + if (!rid) { return GRN_INVALID_ARGUMENT; } + if (!(token_cursor = grn_token_cursor_open(ctx, lexicon, value, value_len, + GRN_TOKEN_ADD, token_flags))) { + return GRN_NO_MEMORY_AVAILABLE; + } + if (vgram) { sbuf = grn_vgram_buf_open(value_len); } + h = grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(grn_ii_updspec *), + GRN_HASH_TINY); + if (!h) { + GRN_LOG(ctx, GRN_LOG_ALERT, "grn_hash_create on index_add failed !"); + grn_token_cursor_close(ctx, 
token_cursor); + if (sbuf) { grn_vgram_buf_close(sbuf); } + return GRN_NO_MEMORY_AVAILABLE; + } + while (!token_cursor->status) { + (tid = grn_token_cursor_next(ctx, token_cursor)); + if (tid) { + if (!grn_hash_add(ctx, h, &tid, sizeof(grn_id), (void **) &u, NULL)) { + break; + } + if (!*u) { + if (!(*u = grn_ii_updspec_open(ctx, rid, 1))) { + GRN_LOG(ctx, GRN_LOG_ERROR, + "grn_ii_updspec_open on index_add failed!"); + goto exit; + } + } + if (grn_ii_updspec_add(ctx, *u, token_cursor->pos, 0)) { + GRN_LOG(ctx, GRN_LOG_ERROR, + "grn_ii_updspec_add on index_add failed!"); + goto exit; + } + if (sbuf) { grn_vgram_buf_add(sbuf, tid); } + } + } + grn_token_cursor_close(ctx, token_cursor); + // todo : support vgram + // if (sbuf) { grn_vgram_update(vgram, rid, sbuf, (grn_set *)h); } + GRN_HASH_EACH(ctx, h, id, &tp, NULL, &u, { + if ((r = grn_ii_update_one(ctx, ii, *tp, *u, h))) { rc = r; } + grn_ii_updspec_close(ctx, *u); + }); + grn_hash_close(ctx, h); + if (sbuf) { grn_vgram_buf_close(sbuf); } + return rc; +exit: + grn_hash_close(ctx, h); + grn_token_cursor_close(ctx, token_cursor); + if (sbuf) { grn_vgram_buf_close(sbuf); } + return GRN_NO_MEMORY_AVAILABLE; +} + +inline static grn_rc +index_del(grn_ctx *ctx, grn_id rid, grn_obj *lexicon, grn_ii *ii, grn_vgram *vgram, + const char *value, size_t value_len) +{ + grn_rc rc = GRN_SUCCESS; + grn_hash *h; + unsigned int token_flags = 0; + grn_token_cursor *token_cursor; + grn_ii_updspec **u; + grn_id tid, *tp; + if (!rid) { return GRN_INVALID_ARGUMENT; } + if (!(token_cursor = grn_token_cursor_open(ctx, lexicon, value, value_len, + GRN_TOKEN_DEL, token_flags))) { + return GRN_NO_MEMORY_AVAILABLE; + } + h = grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(grn_ii_updspec *), + GRN_HASH_TINY); + if (!h) { + GRN_LOG(ctx, GRN_LOG_ALERT, "grn_hash_create on index_del failed !"); + grn_token_cursor_close(ctx, token_cursor); + return GRN_NO_MEMORY_AVAILABLE; + } + while (!token_cursor->status) { + if ((tid = 
grn_token_cursor_next(ctx, token_cursor))) { + if (!grn_hash_add(ctx, h, &tid, sizeof(grn_id), (void **) &u, NULL)) { + break; + } + if (!*u) { + if (!(*u = grn_ii_updspec_open(ctx, rid, 0))) { + GRN_LOG(ctx, GRN_LOG_ALERT, + "grn_ii_updspec_open on index_del failed !"); + grn_hash_close(ctx, h); + grn_token_cursor_close(ctx, token_cursor); + return GRN_NO_MEMORY_AVAILABLE; + } + } + } + } + grn_token_cursor_close(ctx, token_cursor); + GRN_HASH_EACH(ctx, h, id, &tp, NULL, &u, { + if (*tp) { + grn_rc r; + r = grn_ii_delete_one(ctx, ii, *tp, *u, NULL); + if (r) { + rc = r; + } + } + grn_ii_updspec_close(ctx, *u); + }); + grn_hash_close(ctx, h); + return rc; +} + +grn_rc +grn_ii_upd(grn_ctx *ctx, grn_ii *ii, grn_id rid, grn_vgram *vgram, + const char *oldvalue, unsigned int oldvalue_len, + const char *newvalue, unsigned int newvalue_len) +{ + grn_rc rc; + grn_obj *lexicon = ii->lexicon; + if (!rid) { return GRN_INVALID_ARGUMENT; } + if (oldvalue && *oldvalue) { + if ((rc = index_del(ctx, rid, lexicon, ii, vgram, oldvalue, oldvalue_len))) { + GRN_LOG(ctx, GRN_LOG_ERROR, "index_del on grn_ii_upd failed !"); + goto exit; + } + } + if (newvalue && *newvalue) { + rc = index_add(ctx, rid, lexicon, ii, vgram, newvalue, newvalue_len); + } +exit : + return rc; +} + +grn_rc +grn_ii_update(grn_ctx *ctx, grn_ii *ii, grn_id rid, grn_vgram *vgram, unsigned int section, + grn_values *oldvalues, grn_values *newvalues) +{ + int j; + grn_value *v; + unsigned int token_flags = 0; + grn_token_cursor *token_cursor; + grn_rc rc = GRN_SUCCESS; + grn_hash *old, *new; + grn_id tid, *tp; + grn_ii_updspec **u, **un; + grn_obj *lexicon = ii->lexicon; + if (!lexicon || !ii || !rid) { + GRN_LOG(ctx, GRN_LOG_WARNING, "grn_ii_update: invalid argument"); + return GRN_INVALID_ARGUMENT; + } + if (newvalues) { + new = grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(grn_ii_updspec *), + GRN_HASH_TINY); + if (!new) { + GRN_LOG(ctx, GRN_LOG_ALERT, "grn_hash_create on grn_ii_update failed !"); + rc = 
GRN_NO_MEMORY_AVAILABLE; + goto exit; + } + for (j = newvalues->n_values, v = newvalues->values; j; j--, v++) { + if ((token_cursor = grn_token_cursor_open(ctx, lexicon, v->str, + v->str_len, GRN_TOKEN_ADD, + token_flags))) { + while (!token_cursor->status) { + if ((tid = grn_token_cursor_next(ctx, token_cursor))) { + if (!grn_hash_add(ctx, new, &tid, sizeof(grn_id), (void **) &u, + NULL)) { + break; + } + if (!*u) { + if (!(*u = grn_ii_updspec_open(ctx, rid, section))) { + GRN_LOG(ctx, GRN_LOG_ALERT, + "grn_ii_updspec_open on grn_ii_update failed!"); + grn_token_cursor_close(ctx, token_cursor); + grn_hash_close(ctx, new); + rc = GRN_NO_MEMORY_AVAILABLE; + goto exit; + } + } + if (grn_ii_updspec_add(ctx, *u, token_cursor->pos, v->weight)) { + GRN_LOG(ctx, GRN_LOG_ALERT, + "grn_ii_updspec_add on grn_ii_update failed!"); + grn_token_cursor_close(ctx, token_cursor); + grn_hash_close(ctx, new); + rc = GRN_NO_MEMORY_AVAILABLE; + goto exit; + } + } + } + grn_token_cursor_close(ctx, token_cursor); + } + } + if (!GRN_HASH_SIZE(new)) { + grn_hash_close(ctx, new); + new = NULL; + } + } else { + new = NULL; + } + if (oldvalues) { + old = grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(grn_ii_updspec *), + GRN_HASH_TINY); + if (!old) { + GRN_LOG(ctx, GRN_LOG_ALERT, + "grn_hash_create(ctx, NULL, old) on grn_ii_update failed!"); + if (new) { grn_hash_close(ctx, new); } + rc = GRN_NO_MEMORY_AVAILABLE; + goto exit; + } + for (j = oldvalues->n_values, v = oldvalues->values; j; j--, v++) { + if ((token_cursor = grn_token_cursor_open(ctx, lexicon, v->str, + v->str_len, GRN_TOKEN_DEL, + token_flags))) { + while (!token_cursor->status) { + if ((tid = grn_token_cursor_next(ctx, token_cursor))) { + if (!grn_hash_add(ctx, old, &tid, sizeof(grn_id), (void **) &u, + NULL)) { + break; + } + if (!*u) { + if (!(*u = grn_ii_updspec_open(ctx, rid, section))) { + GRN_LOG(ctx, GRN_LOG_ALERT, + "grn_ii_updspec_open on grn_ii_update failed!"); + grn_token_cursor_close(ctx, token_cursor); + if 
(new) { grn_hash_close(ctx, new); }; + grn_hash_close(ctx, old); + rc = GRN_NO_MEMORY_AVAILABLE; + goto exit; + } + } + if (grn_ii_updspec_add(ctx, *u, token_cursor->pos, v->weight)) { + GRN_LOG(ctx, GRN_LOG_ALERT, + "grn_ii_updspec_add on grn_ii_update failed!"); + grn_token_cursor_close(ctx, token_cursor); + if (new) { grn_hash_close(ctx, new); }; + grn_hash_close(ctx, old); + rc = GRN_NO_MEMORY_AVAILABLE; + goto exit; + } + } + } + grn_token_cursor_close(ctx, token_cursor); + } + } + } else { + old = NULL; + } + if (old) { + grn_id eid; + GRN_HASH_EACH(ctx, old, id, &tp, NULL, &u, { + if (new && (eid = grn_hash_get(ctx, new, tp, sizeof(grn_id), + (void **) &un))) { + if (!grn_ii_updspec_cmp(*u, *un)) { + grn_ii_updspec_close(ctx, *un); + grn_hash_delete_by_id(ctx, new, eid, NULL); + } + } else { + grn_rc r; + r = grn_ii_delete_one(ctx, ii, *tp, *u, new); + if (r) { + rc = r; + } + } + grn_ii_updspec_close(ctx, *u); + }); + grn_hash_close(ctx, old); + } + if (new) { + GRN_HASH_EACH(ctx, new, id, &tp, NULL, &u, { + grn_rc r; + if ((r = grn_ii_update_one(ctx, ii, *tp, *u, new))) { rc = r; } + grn_ii_updspec_close(ctx, *u); + }); + grn_hash_close(ctx, new); + } else { + if (!section) { + /* todo: delete key when all sections deleted */ + } + } +exit : + return rc; +} +#endif /* USE_VGRAM */ + +static grn_rc +grn_vector2updspecs(grn_ctx *ctx, grn_ii *ii, grn_id rid, unsigned int section, + grn_obj *in, grn_obj *out, grn_tokenize_mode mode, + grn_obj *posting) +{ + int j; + grn_id tid; + grn_section *v; + grn_token_cursor *token_cursor; + grn_ii_updspec **u; + grn_hash *h = (grn_hash *)out; + grn_obj *lexicon = ii->lexicon; + if (in->u.v.body) { + const char *head = GRN_BULK_HEAD(in->u.v.body); + for (j = in->u.v.n_sections, v = in->u.v.sections; j; j--, v++) { + unsigned int token_flags = 0; + if (v->length && + (token_cursor = grn_token_cursor_open(ctx, lexicon, head + v->offset, + v->length, mode, + token_flags))) { + while (!token_cursor->status) { + if ((tid = 
grn_token_cursor_next(ctx, token_cursor))) { + if (posting) { GRN_RECORD_PUT(ctx, posting, tid); } + if (!grn_hash_add(ctx, h, &tid, sizeof(grn_id), (void **) &u, + NULL)) { + break; + } + if (!*u) { + if (!(*u = grn_ii_updspec_open(ctx, rid, section))) { + DEFINE_NAME(ii); + MERR("[ii][update][spec] failed to create an update spec: " + "<%.*s>: " + "record:<%u>:<%u>, token:<%u>:<%d>:<%u>", + name_size, name, + rid, section, + tid, token_cursor->pos, v->weight); + grn_token_cursor_close(ctx, token_cursor); + return ctx->rc; + } + } + if (grn_ii_updspec_add(ctx, *u, token_cursor->pos, v->weight)) { + DEFINE_NAME(ii); + MERR("[ii][update][spec] failed to add to update spec: " + "<%.*s>: " + "record:<%u>:<%u>, token:<%u>:<%d>:<%u>", + name_size, name, + rid, section, + tid, token_cursor->pos, v->weight); + grn_token_cursor_close(ctx, token_cursor); + return ctx->rc; + } + } + } + grn_token_cursor_close(ctx, token_cursor); + } + } + } + return ctx->rc; +} + +static grn_rc +grn_uvector2updspecs_data(grn_ctx *ctx, grn_ii *ii, grn_id rid, + unsigned int section, grn_obj *in, grn_obj *out, + grn_tokenize_mode mode, grn_obj *posting) +{ + int i, n; + grn_hash *h = (grn_hash *)out; + grn_obj *lexicon = ii->lexicon; + unsigned int element_size; + + n = grn_uvector_size(ctx, in); + element_size = grn_uvector_element_size(ctx, in); + for (i = 0; i < n; i++) { + grn_obj *tokenizer; + grn_token_cursor *token_cursor; + unsigned int token_flags = 0; + const char *element; + + tokenizer = grn_obj_get_info(ctx, lexicon, GRN_INFO_DEFAULT_TOKENIZER, + NULL); + + element = GRN_BULK_HEAD(in) + (element_size * i); + token_cursor = grn_token_cursor_open(ctx, lexicon, + element, element_size, + mode, token_flags); + if (!token_cursor) { + continue; + } + + while (!token_cursor->status) { + grn_id tid; + if ((tid = grn_token_cursor_next(ctx, token_cursor))) { + grn_ii_updspec **u; + int pos; + + if (posting) { GRN_RECORD_PUT(ctx, posting, tid); } + if (!grn_hash_add(ctx, h, &tid, 
sizeof(grn_id), (void **)&u, NULL)) { + break; + } + if (!*u) { + if (!(*u = grn_ii_updspec_open(ctx, rid, section))) { + GRN_LOG(ctx, GRN_LOG_ALERT, + "grn_ii_updspec_open on grn_uvector2updspecs_data failed!"); + grn_token_cursor_close(ctx, token_cursor); + return GRN_NO_MEMORY_AVAILABLE; + } + } + if (tokenizer) { + pos = token_cursor->pos; + } else { + pos = i; + } + if (grn_ii_updspec_add(ctx, *u, pos, 0)) { + GRN_LOG(ctx, GRN_LOG_ALERT, + "grn_ii_updspec_add on grn_uvector2updspecs failed!"); + grn_token_cursor_close(ctx, token_cursor); + return GRN_NO_MEMORY_AVAILABLE; + } + } + } + + grn_token_cursor_close(ctx, token_cursor); + } + + return GRN_SUCCESS; +} + +static grn_rc +grn_uvector2updspecs_id(grn_ctx *ctx, grn_ii *ii, grn_id rid, + unsigned int section, grn_obj *in, grn_obj *out) +{ + int i, n; + grn_ii_updspec **u; + grn_hash *h = (grn_hash *)out; + + n = grn_vector_size(ctx, in); + for (i = 0; i < n; i++) { + grn_id id; + unsigned int weight; + + id = grn_uvector_get_element(ctx, in, i, &weight); + if (!grn_hash_add(ctx, h, &id, sizeof(grn_id), (void **)&u, NULL)) { + break; + } + if (!*u) { + if (!(*u = grn_ii_updspec_open(ctx, rid, section))) { + GRN_LOG(ctx, GRN_LOG_ALERT, + "grn_ii_updspec_open on grn_ii_update failed!"); + return GRN_NO_MEMORY_AVAILABLE; + } + } + if (grn_ii_updspec_add(ctx, *u, i, weight)) { + GRN_LOG(ctx, GRN_LOG_ALERT, + "grn_ii_updspec_add on grn_ii_update failed!"); + return GRN_NO_MEMORY_AVAILABLE; + } + } + return GRN_SUCCESS; +} + +static grn_rc +grn_uvector2updspecs(grn_ctx *ctx, grn_ii *ii, grn_id rid, + unsigned int section, grn_obj *in, grn_obj *out, + grn_tokenize_mode mode, grn_obj *posting) +{ + if (in->header.domain < GRN_N_RESERVED_TYPES) { + return grn_uvector2updspecs_data(ctx, ii, rid, section, in, out, + mode, posting); + } else { + return grn_uvector2updspecs_id(ctx, ii, rid, section, in, out); + } +} + +grn_rc +grn_ii_column_update(grn_ctx *ctx, grn_ii *ii, grn_id rid, unsigned int section, + grn_obj 
*oldvalue, grn_obj *newvalue, grn_obj *posting) +{ + grn_id *tp; + grn_bool do_grn_ii_updspec_cmp = GRN_TRUE; + grn_ii_updspec **u, **un; + grn_obj *old_, *old = oldvalue, *new_, *new = newvalue, oldv, newv; + grn_obj buf, *post = NULL; + + if (!ii) { + ERR(GRN_INVALID_ARGUMENT, "[ii][column][update] ii is NULL"); + return ctx->rc; + } + if (!ii->lexicon) { + ERR(GRN_INVALID_ARGUMENT, "[ii][column][update] lexicon is NULL"); + return ctx->rc; + } + if (rid == GRN_ID_NIL) { + ERR(GRN_INVALID_ARGUMENT, "[ii][column][update] record ID is nil"); + return ctx->rc; + } + if (old || new) { + unsigned char type = GRN_VOID; + if (old) { + type = (ii->obj.header.domain == old->header.domain) + ? GRN_UVECTOR + : old->header.type; + } + if (new) { + type = (ii->obj.header.domain == new->header.domain) + ? GRN_UVECTOR + : new->header.type; + } + if (type == GRN_VECTOR) { + grn_obj *tokenizer; + grn_table_get_info(ctx, ii->lexicon, NULL, NULL, &tokenizer, NULL, NULL); + if (tokenizer) { + grn_obj old_elem, new_elem; + unsigned int i, max_n; + unsigned int old_n = 0, new_n = 0; + if (old) { + old_n = grn_vector_size(ctx, old); + } + if (new) { + new_n = grn_vector_size(ctx, new); + } + max_n = (old_n > new_n) ? 
old_n : new_n; + GRN_OBJ_INIT(&old_elem, GRN_BULK, GRN_OBJ_DO_SHALLOW_COPY, old->header.domain); + GRN_OBJ_INIT(&new_elem, GRN_BULK, GRN_OBJ_DO_SHALLOW_COPY, new->header.domain); + for (i = 0; i < max_n; i++) { + grn_rc rc; + grn_obj *old_p = NULL, *new_p = NULL; + if (i < old_n) { + const char *str; + unsigned int size = grn_vector_get_element(ctx, old, i, &str, NULL, NULL); + GRN_TEXT_SET_REF(&old_elem, str, size); + old_p = &old_elem; + } + if (i < new_n) { + const char *str; + unsigned int size = grn_vector_get_element(ctx, new, i, &str, NULL, NULL); + GRN_TEXT_SET_REF(&new_elem, str, size); + new_p = &new_elem; + } + rc = grn_ii_column_update(ctx, ii, rid, section + i, old_p, new_p, posting); + if (rc != GRN_SUCCESS) { + break; + } + } + GRN_OBJ_FIN(ctx, &old_elem); + GRN_OBJ_FIN(ctx, &new_elem); + return ctx->rc; + } + } + } + if (posting) { + GRN_RECORD_INIT(&buf, GRN_OBJ_VECTOR, grn_obj_id(ctx, ii->lexicon)); + post = &buf; + } + if (grn_io_lock(ctx, ii->seg, grn_lock_timeout)) { return ctx->rc; } + if (new) { + unsigned char type = (ii->obj.header.domain == new->header.domain) + ? 
GRN_UVECTOR + : new->header.type; + switch (type) { + case GRN_BULK : + { + if (grn_bulk_is_zero(ctx, new)) { + do_grn_ii_updspec_cmp = GRN_FALSE; + } + new_ = new; + GRN_OBJ_INIT(&newv, GRN_VECTOR, GRN_OBJ_DO_SHALLOW_COPY, GRN_DB_TEXT); + newv.u.v.body = new; + new = &newv; + grn_vector_delimit(ctx, new, 0, GRN_ID_NIL); + if (new_ != newvalue) { grn_obj_close(ctx, new_); } + } + /* fallthru */ + case GRN_VECTOR : + new_ = new; + new = (grn_obj *)grn_hash_create(ctx, NULL, sizeof(grn_id), + sizeof(grn_ii_updspec *), + GRN_HASH_TINY); + if (!new) { + DEFINE_NAME(ii); + MERR("[ii][column][update][new][vector] failed to create a hash table: " + "<%.*s>: ", + name_size, name); + } else { + grn_vector2updspecs(ctx, ii, rid, section, new_, new, + GRN_TOKEN_ADD, post); + } + if (new_ != newvalue) { grn_obj_close(ctx, new_); } + if (ctx->rc != GRN_SUCCESS) { goto exit; } + break; + case GRN_UVECTOR : + new_ = new; + new = (grn_obj *)grn_hash_create(ctx, NULL, sizeof(grn_id), + sizeof(grn_ii_updspec *), + GRN_HASH_TINY); + if (!new) { + DEFINE_NAME(ii); + MERR("[ii][column][update][new][uvector] failed to create a hash table: " + "<%.*s>: ", + name_size, name); + } else { + if (new_->header.type == GRN_UVECTOR) { + grn_uvector2updspecs(ctx, ii, rid, section, new_, new, + GRN_TOKEN_ADD, post); + } else { + grn_obj uvector; + unsigned int weight = 0; + GRN_VALUE_FIX_SIZE_INIT(&uvector, GRN_OBJ_VECTOR, + new_->header.domain); + if (new_->header.impl_flags & GRN_OBJ_WITH_WEIGHT) { + uvector.header.impl_flags |= GRN_OBJ_WITH_WEIGHT; + } + grn_uvector_add_element(ctx, &uvector, GRN_RECORD_VALUE(new_), + weight); + grn_uvector2updspecs(ctx, ii, rid, section, &uvector, new, + GRN_TOKEN_ADD, post); + GRN_OBJ_FIN(ctx, &uvector); + } + } + if (new_ != newvalue) { grn_obj_close(ctx, new_); } + if (ctx->rc != GRN_SUCCESS) { goto exit; } + break; + case GRN_TABLE_HASH_KEY : + break; + default : + { + DEFINE_NAME(ii); + ERR(GRN_INVALID_ARGUMENT, + "[ii][column][update][new] invalid 
object: " + "<%.*s>: " + "<%-.256s>(%#x)", + name_size, name, + grn_obj_type_to_string(type), + type); + } + goto exit; + } + } + if (posting) { + grn_ii_updspec *u_; + uint32_t offset = 0; + grn_id tid_ = 0, gap, tid, *tpe; + grn_table_sort_optarg arg = {GRN_TABLE_SORT_ASC| + GRN_TABLE_SORT_AS_NUMBER| + GRN_TABLE_SORT_AS_UNSIGNED, NULL, NULL,0 }; + grn_array *sorted = grn_array_create(ctx, NULL, sizeof(grn_id), 0); + grn_hash_sort(ctx, (grn_hash *)new, -1, sorted, &arg); + GRN_TEXT_PUT(ctx, posting, ((grn_hash *)new)->n_entries, sizeof(uint32_t)); + GRN_ARRAY_EACH(ctx, sorted, 0, 0, id, &tp, { + grn_hash_get_key(ctx, (grn_hash *)new, *tp, &tid, sizeof(grn_id)); + gap = tid - tid_; + GRN_TEXT_PUT(ctx, posting, &gap, sizeof(grn_id)); + tid_ = tid; + }); + GRN_ARRAY_EACH(ctx, sorted, 0, 0, id, &tp, { + grn_hash_get_value(ctx, (grn_hash *)new, *tp, &u_); + u_->offset = offset++; + GRN_TEXT_PUT(ctx, posting, &u_->tf, sizeof(int32_t)); + }); + tpe = (grn_id *)GRN_BULK_CURR(post); + for (tp = (grn_id *)GRN_BULK_HEAD(post); tp < tpe; tp++) { + grn_hash_get(ctx, (grn_hash *)new, (void *)tp, sizeof(grn_id), + (void **)&u); + GRN_TEXT_PUT(ctx, posting, &(*u)->offset, sizeof(int32_t)); + } + GRN_OBJ_FIN(ctx, post); + grn_array_close(ctx, sorted); + } + + if (old) { + unsigned char type = (ii->obj.header.domain == old->header.domain) + ? 
GRN_UVECTOR + : old->header.type; + switch (type) { + case GRN_BULK : + { + // const char *str = GRN_BULK_HEAD(old); + // unsigned int str_len = GRN_BULK_VSIZE(old); + old_ = old; + GRN_OBJ_INIT(&oldv, GRN_VECTOR, GRN_OBJ_DO_SHALLOW_COPY, GRN_DB_TEXT); + oldv.u.v.body = old; + old = &oldv; + grn_vector_delimit(ctx, old, 0, GRN_ID_NIL); + if (old_ != oldvalue) { grn_obj_close(ctx, old_); } + } + /* fallthru */ + case GRN_VECTOR : + old_ = old; + old = (grn_obj *)grn_hash_create(ctx, NULL, sizeof(grn_id), + sizeof(grn_ii_updspec *), + GRN_HASH_TINY); + if (!old) { + DEFINE_NAME(ii); + MERR("[ii][column][update][old][vector] failed to create a hash table: " + "<%.*s>: ", + name_size, name); + } else { + grn_vector2updspecs(ctx, ii, rid, section, old_, old, + GRN_TOKEN_DEL, NULL); + } + if (old_ != oldvalue) { grn_obj_close(ctx, old_); } + if (ctx->rc != GRN_SUCCESS) { goto exit; } + break; + case GRN_UVECTOR : + old_ = old; + old = (grn_obj *)grn_hash_create(ctx, NULL, sizeof(grn_id), + sizeof(grn_ii_updspec *), + GRN_HASH_TINY); + if (!old) { + DEFINE_NAME(ii); + MERR("[ii][column][update][old][uvector] failed to create a hash table: " + "<%.*s>: ", + name_size, name); + } else { + if (old_->header.type == GRN_UVECTOR) { + grn_uvector2updspecs(ctx, ii, rid, section, old_, old, + GRN_TOKEN_DEL, NULL); + } else { + grn_obj uvector; + unsigned int weight = 0; + GRN_VALUE_FIX_SIZE_INIT(&uvector, GRN_OBJ_VECTOR, + old_->header.domain); + if (old_->header.impl_flags & GRN_OBJ_WITH_WEIGHT) { + uvector.header.impl_flags |= GRN_OBJ_WITH_WEIGHT; + } + grn_uvector_add_element(ctx, &uvector, GRN_RECORD_VALUE(old_), + weight); + grn_uvector2updspecs(ctx, ii, rid, section, &uvector, old, + GRN_TOKEN_DEL, NULL); + GRN_OBJ_FIN(ctx, &uvector); + } + } + if (old_ != oldvalue) { grn_obj_close(ctx, old_); } + if (ctx->rc != GRN_SUCCESS) { goto exit; } + break; + case GRN_TABLE_HASH_KEY : + break; + default : + { + DEFINE_NAME(ii); + ERR(GRN_INVALID_ARGUMENT, + "[ii][column][update][old] 
invalid object: " + "<%.*s>: " + "<%-.256s>(%#x)", + name_size, name, + grn_obj_type_to_string(type), + type); + } + goto exit; + } + } + + if (old) { + grn_id eid; + grn_hash *o = (grn_hash *)old; + grn_hash *n = (grn_hash *)new; + GRN_HASH_EACH(ctx, o, id, &tp, NULL, &u, { + if (n && (eid = grn_hash_get(ctx, n, tp, sizeof(grn_id), + (void **) &un))) { + if (do_grn_ii_updspec_cmp && !grn_ii_updspec_cmp(*u, *un)) { + grn_ii_updspec_close(ctx, *un); + grn_hash_delete_by_id(ctx, n, eid, NULL); + } + } else { + grn_ii_delete_one(ctx, ii, *tp, *u, n); + } + grn_ii_updspec_close(ctx, *u); + if (ctx->rc != GRN_SUCCESS) { + break; + } + }); + } + if (new) { + grn_hash *n = (grn_hash *)new; + GRN_HASH_EACH(ctx, n, id, &tp, NULL, &u, { + grn_ii_update_one(ctx, ii, *tp, *u, n); + grn_ii_updspec_close(ctx, *u); + if (ctx->rc != GRN_SUCCESS) { + break; + } + }); + } else { + if (!section) { + /* todo: delete key when all sections deleted */ + } + } +exit : + grn_io_unlock(ii->seg); + if (old && old != oldvalue) { grn_obj_close(ctx, old); } + if (new && new != newvalue) { grn_obj_close(ctx, new); } + return ctx->rc; +} + +/* token_info */ + +typedef struct { + cursor_heap *cursors; + int offset; + int pos; + int size; + int ntoken; + grn_posting *p; +} token_info; + +#define EX_NONE 0 +#define EX_PREFIX 1 +#define EX_SUFFIX 2 +#define EX_BOTH 3 +#define EX_FUZZY 4 + +inline static void +token_info_expand_both(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, + const char *key, unsigned int key_size, token_info *ti) +{ + int s = 0; + grn_hash *h, *g; + uint32_t *offset2; + grn_hash_cursor *c; + grn_id *tp, *tq; + if ((h = grn_hash_create(ctx, NULL, sizeof(grn_id), 0, 0))) { + grn_table_search(ctx, lexicon, key, key_size, + GRN_OP_PREFIX, (grn_obj *)h, GRN_OP_OR); + if (GRN_HASH_SIZE(h)) { + if ((ti->cursors = cursor_heap_open(ctx, GRN_HASH_SIZE(h) + 256))) { + if ((c = grn_hash_cursor_open(ctx, h, NULL, 0, NULL, 0, 0, -1, 0))) { + uint32_t key2_size; + const char *key2; + while 
(grn_hash_cursor_next(ctx, c)) { + grn_hash_cursor_get_key(ctx, c, (void **) &tp); + key2 = _grn_table_key(ctx, lexicon, *tp, &key2_size); + if (!key2) { break; } + if ((lexicon->header.type != GRN_TABLE_PAT_KEY) || + !(lexicon->header.flags & GRN_OBJ_KEY_WITH_SIS) || + key2_size <= 2) { // todo: refine + if ((s = grn_ii_estimate_size(ctx, ii, *tp))) { + cursor_heap_push(ctx, ti->cursors, ii, *tp, 0, 0, GRN_ID_NIL); + ti->ntoken++; + ti->size += s; + } + } else { + if ((g = grn_hash_create(ctx, NULL, sizeof(grn_id), 0, + GRN_HASH_TINY))) { + grn_pat_suffix_search(ctx, (grn_pat *)lexicon, key2, key2_size, + g); + GRN_HASH_EACH(ctx, g, id, &tq, NULL, &offset2, { + if ((s = grn_ii_estimate_size(ctx, ii, *tq))) { + cursor_heap_push(ctx, ti->cursors, ii, *tq, + /* *offset2 */ 0, 0, GRN_ID_NIL); + ti->ntoken++; + ti->size += s; + } + }); + grn_hash_close(ctx, g); + } + } + } + grn_hash_cursor_close(ctx, c); + } + } + } + grn_hash_close(ctx, h); + } +} + +inline static grn_rc +token_info_close(grn_ctx *ctx, token_info *ti) +{ + cursor_heap_close(ctx, ti->cursors); + GRN_FREE(ti); + return GRN_SUCCESS; +} + +inline static token_info * +token_info_open(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, + const char *key, unsigned int key_size, uint32_t offset, + int mode, grn_fuzzy_search_optarg *args, grn_id min) +{ + int s = 0; + grn_hash *h; + token_info *ti; + grn_id tid; + grn_id *tp; + if (!key) { return NULL; } + if (!(ti = GRN_MALLOC(sizeof(token_info)))) { return NULL; } + ti->cursors = NULL; + ti->size = 0; + ti->ntoken = 0; + ti->offset = offset; + switch (mode) { + case EX_BOTH : + token_info_expand_both(ctx, lexicon, ii, key, key_size, ti); + break; + case EX_NONE : + if ((tid = grn_table_get(ctx, lexicon, key, key_size)) && + (s = grn_ii_estimate_size(ctx, ii, tid)) && + (ti->cursors = cursor_heap_open(ctx, 1))) { + cursor_heap_push(ctx, ti->cursors, ii, tid, 0, 0, min); + ti->ntoken++; + ti->size = s; + } + break; + case EX_PREFIX : + if ((h = grn_hash_create(ctx, 
NULL, sizeof(grn_id), 0, 0))) { + grn_table_search(ctx, lexicon, key, key_size, + GRN_OP_PREFIX, (grn_obj *)h, GRN_OP_OR); + if (GRN_HASH_SIZE(h)) { + if ((ti->cursors = cursor_heap_open(ctx, GRN_HASH_SIZE(h)))) { + GRN_HASH_EACH(ctx, h, id, &tp, NULL, NULL, { + if ((s = grn_ii_estimate_size(ctx, ii, *tp))) { + cursor_heap_push(ctx, ti->cursors, ii, *tp, 0, 0, min); + ti->ntoken++; + ti->size += s; + } + }); + } + } + grn_hash_close(ctx, h); + } + break; + case EX_SUFFIX : + if ((h = grn_hash_create(ctx, NULL, sizeof(grn_id), 0, 0))) { + grn_table_search(ctx, lexicon, key, key_size, + GRN_OP_SUFFIX, (grn_obj *)h, GRN_OP_OR); + if (GRN_HASH_SIZE(h)) { + if ((ti->cursors = cursor_heap_open(ctx, GRN_HASH_SIZE(h)))) { + uint32_t *offset2; + GRN_HASH_EACH(ctx, h, id, &tp, NULL, &offset2, { + if ((s = grn_ii_estimate_size(ctx, ii, *tp))) { + cursor_heap_push(ctx, ti->cursors, ii, *tp, /* *offset2 */ 0, 0, min); + ti->ntoken++; + ti->size += s; + } + }); + } + } + grn_hash_close(ctx, h); + } + break; + case EX_FUZZY : + if ((h = (grn_hash *)grn_table_create(ctx, NULL, 0, NULL, + GRN_OBJ_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, + grn_ctx_at(ctx, GRN_DB_UINT32), NULL))) { + grn_table_fuzzy_search(ctx, lexicon, key, key_size, + args, (grn_obj *)h, GRN_OP_OR); + if (GRN_HASH_SIZE(h)) { + if ((ti->cursors = cursor_heap_open(ctx, GRN_HASH_SIZE(h)))) { + grn_rset_recinfo *ri; + GRN_HASH_EACH(ctx, h, id, &tp, NULL, (void **)&ri, { + if ((s = grn_ii_estimate_size(ctx, ii, *tp))) { + cursor_heap_push(ctx, ti->cursors, ii, *tp, 0, ri->score - 1, min); + ti->ntoken++; + ti->size += s; + } + }); + } + } + grn_obj_close(ctx, (grn_obj *)h); + } + break; + } + if (cursor_heap_push2(ti->cursors)) { + token_info_close(ctx, ti); + return NULL; + } + { + grn_ii_cursor *ic; + if (ti->cursors && (ic = cursor_heap_min(ti->cursors))) { + grn_posting *p = ic->post; + ti->pos = p->pos - ti->offset; + ti->p = p; + } else { + token_info_close(ctx, ti); + ti = NULL; + } + } + return ti; +} + +static 
inline grn_rc +token_info_skip(grn_ctx *ctx, token_info *ti, uint32_t rid, uint32_t sid) +{ + grn_ii_cursor *c; + grn_posting *p; + for (;;) { + if (!(c = cursor_heap_min(ti->cursors))) { return GRN_END_OF_DATA; } + p = c->post; + if (p->rid > rid || (p->rid == rid && p->sid >= sid)) { break; } + cursor_heap_pop(ctx, ti->cursors, rid); + } + ti->pos = p->pos - ti->offset; + ti->p = p; + return GRN_SUCCESS; +} + +static inline grn_rc +token_info_skip_pos(grn_ctx *ctx, token_info *ti, uint32_t rid, uint32_t sid, uint32_t pos) +{ + grn_ii_cursor *c; + grn_posting *p; + pos += ti->offset; + for (;;) { + if (!(c = cursor_heap_min(ti->cursors))) { return GRN_END_OF_DATA; } + p = c->post; + if (p->rid != rid || p->sid != sid || p->pos >= pos) { break; } + cursor_heap_pop_pos(ctx, ti->cursors); + } + ti->pos = p->pos - ti->offset; + ti->p = p; + return GRN_SUCCESS; +} + +inline static int +token_compare(const void *a, const void *b) +{ + const token_info *t1 = *((token_info **)a), *t2 = *((token_info **)b); + return t1->size - t2->size; +} + +#define TOKEN_CANDIDATE_NODE_SIZE 32 +#define TOKEN_CANDIDATE_ADJACENT_MAX_SIZE 16 +#define TOKEN_CANDIDATE_QUEUE_SIZE 64 +#define TOKEN_CANDIDATE_SIZE 16 + +typedef struct { + grn_id tid; + const unsigned char *token; + uint32_t token_size; + int32_t pos; + grn_token_cursor_status status; + int ef; + uint32_t estimated_size; + uint8_t adjacent[TOKEN_CANDIDATE_ADJACENT_MAX_SIZE]; /* Index of adjacent node from top */ + uint8_t n_adjacent; +} token_candidate_node; + +typedef struct { + uint32_t *candidates; /* Standing bits indicate index of token_candidate_node */ + int top; + int rear; + int size; +} token_candidate_queue; + +inline static void +token_candidate_adjacent_set(grn_ctx *ctx, grn_token_cursor *token_cursor, + token_candidate_node *top, token_candidate_node *curr) +{ + grn_bool exists_adjacent = GRN_FALSE; + token_candidate_node *adj; + for (adj = top; adj < curr; adj++) { + if (token_cursor->curr <= adj->token + 
adj->token_size) { + if (adj->n_adjacent < TOKEN_CANDIDATE_ADJACENT_MAX_SIZE) { + adj->adjacent[adj->n_adjacent] = curr - top; + adj->n_adjacent++; + exists_adjacent = GRN_TRUE; + } + } + } + if (!exists_adjacent) { + adj = curr - 1; + if (adj->n_adjacent < TOKEN_CANDIDATE_ADJACENT_MAX_SIZE) { + adj->adjacent[adj->n_adjacent] = curr - top; + adj->n_adjacent++; + } + } +} + +inline static grn_rc +token_candidate_init(grn_ctx *ctx, grn_ii *ii, grn_token_cursor *token_cursor, + grn_id tid, int ef, token_candidate_node **nodes, int *n_nodes, + uint32_t *max_estimated_size) +{ + grn_rc rc; + token_candidate_node *top, *curr; + int size = TOKEN_CANDIDATE_NODE_SIZE; + + *nodes = GRN_MALLOC(TOKEN_CANDIDATE_NODE_SIZE * sizeof(token_candidate_node)); + if (!*nodes) { + return GRN_NO_MEMORY_AVAILABLE; + } + top = *nodes; + curr = top; + +#define TOKEN_CANDIDATE_NODE_SET() { \ + curr->tid = tid; \ + curr->token = token_cursor->curr; \ + curr->token_size = token_cursor->curr_size; \ + curr->pos = token_cursor->pos; \ + curr->status = token_cursor->status; \ + curr->ef = ef; \ + curr->estimated_size = grn_ii_estimate_size(ctx, ii, tid); \ + curr->n_adjacent = 0; \ +} + TOKEN_CANDIDATE_NODE_SET(); + GRN_LOG(ctx, GRN_LOG_DEBUG, "[ii][overlap_token_skip] tid=%u pos=%d estimated_size=%u", + curr->tid, curr->pos, curr->estimated_size); + *max_estimated_size = curr->estimated_size; + curr++; + + while (token_cursor->status == GRN_TOKEN_CURSOR_DOING) { + if (curr - top >= size) { + if (!(*nodes = GRN_REALLOC(*nodes, + (curr - top + TOKEN_CANDIDATE_NODE_SIZE) * sizeof(token_candidate_node)))) { + return GRN_NO_MEMORY_AVAILABLE; + } + top = *nodes; + curr = top + size; + size += TOKEN_CANDIDATE_NODE_SIZE; + } + tid = grn_token_cursor_next(ctx, token_cursor); + if (token_cursor->status != GRN_TOKEN_CURSOR_DONE_SKIP) { + if (token_cursor->force_prefix) { ef |= EX_PREFIX; } + TOKEN_CANDIDATE_NODE_SET(); + token_candidate_adjacent_set(ctx, token_cursor, top, curr); + if (curr->estimated_size 
> *max_estimated_size) { + *max_estimated_size = curr->estimated_size; + } + curr++; + } + } + *n_nodes = curr - top; + rc = GRN_SUCCESS; + return rc; +#undef TOKEN_CANDIDATE_NODE_SET +} + +inline static grn_rc +token_candidate_queue_init(grn_ctx *ctx, token_candidate_queue *q) +{ + q->top = 0; + q->rear = 0; + q->size = TOKEN_CANDIDATE_QUEUE_SIZE; + + q->candidates = GRN_MALLOC(TOKEN_CANDIDATE_QUEUE_SIZE * sizeof(uint32_t)); + if (!q->candidates) { + q->size = 0; + return GRN_NO_MEMORY_AVAILABLE; + } + return GRN_SUCCESS; +} + +inline static grn_rc +token_candidate_enqueue(grn_ctx *ctx, token_candidate_queue *q, uint32_t candidate) +{ + if (q->rear >= q->size) { + if (!(q->candidates = + GRN_REALLOC(q->candidates, + (q->rear + TOKEN_CANDIDATE_QUEUE_SIZE) * sizeof(uint32_t)))) { + q->size = 0; + return GRN_NO_MEMORY_AVAILABLE; + } + q->size += TOKEN_CANDIDATE_QUEUE_SIZE; + } + *(q->candidates + q->rear) = candidate; + q->rear++; + return GRN_SUCCESS; +} + +inline static grn_rc +token_candidate_dequeue(grn_ctx *ctx, token_candidate_queue *q, uint32_t *candidate) +{ + if (q->top == q->rear) { + return GRN_END_OF_DATA; + } + *candidate = *(q->candidates + q->top); + q->top++; + return GRN_SUCCESS; +} + +inline static void +token_candidate_queue_fin(grn_ctx *ctx, token_candidate_queue *q) +{ + GRN_FREE(q->candidates); +} + +inline static token_candidate_node* +token_candidate_last_node(grn_ctx *ctx, token_candidate_node *nodes, uint32_t candidate, int offset) +{ + int i; + GRN_BIT_SCAN_REV(candidate, i); + return nodes + i + offset; +} + +inline static uint64_t +token_candidate_score(grn_ctx *ctx, token_candidate_node *nodes, uint32_t candidate, + int offset, uint32_t max_estimated_size) +{ + int i, last; + uint64_t score = 0; + GRN_BIT_SCAN_REV(candidate, last); + for (i = 0; i <= last; i++) { + if (candidate & (1 << i)) { + token_candidate_node *node = nodes + i + offset; + if (node->estimated_size > 0) { + score += max_estimated_size / node->estimated_size; + } + } 
+ } + return score; +} + +inline static grn_rc +token_candidate_select(grn_ctx *ctx, token_candidate_node *nodes, + int offset, int limit, int end, + uint32_t *selected_candidate, uint32_t max_estimated_size) +{ + grn_rc rc; + token_candidate_queue q; + uint32_t candidate; + uint64_t max_score = 0; + int i, min_n_nodes = 0; + + if (offset + limit > end) { + limit = end - offset; + } + rc = token_candidate_queue_init(ctx, &q); + if (rc != GRN_SUCCESS) { + return rc; + } + rc = token_candidate_enqueue(ctx, &q, 1); + if (rc != GRN_SUCCESS) { + goto exit; + } + while (token_candidate_dequeue(ctx, &q, &candidate) != GRN_END_OF_DATA) { + token_candidate_node *candidate_last_node = + token_candidate_last_node(ctx, nodes, candidate, offset); + for (i = 0; i < candidate_last_node->n_adjacent; i++) { + int adjacent, n_nodes = 0; + uint32_t new_candidate; + adjacent = candidate_last_node->adjacent[i] - offset; + if (adjacent > limit) { + break; + } + new_candidate = candidate | (1 << adjacent); + GET_NUM_BITS(new_candidate, n_nodes); + if (min_n_nodes > 0 && n_nodes > min_n_nodes + 1) { + goto exit; + } + rc = token_candidate_enqueue(ctx, &q, new_candidate); + if (rc != GRN_SUCCESS) { + goto exit; + } + if (adjacent == limit) { + if (min_n_nodes == 0) { + min_n_nodes = n_nodes; + } + if (n_nodes >= min_n_nodes && n_nodes <= min_n_nodes + 1) { + uint64_t score; + score = token_candidate_score(ctx, nodes, new_candidate, offset, max_estimated_size); + if (score > max_score) { + max_score = score; + *selected_candidate = new_candidate; + } + } + } + } + } + rc = GRN_SUCCESS; +exit : + token_candidate_queue_fin(ctx, &q); + return rc; +} + +inline static grn_rc +token_candidate_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, + token_info **tis, uint32_t *n, + token_candidate_node *nodes, uint32_t selected_candidate, + int offset, grn_id min) +{ + grn_rc rc = GRN_END_OF_DATA; + token_info *ti; + const char *key; + uint32_t size; + int i, last = 0; + 
GRN_BIT_SCAN_REV(selected_candidate, last); + for (i = 1; i <= last; i++) { + if (selected_candidate & (1 << i)) { + token_candidate_node *node = nodes + i + offset; + switch (node->status) { + case GRN_TOKEN_CURSOR_DOING : + key = _grn_table_key(ctx, lexicon, node->tid, &size); + ti = token_info_open(ctx, lexicon, ii, key, size, node->pos, + EX_NONE, NULL, min); + break; + case GRN_TOKEN_CURSOR_DONE : + if (node->tid) { + key = _grn_table_key(ctx, lexicon, node->tid, &size); + ti = token_info_open(ctx, lexicon, ii, key, size, node->pos, + node->ef & EX_PREFIX, NULL, min); + break; + } /* else fallthru */ + default : + ti = token_info_open(ctx, lexicon, ii, (char *)node->token, + node->token_size, node->pos, + node->ef & EX_PREFIX, NULL, min); + break; + } + if (!ti) { + goto exit; + } + tis[(*n)++] = ti; + GRN_LOG(ctx, GRN_LOG_DEBUG, "[ii][overlap_token_skip] tid=%u pos=%d estimated_size=%u", + node->tid, node->pos, node->estimated_size); + } + } + rc = GRN_SUCCESS; +exit : + return rc; +} + +inline static grn_rc +token_info_build_skipping_overlap(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, + token_info **tis, uint32_t *n, + grn_token_cursor *token_cursor, + grn_id tid, int ef, grn_id min) +{ + grn_rc rc; + token_candidate_node *nodes = NULL; + int n_nodes = 0, offset = 0, limit = TOKEN_CANDIDATE_SIZE - 1; + uint32_t max_estimated_size; + + rc = token_candidate_init(ctx, ii, token_cursor, tid, ef, &nodes, &n_nodes, &max_estimated_size); + if (rc != GRN_SUCCESS) { + return rc; + } + while (offset < n_nodes - 1) { + uint32_t selected_candidate = 0; + rc = token_candidate_select(ctx, nodes, offset, limit, n_nodes - 1, + &selected_candidate, max_estimated_size); + if (rc != GRN_SUCCESS) { + goto exit; + } + rc = token_candidate_build(ctx, lexicon, ii, tis, n, nodes, selected_candidate, offset, min); + if (rc != GRN_SUCCESS) { + goto exit; + } + offset += limit; + } + rc = GRN_SUCCESS; +exit : + if (nodes) { + GRN_FREE(nodes); + } + return rc; +} + +inline static 
grn_rc +token_info_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *string, unsigned int string_len, + token_info **tis, uint32_t *n, grn_bool *only_skip_token, grn_id min, + grn_operator mode) +{ + token_info *ti; + const char *key; + uint32_t size; + grn_rc rc = GRN_END_OF_DATA; + unsigned int token_flags = GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER; + grn_token_cursor *token_cursor = grn_token_cursor_open(ctx, lexicon, + string, string_len, + GRN_TOKEN_GET, + token_flags); + *only_skip_token = GRN_FALSE; + if (!token_cursor) { return GRN_NO_MEMORY_AVAILABLE; } + if (mode == GRN_OP_UNSPLIT) { + if ((ti = token_info_open(ctx, lexicon, ii, (char *)token_cursor->orig, + token_cursor->orig_blen, 0, EX_BOTH, NULL, min))) { + tis[(*n)++] = ti; + rc = GRN_SUCCESS; + } + } else { + grn_id tid; + int ef; + switch (mode) { + case GRN_OP_PREFIX : + ef = EX_PREFIX; + break; + case GRN_OP_SUFFIX : + ef = EX_SUFFIX; + break; + case GRN_OP_PARTIAL : + ef = EX_BOTH; + break; + default : + ef = EX_NONE; + break; + } + tid = grn_token_cursor_next(ctx, token_cursor); + if (token_cursor->force_prefix) { ef |= EX_PREFIX; } + switch (token_cursor->status) { + case GRN_TOKEN_CURSOR_DOING : + key = _grn_table_key(ctx, lexicon, tid, &size); + ti = token_info_open(ctx, lexicon, ii, key, size, token_cursor->pos, + ef & EX_SUFFIX, NULL, min); + break; + case GRN_TOKEN_CURSOR_DONE : + ti = token_info_open(ctx, lexicon, ii, (const char *)token_cursor->curr, + token_cursor->curr_size, 0, ef, NULL, min); + /* + key = _grn_table_key(ctx, lexicon, tid, &size); + ti = token_info_open(ctx, lexicon, ii, token_cursor->curr, token_cursor->curr_size, token_cursor->pos, ef, NULL, GRN_ID_NIL); + ti = token_info_open(ctx, lexicon, ii, (char *)token_cursor->orig, + token_cursor->orig_blen, token_cursor->pos, ef, NULL, GRN_ID_NIL); + */ + break; + case GRN_TOKEN_CURSOR_NOT_FOUND : + ti = token_info_open(ctx, lexicon, ii, (char *)token_cursor->orig, + token_cursor->orig_blen, 0, ef, NULL, min); 
+ break; + case GRN_TOKEN_CURSOR_DONE_SKIP : + *only_skip_token = GRN_TRUE; + goto exit; + default : + goto exit; + } + if (!ti) { goto exit ; } + tis[(*n)++] = ti; + + if (grn_ii_overlap_token_skip_enable) { + rc = token_info_build_skipping_overlap(ctx, lexicon, ii, tis, n, token_cursor, tid, ef, min); + goto exit; + } + + while (token_cursor->status == GRN_TOKEN_CURSOR_DOING) { + tid = grn_token_cursor_next(ctx, token_cursor); + if (token_cursor->force_prefix) { ef |= EX_PREFIX; } + switch (token_cursor->status) { + case GRN_TOKEN_CURSOR_DONE_SKIP : + continue; + case GRN_TOKEN_CURSOR_DOING : + key = _grn_table_key(ctx, lexicon, tid, &size); + ti = token_info_open(ctx, lexicon, ii, key, size, token_cursor->pos, + EX_NONE, NULL, min); + break; + case GRN_TOKEN_CURSOR_DONE : + if (tid) { + key = _grn_table_key(ctx, lexicon, tid, &size); + ti = token_info_open(ctx, lexicon, ii, key, size, token_cursor->pos, + ef & EX_PREFIX, NULL, min); + break; + } /* else fallthru */ + default : + ti = token_info_open(ctx, lexicon, ii, (char *)token_cursor->curr, + token_cursor->curr_size, token_cursor->pos, + ef & EX_PREFIX, NULL, min); + break; + } + if (!ti) { + goto exit; + } + tis[(*n)++] = ti; + } + rc = GRN_SUCCESS; + } +exit : + grn_token_cursor_close(ctx, token_cursor); + return rc; +} + +inline static grn_rc +token_info_build_fuzzy(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, + const char *string, unsigned int string_len, + token_info **tis, uint32_t *n, grn_bool *only_skip_token, + grn_id min, grn_operator mode, grn_fuzzy_search_optarg *args) +{ + token_info *ti; + grn_rc rc = GRN_END_OF_DATA; + unsigned int token_flags = GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER; + grn_token_cursor *token_cursor = grn_token_cursor_open(ctx, lexicon, + string, string_len, + GRN_TOKENIZE_ONLY, + token_flags); + *only_skip_token = GRN_FALSE; + if (!token_cursor) { return GRN_NO_MEMORY_AVAILABLE; } + grn_token_cursor_next(ctx, token_cursor); + switch (token_cursor->status) { + case 
GRN_TOKEN_CURSOR_DONE_SKIP : + *only_skip_token = GRN_TRUE; + goto exit; + case GRN_TOKEN_CURSOR_DOING : + case GRN_TOKEN_CURSOR_DONE : + ti = token_info_open(ctx, lexicon, ii, (const char *)token_cursor->curr, + token_cursor->curr_size, token_cursor->pos, EX_FUZZY, + args, min); + break; + default : + ti = NULL; + break; + } + if (!ti) { + goto exit ; + } + tis[(*n)++] = ti; + while (token_cursor->status == GRN_TOKEN_CURSOR_DOING) { + grn_token_cursor_next(ctx, token_cursor); + switch (token_cursor->status) { + case GRN_TOKEN_CURSOR_DONE_SKIP : + continue; + case GRN_TOKEN_CURSOR_DOING : + case GRN_TOKEN_CURSOR_DONE : + ti = token_info_open(ctx, lexicon, ii, (const char *)token_cursor->curr, + token_cursor->curr_size, token_cursor->pos, EX_FUZZY, + args, min); + break; + default : + break; + } + if (!ti) { + goto exit; + } + tis[(*n)++] = ti; + } + rc = GRN_SUCCESS; +exit : + grn_token_cursor_close(ctx, token_cursor); + return rc; +} + +static void +token_info_clear_offset(token_info **tis, uint32_t n) +{ + token_info **tie; + for (tie = tis + n; tis < tie; tis++) { (*tis)->offset = 0; } +} + +/* select */ + +inline static void +res_add(grn_ctx *ctx, grn_hash *s, grn_rset_posinfo *pi, double score, + grn_operator op) +{ + grn_rset_recinfo *ri; + switch (op) { + case GRN_OP_OR : + if (grn_hash_add(ctx, s, pi, s->key_size, (void **)&ri, NULL)) { + if (s->obj.header.flags & GRN_OBJ_WITH_SUBREC) { + grn_table_add_subrec((grn_obj *)s, ri, score, pi, 1); + } + } + break; + case GRN_OP_AND : + if (grn_hash_get(ctx, s, pi, s->key_size, (void **)&ri)) { + if (s->obj.header.flags & GRN_OBJ_WITH_SUBREC) { + ri->n_subrecs |= GRN_RSET_UTIL_BIT; + grn_table_add_subrec((grn_obj *)s, ri, score, pi, 1); + } + } + break; + case GRN_OP_AND_NOT : + { + grn_id id; + if ((id = grn_hash_get(ctx, s, pi, s->key_size, (void **)&ri))) { + grn_hash_delete_by_id(ctx, s, id, NULL); + } + } + break; + case GRN_OP_ADJUST : + if (grn_hash_get(ctx, s, pi, s->key_size, (void **)&ri)) { + if 
(s->obj.header.flags & GRN_OBJ_WITH_SUBREC) { + ri->score += score; + } + } + break; + default : + break; + } +} + +grn_rc +grn_ii_posting_add(grn_ctx *ctx, grn_posting *pos, grn_hash *s, grn_operator op) +{ + res_add(ctx, s, (grn_rset_posinfo *)(pos), (1 + pos->weight), op); + return ctx->rc; +} + +#ifdef USE_BHEAP + +/* todo */ + +#else /* USE_BHEAP */ + +struct _btr_node { + struct _btr_node *car; + struct _btr_node *cdr; + token_info *ti; +}; + +typedef struct _btr_node btr_node; + +typedef struct { + int n; + token_info *min; + token_info *max; + btr_node *root; + btr_node *nodes; +} btr; + +inline static void +bt_zap(btr *bt) +{ + bt->n = 0; + bt->min = NULL; + bt->max = NULL; + bt->root = NULL; +} + +inline static btr * +bt_open(grn_ctx *ctx, int size) +{ + btr *bt = GRN_MALLOC(sizeof(btr)); + if (bt) { + bt_zap(bt); + if (!(bt->nodes = GRN_MALLOC(sizeof(btr_node) * size))) { + GRN_FREE(bt); + bt = NULL; + } + } + return bt; +} + +inline static void +bt_close(grn_ctx *ctx, btr *bt) +{ + if (!bt) { return; } + GRN_FREE(bt->nodes); + GRN_FREE(bt); +} + +inline static void +bt_push(btr *bt, token_info *ti) +{ + int pos = ti->pos, minp = 1, maxp = 1; + btr_node *node, *new, **last; + new = bt->nodes + bt->n++; + new->ti = ti; + new->car = NULL; + new->cdr = NULL; + for (last = &bt->root; (node = *last);) { + if (pos < node->ti->pos) { + last = &node->car; + maxp = 0; + } else { + last = &node->cdr; + minp = 0; + } + } + *last = new; + if (minp) { bt->min = ti; } + if (maxp) { bt->max = ti; } +} + +inline static void +bt_pop(btr *bt) +{ + btr_node *node, *min, *newmin, **last; + for (last = &bt->root; (min = *last) && min->car; last = &min->car) ; + if (min) { + int pos = min->ti->pos, minp = 1, maxp = 1; + *last = min->cdr; + min->cdr = NULL; + for (last = &bt->root; (node = *last);) { + if (pos < node->ti->pos) { + last = &node->car; + maxp = 0; + } else { + last = &node->cdr; + minp = 0; + } + } + *last = min; + if (maxp) { bt->max = min->ti; } + if (!minp) { 
+ for (newmin = bt->root; newmin->car; newmin = newmin->car) ; + bt->min = newmin->ti; + } + } +} + +#endif /* USE_BHEAP */ + +typedef enum { + grn_wv_none = 0, + grn_wv_static, + grn_wv_dynamic, + grn_wv_constant +} grn_wv_mode; + +inline static double +get_weight(grn_ctx *ctx, grn_hash *s, grn_id rid, int sid, + grn_wv_mode wvm, grn_select_optarg *optarg) +{ + switch (wvm) { + case grn_wv_none : + return 1; + case grn_wv_static : + return sid <= optarg->vector_size ? optarg->weight_vector[sid - 1] : 0; + case grn_wv_dynamic : + /* todo : support hash with keys + if (s->keys) { + uint32_t key_size; + const char *key = _grn_table_key(ctx, s->keys, rid, &key_size); + // todo : change grn_select_optarg + return key ? optarg->func(s, key, key_size, sid, optarg->func_arg) : 0; + } + */ + /* todo : cast */ + return optarg->func(ctx, (void *)s, (void *)(intptr_t)rid, sid, + optarg->func_arg); + case grn_wv_constant : + return optarg->vector_size; + default : + return 1; + } +} + +grn_rc +grn_ii_similar_search(grn_ctx *ctx, grn_ii *ii, + const char *string, unsigned int string_len, + grn_hash *s, grn_operator op, grn_select_optarg *optarg) +{ + int *w1, limit; + grn_id tid, *tp, max_size; + grn_rc rc = GRN_SUCCESS; + grn_hash *h; + grn_token_cursor *token_cursor; + unsigned int token_flags = GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER; + grn_obj *lexicon = ii->lexicon; + if (!lexicon || !ii || !string || !string_len || !s || !optarg) { + return GRN_INVALID_ARGUMENT; + } + if (!(h = grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(int), 0))) { + return GRN_NO_MEMORY_AVAILABLE; + } + if (!(token_cursor = grn_token_cursor_open(ctx, lexicon, string, string_len, + GRN_TOKEN_GET, token_flags))) { + grn_hash_close(ctx, h); + return GRN_NO_MEMORY_AVAILABLE; + } + if (!(max_size = optarg->max_size)) { max_size = 1048576; } + while (token_cursor->status != GRN_TOKEN_CURSOR_DONE && + token_cursor->status != GRN_TOKEN_CURSOR_DONE_SKIP) { + if ((tid = grn_token_cursor_next(ctx, 
token_cursor))) { + if (grn_hash_add(ctx, h, &tid, sizeof(grn_id), (void **)&w1, NULL)) { + (*w1)++; + } + } + if (tid && token_cursor->curr_size) { + if (optarg->mode == GRN_OP_UNSPLIT) { + grn_table_search(ctx, lexicon, token_cursor->curr, + token_cursor->curr_size, + GRN_OP_PREFIX, (grn_obj *)h, GRN_OP_OR); + } + if (optarg->mode == GRN_OP_PARTIAL) { + grn_table_search(ctx, lexicon, token_cursor->curr, + token_cursor->curr_size, + GRN_OP_SUFFIX, (grn_obj *)h, GRN_OP_OR); + } + } + } + grn_token_cursor_close(ctx, token_cursor); + { + grn_hash_cursor *c = grn_hash_cursor_open(ctx, h, NULL, 0, NULL, 0, + 0, -1, 0); + if (!c) { + GRN_LOG(ctx, GRN_LOG_ALERT, + "grn_hash_cursor_open on grn_ii_similar_search failed !"); + grn_hash_close(ctx, h); + return GRN_NO_MEMORY_AVAILABLE; + } + while (grn_hash_cursor_next(ctx, c)) { + uint32_t es; + grn_hash_cursor_get_key_value(ctx, c, (void **) &tp, NULL, (void **) &w1); + if ((es = grn_ii_estimate_size(ctx, ii, *tp))) { + *w1 += max_size / es; + } else { + grn_hash_cursor_delete(ctx, c, NULL); + } + } + grn_hash_cursor_close(ctx, c); + } + limit = optarg->similarity_threshold + ? (optarg->similarity_threshold > GRN_HASH_SIZE(h) + ? 
GRN_HASH_SIZE(h) + : optarg->similarity_threshold) + : (GRN_HASH_SIZE(h) >> 3) + 1; + if (GRN_HASH_SIZE(h)) { + grn_id j, id; + int w2, rep; + grn_ii_cursor *c; + grn_posting *pos; + grn_wv_mode wvm = grn_wv_none; + grn_table_sort_optarg arg = { + GRN_TABLE_SORT_DESC|GRN_TABLE_SORT_BY_VALUE|GRN_TABLE_SORT_AS_NUMBER, + NULL, + NULL, + 0 + }; + grn_array *sorted = grn_array_create(ctx, NULL, sizeof(grn_id), 0); + if (!sorted) { + GRN_LOG(ctx, GRN_LOG_ALERT, + "grn_hash_sort on grn_ii_similar_search failed !"); + grn_hash_close(ctx, h); + return GRN_NO_MEMORY_AVAILABLE; + } + grn_hash_sort(ctx, h, limit, sorted, &arg); + /* todo support subrec + rep = (s->record_unit == grn_rec_position || s->subrec_unit == grn_rec_position); + */ + rep = 0; + if (optarg->func) { + wvm = grn_wv_dynamic; + } else if (optarg->vector_size) { + wvm = optarg->weight_vector ? grn_wv_static : grn_wv_constant; + } + for (j = 1; j <= limit; j++) { + grn_array_get_value(ctx, sorted, j, &id); + _grn_hash_get_key_value(ctx, h, id, (void **) &tp, (void **) &w1); + if (!*tp || !(c = grn_ii_cursor_open(ctx, ii, *tp, GRN_ID_NIL, GRN_ID_MAX, + rep + ? 
ii->n_elements + : ii->n_elements - 1, 0))) { + GRN_LOG(ctx, GRN_LOG_ERROR, "cursor open failed (%d)", *tp); + continue; + } + if (rep) { + while (grn_ii_cursor_next(ctx, c)) { + pos = c->post; + if ((w2 = get_weight(ctx, s, pos->rid, pos->sid, wvm, optarg)) > 0) { + while (grn_ii_cursor_next_pos(ctx, c)) { + res_add(ctx, s, (grn_rset_posinfo *) pos, + *w1 * w2 * (1 + pos->weight), op); + } + } + } + } else { + while (grn_ii_cursor_next(ctx, c)) { + pos = c->post; + if ((w2 = get_weight(ctx, s, pos->rid, pos->sid, wvm, optarg)) > 0) { + res_add(ctx, s, (grn_rset_posinfo *) pos, + *w1 * w2 * (pos->tf + pos->weight), op); + } + } + } + grn_ii_cursor_close(ctx, c); + } + grn_array_close(ctx, sorted); + } + grn_hash_close(ctx, h); + grn_ii_resolve_sel_and(ctx, s, op); + // grn_hash_cursor_clear(r); + return rc; +} + +#define TERM_EXTRACT_EACH_POST 0 +#define TERM_EXTRACT_EACH_TERM 1 + +grn_rc +grn_ii_term_extract(grn_ctx *ctx, grn_ii *ii, const char *string, + unsigned int string_len, grn_hash *s, + grn_operator op, grn_select_optarg *optarg) +{ + grn_rset_posinfo pi; + grn_id tid; + const char *p, *pe; + grn_obj *nstr; + const char *normalized; + unsigned int normalized_length_in_bytes; + grn_ii_cursor *c; + grn_posting *pos; + int skip, rep, policy; + grn_rc rc = GRN_SUCCESS; + grn_wv_mode wvm = grn_wv_none; + if (!ii || !string || !string_len || !s || !optarg) { + return GRN_INVALID_ARGUMENT; + } + if (!(nstr = grn_string_open(ctx, string, string_len, NULL, 0))) { + return GRN_INVALID_ARGUMENT; + } + policy = optarg->max_interval; + if (optarg->func) { + wvm = grn_wv_dynamic; + } else if (optarg->vector_size) { + wvm = optarg->weight_vector ? 
/*
 * One match reported by grn_ii_select_cursor_next(); the fields are filled
 * by that function's RETURN_POSTING macro.
 */
typedef struct {
  /* rid of the first token's current posting when the match was found. */
  grn_id rid;
  /* sid of the first token's current posting when the match was found. */
  uint32_t sid;
  /* Position where the matched span starts. */
  uint32_t start_pos;
  /*
   * Position where the matched span ends. grn_ii_select_regexp() requires
   * the next pattern's start_pos to be greater than this value.
   */
  uint32_t end_pos;
  /* The posting's tf for single-token queries, 1 for multi-token matches. */
  uint32_t tf;
  /* Accumulated token score (tscore) of the match. */
  uint32_t weight;
} grn_ii_select_cursor_posting;
string_len; + grn_bool done; + grn_ii_select_cursor_posting unshifted_posting; + grn_bool have_unshifted_posting; +} grn_ii_select_cursor; + +static grn_rc +grn_ii_select_cursor_close(grn_ctx *ctx, + grn_ii_select_cursor *cursor) +{ + token_info **tip; + + if (!cursor) { + return GRN_SUCCESS; + } + + for (tip = cursor->tis; tip < cursor->tis + cursor->n_tis; tip++) { + if (*tip) { + token_info_close(ctx, *tip); + } + } + if (cursor->tis) { + GRN_FREE(cursor->tis); + } + bt_close(ctx, cursor->bt); + GRN_FREE(cursor); + + return GRN_SUCCESS; +} + +static grn_ii_select_cursor * +grn_ii_select_cursor_open(grn_ctx *ctx, + grn_ii *ii, + const char *string, + unsigned int string_len, + grn_select_optarg *optarg) +{ + grn_operator mode = GRN_OP_EXACT; + grn_ii_select_cursor *cursor; + + if (string_len == 0) { + ERR(GRN_INVALID_ARGUMENT, + "[ii][select][cursor][open] empty string"); + return NULL; + } + + if (optarg) { + mode = optarg->mode; + } + switch (mode) { + case GRN_OP_EXACT : + case GRN_OP_FUZZY : + case GRN_OP_NEAR : + case GRN_OP_NEAR2 : + break; + default : + ERR(GRN_INVALID_ARGUMENT, + "[ii][select][cursor][open] " + "EXACT, FUZZY, NEAR and NEAR2 are only supported mode: %-.256s", + grn_operator_to_string(mode)); + break; + } + + cursor = GRN_CALLOC(sizeof(grn_ii_select_cursor)); + if (!cursor) { + ERR(ctx->rc, + "[ii][select][cursor][open] failed to allocate cursor: %-.256s", + ctx->errbuf); + return NULL; + } + + cursor->ii = ii; + cursor->mode = mode; + + if (!(cursor->tis = GRN_MALLOC(sizeof(token_info *) * string_len * 2))) { + ERR(ctx->rc, + "[ii][select][cursor][open] failed to allocate token info container: %-.256s", + ctx->errbuf); + GRN_FREE(cursor); + return NULL; + } + cursor->n_tis = 0; + if (cursor->mode == GRN_OP_FUZZY) { + grn_bool only_skip_token = GRN_FALSE; + grn_id previous_min = GRN_ID_NIL; + if (token_info_build_fuzzy(ctx, ii->lexicon, ii, string, string_len, + cursor->tis, &(cursor->n_tis), + &only_skip_token, previous_min, + 
cursor->mode, &(optarg->fuzzy)) != GRN_SUCCESS) { + grn_ii_select_cursor_close(ctx, cursor); + return NULL; + } + } else { + grn_bool only_skip_token = GRN_FALSE; + grn_id previous_min = GRN_ID_NIL; + if (token_info_build(ctx, ii->lexicon, ii, string, string_len, + cursor->tis, &(cursor->n_tis), + &only_skip_token, previous_min, + cursor->mode) != GRN_SUCCESS) { + grn_ii_select_cursor_close(ctx, cursor); + return NULL; + } + } + if (cursor->n_tis == 0) { + grn_ii_select_cursor_close(ctx, cursor); + return NULL; + } + + switch (cursor->mode) { + case GRN_OP_NEAR2 : + token_info_clear_offset(cursor->tis, cursor->n_tis); + cursor->mode = GRN_OP_NEAR; + /* fallthru */ + case GRN_OP_NEAR : + if (!(cursor->bt = bt_open(ctx, cursor->n_tis))) { + ERR(ctx->rc, + "[ii][select][cursor][open] failed to allocate btree: %-.256s", + ctx->errbuf); + grn_ii_select_cursor_close(ctx, cursor); + return NULL; + } + cursor->max_interval = optarg->max_interval; + break; + default : + break; + } + qsort(cursor->tis, cursor->n_tis, sizeof(token_info *), token_compare); + GRN_LOG(ctx, GRN_LOG_INFO, + "[ii][select][cursor][open] n=%d <%.*s>", + cursor->n_tis, + string_len, string); + + cursor->string = string; + cursor->string_len = string_len; + + cursor->done = GRN_FALSE; + + cursor->have_unshifted_posting = GRN_FALSE; + + return cursor; +} + +static grn_ii_select_cursor_posting * +grn_ii_select_cursor_next(grn_ctx *ctx, + grn_ii_select_cursor *cursor) +{ + btr *bt = cursor->bt; + token_info **tis = cursor->tis; + token_info **tie = tis + cursor->n_tis; + uint32_t n_tis = cursor->n_tis; + int max_interval = cursor->max_interval; + grn_operator mode = cursor->mode; + + if (cursor->have_unshifted_posting) { + cursor->have_unshifted_posting = GRN_FALSE; + return &(cursor->unshifted_posting); + } + + if (cursor->done) { + return NULL; + } + + for (;;) { + grn_id rid; + grn_id sid; + grn_id next_rid; + grn_id next_sid; + token_info **tip; + + rid = (*tis)->p->rid; + sid = (*tis)->p->sid; + for 
(tip = tis + 1, next_rid = rid, next_sid = sid + 1; + tip < tie; + tip++) { + token_info *ti = *tip; + if (token_info_skip(ctx, ti, rid, sid)) { return NULL; } + if (ti->p->rid != rid || ti->p->sid != sid) { + next_rid = ti->p->rid; + next_sid = ti->p->sid; + break; + } + } + + if (tip == tie) { + int start_pos = 0; + int pos = 0; + int end_pos = 0; + int score = 0; + int tf = 0; + int tscore = 0; + +#define SKIP_OR_BREAK(pos) {\ + if (token_info_skip_pos(ctx, ti, rid, sid, pos)) { break; } \ + if (ti->p->rid != rid || ti->p->sid != sid) { \ + next_rid = ti->p->rid; \ + next_sid = ti->p->sid; \ + break; \ + } \ +} + +#define RETURN_POSTING() do { \ + cursor->posting.rid = rid; \ + cursor->posting.sid = sid; \ + cursor->posting.start_pos = start_pos; \ + cursor->posting.end_pos = end_pos; \ + cursor->posting.tf = tf; \ + cursor->posting.weight = tscore; \ + if (token_info_skip_pos(ctx, *tis, rid, sid, pos) != GRN_SUCCESS) { \ + if (token_info_skip(ctx, *tis, next_rid, next_sid) != GRN_SUCCESS) { \ + cursor->done = GRN_TRUE; \ + } \ + } \ + return &(cursor->posting); \ +} while (GRN_FALSE) + + if (n_tis == 1) { + start_pos = pos = end_pos = (*tis)->p->pos; + pos++; + tf = (*tis)->p->tf; + tscore = (*tis)->p->weight + (*tis)->cursors->bins[0]->weight; + RETURN_POSTING(); + } else if (mode == GRN_OP_NEAR) { + bt_zap(bt); + for (tip = tis; tip < tie; tip++) { + token_info *ti = *tip; + SKIP_OR_BREAK(pos); + bt_push(bt, ti); + } + if (tip == tie) { + for (;;) { + token_info *ti; + int min; + int max; + + ti = bt->min; + min = ti->pos; + max = bt->max->pos; + if (min > max) { + char ii_name[GRN_TABLE_MAX_KEY_SIZE]; + int ii_name_size; + ii_name_size = grn_obj_name(ctx, + (grn_obj *)(cursor->ii), + ii_name, + GRN_TABLE_MAX_KEY_SIZE); + ERR(GRN_FILE_CORRUPT, + "[ii][select][cursor][near] " + "max position must be larger than min position: " + "min:<%d> max:<%d> ii:<%.*s> string:<%.*s>", + min, max, + ii_name_size, ii_name, + cursor->string_len, + cursor->string); + return 
NULL; + } + if ((max_interval < 0) || (max - min <= max_interval)) { + /* TODO: Set start_pos, pos, end_pos, tf and tscore */ + RETURN_POSTING(); + if (ti->pos == max + 1) { + break; + } + SKIP_OR_BREAK(max + 1); + } else { + if (ti->pos == max - max_interval) { + break; + } + SKIP_OR_BREAK(max - max_interval); + } + bt_pop(bt); + } + } + } else { + int count = 0; + for (tip = tis; ; tip++) { + token_info *ti; + + if (tip == tie) { tip = tis; } + ti = *tip; + SKIP_OR_BREAK(pos); + if (ti->pos == pos) { + score += ti->p->weight + ti->cursors->bins[0]->weight; + count++; + if (ti->p->pos > end_pos) { + end_pos = ti->p->pos; + } + } else { + score = ti->p->weight + ti->cursors->bins[0]->weight; + count = 1; + start_pos = pos = ti->pos; + end_pos = ti->p->pos; + } + if (count == n_tis) { + pos++; + if (ti->p->pos > end_pos) { + end_pos = ti->p->pos; + } + tf = 1; + tscore += score; + RETURN_POSTING(); + } + } + } +#undef SKIP_OR_BREAK + } + if (token_info_skip(ctx, *tis, next_rid, next_sid)) { + return NULL; + } + } +} + +static void +grn_ii_select_cursor_unshift(grn_ctx *ctx, + grn_ii_select_cursor *cursor, + grn_ii_select_cursor_posting *posting) +{ + cursor->unshifted_posting = *posting; + cursor->have_unshifted_posting = GRN_TRUE; +} + +static grn_rc +grn_ii_parse_regexp_query(grn_ctx *ctx, + const char *log_tag, + const char *string, unsigned int string_len, + grn_obj *parsed_strings) +{ + grn_bool escaping = GRN_FALSE; + int nth_char = 0; + const char *current = string; + const char *string_end = string + string_len; + grn_obj buffer; + + GRN_TEXT_INIT(&buffer, 0); + while (current < string_end) { + const char *target; + int char_len; + + char_len = grn_charlen(ctx, current, string_end); + if (char_len == 0) { + GRN_OBJ_FIN(ctx, &buffer); + ERR(GRN_INVALID_ARGUMENT, + "%-.256s invalid encoding character: <%.*s|%#x|>", + log_tag, + (int)(current - string), string, + *current); + return ctx->rc; + } + target = current; + current += char_len; + + if (escaping) { + 
escaping = GRN_FALSE; + if (char_len == 1) { + switch (*target) { + case 'A' : + if (nth_char == 0) { + target = GRN_TOKENIZER_BEGIN_MARK_UTF8; + char_len = GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN; + } + break; + case 'z' : + if (current == string_end) { + target = GRN_TOKENIZER_END_MARK_UTF8; + char_len = GRN_TOKENIZER_END_MARK_UTF8_LEN; + } + break; + default : + break; + } + } + } else { + if (char_len == 1) { + if (*target == '\\') { + escaping = GRN_TRUE; + continue; + } else if (*target == '.' && + grn_charlen(ctx, current, string_end) == 1 && + *current == '*') { + if (GRN_TEXT_LEN(&buffer) > 0) { + grn_vector_add_element(ctx, + parsed_strings, + GRN_TEXT_VALUE(&buffer), + GRN_TEXT_LEN(&buffer), + 0, + GRN_DB_TEXT); + GRN_BULK_REWIND(&buffer); + } + current++; + nth_char++; + continue; + } + } + } + + GRN_TEXT_PUT(ctx, &buffer, target, char_len); + nth_char++; + } + if (GRN_TEXT_LEN(&buffer) > 0) { + grn_vector_add_element(ctx, + parsed_strings, + GRN_TEXT_VALUE(&buffer), + GRN_TEXT_LEN(&buffer), + 0, + GRN_DB_TEXT); + } + GRN_OBJ_FIN(ctx, &buffer); + + return GRN_SUCCESS; +} + +static grn_rc +grn_ii_select_regexp(grn_ctx *ctx, grn_ii *ii, + const char *string, unsigned int string_len, + grn_hash *s, grn_operator op, grn_select_optarg *optarg) +{ + grn_rc rc; + grn_obj parsed_strings; + unsigned int n_parsed_strings; + + GRN_TEXT_INIT(&parsed_strings, GRN_OBJ_VECTOR); + rc = grn_ii_parse_regexp_query(ctx, "[ii][select][regexp]", + string, string_len, &parsed_strings); + if (rc != GRN_SUCCESS) { + GRN_OBJ_FIN(ctx, &parsed_strings); + return rc; + } + + if (optarg) { + optarg->mode = GRN_OP_EXACT; + } + + n_parsed_strings = grn_vector_size(ctx, &parsed_strings); + if (n_parsed_strings == 1) { + const char *parsed_string; + unsigned int parsed_string_len; + parsed_string_len = grn_vector_get_element(ctx, + &parsed_strings, + 0, + &parsed_string, + NULL, + NULL); + rc = grn_ii_select(ctx, ii, + parsed_string, + parsed_string_len, + s, op, optarg); + } else { + int i; 
+ grn_ii_select_cursor **cursors; + grn_bool have_error = GRN_FALSE; + + cursors = GRN_CALLOC(sizeof(grn_ii_select_cursor *) * n_parsed_strings); + for (i = 0; i < n_parsed_strings; i++) { + const char *parsed_string; + unsigned int parsed_string_len; + parsed_string_len = grn_vector_get_element(ctx, + &parsed_strings, + i, + &parsed_string, + NULL, + NULL); + cursors[i] = grn_ii_select_cursor_open(ctx, + ii, + parsed_string, + parsed_string_len, + optarg); + if (!cursors[i]) { + have_error = GRN_TRUE; + break; + } + } + + while (!have_error) { + grn_ii_select_cursor_posting *posting; + uint32_t pos; + + posting = grn_ii_select_cursor_next(ctx, cursors[0]); + if (!posting) { + break; + } + + pos = posting->end_pos; + for (i = 1; i < n_parsed_strings; i++) { + grn_ii_select_cursor_posting *posting_i; + + for (;;) { + posting_i = grn_ii_select_cursor_next(ctx, cursors[i]); + if (!posting_i) { + break; + } + + if (posting_i->rid == posting->rid && + posting_i->sid == posting->sid && + posting_i->start_pos > pos) { + grn_ii_select_cursor_unshift(ctx, cursors[i], posting_i); + break; + } + if (posting_i->rid > posting->rid) { + grn_ii_select_cursor_unshift(ctx, cursors[i], posting_i); + break; + } + } + + if (!posting_i) { + break; + } + + if (posting_i->rid != posting->rid || posting_i->sid != posting->sid) { + break; + } + + pos = posting_i->end_pos; + } + + if (i == n_parsed_strings) { + grn_rset_posinfo pi = {posting->rid, posting->sid, pos}; + double record_score = 1.0; + res_add(ctx, s, &pi, record_score, op); + } + } + + for (i = 0; i < n_parsed_strings; i++) { + if (cursors[i]) { + grn_ii_select_cursor_close(ctx, cursors[i]); + } + } + GRN_FREE(cursors); + } + GRN_OBJ_FIN(ctx, &parsed_strings); + + if (optarg) { + optarg->mode = GRN_OP_REGEXP; + } + + return rc; +} + +#ifdef GRN_II_SELECT_ENABLE_SEQUENTIAL_SEARCH +static grn_bool +grn_ii_select_sequential_search_should_use(grn_ctx *ctx, + grn_ii *ii, + const char *raw_query, + unsigned int raw_query_len, + 
grn_hash *result, + grn_operator op, + grn_wv_mode wvm, + grn_select_optarg *optarg, + token_info **token_infos, + uint32_t n_token_infos, + double too_many_index_match_ratio) +{ + int n_sources; + + if (too_many_index_match_ratio < 0.0) { + return GRN_FALSE; + } + + if (op != GRN_OP_AND) { + return GRN_FALSE; + } + + if (optarg->mode != GRN_OP_EXACT) { + return GRN_FALSE; + } + + n_sources = ii->obj.source_size / sizeof(grn_id); + if (n_sources == 0) { + return GRN_FALSE; + } + + { + uint32_t i; + int n_existing_records; + + n_existing_records = GRN_HASH_SIZE(result); + for (i = 0; i < n_token_infos; i++) { + token_info *info = token_infos[i]; + if (n_existing_records <= (info->size * too_many_index_match_ratio)) { + return GRN_TRUE; + } + } + return GRN_FALSE; + } +} + +static void +grn_ii_select_sequential_search_body(grn_ctx *ctx, + grn_ii *ii, + grn_obj *normalizer, + grn_encoding encoding, + OnigRegex regex, + grn_hash *result, + grn_operator op, + grn_wv_mode wvm, + grn_select_optarg *optarg) +{ + int i, n_sources; + grn_id *source_ids = ii->obj.source; + grn_obj buffer; + + GRN_TEXT_INIT(&buffer, 0); + n_sources = ii->obj.source_size / sizeof(grn_id); + for (i = 0; i < n_sources; i++) { + grn_id source_id = source_ids[i]; + grn_obj *source; + grn_obj *accessor; + + source = grn_ctx_at(ctx, source_id); + switch (source->header.type) { + case GRN_TABLE_HASH_KEY : + case GRN_TABLE_PAT_KEY : + case GRN_TABLE_DAT_KEY : + accessor = grn_obj_column(ctx, + (grn_obj *)result, + GRN_COLUMN_NAME_KEY, + GRN_COLUMN_NAME_KEY_LEN); + break; + default : + { + char column_name[GRN_TABLE_MAX_KEY_SIZE]; + int column_name_size; + column_name_size = grn_column_name(ctx, source, + column_name, + GRN_TABLE_MAX_KEY_SIZE); + accessor = grn_obj_column(ctx, (grn_obj *)result, column_name, + column_name_size); + } + break; + } + + { + grn_hash_cursor *cursor; + grn_id id; + cursor = grn_hash_cursor_open(ctx, result, NULL, 0, NULL, 0, 0, -1, 0); + while ((id = grn_hash_cursor_next(ctx, 
cursor)) != GRN_ID_NIL) { + OnigPosition position; + grn_obj *value; + const char *normalized_value; + unsigned int normalized_value_length; + + GRN_BULK_REWIND(&buffer); + grn_obj_get_value(ctx, accessor, id, &buffer); + value = grn_string_open_(ctx, + GRN_TEXT_VALUE(&buffer), + GRN_TEXT_LEN(&buffer), + normalizer, 0, encoding); + grn_string_get_normalized(ctx, value, + &normalized_value, &normalized_value_length, + NULL); + position = onig_search(regex, + normalized_value, + normalized_value + normalized_value_length, + normalized_value, + normalized_value + normalized_value_length, + NULL, + 0); + if (position != ONIG_MISMATCH) { + grn_id *record_id; + grn_rset_posinfo info; + double score; + + grn_hash_cursor_get_key(ctx, cursor, (void **)&record_id); + + info.rid = *record_id; + info.sid = i + 1; + info.pos = 0; + score = get_weight(ctx, result, info.rid, info.sid, wvm, optarg); + res_add(ctx, result, &info, score, op); + } + grn_obj_unlink(ctx, value); + } + grn_hash_cursor_close(ctx, cursor); + } + grn_obj_unlink(ctx, accessor); + } + grn_obj_unlink(ctx, &buffer); +} + +static grn_bool +grn_ii_select_sequential_search(grn_ctx *ctx, + grn_ii *ii, + const char *raw_query, + unsigned int raw_query_len, + grn_hash *result, + grn_operator op, + grn_wv_mode wvm, + grn_select_optarg *optarg, + token_info **token_infos, + uint32_t n_token_infos) +{ + grn_bool processed = GRN_TRUE; + + { + if (!grn_ii_select_sequential_search_should_use(ctx, + ii, + raw_query, + raw_query_len, + result, + op, + wvm, + optarg, + token_infos, + n_token_infos, + grn_ii_select_too_many_index_match_ratio)) { + return GRN_FALSE; + } + } + + { + grn_encoding encoding; + grn_obj *normalizer; + int nflags = 0; + grn_obj *query; + const char *normalized_query; + unsigned int normalized_query_length; + + grn_table_get_info(ctx, ii->lexicon, + NULL, &encoding, NULL, &normalizer, NULL); + query = grn_string_open_(ctx, raw_query, raw_query_len, + normalizer, nflags, encoding); + 
grn_string_get_normalized(ctx, query, + &normalized_query, &normalized_query_length, + NULL); + { + OnigRegex regex; + int onig_result; + OnigErrorInfo error_info; + onig_result = onig_new(®ex, + normalized_query, + normalized_query + normalized_query_length, + ONIG_OPTION_NONE, + ONIG_ENCODING_UTF8, + ONIG_SYNTAX_ASIS, + &error_info); + if (onig_result == ONIG_NORMAL) { + grn_ii_select_sequential_search_body(ctx, ii, normalizer, encoding, + regex, result, op, wvm, optarg); + onig_free(regex); + } else { + char message[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str(message, onig_result, error_info); + GRN_LOG(ctx, GRN_LOG_WARNING, + "[ii][select][sequential] " + "failed to create regular expression object: %-.256s", + message); + processed = GRN_FALSE; + } + } + grn_obj_unlink(ctx, query); + } + + return processed; +} +#endif + +grn_rc +grn_ii_select(grn_ctx *ctx, grn_ii *ii, + const char *string, unsigned int string_len, + grn_hash *s, grn_operator op, grn_select_optarg *optarg) +{ + btr *bt = NULL; + grn_rc rc = GRN_SUCCESS; + int rep, orp, weight, max_interval = 0; + token_info *ti, **tis = NULL, **tip, **tie; + uint32_t n = 0, rid, sid, nrid, nsid; + grn_bool only_skip_token = GRN_FALSE; + grn_operator mode = GRN_OP_EXACT; + grn_wv_mode wvm = grn_wv_none; + grn_obj *lexicon = ii->lexicon; + grn_scorer_score_func *score_func = NULL; + grn_scorer_matched_record record; + grn_id previous_min = GRN_ID_NIL; + grn_id current_min = GRN_ID_NIL; + grn_bool set_min_enable_for_and_query = GRN_FALSE; + + if (!lexicon || !ii || !s) { return GRN_INVALID_ARGUMENT; } + if (optarg) { + mode = optarg->mode; + if (optarg->func) { + wvm = grn_wv_dynamic; + } else if (optarg->vector_size) { + wvm = optarg->weight_vector ? 
grn_wv_static : grn_wv_constant; + } + if (optarg->match_info) { + if (optarg->match_info->flags & GRN_MATCH_INFO_GET_MIN_RECORD_ID) { + previous_min = optarg->match_info->min; + set_min_enable_for_and_query = GRN_TRUE; + } + } + } + if (mode == GRN_OP_SIMILAR) { + return grn_ii_similar_search(ctx, ii, string, string_len, s, op, optarg); + } + if (mode == GRN_OP_TERM_EXTRACT) { + return grn_ii_term_extract(ctx, ii, string, string_len, s, op, optarg); + } + if (mode == GRN_OP_REGEXP) { + return grn_ii_select_regexp(ctx, ii, string, string_len, s, op, optarg); + } + /* todo : support subrec + rep = (s->record_unit == grn_rec_position || s->subrec_unit == grn_rec_position); + orp = (s->record_unit == grn_rec_position || op == GRN_OP_OR); + */ + rep = 0; + orp = op == GRN_OP_OR; + if (!string_len) { goto exit; } + if (!(tis = GRN_MALLOC(sizeof(token_info *) * string_len * 2))) { + return GRN_NO_MEMORY_AVAILABLE; + } + if (mode == GRN_OP_FUZZY) { + if (token_info_build_fuzzy(ctx, lexicon, ii, string, string_len, + tis, &n, &only_skip_token, previous_min, + mode, &(optarg->fuzzy)) || + !n) { + goto exit; + } + } else { + if (token_info_build(ctx, lexicon, ii, string, string_len, + tis, &n, &only_skip_token, previous_min, mode) || + !n) { + goto exit; + } + } + switch (mode) { + case GRN_OP_NEAR2 : + token_info_clear_offset(tis, n); + mode = GRN_OP_NEAR; + /* fallthru */ + case GRN_OP_NEAR : + if (!(bt = bt_open(ctx, n))) { rc = GRN_NO_MEMORY_AVAILABLE; goto exit; } + max_interval = optarg->max_interval; + break; + default : + break; + } + qsort(tis, n, sizeof(token_info *), token_compare); + tie = tis + n; + /* + for (tip = tis; tip < tie; tip++) { + ti = *tip; + grn_log("o=%d n=%d s=%d r=%d", ti->offset, ti->ntoken, ti->size, ti->rid); + } + */ + GRN_LOG(ctx, GRN_LOG_INFO, "n=%d (%.*s)", n, string_len, string); + /* todo : array as result + if (n == 1 && (*tis)->cursors->n_entries == 1 && op == GRN_OP_OR + && !GRN_HASH_SIZE(s) && !s->garbages + && s->record_unit == 
grn_rec_document && !s->max_n_subrecs + && grn_ii_max_section(ii) == 1) { + grn_ii_cursor *c = (*tis)->cursors->bins[0]; + if ((rc = grn_hash_array_init(s, (*tis)->size + 32768))) { goto exit; } + do { + grn_rset_recinfo *ri; + grn_posting *p = c->post; + if ((weight = get_weight(ctx, s, p->rid, p->sid, wvm, optarg))) { + GRN_HASH_INT_ADD(s, p, ri); + ri->score = (p->tf + p->score) * weight; + ri->n_subrecs = 1; + } + } while (grn_ii_cursor_next(ctx, c)); + goto exit; + } + */ +#ifdef GRN_II_SELECT_ENABLE_SEQUENTIAL_SEARCH + if (grn_ii_select_sequential_search(ctx, ii, string, string_len, + s, op, wvm, optarg, tis, n)) { + goto exit; + } +#endif + + if (optarg && optarg->scorer) { + grn_proc *scorer = (grn_proc *)(optarg->scorer); + score_func = scorer->callbacks.scorer.score; + record.table = grn_ctx_at(ctx, s->obj.header.domain); + record.lexicon = lexicon; + record.id = GRN_ID_NIL; + GRN_RECORD_INIT(&(record.terms), GRN_OBJ_VECTOR, lexicon->header.domain); + GRN_UINT32_INIT(&(record.term_weights), GRN_OBJ_VECTOR); + record.total_term_weights = 0; + record.n_documents = grn_table_size(ctx, record.table); + record.n_occurrences = 0; + record.n_candidates = 0; + record.n_tokens = 0; + record.weight = 0; + record.args_expr = optarg->scorer_args_expr; + record.args_expr_offset = optarg->scorer_args_expr_offset; + } + + for (;;) { + rid = (*tis)->p->rid; + sid = (*tis)->p->sid; + for (tip = tis + 1, nrid = rid, nsid = sid + 1; tip < tie; tip++) { + ti = *tip; + if (token_info_skip(ctx, ti, rid, sid)) { goto exit; } + if (ti->p->rid != rid || ti->p->sid != sid) { + nrid = ti->p->rid; + nsid = ti->p->sid; + break; + } + } + weight = get_weight(ctx, s, rid, sid, wvm, optarg); + if (tip == tie && weight != 0) { + grn_rset_posinfo pi = {rid, sid, 0}; + if (orp || grn_hash_get(ctx, s, &pi, s->key_size, NULL)) { + int count = 0, noccur = 0, pos = 0, score = 0, tscore = 0, min, max; + + if (score_func) { + GRN_BULK_REWIND(&(record.terms)); + 
GRN_BULK_REWIND(&(record.term_weights)); + record.n_candidates = 0; + record.n_tokens = 0; + } + +#define SKIP_OR_BREAK(pos) {\ + if (token_info_skip_pos(ctx, ti, rid, sid, pos)) { break; } \ + if (ti->p->rid != rid || ti->p->sid != sid) { \ + nrid = ti->p->rid; \ + nsid = ti->p->sid; \ + break; \ + } \ +} + if (n == 1 && !rep) { + noccur = (*tis)->p->tf; + tscore = (*tis)->p->weight + (*tis)->cursors->bins[0]->weight; + if (score_func) { + GRN_RECORD_PUT(ctx, &(record.terms), (*tis)->cursors->bins[0]->id); + GRN_UINT32_PUT(ctx, &(record.term_weights), tscore); + record.n_occurrences = noccur; + record.n_candidates = (*tis)->size; + record.n_tokens = (*tis)->ntoken; + } + } else if (mode == GRN_OP_NEAR) { + bt_zap(bt); + for (tip = tis; tip < tie; tip++) { + ti = *tip; + SKIP_OR_BREAK(pos); + bt_push(bt, ti); + } + if (tip == tie) { + for (;;) { + ti = bt->min; min = ti->pos; max = bt->max->pos; + if (min > max) { + char ii_name[GRN_TABLE_MAX_KEY_SIZE]; + int ii_name_size; + ii_name_size = grn_obj_name(ctx, (grn_obj *)ii, ii_name, + GRN_TABLE_MAX_KEY_SIZE); + ERR(GRN_FILE_CORRUPT, + "[ii][select][near] " + "max position must be larger than min position: " + "min:<%d> max:<%d> ii:<%.*s> string:<%.*s>", + min, max, + ii_name_size, ii_name, + string_len, string); + rc = ctx->rc; + goto exit; + } + if ((max_interval < 0) || (max - min <= max_interval)) { + if (rep) { pi.pos = min; res_add(ctx, s, &pi, weight, op); } + noccur++; + if (ti->pos == max + 1) { + break; + } + SKIP_OR_BREAK(max + 1); + } else { + if (ti->pos == max - max_interval) { + break; + } + SKIP_OR_BREAK(max - max_interval); + } + bt_pop(bt); + } + } + } else { + for (tip = tis; ; tip++) { + if (tip == tie) { tip = tis; } + ti = *tip; + SKIP_OR_BREAK(pos); + if (ti->pos == pos) { + score += ti->p->weight + ti->cursors->bins[0]->weight; count++; + } else { + score = ti->p->weight + ti->cursors->bins[0]->weight; count = 1; + pos = ti->pos; + if (noccur == 0 && score_func) { + 
GRN_BULK_REWIND(&(record.terms)); + GRN_BULK_REWIND(&(record.term_weights)); + record.n_candidates = 0; + record.n_tokens = 0; + } + } + if (noccur == 0 && score_func) { + GRN_RECORD_PUT(ctx, &(record.terms), ti->cursors->bins[0]->id); + GRN_UINT32_PUT(ctx, &(record.term_weights), + ti->p->weight + ti->cursors->bins[0]->weight); + record.n_candidates += ti->size; + record.n_tokens += ti->ntoken; + } + if (count == n) { + if (rep) { + pi.pos = pos; res_add(ctx, s, &pi, (score + 1) * weight, op); + } + tscore += score; + score = 0; count = 0; pos++; + noccur++; + } + } + } + if (noccur && !rep) { + double record_score; + if (score_func) { + record.id = rid; + record.weight = weight; + record.n_occurrences = noccur; + record.total_term_weights = tscore; + record_score = score_func(ctx, &record) * weight; + } else { + record_score = (noccur + tscore) * weight; + } + if (set_min_enable_for_and_query) { + if (current_min == GRN_ID_NIL) { + current_min = rid; + } + } + res_add(ctx, s, &pi, record_score, op); + } +#undef SKIP_OR_BREAK + } + } + if (token_info_skip(ctx, *tis, nrid, nsid)) { goto exit; } + } +exit : + if (score_func) { + GRN_OBJ_FIN(ctx, &(record.terms)); + GRN_OBJ_FIN(ctx, &(record.term_weights)); + } + + if (set_min_enable_for_and_query) { + if (current_min > previous_min) { + optarg->match_info->min = current_min; + } + } + + for (tip = tis; tip < tis + n; tip++) { + if (*tip) { token_info_close(ctx, *tip); } + } + if (tis) { GRN_FREE(tis); } + if (!only_skip_token) { + grn_ii_resolve_sel_and(ctx, s, op); + } + // grn_hash_cursor_clear(r); + bt_close(ctx, bt); +#ifdef DEBUG + { + uint32_t segno = GRN_II_MAX_LSEG, nnref = 0; + grn_io_mapinfo *info = ii->seg->maps; + for (; segno; segno--, info++) { if (info->nref) { nnref++; } } + GRN_LOG(ctx, GRN_LOG_INFO, "nnref=%d", nnref); + } +#endif /* DEBUG */ + return rc; +} + +static uint32_t +grn_ii_estimate_size_for_query_regexp(grn_ctx *ctx, grn_ii *ii, + const char *query, unsigned int query_len, + 
grn_search_optarg *optarg) +{ + grn_rc rc; + grn_obj parsed_query; + uint32_t size; + + GRN_TEXT_INIT(&parsed_query, 0); + rc = grn_ii_parse_regexp_query(ctx, "[ii][estimate-size][query][regexp]", + query, query_len, &parsed_query); + if (rc != GRN_SUCCESS) { + GRN_OBJ_FIN(ctx, &parsed_query); + return 0; + } + + if (optarg) { + optarg->mode = GRN_OP_EXACT; + } + + size = grn_ii_estimate_size_for_query(ctx, ii, + GRN_TEXT_VALUE(&parsed_query), + GRN_TEXT_LEN(&parsed_query), + optarg); + GRN_OBJ_FIN(ctx, &parsed_query); + + if (optarg) { + optarg->mode = GRN_OP_REGEXP; + } + + return size; +} + +uint32_t +grn_ii_estimate_size_for_query(grn_ctx *ctx, grn_ii *ii, + const char *query, unsigned int query_len, + grn_search_optarg *optarg) +{ + grn_rc rc; + grn_obj *lexicon = ii->lexicon; + token_info **tis = NULL; + uint32_t i; + uint32_t n_tis = 0; + grn_bool only_skip_token = GRN_FALSE; + grn_operator mode = GRN_OP_EXACT; + double estimated_size = 0; + double normalized_ratio = 1.0; + grn_id min = GRN_ID_NIL; + + if (query_len == 0) { + return 0; + } + + if (optarg) { + switch (optarg->mode) { + case GRN_OP_NEAR : + case GRN_OP_NEAR2 : + mode = optarg->mode; + break; + case GRN_OP_SIMILAR : + mode = optarg->mode; + break; + case GRN_OP_REGEXP : + mode = optarg->mode; + break; + case GRN_OP_FUZZY : + mode = optarg->mode; + default : + break; + } + if (optarg->match_info.flags & GRN_MATCH_INFO_GET_MIN_RECORD_ID) { + min = optarg->match_info.min; + } + } + + if (mode == GRN_OP_REGEXP) { + return grn_ii_estimate_size_for_query_regexp(ctx, ii, query, query_len, + optarg); + } + + tis = GRN_MALLOC(sizeof(token_info *) * query_len * 2); + if (!tis) { + return 0; + } + + switch (mode) { + case GRN_OP_FUZZY : + rc = token_info_build_fuzzy(ctx, lexicon, ii, query, query_len, + tis, &n_tis, &only_skip_token, min, + mode, &(optarg->fuzzy)); + break; + default : + rc = token_info_build(ctx, lexicon, ii, query, query_len, + tis, &n_tis, &only_skip_token, min, mode); + break; + } + + 
if (rc != GRN_SUCCESS) { + goto exit; + } + + for (i = 0; i < n_tis; i++) { + token_info *ti = tis[i]; + double term_estimated_size; + term_estimated_size = ((double)ti->size / ti->ntoken); + if (i == 0) { + estimated_size = term_estimated_size; + } else { + if (term_estimated_size < estimated_size) { + estimated_size = term_estimated_size; + } + normalized_ratio *= grn_ii_estimate_size_for_query_reduce_ratio; + } + } + + estimated_size *= normalized_ratio; + if (estimated_size > 0.0 && estimated_size < 1.0) { + estimated_size = 1.0; + } + +exit : + for (i = 0; i < n_tis; i++) { + token_info *ti = tis[i]; + if (ti) { + token_info_close(ctx, ti); + } + } + if (tis) { + GRN_FREE(tis); + } + + return estimated_size; +} + +uint32_t +grn_ii_estimate_size_for_lexicon_cursor(grn_ctx *ctx, grn_ii *ii, + grn_table_cursor *lexicon_cursor) +{ + grn_id term_id; + uint32_t estimated_size = 0; + + while ((term_id = grn_table_cursor_next(ctx, lexicon_cursor)) != GRN_ID_NIL) { + uint32_t term_estimated_size; + term_estimated_size = grn_ii_estimate_size(ctx, ii, term_id); + estimated_size += term_estimated_size; + } + + return estimated_size; +} + +grn_rc +grn_ii_sel(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_len, + grn_hash *s, grn_operator op, grn_search_optarg *optarg) +{ + ERRCLR(ctx); + GRN_LOG(ctx, GRN_LOG_INFO, "grn_ii_sel > (%.*s)", string_len, string); + { + grn_select_optarg arg; + if (!s) { return GRN_INVALID_ARGUMENT; } + memset(&arg, 0, sizeof(grn_select_optarg)); + arg.mode = GRN_OP_EXACT; + if (optarg) { + switch (optarg->mode) { + case GRN_OP_NEAR : + case GRN_OP_NEAR2 : + arg.mode = optarg->mode; + arg.max_interval = optarg->max_interval; + break; + case GRN_OP_SIMILAR : + arg.mode = optarg->mode; + arg.similarity_threshold = optarg->similarity_threshold; + break; + case GRN_OP_REGEXP : + arg.mode = optarg->mode; + break; + case GRN_OP_FUZZY : + arg.mode = optarg->mode; + arg.fuzzy = optarg->fuzzy; + break; + default : + break; + } + if 
(optarg->vector_size != 0) { + arg.weight_vector = optarg->weight_vector; + arg.vector_size = optarg->vector_size; + } + arg.scorer = optarg->scorer; + arg.scorer_args_expr = optarg->scorer_args_expr; + arg.scorer_args_expr_offset = optarg->scorer_args_expr_offset; + arg.match_info = &(optarg->match_info); + } + /* todo : support subrec + grn_rset_init(ctx, s, grn_rec_document, 0, grn_rec_none, 0, 0); + */ + if (grn_ii_select(ctx, ii, string, string_len, s, op, &arg)) { + GRN_LOG(ctx, GRN_LOG_ERROR, "grn_ii_select on grn_ii_sel(1) failed !"); + return ctx->rc; + } + GRN_LOG(ctx, GRN_LOG_INFO, "exact: %d", GRN_HASH_SIZE(s)); + if (op == GRN_OP_OR) { + grn_id min = GRN_ID_NIL; + if ((int64_t)GRN_HASH_SIZE(s) <= ctx->impl->match_escalation_threshold) { + arg.mode = GRN_OP_UNSPLIT; + if (arg.match_info) { + if (arg.match_info->flags & GRN_MATCH_INFO_GET_MIN_RECORD_ID) { + min = arg.match_info->min; + arg.match_info->min = GRN_ID_NIL; + } + } + if (grn_ii_select(ctx, ii, string, string_len, s, op, &arg)) { + GRN_LOG(ctx, GRN_LOG_ERROR, + "grn_ii_select on grn_ii_sel(2) failed !"); + return ctx->rc; + } + GRN_LOG(ctx, GRN_LOG_INFO, "unsplit: %d", GRN_HASH_SIZE(s)); + if (arg.match_info) { + if (arg.match_info->flags & GRN_MATCH_INFO_GET_MIN_RECORD_ID) { + if (min > GRN_ID_NIL && min < arg.match_info->min) { + arg.match_info->min = min; + } + } + } + } + if ((int64_t)GRN_HASH_SIZE(s) <= ctx->impl->match_escalation_threshold) { + arg.mode = GRN_OP_PARTIAL; + if (arg.match_info) { + if (arg.match_info->flags & GRN_MATCH_INFO_GET_MIN_RECORD_ID) { + min = arg.match_info->min; + arg.match_info->min = GRN_ID_NIL; + } + } + if (grn_ii_select(ctx, ii, string, string_len, s, op, &arg)) { + GRN_LOG(ctx, GRN_LOG_ERROR, + "grn_ii_select on grn_ii_sel(3) failed !"); + return ctx->rc; + } + GRN_LOG(ctx, GRN_LOG_INFO, "partial: %d", GRN_HASH_SIZE(s)); + if (arg.match_info) { + if (arg.match_info->flags & GRN_MATCH_INFO_GET_MIN_RECORD_ID) { + if (min > GRN_ID_NIL && min < 
arg.match_info->min) { + arg.match_info->min = min; + } + } + } + } + } + GRN_LOG(ctx, GRN_LOG_INFO, "hits=%d", GRN_HASH_SIZE(s)); + return GRN_SUCCESS; + } +} + +grn_rc +grn_ii_at(grn_ctx *ctx, grn_ii *ii, grn_id id, grn_hash *s, grn_operator op) +{ + int rep = 0; + grn_ii_cursor *c; + grn_posting *pos; + if ((c = grn_ii_cursor_open(ctx, ii, id, GRN_ID_NIL, GRN_ID_MAX, + rep ? ii->n_elements : ii->n_elements - 1, 0))) { + while ((pos = grn_ii_cursor_next(ctx, c))) { + res_add(ctx, s, (grn_rset_posinfo *) pos, (1 + pos->weight), op); + } + grn_ii_cursor_close(ctx, c); + } + return ctx->rc; +} + +void +grn_ii_resolve_sel_and(grn_ctx *ctx, grn_hash *s, grn_operator op) +{ + if (op == GRN_OP_AND + && !(ctx->flags & GRN_CTX_TEMPORARY_DISABLE_II_RESOLVE_SEL_AND)) { + grn_id eid; + grn_rset_recinfo *ri; + grn_hash_cursor *c = grn_hash_cursor_open(ctx, s, NULL, 0, NULL, 0, + 0, -1, 0); + if (c) { + while ((eid = grn_hash_cursor_next(ctx, c))) { + grn_hash_cursor_get_value(ctx, c, (void **) &ri); + if ((ri->n_subrecs & GRN_RSET_UTIL_BIT)) { + ri->n_subrecs &= ~GRN_RSET_UTIL_BIT; + } else { + grn_hash_delete_by_id(ctx, s, eid, NULL); + } + } + grn_hash_cursor_close(ctx, c); + } + } +} + +void +grn_ii_cursor_inspect(grn_ctx *ctx, grn_ii_cursor *c, grn_obj *buf) +{ + grn_obj key_buf; + char key[GRN_TABLE_MAX_KEY_SIZE]; + int key_size; + int i = 0; + grn_ii_cursor_next_options options = { + .include_garbage = GRN_TRUE + }; + + GRN_TEXT_PUTS(ctx, buf, " #<"); + key_size = grn_table_get_key(ctx, c->ii->lexicon, c->id, + key, GRN_TABLE_MAX_KEY_SIZE); + GRN_OBJ_INIT(&key_buf, GRN_BULK, 0, c->ii->lexicon->header.domain); + GRN_TEXT_SET(ctx, &key_buf, key, key_size); + grn_inspect(ctx, buf, &key_buf); + GRN_OBJ_FIN(ctx, &key_buf); + + GRN_TEXT_PUTS(ctx, buf, "\n elements:[\n "); + while (grn_ii_cursor_next_internal(ctx, c, &options)) { + grn_posting *pos = c->post; + if (i > 0) { + GRN_TEXT_PUTS(ctx, buf, ",\n "); + } + i++; + GRN_TEXT_PUTS(ctx, buf, "{status:"); + if (pos->tf && 
pos->sid) { + GRN_TEXT_PUTS(ctx, buf, "available"); + } else { + GRN_TEXT_PUTS(ctx, buf, "garbage"); + } + GRN_TEXT_PUTS(ctx, buf, ", rid:"); + grn_text_lltoa(ctx, buf, pos->rid); + GRN_TEXT_PUTS(ctx, buf, ", sid:"); + grn_text_lltoa(ctx, buf, pos->sid); + GRN_TEXT_PUTS(ctx, buf, ", pos:"); + grn_text_lltoa(ctx, buf, pos->pos); + GRN_TEXT_PUTS(ctx, buf, ", tf:"); + grn_text_lltoa(ctx, buf, pos->tf); + GRN_TEXT_PUTS(ctx, buf, ", weight:"); + grn_text_lltoa(ctx, buf, pos->weight); + GRN_TEXT_PUTS(ctx, buf, ", rest:"); + grn_text_lltoa(ctx, buf, pos->rest); + GRN_TEXT_PUTS(ctx, buf, "}"); + } + GRN_TEXT_PUTS(ctx, buf, "\n ]\n >"); +} + +void +grn_ii_inspect_values(grn_ctx *ctx, grn_ii *ii, grn_obj *buf) +{ + grn_table_cursor *tc; + GRN_TEXT_PUTS(ctx, buf, "["); + if ((tc = grn_table_cursor_open(ctx, ii->lexicon, NULL, 0, NULL, 0, 0, -1, + GRN_CURSOR_ASCENDING))) { + int i = 0; + grn_id tid; + grn_ii_cursor *c; + while ((tid = grn_table_cursor_next(ctx, tc))) { + if (i > 0) { + GRN_TEXT_PUTS(ctx, buf, ","); + } + i++; + GRN_TEXT_PUTS(ctx, buf, "\n"); + if ((c = grn_ii_cursor_open(ctx, ii, tid, GRN_ID_NIL, GRN_ID_MAX, + ii->n_elements, + GRN_OBJ_WITH_POSITION|GRN_OBJ_WITH_SECTION))) { + grn_ii_cursor_inspect(ctx, c, buf); + grn_ii_cursor_close(ctx, c); + } + } + grn_table_cursor_close(ctx, tc); + } + GRN_TEXT_PUTS(ctx, buf, "]"); +} + +/********************** buffered index builder ***********************/ + +const grn_id II_BUFFER_TYPE_MASK = 0xc0000000; +#define II_BUFFER_TYPE_RID 0x80000000 +#define II_BUFFER_TYPE_WEIGHT 0x40000000 +#define II_BUFFER_TYPE(id) (((id) & II_BUFFER_TYPE_MASK)) +#define II_BUFFER_PACK(value, type) ((value) | (type)) +#define II_BUFFER_UNPACK(id, type) ((id) & ~(type)) +#define II_BUFFER_ORDER GRN_CURSOR_BY_KEY +const uint16_t II_BUFFER_NTERMS_PER_BUFFER = 16380; +const uint32_t II_BUFFER_PACKED_BUF_SIZE = 0x4000000; +const char *TMPFILE_PATH = "grn_ii_buffer_tmp"; +const uint32_t II_BUFFER_NCOUNTERS_MARGIN = 0x100000; +const size_t 
II_BUFFER_BLOCK_SIZE = 0x1000000; +const uint32_t II_BUFFER_BLOCK_READ_UNIT_SIZE = 0x200000; + +typedef struct { + unsigned int sid; /* Section ID */ + unsigned int weight; /* Weight */ + const char *p; /* Value address */ + uint32_t len; /* Value length */ + char *buf; /* Buffer address */ + uint32_t cap; /* Buffer size */ +} ii_buffer_value; + +/* ii_buffer_counter is associated with a combination of a block an a term. */ +typedef struct { + uint32_t nrecs; /* Number of records or sections */ + uint32_t nposts; /* Number of occurrences */ + + /* Information of the last value */ + grn_id last_rid; /* Record ID */ + uint32_t last_sid; /* Section ID */ + uint32_t last_tf; /* Term frequency */ + uint32_t last_weight; /* Total weight */ + uint32_t last_pos; /* Token position */ + + /* Meaning of offset_* is different before/after encoding. */ + /* Before encoding: size in encoded sequence */ + /* After encoding: Offset in encoded sequence */ + uint32_t offset_rid; /* Record ID */ + uint32_t offset_sid; /* Section ID */ + uint32_t offset_tf; /* Term frequency */ + uint32_t offset_weight; /* Weight */ + uint32_t offset_pos; /* Token position */ +} ii_buffer_counter; + +typedef struct { + off64_t head; + off64_t tail; + uint32_t nextsize; + uint8_t *buffer; + uint32_t buffersize; + uint8_t *bufcur; + uint32_t rest; + grn_id tid; + uint32_t nrecs; + uint32_t nposts; + grn_id *recs; + uint32_t *tfs; + uint32_t *posts; +} ii_buffer_block; + +struct _grn_ii_buffer { + grn_obj *lexicon; /* Global lexicon */ + grn_obj *tmp_lexicon; /* Temporary lexicon for each block */ + ii_buffer_block *blocks; /* Blocks */ + uint32_t nblocks; /* Number of blocks */ + int tmpfd; /* Descriptor of temporary file */ + char tmpfpath[PATH_MAX]; /* Path of temporary file */ + uint64_t update_buffer_size; + + // stuff for parsing + off64_t filepos; /* Write position of temporary file */ + grn_id *block_buf; /* Buffer for the current block */ + size_t block_buf_size; /* Size of block_buf */ + size_t 
block_pos; /* Write position of block_buf */ + ii_buffer_counter *counters; /* Status of terms */ + uint32_t ncounters; /* Number of counters */ + size_t total_size; + size_t curr_size; + ii_buffer_value *values; /* Values in block */ + unsigned int nvalues; /* Number of values in block */ + unsigned int max_nvalues; /* Size of values */ + grn_id last_rid; + + // stuff for merging + grn_ii *ii; + uint32_t lseg; + uint32_t dseg; + buffer *term_buffer; + datavec data_vectors[MAX_N_ELEMENTS + 1]; + uint8_t *packed_buf; + size_t packed_buf_size; + size_t packed_len; + size_t total_chunk_size; +}; + +/* block_new returns a new ii_buffer_block to store block information. */ +static ii_buffer_block * +block_new(grn_ctx *ctx, grn_ii_buffer *ii_buffer) +{ + ii_buffer_block *block; + if (!(ii_buffer->nblocks & 0x3ff)) { + ii_buffer_block *blocks; + if (!(blocks = GRN_REALLOC(ii_buffer->blocks, + (ii_buffer->nblocks + 0x400) * + sizeof(ii_buffer_block)))) { + return NULL; + } + ii_buffer->blocks = blocks; + } + block = &ii_buffer->blocks[ii_buffer->nblocks]; + block->head = ii_buffer->filepos; + block->rest = 0; + block->buffer = NULL; + block->buffersize = 0; + return block; +} + +/* allocate_outbuf allocates memory to flush a block. 
*/ +static uint8_t * +allocate_outbuf(grn_ctx *ctx, grn_ii_buffer *ii_buffer) +{ + size_t bufsize = 0, bufsize_ = 0; + uint32_t flags = ii_buffer->ii->header->flags; + ii_buffer_counter *counter = ii_buffer->counters; + grn_id tid, tid_max = grn_table_size(ctx, ii_buffer->tmp_lexicon); + for (tid = 1; tid <= tid_max; counter++, tid++) { + counter->offset_tf += GRN_B_ENC_SIZE(counter->last_tf - 1); + counter->last_rid = 0; + counter->last_tf = 0; + bufsize += 5; + bufsize += GRN_B_ENC_SIZE(counter->nrecs); + bufsize += GRN_B_ENC_SIZE(counter->nposts); + bufsize += counter->offset_rid; + if ((flags & GRN_OBJ_WITH_SECTION)) { + bufsize += counter->offset_sid; + } + bufsize += counter->offset_tf; + if ((flags & GRN_OBJ_WITH_WEIGHT)) { + bufsize += counter->offset_weight; + } + if ((flags & GRN_OBJ_WITH_POSITION)) { + bufsize += counter->offset_pos; + } + if (bufsize_ + II_BUFFER_BLOCK_READ_UNIT_SIZE < bufsize) { + bufsize += sizeof(uint32_t); + bufsize_ = bufsize; + } + } + GRN_LOG(ctx, GRN_LOG_INFO, "flushing:%d bufsize:%" GRN_FMT_SIZE, + ii_buffer->nblocks, bufsize); + return (uint8_t *)GRN_MALLOC(bufsize); +} + +/* + * The temporary file format is roughly as follows: + * + * File = Block... + * Block = Unit... + * Unit = TermChunk (key order) + * NextUnitSize (The first unit size is kept on memory) + * Chunk = Term... + * Term = ID (gtid) + * NumRecordsOrSections (nrecs), NumOccurrences (nposts) + * RecordID... (rid, diff) + * [SectionID... (sid, diff)] + * TermFrequency... (tf, diff) + * [Weight... (weight, diff)] + * [Position... (pos, diff)] + */ + +/* + * encode_terms encodes terms in ii_buffer->tmp_lexicon and returns the + * expected temporary file size. + */ +static size_t +encode_terms(grn_ctx *ctx, grn_ii_buffer *ii_buffer, + uint8_t *outbuf, ii_buffer_block *block) +{ + grn_id tid; + uint8_t *outbufp = outbuf; + uint8_t *outbufp_ = outbuf; + grn_table_cursor *tc; + /* The first size is written into block->nextsize. 
*/ + uint8_t *pnext = (uint8_t *)&block->nextsize; + uint32_t flags = ii_buffer->ii->header->flags; + tc = grn_table_cursor_open(ctx, ii_buffer->tmp_lexicon, + NULL, 0, NULL, 0, 0, -1, II_BUFFER_ORDER); + while ((tid = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) { + char key[GRN_TABLE_MAX_KEY_SIZE]; + int key_size = grn_table_get_key(ctx, ii_buffer->tmp_lexicon, tid, + key, GRN_TABLE_MAX_KEY_SIZE); + /* gtid is a global term ID, not in a temporary lexicon. */ + grn_id gtid = grn_table_add(ctx, ii_buffer->lexicon, key, key_size, NULL); + ii_buffer_counter *counter = &ii_buffer->counters[tid - 1]; + if (counter->nrecs) { + uint32_t offset_rid = counter->offset_rid; + uint32_t offset_sid = counter->offset_sid; + uint32_t offset_tf = counter->offset_tf; + uint32_t offset_weight = counter->offset_weight; + uint32_t offset_pos = counter->offset_pos; + GRN_B_ENC(gtid, outbufp); + GRN_B_ENC(counter->nrecs, outbufp); + GRN_B_ENC(counter->nposts, outbufp); + ii_buffer->total_size += counter->nrecs + counter->nposts; + counter->offset_rid = outbufp - outbuf; + outbufp += offset_rid; + if ((flags & GRN_OBJ_WITH_SECTION)) { + counter->offset_sid = outbufp - outbuf; + outbufp += offset_sid; + } + counter->offset_tf = outbufp - outbuf; + outbufp += offset_tf; + if ((flags & GRN_OBJ_WITH_WEIGHT)) { + counter->offset_weight = outbufp - outbuf; + outbufp += offset_weight; + } + if ((flags & GRN_OBJ_WITH_POSITION)) { + counter->offset_pos = outbufp - outbuf; + outbufp += offset_pos; + } + } + if (outbufp_ + II_BUFFER_BLOCK_READ_UNIT_SIZE < outbufp) { + uint32_t size = outbufp - outbufp_ + sizeof(uint32_t); + grn_memcpy(pnext, &size, sizeof(uint32_t)); + pnext = outbufp; + outbufp += sizeof(uint32_t); + outbufp_ = outbufp; + } + } + grn_table_cursor_close(ctx, tc); + if (outbufp_ < outbufp) { + uint32_t size = outbufp - outbufp_; + grn_memcpy(pnext, &size, sizeof(uint32_t)); + } + return outbufp - outbuf; +} + +/* encode_postings encodes data in ii_buffer->block_buf. 
*/ +static void +encode_postings(grn_ctx *ctx, grn_ii_buffer *ii_buffer, uint8_t *outbuf) +{ + grn_id rid = 0; + unsigned int sid = 1; + unsigned int weight = 0; + uint32_t pos = 0; + uint32_t rest; + grn_id *bp = ii_buffer->block_buf; + uint32_t flags = ii_buffer->ii->header->flags; + for (rest = ii_buffer->block_pos; rest; bp++, rest--) { + grn_id id = *bp; + switch (II_BUFFER_TYPE(id)) { + case II_BUFFER_TYPE_RID : + rid = II_BUFFER_UNPACK(id, II_BUFFER_TYPE_RID); + if ((flags & GRN_OBJ_WITH_SECTION) && rest) { + sid = *++bp; + rest--; + } + weight = 0; + pos = 0; + break; + case II_BUFFER_TYPE_WEIGHT : + weight = II_BUFFER_UNPACK(id, II_BUFFER_TYPE_WEIGHT); + break; + default : + { + ii_buffer_counter *counter = &ii_buffer->counters[id - 1]; + if (counter->last_rid == rid && counter->last_sid == sid) { + counter->last_tf++; + counter->last_weight += weight; + } else { + if (counter->last_tf) { + uint8_t *p = outbuf + counter->offset_tf; + GRN_B_ENC(counter->last_tf - 1, p); + counter->offset_tf = p - outbuf; + if (flags & GRN_OBJ_WITH_WEIGHT) { + p = outbuf + counter->offset_weight; + GRN_B_ENC(counter->last_weight, p); + counter->offset_weight = p - outbuf; + } + } + { + uint8_t *p = outbuf + counter->offset_rid; + GRN_B_ENC(rid - counter->last_rid, p); + counter->offset_rid = p - outbuf; + } + if (flags & GRN_OBJ_WITH_SECTION) { + uint8_t *p = outbuf + counter->offset_sid; + if (counter->last_rid != rid) { + GRN_B_ENC(sid - 1, p); + } else { + GRN_B_ENC(sid - counter->last_sid - 1, p); + } + counter->offset_sid = p - outbuf; + } + counter->last_rid = rid; + counter->last_sid = sid; + counter->last_tf = 1; + counter->last_weight = weight; + counter->last_pos = 0; + } + if ((flags & GRN_OBJ_WITH_POSITION) && rest) { + uint8_t *p = outbuf + counter->offset_pos; + pos = *++bp; + rest--; + GRN_B_ENC(pos - counter->last_pos, p); + counter->offset_pos = p - outbuf; + counter->last_pos = pos; + } + } + break; + } + } +} + +/* encode_last_tf encodes last_tf and 
last_weight in counters. */ +static void +encode_last_tf(grn_ctx *ctx, grn_ii_buffer *ii_buffer, uint8_t *outbuf) +{ + ii_buffer_counter *counter = ii_buffer->counters; + grn_id tid, tid_max = grn_table_size(ctx, ii_buffer->tmp_lexicon); + for (tid = 1; tid <= tid_max; counter++, tid++) { + uint8_t *p = outbuf + counter->offset_tf; + GRN_B_ENC(counter->last_tf - 1, p); + } + if ((ii_buffer->ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { + for (tid = 1; tid <= tid_max; counter++, tid++) { + uint8_t *p = outbuf + counter->offset_weight; + GRN_B_ENC(counter->last_weight, p); + } + } +} + +/* + * grn_ii_buffer_flush flushes the current block (ii_buffer->block_buf, + * counters and tmp_lexicon) to a temporary file (ii_buffer->tmpfd). + * Also, block information is stored into ii_buffer->blocks. + */ +static void +grn_ii_buffer_flush(grn_ctx *ctx, grn_ii_buffer *ii_buffer) +{ + size_t encsize; + uint8_t *outbuf; + ii_buffer_block *block; + GRN_LOG(ctx, GRN_LOG_DEBUG, "flushing:%d npostings:%" GRN_FMT_SIZE, + ii_buffer->nblocks, ii_buffer->block_pos); + if (!(block = block_new(ctx, ii_buffer))) { return; } + if (!(outbuf = allocate_outbuf(ctx, ii_buffer))) { return; } + encsize = encode_terms(ctx, ii_buffer, outbuf, block); + encode_postings(ctx, ii_buffer, outbuf); + encode_last_tf(ctx, ii_buffer, outbuf); + { + ssize_t r = grn_write(ii_buffer->tmpfd, outbuf, encsize); + if (r != encsize) { + ERR(GRN_INPUT_OUTPUT_ERROR, + "write returned %" GRN_FMT_LLD " != %" GRN_FMT_LLU, + (long long int)r, (unsigned long long int)encsize); + GRN_FREE(outbuf); + return; + } + ii_buffer->filepos += r; + block->tail = ii_buffer->filepos; + } + GRN_FREE(outbuf); + memset(ii_buffer->counters, 0, + grn_table_size(ctx, ii_buffer->tmp_lexicon) * + sizeof(ii_buffer_counter)); + grn_obj_close(ctx, ii_buffer->tmp_lexicon); + GRN_LOG(ctx, GRN_LOG_DEBUG, "flushed: %d encsize:%" GRN_FMT_SIZE, + ii_buffer->nblocks, encsize); + ii_buffer->tmp_lexicon = NULL; + ii_buffer->nblocks++; + 
ii_buffer->block_pos = 0; +} + +const uint32_t PAT_CACHE_SIZE = 1<<20; + +/* + * get_tmp_lexicon returns a temporary lexicon. + * + * Note that a lexicon is created for each block and ii_buffer->tmp_lexicon is + * closed in grn_ii_buffer_flush. + */ +static grn_obj * +get_tmp_lexicon(grn_ctx *ctx, grn_ii_buffer *ii_buffer) +{ + grn_obj *tmp_lexicon = ii_buffer->tmp_lexicon; + if (!tmp_lexicon) { + grn_obj *domain = grn_ctx_at(ctx, ii_buffer->lexicon->header.domain); + grn_obj *range = grn_ctx_at(ctx, DB_OBJ(ii_buffer->lexicon)->range); + grn_obj *tokenizer; + grn_obj *normalizer; + grn_obj *token_filters; + grn_table_flags flags; + grn_table_get_info(ctx, ii_buffer->lexicon, &flags, NULL, + &tokenizer, &normalizer, &token_filters); + flags &= ~GRN_OBJ_PERSISTENT; + tmp_lexicon = grn_table_create(ctx, NULL, 0, NULL, flags, domain, range); + if (tmp_lexicon) { + ii_buffer->tmp_lexicon = tmp_lexicon; + grn_obj_set_info(ctx, tmp_lexicon, + GRN_INFO_DEFAULT_TOKENIZER, tokenizer); + grn_obj_set_info(ctx, tmp_lexicon, + GRN_INFO_NORMALIZER, normalizer); + grn_obj_set_info(ctx, tmp_lexicon, + GRN_INFO_TOKEN_FILTERS, token_filters); + if ((flags & GRN_OBJ_TABLE_TYPE_MASK) == GRN_OBJ_TABLE_PAT_KEY) { + grn_pat_cache_enable(ctx, (grn_pat *)tmp_lexicon, PAT_CACHE_SIZE); + } + } + } + return tmp_lexicon; +} + +/* get_buffer_counter returns a counter associated with tid. 
*/ +static ii_buffer_counter * +get_buffer_counter(grn_ctx *ctx, grn_ii_buffer *ii_buffer, + grn_obj *tmp_lexicon, grn_id tid) +{ + if (tid > ii_buffer->ncounters) { + ii_buffer_counter *counters; + uint32_t ncounters = + grn_table_size(ctx, tmp_lexicon) + II_BUFFER_NCOUNTERS_MARGIN; + counters = GRN_REALLOC(ii_buffer->counters, + ncounters * sizeof(ii_buffer_counter)); + if (!counters) { return NULL; } + memset(&counters[ii_buffer->ncounters], 0, + (ncounters - ii_buffer->ncounters) * sizeof(ii_buffer_counter)); + ii_buffer->ncounters = ncounters; + ii_buffer->counters = counters; + } + return &ii_buffer->counters[tid - 1]; +} + +/* + * grn_ii_buffer_tokenize_value tokenizes a value. + * + * The result is written into the current block (ii_buffer->tmp_lexicon, + * ii_buffer->block_buf, ii_buffer->counters, etc.). + */ +static void +grn_ii_buffer_tokenize_value(grn_ctx *ctx, grn_ii_buffer *ii_buffer, + grn_id rid, const ii_buffer_value *value) +{ + grn_obj *tmp_lexicon; + if ((tmp_lexicon = get_tmp_lexicon(ctx, ii_buffer))) { + unsigned int token_flags = 0; + grn_token_cursor *token_cursor; + grn_id *buffer = ii_buffer->block_buf; + uint32_t block_pos = ii_buffer->block_pos; + uint32_t ii_flags = ii_buffer->ii->header->flags; + buffer[block_pos++] = II_BUFFER_PACK(rid, II_BUFFER_TYPE_RID); + if (ii_flags & GRN_OBJ_WITH_SECTION) { + buffer[block_pos++] = value->sid; + } + if (value->weight) { + buffer[block_pos++] = II_BUFFER_PACK(value->weight, + II_BUFFER_TYPE_WEIGHT); + } + if ((token_cursor = grn_token_cursor_open(ctx, tmp_lexicon, + value->p, value->len, + GRN_TOKEN_ADD, token_flags))) { + while (!token_cursor->status) { + grn_id tid; + if ((tid = grn_token_cursor_next(ctx, token_cursor))) { + ii_buffer_counter *counter; + counter = get_buffer_counter(ctx, ii_buffer, tmp_lexicon, tid); + if (!counter) { return; } + buffer[block_pos++] = tid; + if (ii_flags & GRN_OBJ_WITH_POSITION) { + buffer[block_pos++] = token_cursor->pos; + } + if (counter->last_rid != rid) 
{ + counter->offset_rid += GRN_B_ENC_SIZE(rid - counter->last_rid); + counter->last_rid = rid; + counter->offset_sid += GRN_B_ENC_SIZE(value->sid - 1); + counter->last_sid = value->sid; + if (counter->last_tf) { + counter->offset_tf += GRN_B_ENC_SIZE(counter->last_tf - 1); + counter->last_tf = 0; + counter->offset_weight += GRN_B_ENC_SIZE(counter->last_weight); + counter->last_weight = 0; + } + counter->last_pos = 0; + counter->nrecs++; + } else if (counter->last_sid != value->sid) { + counter->offset_rid += GRN_B_ENC_SIZE(0); + counter->offset_sid += + GRN_B_ENC_SIZE(value->sid - counter->last_sid - 1); + counter->last_sid = value->sid; + if (counter->last_tf) { + counter->offset_tf += GRN_B_ENC_SIZE(counter->last_tf - 1); + counter->last_tf = 0; + counter->offset_weight += GRN_B_ENC_SIZE(counter->last_weight); + counter->last_weight = 0; + } + counter->last_pos = 0; + counter->nrecs++; + } + counter->offset_pos += + GRN_B_ENC_SIZE(token_cursor->pos - counter->last_pos); + counter->last_pos = token_cursor->pos; + counter->last_tf++; + counter->last_weight += value->weight; + counter->nposts++; + } + } + grn_token_cursor_close(ctx, token_cursor); + } + ii_buffer->block_pos = block_pos; + } +} + +/* + * grn_ii_buffer_tokenize tokenizes ii_buffer->values. + * + * grn_ii_buffer_tokenize estimates the size of tokenized values. + * If the remaining space of the current block is not enough to store the new + * tokenized values, the current block is flushed. + * Then, grn_ii_buffer_tokenize tokenizes values. 
+ */ +static void +grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_id rid) +{ + unsigned int i; + uint32_t est_len = 0; + for (i = 0; i < ii_buffer->nvalues; i++) { + est_len += ii_buffer->values[i].len * 2 + 2; + } + if (ii_buffer->block_buf_size < ii_buffer->block_pos + est_len) { + grn_ii_buffer_flush(ctx, ii_buffer); + } + if (ii_buffer->block_buf_size < est_len) { + grn_id *block_buf = (grn_id *)GRN_REALLOC(ii_buffer->block_buf, + est_len * sizeof(grn_id)); + if (block_buf) { + ii_buffer->block_buf = block_buf; + ii_buffer->block_buf_size = est_len; + } + } + + for (i = 0; i < ii_buffer->nvalues; i++) { + const ii_buffer_value *value = &ii_buffer->values[i]; + if (value->len) { + uint32_t est_len = value->len * 2 + 2; + if (ii_buffer->block_buf_size >= ii_buffer->block_pos + est_len) { + grn_ii_buffer_tokenize_value(ctx, ii_buffer, rid, value); + } + } + } + ii_buffer->nvalues = 0; +} + +/* grn_ii_buffer_fetch fetches the next term. */ +static void +grn_ii_buffer_fetch(grn_ctx *ctx, grn_ii_buffer *ii_buffer, + ii_buffer_block *block) +{ + if (!block->rest) { + /* Read the next unit. 
*/ + if (block->head < block->tail) { + size_t bytesize = block->nextsize; + if (block->buffersize < block->nextsize) { + void *r = GRN_REALLOC(block->buffer, bytesize); + if (r) { + block->buffer = (uint8_t *)r; + block->buffersize = block->nextsize; + } else { + GRN_LOG(ctx, GRN_LOG_WARNING, "realloc: %" GRN_FMT_LLU, + (unsigned long long int)bytesize); + return; + } + } + { + off64_t seeked_position; + seeked_position = grn_lseek(ii_buffer->tmpfd, block->head, SEEK_SET); + if (seeked_position != block->head) { + ERRNO_ERR("failed to " + "grn_lseek(%" GRN_FMT_OFF64_T ") -> %" GRN_FMT_OFF64_T, + block->head, + seeked_position); + return; + } + } + { + size_t read_bytesize; + read_bytesize = grn_read(ii_buffer->tmpfd, block->buffer, bytesize); + if (read_bytesize != bytesize) { + SERR("failed to grn_read(%" GRN_FMT_SIZE ") -> %" GRN_FMT_SIZE, + bytesize, read_bytesize); + return; + } + } + block->head += bytesize; + block->bufcur = block->buffer; + if (block->head >= block->tail) { + if (block->head > block->tail) { + GRN_LOG(ctx, GRN_LOG_WARNING, + "fetch error: %" GRN_FMT_INT64D " > %" GRN_FMT_INT64D, + block->head, block->tail); + } + block->rest = block->nextsize; + block->nextsize = 0; + } else { + block->rest = block->nextsize - sizeof(uint32_t); + grn_memcpy(&block->nextsize, + &block->buffer[block->rest], sizeof(uint32_t)); + } + } + } + if (block->rest) { + uint8_t *p = block->bufcur; + GRN_B_DEC(block->tid, p); + GRN_B_DEC(block->nrecs, p); + GRN_B_DEC(block->nposts, p); + block->rest -= (p - block->bufcur); + block->bufcur = p; + } else { + block->tid = 0; + } +} + +/* grn_ii_buffer_chunk_flush flushes the current buffer for packed postings. 
*/
static void
grn_ii_buffer_chunk_flush(grn_ctx *ctx, grn_ii_buffer *ii_buffer)
{
  grn_ii *ii = ii_buffer->ii;
  grn_io_win window;
  uint32_t chunk_id;

  /* Store the packed postings into a new chunk. */
  chunk_new(ctx, ii, &chunk_id, ii_buffer->packed_len);
  GRN_LOG(ctx, GRN_LOG_INFO, "chunk:%d, packed_len:%" GRN_FMT_SIZE,
          chunk_id, ii_buffer->packed_len);
  fake_map(ctx, ii->chunk, &window, ii_buffer->packed_buf,
           chunk_id, ii_buffer->packed_len);
  grn_io_win_unmap(&window);

  /* Bind the chunk to the current term buffer. */
  {
    buffer_header *header = &(ii_buffer->term_buffer->header);
    header->chunk = chunk_id;
    header->chunk_size = ii_buffer->packed_len;
    header->buffer_free =
      S_SEGMENT - sizeof(buffer_header) -
      header->nterms * sizeof(buffer_term);
    header->nterms_void = 0;
  }
  buffer_segment_update(ii, ii_buffer->lseg, ii_buffer->dseg);

  /* Update statistics. */
  ii->header->total_chunk_size += ii_buffer->packed_len;
  ii_buffer->total_chunk_size += ii_buffer->packed_len;
  GRN_LOG(ctx, GRN_LOG_DEBUG,
          "nterms=%d chunk=%d total=%" GRN_FMT_INT64U "KB",
          ii_buffer->term_buffer->header.nterms,
          ii_buffer->term_buffer->header.chunk_size,
          ii->header->total_chunk_size >> 10);

  /* Detach the flushed buffers. The next term starts with fresh ones. */
  ii_buffer->term_buffer = NULL;
  ii_buffer->packed_buf = NULL;
  ii_buffer->packed_len = 0;
  ii_buffer->packed_buf_size = 0;
  ii_buffer->curr_size = 0;
}

/*
 * merge_hit_blocks merges hit blocks into ii_buffer->data_vectors.
 * merge_hit_blocks returns the estimated maximum size in bytes.
+ */ +static size_t +merge_hit_blocks(grn_ctx *ctx, grn_ii_buffer *ii_buffer, + ii_buffer_block *hits[], int nhits) +{ + uint64_t nrecs = 0; + uint64_t nposts = 0; + size_t max_size; + uint64_t flags = ii_buffer->ii->header->flags; + int i; + for (i = 0; i < nhits; i++) { + ii_buffer_block *block = hits[i]; + nrecs += block->nrecs; + nposts += block->nposts; + } + ii_buffer->curr_size += nrecs + nposts; + max_size = nrecs * (ii_buffer->ii->n_elements); + if (flags & GRN_OBJ_WITH_POSITION) { max_size += nposts - nrecs; } + datavec_reset(ctx, ii_buffer->data_vectors, + ii_buffer->ii->n_elements, nrecs, max_size); + { + int i; + uint32_t lr = 0; /* Last rid */ + uint64_t spos = 0; + uint32_t *ridp, *sidp = NULL, *tfp, *weightp = NULL, *posp = NULL; + { + /* Get write positions in datavec. */ + int j = 0; + ridp = ii_buffer->data_vectors[j++].data; + if (flags & GRN_OBJ_WITH_SECTION) { + sidp = ii_buffer->data_vectors[j++].data; + } + tfp = ii_buffer->data_vectors[j++].data; + if (flags & GRN_OBJ_WITH_WEIGHT) { + weightp = ii_buffer->data_vectors[j++].data; + } + if (flags & GRN_OBJ_WITH_POSITION) { + posp = ii_buffer->data_vectors[j++].data; + } + } + for (i = 0; i < nhits; i++) { + /* Read postings from hit blocks and join the postings into datavec. 
*/ + ii_buffer_block *block = hits[i]; + uint8_t *p = block->bufcur; + uint32_t n = block->nrecs; + if (n) { + GRN_B_DEC(*ridp, p); + *ridp -= lr; + lr += *ridp++; + while (--n) { + GRN_B_DEC(*ridp, p); + lr += *ridp++; + } + } + if ((flags & GRN_OBJ_WITH_SECTION)) { + for (n = block->nrecs; n; n--) { + GRN_B_DEC(*sidp++, p); + } + } + for (n = block->nrecs; n; n--) { + GRN_B_DEC(*tfp++, p); + } + if ((flags & GRN_OBJ_WITH_WEIGHT)) { + for (n = block->nrecs; n; n--) { + GRN_B_DEC(*weightp++, p); + } + } + if ((flags & GRN_OBJ_WITH_POSITION)) { + for (n = block->nposts; n; n--) { + GRN_B_DEC(*posp, p); + spos += *posp++; + } + } + block->rest -= (p - block->bufcur); + block->bufcur = p; + grn_ii_buffer_fetch(ctx, ii_buffer, block); + } + { + /* Set size and flags of datavec. */ + int j = 0; + uint32_t f_s = (nrecs < 3) ? 0 : USE_P_ENC; + uint32_t f_d = ((nrecs < 16) || (nrecs <= (lr >> 8))) ? 0 : USE_P_ENC; + ii_buffer->data_vectors[j].data_size = nrecs; + ii_buffer->data_vectors[j++].flags = f_d; + if ((flags & GRN_OBJ_WITH_SECTION)) { + ii_buffer->data_vectors[j].data_size = nrecs; + ii_buffer->data_vectors[j++].flags = f_s; + } + ii_buffer->data_vectors[j].data_size = nrecs; + ii_buffer->data_vectors[j++].flags = f_s; + if ((flags & GRN_OBJ_WITH_WEIGHT)) { + ii_buffer->data_vectors[j].data_size = nrecs; + ii_buffer->data_vectors[j++].flags = f_s; + } + if ((flags & GRN_OBJ_WITH_POSITION)) { + uint32_t f_p = (((nposts < 32) || + (nposts <= (spos >> 13))) ? 
0 : USE_P_ENC); + ii_buffer->data_vectors[j].data_size = nposts; + ii_buffer->data_vectors[j++].flags = f_p|ODD; + } + } + } + return (max_size + ii_buffer->ii->n_elements) * 4; +} + +static buffer * +get_term_buffer(grn_ctx *ctx, grn_ii_buffer *ii_buffer) +{ + if (!ii_buffer->term_buffer) { + uint32_t lseg; + void *term_buffer; + for (lseg = 0; lseg < GRN_II_MAX_LSEG; lseg++) { + if (ii_buffer->ii->header->binfo[lseg] == GRN_II_PSEG_NOT_ASSIGNED) { break; } + } + if (lseg == GRN_II_MAX_LSEG) { + DEFINE_NAME(ii_buffer->ii); + MERR("[ii][buffer][term-buffer] couldn't find a free buffer: " + "<%.*s>", + name_size, name); + return NULL; + } + ii_buffer->lseg = lseg; + ii_buffer->dseg = segment_get(ctx, ii_buffer->ii); + GRN_IO_SEG_REF(ii_buffer->ii->seg, ii_buffer->dseg, term_buffer); + ii_buffer->term_buffer = (buffer *)term_buffer; + } + return ii_buffer->term_buffer; +} + +/* + * try_in_place_packing tries to pack a posting in an array element. + * + * The requirements are as follows: + * - nposts == 1 + * - nhits == 1 && nrecs == 1 && tf == 0 + * - weight == 0 + * - !(flags & GRN_OBJ_WITH_SECTION) || (rid < 0x100000 && sid < 0x800) + */ +static grn_bool +try_in_place_packing(grn_ctx *ctx, grn_ii_buffer *ii_buffer, + grn_id tid, ii_buffer_block *hits[], int nhits) +{ + if (nhits == 1 && hits[0]->nrecs == 1 && hits[0]->nposts == 1) { + grn_id rid; + uint32_t sid = 1, tf, pos = 0, weight = 0; + ii_buffer_block *block = hits[0]; + uint8_t *p = block->bufcur; + uint32_t flags = ii_buffer->ii->header->flags; + GRN_B_DEC(rid, p); + if (flags & GRN_OBJ_WITH_SECTION) { + GRN_B_DEC(sid, p); + sid++; + } + GRN_B_DEC(tf, p); + if (tf != 0) { GRN_LOG(ctx, GRN_LOG_WARNING, "tf=%d", tf); } + if (flags & GRN_OBJ_WITH_WEIGHT) { GRN_B_DEC(weight, p); } + if (flags & GRN_OBJ_WITH_POSITION) { GRN_B_DEC(pos, p); } + if (!weight) { + if (flags & GRN_OBJ_WITH_SECTION) { + if (rid < 0x100000 && sid < 0x800) { + uint32_t *a = array_get(ctx, ii_buffer->ii, tid); + a[0] = (rid << 12) + (sid 
<< 1) + 1; + a[1] = pos; + array_unref(ii_buffer->ii, tid); + } else { + return GRN_FALSE; + } + } else { + uint32_t *a = array_get(ctx, ii_buffer->ii, tid); + a[0] = (rid << 1) + 1; + a[1] = pos; + array_unref(ii_buffer->ii, tid); + } + block->rest -= (p - block->bufcur); + block->bufcur = p; + grn_ii_buffer_fetch(ctx, ii_buffer, block); + return GRN_TRUE; + } + } + return GRN_FALSE; +} + +/* grn_ii_buffer_merge merges hit blocks and pack it. */ +static void +grn_ii_buffer_merge(grn_ctx *ctx, grn_ii_buffer *ii_buffer, + grn_id tid, ii_buffer_block *hits[], int nhits) +{ + if (!try_in_place_packing(ctx, ii_buffer, tid, hits, nhits)) { + /* Merge hit blocks and reserve a buffer for packed data. */ + size_t max_size = merge_hit_blocks(ctx, ii_buffer, hits, nhits); + if (ii_buffer->packed_buf && + ii_buffer->packed_buf_size < ii_buffer->packed_len + max_size) { + grn_ii_buffer_chunk_flush(ctx, ii_buffer); + } + if (!ii_buffer->packed_buf) { + size_t buf_size = (max_size > II_BUFFER_PACKED_BUF_SIZE) + ? max_size : II_BUFFER_PACKED_BUF_SIZE; + if ((ii_buffer->packed_buf = GRN_MALLOC(buf_size))) { + ii_buffer->packed_buf_size = buf_size; + } + } + { + /* Pack postings into the current buffer. 
*/ + uint16_t nterm; + size_t packed_len; + buffer_term *bt; + uint32_t *a; + buffer *term_buffer; + + a = array_get(ctx, ii_buffer->ii, tid); + if (!a) { + DEFINE_NAME(ii_buffer->ii); + MERR("[ii][buffer][merge] failed to allocate an array: " + "<%.*s>: " + "<%u>", + name_size, name, + tid); + return; + } + term_buffer = get_term_buffer(ctx, ii_buffer); + if (!term_buffer) { + DEFINE_NAME(ii_buffer->ii); + MERR("[ii][buffer][merge] failed to allocate a term buffer: " + "<%.*s>: " + "<%u>", + name_size, name, + tid); + return; + } + nterm = term_buffer->header.nterms++; + bt = &term_buffer->terms[nterm]; + a[0] = SEG2POS(ii_buffer->lseg, + (sizeof(buffer_header) + sizeof(buffer_term) * nterm)); + packed_len = grn_p_encv(ctx, ii_buffer->data_vectors, + ii_buffer->ii->n_elements, + ii_buffer->packed_buf + + ii_buffer->packed_len); + a[1] = ii_buffer->data_vectors[0].data_size; + bt->tid = tid; + bt->size_in_buffer = 0; + bt->pos_in_buffer = 0; + bt->size_in_chunk = packed_len; + bt->pos_in_chunk = ii_buffer->packed_len; + ii_buffer->packed_len += packed_len; + if (((ii_buffer->curr_size * ii_buffer->update_buffer_size) + + (ii_buffer->total_size * term_buffer->header.nterms * 16)) >= + (ii_buffer->total_size * II_BUFFER_NTERMS_PER_BUFFER * 16)) { + grn_ii_buffer_chunk_flush(ctx, ii_buffer); + } + array_unref(ii_buffer->ii, tid); + } + } +} + +grn_ii_buffer * +grn_ii_buffer_open(grn_ctx *ctx, grn_ii *ii, + long long unsigned int update_buffer_size) +{ + if (ii && ii->lexicon) { + grn_ii_buffer *ii_buffer = GRN_MALLOCN(grn_ii_buffer, 1); + if (ii_buffer) { + ii_buffer->ii = ii; + ii_buffer->lexicon = ii->lexicon; + ii_buffer->tmp_lexicon = NULL; + ii_buffer->nblocks = 0; + ii_buffer->blocks = NULL; + ii_buffer->ncounters = II_BUFFER_NCOUNTERS_MARGIN; + ii_buffer->block_pos = 0; + ii_buffer->filepos = 0; + ii_buffer->curr_size = 0; + ii_buffer->total_size = 0; + ii_buffer->update_buffer_size = update_buffer_size; + ii_buffer->counters = GRN_CALLOC(ii_buffer->ncounters * 
+ sizeof(ii_buffer_counter)); + ii_buffer->term_buffer = NULL; + ii_buffer->packed_buf = NULL; + ii_buffer->packed_len = 0; + ii_buffer->packed_buf_size = 0; + ii_buffer->total_chunk_size = 0; + ii_buffer->values = NULL; + ii_buffer->nvalues = 0; + ii_buffer->max_nvalues = 0; + ii_buffer->last_rid = 0; + if (ii_buffer->counters) { + ii_buffer->block_buf = GRN_MALLOCN(grn_id, II_BUFFER_BLOCK_SIZE); + if (ii_buffer->block_buf) { + grn_snprintf(ii_buffer->tmpfpath, PATH_MAX, PATH_MAX, + "%-.256sXXXXXX", grn_io_path(ii->seg)); + ii_buffer->block_buf_size = II_BUFFER_BLOCK_SIZE; + ii_buffer->tmpfd = grn_mkstemp(ii_buffer->tmpfpath); + if (ii_buffer->tmpfd != -1) { + grn_table_flags flags; + grn_table_get_info(ctx, ii->lexicon, &flags, NULL, NULL, NULL, + NULL); + if ((flags & GRN_OBJ_TABLE_TYPE_MASK) == GRN_OBJ_TABLE_PAT_KEY) { + grn_pat_cache_enable(ctx, (grn_pat *)ii->lexicon, + PAT_CACHE_SIZE); + } + return ii_buffer; + } else { + SERR("failed grn_mkstemp(%-.256s)", + ii_buffer->tmpfpath); + } + GRN_FREE(ii_buffer->block_buf); + } + GRN_FREE(ii_buffer->counters); + } + GRN_FREE(ii_buffer); + } + } else { + ERR(GRN_INVALID_ARGUMENT, "ii or ii->lexicon is NULL"); + } + return NULL; +} + +static void +ii_buffer_value_init(grn_ctx *ctx, ii_buffer_value *value) +{ + value->sid = 0; + value->weight = 0; + value->p = NULL; + value->len = 0; + value->buf = NULL; + value->cap = 0; +} + +static void +ii_buffer_value_fin(grn_ctx *ctx, ii_buffer_value *value) +{ + if (value->buf) { + GRN_FREE(value->buf); + } +} + +/* + * ii_buffer_values_append appends a value to ii_buffer. + * This function deep-copies the value if need_copy == GRN_TRUE. 
+ */ +static void +ii_buffer_values_append(grn_ctx *ctx, grn_ii_buffer *ii_buffer, + unsigned int sid, unsigned weight, + const char *p, uint32_t len, grn_bool need_copy) +{ + if (ii_buffer->nvalues == ii_buffer->max_nvalues) { + unsigned int i; + unsigned int new_max_nvalues = ii_buffer->max_nvalues * 2; + unsigned int new_size; + ii_buffer_value *new_values; + if (new_max_nvalues == 0) { + new_max_nvalues = 1; + } + new_size = new_max_nvalues * sizeof(ii_buffer_value); + new_values = (ii_buffer_value *)GRN_REALLOC(ii_buffer->values, new_size); + if (!new_values) { + return; + } + for (i = ii_buffer->max_nvalues; i < new_max_nvalues; i++) { + ii_buffer_value_init(ctx, &new_values[i]); + } + ii_buffer->values = new_values; + ii_buffer->max_nvalues = new_max_nvalues; + } + + { + ii_buffer_value *value = &ii_buffer->values[ii_buffer->nvalues]; + if (need_copy) { + if (len > value->cap) { + char *new_buf = (char *)GRN_REALLOC(value->buf, len); + if (!new_buf) { + return; + } + value->buf = new_buf; + value->cap = len; + } + grn_memcpy(value->buf, p, len); + p = value->buf; + } + value->sid = sid; + value->weight = weight; + value->p = p; + value->len = len; + ii_buffer->nvalues++; + } +} + +grn_rc +grn_ii_buffer_append(grn_ctx *ctx, grn_ii_buffer *ii_buffer, + grn_id rid, unsigned int sid, grn_obj *value) +{ + if (rid != ii_buffer->last_rid) { + if (ii_buffer->last_rid) { + grn_ii_buffer_tokenize(ctx, ii_buffer, ii_buffer->last_rid); + } + ii_buffer->last_rid = rid; + } + ii_buffer_values_append(ctx, ii_buffer, sid, 0, + GRN_TEXT_VALUE(value), GRN_TEXT_LEN(value), + GRN_TRUE); + return ctx->rc; +} + +/* + * grn_ii_buffer_commit completes tokenization and builds an inverted index + * from data in a temporary file. + */ +grn_rc +grn_ii_buffer_commit(grn_ctx *ctx, grn_ii_buffer *ii_buffer) +{ + /* Tokenize the remaining values and free resources. 
*/ + if (ii_buffer->last_rid && ii_buffer->nvalues) { + grn_ii_buffer_tokenize(ctx, ii_buffer, ii_buffer->last_rid); + } + if (ii_buffer->block_pos) { + grn_ii_buffer_flush(ctx, ii_buffer); + } + if (ii_buffer->tmpfd != -1) { + grn_close(ii_buffer->tmpfd); + } + if (ii_buffer->block_buf) { + GRN_FREE(ii_buffer->block_buf); + ii_buffer->block_buf = NULL; + } + if (ii_buffer->counters) { + GRN_FREE(ii_buffer->counters); + ii_buffer->counters = NULL; + } + + if (ii_buffer->update_buffer_size && + ii_buffer->update_buffer_size < 20) { + if (ii_buffer->update_buffer_size < 10) { + ii_buffer->update_buffer_size = + ii_buffer->total_size >> (10 - ii_buffer->update_buffer_size); + } else { + ii_buffer->update_buffer_size = + ii_buffer->total_size << (ii_buffer->update_buffer_size - 10); + } + } + + GRN_LOG(ctx, GRN_LOG_DEBUG, + "nblocks=%d, update_buffer_size=%" GRN_FMT_INT64U, + ii_buffer->nblocks, ii_buffer->update_buffer_size); + + datavec_init(ctx, ii_buffer->data_vectors, ii_buffer->ii->n_elements, 0, 0); + grn_open(ii_buffer->tmpfd, + ii_buffer->tmpfpath, + O_RDONLY | GRN_OPEN_FLAG_BINARY); + if (ii_buffer->tmpfd == -1) { + ERRNO_ERR("failed to open path: <%-.256s>", ii_buffer->tmpfpath); + return ctx->rc; + } + { + /* Fetch the first term of each block. */ + uint32_t i; + for (i = 0; i < ii_buffer->nblocks; i++) { + grn_ii_buffer_fetch(ctx, ii_buffer, &ii_buffer->blocks[i]); + } + } + { + ii_buffer_block **hits; + if ((hits = GRN_MALLOCN(ii_buffer_block *, ii_buffer->nblocks))) { + grn_id tid; + grn_table_cursor *tc; + tc = grn_table_cursor_open(ctx, ii_buffer->lexicon, + NULL, 0, NULL, 0, 0, -1, II_BUFFER_ORDER); + if (tc) { + while ((tid = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) { + /* + * Find blocks which contain the current term. + * Then, merge the postings. 
+ */ + int nrests = 0; + int nhits = 0; + uint32_t i; + for (i = 0; i < ii_buffer->nblocks; i++) { + if (ii_buffer->blocks[i].tid == tid) { + hits[nhits++] = &ii_buffer->blocks[i]; + } + if (ii_buffer->blocks[i].tid) { nrests++; } + } + if (nhits) { grn_ii_buffer_merge(ctx, ii_buffer, tid, hits, nhits); } + if (!nrests) { break; } + } + if (ii_buffer->packed_len) { + grn_ii_buffer_chunk_flush(ctx, ii_buffer); + } + grn_table_cursor_close(ctx, tc); + } + GRN_FREE(hits); + } + } + datavec_fin(ctx, ii_buffer->data_vectors); + GRN_LOG(ctx, GRN_LOG_DEBUG, + "tmpfile_size:%" GRN_FMT_INT64D " > total_chunk_size:%" GRN_FMT_SIZE, + ii_buffer->filepos, ii_buffer->total_chunk_size); + grn_close(ii_buffer->tmpfd); + if (grn_unlink(ii_buffer->tmpfpath) == 0) { + GRN_LOG(ctx, GRN_LOG_INFO, + "[ii][buffer][commit] removed temporary path: <%-.256s>", + ii_buffer->tmpfpath); + } else { + ERRNO_ERR("[ii][buffer][commit] failed to remove temporary path: <%-.256s>", + ii_buffer->tmpfpath); + } + ii_buffer->tmpfd = -1; + return ctx->rc; +} + +grn_rc +grn_ii_buffer_close(grn_ctx *ctx, grn_ii_buffer *ii_buffer) +{ + uint32_t i; + grn_table_flags flags; + grn_table_get_info(ctx, ii_buffer->ii->lexicon, &flags, NULL, NULL, NULL, + NULL); + if ((flags & GRN_OBJ_TABLE_TYPE_MASK) == GRN_OBJ_TABLE_PAT_KEY) { + grn_pat_cache_disable(ctx, (grn_pat *)ii_buffer->ii->lexicon); + } + if (ii_buffer->tmp_lexicon) { + grn_obj_close(ctx, ii_buffer->tmp_lexicon); + } + if (ii_buffer->tmpfd != -1) { + grn_close(ii_buffer->tmpfd); + if (grn_unlink(ii_buffer->tmpfpath) == 0) { + GRN_LOG(ctx, GRN_LOG_INFO, + "[ii][buffer][close] removed temporary path: <%-.256s>", + ii_buffer->tmpfpath); + } else { + ERRNO_ERR("[ii][buffer][close] failed to remove temporary path: <%-.256s>", + ii_buffer->tmpfpath); + } + } + if (ii_buffer->block_buf) { + GRN_FREE(ii_buffer->block_buf); + } + if (ii_buffer->counters) { + GRN_FREE(ii_buffer->counters); + } + if (ii_buffer->blocks) { + for (i = 0; i < ii_buffer->nblocks; i++) { 
+ if (ii_buffer->blocks[i].buffer) { + GRN_FREE(ii_buffer->blocks[i].buffer); + } + } + GRN_FREE(ii_buffer->blocks); + } + if (ii_buffer->values) { + for (i = 0; i < ii_buffer->max_nvalues; i++) { + ii_buffer_value_fin(ctx, &ii_buffer->values[i]); + } + GRN_FREE(ii_buffer->values); + } + GRN_FREE(ii_buffer); + return ctx->rc; +} + +/* + * grn_ii_buffer_parse tokenizes values to be indexed. + * + * For each record of the target table, grn_ii_buffer_parse makes a list of + * target values and calls grn_ii_buffer_tokenize. To make a list of target + * values, ii_buffer_values_append is called for each value. Note that + * ii_buffer_values_append is called for each element for a vector. + */ +static void +grn_ii_buffer_parse(grn_ctx *ctx, grn_ii_buffer *ii_buffer, + grn_obj *target, int ncols, grn_obj **cols) +{ + grn_table_cursor *tc; + grn_obj *vobjs; + if ((vobjs = GRN_MALLOCN(grn_obj, ncols))) { + int i; + for (i = 0; i < ncols; i++) { + GRN_TEXT_INIT(&vobjs[i], 0); + } + if ((tc = grn_table_cursor_open(ctx, target, + NULL, 0, NULL, 0, 0, -1, + GRN_CURSOR_BY_ID))) { + grn_id rid; + while ((rid = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) { + unsigned int j; + int sid; + grn_obj **col; + for (sid = 1, col = cols; sid <= ncols; sid++, col++) { + grn_obj *rv = &vobjs[sid - 1]; + grn_obj_reinit_for(ctx, rv, *col); + if (GRN_OBJ_TABLEP(*col)) { + grn_table_get_key2(ctx, *col, rid, rv); + } else { + grn_obj_get_value(ctx, *col, rid, rv); + } + switch (rv->header.type) { + case GRN_BULK : + ii_buffer_values_append(ctx, ii_buffer, sid, 0, + GRN_TEXT_VALUE(rv), GRN_TEXT_LEN(rv), + GRN_FALSE); + break; + case GRN_UVECTOR : + { + unsigned int size; + unsigned int elem_size; + size = grn_uvector_size(ctx, rv); + elem_size = grn_uvector_element_size(ctx, rv); + for (j = 0; j < size; j++) { + ii_buffer_values_append(ctx, ii_buffer, sid, 0, + GRN_BULK_HEAD(rv) + (elem_size * j), + elem_size, GRN_FALSE); + } + } + break; + case GRN_VECTOR : + if (rv->u.v.body) { + int j; + int 
n_sections = rv->u.v.n_sections; + grn_section *sections = rv->u.v.sections; + const char *head = GRN_BULK_HEAD(rv->u.v.body); + for (j = 0; j < n_sections; j++) { + grn_section *section = sections + j; + if (section->length == 0) { + continue; + } + ii_buffer_values_append(ctx, ii_buffer, sid, section->weight, + head + section->offset, + section->length, GRN_FALSE); + } + } + break; + default : + ERR(GRN_INVALID_ARGUMENT, + "[index] invalid object assigned as value"); + break; + } + } + grn_ii_buffer_tokenize(ctx, ii_buffer, rid); + } + grn_table_cursor_close(ctx, tc); + } + for (i = 0; i < ncols; i++) { + GRN_OBJ_FIN(ctx, &vobjs[i]); + } + GRN_FREE(vobjs); + } +} + +grn_rc +grn_ii_build(grn_ctx *ctx, grn_ii *ii, uint64_t sparsity) +{ + grn_ii_buffer *ii_buffer; + + { + /* Do nothing if there are no targets. */ + grn_obj *data_table = grn_ctx_at(ctx, DB_OBJ(ii)->range); + if (!data_table) { + return ctx->rc; + } + if (grn_table_size(ctx, data_table) == 0) { + return ctx->rc; + } + } + + ii_buffer = grn_ii_buffer_open(ctx, ii, sparsity); + if (ii_buffer) { + grn_id *source = (grn_id *)ii->obj.source; + if (ii->obj.source_size && ii->obj.source) { + int ncols = ii->obj.source_size / sizeof(grn_id); + grn_obj **cols = GRN_MALLOCN(grn_obj *, ncols); + if (cols) { + int i; + for (i = 0; i < ncols; i++) { + if (!(cols[i] = grn_ctx_at(ctx, source[i]))) { break; } + } + if (i == ncols) { /* All the source columns are available. 
*/ + grn_obj *target = cols[0]; + if (!GRN_OBJ_TABLEP(target)) { + target = grn_ctx_at(ctx, target->header.domain); + } + if (target) { + grn_ii_buffer_parse(ctx, ii_buffer, target, ncols, cols); + grn_ii_buffer_commit(ctx, ii_buffer); + } else { + ERR(GRN_INVALID_ARGUMENT, "failed to resolve the target"); + } + } else { + ERR(GRN_INVALID_ARGUMENT, "failed to resolve a column (%d)", i); + } + GRN_FREE(cols); + } + } else { + ERR(GRN_INVALID_ARGUMENT, "ii->obj.source is void"); + } + grn_ii_buffer_close(ctx, ii_buffer); + } + return ctx->rc; +} + +/* + * ========================================================================== + * The following part provides constants, structures and functions for static + * indexing. + * ========================================================================== + */ + +#define GRN_II_BUILDER_BUFFER_CHUNK_SIZE (S_CHUNK >> 2) + +#define GRN_II_BUILDER_MAX_LEXICON_CACHE_SIZE (1 << 24) + +#define GRN_II_BUILDER_MIN_BLOCK_THRESHOLD 1 +#define GRN_II_BUILDER_MAX_BLOCK_THRESHOLD (1 << 28) + +#define GRN_II_BUILDER_MIN_FILE_BUF_SIZE (1 << 12) +#define GRN_II_BUILDER_MAX_FILE_BUF_SIZE (1 << 30) + +#define GRN_II_BUILDER_MIN_BLOCK_BUF_SIZE (1 << 12) +#define GRN_II_BUILDER_MAX_BLOCK_BUF_SIZE (1 << 30) + +#define GRN_II_BUILDER_MIN_CHUNK_THRESHOLD 1 +#define GRN_II_BUILDER_MAX_CHUNK_THRESHOLD (1 << 28) + +#define GRN_II_BUILDER_MIN_BUFFER_MAX_N_TERMS 1 +#define GRN_II_BUILDER_MAX_BUFFER_MAX_N_TERMS \ + ((S_SEGMENT - sizeof(buffer_header)) / sizeof(buffer_term)) + +struct grn_ii_builder_options { + uint32_t lexicon_cache_size; /* Cache size of temporary lexicon */ + /* A block is flushed if builder->n reaches this value. */ + uint32_t block_threshold; + uint32_t file_buf_size; /* Buffer size for buffered output */ + uint32_t block_buf_size; /* Buffer size for buffered input */ + /* A chunk is flushed if chunk->n reaches this value. 
*/ + uint32_t chunk_threshold; + uint32_t buffer_max_n_terms; /* Maximum number of terms in each buffer */ +}; + +static const grn_ii_builder_options grn_ii_builder_default_options = { + 0x80000, /* lexicon_cache_size */ + 0x4000000, /* block_threshold */ + 0x10000, /* file_buf_size */ + 0x10000, /* block_buf_size */ + 0x1000, /* chunk_threshold */ + 0x3000, /* buffer_max_n_terms */ +}; + +/* grn_ii_builder_options_init fills options with the default options. */ +void +grn_ii_builder_options_init(grn_ii_builder_options *options) +{ + *options = grn_ii_builder_default_options; +} + +/* grn_ii_builder_options_fix fixes out-of-range options. */ +static void +grn_ii_builder_options_fix(grn_ii_builder_options *options) +{ + if (options->lexicon_cache_size > GRN_II_BUILDER_MAX_LEXICON_CACHE_SIZE) { + options->lexicon_cache_size = GRN_II_BUILDER_MAX_LEXICON_CACHE_SIZE; + } + + if (options->block_threshold < GRN_II_BUILDER_MIN_BLOCK_THRESHOLD) { + options->block_threshold = GRN_II_BUILDER_MIN_BLOCK_THRESHOLD; + } + if (options->block_threshold > GRN_II_BUILDER_MAX_BLOCK_THRESHOLD) { + options->block_threshold = GRN_II_BUILDER_MAX_BLOCK_THRESHOLD; + } + + if (options->file_buf_size < GRN_II_BUILDER_MIN_FILE_BUF_SIZE) { + options->file_buf_size = GRN_II_BUILDER_MIN_FILE_BUF_SIZE; + } + if (options->file_buf_size > GRN_II_BUILDER_MAX_FILE_BUF_SIZE) { + options->file_buf_size = GRN_II_BUILDER_MAX_FILE_BUF_SIZE; + } + + if (options->block_buf_size < GRN_II_BUILDER_MIN_BLOCK_BUF_SIZE) { + options->block_buf_size = GRN_II_BUILDER_MIN_BLOCK_BUF_SIZE; + } + if (options->block_buf_size > GRN_II_BUILDER_MAX_BLOCK_BUF_SIZE) { + options->block_buf_size = GRN_II_BUILDER_MAX_BLOCK_BUF_SIZE; + } + + if (options->chunk_threshold < GRN_II_BUILDER_MIN_CHUNK_THRESHOLD) { + options->chunk_threshold = GRN_II_BUILDER_MIN_CHUNK_THRESHOLD; + } + if (options->chunk_threshold > GRN_II_BUILDER_MAX_CHUNK_THRESHOLD) { + options->chunk_threshold = GRN_II_BUILDER_MAX_CHUNK_THRESHOLD; + } + + if 
(options->buffer_max_n_terms < GRN_II_BUILDER_MIN_BUFFER_MAX_N_TERMS) { + options->buffer_max_n_terms = GRN_II_BUILDER_MIN_BUFFER_MAX_N_TERMS; + } + if (options->buffer_max_n_terms > GRN_II_BUILDER_MAX_BUFFER_MAX_N_TERMS) { + options->buffer_max_n_terms = GRN_II_BUILDER_MAX_BUFFER_MAX_N_TERMS; + } +} + +#define GRN_II_BUILDER_TERM_INPLACE_SIZE\ + (sizeof(grn_ii_builder_term) - (uintptr_t)&((grn_ii_builder_term *)0)->dummy) + +typedef struct { + grn_id rid; /* Last record ID */ + uint32_t sid; /* Last section ID */ + /* Last position (GRN_OBJ_WITH_POSITION) or frequency. */ + uint32_t pos_or_freq; + uint32_t offset; /* Buffer write offset */ + uint32_t size; /* Buffer size */ + uint32_t dummy; /* Padding */ + uint8_t *buf; /* Buffer (to be freed) */ +} grn_ii_builder_term; + +/* grn_ii_builder_term_is_inplace returns whether a term buffer is inplace. */ +inline static grn_bool +grn_ii_builder_term_is_inplace(grn_ii_builder_term *term) +{ + return term->size == GRN_II_BUILDER_TERM_INPLACE_SIZE; +} + +/* grn_ii_builder_term_get_buf returns a term buffer. */ +inline static uint8_t * +grn_ii_builder_term_get_buf(grn_ii_builder_term *term) +{ + if (grn_ii_builder_term_is_inplace(term)) { + return (uint8_t *)&term->dummy; + } else { + return term->buf; + } +} + +/* + * grn_ii_builder_term_init initializes a term. Note that an initialized term + * must be finalized by grn_ii_builder_term_fin. + */ +static void +grn_ii_builder_term_init(grn_ctx *ctx, grn_ii_builder_term *term) +{ + term->rid = GRN_ID_NIL; + term->sid = 0; + term->pos_or_freq = 0; + term->offset = 0; + term->size = GRN_II_BUILDER_TERM_INPLACE_SIZE; +} + +/* grn_ii_builder_term_fin finalizes a term. */ +static void +grn_ii_builder_term_fin(grn_ctx *ctx, grn_ii_builder_term *term) +{ + if (!grn_ii_builder_term_is_inplace(term)) { + GRN_FREE(term->buf); + } +} + +/* grn_ii_builder_term_reinit reinitializes a term. 
*/ +static void +grn_ii_builder_term_reinit(grn_ctx *ctx, grn_ii_builder_term *term) +{ + grn_ii_builder_term_fin(ctx, term); + grn_ii_builder_term_init(ctx, term); +} + +/* grn_ii_builder_term_extend extends a term buffer. */ +static grn_rc +grn_ii_builder_term_extend(grn_ctx *ctx, grn_ii_builder_term *term) +{ + uint8_t *buf; + uint32_t size = term->size * 2; + if (grn_ii_builder_term_is_inplace(term)) { + buf = (uint8_t *)GRN_MALLOC(size); + if (!buf) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for term buffer: size = %u", size); + return ctx->rc; + } + grn_memcpy(buf, &term->dummy, term->offset); + } else { + buf = (uint8_t *)GRN_REALLOC(term->buf, size); + if (!buf) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to reallocate memory for term buffer: size = %u", size); + return ctx->rc; + } + } + term->buf = buf; + term->size = size; + return GRN_SUCCESS; +} + +/* grn_ii_builder_term_append appends an integer to a term buffer. */ +inline static grn_rc +grn_ii_builder_term_append(grn_ctx *ctx, grn_ii_builder_term *term, + uint64_t value) +{ + uint8_t *p; + if (value < (uint64_t)1 << 5) { + if (term->offset + 1 > term->size) { + grn_rc rc = grn_ii_builder_term_extend(ctx, term); + if (rc != GRN_SUCCESS) { + return rc; + } + } + p = grn_ii_builder_term_get_buf(term) + term->offset; + p[0] = (uint8_t)value; + term->offset++; + return GRN_SUCCESS; + } else if (value < (uint64_t)1 << 13) { + if (term->offset + 2 > term->size) { + grn_rc rc = grn_ii_builder_term_extend(ctx, term); + if (rc != GRN_SUCCESS) { + return rc; + } + } + p = grn_ii_builder_term_get_buf(term) + term->offset; + p[0] = (uint8_t)((value & 0x1f) | (1 << 5)); + p[1] = (uint8_t)(value >> 5); + term->offset += 2; + return GRN_SUCCESS; + } else { + uint8_t i, n; + if (value < (uint64_t)1 << 21) { + n = 3; + } else if (value < (uint64_t)1 << 29) { + n = 4; + } else if (value < (uint64_t)1 << 37) { + n = 5; + } else if (value < (uint64_t)1 << 45) { + n = 6; + } else if (value < (uint64_t)1 
<< 53) { + n = 7; + } else { + n = 8; + } + if (term->offset + n > term->size) { + grn_rc rc = grn_ii_builder_term_extend(ctx, term); + if (rc != GRN_SUCCESS) { + return rc; + } + } + p = grn_ii_builder_term_get_buf(term) + term->offset; + p[0] = (uint8_t)(value & 0x1f) | ((n - 1) << 5); + value >>= 5; + for (i = 1; i < n; i++) { + p[i] = (uint8_t)value; + value >>= 8; + } + term->offset += n; + return GRN_SUCCESS; + } +} + +typedef struct { + uint64_t offset; /* File offset */ + uint32_t rest; /* Remaining size */ + uint8_t *buf; /* Buffer (to be freed) */ + uint8_t *cur; /* Current pointer */ + uint8_t *end; /* End pointer */ + uint32_t tid; /* Term ID */ +} grn_ii_builder_block; + +/* + * grn_ii_builder_block_init initializes a block. Note that an initialized + * block must be finalized by grn_ii_builder_block_fin. + */ +static void +grn_ii_builder_block_init(grn_ctx *ctx, grn_ii_builder_block *block) +{ + block->offset = 0; + block->rest = 0; + block->buf = NULL; + block->cur = NULL; + block->end = NULL; + block->tid = GRN_ID_NIL; +} + +/* grn_ii_builder_block_fin finalizes a block. */ +static void +grn_ii_builder_block_fin(grn_ctx *ctx, grn_ii_builder_block *block) +{ + if (block->buf) { + GRN_FREE(block->buf); + } +} + +/* + * grn_ii_builder_block_next reads the next integer. Note that this function + * returns GRN_END_OF_DATA if it reaches the end of a block. 
+ */ +inline static grn_rc +grn_ii_builder_block_next(grn_ctx *ctx, grn_ii_builder_block *block, + uint64_t *value) +{ + uint8_t n; + if (block->cur == block->end) { + return GRN_END_OF_DATA; + } + n = (*block->cur >> 5) + 1; + if (n > block->end - block->cur) { + return GRN_END_OF_DATA; + } + *value = 0; + switch (n) { + case 8 : + *value |= (uint64_t)block->cur[7] << 53; + case 7 : + *value |= (uint64_t)block->cur[6] << 45; + case 6 : + *value |= (uint64_t)block->cur[5] << 37; + case 5 : + *value |= (uint64_t)block->cur[4] << 29; + case 4 : + *value |= (uint64_t)block->cur[3] << 21; + case 3 : + *value |= (uint64_t)block->cur[2] << 13; + case 2 : + *value |= (uint64_t)block->cur[1] << 5; + case 1 : + *value |= block->cur[0] & 0x1f; + break; + } + block->cur += n; + return GRN_SUCCESS; +} + +typedef struct { + grn_ii *ii; /* Inverted index */ + uint32_t buf_id; /* Buffer ID */ + uint32_t buf_seg_id; /* Buffer segment ID */ + buffer *buf; /* Buffer (to be unreferenced) */ + uint32_t chunk_id; /* Chunk ID */ + uint32_t chunk_seg_id; /* Chunk segment ID */ + uint8_t *chunk; /* Chunk (to be unreferenced) */ + uint32_t chunk_offset; /* Chunk write position */ + uint32_t chunk_size; /* Chunk size */ +} grn_ii_builder_buffer; + +/* + * grn_ii_builder_buffer_init initializes a buffer. Note that a buffer must be + * finalized by grn_ii_builder_buffer_fin. + */ +static void +grn_ii_builder_buffer_init(grn_ctx *ctx, grn_ii_builder_buffer *buf, + grn_ii *ii) +{ + buf->ii = ii; + buf->buf_id = 0; + buf->buf_seg_id = 0; + buf->buf = NULL; + buf->chunk_id = 0; + buf->chunk_seg_id = 0; + buf->chunk = NULL; + buf->chunk_offset = 0; + buf->chunk_size = 0; +} + +/* grn_ii_builder_buffer_fin finalizes a buffer. 
*/ +static void +grn_ii_builder_buffer_fin(grn_ctx *ctx, grn_ii_builder_buffer *buf) +{ + if (buf->buf) { + GRN_IO_SEG_UNREF(buf->ii->seg, buf->buf_seg_id); + } + if (buf->chunk) { + GRN_IO_SEG_UNREF(buf->ii->chunk, buf->chunk_seg_id); + } +} + +/* grn_ii_builder_buffer_is_assigned returns whether a buffer is assigned. */ +static grn_bool +grn_ii_builder_buffer_is_assigned(grn_ctx *ctx, grn_ii_builder_buffer *buf) +{ + return buf->buf != NULL; +} + +/* grn_ii_builder_buffer_assign assigns a buffer. */ +static grn_rc +grn_ii_builder_buffer_assign(grn_ctx *ctx, grn_ii_builder_buffer *buf, + size_t min_chunk_size) +{ + void *seg; + size_t chunk_size; + grn_rc rc; + + /* Create a buffer. */ + buf->buf_id = GRN_II_PSEG_NOT_ASSIGNED; + rc = buffer_segment_new(ctx, buf->ii, &buf->buf_id); + if (rc != GRN_SUCCESS) { + if (ctx->rc != GRN_SUCCESS) { + ERR(rc, "failed to allocate segment for buffer"); + } + return rc; + } + buf->buf_seg_id = buf->ii->header->binfo[buf->buf_id]; + GRN_IO_SEG_REF(buf->ii->seg, buf->buf_seg_id, seg); + if (!seg) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_UNKNOWN_ERROR, + "failed access buffer segment: buf_id = %u, seg_id = %u", + buf->buf_id, buf->buf_seg_id); + } + return ctx->rc; + } + buf->buf = (buffer *)seg; + + /* Create a chunk. 
*/ + chunk_size = GRN_II_BUILDER_BUFFER_CHUNK_SIZE; + while (chunk_size < min_chunk_size) { + chunk_size *= 2; + } + rc = chunk_new(ctx, buf->ii, &buf->chunk_id, chunk_size); + if (rc != GRN_SUCCESS) { + return rc; + } + buf->chunk_seg_id = buf->chunk_id >> GRN_II_N_CHUNK_VARIATION; + GRN_IO_SEG_REF(buf->ii->chunk, buf->chunk_seg_id, seg); + if (!seg) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_UNKNOWN_ERROR, + "failed access chunk segment: chunk_id = %u, seg_id = %u", + buf->chunk_id, buf->chunk_seg_id); + } + return ctx->rc; + } + buf->chunk = (uint8_t *)seg; + buf->chunk += (buf->chunk_id & ((1 << GRN_II_N_CHUNK_VARIATION) - 1)) << + GRN_II_W_LEAST_CHUNK; + buf->chunk_offset = 0; + buf->chunk_size = chunk_size; + + buf->buf->header.chunk = buf->chunk_id; + buf->buf->header.chunk_size = chunk_size; + buf->buf->header.buffer_free = S_SEGMENT - sizeof(buffer_header); + buf->buf->header.nterms = 0; + buf->buf->header.nterms_void = 0; + buf->ii->header->total_chunk_size += chunk_size; + return GRN_SUCCESS; +} + +/* grn_ii_builder_buffer_flush flushes a buffer. 
*/ +static grn_rc +grn_ii_builder_buffer_flush(grn_ctx *ctx, grn_ii_builder_buffer *buf) +{ + grn_ii *ii; + + buf->buf->header.buffer_free = S_SEGMENT - sizeof(buffer_header) - + buf->buf->header.nterms * sizeof(buffer_term); + GRN_LOG(ctx, GRN_LOG_DEBUG, + "n_terms = %u, chunk_offset = %u, chunk_size = %u, total = %" + GRN_FMT_INT64U "KB", + buf->buf->header.nterms, + buf->chunk_offset, + buf->buf->header.chunk_size, + buf->ii->header->total_chunk_size >> 10); + + ii = buf->ii; + grn_ii_builder_buffer_fin(ctx, buf); + grn_ii_builder_buffer_init(ctx, buf, ii); + return GRN_SUCCESS; +} + +typedef struct { + grn_id tid; /* Term ID */ + uint32_t n; /* Number of integers in buffers */ + grn_id rid; /* Record ID */ + uint32_t rid_gap; /* Record ID gap */ + uint64_t pos_sum; /* Sum of position gaps */ + + uint32_t offset; /* Write offset */ + uint32_t size; /* Buffer size */ + grn_id *rid_buf; /* Buffer for record IDs (to be freed) */ + uint32_t *sid_buf; /* Buffer for section IDs (to be freed) */ + uint32_t *freq_buf; /* Buffer for frequencies (to be freed) */ + uint32_t *weight_buf; /* Buffer for weights (to be freed) */ + + uint32_t pos_offset; /* Write offset of pos_buf */ + uint32_t pos_size; /* Buffer size of pos_buf */ + uint32_t *pos_buf; /* Buffer for positions (to be freed) */ + + size_t enc_offset; /* Write offset of enc_buf */ + size_t enc_size; /* Buffer size of enc_buf */ + uint8_t *enc_buf; /* Buffer for encoded data (to be freed) */ +} grn_ii_builder_chunk; + +/* + * grn_ii_builder_chunk_init initializes a chunk. Note that an initialized + * chunk must be finalized by grn_ii_builder_chunk_fin. 
+ */ +static void +grn_ii_builder_chunk_init(grn_ctx *ctx, grn_ii_builder_chunk *chunk) +{ + chunk->tid = GRN_ID_NIL; + chunk->n = 0; + chunk->rid = GRN_ID_NIL; + chunk->rid_gap = 0; + chunk->pos_sum = 0; + + chunk->offset = 0; + chunk->size = 0; + chunk->rid_buf = NULL; + chunk->sid_buf = NULL; + chunk->freq_buf = NULL; + chunk->weight_buf = NULL; + + chunk->pos_offset = 0; + chunk->pos_size = 0; + chunk->pos_buf = NULL; + + chunk->enc_offset = 0; + chunk->enc_size = 0; + chunk->enc_buf = NULL; +} + +/* grn_ii_builder_chunk_fin finalizes a chunk. */ +static void +grn_ii_builder_chunk_fin(grn_ctx *ctx, grn_ii_builder_chunk *chunk) +{ + if (chunk->enc_buf) { + GRN_FREE(chunk->enc_buf); + } + if (chunk->pos_buf) { + GRN_FREE(chunk->pos_buf); + } + if (chunk->weight_buf) { + GRN_FREE(chunk->weight_buf); + } + if (chunk->freq_buf) { + GRN_FREE(chunk->freq_buf); + } + if (chunk->sid_buf) { + GRN_FREE(chunk->sid_buf); + } + if (chunk->rid_buf) { + GRN_FREE(chunk->rid_buf); + } +} + +/* + * grn_ii_builder_chunk_clear clears stats except rid and buffers except + * enc_buf. + */ +static void +grn_ii_builder_chunk_clear(grn_ctx *ctx, grn_ii_builder_chunk *chunk) +{ + chunk->n = 0; + chunk->rid_gap = 0; + chunk->pos_sum = 0; + chunk->offset = 0; + chunk->pos_offset = 0; +} + +/* + * grn_ii_builder_chunk_extend_bufs extends buffers except pos_buf and enc_buf. + */ +static grn_rc +grn_ii_builder_chunk_extend_bufs(grn_ctx *ctx, grn_ii_builder_chunk *chunk, + uint32_t ii_flags) +{ + uint32_t *buf, size = chunk->size ? 
chunk->size * 2 : 1; + size_t n_bytes = size * sizeof(uint32_t); + + buf = (uint32_t *)GRN_REALLOC(chunk->rid_buf, n_bytes); + if (!buf) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for record IDs: n_bytes = %" GRN_FMT_SIZE, + n_bytes); + return ctx->rc; + } + chunk->rid_buf = buf; + + if (ii_flags & GRN_OBJ_WITH_SECTION) { + buf = (uint32_t *)GRN_REALLOC(chunk->sid_buf, n_bytes); + if (!buf) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for section IDs:" + " n_bytes = %" GRN_FMT_SIZE, + n_bytes); + return ctx->rc; + } + chunk->sid_buf = buf; + } + + buf = (uint32_t *)GRN_REALLOC(chunk->freq_buf, n_bytes); + if (!buf) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for frequencies: n_bytes = %" GRN_FMT_SIZE, + n_bytes); + return ctx->rc; + } + chunk->freq_buf = buf; + + if (ii_flags & GRN_OBJ_WITH_WEIGHT) { + buf = (uint32_t *)GRN_REALLOC(chunk->weight_buf, n_bytes); + if (!buf) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for weights: n_bytes = %" GRN_FMT_SIZE, + n_bytes); + return ctx->rc; + } + chunk->weight_buf = buf; + } + + chunk->size = size; + return GRN_SUCCESS; +} + +/* grn_ii_builder_chunk_extend_pos_buf extends pos_buf. */ +static grn_rc +grn_ii_builder_chunk_extend_pos_buf(grn_ctx *ctx, grn_ii_builder_chunk *chunk) +{ + uint32_t *buf, size = chunk->pos_size ? chunk->pos_size * 2 : 1; + size_t n_bytes = size * sizeof(uint32_t); + buf = (uint32_t *)GRN_REALLOC(chunk->pos_buf, n_bytes); + if (!buf) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for positions: n_bytes = %" GRN_FMT_SIZE, + n_bytes); + return ctx->rc; + } + chunk->pos_buf = buf; + chunk->pos_size = size; + return GRN_SUCCESS; +} + +/* + * grn_ii_builder_chunk_reserve_enc_buf estimates a size that is enough to + * store encoded data and allocates memory to enc_buf. 
+ */ +static grn_rc +grn_ii_builder_chunk_reserve_enc_buf(grn_ctx *ctx, grn_ii_builder_chunk *chunk, + uint32_t n_cinfos) +{ + size_t rich_size = (chunk->n + 4) * sizeof(uint32_t) + + n_cinfos * sizeof(chunk_info); + if (chunk->enc_size < rich_size) { + size_t size = chunk->enc_size ? chunk->enc_size * 2 : 1; + uint8_t *buf; + while (size < rich_size) { + size *= 2; + } + buf = GRN_REALLOC(chunk->enc_buf, size); + if (!buf) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for encoding: size = %" GRN_FMT_SIZE, + size); + return ctx->rc; + } + chunk->enc_buf = buf; + chunk->enc_size = size; + } + chunk->enc_offset = 0; + return GRN_SUCCESS; +} + +/* grn_ii_builder_chunk_encode encodes a chunk buffer. */ +static void +grn_ii_builder_chunk_encode_buf(grn_ctx *ctx, grn_ii_builder_chunk *chunk, + uint32_t *values, uint32_t n_values, + grn_bool use_p_enc) +{ + uint8_t *p = chunk->enc_buf + chunk->enc_offset; + uint32_t i; + if (use_p_enc) { + uint8_t freq[33]; + uint32_t buf[UNIT_SIZE]; + while (n_values >= UNIT_SIZE) { + memset(freq, 0, 33); + for (i = 0; i < UNIT_SIZE; i++) { + buf[i] = values[i]; + if (buf[i]) { + uint32_t w; + GRN_BIT_SCAN_REV(buf[i], w); + freq[w + 1]++; + } else { + freq[0]++; + } + } + p = pack(buf, UNIT_SIZE, freq, p); + values += UNIT_SIZE; + n_values -= UNIT_SIZE; + } + if (n_values) { + memset(freq, 0, 33); + for (i = 0; i < n_values; i++) { + buf[i] = values[i]; + if (buf[i]) { + uint32_t w; + GRN_BIT_SCAN_REV(buf[i], w); + freq[w + 1]++; + } else { + freq[0]++; + } + } + p = pack(buf, n_values, freq, p); + } + } else { + for (i = 0; i < n_values; i++) { + GRN_B_ENC(values[i], p); + } + } + chunk->enc_offset = p - chunk->enc_buf; +} + +/* grn_ii_builder_chunk_encode encodes a chunk. 
*/ +static grn_rc +grn_ii_builder_chunk_encode(grn_ctx *ctx, grn_ii_builder_chunk *chunk, + chunk_info *cinfos, uint32_t n_cinfos) +{ + grn_rc rc; + uint8_t *p; + uint8_t shift = 0, use_p_enc_flags = 0; + uint8_t rid_use_p_enc, rest_use_p_enc, pos_use_p_enc = 0; + + /* Choose an encoding. */ + rid_use_p_enc = chunk->offset >= 16 && chunk->offset > (chunk->rid >> 8); + use_p_enc_flags |= rid_use_p_enc << shift++; + rest_use_p_enc = chunk->offset >= 3; + if (chunk->sid_buf) { + use_p_enc_flags |= rest_use_p_enc << shift++; + } + use_p_enc_flags |= rest_use_p_enc << shift++; + if (chunk->weight_buf) { + use_p_enc_flags |= rest_use_p_enc << shift++; + } + if (chunk->pos_buf) { + pos_use_p_enc = chunk->pos_offset >= 32 && + chunk->pos_offset > (chunk->pos_sum >> 13); + use_p_enc_flags |= pos_use_p_enc << shift++; + } + + rc = grn_ii_builder_chunk_reserve_enc_buf(ctx, chunk, n_cinfos); + if (rc != GRN_SUCCESS) { + return rc; + } + + /* Encode a header. */ + p = chunk->enc_buf; + if (n_cinfos) { + uint32_t i; + GRN_B_ENC(n_cinfos, p); + for (i = 0; i < n_cinfos; i++) { + GRN_B_ENC(cinfos[i].segno, p); + GRN_B_ENC(cinfos[i].size, p); + GRN_B_ENC(cinfos[i].dgap, p); + } + } + if (use_p_enc_flags) { + GRN_B_ENC(use_p_enc_flags << 1, p); + GRN_B_ENC(chunk->offset, p); + if (chunk->pos_buf) { + GRN_B_ENC(chunk->pos_offset - chunk->offset, p); + } + } else { + GRN_B_ENC((chunk->offset << 1) | 1, p); + } + chunk->enc_offset = p - chunk->enc_buf; + + /* Encode a body. 
*/ + grn_ii_builder_chunk_encode_buf(ctx, chunk, chunk->rid_buf, chunk->offset, + rid_use_p_enc); + if (chunk->sid_buf) { + grn_ii_builder_chunk_encode_buf(ctx, chunk, chunk->sid_buf, chunk->offset, + rest_use_p_enc); + } + grn_ii_builder_chunk_encode_buf(ctx, chunk, chunk->freq_buf, chunk->offset, + rest_use_p_enc); + if (chunk->weight_buf) { + grn_ii_builder_chunk_encode_buf(ctx, chunk, chunk->weight_buf, + chunk->offset, rest_use_p_enc); + } + if (chunk->pos_buf) { + grn_ii_builder_chunk_encode_buf(ctx, chunk, chunk->pos_buf, + chunk->pos_offset, pos_use_p_enc); + } + + return GRN_SUCCESS; +} + +typedef struct { + grn_ii *ii; /* Building inverted index */ + grn_ii_builder_options options; /* Options */ + + grn_obj *src_table; /* Source table */ + grn_obj **srcs; /* Source columns (to be freed) */ + uint32_t n_srcs; /* Number of source columns */ + uint8_t sid_bits; /* Number of bits for section ID */ + uint64_t sid_mask; /* Mask bits for section ID */ + + grn_obj *lexicon; /* Block lexicon (to be closed) */ + grn_obj *tokenizer; /* Lexicon's tokenizer */ + grn_obj *normalizer; /* Lexicon's normalzier */ + + uint32_t n; /* Number of integers appended to the current block */ + grn_id rid; /* Record ID */ + uint32_t sid; /* Section ID */ + uint32_t pos; /* Position */ + + grn_ii_builder_term *terms; /* Terms (to be freed) */ + uint32_t n_terms; /* Number of distinct terms */ + uint32_t max_n_terms; /* Maximum number of distinct terms */ + uint32_t terms_size; /* Buffer size of terms */ + + /* A temporary file to save blocks. 
*/ + char path[PATH_MAX]; /* File path */ + int fd; /* File descriptor (to be closed) */ + uint8_t *file_buf; /* File buffer for buffered output (to be freed) */ + uint32_t file_buf_offset; /* File buffer write offset */ + + grn_ii_builder_block *blocks; /* Blocks (to be freed) */ + uint32_t n_blocks; /* Number of blocks */ + uint32_t blocks_size; /* Buffer size of blocks */ + + grn_ii_builder_buffer buf; /* Buffer (to be finalized) */ + grn_ii_builder_chunk chunk; /* Chunk (to be finalized) */ + + uint32_t df; /* Document frequency (number of sections) */ + chunk_info *cinfos; /* Chunk headers (to be freed) */ + uint32_t n_cinfos; /* Number of chunks */ + uint32_t cinfos_size; /* Size of cinfos */ +} grn_ii_builder; + +/* + * grn_ii_builder_init initializes a builder. Note that an initialized builder + * must be finalized by grn_ii_builder_fin. + */ +static grn_rc +grn_ii_builder_init(grn_ctx *ctx, grn_ii_builder *builder, + grn_ii *ii, const grn_ii_builder_options *options) +{ + builder->ii = ii; + builder->options = *options; + if (grn_ii_builder_block_threshold_force > 0) { + builder->options.block_threshold = grn_ii_builder_block_threshold_force; + } + grn_ii_builder_options_fix(&builder->options); + + builder->src_table = NULL; + builder->srcs = NULL; + builder->n_srcs = 0; + builder->sid_bits = 0; + builder->sid_mask = 0; + + builder->lexicon = NULL; + builder->tokenizer = NULL; + builder->normalizer = NULL; + + builder->n = 0; + builder->rid = GRN_ID_NIL; + builder->sid = 0; + builder->pos = 0; + + builder->terms = NULL; + builder->n_terms = 0; + builder->max_n_terms = 0; + builder->terms_size = 0; + + builder->path[0] = '\0'; + builder->fd = -1; + builder->file_buf = NULL; + builder->file_buf_offset = 0; + + builder->blocks = NULL; + builder->n_blocks = 0; + builder->blocks_size = 0; + + grn_ii_builder_buffer_init(ctx, &builder->buf, ii); + grn_ii_builder_chunk_init(ctx, &builder->chunk); + + builder->df = 0; + builder->cinfos = NULL; + builder->n_cinfos = 
0; + builder->cinfos_size = 0; + + return GRN_SUCCESS; +} + +/* grn_ii_builder_fin_terms finalizes terms. */ +static void +grn_ii_builder_fin_terms(grn_ctx *ctx, grn_ii_builder *builder) +{ + if (builder->terms) { + uint32_t i; + for (i = 0; i < builder->max_n_terms; i++) { + grn_ii_builder_term_fin(ctx, &builder->terms[i]); + } + GRN_FREE(builder->terms); + + /* To avoid double finalization. */ + builder->terms = NULL; + } +} + +/* grn_ii_builder_fin finalizes a builder. */ +static grn_rc +grn_ii_builder_fin(grn_ctx *ctx, grn_ii_builder *builder) +{ + if (builder->cinfos) { + GRN_FREE(builder->cinfos); + } + grn_ii_builder_chunk_fin(ctx, &builder->chunk); + grn_ii_builder_buffer_fin(ctx, &builder->buf); + if (builder->blocks) { + uint32_t i; + for (i = 0; i < builder->n_blocks; i++) { + grn_ii_builder_block_fin(ctx, &builder->blocks[i]); + } + GRN_FREE(builder->blocks); + } + if (builder->file_buf) { + GRN_FREE(builder->file_buf); + } + if (builder->fd != -1) { + grn_close(builder->fd); + if (grn_unlink(builder->path) == 0) { + GRN_LOG(ctx, GRN_LOG_INFO, + "[ii][builder][fin] removed path: <%-.256s>", + builder->path); + } else { + ERRNO_ERR("[ii][builder][fin] failed to remove path: <%-.256s>", + builder->path); + } + } + grn_ii_builder_fin_terms(ctx, builder); + if (builder->lexicon) { + grn_obj_close(ctx, builder->lexicon); + } + if (builder->srcs) { + GRN_FREE(builder->srcs); + } + return GRN_SUCCESS; +} + +/* + * grn_ii_builder_open creates a builder. Note that a builder must be closed by + * grn_ii_builder_close. 
+ */ +static grn_rc +grn_ii_builder_open(grn_ctx *ctx, grn_ii *ii, + const grn_ii_builder_options *options, + grn_ii_builder **builder) +{ + grn_rc rc; + grn_ii_builder *new_builder = GRN_MALLOCN(grn_ii_builder, 1); + if (!new_builder) { + return GRN_NO_MEMORY_AVAILABLE; + } + if (!options) { + options = &grn_ii_builder_default_options; + } + rc = grn_ii_builder_init(ctx, new_builder, ii, options); + if (rc != GRN_SUCCESS) { + GRN_FREE(new_builder); + return rc; + } + *builder = new_builder; + return GRN_SUCCESS; +} + +/* grn_ii_builder_close closes a builder. */ +static grn_rc +grn_ii_builder_close(grn_ctx *ctx, grn_ii_builder *builder) +{ + grn_rc rc; + if (!builder) { + ERR(GRN_INVALID_ARGUMENT, "builder is null"); + return ctx->rc; + } + rc = grn_ii_builder_fin(ctx, builder); + GRN_FREE(builder); + return rc; +} + +/* grn_ii_builder_create_lexicon creates a block lexicon. */ +static grn_rc +grn_ii_builder_create_lexicon(grn_ctx *ctx, grn_ii_builder *builder) +{ + grn_table_flags flags; + grn_obj *domain = grn_ctx_at(ctx, builder->ii->lexicon->header.domain); + grn_obj *range = grn_ctx_at(ctx, DB_OBJ(builder->ii->lexicon)->range); + grn_obj *tokenizer, *normalizer, *token_filters; + grn_rc rc = grn_table_get_info(ctx, builder->ii->lexicon, &flags, NULL, + &tokenizer, &normalizer, &token_filters); + if (rc != GRN_SUCCESS) { + return rc; + } + flags &= ~GRN_OBJ_PERSISTENT; + builder->lexicon = grn_table_create(ctx, NULL, 0, NULL, + flags, domain, range); + if (!builder->lexicon) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_UNKNOWN_ERROR, "[index] failed to create a block lexicon"); + } + return ctx->rc; + } + builder->tokenizer = tokenizer; + builder->normalizer = normalizer; + rc = grn_obj_set_info(ctx, builder->lexicon, + GRN_INFO_DEFAULT_TOKENIZER, tokenizer); + if (rc == GRN_SUCCESS) { + rc = grn_obj_set_info(ctx, builder->lexicon, + GRN_INFO_NORMALIZER, normalizer); + if (rc == GRN_SUCCESS) { + rc = grn_obj_set_info(ctx, builder->lexicon, + 
GRN_INFO_TOKEN_FILTERS, token_filters); + } + } + if (rc != GRN_SUCCESS) { + return rc; + } + if ((flags & GRN_OBJ_TABLE_TYPE_MASK) == GRN_OBJ_TABLE_PAT_KEY) { + if (builder->options.lexicon_cache_size) { + rc = grn_pat_cache_enable(ctx, (grn_pat *)builder->lexicon, + builder->options.lexicon_cache_size); + if (rc != GRN_SUCCESS) { + return rc; + } + } + } + return GRN_SUCCESS; +} + +/* + * grn_ii_builder_extend_terms extends a buffer for terms in order to make + * terms[n_terms - 1] available. + */ +static grn_rc +grn_ii_builder_extend_terms(grn_ctx *ctx, grn_ii_builder *builder, + uint32_t n_terms) +{ + if (n_terms <= builder->n_terms) { + return GRN_SUCCESS; + } + + if (n_terms > builder->max_n_terms) { + uint32_t i; + if (n_terms > builder->terms_size) { + /* Resize builder->terms for new terms. */ + size_t n_bytes; + uint32_t terms_size = builder->terms_size ? builder->terms_size * 2 : 1; + grn_ii_builder_term *terms; + while (terms_size < n_terms) { + terms_size *= 2; + } + n_bytes = terms_size * sizeof(grn_ii_builder_term); + terms = (grn_ii_builder_term *)GRN_REALLOC(builder->terms, n_bytes); + if (!terms) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for terms: n_bytes = %" GRN_FMT_SIZE, + n_bytes); + return ctx->rc; + } + builder->terms = terms; + builder->terms_size = terms_size; + } + /* Initialize new terms. */ + for (i = builder->max_n_terms; i < n_terms; i++) { + grn_ii_builder_term_init(ctx, &builder->terms[i]); + } + builder->max_n_terms = n_terms; + } + + builder->n += n_terms - builder->n_terms; + builder->n_terms = n_terms; + return GRN_SUCCESS; +} + +/* grn_ii_builder_get_term gets a term associated with tid. 
*/ +inline static grn_rc +grn_ii_builder_get_term(grn_ctx *ctx, grn_ii_builder *builder, grn_id tid, + grn_ii_builder_term **term) +{ + uint32_t n_terms = tid; + if (n_terms > builder->n_terms) { + grn_rc rc = grn_ii_builder_extend_terms(ctx, builder, n_terms); + if (rc != GRN_SUCCESS) { + return rc; + } + } + *term = &builder->terms[tid - 1]; + return GRN_SUCCESS; +} + +/* grn_ii_builder_flush_file_buf flushes buffered data as a block. */ +static grn_rc +grn_ii_builder_flush_file_buf(grn_ctx *ctx, grn_ii_builder *builder) +{ + if (builder->file_buf_offset) { + ssize_t size = grn_write(builder->fd, builder->file_buf, + builder->file_buf_offset); + if ((uint64_t)size != builder->file_buf_offset) { + SERR("failed to write data: expected = %u, actual = %" GRN_FMT_INT64D, + builder->file_buf_offset, (int64_t)size); + } + builder->file_buf_offset = 0; + } + return GRN_SUCCESS; +} + +/* grn_ii_builder_flush_term flushes a term and clears it */ +static grn_rc +grn_ii_builder_flush_term(grn_ctx *ctx, grn_ii_builder *builder, + grn_ii_builder_term *term) +{ + grn_rc rc; + uint8_t *term_buf; + + /* Append sentinels. */ + if (term->rid != GRN_ID_NIL) { + if (builder->ii->header->flags & GRN_OBJ_WITH_POSITION) { + rc = grn_ii_builder_term_append(ctx, term, 0); + } else { + rc = grn_ii_builder_term_append(ctx, term, term->pos_or_freq); + } + if (rc != GRN_SUCCESS) { + return rc; + } + } + rc = grn_ii_builder_term_append(ctx, term, 0); + if (rc != GRN_SUCCESS) { + return rc; + } + + { + /* Put the global term ID. 
*/ + int key_size; + char key[GRN_TABLE_MAX_KEY_SIZE]; + uint8_t *p; + uint32_t rest, value; + grn_rc rc; + grn_id local_tid = term - builder->terms + 1, global_tid; + key_size = grn_table_get_key(ctx, builder->lexicon, local_tid, + key, GRN_TABLE_MAX_KEY_SIZE); + if (!key_size) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_UNKNOWN_ERROR, "failed to get key: tid = %u", local_tid); + } + return ctx->rc; + } + global_tid = grn_table_add(ctx, builder->ii->lexicon, key, key_size, NULL); + if (global_tid == GRN_ID_NIL) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_UNKNOWN_ERROR, + "failed to get global term ID: tid = %u, key = \"%.*s\"", + local_tid, key_size, key); + } + return ctx->rc; + } + + rest = builder->options.file_buf_size - builder->file_buf_offset; + if (rest < 10) { + rc = grn_ii_builder_flush_file_buf(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + } + value = global_tid; + p = builder->file_buf + builder->file_buf_offset; + if (value < 1U << 5) { + p[0] = (uint8_t)value; + builder->file_buf_offset++; + } else if (value < 1U << 13) { + p[0] = (uint8_t)((value & 0x1f) | (1 << 5)); + p[1] = (uint8_t)(value >> 5); + builder->file_buf_offset += 2; + } else { + uint8_t i, n; + if (value < 1U << 21) { + n = 3; + } else if (value < 1U << 29) { + n = 4; + } else { + n = 5; + } + p[0] = (uint8_t)(value & 0x1f) | ((n - 1) << 5); + value >>= 5; + for (i = 1; i < n; i++) { + p[i] = (uint8_t)value; + value >>= 8; + } + builder->file_buf_offset += n; + } + } + + /* Flush a term buffer. 
*/ + term_buf = grn_ii_builder_term_get_buf(term); + if (term->offset > builder->options.file_buf_size) { + ssize_t size; + rc = grn_ii_builder_flush_file_buf(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + size = grn_write(builder->fd, term_buf, term->offset); + if ((uint64_t)size != term->offset) { + SERR("failed to write data: expected = %u, actual = %" GRN_FMT_INT64D, + term->offset, (int64_t)size); + } + } else { + uint32_t rest = builder->options.file_buf_size - builder->file_buf_offset; + if (term->offset <= rest) { + grn_memcpy(builder->file_buf + builder->file_buf_offset, + term_buf, term->offset); + builder->file_buf_offset += term->offset; + } else { + grn_memcpy(builder->file_buf + builder->file_buf_offset, + term_buf, rest); + builder->file_buf_offset += rest; + rc = grn_ii_builder_flush_file_buf(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + builder->file_buf_offset = term->offset - rest; + grn_memcpy(builder->file_buf, term_buf + rest, builder->file_buf_offset); + } + } + grn_ii_builder_term_reinit(ctx, term); + return GRN_SUCCESS; +} + +/* + * grn_ii_builder_create_file creates a temporary file and allocates memory for + * buffered output. + */ +static grn_rc +grn_ii_builder_create_file(grn_ctx *ctx, grn_ii_builder *builder) +{ + grn_snprintf(builder->path, PATH_MAX, PATH_MAX, + "%-.256sXXXXXX", grn_io_path(builder->ii->seg)); + builder->fd = grn_mkstemp(builder->path); + if (builder->fd == -1) { + SERR("failed to create a temporary file: path = \"%-.256s\"", + builder->path); + return ctx->rc; + } + builder->file_buf = (uint8_t *)GRN_MALLOC(builder->options.file_buf_size); + if (!builder->file_buf) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for buffered output: size = %u", + builder->options.file_buf_size); + return ctx->rc; + } + return GRN_SUCCESS; +} + +/* grn_ii_builder_register_block registers a block. 
*/ +static grn_rc +grn_ii_builder_register_block(grn_ctx *ctx, grn_ii_builder *builder) +{ + grn_ii_builder_block *block; + uint64_t file_offset = grn_lseek(builder->fd, 0, SEEK_CUR); + if (file_offset == (uint64_t)-1) { + SERR("failed to get file offset"); + return ctx->rc; + } + if (builder->n_blocks >= builder->blocks_size) { + size_t n_bytes; + uint32_t blocks_size = 1; + grn_ii_builder_block *blocks; + while (blocks_size <= builder->n_blocks) { + blocks_size *= 2; + } + n_bytes = blocks_size * sizeof(grn_ii_builder_block); + blocks = (grn_ii_builder_block *)GRN_REALLOC(builder->blocks, n_bytes); + if (!blocks) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for block: n_bytes = %" GRN_FMT_SIZE, + n_bytes); + return ctx->rc; + } + builder->blocks = blocks; + builder->blocks_size = blocks_size; + } + block = &builder->blocks[builder->n_blocks]; + grn_ii_builder_block_init(ctx, block); + if (!builder->n_blocks) { + block->offset = 0; + } else { + grn_ii_builder_block *prev_block = &builder->blocks[builder->n_blocks - 1]; + block->offset = prev_block->offset + prev_block->rest; + } + block->rest = (uint32_t)(file_offset - block->offset); + builder->n_blocks++; + return GRN_SUCCESS; +} + +/* grn_ii_builder_flush_block flushes a block to a temporary file. */ +static grn_rc +grn_ii_builder_flush_block(grn_ctx *ctx, grn_ii_builder *builder) +{ + grn_rc rc; + grn_table_cursor *cursor; + + if (!builder->n) { + /* Do nothing if there are no output data. */ + return GRN_SUCCESS; + } + if (builder->fd == -1) { + rc = grn_ii_builder_create_file(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + } + + /* Flush terms into a temporary file. 
*/ + cursor = grn_table_cursor_open(ctx, builder->lexicon, + NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_KEY); + for (;;) { + grn_id tid = grn_table_cursor_next(ctx, cursor); + if (tid == GRN_ID_NIL) { + break; + } + rc = grn_ii_builder_flush_term(ctx, builder, &builder->terms[tid - 1]); + if (rc != GRN_SUCCESS) { + return rc; + } + } + grn_table_cursor_close(ctx, cursor); + rc = grn_ii_builder_flush_file_buf(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + + /* Register a block and clear the current data. */ + rc = grn_ii_builder_register_block(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + rc = grn_table_truncate(ctx, builder->lexicon); + if (rc != GRN_SUCCESS) { + return rc; + } + builder->rid = GRN_ID_NIL; + builder->n_terms = 0; + builder->n = 0; + return GRN_SUCCESS; +} + +/* grn_ii_builder_append_token appends a token. */ +static grn_rc +grn_ii_builder_append_token(grn_ctx *ctx, grn_ii_builder *builder, + grn_id rid, uint32_t sid, uint32_t weight, + grn_id tid, uint32_t pos) +{ + grn_rc rc; + uint32_t ii_flags = builder->ii->header->flags; + grn_ii_builder_term *term; + rc = grn_ii_builder_get_term(ctx, builder, tid, &term); + if (rc != GRN_SUCCESS) { + return rc; + } + if (rid != term->rid || sid != term->sid) { + uint64_t rsid; + if (term->rid != GRN_ID_NIL) { + if (ii_flags & GRN_OBJ_WITH_POSITION) { + /* Append the end of positions. */ + rc = grn_ii_builder_term_append(ctx, term, 0); + if (rc != GRN_SUCCESS) { + return rc; + } + builder->n++; + } else { + /* Append a frequency if positions are not available. 
*/ + rc = grn_ii_builder_term_append(ctx, term, term->pos_or_freq); + if (rc != GRN_SUCCESS) { + return rc; + } + builder->n++; + } + } + rsid = ((uint64_t)(rid - term->rid) << builder->sid_bits) | (sid - 1); + rc = grn_ii_builder_term_append(ctx, term, rsid); + if (rc != GRN_SUCCESS) { + return rc; + } + builder->n++; + if (ii_flags & GRN_OBJ_WITH_WEIGHT) { + rc = grn_ii_builder_term_append(ctx, term, weight); + if (rc != GRN_SUCCESS) { + return rc; + } + builder->n++; + } + term->rid = rid; + term->sid = sid; + term->pos_or_freq = 0; + } + if (ii_flags & GRN_OBJ_WITH_POSITION) { + rc = grn_ii_builder_term_append(ctx, term, pos - term->pos_or_freq); + if (rc != GRN_SUCCESS) { + return rc; + } + builder->n++; + term->pos_or_freq = pos; + } else { + term->pos_or_freq++; + } + return GRN_SUCCESS; +} + +/* + * grn_ii_builder_append_value appends a value. Note that values must be + * appended in ascending rid and sid order. + */ +static grn_rc +grn_ii_builder_append_value(grn_ctx *ctx, grn_ii_builder *builder, + grn_id rid, uint32_t sid, uint32_t weight, + const char *value, uint32_t value_size) +{ + uint32_t pos = 0; + grn_token_cursor *cursor; + if (rid != builder->rid) { + builder->rid = rid; + builder->sid = sid; + builder->pos = 1; + } else if (sid != builder->sid) { + builder->sid = sid; + builder->pos = 1; + } else { + /* Insert a space between values. 
*/ + builder->pos++; + } + if (value_size) { + if (!builder->tokenizer && !builder->normalizer) { + grn_id tid; + switch (builder->lexicon->header.type) { + case GRN_TABLE_PAT_KEY : + tid = grn_pat_add(ctx, (grn_pat *)builder->lexicon, + value, value_size, NULL, NULL); + break; + case GRN_TABLE_DAT_KEY : + tid = grn_dat_add(ctx, (grn_dat *)builder->lexicon, + value, value_size, NULL, NULL); + break; + case GRN_TABLE_HASH_KEY : + tid = grn_hash_add(ctx, (grn_hash *)builder->lexicon, + value, value_size, NULL, NULL); + break; + case GRN_TABLE_NO_KEY : + tid = *(grn_id *)value; + break; + default : + tid = GRN_ID_NIL; + break; + } + if (tid != GRN_ID_NIL) { + grn_rc rc; + pos = builder->pos; + rc = grn_ii_builder_append_token(ctx, builder, rid, sid, + weight, tid, pos); + if (rc != GRN_SUCCESS) { + return rc; + } + } + } else { + cursor = grn_token_cursor_open(ctx, builder->lexicon, value, value_size, + GRN_TOKEN_ADD, 0); + if (!cursor) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_UNKNOWN_ERROR, + "grn_token_cursor_open failed: value = <%.*s>", + value_size, value); + } + return ctx->rc; + } + while (cursor->status == GRN_TOKEN_CURSOR_DOING) { + grn_id tid = grn_token_cursor_next(ctx, cursor); + if (tid != GRN_ID_NIL) { + grn_rc rc; + pos = builder->pos + cursor->pos; + rc = grn_ii_builder_append_token(ctx, builder, rid, sid, + weight, tid, pos); + if (rc != GRN_SUCCESS) { + break; + } + } + } + grn_token_cursor_close(ctx, cursor); + } + } + builder->pos = pos + 1; + return ctx->rc; +} + +/* grn_ii_builder_append_obj appends a BULK, UVECTOR or VECTOR object. 
*/ +static grn_rc +grn_ii_builder_append_obj(grn_ctx *ctx, grn_ii_builder *builder, + grn_id rid, uint32_t sid, grn_obj *obj) +{ + switch (obj->header.type) { + case GRN_BULK : + return grn_ii_builder_append_value(ctx, builder, rid, sid, 0, + GRN_TEXT_VALUE(obj), GRN_TEXT_LEN(obj)); + case GRN_UVECTOR : + { + const char *p = GRN_BULK_HEAD(obj); + uint32_t i, n_values = grn_uvector_size(ctx, obj); + uint32_t value_size = grn_uvector_element_size(ctx, obj); + for (i = 0; i < n_values; i++) { + grn_rc rc = grn_ii_builder_append_value(ctx, builder, rid, sid, 0, + p, value_size); + if (rc != GRN_SUCCESS) { + return rc; + } + p += value_size; + } + } + return GRN_SUCCESS; + case GRN_VECTOR : + if (obj->u.v.body) { + /* + * Note that the following sections and n_sections don't correspond to + * source columns. + */ + int i, n_secs = obj->u.v.n_sections; + grn_section *secs = obj->u.v.sections; + const char *head = GRN_BULK_HEAD(obj->u.v.body); + for (i = 0; i < n_secs; i++) { + grn_rc rc; + grn_section *sec = &secs[i]; + if (sec->length == 0) { + continue; + } + if (builder->tokenizer) { + sid = i + 1; + } + rc = grn_ii_builder_append_value(ctx, builder, rid, sid, sec->weight, + head + sec->offset, sec->length); + if (rc != GRN_SUCCESS) { + return rc; + } + } + } + return GRN_SUCCESS; + default : + ERR(GRN_INVALID_ARGUMENT, "[index] invalid object assigned as value"); + return ctx->rc; + } +} + +/* + * grn_ii_builder_append_srcs reads values from source columns and appends the + * values. + */ +static grn_rc +grn_ii_builder_append_srcs(grn_ctx *ctx, grn_ii_builder *builder) +{ + size_t i; + grn_rc rc = GRN_SUCCESS; + grn_obj *objs; + grn_table_cursor *cursor; + + /* Allocate memory for objects to store source values. */ + objs = GRN_MALLOCN(grn_obj, builder->n_srcs); + if (!objs) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for objs: n_srcs = %u", builder->n_srcs); + return ctx->rc; + } + + /* Create a cursor to get records in the ID order. 
*/ + cursor = grn_table_cursor_open(ctx, builder->src_table, NULL, 0, NULL, 0, + 0, -1, GRN_CURSOR_BY_ID); + if (!cursor) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_OBJECT_CORRUPT, "[index] failed to open table cursor"); + } + GRN_FREE(objs); + return ctx->rc; + } + + /* Read source values and append it. */ + for (i = 0; i < builder->n_srcs; i++) { + GRN_TEXT_INIT(&objs[i], 0); + } + while (rc == GRN_SUCCESS) { + grn_id rid = grn_table_cursor_next(ctx, cursor); + if (rid == GRN_ID_NIL) { + break; + } + for (i = 0; i < builder->n_srcs; i++) { + grn_obj *obj = &objs[i]; + grn_obj *src = builder->srcs[i]; + rc = grn_obj_reinit_for(ctx, obj, src); + if (rc == GRN_SUCCESS) { + if (GRN_OBJ_TABLEP(src)) { + int len = grn_table_get_key2(ctx, src, rid, obj); + if (len <= 0) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_UNKNOWN_ERROR, "failed to get key: rid = %u, len = %d", + rid, len); + } + rc = ctx->rc; + } + } else { + if (!grn_obj_get_value(ctx, src, rid, obj)) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_UNKNOWN_ERROR, "failed to get value: rid = %u", rid); + } + rc = ctx->rc; + } + } + if (rc == GRN_SUCCESS) { + uint32_t sid = (uint32_t)(i + 1); + rc = grn_ii_builder_append_obj(ctx, builder, rid, sid, obj); + } + } + } + if (rc == GRN_SUCCESS && builder->n >= builder->options.block_threshold) { + rc = grn_ii_builder_flush_block(ctx, builder); + } + } + if (rc == GRN_SUCCESS) { + rc = grn_ii_builder_flush_block(ctx, builder); + } + for (i = 0; i < builder->n_srcs; i++) { + GRN_OBJ_FIN(ctx, &objs[i]); + } + grn_table_cursor_close(ctx, cursor); + GRN_FREE(objs); + return rc; +} + +/* grn_ii_builder_set_src_table sets a source table. 
*/ +static grn_rc +grn_ii_builder_set_src_table(grn_ctx *ctx, grn_ii_builder *builder) +{ + builder->src_table = grn_ctx_at(ctx, DB_OBJ(builder->ii)->range); + if (!builder->src_table) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_INVALID_ARGUMENT, "source table is null: range = %d", + DB_OBJ(builder->ii)->range); + } + return ctx->rc; + } + return GRN_SUCCESS; +} + +/* grn_ii_builder_set_sid_bits calculates sid_bits and sid_mask. */ +static grn_rc +grn_ii_builder_set_sid_bits(grn_ctx *ctx, grn_ii_builder *builder) +{ + /* Calculate the number of bits required to represent a section ID. */ + if (builder->n_srcs == 1 && builder->tokenizer && + (builder->srcs[0]->header.flags & GRN_OBJ_COLUMN_VECTOR) != 0) { + /* If the source column is a vector column and the index has a tokenizer, */ + /* the maximum sid equals to the maximum number of elements. */ + size_t max_elems = 0; + grn_table_cursor *cursor; + grn_obj obj; + cursor = grn_table_cursor_open(ctx, builder->src_table, NULL, 0, NULL, 0, + 0, -1, GRN_CURSOR_BY_ID); + if (!cursor) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_OBJECT_CORRUPT, "[index] failed to open table cursor"); + } + return ctx->rc; + } + GRN_TEXT_INIT(&obj, 0); + for (;;) { + grn_id rid = grn_table_cursor_next(ctx, cursor); + if (rid == GRN_ID_NIL) { + break; + } + if (!grn_obj_get_value(ctx, builder->srcs[0], rid, &obj)) { + continue; + } + if (obj.u.v.n_sections > max_elems) { + max_elems = obj.u.v.n_sections; + } + } + GRN_OBJ_FIN(ctx, &obj); + grn_table_cursor_close(ctx, cursor); + while (((uint32_t)1 << builder->sid_bits) < max_elems) { + builder->sid_bits++; + } + } + if (builder->sid_bits == 0) { + while (((uint32_t)1 << builder->sid_bits) < builder->n_srcs) { + builder->sid_bits++; + } + } + builder->sid_mask = ((uint64_t)1 << builder->sid_bits) - 1; + return GRN_SUCCESS; +} + +/* grn_ii_builder_set_srcs sets source columns. 
*/ +static grn_rc +grn_ii_builder_set_srcs(grn_ctx *ctx, grn_ii_builder *builder) +{ + size_t i; + grn_id *source; + builder->n_srcs = builder->ii->obj.source_size / sizeof(grn_id); + source = (grn_id *)builder->ii->obj.source; + if (!source || !builder->n_srcs) { + ERR(GRN_INVALID_ARGUMENT, + "source is not available: source = %p, source_size = %u", + builder->ii->obj.source, builder->ii->obj.source_size); + return ctx->rc; + } + builder->srcs = GRN_MALLOCN(grn_obj *, builder->n_srcs); + if (!builder->srcs) { + return GRN_NO_MEMORY_AVAILABLE; + } + for (i = 0; i < builder->n_srcs; i++) { + builder->srcs[i] = grn_ctx_at(ctx, source[i]); + if (!builder->srcs[i]) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_OBJECT_CORRUPT, "source not found: id = %d", source[i]); + } + return ctx->rc; + } + } + return grn_ii_builder_set_sid_bits(ctx, builder); +} + +/* grn_ii_builder_append_source appends values in source columns. */ +static grn_rc +grn_ii_builder_append_source(grn_ctx *ctx, grn_ii_builder *builder) +{ + grn_rc rc = grn_ii_builder_set_src_table(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + if (grn_table_size(ctx, builder->src_table) == 0) { + /* Nothing to do because there are no values. */ + return ctx->rc; + } + /* Create a block lexicon. */ + rc = grn_ii_builder_create_lexicon(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + rc = grn_ii_builder_set_srcs(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + rc = grn_ii_builder_append_srcs(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + grn_ii_builder_fin_terms(ctx, builder); + return GRN_SUCCESS; +} + +/* + * grn_ii_builder_fill_block reads the next data from a temporary file and fill + * a block buffer. 
+ */ +static grn_rc +grn_ii_builder_fill_block(grn_ctx *ctx, grn_ii_builder *builder, + uint32_t block_id) +{ + ssize_t size; + uint32_t buf_rest; + uint64_t file_offset; + grn_ii_builder_block *block = &builder->blocks[block_id]; + if (!block->rest) { + return GRN_END_OF_DATA; + } + if (!block->buf) { + block->buf = (uint8_t *)GRN_MALLOC(builder->options.block_buf_size); + if (!block->buf) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for buffered input: size = %u", + builder->options.block_buf_size); + return ctx->rc; + } + } + + /* Move the remaining data to the head. */ + buf_rest = block->end - block->cur; + if (buf_rest) { + grn_memmove(block->buf, block->cur, buf_rest); + } + block->cur = block->buf; + block->end = block->buf + buf_rest; + + /* Read the next data. */ + file_offset = grn_lseek(builder->fd, block->offset, SEEK_SET); + if (file_offset != block->offset) { + SERR("failed to seek file: expected = %" GRN_FMT_INT64U + ", actual = %" GRN_FMT_INT64D, + block->offset, file_offset); + return ctx->rc; + } + buf_rest = builder->options.block_buf_size - buf_rest; + if (block->rest < buf_rest) { + buf_rest = block->rest; + } + size = grn_read(builder->fd, block->end, buf_rest); + if (size <= 0) { + SERR("failed to read data: expected = %u, actual = %" GRN_FMT_INT64D, + buf_rest, (int64_t)size); + return ctx->rc; + } + block->offset += size; + block->rest -= size; + block->end += size; + return GRN_SUCCESS; +} + +/* grn_ii_builder_read_from_block reads the next value from a block. 
*/ +static grn_rc +grn_ii_builder_read_from_block(grn_ctx *ctx, grn_ii_builder *builder, + uint32_t block_id, uint64_t *value) +{ + grn_ii_builder_block *block = &builder->blocks[block_id]; + grn_rc rc = grn_ii_builder_block_next(ctx, block, value); + if (rc == GRN_SUCCESS) { + return GRN_SUCCESS; + } else if (rc == GRN_END_OF_DATA) { + rc = grn_ii_builder_fill_block(ctx, builder, block_id); + if (rc != GRN_SUCCESS) { + return rc; + } + return grn_ii_builder_block_next(ctx, block, value); + } + return rc; +} + +/* grn_ii_builder_pack_chunk tries to pack a chunk. */ +static grn_rc +grn_ii_builder_pack_chunk(grn_ctx *ctx, grn_ii_builder *builder, + grn_bool *packed) +{ + grn_id rid; + uint32_t sid, pos, *a; + grn_ii_builder_chunk *chunk = &builder->chunk; + *packed = GRN_FALSE; + if (chunk->offset != 1) { /* df != 1 */ + return GRN_SUCCESS; + } + if (chunk->weight_buf && chunk->weight_buf[0]) { /* weight != 0 */ + return GRN_SUCCESS; + } + if (chunk->freq_buf[0] != 0) { /* freq != 1 */ + return GRN_SUCCESS; + } + rid = chunk->rid_buf[0]; + if (chunk->sid_buf) { + if (rid >= 0x100000) { + return GRN_SUCCESS; + } + sid = chunk->sid_buf[0] + 1; + if (sid >= 0x800) { + return GRN_SUCCESS; + } + a = array_get(ctx, builder->ii, chunk->tid); + if (!a) { + DEFINE_NAME(builder->ii); + MERR("[ii][builder][chunk][pack] failed to allocate an array: " + "<%.*s>: " + "<%u>:<%u>:<%u>", + name_size, name, + rid, sid, chunk->tid); + return ctx->rc; + } + a[0] = ((rid << 12) + (sid << 1)) | 1; + } else { + a = array_get(ctx, builder->ii, chunk->tid); + if (!a) { + DEFINE_NAME(builder->ii); + MERR("[ii][builder][chunk][pack] failed to allocate an array: " + "<%.*s>: " + "<%u>:<%u>", + name_size, name, + rid, chunk->tid); + return ctx->rc; + } + a[0] = (rid << 1) | 1; + } + pos = 0; + if (chunk->pos_buf) { + pos = chunk->pos_buf[0]; + } + a[1] = pos; + array_unref(builder->ii, chunk->tid); + *packed = GRN_TRUE; + + grn_ii_builder_chunk_clear(ctx, chunk); + return GRN_SUCCESS; +} + +/* 
grn_ii_builder_get_cinfo returns a new cinfo. */ +static grn_rc +grn_ii_builder_get_cinfo(grn_ctx *ctx, grn_ii_builder *builder, + chunk_info **cinfo) +{ + if (builder->n_cinfos == builder->cinfos_size) { + uint32_t size = builder->cinfos_size ? (builder->cinfos_size * 2) : 1; + size_t n_bytes = size * sizeof(chunk_info); + chunk_info *cinfos = (chunk_info *)GRN_REALLOC(builder->cinfos, n_bytes); + if (!cinfos) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "failed to allocate memory for cinfos: n_bytes = %" GRN_FMT_SIZE, + n_bytes); + return ctx->rc; + } + builder->cinfos = cinfos; + builder->cinfos_size = size; + } + *cinfo = &builder->cinfos[builder->n_cinfos++]; + return GRN_SUCCESS; +} + +/* grn_ii_builder_flush_chunk flushes a chunk. */ +static grn_rc +grn_ii_builder_flush_chunk(grn_ctx *ctx, grn_ii_builder *builder) +{ + grn_rc rc; + chunk_info *cinfo = NULL; + grn_ii_builder_chunk *chunk = &builder->chunk; + void *seg; + uint8_t *in; + uint32_t in_size, chunk_id, seg_id, seg_offset, seg_rest; + + rc = grn_ii_builder_chunk_encode(ctx, chunk, NULL, 0); + if (rc != GRN_SUCCESS) { + return rc; + } + in = chunk->enc_buf; + in_size = chunk->enc_offset; + + rc = chunk_new(ctx, builder->ii, &chunk_id, chunk->enc_offset); + if (rc != GRN_SUCCESS) { + return rc; + } + + /* Copy to the first segment. 
*/ + seg_id = chunk_id >> GRN_II_N_CHUNK_VARIATION; + seg_offset = (chunk_id & ((1 << GRN_II_N_CHUNK_VARIATION) - 1)) << + GRN_II_W_LEAST_CHUNK; + GRN_IO_SEG_REF(builder->ii->chunk, seg_id, seg); + if (!seg) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_UNKNOWN_ERROR, + "failed access chunk segment: chunk_id = %u, seg_id = %u", + chunk_id, seg_id); + } + return ctx->rc; + } + seg_rest = S_CHUNK - seg_offset; + if (in_size <= seg_rest) { + grn_memcpy((uint8_t *)seg + seg_offset, in, in_size); + in_size = 0; + } else { + grn_memcpy((uint8_t *)seg + seg_offset, in, seg_rest); + in += seg_rest; + in_size -= seg_rest; + } + GRN_IO_SEG_UNREF(builder->ii->chunk, seg_id); + + /* Copy to the next segments. */ + while (in_size) { + seg_id++; + GRN_IO_SEG_REF(builder->ii->chunk, seg_id, seg); + if (!seg) { + if (ctx->rc == GRN_SUCCESS) { + ERR(GRN_UNKNOWN_ERROR, + "failed access chunk segment: chunk_id = %u, seg_id = %u", + chunk_id, seg_id); + } + return ctx->rc; + } + if (in_size <= S_CHUNK) { + grn_memcpy(seg, in, in_size); + in_size = 0; + } else { + grn_memcpy(seg, in, S_CHUNK); + in += S_CHUNK; + in_size -= S_CHUNK; + } + GRN_IO_SEG_UNREF(builder->ii->chunk, seg_id); + } + + /* Append a cinfo. */ + rc = grn_ii_builder_get_cinfo(ctx, builder, &cinfo); + if (rc != GRN_SUCCESS) { + return rc; + } + cinfo->segno = chunk_id; + cinfo->size = chunk->enc_offset; + cinfo->dgap = chunk->rid_gap; + + builder->buf.ii->header->total_chunk_size += chunk->enc_offset; + grn_ii_builder_chunk_clear(ctx, chunk); + return GRN_SUCCESS; +} + +/* grn_ii_builder_read_to_chunk read values from a block to a chunk. 
*/
/*
 * The entries of the block's current term are decoded one by one until a
 * zero value is read.  For each entry the record ID gap, and depending on
 * the index flags the section ID, the weight and either the position list
 * or the frequency, are appended to builder->chunk.  When a new record
 * starts and the chunk already holds options.chunk_threshold values or
 * more, the chunk is flushed first.  Finally the block's next term ID is
 * read into blocks[block_id].tid (GRN_ID_NIL when the block is exhausted).
 */
static grn_rc
grn_ii_builder_read_to_chunk(grn_ctx *ctx, grn_ii_builder *builder,
                             uint32_t block_id)
{
  grn_rc rc;
  /* Single scratch value for every read; no inner redeclaration. */
  uint64_t value;
  uint32_t rid = GRN_ID_NIL, last_sid = 0;
  uint32_t ii_flags = builder->ii->header->flags;
  grn_ii_builder_chunk *chunk = &builder->chunk;

  for (;;) {
    uint32_t gap, freq;
    rc = grn_ii_builder_read_from_block(ctx, builder, block_id, &value);
    if (rc != GRN_SUCCESS) {
      return rc;
    }
    if (!value) {
      /* A zero value terminates the entries of the current term. */
      break;
    }
    if (chunk->offset == chunk->size) {
      rc = grn_ii_builder_chunk_extend_bufs(ctx, chunk, ii_flags);
      if (rc != GRN_SUCCESS) {
        return rc;
      }
    }

    /* Read record ID. */
    gap = value >> builder->sid_bits; /* In-block gap */
    if (gap) {
      /* A non-zero gap starts a new record: a safe point to flush. */
      if (chunk->n >= builder->options.chunk_threshold) {
        rc = grn_ii_builder_flush_chunk(ctx, builder);
        if (rc != GRN_SUCCESS) {
          return rc;
        }
      }
      last_sid = 0;
    }
    rid += gap;
    gap = rid - chunk->rid; /* Global gap */
    /* The first entry of a chunk stores the record ID itself. */
    chunk->rid_buf[chunk->offset] = chunk->offset ? gap : rid;
    chunk->n++;
    chunk->rid = rid;
    chunk->rid_gap += gap;
    builder->df++;

    /* Read section ID. */
    if (ii_flags & GRN_OBJ_WITH_SECTION) {
      uint32_t sid = (value & builder->sid_mask) + 1;
      chunk->sid_buf[chunk->offset] = sid - last_sid - 1;
      chunk->n++;
      last_sid = sid;
    }

    /* Read weight. */
    if (ii_flags & GRN_OBJ_WITH_WEIGHT) {
      uint32_t weight;
      rc = grn_ii_builder_read_from_block(ctx, builder, block_id, &value);
      if (rc != GRN_SUCCESS) {
        return rc;
      }
      weight = value;
      chunk->weight_buf[chunk->offset] = weight;
      chunk->n++;
    }

    /* Read positions or a frequency. */
    if (ii_flags & GRN_OBJ_WITH_POSITION) {
      /* pos stays (uint32_t)-1 until the first position has been read. */
      uint32_t pos = -1;
      freq = 0;
      for (;;) {
        rc = grn_ii_builder_read_from_block(ctx, builder, block_id, &value);
        if (rc != GRN_SUCCESS) {
          return rc;
        }
        if (!value) {
          /* A zero value terminates the position list. */
          break;
        }
        if (chunk->pos_offset == chunk->pos_size) {
          rc = grn_ii_builder_chunk_extend_pos_buf(ctx, chunk);
          if (rc != GRN_SUCCESS) {
            return rc;
          }
        }
        if (pos == -1) {
          /* The first position is stored minus one. */
          chunk->pos_buf[chunk->pos_offset] = value - 1;
          chunk->pos_sum += value - 1;
        } else {
          chunk->pos_buf[chunk->pos_offset] = value;
          chunk->pos_sum += value;
        }
        chunk->n++;
        pos += value;
        chunk->pos_offset++;
        freq++;
      }
    } else {
      rc = grn_ii_builder_read_from_block(ctx, builder, block_id, &value);
      if (rc != GRN_SUCCESS) {
        return rc;
      }
      freq = value;
    }
    /* Frequencies are stored minus one. */
    chunk->freq_buf[chunk->offset] = freq - 1;
    chunk->n++;
    chunk->offset++;
  }

  /* Fetch the term ID that follows, or mark the block as exhausted. */
  rc = grn_ii_builder_read_from_block(ctx, builder, block_id, &value);
  if (rc == GRN_SUCCESS) {
    builder->blocks[block_id].tid = value;
  } else if (rc == GRN_END_OF_DATA) {
    builder->blocks[block_id].tid = GRN_ID_NIL;
  } else {
    return rc;
  }
  return GRN_SUCCESS;
}

/* grn_ii_builder_register_chunks registers chunks.
*/ +static grn_rc +grn_ii_builder_register_chunks(grn_ctx *ctx, grn_ii_builder *builder) +{ + grn_rc rc; + uint32_t buf_tid, *a; + buffer_term *buf_term; + + rc = grn_ii_builder_chunk_encode(ctx, &builder->chunk, builder->cinfos, + builder->n_cinfos); + if (rc != GRN_SUCCESS) { + return rc; + } + + if (!grn_ii_builder_buffer_is_assigned(ctx, &builder->buf)) { + rc = grn_ii_builder_buffer_assign(ctx, &builder->buf, + builder->chunk.enc_offset); + if (rc != GRN_SUCCESS) { + return rc; + } + } + buf_tid = builder->buf.buf->header.nterms; + if (buf_tid >= builder->options.buffer_max_n_terms || + builder->buf.chunk_size - builder->buf.chunk_offset < + builder->chunk.enc_offset) { + rc = grn_ii_builder_buffer_flush(ctx, &builder->buf); + if (rc != GRN_SUCCESS) { + return rc; + } + rc = grn_ii_builder_buffer_assign(ctx, &builder->buf, + builder->chunk.enc_offset); + if (rc != GRN_SUCCESS) { + return rc; + } + buf_tid = 0; + } + buf_term = &builder->buf.buf->terms[buf_tid]; + buf_term->tid = builder->chunk.tid; + if (builder->n_cinfos) { + buf_term->tid |= CHUNK_SPLIT; + } + buf_term->size_in_buffer = 0; + buf_term->pos_in_buffer = 0; + buf_term->size_in_chunk = builder->chunk.enc_offset; + buf_term->pos_in_chunk = builder->buf.chunk_offset; + + grn_memcpy(builder->buf.chunk + builder->buf.chunk_offset, + builder->chunk.enc_buf, builder->chunk.enc_offset); + builder->buf.chunk_offset += builder->chunk.enc_offset; + + a = array_get(ctx, builder->ii, builder->chunk.tid); + if (!a) { + DEFINE_NAME(builder->ii); + MERR("[ii][builder][chunk][register] " + "failed to allocate an array in segment: " + "<%.*s>: " + "tid=<%u>: max_n_segments=<%u>", + name_size, name, + builder->chunk.tid, + builder->ii->seg->header->max_segment); + return ctx->rc; + } + a[0] = SEG2POS(builder->buf.buf_id, + sizeof(buffer_header) + buf_tid * sizeof(buffer_term)); + a[1] = builder->df; + array_unref(builder->ii, builder->chunk.tid); + + builder->buf.buf->header.nterms++; + builder->n_cinfos = 0; + 
grn_ii_builder_chunk_clear(ctx, &builder->chunk); + return GRN_SUCCESS; +} + +static grn_rc +grn_ii_builder_commit(grn_ctx *ctx, grn_ii_builder *builder) +{ + uint32_t i; + grn_rc rc; + grn_table_cursor *cursor; + + for (i = 0; i < builder->n_blocks; i++) { + uint64_t value; + rc = grn_ii_builder_read_from_block(ctx, builder, i, &value); + if (rc != GRN_SUCCESS) { + return rc; + } + builder->blocks[i].tid = value; + } + + cursor = grn_table_cursor_open(ctx, builder->ii->lexicon, + NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_KEY); + for (;;) { + grn_id tid = grn_table_cursor_next(ctx, cursor); + if (tid == GRN_ID_NIL) { + break; + } + builder->chunk.tid = tid; + builder->chunk.rid = GRN_ID_NIL; + builder->df = 0; + for (i = 0; i < builder->n_blocks; i++) { + if (tid == builder->blocks[i].tid) { + rc = grn_ii_builder_read_to_chunk(ctx, builder, i); + if (rc != GRN_SUCCESS) { + return rc; + } + } + } + if (!builder->chunk.n) { + /* This term does not appear. */ + continue; + } + if (!builder->n_cinfos) { + grn_bool packed; + rc = grn_ii_builder_pack_chunk(ctx, builder, &packed); + if (rc != GRN_SUCCESS) { + return rc; + } + if (packed) { + continue; + } + } + rc = grn_ii_builder_register_chunks(ctx, builder); + if (rc != GRN_SUCCESS) { + return rc; + } + } + grn_table_cursor_close(ctx, cursor); + if (grn_ii_builder_buffer_is_assigned(ctx, &builder->buf)) { + rc = grn_ii_builder_buffer_flush(ctx, &builder->buf); + if (rc != GRN_SUCCESS) { + return rc; + } + } + return GRN_SUCCESS; +} + +grn_rc +grn_ii_build2(grn_ctx *ctx, grn_ii *ii, const grn_ii_builder_options *options) +{ + grn_rc rc, rc_close; + grn_ii_builder *builder; + rc = grn_ii_builder_open(ctx, ii, options, &builder); + if (rc == GRN_SUCCESS) { + rc = grn_ii_builder_append_source(ctx, builder); + if (rc == GRN_SUCCESS) { + rc = grn_ii_builder_commit(ctx, builder); + } + rc_close = grn_ii_builder_close(ctx, builder); + if (rc == GRN_SUCCESS) { + rc = rc_close; + } + } + return rc; +} |