/* -*- c-basic-offset: 2 -*- */ /* Copyright(C) 2009-2017 Brazil This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ #include "grn.h" #include #include #include #include #ifdef WIN32 # include # include #endif /* WIN32 */ #include "grn_ii.h" #include "grn_ctx_impl.h" #include "grn_token_cursor.h" #include "grn_pat.h" #include "grn_db.h" #include "grn_output.h" #include "grn_scorer.h" #include "grn_util.h" #ifdef GRN_WITH_ONIGMO # define GRN_II_SELECT_ENABLE_SEQUENTIAL_SEARCH #endif #ifdef GRN_II_SELECT_ENABLE_SEQUENTIAL_SEARCH # include "grn_string.h" # include #endif #define MAX_PSEG 0x20000 #define MAX_PSEG_SMALL 0x00200 /* MAX_PSEG_MEDIUM has enough space for the following source: * * Single source. * * Source is a fixed size column or _key of a table. * * Source column is a scalar column. * * Lexicon doesn't have tokenizer. 
*/ #define MAX_PSEG_MEDIUM 0x10000 #define S_CHUNK (1 << GRN_II_W_CHUNK) #define W_SEGMENT 18 #define S_SEGMENT (1 << W_SEGMENT) #define W_ARRAY_ELEMENT 3 #define S_ARRAY_ELEMENT (1 << W_ARRAY_ELEMENT) #define W_ARRAY (W_SEGMENT - W_ARRAY_ELEMENT) #define ARRAY_MASK_IN_A_SEGMENT ((1 << W_ARRAY) - 1) #define S_GARBAGE (1<<12) #define CHUNK_SPLIT 0x80000000 #define CHUNK_SPLIT_THRESHOLD 0x60000 #define MAX_N_ELEMENTS 5 #define DEFINE_NAME(ii) \ const char *name; \ char name_buffer[GRN_TABLE_MAX_KEY_SIZE]; \ int name_size; \ do { \ if (DB_OBJ(ii)->id == GRN_ID_NIL) { \ name = "(temporary)"; \ name_size = strlen(name); \ } else { \ name_size = grn_obj_name(ctx, (grn_obj *)ii, \ name_buffer, GRN_TABLE_MAX_KEY_SIZE); \ name = name_buffer; \ } \ } while (GRN_FALSE) #define LSEG(pos) ((pos) >> 16) #define LPOS(pos) (((pos) & 0xffff) << 2) #define SEG2POS(seg,pos) ((((uint32_t)(seg)) << 16) + (((uint32_t)(pos)) >> 2)) #ifndef S_IRUSR # define S_IRUSR 0400 #endif /* S_IRUSR */ #ifndef S_IWUSR # define S_IWUSR 0200 #endif /* S_IWUSR */ static grn_bool grn_ii_cursor_set_min_enable = GRN_TRUE; static double grn_ii_select_too_many_index_match_ratio = -1; static double grn_ii_estimate_size_for_query_reduce_ratio = 0.9; static grn_bool grn_ii_overlap_token_skip_enable = GRN_FALSE; static uint32_t grn_ii_builder_block_threshold_force = 0; static uint32_t grn_ii_max_n_segments_small = MAX_PSEG_SMALL; static uint32_t grn_ii_max_n_chunks_small = GRN_II_MAX_CHUNK_SMALL; void grn_ii_init_from_env(void) { { char grn_ii_cursor_set_min_enable_env[GRN_ENV_BUFFER_SIZE]; grn_getenv("GRN_II_CURSOR_SET_MIN_ENABLE", grn_ii_cursor_set_min_enable_env, GRN_ENV_BUFFER_SIZE); if (strcmp(grn_ii_cursor_set_min_enable_env, "no") == 0) { grn_ii_cursor_set_min_enable = GRN_FALSE; } else { grn_ii_cursor_set_min_enable = GRN_TRUE; } } { char grn_ii_select_too_many_index_match_ratio_env[GRN_ENV_BUFFER_SIZE]; grn_getenv("GRN_II_SELECT_TOO_MANY_INDEX_MATCH_RATIO", grn_ii_select_too_many_index_match_ratio_env, 
GRN_ENV_BUFFER_SIZE); if (grn_ii_select_too_many_index_match_ratio_env[0]) { grn_ii_select_too_many_index_match_ratio = atof(grn_ii_select_too_many_index_match_ratio_env); } } { char grn_ii_estimate_size_for_query_reduce_ratio_env[GRN_ENV_BUFFER_SIZE]; grn_getenv("GRN_II_ESTIMATE_SIZE_FOR_QUERY_REDUCE_RATIO", grn_ii_estimate_size_for_query_reduce_ratio_env, GRN_ENV_BUFFER_SIZE); if (grn_ii_estimate_size_for_query_reduce_ratio_env[0]) { grn_ii_estimate_size_for_query_reduce_ratio = atof(grn_ii_estimate_size_for_query_reduce_ratio_env); } } { char grn_ii_overlap_token_skip_enable_env[GRN_ENV_BUFFER_SIZE]; grn_getenv("GRN_II_OVERLAP_TOKEN_SKIP_ENABLE", grn_ii_overlap_token_skip_enable_env, GRN_ENV_BUFFER_SIZE); if (grn_ii_overlap_token_skip_enable_env[0]) { grn_ii_overlap_token_skip_enable = GRN_TRUE; } else { grn_ii_overlap_token_skip_enable = GRN_FALSE; } } { char grn_ii_builder_block_threshold_env[GRN_ENV_BUFFER_SIZE]; grn_getenv("GRN_II_BUILDER_BLOCK_THRESHOLD", grn_ii_builder_block_threshold_env, GRN_ENV_BUFFER_SIZE); if (grn_ii_builder_block_threshold_env[0]) { grn_ii_builder_block_threshold_force = grn_atoui(grn_ii_builder_block_threshold_env, grn_ii_builder_block_threshold_env + strlen(grn_ii_builder_block_threshold_env), NULL); } else { grn_ii_builder_block_threshold_force = 0; } } { char grn_ii_max_n_segments_small_env[GRN_ENV_BUFFER_SIZE]; grn_getenv("GRN_II_MAX_N_SEGMENTS_SMALL", grn_ii_max_n_segments_small_env, GRN_ENV_BUFFER_SIZE); if (grn_ii_max_n_segments_small_env[0]) { grn_ii_max_n_segments_small = grn_atoui(grn_ii_max_n_segments_small_env, grn_ii_max_n_segments_small_env + strlen(grn_ii_max_n_segments_small_env), NULL); if (grn_ii_max_n_segments_small > MAX_PSEG) { grn_ii_max_n_segments_small = MAX_PSEG; } } } { char grn_ii_max_n_chunks_small_env[GRN_ENV_BUFFER_SIZE]; grn_getenv("GRN_II_MAX_N_CHUNKS_SMALL", grn_ii_max_n_chunks_small_env, GRN_ENV_BUFFER_SIZE); if (grn_ii_max_n_chunks_small_env[0]) { grn_ii_max_n_chunks_small = 
grn_atoui(grn_ii_max_n_chunks_small_env, grn_ii_max_n_chunks_small_env + strlen(grn_ii_max_n_chunks_small_env), NULL); if (grn_ii_max_n_chunks_small > GRN_II_MAX_CHUNK) { grn_ii_max_n_chunks_small = GRN_II_MAX_CHUNK; } } } } void grn_ii_cursor_set_min_enable_set(grn_bool enable) { grn_ii_cursor_set_min_enable = enable; } grn_bool grn_ii_cursor_set_min_enable_get(void) { return grn_ii_cursor_set_min_enable; } /* segment */ inline static uint32_t segment_get(grn_ctx *ctx, grn_ii *ii) { uint32_t pseg; if (ii->header->bgqtail == ((ii->header->bgqhead + 1) & (GRN_II_BGQSIZE - 1))) { pseg = ii->header->bgqbody[ii->header->bgqtail]; ii->header->bgqtail = (ii->header->bgqtail + 1) & (GRN_II_BGQSIZE - 1); } else { pseg = ii->header->pnext; #ifndef CUT_OFF_COMPATIBILITY if (!pseg) { uint32_t pmax = 0; char *used; uint32_t i, max_segment = ii->seg->header->max_segment; used = GRN_CALLOC(max_segment); if (!used) { return max_segment; } for (i = 0; i < GRN_II_MAX_LSEG && i < max_segment; i++) { if ((pseg = ii->header->ainfo[i]) != GRN_II_PSEG_NOT_ASSIGNED) { if (pseg > pmax) { pmax = pseg; } used[pseg] = 1; } if ((pseg = ii->header->binfo[i]) != GRN_II_PSEG_NOT_ASSIGNED) { if (pseg > pmax) { pmax = pseg; } used[pseg] = 1; } } for (pseg = 0; pseg < max_segment && used[pseg]; pseg++) ; GRN_FREE(used); ii->header->pnext = pmax + 1; } else #endif /* CUT_OFF_COMPATIBILITY */ if (ii->header->pnext < ii->seg->header->max_segment) { ii->header->pnext++; } } return pseg; } inline static grn_rc segment_get_clear(grn_ctx *ctx, grn_ii *ii, uint32_t *pseg) { uint32_t seg = segment_get(ctx, ii); if (seg < ii->seg->header->max_segment) { void *p = NULL; GRN_IO_SEG_REF(ii->seg, seg, p); if (!p) { return GRN_NO_MEMORY_AVAILABLE; } memset(p, 0, S_SEGMENT); GRN_IO_SEG_UNREF(ii->seg, seg); *pseg = seg; return GRN_SUCCESS; } else { return GRN_NO_MEMORY_AVAILABLE; } } inline static grn_rc buffer_segment_new(grn_ctx *ctx, grn_ii *ii, uint32_t *segno) { uint32_t lseg, pseg; if (*segno < 
GRN_II_MAX_LSEG) { if (ii->header->binfo[*segno] != GRN_II_PSEG_NOT_ASSIGNED) { return GRN_INVALID_ARGUMENT; } lseg = *segno; } else { for (lseg = 0; lseg < GRN_II_MAX_LSEG; lseg++) { if (ii->header->binfo[lseg] == GRN_II_PSEG_NOT_ASSIGNED) { break; } } if (lseg == GRN_II_MAX_LSEG) { return GRN_NO_MEMORY_AVAILABLE; } *segno = lseg; } pseg = segment_get(ctx, ii); if (pseg < ii->seg->header->max_segment) { ii->header->binfo[lseg] = pseg; if (lseg >= ii->header->bmax) { ii->header->bmax = lseg + 1; } return GRN_SUCCESS; } else { return GRN_NO_MEMORY_AVAILABLE; } } static grn_rc buffer_segment_reserve(grn_ctx *ctx, grn_ii *ii, uint32_t *lseg0, uint32_t *pseg0, uint32_t *lseg1, uint32_t *pseg1) { uint32_t i = 0; for (;; i++) { if (i == GRN_II_MAX_LSEG) { DEFINE_NAME(ii); MERR("[ii][buffer][segment][reserve] " "couldn't find a free buffer: <%.*s>: max:<%u>", name_size, name, GRN_II_MAX_LSEG); return ctx->rc; } if (ii->header->binfo[i] == GRN_II_PSEG_NOT_ASSIGNED) { break; } } *lseg0 = i++; for (;; i++) { if (i == GRN_II_MAX_LSEG) { DEFINE_NAME(ii); MERR("[ii][buffer][segment][reserve] " "couldn't find two free buffers: " "<%.*s>: " "found:<%u>, max:<%u>", name_size, name, *lseg0, GRN_II_MAX_LSEG); return ctx->rc; } if (ii->header->binfo[i] == GRN_II_PSEG_NOT_ASSIGNED) { break; } } *lseg1 = i; if ((*pseg0 = segment_get(ctx, ii)) == ii->seg->header->max_segment) { DEFINE_NAME(ii); MERR("[ii][buffer][segment][reserve] " "couldn't allocate a free segment: <%.*s>: " "buffer:<%u>, max:<%u>", name_size, name, *lseg0, ii->seg->header->max_segment); return ctx->rc; } if ((*pseg1 = segment_get(ctx, ii)) == ii->seg->header->max_segment) { DEFINE_NAME(ii); MERR("[ii][buffer][segment][reserve] " "couldn't allocate two free segments: " "<%.*s>: " "found:<%u>, not-found:<%u>, max:<%u>", name_size, name, *lseg0, *lseg1, ii->seg->header->max_segment); return ctx->rc; } /* { uint32_t pseg; char *used = GRN_CALLOC(ii->seg->header->max_segment); if (!used) { return GRN_NO_MEMORY_AVAILABLE; 
} for (i = 0; i < GRN_II_MAX_LSEG; i++) { if ((pseg = ii->header->ainfo[i]) != GRN_II_PSEG_NOT_ASSIGNED) { used[pseg] = 1; } if ((pseg = ii->header->binfo[i]) != GRN_II_PSEG_NOT_ASSIGNED) { used[pseg] = 1; } } for (pseg = 0;; pseg++) { if (pseg == ii->seg->header->max_segment) { GRN_FREE(used); return GRN_NO_MEMORY_AVAILABLE; } if (!used[pseg]) { break; } } *pseg0 = pseg++; for (;; pseg++) { if (pseg == ii->seg->header->max_segment) { GRN_FREE(used); return GRN_NO_MEMORY_AVAILABLE; } if (!used[pseg]) { break; } } *pseg1 = pseg; GRN_FREE(used); } */ return ctx->rc; } #define BGQENQUE(lseg) do {\ if (ii->header->binfo[lseg] != GRN_II_PSEG_NOT_ASSIGNED) {\ ii->header->bgqbody[ii->header->bgqhead] = ii->header->binfo[lseg];\ ii->header->bgqhead = (ii->header->bgqhead + 1) & (GRN_II_BGQSIZE - 1);\ GRN_ASSERT(ii->header->bgqhead != ii->header->bgqtail);\ }\ } while (0) inline static void buffer_segment_update(grn_ii *ii, uint32_t lseg, uint32_t pseg) { BGQENQUE(lseg); // smb_wmb(); ii->header->binfo[lseg] = pseg; if (lseg >= ii->header->bmax) { ii->header->bmax = lseg + 1; } } inline static void buffer_segment_clear(grn_ii *ii, uint32_t lseg) { BGQENQUE(lseg); // smb_wmb(); ii->header->binfo[lseg] = GRN_II_PSEG_NOT_ASSIGNED; } /* chunk */ #define HEADER_CHUNK_AT(ii,offset) \ ((((ii)->header->chunks[((offset) >> 3)]) >> ((offset) & 7)) & 1) #define HEADER_CHUNK_ON(ii,offset) \ (((ii)->header->chunks[((offset) >> 3)]) |= (1 << ((offset) & 7))) #define HEADER_CHUNK_OFF(ii,offset) \ (((ii)->header->chunks[((offset) >> 3)]) &= ~(1 << ((offset) & 7))) #define N_GARBAGES_TH 1 #define N_GARBAGES ((S_GARBAGE - (sizeof(uint32_t) * 4))/(sizeof(uint32_t))) typedef struct { uint32_t head; uint32_t tail; uint32_t nrecs; uint32_t next; uint32_t recs[N_GARBAGES]; } grn_ii_ginfo; #define WIN_MAP(chunk,ctx,iw,seg,pos,size,mode)\ grn_io_win_map(chunk, ctx, iw,\ ((seg) >> GRN_II_N_CHUNK_VARIATION),\ (((seg) & ((1 << GRN_II_N_CHUNK_VARIATION) - 1)) << GRN_II_W_LEAST_CHUNK) + (pos),\ size, 
mode) /* static int new_histogram[32]; static int free_histogram[32]; */ static grn_rc chunk_new(grn_ctx *ctx, grn_ii *ii, uint32_t *res, uint32_t size) { uint32_t n_chunks; n_chunks = ii->chunk->header->max_segment; /* if (size) { int m, es = size - 1; GRN_BIT_SCAN_REV(es, m); m++; new_histogram[m]++; } */ if (size > S_CHUNK) { int j; uint32_t n = (size + S_CHUNK - 1) >> GRN_II_W_CHUNK, i; for (i = 0, j = -1; i < n_chunks; i++) { if (HEADER_CHUNK_AT(ii, i)) { j = i; } else { if (i == j + n) { j++; *res = j << GRN_II_N_CHUNK_VARIATION; for (; j <= (int) i; j++) { HEADER_CHUNK_ON(ii, j); } return GRN_SUCCESS; } } } { DEFINE_NAME(ii); MERR("[ii][chunk][new] index is full: " "<%.*s>: " "size:<%u>, n-chunks:<%u>", name_size, name, size, n_chunks); } return ctx->rc; } else { uint32_t *vp; int m, aligned_size; if (size > (1 << GRN_II_W_LEAST_CHUNK)) { int es = size - 1; GRN_BIT_SCAN_REV(es, m); m++; } else { m = GRN_II_W_LEAST_CHUNK; } aligned_size = 1 << (m - GRN_II_W_LEAST_CHUNK); if (ii->header->ngarbages[m - GRN_II_W_LEAST_CHUNK] > N_GARBAGES_TH) { grn_ii_ginfo *ginfo; uint32_t *gseg; grn_io_win iw, iw_; iw_.addr = NULL; gseg = &ii->header->garbages[m - GRN_II_W_LEAST_CHUNK]; while (*gseg != GRN_II_PSEG_NOT_ASSIGNED) { ginfo = WIN_MAP(ii->chunk, ctx, &iw, *gseg, 0, S_GARBAGE, grn_io_rdwr); //GRN_IO_SEG_MAP2(ii->chunk, *gseg, ginfo); if (!ginfo) { if (iw_.addr) { grn_io_win_unmap(&iw_); } { DEFINE_NAME(ii); MERR("[ii][chunk][new] failed to allocate garbage segment: " "<%.*s>: " "n-garbages:<%u>, size:<%u>, n-chunks:<%u>", name_size, name, ii->header->ngarbages[m - GRN_II_W_LEAST_CHUNK], size, n_chunks); } return ctx->rc; } if (ginfo->next != GRN_II_PSEG_NOT_ASSIGNED || ginfo->nrecs > N_GARBAGES_TH) { *res = ginfo->recs[ginfo->tail]; if (++ginfo->tail == N_GARBAGES) { ginfo->tail = 0; } ginfo->nrecs--; ii->header->ngarbages[m - GRN_II_W_LEAST_CHUNK]--; if (!ginfo->nrecs) { HEADER_CHUNK_OFF(ii, *gseg); *gseg = ginfo->next; } if (iw_.addr) { grn_io_win_unmap(&iw_); } 
grn_io_win_unmap(&iw); return GRN_SUCCESS; } if (iw_.addr) { grn_io_win_unmap(&iw_); } iw_ = iw; gseg = &ginfo->next; } if (iw_.addr) { grn_io_win_unmap(&iw_); } } vp = &ii->header->free_chunks[m - GRN_II_W_LEAST_CHUNK]; if (*vp == GRN_II_PSEG_NOT_ASSIGNED) { int i = 0; while (HEADER_CHUNK_AT(ii, i)) { if (++i >= (int) n_chunks) { DEFINE_NAME(ii); MERR("[ii][chunk][new] failed to find a free chunk: " "<%.*s>: " "index:<%u>, size:<%u>, n-chunks:<%u>", name_size, name, m - GRN_II_W_LEAST_CHUNK, size, n_chunks); return ctx->rc; } } HEADER_CHUNK_ON(ii, i); *vp = i << GRN_II_N_CHUNK_VARIATION; } *res = *vp; *vp += 1 << (m - GRN_II_W_LEAST_CHUNK); if (!(*vp & ((1 << GRN_II_N_CHUNK_VARIATION) - 1))) { *vp = GRN_II_PSEG_NOT_ASSIGNED; } return GRN_SUCCESS; } } static grn_rc chunk_free(grn_ctx *ctx, grn_ii *ii, uint32_t offset, uint32_t dummy, uint32_t size) { /* if (size) { int m, es = size - 1; GRN_BIT_SCAN_REV(es, m); m++; free_histogram[m]++; } */ grn_io_win iw, iw_; grn_ii_ginfo *ginfo= 0; uint32_t seg, m, *gseg; seg = offset >> GRN_II_N_CHUNK_VARIATION; if (size > S_CHUNK) { int n = (size + S_CHUNK - 1) >> GRN_II_W_CHUNK; for (; n--; seg++) { HEADER_CHUNK_OFF(ii, seg); } return GRN_SUCCESS; } if (size > (1 << GRN_II_W_LEAST_CHUNK)) { int es = size - 1; GRN_BIT_SCAN_REV(es, m); m++; } else { m = GRN_II_W_LEAST_CHUNK; } gseg = &ii->header->garbages[m - GRN_II_W_LEAST_CHUNK]; iw_.addr = NULL; while (*gseg != GRN_II_PSEG_NOT_ASSIGNED) { ginfo = WIN_MAP(ii->chunk, ctx, &iw, *gseg, 0, S_GARBAGE, grn_io_rdwr); // GRN_IO_SEG_MAP2(ii->chunk, *gseg, ginfo); if (!ginfo) { if (iw_.addr) { grn_io_win_unmap(&iw_); } return GRN_NO_MEMORY_AVAILABLE; } if (ginfo->nrecs < N_GARBAGES) { break; } if (iw_.addr) { grn_io_win_unmap(&iw_); } iw_ = iw; gseg = &ginfo->next; } if (*gseg == GRN_II_PSEG_NOT_ASSIGNED) { grn_rc rc; if ((rc = chunk_new(ctx, ii, gseg, S_GARBAGE))) { if (iw_.addr) { grn_io_win_unmap(&iw_); } return rc; } ginfo = WIN_MAP(ii->chunk, ctx, &iw, *gseg, 0, S_GARBAGE, 
grn_io_rdwr); /* uint32_t i = 0; while (HEADER_CHUNK_AT(ii, i)) { if (++i >= ii->chunk->header->max_segment) { return GRN_NO_MEMORY_AVAILABLE; } } HEADER_CHUNK_ON(ii, i); *gseg = i; GRN_IO_SEG_MAP2(ii->chunk, *gseg, ginfo); */ if (!ginfo) { if (iw_.addr) { grn_io_win_unmap(&iw_); } return GRN_NO_MEMORY_AVAILABLE; } ginfo->head = 0; ginfo->tail = 0; ginfo->nrecs = 0; ginfo->next = GRN_II_PSEG_NOT_ASSIGNED; } if (iw_.addr) { grn_io_win_unmap(&iw_); } ginfo->recs[ginfo->head] = offset; if (++ginfo->head == N_GARBAGES) { ginfo->head = 0; } ginfo->nrecs++; grn_io_win_unmap(&iw); ii->header->ngarbages[m - GRN_II_W_LEAST_CHUNK]++; return GRN_SUCCESS; } #define UNIT_SIZE 0x80 #define UNIT_MASK (UNIT_SIZE - 1) /* */ static uint8_t * pack_1(uint32_t *p, uint8_t *rp) { uint8_t v; v = *p++ << 7; v += *p++ << 6; v += *p++ << 5; v += *p++ << 4; v += *p++ << 3; v += *p++ << 2; v += *p++ << 1; *rp++ = v + *p++; return rp; } static uint8_t * unpack_1(uint32_t *p, uint8_t *dp) { *p++ = (*dp >> 7); *p++ = ((*dp >> 6) & 0x1); *p++ = ((*dp >> 5) & 0x1); *p++ = ((*dp >> 4) & 0x1); *p++ = ((*dp >> 3) & 0x1); *p++ = ((*dp >> 2) & 0x1); *p++ = ((*dp >> 1) & 0x1); *p++ = (*dp++ & 0x1); return dp; } static uint8_t * pack_2(uint32_t *p, uint8_t *rp) { uint8_t v; v = *p++ << 6; v += *p++ << 4; v += *p++ << 2; *rp++ = v + *p++; v = *p++ << 6; v += *p++ << 4; v += *p++ << 2; *rp++ = v + *p++; return rp; } static uint8_t * unpack_2(uint32_t *p, uint8_t *dp) { *p++ = (*dp >> 6); *p++ = ((*dp >> 4) & 0x3); *p++ = ((*dp >> 2) & 0x3); *p++ = (*dp++ & 0x3); *p++ = (*dp >> 6); *p++ = ((*dp >> 4) & 0x3); *p++ = ((*dp >> 2) & 0x3); *p++ = (*dp++ & 0x3); return dp; } static uint8_t * pack_3(uint32_t *p, uint8_t *rp) { uint8_t v; v = *p++ << 5; v += *p++ << 2; *rp++ = v + (*p >> 1); v = *p++ << 7; v += *p++ << 4; v += *p++ << 1; *rp++ = v + (*p >> 2); v = *p++ << 6; v += *p++ << 3; *rp++ = v + *p++; return rp; } static uint8_t * unpack_3(uint32_t *p, uint8_t *dp) { uint32_t v; *p++ = (*dp >> 5); *p++ = 
((*dp >> 2) & 0x7); v = ((*dp++ << 1) & 0x7); *p++ = v + (*dp >> 7); *p++ = ((*dp >> 4) & 0x7); *p++ = ((*dp >> 1) & 0x7); v = ((*dp++ << 2) & 0x7); *p++ = v + (*dp >> 6); *p++ = ((*dp >> 3) & 0x7); *p++ = (*dp++ & 0x7); return dp; } static uint8_t * pack_4(uint32_t *p, uint8_t *rp) { uint8_t v; v = *p++ << 4; *rp++ = v + *p++; v = *p++ << 4; *rp++ = v + *p++; v = *p++ << 4; *rp++ = v + *p++; v = *p++ << 4; *rp++ = v + *p++; return rp; } static uint8_t * unpack_4(uint32_t *p, uint8_t *dp) { *p++ = (*dp >> 4); *p++ = (*dp++ & 0xf); *p++ = (*dp >> 4); *p++ = (*dp++ & 0xf); *p++ = (*dp >> 4); *p++ = (*dp++ & 0xf); *p++ = (*dp >> 4); *p++ = (*dp++ & 0xf); return dp; } static uint8_t * pack_5(uint32_t *p, uint8_t *rp) { uint8_t v; v = *p++ << 3; *rp++ = v + (*p >> 2); v = *p++ << 6; v += *p++ << 1; *rp++ = v + (*p >> 4); v = *p++ << 4; *rp++ = v + (*p >> 1); v = *p++ << 7; v += *p++ << 2; *rp++ = v + (*p >> 3); v = *p++ << 5; *rp++ = v + *p++; return rp; } static uint8_t * unpack_5(uint32_t *p, uint8_t *dp) { uint32_t v; *p++ = (*dp >> 3); v = ((*dp++ << 2) & 0x1f); *p++ = v + (*dp >> 6); *p++ = ((*dp >> 1) & 0x1f); v = ((*dp++ << 4) & 0x1f); *p++ = v + (*dp >> 4); v = ((*dp++ << 1) & 0x1f); *p++ = v + (*dp >> 7); *p++ = ((*dp >> 2) & 0x1f); v = ((*dp++ << 3) & 0x1f); *p++ = v + (*dp >> 5); *p++ = (*dp++ & 0x1f); return dp; } static uint8_t * pack_6(uint32_t *p, uint8_t *rp) { uint8_t v; v = *p++ << 2; *rp++ = v + (*p >> 4); v = *p++ << 4; *rp++ = v + (*p >> 2); v = *p++ << 6; *rp++ = v + *p++; v = *p++ << 2; *rp++ = v + (*p >> 4); v = *p++ << 4; *rp++ = v + (*p >> 2); v = *p++ << 6; *rp++ = v + *p++; return rp; } static uint8_t * unpack_6(uint32_t *p, uint8_t *dp) { uint32_t v; *p++ = (*dp >> 2); v = ((*dp++ << 4) & 0x3f); *p++ = v + (*dp >> 4); v = ((*dp++ << 2) & 0x3f); *p++ = v + (*dp >> 6); *p++ = (*dp++ & 0x3f); *p++ = (*dp >> 2); v = ((*dp++ << 4) & 0x3f); *p++ = v + (*dp >> 4); v = ((*dp++ << 2) & 0x3f); *p++ = v + (*dp >> 6); *p++ = (*dp++ & 0x3f); return dp; } 
/*
 * Fixed-width bit packing, widths 7..22 (continuation of the pack_N /
 * unpack_N family).
 *
 * pack_N() reads 8 values from p and writes them as N consecutive bytes at
 * rp, N bits per value, first value in the most significant bits; it returns
 * rp advanced by N.  Values are combined with `+` and are not masked, so each
 * is presumably expected to fit in N bits.
 *
 * unpack_N() is the inverse: it reads N bytes from dp, stores 8 values in p
 * and returns dp advanced by N.
 */

/* 8 x 7 bits -> 7 bytes */
static uint8_t *
pack_7(uint32_t *p, uint8_t *rp)
{
  uint8_t v;
  v = *p++ << 1; *rp++ = v + (*p >> 6);
  v = *p++ << 2; *rp++ = v + (*p >> 5);
  v = *p++ << 3; *rp++ = v + (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 3);
  v = *p++ << 5; *rp++ = v + (*p >> 2);
  v = *p++ << 6; *rp++ = v + (*p >> 1);
  v = *p++ << 7; *rp++ = v + *p++;
  return rp;
}

/* 7 bytes -> 8 x 7 bits */
static uint8_t *
unpack_7(uint32_t *p, uint8_t *dp)
{
  uint32_t v;
  *p++ = (*dp >> 1);
  v = ((*dp++ << 6) & 0x7f); *p++ = v + (*dp >> 2);
  v = ((*dp++ << 5) & 0x7f); *p++ = v + (*dp >> 3);
  v = ((*dp++ << 4) & 0x7f); *p++ = v + (*dp >> 4);
  v = ((*dp++ << 3) & 0x7f); *p++ = v + (*dp >> 5);
  v = ((*dp++ << 2) & 0x7f); *p++ = v + (*dp >> 6);
  v = ((*dp++ << 1) & 0x7f); *p++ = v + (*dp >> 7);
  *p++ = (*dp++ & 0x7f);
  return dp;
}

/* 8 x 8 bits -> 8 bytes (low byte of each value) */
static uint8_t *
pack_8(uint32_t *p, uint8_t *rp)
{
  *rp++ = *p++;
  *rp++ = *p++;
  *rp++ = *p++;
  *rp++ = *p++;
  *rp++ = *p++;
  *rp++ = *p++;
  *rp++ = *p++;
  *rp++ = *p++;
  return rp;
}

/* 8 bytes -> 8 x 8 bits */
static uint8_t *
unpack_8(uint32_t *p, uint8_t *dp)
{
  *p++ = *dp++;
  *p++ = *dp++;
  *p++ = *dp++;
  *p++ = *dp++;
  *p++ = *dp++;
  *p++ = *dp++;
  *p++ = *dp++;
  *p++ = *dp++;
  return dp;
}

/* 8 x 9 bits -> 9 bytes */
static uint8_t *
pack_9(uint32_t *p, uint8_t *rp)
{
  uint8_t v;
  *rp++ = (*p >> 1);
  v = *p++ << 7; *rp++ = v + (*p >> 2);
  v = *p++ << 6; *rp++ = v + (*p >> 3);
  v = *p++ << 5; *rp++ = v + (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 5);
  v = *p++ << 3; *rp++ = v + (*p >> 6);
  v = *p++ << 2; *rp++ = v + (*p >> 7);
  v = *p++ << 1; *rp++ = v + (*p >> 8);
  *rp++ = *p++;
  return rp;
}

/* 9 bytes -> 8 x 9 bits */
static uint8_t *
unpack_9(uint32_t *p, uint8_t *dp)
{
  uint32_t v;
  v = *dp++ << 1; *p++ = v + (*dp >> 7);
  v = ((*dp++ << 2) & 0x1ff); *p++ = v + (*dp >> 6);
  v = ((*dp++ << 3) & 0x1ff); *p++ = v + (*dp >> 5);
  v = ((*dp++ << 4) & 0x1ff); *p++ = v + (*dp >> 4);
  v = ((*dp++ << 5) & 0x1ff); *p++ = v + (*dp >> 3);
  v = ((*dp++ << 6) & 0x1ff); *p++ = v + (*dp >> 2);
  v = ((*dp++ << 7) & 0x1ff); *p++ = v + (*dp >> 1);
  v = ((*dp++ << 8) & 0x1ff); *p++ = v + *dp++;
  return dp;
}

/* 8 x 10 bits -> 10 bytes */
static uint8_t *
pack_10(uint32_t *p, uint8_t *rp)
{
  uint8_t v;
  *rp++ = (*p >> 2);
  v = *p++ << 6; *rp++ = v + (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 6);
  v = *p++ << 2; *rp++ = v + (*p >> 8);
  *rp++ = *p++;
  *rp++ = (*p >> 2);
  v = *p++ << 6; *rp++ = v + (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 6);
  v = *p++ << 2; *rp++ = v + (*p >> 8);
  *rp++ = *p++;
  return rp;
}

/* 10 bytes -> 8 x 10 bits */
static uint8_t *
unpack_10(uint32_t *p, uint8_t *dp)
{
  uint32_t v;
  v = *dp++ << 2; *p++ = v + (*dp >> 6);
  v = ((*dp++ << 4) & 0x3ff); *p++ = v + (*dp >> 4);
  v = ((*dp++ << 6) & 0x3ff); *p++ = v + (*dp >> 2);
  v = ((*dp++ << 8) & 0x3ff); *p++ = v + *dp++;
  v = *dp++ << 2; *p++ = v + (*dp >> 6);
  v = ((*dp++ << 4) & 0x3ff); *p++ = v + (*dp >> 4);
  v = ((*dp++ << 6) & 0x3ff); *p++ = v + (*dp >> 2);
  v = ((*dp++ << 8) & 0x3ff); *p++ = v + *dp++;
  return dp;
}

/* 8 x 11 bits -> 11 bytes */
static uint8_t *
pack_11(uint32_t *p, uint8_t *rp)
{
  uint8_t v;
  *rp++ = (*p >> 3);
  v = *p++ << 5; *rp++ = v + (*p >> 6);
  v = *p++ << 2; *rp++ = v + (*p >> 9);
  *rp++ = (*p >> 1);
  v = *p++ << 7; *rp++ = v + (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 7);
  v = *p++ << 1; *rp++ = v + (*p >> 10);
  *rp++ = (*p >> 2);
  v = *p++ << 6; *rp++ = v + (*p >> 5);
  v = *p++ << 3; *rp++ = v + (*p >> 8);
  *rp++ = *p++;
  return rp;
}

/* 11 bytes -> 8 x 11 bits */
static uint8_t *
unpack_11(uint32_t *p, uint8_t *dp)
{
  uint32_t v;
  v = *dp++ << 3; *p++ = v + (*dp >> 5);
  v = ((*dp++ << 6) & 0x7ff); *p++ = v + (*dp >> 2);
  v = ((*dp++ << 9) & 0x7ff); v += *dp++ << 1; *p++ = v + (*dp >> 7);
  v = ((*dp++ << 4) & 0x7ff); *p++ = v + (*dp >> 4);
  v = ((*dp++ << 7) & 0x7ff); *p++ = v + (*dp >> 1);
  v = ((*dp++ << 10) & 0x7ff); v += *dp++ << 2; *p++ = v + (*dp >> 6);
  v = ((*dp++ << 5) & 0x7ff); *p++ = v + (*dp >> 3);
  v = ((*dp++ << 8) & 0x7ff); *p++ = v + *dp++;
  return dp;
}

/* 8 x 12 bits -> 12 bytes */
static uint8_t *
pack_12(uint32_t *p, uint8_t *rp)
{
  uint8_t v;
  *rp++ = (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 8);
  *rp++ = *p++;
  *rp++ = (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 8);
  *rp++ = *p++;
  *rp++ = (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 8);
  *rp++ = *p++;
  *rp++ = (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 8);
  *rp++ = *p++;
  return rp;
}

/* 12 bytes -> 8 x 12 bits */
static uint8_t *
unpack_12(uint32_t *p, uint8_t *dp)
{
  uint32_t v;
  v = *dp++ << 4; *p++ = v + (*dp >> 4);
  v = ((*dp++ << 8) & 0xfff); *p++ = v + *dp++;
  v = *dp++ << 4; *p++ = v + (*dp >> 4);
  v = ((*dp++ << 8) & 0xfff); *p++ = v + *dp++;
  v = *dp++ << 4; *p++ = v + (*dp >> 4);
  v = ((*dp++ << 8) & 0xfff); *p++ = v + *dp++;
  v = *dp++ << 4; *p++ = v + (*dp >> 4);
  v = ((*dp++ << 8) & 0xfff); *p++ = v + *dp++;
  return dp;
}

/* 8 x 13 bits -> 13 bytes */
static uint8_t *
pack_13(uint32_t *p, uint8_t *rp)
{
  uint8_t v;
  *rp++ = (*p >> 5);
  v = *p++ << 3; *rp++ = v + (*p >> 10);
  *rp++ = (*p >> 2);
  v = *p++ << 6; *rp++ = v + (*p >> 7);
  v = *p++ << 1; *rp++ = v + (*p >> 12);
  *rp++ = (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 9);
  *rp++ = (*p >> 1);
  v = *p++ << 7; *rp++ = v + (*p >> 6);
  v = *p++ << 2; *rp++ = v + (*p >> 11);
  *rp++ = (*p >> 3);
  v = *p++ << 5; *rp++ = v + (*p >> 8);
  *rp++ = *p++;
  return rp;
}

/* 13 bytes -> 8 x 13 bits */
static uint8_t *
unpack_13(uint32_t *p, uint8_t *dp)
{
  uint32_t v;
  v = *dp++ << 5; *p++ = v + (*dp >> 3);
  v = ((*dp++ << 10) & 0x1fff); v += *dp++ << 2; *p++ = v + (*dp >> 6);
  v = ((*dp++ << 7) & 0x1fff); *p++ = v + (*dp >> 1);
  v = ((*dp++ << 12) & 0x1fff); v += *dp++ << 4; *p++ = v + (*dp >> 4);
  v = ((*dp++ << 9) & 0x1fff); v += *dp++ << 1; *p++ = v + (*dp >> 7);
  v = ((*dp++ << 6) & 0x1fff); *p++ = v + (*dp >> 2);
  v = ((*dp++ << 11) & 0x1fff); v += *dp++ << 3; *p++ = v + (*dp >> 5);
  v = ((*dp++ << 8) & 0x1fff); *p++ = v + *dp++;
  return dp;
}

/* 8 x 14 bits -> 14 bytes */
static uint8_t *
pack_14(uint32_t *p, uint8_t *rp)
{
  uint8_t v;
  *rp++ = (*p >> 6);
  v = *p++ << 2; *rp++ = v + (*p >> 12);
  *rp++ = (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 10);
  *rp++ = (*p >> 2);
  v = *p++ << 6; *rp++ = v + (*p >> 8);
  *rp++ = *p++;
  *rp++ = (*p >> 6);
  v = *p++ << 2; *rp++ = v + (*p >> 12);
  *rp++ = (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 10);
  *rp++ = (*p >> 2);
  v = *p++ << 6; *rp++ = v + (*p >> 8);
  *rp++ = *p++;
  return rp;
}

/* 14 bytes -> 8 x 14 bits */
static uint8_t *
unpack_14(uint32_t *p, uint8_t *dp)
{
  uint32_t v;
  v = *dp++ << 6; *p++ = v + (*dp >> 2);
  v = ((*dp++ << 12) & 0x3fff); v += *dp++ << 4; *p++ = v + (*dp >> 4);
  v = ((*dp++ << 10) & 0x3fff); v += *dp++ << 2; *p++ = v + (*dp >> 6);
  v = ((*dp++ << 8) & 0x3fff); *p++ = v + *dp++;
  v = *dp++ << 6; *p++ = v + (*dp >> 2);
  v = ((*dp++ << 12) & 0x3fff); v += *dp++ << 4; *p++ = v + (*dp >> 4);
  v = ((*dp++ << 10) & 0x3fff); v += *dp++ << 2; *p++ = v + (*dp >> 6);
  v = ((*dp++ << 8) & 0x3fff); *p++ = v + *dp++;
  return dp;
}

/* 8 x 15 bits -> 15 bytes */
static uint8_t *
pack_15(uint32_t *p, uint8_t *rp)
{
  uint8_t v;
  *rp++ = (*p >> 7);
  v = *p++ << 1; *rp++ = v + (*p >> 14);
  *rp++ = (*p >> 6);
  v = *p++ << 2; *rp++ = v + (*p >> 13);
  *rp++ = (*p >> 5);
  v = *p++ << 3; *rp++ = v + (*p >> 12);
  *rp++ = (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 11);
  *rp++ = (*p >> 3);
  v = *p++ << 5; *rp++ = v + (*p >> 10);
  *rp++ = (*p >> 2);
  v = *p++ << 6; *rp++ = v + (*p >> 9);
  *rp++ = (*p >> 1);
  v = *p++ << 7; *rp++ = v + (*p >> 8);
  *rp++ = *p++;
  return rp;
}

/* 15 bytes -> 8 x 15 bits */
static uint8_t *
unpack_15(uint32_t *p, uint8_t *dp)
{
  uint32_t v;
  v = *dp++ << 7; *p++ = v + (*dp >> 1);
  v = ((*dp++ << 14) & 0x7fff); v += *dp++ << 6; *p++ = v + (*dp >> 2);
  v = ((*dp++ << 13) & 0x7fff); v += *dp++ << 5; *p++ = v + (*dp >> 3);
  v = ((*dp++ << 12) & 0x7fff); v += *dp++ << 4; *p++ = v + (*dp >> 4);
  v = ((*dp++ << 11) & 0x7fff); v += *dp++ << 3; *p++ = v + (*dp >> 5);
  v = ((*dp++ << 10) & 0x7fff); v += *dp++ << 2; *p++ = v + (*dp >> 6);
  v = ((*dp++ << 9) & 0x7fff); v += *dp++ << 1; *p++ = v + (*dp >> 7);
  v = ((*dp++ << 8) & 0x7fff); *p++ = v + *dp++;
  return dp;
}

/* 8 x 16 bits -> 16 bytes (high byte first) */
static uint8_t *
pack_16(uint32_t *p, uint8_t *rp)
{
  *rp++ = (*p >> 8); *rp++ = *p++;
  *rp++ = (*p >> 8); *rp++ = *p++;
  *rp++ = (*p >> 8); *rp++ = *p++;
  *rp++ = (*p >> 8); *rp++ = *p++;
  *rp++ = (*p >> 8); *rp++ = *p++;
  *rp++ = (*p >> 8); *rp++ = *p++;
  *rp++ = (*p >> 8); *rp++ = *p++;
  *rp++ = (*p >> 8); *rp++ = *p++;
  return rp;
}

/* 16 bytes -> 8 x 16 bits */
static uint8_t *
unpack_16(uint32_t *p, uint8_t *dp)
{
  uint32_t v;
  v = *dp++ << 8; *p++ = v + *dp++;
  v = *dp++ << 8; *p++ = v + *dp++;
  v = *dp++ << 8; *p++ = v + *dp++;
  v = *dp++ << 8; *p++ = v + *dp++;
  v = *dp++ << 8; *p++ = v + *dp++;
  v = *dp++ << 8; *p++ = v + *dp++;
  v = *dp++ << 8; *p++ = v + *dp++;
  v = *dp++ << 8; *p++ = v + *dp++;
  return dp;
}

/* 8 x 17 bits -> 17 bytes */
static uint8_t *
pack_17(uint32_t *p, uint8_t *rp)
{
  uint8_t v;
  *rp++ = (*p >> 9); *rp++ = (*p >> 1);
  v = *p++ << 7; *rp++ = v + (*p >> 10); *rp++ = (*p >> 2);
  v = *p++ << 6; *rp++ = v + (*p >> 11); *rp++ = (*p >> 3);
  v = *p++ << 5; *rp++ = v + (*p >> 12); *rp++ = (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 13); *rp++ = (*p >> 5);
  v = *p++ << 3; *rp++ = v + (*p >> 14); *rp++ = (*p >> 6);
  v = *p++ << 2; *rp++ = v + (*p >> 15); *rp++ = (*p >> 7);
  v = *p++ << 1; *rp++ = v + (*p >> 16); *rp++ = (*p >> 8);
  *rp++ = *p++;
  return rp;
}

/* 17 bytes -> 8 x 17 bits */
static uint8_t *
unpack_17(uint32_t *p, uint8_t *dp)
{
  uint32_t v;
  v = *dp++ << 9; v += *dp++ << 1; *p++ = v + (*dp >> 7);
  v = ((*dp++ << 10) & 0x1ffff); v += *dp++ << 2; *p++ = v + (*dp >> 6);
  v = ((*dp++ << 11) & 0x1ffff); v += *dp++ << 3; *p++ = v + (*dp >> 5);
  v = ((*dp++ << 12) & 0x1ffff); v += *dp++ << 4; *p++ = v + (*dp >> 4);
  v = ((*dp++ << 13) & 0x1ffff); v += *dp++ << 5; *p++ = v + (*dp >> 3);
  v = ((*dp++ << 14) & 0x1ffff); v += *dp++ << 6; *p++ = v + (*dp >> 2);
  v = ((*dp++ << 15) & 0x1ffff); v += *dp++ << 7; *p++ = v + (*dp >> 1);
  v = ((*dp++ << 16) & 0x1ffff); v += *dp++ << 8; *p++ = v + *dp++;
  return dp;
}

/* 8 x 18 bits -> 18 bytes */
static uint8_t *
pack_18(uint32_t *p, uint8_t *rp)
{
  uint8_t v;
  *rp++ = (*p >> 10); *rp++ = (*p >> 2);
  v = *p++ << 6; *rp++ = v + (*p >> 12); *rp++ = (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 14); *rp++ = (*p >> 6);
  v = *p++ << 2; *rp++ = v + (*p >> 16); *rp++ = (*p >> 8);
  *rp++ = *p++;
  *rp++ = (*p >> 10); *rp++ = (*p >> 2);
  v = *p++ << 6; *rp++ = v + (*p >> 12); *rp++ = (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 14); *rp++ = (*p >> 6);
  v = *p++ << 2; *rp++ = v + (*p >> 16); *rp++ = (*p >> 8);
  *rp++ = *p++;
  return rp;
}

/* 18 bytes -> 8 x 18 bits */
static uint8_t *
unpack_18(uint32_t *p, uint8_t *dp)
{
  uint32_t v;
  v = *dp++ << 10; v += *dp++ << 2; *p++ = v + (*dp >> 6);
  v = ((*dp++ << 12) & 0x3ffff); v += *dp++ << 4; *p++ = v + (*dp >> 4);
  v = ((*dp++ << 14) & 0x3ffff); v += *dp++ << 6; *p++ = v + (*dp >> 2);
  v = ((*dp++ << 16) & 0x3ffff); v += *dp++ << 8; *p++ = v + *dp++;
  v = *dp++ << 10; v += *dp++ << 2; *p++ = v + (*dp >> 6);
  v = ((*dp++ << 12) & 0x3ffff); v += *dp++ << 4; *p++ = v + (*dp >> 4);
  v = ((*dp++ << 14) & 0x3ffff); v += *dp++ << 6; *p++ = v + (*dp >> 2);
  v = ((*dp++ << 16) & 0x3ffff); v += *dp++ << 8; *p++ = v + *dp++;
  return dp;
}

/* 8 x 19 bits -> 19 bytes */
static uint8_t *
pack_19(uint32_t *p, uint8_t *rp)
{
  uint8_t v;
  *rp++ = (*p >> 11); *rp++ = (*p >> 3);
  v = *p++ << 5; *rp++ = v + (*p >> 14); *rp++ = (*p >> 6);
  v = *p++ << 2; *rp++ = v + (*p >> 17); *rp++ = (*p >> 9); *rp++ = (*p >> 1);
  v = *p++ << 7; *rp++ = v + (*p >> 12); *rp++ = (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 15); *rp++ = (*p >> 7);
  v = *p++ << 1; *rp++ = v + (*p >> 18); *rp++ = (*p >> 10); *rp++ = (*p >> 2);
  v = *p++ << 6; *rp++ = v + (*p >> 13); *rp++ = (*p >> 5);
  v = *p++ << 3; *rp++ = v + (*p >> 16); *rp++ = (*p >> 8);
  *rp++ = *p++;
  return rp;
}

/* 19 bytes -> 8 x 19 bits */
static uint8_t *
unpack_19(uint32_t *p, uint8_t *dp)
{
  uint32_t v;
  v = *dp++ << 11; v += *dp++ << 3; *p++ = v + (*dp >> 5);
  v = ((*dp++ << 14) & 0x7ffff); v += *dp++ << 6; *p++ = v + (*dp >> 2);
  v = ((*dp++ << 17) & 0x7ffff); v += *dp++ << 9; v += *dp++ << 1; *p++ = v + (*dp >> 7);
  v = ((*dp++ << 12) & 0x7ffff); v += *dp++ << 4; *p++ = v + (*dp >> 4);
  v = ((*dp++ << 15) & 0x7ffff); v += *dp++ << 7; *p++ = v + (*dp >> 1);
  v = ((*dp++ << 18) & 0x7ffff); v += *dp++ << 10; v += *dp++ << 2; *p++ = v + (*dp >> 6);
  v = ((*dp++ << 13) & 0x7ffff); v += *dp++ << 5; *p++ = v + (*dp >> 3);
  v = ((*dp++ << 16) & 0x7ffff); v += *dp++ << 8; *p++ = v + *dp++;
  return dp;
}

/* 8 x 20 bits -> 20 bytes */
static uint8_t *
pack_20(uint32_t *p, uint8_t *rp)
{
  uint8_t v;
  *rp++ = (*p >> 12); *rp++ = (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 16); *rp++ = (*p >> 8);
  *rp++ = *p++;
  *rp++ = (*p >> 12); *rp++ = (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 16); *rp++ = (*p >> 8);
  *rp++ = *p++;
  *rp++ = (*p >> 12); *rp++ = (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 16); *rp++ = (*p >> 8);
  *rp++ = *p++;
  *rp++ = (*p >> 12); *rp++ = (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 16); *rp++ = (*p >> 8);
  *rp++ = *p++;
  return rp;
}

/* 20 bytes -> 8 x 20 bits */
static uint8_t *
unpack_20(uint32_t *p, uint8_t *dp)
{
  uint32_t v;
  v = *dp++ << 12; v += *dp++ << 4; *p++ = v + (*dp >> 4);
  v = ((*dp++ << 16) & 0xfffff); v += *dp++ << 8; *p++ = v + *dp++;
  v = *dp++ << 12; v += *dp++ << 4; *p++ = v + (*dp >> 4);
  v = ((*dp++ << 16) & 0xfffff); v += *dp++ << 8; *p++ = v + *dp++;
  v = *dp++ << 12; v += *dp++ << 4; *p++ = v + (*dp >> 4);
  v = ((*dp++ << 16) & 0xfffff); v += *dp++ << 8; *p++ = v + *dp++;
  v = *dp++ << 12; v += *dp++ << 4; *p++ = v + (*dp >> 4);
  v = ((*dp++ << 16) & 0xfffff); v += *dp++ << 8; *p++ = v + *dp++;
  return dp;
}

/* 8 x 21 bits -> 21 bytes */
static uint8_t *
pack_21(uint32_t *p, uint8_t *rp)
{
  uint8_t v;
  *rp++ = (*p >> 13); *rp++ = (*p >> 5);
  v = *p++ << 3; *rp++ = v + (*p >> 18); *rp++ = (*p >> 10); *rp++ = (*p >> 2);
  v = *p++ << 6; *rp++ = v + (*p >> 15); *rp++ = (*p >> 7);
  v = *p++ << 1; *rp++ = v + (*p >> 20); *rp++ = (*p >> 12); *rp++ = (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 17); *rp++ = (*p >> 9); *rp++ = (*p >> 1);
  v = *p++ << 7; *rp++ = v + (*p >> 14); *rp++ = (*p >> 6);
  v = *p++ << 2; *rp++ = v + (*p >> 19); *rp++ = (*p >> 11); *rp++ = (*p >> 3);
  v = *p++ << 5; *rp++ = v + (*p >> 16); *rp++ = (*p >> 8);
  *rp++ = *p++;
  return rp;
}

/* 21 bytes -> 8 x 21 bits */
static uint8_t *
unpack_21(uint32_t *p, uint8_t *dp)
{
  uint32_t v;
  v = *dp++ << 13; v += *dp++ << 5; *p++ = v + (*dp >> 3);
  v = ((*dp++ << 18) & 0x1fffff); v += *dp++ << 10; v += *dp++ << 2; *p++ = v + (*dp >> 6);
  v = ((*dp++ << 15) & 0x1fffff); v += *dp++ << 7; *p++ = v + (*dp >> 1);
  v = ((*dp++ << 20) & 0x1fffff); v += *dp++ << 12; v += *dp++ << 4; *p++ = v + (*dp >> 4);
  v = ((*dp++ << 17) & 0x1fffff); v += *dp++ << 9; v += *dp++ << 1; *p++ = v + (*dp >> 7);
  v = ((*dp++ << 14) & 0x1fffff); v += *dp++ << 6; *p++ = v + (*dp >> 2);
  v = ((*dp++ << 19) & 0x1fffff); v += *dp++ << 11; v += *dp++ << 3; *p++ = v + (*dp >> 5);
  v = ((*dp++ << 16) & 0x1fffff); v += *dp++ << 8; *p++ = v + *dp++;
  return dp;
}

/* 8 x 22 bits -> 22 bytes */
static uint8_t *
pack_22(uint32_t *p, uint8_t *rp)
{
  uint8_t v;
  *rp++ = (*p >> 14); *rp++ = (*p >> 6);
  v = *p++ << 2; *rp++ = v + (*p >> 20); *rp++ = (*p >> 12); *rp++ = (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 18); *rp++ = (*p >> 10); *rp++ = (*p >> 2);
  v = *p++ << 6; *rp++ = v + (*p >> 16); *rp++ = (*p >> 8);
  *rp++ = *p++;
  *rp++ = (*p >> 14); *rp++ = (*p >> 6);
  v = *p++ << 2; *rp++ = v + (*p >> 20); *rp++ = (*p >> 12); *rp++ = (*p >> 4);
  v = *p++ << 4; *rp++ = v + (*p >> 18); *rp++ = (*p >> 10); *rp++ = (*p >> 2);
  v = *p++ << 6; *rp++ = v + (*p >> 16); *rp++ = (*p >> 8);
  *rp++ = *p++;
  return rp;
}

/* 22 bytes -> 8 x 22 bits */
static uint8_t *
unpack_22(uint32_t *p, uint8_t *dp)
{
  uint32_t v;
  v = *dp++ << 14; v += *dp++ << 6; *p++ = v + (*dp >> 2);
  v = ((*dp++ << 20) & 0x3fffff); v += *dp++ << 12; v += *dp++ << 4; *p++ = v + (*dp >> 4);
  v = ((*dp++ << 18) & 0x3fffff); v += *dp++ << 10; v += *dp++ << 2; *p++ = v + (*dp >> 6);
  v = ((*dp++ << 16) & 0x3fffff); v += *dp++ << 8; *p++ = v + *dp++;
  v = *dp++ << 14; v += *dp++ << 6; *p++ = v + (*dp >> 2);
  v = ((*dp++ << 20) & 0x3fffff); v += *dp++ << 12; v += *dp++ << 4; *p++ = v + (*dp >> 4);
  v = ((*dp++ << 18) & 0x3fffff); v += *dp++ << 10; v += *dp++ << 2; *p++ = v + (*dp >> 6);
  v = ((*dp++ << 16) & 0x3fffff); v += *dp++ << 8; *p++ = v + *dp++;
  return dp;
}

/* NOTE(review): pack_23 continues past the end of this section; the text
   below is kept exactly as found so that it still joins with what follows. */
static uint8_t * pack_23(uint32_t *p, uint8_t *rp) { uint8_t v; *rp++ = (*p >> 15); *rp++ = (*p >> 7); v = *p++ << 1; *rp++ = v + (*p >> 22); *rp++ = (*p >> 14); *rp++ = (*p >> 6); v = *p++ << 2; *rp++ = v + (*p >> 21); *rp++ = (*p >> 13); *rp++ = (*p >> 5); v = *p++ << 3; *rp++ = v + (*p >> 20); *rp++ = (*p >> 12); *rp++ = (*p >> 4); v = *p++ << 4; *rp++ = v + (*p >> 19); *rp++ = (*p >> 11); *rp++ = (*p >> 3); v = *p++ << 5; *rp++ = v + (*p >> 18); *rp++ = (*p >> 10); *rp++ = (*p >> 2); v
= *p++ << 6; *rp++ = v + (*p >> 17); *rp++ = (*p >> 9); *rp++ = (*p >> 1); v = *p++ << 7; *rp++ = v + (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; return rp; } static uint8_t * unpack_23(uint32_t *p, uint8_t *dp) { uint32_t v; v = *dp++ << 15; v += *dp++ << 7; *p++ = v + (*dp >> 1); v = ((*dp++ << 22) & 0x7fffff); v += *dp++ << 14; v += *dp++ << 6; *p++ = v + (*dp >> 2); v = ((*dp++ << 21) & 0x7fffff); v += *dp++ << 13; v += *dp++ << 5; *p++ = v + (*dp >> 3); v = ((*dp++ << 20) & 0x7fffff); v += *dp++ << 12; v += *dp++ << 4; *p++ = v + (*dp >> 4); v = ((*dp++ << 19) & 0x7fffff); v += *dp++ << 11; v += *dp++ << 3; *p++ = v + (*dp >> 5); v = ((*dp++ << 18) & 0x7fffff); v += *dp++ << 10; v += *dp++ << 2; *p++ = v + (*dp >> 6); v = ((*dp++ << 17) & 0x7fffff); v += *dp++ << 9; v += *dp++ << 1; *p++ = v + (*dp >> 7); v = ((*dp++ << 16) & 0x7fffff); v += *dp++ << 8; *p++ = v + *dp++; return dp; } static uint8_t * pack_24(uint32_t *p, uint8_t *rp) { *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; return rp; } static uint8_t * unpack_24(uint32_t *p, uint8_t *dp) { uint32_t v; v = *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; v = *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; v = *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; v = *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; v = *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; v = *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; v = *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; v = *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; return dp; } static uint8_t * pack_25(uint32_t *p, uint8_t *rp) { uint8_t v; *rp++ = (*p >> 17); *rp++ = (*p >> 9); *rp++ = 
(*p >> 1); v = *p++ << 7; *rp++ = v + (*p >> 18); *rp++ = (*p >> 10); *rp++ = (*p >> 2); v = *p++ << 6; *rp++ = v + (*p >> 19); *rp++ = (*p >> 11); *rp++ = (*p >> 3); v = *p++ << 5; *rp++ = v + (*p >> 20); *rp++ = (*p >> 12); *rp++ = (*p >> 4); v = *p++ << 4; *rp++ = v + (*p >> 21); *rp++ = (*p >> 13); *rp++ = (*p >> 5); v = *p++ << 3; *rp++ = v + (*p >> 22); *rp++ = (*p >> 14); *rp++ = (*p >> 6); v = *p++ << 2; *rp++ = v + (*p >> 23); *rp++ = (*p >> 15); *rp++ = (*p >> 7); v = *p++ << 1; *rp++ = v + (*p >> 24); *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; return rp; } static uint8_t * unpack_25(uint32_t *p, uint8_t *dp) { uint32_t v; v = *dp++ << 17; v += *dp++ << 9; v += *dp++ << 1; *p++ = v + (*dp >> 7); v = ((*dp++ << 18) & 0x1ffffff); v += *dp++ << 10; v += *dp++ << 2; *p++ = v + (*dp >> 6); v = ((*dp++ << 19) & 0x1ffffff); v += *dp++ << 11; v += *dp++ << 3; *p++ = v + (*dp >> 5); v = ((*dp++ << 20) & 0x1ffffff); v += *dp++ << 12; v += *dp++ << 4; *p++ = v + (*dp >> 4); v = ((*dp++ << 21) & 0x1ffffff); v += *dp++ << 13; v += *dp++ << 5; *p++ = v + (*dp >> 3); v = ((*dp++ << 22) & 0x1ffffff); v += *dp++ << 14; v += *dp++ << 6; *p++ = v + (*dp >> 2); v = ((*dp++ << 23) & 0x1ffffff); v += *dp++ << 15; v += *dp++ << 7; *p++ = v + (*dp >> 1); v = ((*dp++ << 24) & 0x1ffffff); v += *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; return dp; } static uint8_t * pack_26(uint32_t *p, uint8_t *rp) { uint8_t v; *rp++ = (*p >> 18); *rp++ = (*p >> 10); *rp++ = (*p >> 2); v = *p++ << 6; *rp++ = v + (*p >> 20); *rp++ = (*p >> 12); *rp++ = (*p >> 4); v = *p++ << 4; *rp++ = v + (*p >> 22); *rp++ = (*p >> 14); *rp++ = (*p >> 6); v = *p++ << 2; *rp++ = v + (*p >> 24); *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; *rp++ = (*p >> 18); *rp++ = (*p >> 10); *rp++ = (*p >> 2); v = *p++ << 6; *rp++ = v + (*p >> 20); *rp++ = (*p >> 12); *rp++ = (*p >> 4); v = *p++ << 4; *rp++ = v + (*p >> 22); *rp++ = (*p >> 14); *rp++ = (*p >> 6); v = *p++ << 2; *rp++ = v + (*p >> 24); 
*rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; return rp; } static uint8_t * unpack_26(uint32_t *p, uint8_t *dp) { uint32_t v; v = *dp++ << 18; v += *dp++ << 10; v += *dp++ << 2; *p++ = v + (*dp >> 6); v = ((*dp++ << 20) & 0x3ffffff); v += *dp++ << 12; v += *dp++ << 4; *p++ = v + (*dp >> 4); v = ((*dp++ << 22) & 0x3ffffff); v += *dp++ << 14; v += *dp++ << 6; *p++ = v + (*dp >> 2); v = ((*dp++ << 24) & 0x3ffffff); v += *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; v = *dp++ << 18; v += *dp++ << 10; v += *dp++ << 2; *p++ = v + (*dp >> 6); v = ((*dp++ << 20) & 0x3ffffff); v += *dp++ << 12; v += *dp++ << 4; *p++ = v + (*dp >> 4); v = ((*dp++ << 22) & 0x3ffffff); v += *dp++ << 14; v += *dp++ << 6; *p++ = v + (*dp >> 2); v = ((*dp++ << 24) & 0x3ffffff); v += *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; return dp; } static uint8_t * pack_27(uint32_t *p, uint8_t *rp) { uint8_t v; *rp++ = (*p >> 19); *rp++ = (*p >> 11); *rp++ = (*p >> 3); v = *p++ << 5; *rp++ = v + (*p >> 22); *rp++ = (*p >> 14); *rp++ = (*p >> 6); v = *p++ << 2; *rp++ = v + (*p >> 25); *rp++ = (*p >> 17); *rp++ = (*p >> 9); *rp++ = (*p >> 1); v = *p++ << 7; *rp++ = v + (*p >> 20); *rp++ = (*p >> 12); *rp++ = (*p >> 4); v = *p++ << 4; *rp++ = v + (*p >> 23); *rp++ = (*p >> 15); *rp++ = (*p >> 7); v = *p++ << 1; *rp++ = v + (*p >> 26); *rp++ = (*p >> 18); *rp++ = (*p >> 10); *rp++ = (*p >> 2); v = *p++ << 6; *rp++ = v + (*p >> 21); *rp++ = (*p >> 13); *rp++ = (*p >> 5); v = *p++ << 3; *rp++ = v + (*p >> 24); *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; return rp; } static uint8_t * unpack_27(uint32_t *p, uint8_t *dp) { uint32_t v; v = *dp++ << 19; v += *dp++ << 11; v += *dp++ << 3; *p++ = v + (*dp >> 5); v = ((*dp++ << 22) & 0x7ffffff); v += *dp++ << 14; v += *dp++ << 6; *p++ = v + (*dp >> 2); v = ((*dp++ << 25) & 0x7ffffff); v += *dp++ << 17; v += *dp++ << 9; v += *dp++ << 1; *p++ = v + (*dp >> 7); v = ((*dp++ << 20) & 0x7ffffff); v += *dp++ << 12; v += *dp++ << 4; *p++ = v + (*dp >> 4); 
v = ((*dp++ << 23) & 0x7ffffff); v += *dp++ << 15; v += *dp++ << 7; *p++ = v + (*dp >> 1); v = ((*dp++ << 26) & 0x7ffffff); v += *dp++ << 18; v += *dp++ << 10; v += *dp++ << 2; *p++ = v + (*dp >> 6); v = ((*dp++ << 21) & 0x7ffffff); v += *dp++ << 13; v += *dp++ << 5; *p++ = v + (*dp >> 3); v = ((*dp++ << 24) & 0x7ffffff); v += *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; return dp; } static uint8_t * pack_28(uint32_t *p, uint8_t *rp) { uint8_t v; *rp++ = (*p >> 20); *rp++ = (*p >> 12); *rp++ = (*p >> 4); v = *p++ << 4; *rp++ = v + (*p >> 24); *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; *rp++ = (*p >> 20); *rp++ = (*p >> 12); *rp++ = (*p >> 4); v = *p++ << 4; *rp++ = v + (*p >> 24); *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; *rp++ = (*p >> 20); *rp++ = (*p >> 12); *rp++ = (*p >> 4); v = *p++ << 4; *rp++ = v + (*p >> 24); *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; *rp++ = (*p >> 20); *rp++ = (*p >> 12); *rp++ = (*p >> 4); v = *p++ << 4; *rp++ = v + (*p >> 24); *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; return rp; } static uint8_t * unpack_28(uint32_t *p, uint8_t *dp) { uint32_t v; v = *dp++ << 20; v += *dp++ << 12; v += *dp++ << 4; *p++ = v + (*dp >> 4); v = ((*dp++ << 24) & 0xfffffff); v += *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; v = *dp++ << 20; v += *dp++ << 12; v += *dp++ << 4; *p++ = v + (*dp >> 4); v = ((*dp++ << 24) & 0xfffffff); v += *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; v = *dp++ << 20; v += *dp++ << 12; v += *dp++ << 4; *p++ = v + (*dp >> 4); v = ((*dp++ << 24) & 0xfffffff); v += *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; v = *dp++ << 20; v += *dp++ << 12; v += *dp++ << 4; *p++ = v + (*dp >> 4); v = ((*dp++ << 24) & 0xfffffff); v += *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; return dp; } static uint8_t * pack_29(uint32_t *p, uint8_t *rp) { uint8_t v; *rp++ = (*p >> 21); *rp++ = (*p >> 13); *rp++ = (*p >> 5); v = *p++ << 3; *rp++ = v + (*p >> 26); *rp++ = (*p >> 18); *rp++ = (*p >> 10); 
*rp++ = (*p >> 2); v = *p++ << 6; *rp++ = v + (*p >> 23); *rp++ = (*p >> 15); *rp++ = (*p >> 7); v = *p++ << 1; *rp++ = v + (*p >> 28); *rp++ = (*p >> 20); *rp++ = (*p >> 12); *rp++ = (*p >> 4); v = *p++ << 4; *rp++ = v + (*p >> 25); *rp++ = (*p >> 17); *rp++ = (*p >> 9); *rp++ = (*p >> 1); v = *p++ << 7; *rp++ = v + (*p >> 22); *rp++ = (*p >> 14); *rp++ = (*p >> 6); v = *p++ << 2; *rp++ = v + (*p >> 27); *rp++ = (*p >> 19); *rp++ = (*p >> 11); *rp++ = (*p >> 3); v = *p++ << 5; *rp++ = v + (*p >> 24); *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; return rp; } static uint8_t * unpack_29(uint32_t *p, uint8_t *dp) { uint32_t v; v = *dp++ << 21; v += *dp++ << 13; v += *dp++ << 5; *p++ = v + (*dp >> 3); v = ((*dp++ << 26) & 0x1fffffff); v += *dp++ << 18; v += *dp++ << 10; v += *dp++ << 2; *p++ = v + (*dp >> 6); v = ((*dp++ << 23) & 0x1fffffff); v += *dp++ << 15; v += *dp++ << 7; *p++ = v + (*dp >> 1); v = ((*dp++ << 28) & 0x1fffffff); v += *dp++ << 20; v += *dp++ << 12; v += *dp++ << 4; *p++ = v + (*dp >> 4); v = ((*dp++ << 25) & 0x1fffffff); v += *dp++ << 17; v += *dp++ << 9; v += *dp++ << 1; *p++ = v + (*dp >> 7); v = ((*dp++ << 22) & 0x1fffffff); v += *dp++ << 14; v += *dp++ << 6; *p++ = v + (*dp >> 2); v = ((*dp++ << 27) & 0x1fffffff); v += *dp++ << 19; v += *dp++ << 11; v += *dp++ << 3; *p++ = v + (*dp >> 5); v = ((*dp++ << 24) & 0x1fffffff); v += *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; return dp; } static uint8_t * pack_30(uint32_t *p, uint8_t *rp) { uint8_t v; *rp++ = (*p >> 22); *rp++ = (*p >> 14); *rp++ = (*p >> 6); v = *p++ << 2; *rp++ = v + (*p >> 28); *rp++ = (*p >> 20); *rp++ = (*p >> 12); *rp++ = (*p >> 4); v = *p++ << 4; *rp++ = v + (*p >> 26); *rp++ = (*p >> 18); *rp++ = (*p >> 10); *rp++ = (*p >> 2); v = *p++ << 6; *rp++ = v + (*p >> 24); *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; *rp++ = (*p >> 22); *rp++ = (*p >> 14); *rp++ = (*p >> 6); v = *p++ << 2; *rp++ = v + (*p >> 28); *rp++ = (*p >> 20); *rp++ = (*p >> 12); *rp++ = (*p 
>> 4); v = *p++ << 4; *rp++ = v + (*p >> 26); *rp++ = (*p >> 18); *rp++ = (*p >> 10); *rp++ = (*p >> 2); v = *p++ << 6; *rp++ = v + (*p >> 24); *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; return rp; } static uint8_t * unpack_30(uint32_t *p, uint8_t *dp) { uint32_t v; v = *dp++ << 22; v += *dp++ << 14; v += *dp++ << 6; *p++ = v + (*dp >> 2); v = ((*dp++ << 28) & 0x3fffffff); v += *dp++ << 20; v += *dp++ << 12; v += *dp++ << 4; *p++ = v + (*dp >> 4); v = ((*dp++ << 26) & 0x3fffffff); v += *dp++ << 18; v += *dp++ << 10; v += *dp++ << 2; *p++ = v + (*dp >> 6); v = ((*dp++ << 24) & 0x3fffffff); v += *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; v = *dp++ << 22; v += *dp++ << 14; v += *dp++ << 6; *p++ = v + (*dp >> 2); v = ((*dp++ << 28) & 0x3fffffff); v += *dp++ << 20; v += *dp++ << 12; v += *dp++ << 4; *p++ = v + (*dp >> 4); v = ((*dp++ << 26) & 0x3fffffff); v += *dp++ << 18; v += *dp++ << 10; v += *dp++ << 2; *p++ = v + (*dp >> 6); v = ((*dp++ << 24) & 0x3fffffff); v += *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; return dp; } static uint8_t * pack_31(uint32_t *p, uint8_t *rp) { uint8_t v; *rp++ = (*p >> 23); *rp++ = (*p >> 15); *rp++ = (*p >> 7); v = *p++ << 1; *rp++ = v + (*p >> 30); *rp++ = (*p >> 22); *rp++ = (*p >> 14); *rp++ = (*p >> 6); v = *p++ << 2; *rp++ = v + (*p >> 29); *rp++ = (*p >> 21); *rp++ = (*p >> 13); *rp++ = (*p >> 5); v = *p++ << 3; *rp++ = v + (*p >> 28); *rp++ = (*p >> 20); *rp++ = (*p >> 12); *rp++ = (*p >> 4); v = *p++ << 4; *rp++ = v + (*p >> 27); *rp++ = (*p >> 19); *rp++ = (*p >> 11); *rp++ = (*p >> 3); v = *p++ << 5; *rp++ = v + (*p >> 26); *rp++ = (*p >> 18); *rp++ = (*p >> 10); *rp++ = (*p >> 2); v = *p++ << 6; *rp++ = v + (*p >> 25); *rp++ = (*p >> 17); *rp++ = (*p >> 9); *rp++ = (*p >> 1); v = *p++ << 7; *rp++ = v + (*p >> 24); *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; return rp; } static uint8_t * unpack_31(uint32_t *p, uint8_t *dp) { uint32_t v; v = *dp++ << 23; v += *dp++ << 15; v += *dp++ << 7; *p++ = v + 
(*dp >> 1); v = ((*dp++ << 30) & 0x7fffffff); v += *dp++ << 22; v += *dp++ << 14; v += *dp++ << 6; *p++ = v + (*dp >> 2); v = ((*dp++ << 29) & 0x7fffffff); v += *dp++ << 21; v += *dp++ << 13; v += *dp++ << 5; *p++ = v + (*dp >> 3); v = ((*dp++ << 28) & 0x7fffffff); v += *dp++ << 20; v += *dp++ << 12; v += *dp++ << 4; *p++ = v + (*dp >> 4); v = ((*dp++ << 27) & 0x7fffffff); v += *dp++ << 19; v += *dp++ << 11; v += *dp++ << 3; *p++ = v + (*dp >> 5); v = ((*dp++ << 26) & 0x7fffffff); v += *dp++ << 18; v += *dp++ << 10; v += *dp++ << 2; *p++ = v + (*dp >> 6); v = ((*dp++ << 25) & 0x7fffffff); v += *dp++ << 17; v += *dp++ << 9; v += *dp++ << 1; *p++ = v + (*dp >> 7); v = ((*dp++ << 24) & 0x7fffffff); v += *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; return dp; } static uint8_t * pack_32(uint32_t *p, uint8_t *rp) { *rp++ = (*p >> 24); *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; *rp++ = (*p >> 24); *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; *rp++ = (*p >> 24); *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; *rp++ = (*p >> 24); *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; *rp++ = (*p >> 24); *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; *rp++ = (*p >> 24); *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; *rp++ = (*p >> 24); *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; *rp++ = (*p >> 24); *rp++ = (*p >> 16); *rp++ = (*p >> 8); *rp++ = *p++; return rp; } static uint8_t * unpack_32(uint32_t *p, uint8_t *dp) { uint32_t v; v = *dp++ << 24; v += *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; v = *dp++ << 24; v += *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; v = *dp++ << 24; v += *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; v = *dp++ << 24; v += *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; v = *dp++ << 24; v += *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; v = *dp++ << 24; v += *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; v = *dp++ << 24; v += *dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; v = *dp++ << 24; v += 
*dp++ << 16; v += *dp++ << 8; *p++ = v + *dp++; return dp; } /* */ static uint8_t * pack_(uint32_t *p, uint32_t i, int w, uint8_t *rp) { while (i >= 8) { switch (w) { case 0 : break; case 1 : rp = pack_1(p, rp); break; case 2 : rp = pack_2(p, rp); break; case 3 : rp = pack_3(p, rp); break; case 4 : rp = pack_4(p, rp); break; case 5 : rp = pack_5(p, rp); break; case 6 : rp = pack_6(p, rp); break; case 7 : rp = pack_7(p, rp); break; case 8 : rp = pack_8(p, rp); break; case 9 : rp = pack_9(p, rp); break; case 10 : rp = pack_10(p, rp); break; case 11 : rp = pack_11(p, rp); break; case 12 : rp = pack_12(p, rp); break; case 13 : rp = pack_13(p, rp); break; case 14 : rp = pack_14(p, rp); break; case 15 : rp = pack_15(p, rp); break; case 16 : rp = pack_16(p, rp); break; case 17 : rp = pack_17(p, rp); break; case 18 : rp = pack_18(p, rp); break; case 19 : rp = pack_19(p, rp); break; case 20 : rp = pack_20(p, rp); break; case 21 : rp = pack_21(p, rp); break; case 22 : rp = pack_22(p, rp); break; case 23 : rp = pack_23(p, rp); break; case 24 : rp = pack_24(p, rp); break; case 25 : rp = pack_25(p, rp); break; case 26 : rp = pack_26(p, rp); break; case 27 : rp = pack_27(p, rp); break; case 28 : rp = pack_28(p, rp); break; case 29 : rp = pack_29(p, rp); break; case 30 : rp = pack_30(p, rp); break; case 31 : rp = pack_31(p, rp); break; case 32 : rp = pack_32(p, rp); break; } p += 8; i -= 8; } { int b; uint8_t v; uint32_t *pe = p + i; for (b = 8 - w, v = 0; p < pe;) { if (b > 0) { v += *p++ << b; b -= w; } else if (b < 0) { *rp++ = v + (*p >> -b); b += 8; v = 0; } else { *rp++ = v + *p++; b = 8 - w; v = 0; } } if (b + w != 8) { *rp++ = v; } return rp; } } static uint8_t * pack(uint32_t *p, uint32_t i, uint8_t *freq, uint8_t *rp) { int32_t k, w; uint8_t ebuf[UNIT_SIZE], *ep = ebuf; uint32_t s, *pe = p + i, r, th = i - (i >> 3); for (w = 0, s = 0; w <= 32; w++) { if ((s += freq[w]) >= th) { break; } } if (i == s) { *rp++ = w; return pack_(p, i, w, rp); } r = 1 << w; *rp++ = w + 
0x80; *rp++ = i - s; if (r >= UNIT_SIZE) { uint32_t first, *last = &first; for (k = 0; p < pe; p++, k++) { if (*p >= r) { GRN_B_ENC(*p - r, ep); *last = k; last = p; } } *last = 0; *rp++ = (uint8_t) first; } else { for (k = 0; p < pe; p++, k++) { if (*p >= r) { *ep++ = k; GRN_B_ENC(*p - r, ep); *p = 0; } } } rp = pack_(p - i, i, w, rp); grn_memcpy(rp, ebuf, ep - ebuf); return rp + (ep - ebuf); } int grn_p_enc(grn_ctx *ctx, uint32_t *data, uint32_t data_size, uint8_t **res) { uint8_t *rp, freq[33]; uint32_t j, *dp, *dpe, d, w, buf[UNIT_SIZE]; *res = rp = GRN_MALLOC(data_size * sizeof(uint32_t) * 2); GRN_B_ENC(data_size, rp); memset(freq, 0, 33); for (j = 0, dp = data, dpe = dp + data_size; dp < dpe; j++, dp++) { if (j == UNIT_SIZE) { rp = pack(buf, j, freq, rp); memset(freq, 0, 33); j = 0; } if ((d = buf[j] = *dp)) { GRN_BIT_SCAN_REV(d, w); freq[w + 1]++; } else { freq[0]++; } } if (j) { rp = pack(buf, j, freq, rp); } return rp - *res; } #define USE_P_ENC (1<<0) /* Use PForDelta */ #define CUT_OFF (1<<1) /* Deprecated */ #define ODD (1<<2) /* Variable size data */ typedef struct { uint32_t *data; uint32_t data_size; uint32_t flags; } datavec; static grn_rc datavec_reset(grn_ctx *ctx, datavec *dv, uint32_t dvlen, size_t unitsize, size_t totalsize) { uint32_t i; if (!dv[0].data || dv[dvlen].data < dv[0].data + totalsize) { if (dv[0].data) { GRN_FREE(dv[0].data); } if (!(dv[0].data = GRN_MALLOC(totalsize * sizeof(uint32_t)))) { MERR("[ii][data-vector][reset] failed to allocate data: " "length:<%u>, " "unit-size:<%" GRN_FMT_SIZE ">, " "total-size:<%" GRN_FMT_SIZE ">", dvlen, unitsize, totalsize); return ctx->rc; } dv[dvlen].data = dv[0].data + totalsize; } for (i = 1; i < dvlen; i++) { dv[i].data = dv[i - 1].data + unitsize; } return GRN_SUCCESS; } static grn_rc datavec_init(grn_ctx *ctx, datavec *dv, uint32_t dvlen, size_t unitsize, size_t totalsize) { uint32_t i; if (!totalsize) { memset(dv, 0, sizeof(datavec) * (dvlen + 1)); return GRN_SUCCESS; } if (!(dv[0].data = 
GRN_MALLOC(totalsize * sizeof(uint32_t)))) { MERR("[ii][data-vector][init] failed to allocate data: " "length:<%u>, " "unit-size:<%" GRN_FMT_SIZE ">, " "total-size:<%" GRN_FMT_SIZE ">", dvlen, unitsize, totalsize); return ctx->rc; } dv[dvlen].data = dv[0].data + totalsize; for (i = 1; i < dvlen; i++) { dv[i].data = dv[i - 1].data + unitsize; } return GRN_SUCCESS; } static void datavec_fin(grn_ctx *ctx, datavec *dv) { if (dv[0].data) { GRN_FREE(dv[0].data); } } size_t grn_p_encv(grn_ctx *ctx, datavec *dv, uint32_t dvlen, uint8_t *res) { uint8_t *rp = res, freq[33]; uint32_t pgap, usep, l, df, data_size, *dp, *dpe; if (!dvlen || !(df = dv[0].data_size)) { return 0; } for (usep = 0, data_size = 0, l = 0; l < dvlen; l++) { uint32_t dl = dv[l].data_size; if (dl < df || ((dl > df) && (l != dvlen - 1))) { /* invalid argument */ return 0; } usep += (dv[l].flags & USE_P_ENC) << l; data_size += dl; } pgap = data_size - df * dvlen; if (!usep) { GRN_B_ENC((df << 1) + 1, rp); for (l = 0; l < dvlen; l++) { for (dp = dv[l].data, dpe = dp + dv[l].data_size; dp < dpe; dp++) { GRN_B_ENC(*dp, rp); } } } else { uint32_t buf[UNIT_SIZE]; GRN_B_ENC((usep << 1), rp); GRN_B_ENC(df, rp); if (dv[dvlen - 1].flags & ODD) { GRN_B_ENC(pgap, rp); } else { GRN_ASSERT(!pgap); } for (l = 0; l < dvlen; l++) { dp = dv[l].data; dpe = dp + dv[l].data_size; if ((dv[l].flags & USE_P_ENC)) { uint32_t j = 0, d; memset(freq, 0, 33); while (dp < dpe) { if (j == UNIT_SIZE) { rp = pack(buf, j, freq, rp); memset(freq, 0, 33); j = 0; } if ((d = buf[j++] = *dp++)) { uint32_t w; GRN_BIT_SCAN_REV(d, w); freq[w + 1]++; } else { freq[0]++; } } if (j) { rp = pack(buf, j, freq, rp); } } else { while (dp < dpe) { GRN_B_ENC(*dp++, rp); } } } } return rp - res; } #define GRN_B_DEC_CHECK(v,p,pe) do { \ uint8_t *_p = (uint8_t *)p; \ uint32_t _v; \ if (_p >= pe) { return 0; } \ _v = *_p++; \ switch (_v >> 4) { \ case 0x08 : \ if (_v == 0x8f) { \ if (_p + sizeof(uint32_t) > pe) { return 0; } \ grn_memcpy(&_v, _p, 
sizeof(uint32_t)); \ _p += sizeof(uint32_t); \ } \ break; \ case 0x09 : \ if (_p + 3 > pe) { return 0; } \ _v = (_v - 0x90) * 0x100 + *_p++; \ _v = _v * 0x100 + *_p++; \ _v = _v * 0x100 + *_p++ + 0x20408f; \ break; \ case 0x0a : \ case 0x0b : \ if (_p + 2 > pe) { return 0; } \ _v = (_v - 0xa0) * 0x100 + *_p++; \ _v = _v * 0x100 + *_p++ + 0x408f; \ break; \ case 0x0c : \ case 0x0d : \ case 0x0e : \ case 0x0f : \ if (_p + 1 > pe) { return 0; } \ _v = (_v - 0xc0) * 0x100 + *_p++ + 0x8f; \ break; \ } \ v = _v; \ p = _p; \ } while (0) static uint8_t * unpack(uint8_t *dp, uint8_t *dpe, int i, uint32_t *rp) { uint8_t ne = 0, k = 0, w = *dp++; uint32_t m, *p = rp; if (w & 0x80) { ne = *dp++; w -= 0x80; m = (1 << w) - 1; if (m >= UNIT_MASK) { k = *dp++; } } else { m = (1 << w) - 1; } if (w) { while (i >= 8) { if (dp + w > dpe) { return NULL; } switch (w) { case 1 : dp = unpack_1(p, dp); break; case 2 : dp = unpack_2(p, dp); break; case 3 : dp = unpack_3(p, dp); break; case 4 : dp = unpack_4(p, dp); break; case 5 : dp = unpack_5(p, dp); break; case 6 : dp = unpack_6(p, dp); break; case 7 : dp = unpack_7(p, dp); break; case 8 : dp = unpack_8(p, dp); break; case 9 : dp = unpack_9(p, dp); break; case 10 : dp = unpack_10(p, dp); break; case 11 : dp = unpack_11(p, dp); break; case 12 : dp = unpack_12(p, dp); break; case 13 : dp = unpack_13(p, dp); break; case 14 : dp = unpack_14(p, dp); break; case 15 : dp = unpack_15(p, dp); break; case 16 : dp = unpack_16(p, dp); break; case 17 : dp = unpack_17(p, dp); break; case 18 : dp = unpack_18(p, dp); break; case 19 : dp = unpack_19(p, dp); break; case 20 : dp = unpack_20(p, dp); break; case 21 : dp = unpack_21(p, dp); break; case 22 : dp = unpack_22(p, dp); break; case 23 : dp = unpack_23(p, dp); break; case 24 : dp = unpack_24(p, dp); break; case 25 : dp = unpack_25(p, dp); break; case 26 : dp = unpack_26(p, dp); break; case 27 : dp = unpack_27(p, dp); break; case 28 : dp = unpack_28(p, dp); break; case 29 : dp = unpack_29(p, dp); 
break; case 30 : dp = unpack_30(p, dp); break; case 31 : dp = unpack_31(p, dp); break; case 32 : dp = unpack_32(p, dp); break; } i -= 8; p += 8; } { int b; uint32_t v, *pe; for (b = 8 - w, v = 0, pe = p + i; p < pe && dp < dpe;) { if (b > 0) { *p++ = v + ((*dp >> b) & m); b -= w; v = 0; } else if (b < 0) { v += (*dp++ << -b) & m; b += 8; } else { *p++ = v + (*dp++ & m); b = 8 - w; v = 0; } } if (b + w != 8) { dp++; } } } else { memset(p, 0, sizeof(uint32_t) * i); } if (ne) { if (m >= UNIT_MASK) { uint32_t *pp; while (ne--) { pp = &rp[k]; k = *pp; GRN_B_DEC_CHECK(*pp, dp, dpe); *pp += (m + 1); } } else { while (ne--) { k = *dp++; GRN_B_DEC_CHECK(rp[k], dp, dpe); rp[k] += (m + 1); } } } return dp; } int grn_p_dec(grn_ctx *ctx, uint8_t *data, uint32_t data_size, uint32_t nreq, uint32_t **res) { uint8_t *dp = data, *dpe = data + data_size; uint32_t rest, orig_size, *rp, *rpe; GRN_B_DEC(orig_size, dp); if (!orig_size) { if (!nreq || nreq > data_size) { nreq = data_size; } if ((*res = rp = GRN_MALLOC(nreq * 4))) { for (rpe = rp + nreq; dp < data + data_size && rp < rpe; rp++) { GRN_B_DEC(*rp, dp); } } return rp - *res; } else { if (!(*res = rp = GRN_MALLOC(orig_size * sizeof(uint32_t)))) { return 0; } if (!nreq || nreq > orig_size) { nreq = orig_size; } for (rest = nreq; rest >= UNIT_SIZE; rest -= UNIT_SIZE) { if (!(dp = unpack(dp, dpe, UNIT_SIZE, rp))) { return 0; } rp += UNIT_SIZE; } if (rest) { if (!(dp = unpack(dp, dpe, rest, rp))) { return 0; } } GRN_ASSERT(data + data_size == dp); return nreq; } } int grn_p_decv(grn_ctx *ctx, uint8_t *data, uint32_t data_size, datavec *dv, uint32_t dvlen) { size_t size; uint32_t df, l, i, *rp, nreq; uint8_t *dp = data, *dpe = data + data_size; if (!data_size) { dv[0].data_size = 0; return 0; } for (nreq = 0; nreq < dvlen; nreq++) { if (dv[nreq].flags & CUT_OFF) { break; } } if (!nreq) { return 0; } GRN_B_DEC_CHECK(df, dp, dpe); if ((df & 1)) { df >>= 1; size = nreq == dvlen ? 
data_size : df * nreq; if (dv[dvlen].data < dv[0].data + size) { if (dv[0].data) { GRN_FREE(dv[0].data); } if (!(rp = GRN_MALLOC(size * sizeof(uint32_t)))) { return 0; } dv[dvlen].data = rp + size; } else { rp = dv[0].data; } for (l = 0; l < dvlen; l++) { if (dv[l].flags & CUT_OFF) { break; } dv[l].data = rp; if (l < dvlen - 1) { for (i = 0; i < df; i++, rp++) { GRN_B_DEC_CHECK(*rp, dp, dpe); } } else { for (i = 0; dp < dpe; i++, rp++) { GRN_B_DEC_CHECK(*rp, dp, dpe); } } dv[l].data_size = i; } } else { uint32_t n, rest, usep = df >> 1; GRN_B_DEC_CHECK(df, dp, dpe); if (dv[dvlen -1].flags & ODD) { GRN_B_DEC_CHECK(rest, dp, dpe); } else { rest = 0; } size = df * nreq + (nreq == dvlen ? rest : 0); if (dv[dvlen].data < dv[0].data + size) { if (dv[0].data) { GRN_FREE(dv[0].data); } if (!(rp = GRN_MALLOC(size * sizeof(uint32_t)))) { return 0; } dv[dvlen].data = rp + size; } else { rp = dv[0].data; } for (l = 0; l < dvlen; l++) { if (dv[l].flags & CUT_OFF) { break; } dv[l].data = rp; dv[l].data_size = n = (l < dvlen - 1) ? 
df : df + rest; if (usep & (1 << l)) { for (; n >= UNIT_SIZE; n -= UNIT_SIZE) { if (!(dp = unpack(dp, dpe, UNIT_SIZE, rp))) { return 0; } rp += UNIT_SIZE; } if (n) { if (!(dp = unpack(dp, dpe, n, rp))) { return 0; } rp += n; } dv[l].flags |= USE_P_ENC; } else { for (; n; n--, rp++) { GRN_B_DEC_CHECK(*rp, dp, dpe); } } } GRN_ASSERT(dp == dpe); if (dp != dpe) { GRN_LOG(ctx, GRN_LOG_DEBUG, "data_size=%d, %" GRN_FMT_LLD, data_size, (long long int)(dpe - dp)); } } return rp - dv[0].data; } int grn_b_enc(grn_ctx *ctx, uint32_t *data, uint32_t data_size, uint8_t **res) { uint8_t *rp; uint32_t *dp, i; *res = rp = GRN_MALLOC(data_size * sizeof(uint32_t) * 2); GRN_B_ENC(data_size, rp); for (i = data_size, dp = data; i; i--, dp++) { GRN_B_ENC(*dp, rp); } return rp - *res; } int grn_b_dec(grn_ctx *ctx, uint8_t *data, uint32_t data_size, uint32_t **res) { uint32_t i, *rp, orig_size; uint8_t *dp = data; GRN_B_DEC(orig_size, dp); *res = rp = GRN_MALLOC(orig_size * sizeof(uint32_t)); for (i = orig_size; i; i--, rp++) { GRN_B_DEC(*rp, dp); } return orig_size; } /* buffer */ typedef struct { uint32_t tid; uint32_t size_in_chunk; uint32_t pos_in_chunk; uint16_t size_in_buffer; uint16_t pos_in_buffer; } buffer_term; typedef struct { uint16_t step; uint16_t jump; } buffer_rec; typedef struct { uint32_t chunk; uint32_t chunk_size; uint32_t buffer_free; uint16_t nterms; uint16_t nterms_void; } buffer_header; struct grn_ii_buffer { buffer_header header; buffer_term terms[(S_SEGMENT - sizeof(buffer_header))/sizeof(buffer_term)]; }; typedef struct grn_ii_buffer buffer; inline static uint32_t buffer_open(grn_ctx *ctx, grn_ii *ii, uint32_t pos, buffer_term **bt, buffer **b) { byte *p = NULL; uint16_t lseg = (uint16_t) (LSEG(pos)); uint32_t pseg = ii->header->binfo[lseg]; if (pseg != GRN_II_PSEG_NOT_ASSIGNED) { GRN_IO_SEG_REF(ii->seg, pseg, p); if (!p) { return GRN_II_PSEG_NOT_ASSIGNED; } if (b) { *b = (buffer *)p; } if (bt) { *bt = (buffer_term *)(p + LPOS(pos)); } } return pseg; } inline 
static grn_rc buffer_close(grn_ctx *ctx, grn_ii *ii, uint32_t pseg) { if (pseg >= ii->seg->header->max_segment) { GRN_LOG(ctx, GRN_LOG_NOTICE, "invalid pseg buffer_close(%d)", pseg); return GRN_INVALID_ARGUMENT; } GRN_IO_SEG_UNREF(ii->seg, pseg); return GRN_SUCCESS; } typedef struct { uint32_t rid; uint32_t sid; } docid; #define BUFFER_REC_DEL(r) ((r)->jump = 1) #define BUFFER_REC_DELETED(r) ((r)->jump == 1) #define BUFFER_REC_AT(b,pos) ((buffer_rec *)(b) + (pos)) #define BUFFER_REC_POS(b,rec) ((uint16_t)((rec) - (buffer_rec *)(b))) inline static void buffer_term_dump(grn_ctx *ctx, grn_ii *ii, buffer *b, buffer_term *bt) { int pos, rid, sid; uint8_t *p; buffer_rec *r; if (!grn_logger_pass(ctx, GRN_LOG_DEBUG)) { return; } GRN_LOG(ctx, GRN_LOG_DEBUG, "b=(%x %u %u %u)", b->header.chunk, b->header.chunk_size, b->header.buffer_free, b->header.nterms); GRN_LOG(ctx, GRN_LOG_DEBUG, "bt=(%u %u %u %u %u)", bt->tid, bt->size_in_chunk, bt->pos_in_chunk, bt->size_in_buffer, bt->pos_in_buffer); for (pos = bt->pos_in_buffer; pos; pos = r->step) { r = BUFFER_REC_AT(b, pos); p = GRN_NEXT_ADDR(r); GRN_B_DEC(rid, p); if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { GRN_B_DEC(sid, p); } else { sid = 1; } GRN_LOG(ctx, GRN_LOG_DEBUG, "%d=(%d:%d),(%d:%d)", pos, r->jump, r->step, rid, sid); } } inline static grn_rc check_jump(grn_ctx *ctx, grn_ii *ii, buffer *b, buffer_rec *r, int j) { uint16_t i = BUFFER_REC_POS(b, r); uint8_t *p; buffer_rec *r2; docid id, id2; if (!j) { return GRN_SUCCESS; } p = GRN_NEXT_ADDR(r); GRN_B_DEC(id.rid, p); if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { GRN_B_DEC(id.sid, p); } else { id.sid = 1; } if (j == 1) { GRN_LOG(ctx, GRN_LOG_DEBUG, "deleting! %d(%d:%d)", i, id.rid, id.sid); return GRN_SUCCESS; } r2 = BUFFER_REC_AT(b, j); p = GRN_NEXT_ADDR(r2); GRN_B_DEC(id2.rid, p); if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { GRN_B_DEC(id2.sid, p); } else { id2.sid = 1; } if (r2->step == i) { GRN_LOG(ctx, GRN_LOG_EMERG, "cycle! 
%d(%d:%d)<->%d(%d:%d)", i, id.rid, id.sid, j, id2.rid, id2.sid);
    return GRN_FILE_CORRUPT;
  }
  /* a jump target must hold a strictly larger (rid, sid) than its source */
  if (id2.rid < id.rid || (id2.rid == id.rid && id2.sid <= id.sid)) {
    GRN_LOG(ctx, GRN_LOG_CRIT,
            "invalid jump! %d(%d:%d)(%d:%d)->%d(%d:%d)(%d:%d)",
            i, r->jump, r->step, id.rid, id.sid, j, r2->jump, r2->step,
            id2.rid, id2.sid);
    return GRN_FILE_CORRUPT;
  }
  return GRN_SUCCESS;
}

/*
 * Sets the jump link of `from` to position `to`, then walks the step chain
 * handing each record's previous jump target to the next record. At most
 * 100 records are visited. Every new link is validated by check_jump().
 *
 * Returns GRN_SUCCESS, or an error when check_jump() rejects a link or the
 * step chain ends while links remain to be placed.
 */
inline static grn_rc
set_jump_r(grn_ctx *ctx, grn_ii *ii, buffer *b, buffer_rec *from, int to)
{
  int i, j, max_jump = 100;
  buffer_rec *r, *r2;
  for (r = from, j = to; j > 1 && max_jump--; r = BUFFER_REC_AT(b, r->step)) {
    r2 = BUFFER_REC_AT(b, j);
    if (r == r2) { break; }
    if (BUFFER_REC_DELETED(r2)) { break; }
    /* i keeps the old jump target so it can be pushed to the next record */
    if (j == (i = r->jump)) { break; }
    if (j == r->step) { break; }
    if (check_jump(ctx, ii, b, r, j)) {
      ERR(GRN_FILE_CORRUPT, "check_jump failed");
      return ctx->rc;
    }
    r->jump = j;
    j = i;
    if (!r->step) { return GRN_FILE_CORRUPT; }
  }
  return GRN_SUCCESS;
}

/* Stores the number of set bits of the 32-bit value x in n. */
#define GET_NUM_BITS(x,n) do {\
  n = x;\
  n = (n & 0x55555555) + ((n >> 1) & 0x55555555);\
  n = (n & 0x33333333) + ((n >> 2) & 0x33333333);\
  n = (n & 0x0F0F0F0F) + ((n >> 4) & 0x0F0F0F0F);\
  n = (n & 0x00FF00FF) + ((n >> 8) & 0x00FF00FF);\
  n = (n & 0x0000FFFF) + ((n >>16) & 0x0000FFFF);\
} while (0)

/*
 * Links the record `rnew` (payload copied from `bs`) into the list of term
 * `bt`, which is kept ordered by (rid, sid). Existing records matching the
 * update spec `u` are marked deleted; jump links are refreshed on the way.
 * `size` is the record size including the buffer_rec header.
 */
inline static grn_rc
buffer_put(grn_ctx *ctx, grn_ii *ii, buffer *b, buffer_term *bt,
           buffer_rec *rnew, uint8_t *bs, grn_ii_updspec *u, int size)
{
  uint8_t *p;
  docid id_curr = {0, 0}, id_start = {0, 0}, id_post = {0, 0};
  buffer_rec *r_curr, *r_start = NULL;
  uint16_t last = 0, *lastp = &bt->pos_in_buffer, pos = BUFFER_REC_POS(b, rnew);
  int vdelta = 0, delta, delta0 = 0, vhops = 0, nhops = 0, reset = 1;
  grn_memcpy(GRN_NEXT_ADDR(rnew), bs, size - sizeof(buffer_rec));
  for (;;) {
    if (!*lastp) {
      /* reached the end of the list: append rnew as the new tail */
      rnew->step = 0;
      rnew->jump = 0;
      // smb_wmb();
      *lastp = pos;
      if (bt->size_in_buffer++ > 1) {
        buffer_rec *rhead = BUFFER_REC_AT(b, bt->pos_in_buffer);
        rhead->jump = pos;
        if (!(bt->size_in_buffer & 1)) {
          int n;
          buffer_rec *r = BUFFER_REC_AT(b, rhead->step), *r2;
GET_NUM_BITS(bt->size_in_buffer, n);
          /* follow up to n live jump links starting at the second record */
          while (n-- && (r->jump > 1)) {
            r2 = BUFFER_REC_AT(b, r->jump);
            if (BUFFER_REC_DELETED(r2)) { break; }
            r = r2;
          }
          if (r != rnew) {
            set_jump_r(ctx, ii, b, r, last);
          }
        }
      }
      break;
    }
    r_curr = BUFFER_REC_AT(b, *lastp);
    p = GRN_NEXT_ADDR(r_curr);
    GRN_B_DEC(id_curr.rid, p);
    if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {
      GRN_B_DEC(id_curr.sid, p);
    } else {
      id_curr.sid = 1;
    }
    /* ids must not decrease along the list; otherwise the list is broken.
     * The term's list is then reset to empty and the scan restarts. */
    if (id_curr.rid < id_post.rid ||
        (id_curr.rid == id_post.rid && id_curr.sid < id_post.sid)) {
      {
        DEFINE_NAME(ii);
        CRIT(GRN_FILE_CORRUPT,
             "[ii][buffer][put] loop is found: "
             "<%.*s>: "
             "(%d:%d)->(%d:%d)",
             name_size, name,
             id_post.rid, id_post.sid, id_curr.rid, id_curr.sid);
      }
      buffer_term_dump(ctx, ii, b, bt);
      bt->pos_in_buffer = 0;
      bt->size_in_buffer = 0;
      lastp = &bt->pos_in_buffer;
      continue;
    }
    id_post.rid = id_curr.rid;
    id_post.sid = id_curr.sid;
    /* insertion point: the current record is not smaller than u */
    if (u->rid < id_curr.rid ||
        (u->rid == id_curr.rid && u->sid <= id_curr.sid)) {
      uint16_t step = *lastp, jump = r_curr->jump;
      if (u->rid == id_curr.rid) {
        if (u->sid == 0) {
          /* sid 0: mark every record of this rid as deleted */
          while (id_curr.rid == u->rid) {
            BUFFER_REC_DEL(r_curr);
            if (!(step = r_curr->step)) { break; }
            r_curr = BUFFER_REC_AT(b, step);
            p = GRN_NEXT_ADDR(r_curr);
            GRN_B_DEC(id_curr.rid, p);
            if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {
              GRN_B_DEC(id_curr.sid, p);
            } else {
              id_curr.sid = 1;
            }
          }
        } else if (u->sid == id_curr.sid) {
          /* same posting: the old record is replaced by rnew */
          BUFFER_REC_DEL(r_curr);
          step = r_curr->step;
        }
      }
      rnew->step = step;
      rnew->jump = check_jump(ctx, ii, b, rnew, jump) ?
0 : jump; // smb_wmb(); *lastp = pos; break; } if (reset) { r_start = r_curr; id_start.rid = id_curr.rid; id_start.sid = id_curr.sid; if (!(delta0 = u->rid - id_start.rid)) { delta0 = u->sid - id_start.sid; } nhops = 0; vhops = 1; vdelta = delta0 >> 1; } else { if (!(delta = id_curr.rid - id_start.rid)) { delta = id_curr.sid - id_start.sid; } if (vdelta < delta) { vdelta += (delta0 >> ++vhops); r_start = r_curr; } if (nhops > vhops) { set_jump_r(ctx, ii, b, r_start, *lastp); } else { nhops++; } } last = *lastp; lastp = &r_curr->step; reset = 0; { uint16_t posj = r_curr->jump; if (posj > 1) { buffer_rec *rj = BUFFER_REC_AT(b, posj); if (!BUFFER_REC_DELETED(rj)) { docid idj; p = GRN_NEXT_ADDR(rj); GRN_B_DEC(idj.rid, p); if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { GRN_B_DEC(idj.sid, p); } else { idj.sid = 1; } if (idj.rid < u->rid || (idj.rid == u->rid && idj.sid < u->sid)) { last = posj; lastp = &rj->step; } else { reset = 1; } } } } } return ctx->rc; } /* array */ inline static uint32_t * array_at(grn_ctx *ctx, grn_ii *ii, uint32_t id) { byte *p = NULL; uint32_t seg, pseg; if (id > GRN_ID_MAX) { return NULL; } seg = id >> W_ARRAY; if ((pseg = ii->header->ainfo[seg]) == GRN_II_PSEG_NOT_ASSIGNED) { return NULL; } GRN_IO_SEG_REF(ii->seg, pseg, p); if (!p) { return NULL; } return (uint32_t *)(p + (id & ARRAY_MASK_IN_A_SEGMENT) * S_ARRAY_ELEMENT); } inline static uint32_t * array_get(grn_ctx *ctx, grn_ii *ii, uint32_t id) { byte *p = NULL; uint16_t seg; uint32_t pseg; if (id > GRN_ID_MAX) { return NULL; } seg = id >> W_ARRAY; if ((pseg = ii->header->ainfo[seg]) == GRN_II_PSEG_NOT_ASSIGNED) { if (segment_get_clear(ctx, ii, &pseg)) { return NULL; } ii->header->ainfo[seg] = pseg; if (seg >= ii->header->amax) { ii->header->amax = seg + 1; } } GRN_IO_SEG_REF(ii->seg, pseg, p); if (!p) { return NULL; } return (uint32_t *)(p + (id & ARRAY_MASK_IN_A_SEGMENT) * S_ARRAY_ELEMENT); } inline static void array_unref(grn_ii *ii, uint32_t id) { GRN_IO_SEG_UNREF(ii->seg, 
ii->header->ainfo[id >> W_ARRAY]); } /* updspec */ grn_ii_updspec * grn_ii_updspec_open(grn_ctx *ctx, uint32_t rid, uint32_t sid) { grn_ii_updspec *u; if (!(u = GRN_MALLOC(sizeof(grn_ii_updspec)))) { return NULL; } u->rid = rid; u->sid = sid; u->weight = 0; u->tf = 0; u->atf = 0; u->pos = NULL; u->tail = NULL; // u->vnodes = NULL; return u; } #define GRN_II_MAX_TF 0x1ffff grn_rc grn_ii_updspec_add(grn_ctx *ctx, grn_ii_updspec *u, int pos, int32_t weight) { struct _grn_ii_pos *p; u->atf++; if (u->tf >= GRN_II_MAX_TF) { return GRN_SUCCESS; } if (!(p = GRN_MALLOC(sizeof(struct _grn_ii_pos)))) { return GRN_NO_MEMORY_AVAILABLE; } u->weight += weight; p->pos = pos; p->next = NULL; if (u->tail) { u->tail->next = p; } else { u->pos = p; } u->tail = p; u->tf++; return GRN_SUCCESS; } int grn_ii_updspec_cmp(grn_ii_updspec *a, grn_ii_updspec *b) { struct _grn_ii_pos *pa, *pb; if (a->rid != b->rid) { return a->rid - b->rid; } if (a->sid != b->sid) { return a->sid - b->sid; } if (a->weight != b->weight) { return a->weight - b->weight; } if (a->tf != b->tf) { return a->tf - b->tf; } for (pa = a->pos, pb = b->pos; pa && pb; pa = pa->next, pb = pb->next) { if (pa->pos != pb->pos) { return pa->pos - pb->pos; } } if (pa) { return 1; } if (pb) { return -1; } return 0; } grn_rc grn_ii_updspec_close(grn_ctx *ctx, grn_ii_updspec *u) { struct _grn_ii_pos *p = u->pos, *q; while (p) { q = p->next; GRN_FREE(p); p = q; } GRN_FREE(u); return GRN_SUCCESS; } inline static uint8_t * encode_rec(grn_ctx *ctx, grn_ii *ii, grn_ii_updspec *u, unsigned int *size, int deletep) { uint8_t *br, *p; struct _grn_ii_pos *pp; uint32_t lpos, tf, weight; if (deletep) { tf = 0; weight = 0; } else { tf = u->tf; weight = u->weight; } if (!(br = GRN_MALLOC((tf + 4) * 5))) { return NULL; } p = br; GRN_B_ENC(u->rid, p); if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { GRN_B_ENC(u->sid, p); } else { u->sid = 1; } GRN_B_ENC(tf, p); if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { GRN_B_ENC(weight, p); } if 
((ii->header->flags & GRN_OBJ_WITH_POSITION)) {
    /* each position is stored as the delta from the previous one */
    for (lpos = 0, pp = u->pos; pp && tf--; lpos = pp->pos, pp = pp->next) {
      GRN_B_ENC(pp->pos - lpos, p);
    }
  }
  /* zero-pad until the write pointer is 4-byte aligned */
  while (((intptr_t)p & 0x03)) { *p++ = 0; }
  /* reported size includes the buffer_rec header that precedes the payload */
  *size = (unsigned int) ((p - br) + sizeof(buffer_rec));
  return br;
}

/* Argument bundle handed to lexicon_deletable() through the delete optarg. */
typedef struct {
  grn_ii *ii;
  grn_hash *h;
} lexicon_deletable_arg;

#ifdef CASCADE_DELETE_LEXICON
/*
 * Callback deciding whether term `tid` may be removed from the lexicon.
 * Returns 0 (keep) when no hash is given or the array element a[0] of the
 * term is non-zero; otherwise the decision depends on the pending update
 * spec stored for the term in the hash.
 */
static int
lexicon_deletable(grn_ctx *ctx, grn_obj *lexicon, grn_id tid, void *arg)
{
  uint32_t *a;
  grn_hash *h = ((lexicon_deletable_arg *)arg)->h;
  grn_ii *ii = ((lexicon_deletable_arg *)arg)->ii;
  if (!h) { return 0; }
  if ((a = array_at(ctx, ii, tid))) {
    if (a[0]) {
      array_unref(ii, tid);
      return 0;
    }
    array_unref(ii, tid);
  }
  {
    grn_ii_updspec **u;
    if (!grn_hash_get(ctx, h, &tid, sizeof(grn_id), (void **) &u)) {
      return (ERRP(ctx, GRN_ERROR)) ? 0 : 1;
    }
    if (!(*u)->tf || !(*u)->sid) { return 1; }
    return 0;
  }
}
#endif /* CASCADE_DELETE_LEXICON */

/*
 * Removes term `tid` from the lexicon if lexicon_deletable() allows it.
 * Compiles to a no-op unless CASCADE_DELETE_LEXICON is defined.
 */
inline static void
lexicon_delete(grn_ctx *ctx, grn_ii *ii, uint32_t tid, grn_hash *h)
{
#ifdef CASCADE_DELETE_LEXICON
  lexicon_deletable_arg arg = {ii, h};
  grn_table_delete_optarg optarg = {0, lexicon_deletable, &arg};
  _grn_table_delete_by_id(ctx, ii->lexicon, tid, &optarg);
#endif /* CASCADE_DELETE_LEXICON */
}

/* Decoded posting used as a cursor while merging. */
typedef struct {
  grn_id rid;
  uint32_t sid;
  uint32_t tf;
  uint32_t weight;
  uint32_t flags;
} docinfo;

/*
 * Advances the chunk cursor `cid` to the next posting decoded from the
 * srp/ssp/stp/sop/snp arrays; sets cid.rid to 0 when `sdf` postings are
 * exhausted. Relies on those locals being declared in the caller.
 */
#define GETNEXTC() do {\
  if (sdf) {\
    uint32_t dgap = *srp++;\
    cid.rid += dgap;\
    if (dgap) { cid.sid = 0; }\
    snp += cid.tf;\
    cid.tf = 1 + *stp++;\
    if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { cid.weight = *sop++; }\
    if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {\
      cid.sid += 1 + *ssp++;\
    } else {\
      cid.sid = 1;\
    }\
    sdf--;\
  } else {\
    cid.rid = 0;\
  }\
} while (0)

#define PUTNEXT_(id) do {\
  uint32_t dgap = id.rid - lid.rid;\
  uint32_t sgap = (dgap ?
id.sid : id.sid - lid.sid) - 1;\
  *ridp++ = dgap;\
  if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {\
    *sidp++ = sgap;\
  }\
  *tfp++ = id.tf - 1;\
  if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { *weightp++ = id.weight; }\
  lid.rid = id.rid;\
  lid.sid = id.sid;\
} while (0)

/*
 * Emits the current chunk posting `cid` to the output arrays (including its
 * positions) and advances the chunk cursor. Raises GRN_FILE_CORRUPT when the
 * posting is not strictly greater than the last emitted one (`lid`) or has
 * tf == 0. The `break` only leaves this macro's own do/while; callers check
 * ctx->rc afterwards.
 */
#define PUTNEXTC() do {\
  if (cid.rid) {\
    if (cid.tf) {\
      if (lid.rid > cid.rid || (lid.rid == cid.rid && lid.sid >= cid.sid)) {\
        DEFINE_NAME(ii);\
        CRIT(GRN_FILE_CORRUPT,\
             "[ii][broken] posting in list is larger than posting in chunk: "\
             "<%.*s>: (%d:%d) -> (%d:%d)",\
             name_size, name, lid.rid, lid.sid, cid.rid, cid.sid);\
        break;\
      }\
      PUTNEXT_(cid);\
      if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) {\
        uint32_t i;\
        for (i = 0; i < cid.tf; i++) {\
          *posp++ = snp[i];\
          spos += snp[i];\
        }\
      }\
    } else {\
      DEFINE_NAME(ii);\
      CRIT(GRN_FILE_CORRUPT,\
           "[ii][broken] invalid posting in chunk: <%.*s>: (%d,%d)",\
           name_size, name, bt->tid, cid.rid);\
      break;\
    }\
  }\
  GETNEXTC();\
} while (0)

/*
 * Advances the buffer cursor `bid` to the record at position `nextb` and
 * leaves `sbp` just after the decoded ids; sets bid.rid to 0 when the list
 * ends. Raises GRN_FILE_CORRUPT when records are not strictly ascending.
 */
#define GETNEXTB() do {\
  if (nextb) {\
    uint32_t lrid = bid.rid, lsid = bid.sid;\
    buffer_rec *br = BUFFER_REC_AT(sb, nextb);\
    sbp = GRN_NEXT_ADDR(br);\
    GRN_B_DEC(bid.rid, sbp);\
    if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {\
      GRN_B_DEC(bid.sid, sbp);\
    } else {\
      bid.sid = 1;\
    }\
    if (lrid > bid.rid || (lrid == bid.rid && lsid >= bid.sid)) {\
      DEFINE_NAME(ii);\
      CRIT(GRN_FILE_CORRUPT,\
           "[ii][broken] postings in block aren't sorted: "\
           "<%.*s>: (%d:%d) -> (%d:%d)",\
           name_size, name, lrid, lsid, bid.rid, bid.sid);\
      break;\
    }\
    nextb = br->step;\
  } else {\
    bid.rid = 0;\
  }\
} while (0)

/*
 * Emits the current buffer posting `bid` (records with sid == 0 or tf == 0
 * produce no output) and advances the buffer cursor.
 */
#define PUTNEXTB() do {\
  if (bid.rid && bid.sid) {\
    GRN_B_DEC(bid.tf, sbp);\
    if (bid.tf > 0) {\
      if (lid.rid > bid.rid || (lid.rid == bid.rid && lid.sid >= bid.sid)) {\
        DEFINE_NAME(ii);\
        CRIT(GRN_FILE_CORRUPT,\
             "[ii][broken] posting in list is larger than posting in buffer: "\
             "<%.*s>: (%d:%d) -> (%d:%d)",\
             name_size, name, lid.rid, lid.sid, bid.rid, bid.sid);\
        break;\
      }\
      if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) {\
        GRN_B_DEC(bid.weight, sbp);\
      }\
      PUTNEXT_(bid);\
if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) {\
        while (bid.tf--) { GRN_B_DEC(*posp, sbp); spos += *posp++; }\
      }\
    }\
  }\
  GETNEXTB();\
} while (0)

/*
 * Merges the buffer cursor (`bid`) and the chunk cursor (`cid`) in ascending
 * (rid, sid) order while `cond` holds. On equal ids the chunk posting is
 * dropped and the buffer posting is emitted; a buffer record with sid == 0
 * drops the chunk posting of the same rid. Stops early when ctx->rc becomes
 * an error or both cursors are exhausted.
 */
#define MERGE_BC(cond) do {\
  if (bid.rid) {\
    if (cid.rid) {\
      if (cid.rid < bid.rid) {\
        PUTNEXTC();\
        if (ctx->rc != GRN_SUCCESS) { break; }\
      } else {\
        if (bid.rid < cid.rid) {\
          PUTNEXTB();\
          if (ctx->rc != GRN_SUCCESS) { break; }\
        } else {\
          if (bid.sid) {\
            if (cid.sid < bid.sid) {\
              PUTNEXTC();\
              if (ctx->rc != GRN_SUCCESS) { break; }\
            } else {\
              if (bid.sid == cid.sid) { GETNEXTC(); }\
              PUTNEXTB();\
              if (ctx->rc != GRN_SUCCESS) { break; }\
            }\
          } else {\
            GETNEXTC();\
          }\
        }\
      }\
    } else {\
      PUTNEXTB();\
      if (ctx->rc != GRN_SUCCESS) { break; }\
    }\
  } else {\
    if (cid.rid) {\
      PUTNEXTC();\
      if (ctx->rc != GRN_SUCCESS) { break; }\
    } else {\
      break;\
    }\
  }\
} while (cond)

/* Location of one sub-chunk of a split term, as stored in the term's data. */
typedef struct {
  uint32_t segno;  /* chunk number */
  uint32_t size;   /* encoded size in bytes; 0 means the chunk is empty */
  uint32_t dgap;   /* record id delta from the previous sub-chunk */
} chunk_info;

/*
 * Writes `encsize` bytes from `enc` into a newly allocated chunk and records
 * its number and size in `cinfo`. With encsize == 0 nothing is allocated and
 * `cinfo` is set to segno 0 / size 0. Returns ctx->rc.
 */
static grn_rc
chunk_flush(grn_ctx *ctx, grn_ii *ii, chunk_info *cinfo, uint8_t *enc, uint32_t encsize)
{
  uint8_t *dc;
  uint32_t dcn;
  grn_io_win dw;
  if (encsize) {
    chunk_new(ctx, ii, &dcn, encsize);
    if (ctx->rc == GRN_SUCCESS) {
      if ((dc = WIN_MAP(ii->chunk, ctx, &dw, dcn, 0, encsize, grn_io_wronly))) {
        grn_memcpy(dc, enc, encsize);
        grn_io_win_unmap(&dw);
        cinfo->segno = dcn;
        cinfo->size = encsize;
      } else {
        /* mapping failed: give the freshly allocated chunk back */
        chunk_free(ctx, ii, dcn, 0, encsize);
        {
          DEFINE_NAME(ii);
          MERR("[ii][chunk][flush] failed to allocate a destination chunk: "
               "<%.*s> :"
               "segment:<%u>, size:<%u>",
               name_size, name,
               dcn, encsize);
        }
      }
    }
  } else {
    cinfo->segno = 0;
    cinfo->size = 0;
  }
  return ctx->rc;
}

/*
 * Merges buffer records with ids up to `rid` into the sub-chunk described by
 * `cinfo`, writes the result to a new chunk and frees the old one. The
 * buffer cursor state is passed in and out through nextbp/sbpp/bidp, and
 * `*balance` is adjusted by the change in the number of postings.
 */
static grn_rc
chunk_merge(grn_ctx *ctx, grn_ii *ii, buffer *sb, buffer_term *bt,
            chunk_info *cinfo, grn_id rid, datavec *dv,
            uint16_t *nextbp, uint8_t **sbpp, docinfo *bidp, int32_t *balance)
{
  grn_io_win sw;
  uint64_t spos = 0;
  uint32_t segno = cinfo->segno, size = cinfo->size, sdf = 0, ndf = 0;
  uint32_t *ridp = NULL, *sidp = NULL, *tfp, *weightp = NULL, *posp = NULL;
  docinfo cid = {0, 0, 0, 0, 0}, lid = {0, 0, 0, 0, 0}, bid = *bidp;
  uint8_t
*scp = WIN_MAP(ii->chunk, ctx, &sw, segno, 0, size, grn_io_rdonly);

  if (scp) {
    uint16_t nextb = *nextbp;
    uint32_t snn = 0, *srp, *ssp = NULL, *stp, *sop = NULL, *snp;
    uint8_t *sbp = *sbpp;
    datavec rdv[MAX_N_ELEMENTS + 1];
    size_t bufsize = S_SEGMENT * ii->n_elements;
    datavec_init(ctx, rdv, ii->n_elements, 0, 0);
    if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) {
      rdv[ii->n_elements - 1].flags = ODD;
    }
    /* decode the existing sub-chunk into rdv */
    bufsize += grn_p_decv(ctx, scp, cinfo->size, rdv, ii->n_elements);
    // (df in chunk list) = a[1] - sdf;
    {
      /* bind the source cursors; optional columns shift the index j */
      int j = 0;
      sdf = rdv[j].data_size;
      srp = rdv[j++].data;
      if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { ssp = rdv[j++].data; }
      stp = rdv[j++].data;
      if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { sop = rdv[j++].data; }
      snn = rdv[j].data_size;
      snp = rdv[j].data;
    }
    datavec_reset(ctx, dv, ii->n_elements, sdf + S_SEGMENT, bufsize);
    if (ctx->rc == GRN_SUCCESS) {
      {
        /* bind the output cursors in the same column order */
        int j = 0;
        ridp = dv[j++].data;
        if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { sidp = dv[j++].data; }
        tfp = dv[j++].data;
        if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { weightp = dv[j++].data; }
        posp = dv[j].data;
      }
      GETNEXTC();
      /* consume buffer records up to rid and everything left in the chunk */
      MERGE_BC(bid.rid <= rid || cid.rid);
      if (ctx->rc == GRN_SUCCESS) {
        /* hand the advanced buffer cursor back to the caller */
        *sbpp = sbp;
        *nextbp = nextb;
        *bidp = bid;
        GRN_ASSERT(posp < dv[ii->n_elements].data);
        ndf = ridp - dv[0].data;
      }
    }
    datavec_fin(ctx, rdv);
    grn_io_win_unmap(&sw);
  } else {
    DEFINE_NAME(ii);
    MERR("[ii][chunk][merge] failed to allocate a source chunk: "
         "<%.*s> :"
         "record:<%u>, segment:<%u>, size:<%u>",
         name_size, name,
         rid,
         segno,
         size);
  }
  if (ctx->rc == GRN_SUCCESS) {
    int j = 0;
    uint8_t *enc;
    uint32_t encsize;
    uint32_t np = posp - dv[ii->n_elements - 1].data;
    uint32_t f_s = (ndf < 3) ? 0 : USE_P_ENC;
    uint32_t f_d = ((ndf < 16) || (ndf <= (lid.rid >> 8))) ?
0 : USE_P_ENC;
    /* set size and encoding flag of every output column */
    dv[j].data_size = ndf; dv[j++].flags = f_d;
    if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {
      dv[j].data_size = ndf; dv[j++].flags = f_s;
    }
    dv[j].data_size = ndf; dv[j++].flags = f_s;
    if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) {
      dv[j].data_size = ndf; dv[j++].flags = f_s;
    }
    if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) {
      uint32_t f_p = ((np < 32) || (np <= (spos >> 13))) ? 0 : USE_P_ENC;
      dv[j].data_size = np; dv[j].flags = f_p|ODD;
    }
    if ((enc = GRN_MALLOC((ndf * 4 + np) * 2))) {
      encsize = grn_p_encv(ctx, dv, ii->n_elements, enc);
      chunk_flush(ctx, ii, cinfo, enc, encsize);
      if (ctx->rc == GRN_SUCCESS) {
        /* the old chunk is released only after the new one was written */
        chunk_free(ctx, ii, segno, 0, size);
      }
      GRN_FREE(enc);
    } else {
      DEFINE_NAME(ii);
      MERR("[ii][chunk][merge] failed to allocate a encode buffer: "
           "<%.*s> :"
           "record:<%u>, segment:<%u>, size:<%u>",
           name_size, name,
           rid,
           segno,
           size);
    }
  }
  *balance += (ndf - sdf);
  return ctx->rc;
}

/*
 * Logs the sizes, flags and contents of the source (`rdv`) and destination
 * (`dv`) data vectors at debug level, 32 values per log line.
 */
static void
buffer_merge_dump_datavec(grn_ctx *ctx,
                          grn_ii *ii,
                          datavec *dv,
                          datavec *rdv)
{
  int i, j;
  grn_obj buffer;

  GRN_TEXT_INIT(&buffer, 0);
  for (i = 0; (uint) i < ii->n_elements; i++) {
    GRN_LOG(ctx, GRN_LOG_DEBUG, "rdv[%d] data_size=%d, flags=%d",
            i, rdv[i].data_size, rdv[i].flags);
    GRN_BULK_REWIND(&buffer);
    for (j = 0; (uint) j < rdv[i].data_size;) {
      grn_text_printf(ctx, &buffer, " %d", rdv[i].data[j]);
      j++;
      /* flush the accumulated text every 32 values and at the end */
      if (!(j % 32) || (uint) j == rdv[i].data_size) {
        GRN_LOG(ctx, GRN_LOG_DEBUG,
                "rdv[%d].data[%d]%.*s",
                i, j,
                (int)GRN_TEXT_LEN(&buffer),
                GRN_TEXT_VALUE(&buffer));
        GRN_BULK_REWIND(&buffer);
      }
    }
  }

  for (i = 0; (uint) i < ii->n_elements; i++) {
    GRN_LOG(ctx, GRN_LOG_DEBUG, "dv[%d] data_size=%d, flags=%d",
            i, dv[i].data_size, dv[i].flags);
    GRN_BULK_REWIND(&buffer);
    for (j = 0; (uint) j < dv[i].data_size;) {
      grn_text_printf(ctx, &buffer, " %d", dv[i].data[j]);
      j++;
      if (!(j % 32) || (uint) j == dv[i].data_size) {
        GRN_LOG(ctx, GRN_LOG_DEBUG,
                "dv[%d].data[%d]%.*s",
                i, j,
                (int)GRN_TEXT_LEN(&buffer),
                GRN_TEXT_VALUE(&buffer));
        GRN_BULK_REWIND(&buffer);
      }
    }
  }

  GRN_OBJ_FIN(ctx,
&buffer); } /* If dc doesn't have enough space, program may be crashed. * TODO: Support auto space extension or max size check. */ static grn_rc buffer_merge(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h, buffer *sb, uint8_t *sc, buffer *db, uint8_t *dc) { buffer_term *bt; uint8_t *sbp = NULL, *dcp = dc; datavec dv[MAX_N_ELEMENTS + 1]; datavec rdv[MAX_N_ELEMENTS + 1]; uint16_t n = db->header.nterms, nterms_void = 0; size_t unitsize = (S_SEGMENT + sb->header.chunk_size / sb->header.nterms) * 2; // size_t unitsize = (S_SEGMENT + sb->header.chunk_size) * 2 + (1<<24); size_t totalsize = unitsize * ii->n_elements; //todo : realloc datavec_init(ctx, dv, ii->n_elements, unitsize, totalsize); if (ctx->rc != GRN_SUCCESS) { DEFINE_NAME(ii); ERR(ctx->rc, "[ii][buffer][merge] failed to initialize data vector: " "<%.*s>: " "unit-size:<%" GRN_FMT_SIZE ">, " "total-size:<%" GRN_FMT_SIZE ">", name_size, name, unitsize, totalsize); return ctx->rc; } datavec_init(ctx, rdv, ii->n_elements, 0, 0); if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) { rdv[ii->n_elements - 1].flags = ODD; } for (bt = db->terms; n; n--, bt++) { uint16_t nextb; uint64_t spos = 0; int32_t balance = 0; uint32_t *ridp, *sidp = NULL, *tfp, *weightp = NULL, *posp, nchunks = 0; uint32_t nvchunks = 0; chunk_info *cinfo = NULL; grn_id crid = GRN_ID_NIL; docinfo cid = {0, 0, 0, 0, 0}, lid = {0, 0, 0, 0, 0}, bid = {0, 0, 0, 0, 0}; uint32_t sdf = 0, snn = 0, ndf; uint32_t *srp = NULL, *ssp = NULL, *stp = NULL, *sop = NULL, *snp = NULL; if (!bt->tid) { nterms_void++; continue; } if (!bt->pos_in_buffer) { GRN_ASSERT(!bt->size_in_buffer); if (bt->size_in_chunk) { grn_memcpy(dcp, sc + bt->pos_in_chunk, bt->size_in_chunk); bt->pos_in_chunk = (uint32_t)(dcp - dc); dcp += bt->size_in_chunk; } continue; } nextb = bt->pos_in_buffer; GETNEXTB(); if (sc && bt->size_in_chunk) { uint8_t *scp = sc + bt->pos_in_chunk; uint8_t *sce = scp + bt->size_in_chunk; size_t size = S_SEGMENT * ii->n_elements; if ((bt->tid & CHUNK_SPLIT)) 
{
        /* split term: the data starts with a list of sub-chunk descriptors */
        int i;
        GRN_B_DEC(nchunks, scp);
        /* one extra slot is reserved for a chunk that may be added later */
        if (!(cinfo = GRN_MALLOCN(chunk_info, nchunks + 1))) {
          datavec_fin(ctx, dv);
          datavec_fin(ctx, rdv);
          {
            DEFINE_NAME(ii);
            MERR("[ii][buffer][merge] failed to allocate chunk info: "
                 "<%.*s> :"
                 "segment:<%u>, "
                 "n-chunks:<%u>, "
                 "unit-size:<%" GRN_FMT_SIZE ">, "
                 "total-size:<%" GRN_FMT_SIZE ">",
                 name_size, name,
                 seg,
                 nchunks,
                 unitsize,
                 totalsize);
          }
          return ctx->rc;
        }
        for (i = 0; (uint) i < nchunks; i++) {
          GRN_B_DEC(cinfo[i].segno, scp);
          GRN_B_DEC(cinfo[i].size, scp);
          GRN_B_DEC(cinfo[i].dgap, scp);
          crid += cinfo[i].dgap;
          /* buffered records fall into this sub-chunk's range: merge them */
          if (bid.rid <= crid) {
            chunk_merge(ctx, ii, sb, bt, &cinfo[i], crid, dv,
                        &nextb, &sbp, &bid, &balance);
            if (ctx->rc != GRN_SUCCESS) {
              if (cinfo) { GRN_FREE(cinfo); }
              datavec_fin(ctx, dv);
              datavec_fin(ctx, rdv);
              {
                DEFINE_NAME(ii);
                ERR(ctx->rc,
                    "[ii][buffer][merge] failed to merge chunk: "
                    "<%.*s>: "
                    "chunk:<%u>, "
                    "n-chunks:<%u>",
                    name_size, name,
                    i,
                    nchunks);
              }
              return ctx->rc;
            }
          }
          if (cinfo[i].size) {
            nvchunks++;
          } else {
            /* NOTE(review): cinfo[i + 1].dgap looks like it is overwritten
             * by GRN_B_DEC on the next iteration, and for the last chunk it
             * is the not-yet-initialized spare slot - confirm intent. */
            crid -= cinfo[i].dgap;
            cinfo[i + 1].dgap += cinfo[i].dgap;
          }
        }
      }
      /* data remaining after the descriptors is the inline posting list */
      if (sce > scp) {
        size += grn_p_decv(ctx, scp, sce - scp, rdv, ii->n_elements);
        {
          int j = 0;
          sdf = rdv[j].data_size;
          srp = rdv[j++].data;
          if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { ssp = rdv[j++].data; }
          stp = rdv[j++].data;
          if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { sop = rdv[j++].data; }
          snn = rdv[j].data_size;
          snp = rdv[j].data;
        }
        datavec_reset(ctx, dv, ii->n_elements, sdf + S_SEGMENT, size);
        if (ctx->rc != GRN_SUCCESS) {
          if (cinfo) { GRN_FREE(cinfo); }
          datavec_fin(ctx, dv);
          datavec_fin(ctx, rdv);
          {
            DEFINE_NAME(ii);
            ERR(ctx->rc,
                "[ii][buffer][merge] failed to reset data vector: "
                "<%.*s>: "
                "unit-size:<%" GRN_FMT_SIZE ">, "
                "total-size:<%" GRN_FMT_SIZE ">",
                name_size, name,
                (size_t)(sdf + S_SEGMENT),
                size);
          }
          return ctx->rc;
        }
      }
    }
    {
      /* bind the output cursors; optional columns shift the index j */
      int j = 0;
      ridp = dv[j++].data;
      if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { sidp = dv[j++].data; }
      tfp = dv[j++].data;
      if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { weightp =
dv[j++].data; }
      posp = dv[j].data;
    }
    GETNEXTC();
    /* merge everything that is left in the buffer list and the chunk */
    MERGE_BC(1);
    if (ctx->rc != GRN_SUCCESS) {
      if (cinfo) { GRN_FREE(cinfo); }
      datavec_fin(ctx, dv);
      datavec_fin(ctx, rdv);
      {
        DEFINE_NAME(ii);
        ERR(ctx->rc,
            "[ii][buffer][merge] failed to merge chunk: <%.*s>",
            name_size, name);
      }
      return ctx->rc;
    }
    GRN_ASSERT(posp < dv[ii->n_elements].data);
    /* number of postings emitted for this term */
    ndf = ridp - dv[0].data;
    /*
    {
      grn_obj buf;
      uint32_t rid, sid, tf, i, pos, *pp;
      GRN_TEXT_INIT(&buf, 0);
      rid = 0;
      pp = dv[3].data;
      for (i = 0; i < ndf; i++) {
        GRN_BULK_REWIND(&buf);
        rid += dv[0].data[i];
        if (dv[0].data[i]) { sid = 0; }
        sid += dv[1].data[i] + 1;
        tf = dv[2].data[i] + 1;
        pos = 0;
        grn_text_itoa(ctx, &buf, rid);
        GRN_TEXT_PUTC(ctx, &buf, ':');
        grn_text_itoa(ctx, &buf, sid);
        GRN_TEXT_PUTC(ctx, &buf, ':');
        grn_text_itoa(ctx, &buf, tf);
        GRN_TEXT_PUTC(ctx, &buf, ':');
        while (tf--) {
          pos += *pp++;
          grn_text_itoa(ctx, &buf, pos);
          if (tf) { GRN_TEXT_PUTC(ctx, &buf, ','); }
        }
        GRN_TEXT_PUTC(ctx, &buf, '\0');
        GRN_LOG(ctx, GRN_LOG_DEBUG, "Posting:%s", GRN_TEXT_VALUE(&buf));
      }
      GRN_OBJ_FIN(ctx, &buf);
    }
    */
    {
      grn_id tid = bt->tid & GRN_ID_MAX;
      uint32_t *a = array_at(ctx, ii, tid);
      if (!a) {
        GRN_LOG(ctx, GRN_LOG_DEBUG, "array_entry not found tid=%d", tid);
        memset(bt, 0, sizeof(buffer_term));
        nterms_void++;
      } else {
        if (!ndf && !nvchunks) {
          /* no posting is left: clear the array entry and drop the term */
          a[0] = 0;
          a[1] = 0;
          lexicon_delete(ctx, ii, tid, h);
          memset(bt, 0, sizeof(buffer_term));
          nterms_void++;
        } else if ((ii->header->flags & GRN_OBJ_WITH_SECTION) &&
                   !nvchunks && ndf == 1 && lid.rid < 0x100000 &&
                   lid.sid < 0x800 && lid.tf == 1 && lid.weight == 0) {
          /* a single small posting is packed into the array entry itself
           * (low bit of a[0] set); the term no longer needs buffer space */
          a[0] = (lid.rid << 12) + (lid.sid << 1) + 1;
          a[1] = (ii->header->flags & GRN_OBJ_WITH_POSITION) ? posp[-1] : 0;
          memset(bt, 0, sizeof(buffer_term));
          nterms_void++;
        } else if (!(ii->header->flags & GRN_OBJ_WITH_SECTION) &&
                   !nvchunks && ndf == 1 && lid.tf == 1 && lid.weight == 0) {
          /* same packing for indexes without sections */
          a[0] = (lid.rid << 1) + 1;
          a[1] = (ii->header->flags & GRN_OBJ_WITH_POSITION) ?
posp[-1] : 0;
          memset(bt, 0, sizeof(buffer_term));
          nterms_void++;
        } else {
          /* general case: encode the merged postings into dc */
          int j = 0;
          uint8_t *dcp0;
          uint32_t encsize;
          uint32_t f_s = (ndf < 3) ? 0 : USE_P_ENC;
          uint32_t f_d = ((ndf < 16) || (ndf <= (lid.rid >> 8))) ? 0 : USE_P_ENC;
          dv[j].data_size = ndf; dv[j++].flags = f_d;
          if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {
            dv[j].data_size = ndf; dv[j++].flags = f_s;
          }
          dv[j].data_size = ndf; dv[j++].flags = f_s;
          if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) {
            dv[j].data_size = ndf; dv[j++].flags = f_s;
          }
          if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) {
            uint32_t np = posp - dv[ii->n_elements - 1].data;
            uint32_t f_p = ((np < 32) || (np <= (spos >> 13))) ? 0 : USE_P_ENC;
            dv[j].data_size = np; dv[j].flags = f_p|ODD;
          }
          /* remember where this term's data starts in dc */
          dcp0 = dcp;
          a[1] = (bt->size_in_chunk ? a[1] : 0) + (ndf - sdf) + balance;
          if (nvchunks) {
            /* write the descriptors of the non-empty sub-chunks first */
            int i;
            GRN_B_ENC(nvchunks, dcp);
            for (i = 0; (uint) i < nchunks; i++) {
              if (cinfo[i].size) {
                GRN_B_ENC(cinfo[i].segno, dcp);
                GRN_B_ENC(cinfo[i].size, dcp);
                GRN_B_ENC(cinfo[i].dgap, dcp);
              }
            }
          }
          encsize = grn_p_encv(ctx, dv, ii->n_elements, dcp);

          if (grn_logger_pass(ctx, GRN_LOG_DEBUG)) {
            /* log when the encoded data reaches the size reserved for dc */
            if (sb->header.chunk_size + S_SEGMENT <= (dcp - dc) + encsize) {
              GRN_LOG(ctx, GRN_LOG_DEBUG,
                      "cs(%d)+(%d)=(%d)"
                      "<=(%" GRN_FMT_LLD ")+(%d)="
                      "(%" GRN_FMT_LLD ")",
                      sb->header.chunk_size,
                      S_SEGMENT,
                      sb->header.chunk_size + S_SEGMENT,
                      (long long int)(dcp - dc),
                      encsize,
                      (long long int)((dcp - dc) + encsize));
              buffer_merge_dump_datavec(ctx, ii, dv, rdv);
            }
          }

          /* large posting list: move it into its own chunk and keep only
           * the descriptor list in dc */
          if (encsize > CHUNK_SPLIT_THRESHOLD &&
              (cinfo || (cinfo = GRN_MALLOCN(chunk_info, nchunks + 1))) &&
              !chunk_flush(ctx, ii, &cinfo[nchunks], dcp, encsize)) {
            int i;
            cinfo[nchunks].dgap = lid.rid - crid;
            nvchunks++;
            dcp = dcp0;
            GRN_B_ENC(nvchunks, dcp);
            for (i = 0; (uint) i <= nchunks; i++) {
              if (cinfo[i].size) {
                GRN_B_ENC(cinfo[i].segno, dcp);
                GRN_B_ENC(cinfo[i].size, dcp);
                GRN_B_ENC(cinfo[i].dgap, dcp);
              }
            }
            GRN_LOG(ctx, GRN_LOG_DEBUG, "split (%d) encsize=%d", tid, encsize);
            bt->tid |= CHUNK_SPLIT;
          } else {
            dcp += encsize;
            if
(!nvchunks) {
              /* no sub-chunk is left, so the term is no longer split */
              bt->tid &= ~CHUNK_SPLIT;
            }
          }
          bt->pos_in_chunk = (uint32_t)(dcp0 - dc);
          bt->size_in_chunk = (uint32_t)(dcp - dcp0);
          bt->size_in_buffer = 0;
          bt->pos_in_buffer = 0;
        }
        array_unref(ii, tid);
      }
    }
    if (cinfo) { GRN_FREE(cinfo); }
  }
  datavec_fin(ctx, rdv);
  datavec_fin(ctx, dv);
  /* all records were merged, so the destination buffer holds only terms */
  db->header.chunk_size = (uint32_t)(dcp - dc);
  db->header.buffer_free =
    S_SEGMENT - sizeof(buffer_header) - db->header.nterms * sizeof(buffer_term);
  db->header.nterms_void = nterms_void;
  return ctx->rc;
}

/*
 * Fills `iw` so that it describes the heap buffer `addr` as a write-only,
 * non-cached window onto chunk `seg` of `size` bytes, without mapping
 * anything. NOTE(review): buffer_flush() passes the result to
 * grn_io_win_unmap(), presumably to write `addr` out - confirm.
 */
static void
fake_map(grn_ctx *ctx, grn_io *io, grn_io_win *iw, void *addr, uint32_t seg, uint32_t size)
{
  iw->ctx = ctx;
  iw->diff = 0;
  iw->io = io;
  iw->mode = grn_io_wronly;
  /* split the chunk number into an io segment and an offset inside it */
  iw->segment = ((seg) >> GRN_II_N_CHUNK_VARIATION);
  iw->offset = (((seg) & ((1 << GRN_II_N_CHUNK_VARIATION) - 1)) << GRN_II_W_LEAST_CHUNK);
  iw->size = size;
  iw->cached = 0;
  iw->addr = addr;
}

/*
 * Flushes logical buffer segment `seg`: merges its records into a newly
 * obtained destination segment and a new chunk, then switches `seg` over to
 * them. Returns ctx->rc.
 */
static grn_rc
buffer_flush(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h)
{
  grn_io_win sw, dw;
  buffer *sb, *db = NULL;
  uint8_t *dc, *sc = NULL;
  uint32_t ds, pseg, scn, dcn = 0;
  if (ii->header->binfo[seg] == GRN_II_PSEG_NOT_ASSIGNED) {
    DEFINE_NAME(ii);
    CRIT(GRN_FILE_CORRUPT,
         "[ii][buffer][flush] invalid segment: "
         "<%.*s> :"
         "request:<%u>, max:<%u>",
         name_size, name,
         seg, ii->seg->header->max_segment);
    return ctx->rc;
  }
  if ((ds = segment_get(ctx, ii)) == ii->seg->header->max_segment) {
    DEFINE_NAME(ii);
    MERR("[ii][buffer][flush] segment is full: "
         "<%.*s> :"
         "request:<%u>, max:<%u>",
         name_size, name,
         seg, ii->seg->header->max_segment);
    return ctx->rc;
  }
  pseg = buffer_open(ctx, ii, SEG2POS(seg, 0), NULL, &sb);
  if (pseg == GRN_II_PSEG_NOT_ASSIGNED) {
    DEFINE_NAME(ii);
    MERR("[ii][buffer][flush] failed to open buffer: "
         "<%.*s> :"
         "segment:<%u>, position:<%u>, max:<%u>",
         name_size, name,
         seg, SEG2POS(seg, 0), ii->seg->header->max_segment);
    return ctx->rc;
  }
  {
    GRN_IO_SEG_REF(ii->seg, ds, db);
    if (db) {
      uint32_t actual_chunk_size = 0;
      uint32_t max_dest_chunk_size = sb->header.chunk_size + S_SEGMENT;
      if ((dc =
GRN_MALLOC(max_dest_chunk_size * 2))) {
        /* the source chunk is mapped only when the buffer has one */
        if ((scn = sb->header.chunk) == GRN_II_PSEG_NOT_ASSIGNED ||
            (sc = WIN_MAP(ii->chunk, ctx, &sw, scn, 0,
                          sb->header.chunk_size, grn_io_rdonly))) {
          uint16_t n = sb->header.nterms;
          /* start from a zeroed destination that carries the same terms */
          memset(db, 0, S_SEGMENT);
          grn_memcpy(db->terms, sb->terms, n * sizeof(buffer_term));
          db->header.nterms = n;
          buffer_merge(ctx, ii, seg, h, sb, sc, db, dc);
          if (ctx->rc == GRN_SUCCESS) {
            actual_chunk_size = db->header.chunk_size;
            if (actual_chunk_size > 0) {
              chunk_new(ctx, ii, &dcn, actual_chunk_size);
            }
            if (ctx->rc == GRN_SUCCESS) {
              grn_rc rc;
              db->header.chunk =
                actual_chunk_size ? dcn : GRN_II_PSEG_NOT_ASSIGNED;
              /* hand dc to the io layer as the content of chunk dcn */
              fake_map(ctx, ii->chunk, &dw, dc, dcn, actual_chunk_size);
              rc = grn_io_win_unmap(&dw);
              if (rc == GRN_SUCCESS) {
                /* switch seg to the new segment, then drop the old chunk */
                buffer_segment_update(ii, seg, ds);
                ii->header->total_chunk_size += actual_chunk_size;
                if (scn != GRN_II_PSEG_NOT_ASSIGNED) {
                  grn_io_win_unmap(&sw);
                  chunk_free(ctx, ii, scn, 0, sb->header.chunk_size);
                  ii->header->total_chunk_size -= sb->header.chunk_size;
                }
              } else {
                GRN_FREE(dc);
                if (actual_chunk_size) {
                  chunk_free(ctx, ii, dcn, 0, actual_chunk_size);
                }
                if (scn != GRN_II_PSEG_NOT_ASSIGNED) { grn_io_win_unmap(&sw); }
                {
                  DEFINE_NAME(ii);
                  ERR(rc,
                      "[ii][buffer][flush] failed to unmap a destination chunk: "
                      "<%.*s> : "
                      "segment:<%u>, destination-segment:<%u>, actual-size:<%u>",
                      name_size, name,
                      seg,
                      dcn,
                      actual_chunk_size);
                }
              }
            } else {
              /* chunk_new() failed */
              GRN_FREE(dc);
              if (scn != GRN_II_PSEG_NOT_ASSIGNED) { grn_io_win_unmap(&sw); }
            }
          } else {
            /* buffer_merge() failed */
            GRN_FREE(dc);
            if (scn != GRN_II_PSEG_NOT_ASSIGNED) { grn_io_win_unmap(&sw); }
          }
        } else {
          GRN_FREE(dc);
          {
            DEFINE_NAME(ii);
            MERR("[ii][buffer][flush] failed to map a source chunk: "
                 "<%.*s> :"
                 "segment:<%u>, source-segment:<%u>, chunk-size:<%u>",
                 name_size, name,
                 seg,
                 scn,
                 sb->header.chunk_size);
          }
        }
      } else {
        DEFINE_NAME(ii);
        MERR("[ii][buffer][flush] failed to allocate a destination chunk: "
             "<%.*s> :"
             "segment:<%u>, destination-segment:<%u>",
             name_size, name,
             seg,
             ds);
      }
      GRN_IO_SEG_UNREF(ii->seg, ds);
    } else {
DEFINE_NAME(ii);
      MERR("[ii][buffer][flush] failed to allocate a destination segment: "
           "<%.*s> :"
           "segment:<%u>, destination-segment:<%u>",
           name_size, name,
           seg,
           ds);
    }
    buffer_close(ctx, ii, pseg);
  }
  return ctx->rc;
}

/*
 * Writes a diagnostic report about logical buffer segment `seg` to the
 * current output: sizes, one entry per term, and counters for detected
 * inconsistencies. Outputs `false` when the segment is not assigned or
 * cannot be opened.
 */
void
grn_ii_buffer_check(grn_ctx *ctx, grn_ii *ii, uint32_t seg)
{
  grn_io_win sw;
  buffer *sb;
  uint8_t *sc = NULL;
  uint32_t pseg, scn, nterms_with_corrupt_chunk = 0, nterm_with_chunk = 0;
  uint32_t ndeleted_terms_with_value = 0;
  buffer_term *bt;
  uint8_t *sbp = NULL;
  datavec rdv[MAX_N_ELEMENTS + 1];
  uint16_t n;
  int nterms_void = 0;
  int size_in_buffer = 0;
  grn_obj buf;
  size_t lower_bound;
  int64_t nloops = 0, nviolations = 0;
  if (ii->header->binfo[seg] == GRN_II_PSEG_NOT_ASSIGNED) {
    GRN_OUTPUT_BOOL(GRN_FALSE);
    return;
  }
  pseg = buffer_open(ctx, ii, SEG2POS(seg, 0), NULL, &sb);
  if (pseg == GRN_II_PSEG_NOT_ASSIGNED) {
    GRN_OUTPUT_BOOL(GRN_FALSE);
    return;
  }
  /* smallest record position considered valid; positions below it are
   * counted as violations further down */
  lower_bound =
    (sb->header.buffer_free + sizeof(buffer_term) * sb->header.nterms)
    / sizeof(buffer_rec);
  datavec_init(ctx, rdv, ii->n_elements, 0, 0);
  if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) {
    rdv[ii->n_elements - 1].flags = ODD;
  }
  GRN_OUTPUT_MAP_OPEN("BUFFER", -1);
  GRN_OUTPUT_CSTR("buffer id");
  GRN_OUTPUT_INT64(seg);
  if ((scn = sb->header.chunk) == GRN_II_PSEG_NOT_ASSIGNED) {
    GRN_OUTPUT_CSTR("void chunk size");
    GRN_OUTPUT_INT64(sb->header.chunk_size);
  } else {
    if ((sc = WIN_MAP(ii->chunk, ctx, &sw, scn, 0, sb->header.chunk_size,
                      grn_io_rdonly))) {
      GRN_OUTPUT_CSTR("chunk size");
      GRN_OUTPUT_INT64(sb->header.chunk_size);
    } else {
      GRN_OUTPUT_CSTR("unmappable chunk size");
      GRN_OUTPUT_INT64(sb->header.chunk_size);
    }
  }
  GRN_OUTPUT_CSTR("buffer term");
  GRN_OUTPUT_ARRAY_OPEN("TERMS", sb->header.nterms);

  GRN_OBJ_INIT(&buf, GRN_BULK, 0, ii->lexicon->header.domain);
  for (bt = sb->terms, n = sb->header.nterms; n; n--, bt++) {
    grn_id tid, tid_;
    char key[GRN_TABLE_MAX_KEY_SIZE];
    int key_size;
    uint16_t nextb;
    uint32_t nchunks = 0;
    chunk_info *cinfo = NULL;
    grn_id crid = GRN_ID_NIL;
    docinfo bid = {0, 0, 0, 0, 0};
uint32_t sdf = 0, snn = 0; uint32_t *srp = NULL, *ssp = NULL, *stp = NULL, *sop = NULL, *snp = NULL; if (!bt->tid && !bt->pos_in_buffer && !bt->size_in_buffer) { nterms_void++; continue; } GRN_OUTPUT_ARRAY_OPEN("TERM", -1); tid = (bt->tid & GRN_ID_MAX); key_size = grn_table_get_key(ctx, ii->lexicon, tid, key, GRN_TABLE_MAX_KEY_SIZE); tid_ = grn_table_get(ctx, ii->lexicon, key, key_size); GRN_TEXT_SET(ctx, &buf, key, key_size); GRN_OUTPUT_OBJ(&buf, NULL); GRN_OUTPUT_INT64(bt->tid); GRN_OUTPUT_INT64(tid_); nextb = bt->pos_in_buffer; size_in_buffer += bt->size_in_buffer; if (tid != tid_ && (bt->size_in_buffer || bt->size_in_chunk)) { ndeleted_terms_with_value++; } GETNEXTB(); GRN_OUTPUT_INT64(bt->size_in_buffer); GRN_OUTPUT_INT64(bt->size_in_chunk); if (sc && bt->size_in_chunk) { uint8_t *scp = sc + bt->pos_in_chunk; uint8_t *sce = scp + bt->size_in_chunk; size_t size = S_SEGMENT * ii->n_elements; if ((bt->tid & CHUNK_SPLIT)) { int i; GRN_B_DEC(nchunks, scp); if (!(cinfo = GRN_MALLOCN(chunk_info, nchunks + 1))) { datavec_fin(ctx, rdv); GRN_OBJ_FIN(ctx, &buf); return; } for (i = 0; (uint) i < nchunks; i++) { GRN_B_DEC(cinfo[i].segno, scp); GRN_B_DEC(cinfo[i].size, scp); GRN_B_DEC(cinfo[i].dgap, scp); crid += cinfo[i].dgap; } } if (sce > scp) { size += grn_p_decv(ctx, scp, sce - scp, rdv, ii->n_elements); { int j = 0; sdf = rdv[j].data_size; GRN_OUTPUT_INT64(sdf); srp = rdv[j++].data; if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { ssp = rdv[j++].data; } if (sdf != rdv[j].data_size) { nterms_with_corrupt_chunk++; } stp = rdv[j++].data; if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { sop = rdv[j++].data; } GRN_OUTPUT_INT64(rdv[j].data_size); snn = rdv[j].data_size; snp = rdv[j].data; } nterm_with_chunk++; } } { uint16_t pos; grn_id rid, sid, rid_ = 0, sid_ = 0; uint8_t *p; buffer_rec *r; for (pos = bt->pos_in_buffer; pos; pos = r->step) { if (pos < lower_bound) { nviolations++; } r = BUFFER_REC_AT(sb, pos); p = GRN_NEXT_ADDR(r); GRN_B_DEC(rid, p); if 
((ii->header->flags & GRN_OBJ_WITH_SECTION)) { GRN_B_DEC(sid, p); } else { sid = 1; } if (rid < rid_ || (rid == rid_ && sid < sid_)) { nloops++; } rid_ = rid; sid_ = sid; } } GRN_OUTPUT_ARRAY_CLOSE(); if (cinfo) { GRN_FREE(cinfo); } } GRN_OBJ_FIN(ctx, &buf); GRN_OUTPUT_ARRAY_CLOSE(); GRN_OUTPUT_CSTR("buffer free"); GRN_OUTPUT_INT64(sb->header.buffer_free); GRN_OUTPUT_CSTR("size in buffer"); GRN_OUTPUT_INT64(size_in_buffer); GRN_OUTPUT_CSTR("nterms"); GRN_OUTPUT_INT64(sb->header.nterms); if (nterms_void != sb->header.nterms_void) { GRN_OUTPUT_CSTR("nterms void gap"); GRN_OUTPUT_INT64(nterms_void - sb->header.nterms_void); } GRN_OUTPUT_CSTR("nterms with chunk"); GRN_OUTPUT_INT64(nterm_with_chunk); if (nterms_with_corrupt_chunk) { GRN_OUTPUT_CSTR("nterms with corrupt chunk"); GRN_OUTPUT_INT64(nterms_with_corrupt_chunk); } if (ndeleted_terms_with_value) { GRN_OUTPUT_CSTR("number of deleted terms with value"); GRN_OUTPUT_INT64(ndeleted_terms_with_value); } if (nloops) { GRN_OUTPUT_CSTR("number of loops"); GRN_OUTPUT_INT64(nloops); } if (nviolations) { GRN_OUTPUT_CSTR("number of violations"); GRN_OUTPUT_INT64(nviolations); } GRN_OUTPUT_MAP_CLOSE(); datavec_fin(ctx, rdv); if (sc) { grn_io_win_unmap(&sw); } buffer_close(ctx, ii, pseg); } typedef struct { buffer_term *bt; const char *key; uint32_t key_size; } term_sort; static int term_compar(const void *t1, const void *t2) { int r; const term_sort *x = (term_sort *)t1, *y = (term_sort *)t2; if (x->key_size > y->key_size) { r = memcmp(x->key, y->key, y->key_size); return r ? r : x->key_size - y->key_size; } else { r = memcmp(x->key, y->key, x->key_size); return r ? 
r : x->key_size - y->key_size; } } static grn_rc term_split(grn_ctx *ctx, grn_obj *lexicon, buffer *sb, buffer *db0, buffer *db1) { uint16_t i, n, *nt; buffer_term *bt; uint32_t s, th = (sb->header.chunk_size + sb->header.nterms) >> 1; term_sort *ts = GRN_MALLOC(sb->header.nterms * sizeof(term_sort)); if (!ts) { return GRN_NO_MEMORY_AVAILABLE; } for (i = 0, n = sb->header.nterms, bt = sb->terms; n; bt++, n--) { if (bt->tid) { grn_id tid = bt->tid & GRN_ID_MAX; ts[i].key = _grn_table_key(ctx, lexicon, tid, &ts[i].key_size); ts[i].bt = bt; i++; } } qsort(ts, i, sizeof(term_sort), term_compar); memset(db0, 0, S_SEGMENT); bt = db0->terms; nt = &db0->header.nterms; for (s = 0; n + 1 < i && s <= th; n++, bt++) { grn_memcpy(bt, ts[n].bt, sizeof(buffer_term)); (*nt)++; s += ts[n].bt->size_in_chunk + 1; } memset(db1, 0, S_SEGMENT); bt = db1->terms; nt = &db1->header.nterms; for (; n < i; n++, bt++) { grn_memcpy(bt, ts[n].bt, sizeof(buffer_term)); (*nt)++; } GRN_FREE(ts); GRN_LOG(ctx, GRN_LOG_DEBUG, "d0=%d d1=%d", db0->header.nterms, db1->header.nterms); return GRN_SUCCESS; } static void array_update(grn_ctx *ctx, grn_ii *ii, uint32_t dls, buffer *db) { uint16_t n; buffer_term *bt; uint32_t *a, pos = SEG2POS(dls, sizeof(buffer_header)); for (n = db->header.nterms, bt = db->terms; n; n--, bt++) { if (bt->tid) { grn_id tid = bt->tid & GRN_ID_MAX; if ((a = array_at(ctx, ii, tid))) { a[0] = pos; array_unref(ii, tid); } else { GRN_LOG(ctx, GRN_LOG_WARNING, "array_at failed (%d)", tid); } } pos += sizeof(buffer_term) >> 2; } } static grn_rc buffer_split(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h) { grn_io_win sw, dw0, dw1; buffer *sb, *db0 = NULL, *db1 = NULL; uint8_t *sc = NULL, *dc0, *dc1; uint32_t dps0 = 0, dps1 = 0, dls0 = 0, dls1 = 0, sps, scn, dcn0 = 0, dcn1 = 0; if (ii->header->binfo[seg] == GRN_II_PSEG_NOT_ASSIGNED) { DEFINE_NAME(ii); CRIT(GRN_FILE_CORRUPT, "[ii][buffer][split] invalid segment: " "<%.*s> :" "request:<%u>, max:<%u>", name_size, name, seg, 
ii->seg->header->max_segment); return ctx->rc; } buffer_segment_reserve(ctx, ii, &dls0, &dps0, &dls1, &dps1); if (ctx->rc != GRN_SUCCESS) { DEFINE_NAME(ii); ERR(ctx->rc, "[ii][buffer][split] failed to reserve buffer segments: " "<%.*s> :" "request:<%u>, max:<%u>", name_size, name, seg, ii->seg->header->max_segment); return ctx->rc; } sps = buffer_open(ctx, ii, SEG2POS(seg, 0), NULL, &sb); if (sps == GRN_II_PSEG_NOT_ASSIGNED) { DEFINE_NAME(ii); MERR("[ii][buffer][split] failed to open buffer: " "<%.*s> :" "segment:<%u>, position:<%u>, max-segment:<%u>", name_size, name, seg, SEG2POS(seg, 0), ii->seg->header->max_segment); } else { GRN_IO_SEG_REF(ii->seg, dps0, db0); if (db0) { GRN_IO_SEG_REF(ii->seg, dps1, db1); if (db1) { uint32_t actual_db0_chunk_size = 0; uint32_t actual_db1_chunk_size = 0; uint32_t max_dest_chunk_size = sb->header.chunk_size + S_SEGMENT; if ((dc0 = GRN_MALLOC(max_dest_chunk_size * 2))) { if ((dc1 = GRN_MALLOC(max_dest_chunk_size * 2))) { if ((scn = sb->header.chunk) == GRN_II_PSEG_NOT_ASSIGNED || (sc = WIN_MAP(ii->chunk, ctx, &sw, scn, 0, sb->header.chunk_size, grn_io_rdonly))) { term_split(ctx, ii->lexicon, sb, db0, db1); buffer_merge(ctx, ii, seg, h, sb, sc, db0, dc0); if (ctx->rc == GRN_SUCCESS) { actual_db0_chunk_size = db0->header.chunk_size; if (actual_db0_chunk_size > 0) { chunk_new(ctx, ii, &dcn0, actual_db0_chunk_size); } if (ctx->rc == GRN_SUCCESS) { grn_rc rc; db0->header.chunk = actual_db0_chunk_size ? 
dcn0 : GRN_II_PSEG_NOT_ASSIGNED; fake_map(ctx, ii->chunk, &dw0, dc0, dcn0, actual_db0_chunk_size); rc = grn_io_win_unmap(&dw0); if (rc == GRN_SUCCESS) { buffer_merge(ctx, ii, seg, h, sb, sc, db1, dc1); if (ctx->rc == GRN_SUCCESS) { actual_db1_chunk_size = db1->header.chunk_size; if (actual_db1_chunk_size > 0) { chunk_new(ctx, ii, &dcn1, actual_db1_chunk_size); } if (ctx->rc == GRN_SUCCESS) { fake_map(ctx, ii->chunk, &dw1, dc1, dcn1, actual_db1_chunk_size); rc = grn_io_win_unmap(&dw1); if (rc == GRN_SUCCESS) { db1->header.chunk = actual_db1_chunk_size ? dcn1 : GRN_II_PSEG_NOT_ASSIGNED; buffer_segment_update(ii, dls0, dps0); buffer_segment_update(ii, dls1, dps1); array_update(ctx, ii, dls0, db0); array_update(ctx, ii, dls1, db1); buffer_segment_clear(ii, seg); ii->header->total_chunk_size += actual_db0_chunk_size; ii->header->total_chunk_size += actual_db1_chunk_size; if (scn != GRN_II_PSEG_NOT_ASSIGNED) { grn_io_win_unmap(&sw); chunk_free(ctx, ii, scn, 0, sb->header.chunk_size); ii->header->total_chunk_size -= sb->header.chunk_size; } } else { if (actual_db1_chunk_size) { chunk_free(ctx, ii, dcn1, 0, actual_db1_chunk_size); } if (actual_db0_chunk_size) { chunk_free(ctx, ii, dcn0, 0, actual_db0_chunk_size); } GRN_FREE(dc1); if (scn != GRN_II_PSEG_NOT_ASSIGNED) { grn_io_win_unmap(&sw); } { DEFINE_NAME(ii); ERR(rc, "[ii][buffer[merge] " "failed to unmap a destination chunk2: " "<%.*s> :" "segment:<%u>, " "destination-chunk1:<%u>, " "destination-chunk2:<%u>, " "actual-size1:<%u>, " "actual-size2:<%u>", name_size, name, seg, dcn0, dcn1, actual_db0_chunk_size, actual_db1_chunk_size); } } } else { if (actual_db0_chunk_size) { chunk_free(ctx, ii, dcn0, 0, actual_db0_chunk_size); } GRN_FREE(dc1); if (scn != GRN_II_PSEG_NOT_ASSIGNED) { grn_io_win_unmap(&sw); } } } else { if (actual_db0_chunk_size) { chunk_free(ctx, ii, dcn0, 0, actual_db0_chunk_size); } GRN_FREE(dc1); if (scn != GRN_II_PSEG_NOT_ASSIGNED) { grn_io_win_unmap(&sw); } } } else { if (actual_db0_chunk_size) { 
chunk_free(ctx, ii, dcn0, 0, actual_db0_chunk_size); } GRN_FREE(dc1); GRN_FREE(dc0); if (scn != GRN_II_PSEG_NOT_ASSIGNED) { grn_io_win_unmap(&sw); } { DEFINE_NAME(ii); ERR(rc, "[ii][buffer[merge] " "failed to unmap a destination chunk1: " "<%.*s> :" "segment:<%u>, " "destination-chunk1:<%u>, " "actual-size1:<%u>", name_size, name, seg, dcn0, actual_db0_chunk_size); } } } else { GRN_FREE(dc1); GRN_FREE(dc0); if (scn != GRN_II_PSEG_NOT_ASSIGNED) { grn_io_win_unmap(&sw); } } } else { GRN_FREE(dc1); GRN_FREE(dc0); if (scn != GRN_II_PSEG_NOT_ASSIGNED) { grn_io_win_unmap(&sw); } } } else { GRN_FREE(dc1); GRN_FREE(dc0); { DEFINE_NAME(ii); MERR("[ii][buffer][split] failed to map a source chunk: " "<%.*s> :" "segment:<%u>, " "source-segment:<%u>, " "chunk-size:<%u>", name_size, name, seg, scn, sb->header.chunk_size); } } } else { GRN_FREE(dc0); { DEFINE_NAME(ii); MERR("[ii][buffer][split] " "failed to allocate a destination chunk2: " "<%.*s> :" "segment:<%u>, " "destination-segment1:<%u>, " "destination-segment2:<%u>", name_size, name, seg, dps0, dps1); } } } else { DEFINE_NAME(ii); MERR("[ii][buffer][split] failed to allocate a destination chunk1: " "<%.*s>: " "segment:<%u>, " "destination-segment1:<%u>, " "destination-segment2:<%u>", name_size, name, seg, dps0, dps1); } GRN_IO_SEG_UNREF(ii->seg, dps1); } else { DEFINE_NAME(ii); MERR("[ii][buffer][split] failed to allocate a destination segment2: " "<%.*s>: " "segment:<%u>, " "destination-segment1:<%u>, " "destination-segment2:<%u>", name_size, name, seg, dps0, dps1); } GRN_IO_SEG_UNREF(ii->seg, dps0); } else { DEFINE_NAME(ii); MERR("[ii][buffer][split] failed to allocate a destination segment1: " "<%.*s>: " "segment:<%u>, " "destination-segment1:<%u>, " "destination-segment2:<%u>", name_size, name, seg, dps0, dps1); } buffer_close(ctx, ii, sps); } return ctx->rc; } #define SCALE_FACTOR 2048 #define MAX_NTERMS 8192 #define SPLIT_COND(ii, buffer)\ ((buffer)->header.nterms > 1024 ||\ ((buffer)->header.nterms > 1 &&\ 
(buffer)->header.chunk_size * 100 > (ii)->header->total_chunk_size)) inline static void buffer_new_find_segment(grn_ctx *ctx, grn_ii *ii, int size, grn_id tid, grn_hash *h, buffer **b, uint32_t *lseg, uint32_t *pseg) { uint32_t *a; a = array_at(ctx, ii, tid); if (!a) { return; } for (;;) { uint32_t pos = a[0]; if (!pos || (pos & 1)) { break; } *pseg = buffer_open(ctx, ii, pos, NULL, b); if (*pseg == GRN_II_PSEG_NOT_ASSIGNED) { break; } if ((*b)->header.buffer_free >= size + sizeof(buffer_term)) { *lseg = LSEG(pos); break; } buffer_close(ctx, ii, *pseg); if (SPLIT_COND(ii, (*b))) { /* ((S_SEGMENT - sizeof(buffer_header) + ii->header->bmax - (*b)->header.nterms * sizeof(buffer_term)) * 4 < (*b)->header.chunk_size) */ GRN_LOG(ctx, GRN_LOG_DEBUG, "nterms=%d chunk=%d total=%" GRN_FMT_INT64U, (*b)->header.nterms, (*b)->header.chunk_size, ii->header->total_chunk_size >> 10); if (buffer_split(ctx, ii, LSEG(pos), h)) { break; } } else { if (S_SEGMENT - sizeof(buffer_header) - (*b)->header.nterms * sizeof(buffer_term) < size + sizeof(buffer_term)) { break; } if (buffer_flush(ctx, ii, LSEG(pos), h)) { break; } } } array_unref(ii, tid); } inline static void buffer_new_lexicon_pat(grn_ctx *ctx, grn_ii *ii, int size, grn_id id, grn_hash *h, buffer **b, uint32_t *lseg, uint32_t *pseg) { grn_pat_cursor *cursor; char key[GRN_TABLE_MAX_KEY_SIZE]; int key_size; key_size = grn_table_get_key(ctx, ii->lexicon, id, key, GRN_TABLE_MAX_KEY_SIZE); if (ii->lexicon->header.flags & GRN_OBJ_KEY_VAR_SIZE) { grn_obj *tokenizer = NULL; grn_table_get_info(ctx, ii->lexicon, NULL, NULL, &tokenizer, NULL, NULL); if (tokenizer) { /* For natural language */ cursor = grn_pat_cursor_open(ctx, (grn_pat *)(ii->lexicon), key, key_size, NULL, 0, 0, -1, GRN_CURSOR_ASCENDING|GRN_CURSOR_GT); if (cursor) { grn_id tid; while (ctx->rc == GRN_SUCCESS && *lseg == GRN_II_PSEG_NOT_ASSIGNED && (tid = grn_pat_cursor_next(ctx, cursor))) { buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg); } 
grn_pat_cursor_close(ctx, cursor); } } else { /* For text data */ int target_key_size = key_size; int reduced_key_size = 0; while (*lseg == GRN_II_PSEG_NOT_ASSIGNED && target_key_size > 0) { grn_id tid; cursor = grn_pat_cursor_open(ctx, (grn_pat *)(ii->lexicon), key, target_key_size, NULL, 0, 0, -1, GRN_CURSOR_PREFIX); if (!cursor) { break; } if (reduced_key_size == 0) { while (ctx->rc == GRN_SUCCESS && *lseg == GRN_II_PSEG_NOT_ASSIGNED && (tid = grn_pat_cursor_next(ctx, cursor))) { buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg); } } else { while (ctx->rc == GRN_SUCCESS && *lseg == GRN_II_PSEG_NOT_ASSIGNED && (tid = grn_pat_cursor_next(ctx, cursor))) { void *current_key; int current_key_size; current_key_size = grn_pat_cursor_get_key(ctx, cursor, ¤t_key); if (memcmp(((char *)current_key) + target_key_size, key + target_key_size, reduced_key_size) == 0) { continue; } buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg); } } grn_pat_cursor_close(ctx, cursor); if (reduced_key_size == 0) { reduced_key_size = 1; } else { reduced_key_size *= 2; } target_key_size -= reduced_key_size; } } } else { /* For other data */ cursor = grn_pat_cursor_open(ctx, (grn_pat *)(ii->lexicon), NULL, 0, key, key_size, 0, -1, GRN_CURSOR_PREFIX); if (cursor) { grn_id tid; while (ctx->rc == GRN_SUCCESS && *lseg == GRN_II_PSEG_NOT_ASSIGNED && (tid = grn_pat_cursor_next(ctx, cursor))) { buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg); } grn_pat_cursor_close(ctx, cursor); } } } inline static void buffer_new_lexicon_other(grn_ctx *ctx, grn_ii *ii, int size, grn_id id, grn_hash *h, buffer **b, uint32_t *lseg, uint32_t *pseg) { GRN_TABLE_EACH_BEGIN(ctx, ii->lexicon, cursor, tid) { if (ctx->rc != GRN_SUCCESS || *lseg != GRN_II_PSEG_NOT_ASSIGNED) { break; } buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg); } GRN_TABLE_EACH_END(ctx, cursor); } inline static uint32_t buffer_new(grn_ctx *ctx, grn_ii *ii, int size, uint32_t *pos, buffer_term **bt, 
buffer_rec **br, buffer **bp, grn_id id, grn_hash *h) { buffer *b = NULL; uint16_t offset; uint32_t lseg = GRN_II_PSEG_NOT_ASSIGNED, pseg = GRN_II_PSEG_NOT_ASSIGNED; if (S_SEGMENT - sizeof(buffer_header) < size + sizeof(buffer_term)) { DEFINE_NAME(ii); MERR("[ii][buffer][new] requested size is too large: " "<%.*s> :" "requested:<%" GRN_FMT_SIZE ">, max:<%" GRN_FMT_SIZE ">", name_size, name, (size_t)(size + sizeof(buffer_term)), (size_t)(S_SEGMENT - sizeof(buffer_header))); return GRN_II_PSEG_NOT_ASSIGNED; } if (ii->lexicon->header.type == GRN_TABLE_PAT_KEY) { buffer_new_lexicon_pat(ctx, ii, size, id, h, &b, &lseg, &pseg); } else { buffer_new_lexicon_other(ctx, ii, size, id, h, &b, &lseg, &pseg); } if (lseg == GRN_II_PSEG_NOT_ASSIGNED) { if (buffer_segment_new(ctx, ii, &lseg) || (pseg = buffer_open(ctx, ii, SEG2POS(lseg, 0), NULL, &b)) == GRN_II_PSEG_NOT_ASSIGNED) { return GRN_II_PSEG_NOT_ASSIGNED; } memset(b, 0, S_SEGMENT); b->header.buffer_free = S_SEGMENT - sizeof(buffer_header); b->header.chunk = GRN_II_PSEG_NOT_ASSIGNED; } if (b->header.nterms_void) { for (offset = 0; offset < b->header.nterms; offset++) { if (!b->terms[offset].tid) { break; } } if (offset == b->header.nterms) { GRN_LOG(ctx, GRN_LOG_DEBUG, "inconsistent buffer(%d)", lseg); b->header.nterms_void = 0; b->header.nterms++; b->header.buffer_free -= size + sizeof(buffer_term); } else { b->header.nterms_void--; b->header.buffer_free -= size; } } else { offset = b->header.nterms++; b->header.buffer_free -= size + sizeof(buffer_term); } *pos = SEG2POS(lseg, (sizeof(buffer_header) + sizeof(buffer_term) * offset)); *bt = &b->terms[offset]; *br = (buffer_rec *)(((byte *)&b->terms[b->header.nterms]) + b->header.buffer_free); *bp = b; return pseg; } /* ii */ static grn_ii * _grn_ii_create(grn_ctx *ctx, grn_ii *ii, const char *path, grn_obj *lexicon, uint32_t flags) { int i; uint32_t max_n_segments; uint32_t max_n_chunks; grn_io *seg, *chunk; char path2[PATH_MAX]; struct grn_ii_header *header; grn_table_flags 
lflags; grn_encoding encoding; grn_obj *tokenizer; /* for (i = 0; i < 32; i++) { new_histogram[i] = 0; free_histogram[i] = 0; } */ if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer, NULL, NULL)) { return NULL; } if (path && strlen(path) + 6 >= PATH_MAX) { return NULL; } if (flags & GRN_OBJ_INDEX_SMALL) { max_n_segments = grn_ii_max_n_segments_small; max_n_chunks = grn_ii_max_n_chunks_small; } else if (flags & GRN_OBJ_INDEX_MEDIUM) { max_n_segments = MAX_PSEG_MEDIUM; max_n_chunks = GRN_II_MAX_CHUNK_MEDIUM; } else { max_n_segments = MAX_PSEG; max_n_chunks = GRN_II_MAX_CHUNK; } seg = grn_io_create(ctx, path, sizeof(struct grn_ii_header), S_SEGMENT, max_n_segments, grn_io_auto, GRN_IO_EXPIRE_SEGMENT); if (!seg) { return NULL; } if (path) { grn_strcpy(path2, PATH_MAX, path); grn_strcat(path2, PATH_MAX, ".c"); chunk = grn_io_create(ctx, path2, 0, S_CHUNK, max_n_chunks, grn_io_auto, GRN_IO_EXPIRE_SEGMENT); } else { chunk = grn_io_create(ctx, NULL, 0, S_CHUNK, max_n_chunks, grn_io_auto, 0); } if (!chunk) { grn_io_close(ctx, seg); grn_io_remove(ctx, path); return NULL; } header = grn_io_header(seg); grn_io_set_type(seg, GRN_COLUMN_INDEX); for (i = 0; i < GRN_II_MAX_LSEG; i++) { header->ainfo[i] = GRN_II_PSEG_NOT_ASSIGNED; header->binfo[i] = GRN_II_PSEG_NOT_ASSIGNED; } for (i = 0; i <= GRN_II_N_CHUNK_VARIATION; i++) { header->free_chunks[i] = GRN_II_PSEG_NOT_ASSIGNED; header->garbages[i] = GRN_II_PSEG_NOT_ASSIGNED; } header->flags = flags; ii->seg = seg; ii->chunk = chunk; ii->lexicon = lexicon; ii->lflags = lflags; ii->encoding = encoding; ii->header = header; ii->n_elements = 2; if ((flags & GRN_OBJ_WITH_SECTION)) { ii->n_elements++; } if ((flags & GRN_OBJ_WITH_WEIGHT)) { ii->n_elements++; } if ((flags & GRN_OBJ_WITH_POSITION)) { ii->n_elements++; } return ii; } grn_ii * grn_ii_create(grn_ctx *ctx, const char *path, grn_obj *lexicon, uint32_t flags) { grn_ii *ii = NULL; if (!(ii = GRN_MALLOCN(grn_ii, 1))) { return NULL; } GRN_DB_OBJ_SET_TYPE(ii, 
GRN_COLUMN_INDEX); if (!_grn_ii_create(ctx, ii, path, lexicon, flags)) { GRN_FREE(ii); return NULL; } return ii; } grn_rc grn_ii_remove(grn_ctx *ctx, const char *path) { grn_rc rc; char buffer[PATH_MAX]; if (!path || strlen(path) > PATH_MAX - 4) { return GRN_INVALID_ARGUMENT; } if ((rc = grn_io_remove(ctx, path))) { goto exit; } grn_snprintf(buffer, PATH_MAX, PATH_MAX, "%-.256s.c", path); rc = grn_io_remove(ctx, buffer); exit : return rc; } grn_rc grn_ii_truncate(grn_ctx *ctx, grn_ii *ii) { grn_rc rc; const char *io_segpath, *io_chunkpath; char *segpath, *chunkpath = NULL; grn_obj *lexicon; uint32_t flags; if ((io_segpath = grn_io_path(ii->seg)) && *io_segpath != '\0') { if (!(segpath = GRN_STRDUP(io_segpath))) { ERR(GRN_NO_MEMORY_AVAILABLE, "cannot duplicate path: <%-.256s>", io_segpath); return GRN_NO_MEMORY_AVAILABLE; } if ((io_chunkpath = grn_io_path(ii->chunk)) && *io_chunkpath != '\0') { if (!(chunkpath = GRN_STRDUP(io_chunkpath))) { ERR(GRN_NO_MEMORY_AVAILABLE, "cannot duplicate path: <%-.256s>", io_chunkpath); return GRN_NO_MEMORY_AVAILABLE; } } else { chunkpath = NULL; } } else { segpath = NULL; } lexicon = ii->lexicon; flags = ii->header->flags; if ((rc = grn_io_close(ctx, ii->seg))) { goto exit; } if ((rc = grn_io_close(ctx, ii->chunk))) { goto exit; } ii->seg = NULL; ii->chunk = NULL; if (segpath && (rc = grn_io_remove(ctx, segpath))) { goto exit; } if (chunkpath && (rc = grn_io_remove(ctx, chunkpath))) { goto exit; } if (!_grn_ii_create(ctx, ii, segpath, lexicon, flags)) { rc = GRN_UNKNOWN_ERROR; } exit: if (segpath) { GRN_FREE(segpath); } if (chunkpath) { GRN_FREE(chunkpath); } return rc; } grn_ii * grn_ii_open(grn_ctx *ctx, const char *path, grn_obj *lexicon) { grn_io *seg, *chunk; grn_ii *ii; char path2[PATH_MAX]; struct grn_ii_header *header; uint32_t io_type; grn_table_flags lflags; grn_encoding encoding; grn_obj *tokenizer; if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer, NULL, NULL)) { return NULL; } if (strlen(path) + 6 >= 
PATH_MAX) { return NULL; } grn_strcpy(path2, PATH_MAX, path); grn_strcat(path2, PATH_MAX, ".c"); seg = grn_io_open(ctx, path, grn_io_auto); if (!seg) { return NULL; } chunk = grn_io_open(ctx, path2, grn_io_auto); if (!chunk) { grn_io_close(ctx, seg); return NULL; } header = grn_io_header(seg); io_type = grn_io_get_type(seg); if (io_type != GRN_COLUMN_INDEX) { ERR(GRN_INVALID_FORMAT, "[column][index] file type must be %#04x: <%#04x>", GRN_COLUMN_INDEX, io_type); grn_io_close(ctx, seg); grn_io_close(ctx, chunk); return NULL; } if (!(ii = GRN_MALLOCN(grn_ii, 1))) { grn_io_close(ctx, seg); grn_io_close(ctx, chunk); return NULL; } GRN_DB_OBJ_SET_TYPE(ii, GRN_COLUMN_INDEX); ii->seg = seg; ii->chunk = chunk; ii->lexicon = lexicon; ii->lflags = lflags; ii->encoding = encoding; ii->header = header; ii->n_elements = 2; if ((header->flags & GRN_OBJ_WITH_SECTION)) { ii->n_elements++; } if ((header->flags & GRN_OBJ_WITH_WEIGHT)) { ii->n_elements++; } if ((header->flags & GRN_OBJ_WITH_POSITION)) { ii->n_elements++; } return ii; } grn_rc grn_ii_close(grn_ctx *ctx, grn_ii *ii) { grn_rc rc; if (!ii) { return GRN_INVALID_ARGUMENT; } if ((rc = grn_io_close(ctx, ii->seg))) { return rc; } if ((rc = grn_io_close(ctx, ii->chunk))) { return rc; } GRN_FREE(ii); /* { int i; for (i = 0; i < 32; i++) { GRN_LOG(ctx, GRN_LOG_DEBUG, "new[%d]=%d free[%d]=%d", i, new_histogram[i], i, free_histogram[i]); } } */ return rc; } grn_rc grn_ii_info(grn_ctx *ctx, grn_ii *ii, uint64_t *seg_size, uint64_t *chunk_size) { grn_rc rc; if (seg_size) { if ((rc = grn_io_size(ctx, ii->seg, seg_size))) { return rc; } } if (chunk_size) { if ((rc = grn_io_size(ctx, ii->chunk, chunk_size))) { return rc; } } return GRN_SUCCESS; } grn_column_flags grn_ii_get_flags(grn_ctx *ctx, grn_ii *ii) { if (!ii) { return 0; } return ii->header->flags; } uint32_t grn_ii_get_n_elements(grn_ctx *ctx, grn_ii *ii) { if (!ii) { return 0; } return ii->n_elements; } void grn_ii_expire(grn_ctx *ctx, grn_ii *ii) { /* grn_io_expire(ctx, 
ii->seg, 128, 1000000); */ grn_io_expire(ctx, ii->chunk, 0, 1000000); } grn_rc grn_ii_flush(grn_ctx *ctx, grn_ii *ii) { grn_rc rc; rc = grn_io_flush(ctx, ii->seg); if (rc == GRN_SUCCESS) { rc = grn_io_flush(ctx, ii->chunk); } return rc; } size_t grn_ii_get_disk_usage(grn_ctx *ctx, grn_ii *ii) { size_t usage; usage = grn_io_get_disk_usage(ctx, ii->seg); usage += grn_io_get_disk_usage(ctx, ii->chunk); return usage; } #define BIT11_01(x) ((x >> 1) & 0x7ff) #define BIT31_12(x) (x >> 12) grn_rc grn_ii_update_one(grn_ctx *ctx, grn_ii *ii, grn_id tid, grn_ii_updspec *u, grn_hash *h) { buffer *b; uint8_t *bs; buffer_rec *br = NULL; buffer_term *bt; uint32_t pseg = 0, pos = 0, size, *a; if (!tid) { return ctx->rc; } if (!u->tf || !u->sid) { return grn_ii_delete_one(ctx, ii, tid, u, h); } if (u->sid > ii->header->smax) { ii->header->smax = u->sid; } if (!(a = array_get(ctx, ii, tid))) { DEFINE_NAME(ii); MERR("[ii][update][one] failed to allocate an array: " "<%.*s>: " "<%u>:<%u>:<%u>", name_size, name, u->rid, u->sid, tid); return ctx->rc; } if (!(bs = encode_rec(ctx, ii, u, &size, 0))) { DEFINE_NAME(ii); MERR("[ii][update][one] failed to encode a record: " "<%.*s>: " "<%u>:<%u>:<%u>", name_size, name, u->rid, u->sid, tid); goto exit; } for (;;) { if (a[0]) { if (!(a[0] & 1)) { pos = a[0]; if ((pseg = buffer_open(ctx, ii, pos, &bt, &b)) == GRN_II_PSEG_NOT_ASSIGNED) { DEFINE_NAME(ii); MERR("[ii][update][one] failed to allocate a buffer: " "<%.*s>: " "<%u>:<%u>:<%u>: " "segment:<%u>", name_size, name, u->rid, u->sid, tid, pos); goto exit; } if (b->header.buffer_free < size) { int bfb = b->header.buffer_free; GRN_LOG(ctx, GRN_LOG_DEBUG, "flushing a[0]=%d seg=%d(%p) free=%d", a[0], LSEG(a[0]), b, b->header.buffer_free); buffer_close(ctx, ii, pseg); if (SPLIT_COND(ii, b)) { /*((S_SEGMENT - sizeof(buffer_header) + ii->header->bmax - b->header.nterms * sizeof(buffer_term)) * 4 < b->header.chunk_size)*/ GRN_LOG(ctx, GRN_LOG_DEBUG, "nterms=%d chunk=%d total=%" GRN_FMT_INT64U, 
b->header.nterms, b->header.chunk_size, ii->header->total_chunk_size >> 10); buffer_split(ctx, ii, LSEG(pos), h); if (ctx->rc != GRN_SUCCESS) { DEFINE_NAME(ii); ERR(ctx->rc, "[ii][update][one] failed to split a buffer: " "<%.*s>: " "<%u>:<%u><%u>: " "segment:<%u>", name_size, name, u->rid, u->sid, tid, pos); goto exit; } continue; } buffer_flush(ctx, ii, LSEG(pos), h); if (ctx->rc != GRN_SUCCESS) { DEFINE_NAME(ii); ERR(ctx->rc, "[ii][update][one] failed to flush a buffer: " "<%.*s>: " "<%u>:<%u><%u>: " "segment:<%u>", name_size, name, u->rid, u->sid, tid, pos); goto exit; } if (a[0] != pos) { GRN_LOG(ctx, GRN_LOG_DEBUG, "grn_ii_update_one: a[0] changed %d->%d", a[0], pos); continue; } if ((pseg = buffer_open(ctx, ii, pos, &bt, &b)) == GRN_II_PSEG_NOT_ASSIGNED) { GRN_LOG(ctx, GRN_LOG_CRIT, "buffer not found a[0]=%d", a[0]); { DEFINE_NAME(ii); MERR("[ii][update][one] failed to reallocate a buffer: " "<%.*s>: " "<%u>:<%u>:<%u>: " "segment:<%u>, new-segment:<%u>", name_size, name, u->rid, u->sid, tid, pos, a[0]); } goto exit; } GRN_LOG(ctx, GRN_LOG_DEBUG, "flushed a[0]=%d seg=%d(%p) free=%d->%d nterms=%d v=%d", a[0], LSEG(a[0]), b, bfb, b->header.buffer_free, b->header.nterms, b->header.nterms_void); if (b->header.buffer_free < size) { DEFINE_NAME(ii); MERR("[ii][update][one] buffer is full: " "<%.*s>: " "<%u>:<%u><%u>: " "segment:<%u>, new-segment:<%u>, free:<%u>, required:<%u>", name_size, name, u->rid, u->sid, tid, pos, a[0], b->header.buffer_free, size); buffer_close(ctx, ii, pseg); /* todo: direct merge */ goto exit; } } b->header.buffer_free -= size; br = (buffer_rec *)(((byte *)&b->terms[b->header.nterms]) + b->header.buffer_free); } else { grn_ii_updspec u2; uint32_t size2 = 0, v = a[0]; struct _grn_ii_pos pos2; pos2.pos = a[1]; pos2.next = NULL; u2.pos = &pos2; if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { u2.rid = BIT31_12(v); u2.sid = BIT11_01(v); } else { u2.rid = v >> 1; u2.sid = 1; } u2.tf = 1; u2.weight = 0; if (u2.rid != u->rid || u2.sid != u->sid) 
{ uint8_t *bs2 = encode_rec(ctx, ii, &u2, &size2, 0); if (!bs2) { DEFINE_NAME(ii); MERR("[ii][update][one] failed to encode a record2: " "<%.*s>: " "<%u>:<%u>:<%u>", name_size, name, u2.rid, u2.sid, tid); goto exit; } pseg = buffer_new(ctx, ii, size + size2, &pos, &bt, &br, &b, tid, h); if (pseg == GRN_II_PSEG_NOT_ASSIGNED) { GRN_FREE(bs2); { DEFINE_NAME(ii); MERR("[ii][update][one] failed to create a buffer2: " "<%.*s>: " "<%u>:<%u>:<%u>: " "size:<%u>", name_size, name, u2.rid, u2.sid, tid, size + size2); } goto exit; } bt->tid = tid; bt->size_in_chunk = 0; bt->pos_in_chunk = 0; bt->size_in_buffer = 0; bt->pos_in_buffer = 0; buffer_put(ctx, ii, b, bt, br, bs2, &u2, size2); if (ctx->rc != GRN_SUCCESS) { GRN_FREE(bs2); buffer_close(ctx, ii, pseg); { DEFINE_NAME(ii); MERR("[ii][update][one] failed to put to buffer: " "<%.*s>: " "<%u>:<%u>:<%u>", name_size, name, u2.rid, u2.sid, tid); } goto exit; } br = (buffer_rec *)(((byte *)br) + size2); GRN_FREE(bs2); } } } break; } if (!br) { if (u->tf == 1 && u->weight == 0) { if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { if (u->rid < 0x100000 && u->sid < 0x800) { a[0] = (u->rid << 12) + (u->sid << 1) + 1; a[1] = u->pos->pos; goto exit; } } else { a[0] = (u->rid << 1) + 1; a[1] = u->pos->pos; goto exit; } } pseg = buffer_new(ctx, ii, size, &pos, &bt, &br, &b, tid, h); if (pseg == GRN_II_PSEG_NOT_ASSIGNED) { DEFINE_NAME(ii); MERR("[ii][update][one] failed to create a buffer: " "<%.*s>: " "<%u>:<%u>:<%u>: " "size:<%u>", name_size, name, u->rid, u->sid, tid, size); goto exit; } bt->tid = tid; bt->size_in_chunk = 0; bt->pos_in_chunk = 0; bt->size_in_buffer = 0; bt->pos_in_buffer = 0; } buffer_put(ctx, ii, b, bt, br, bs, u, size); buffer_close(ctx, ii, pseg); if (!a[0] || (a[0] & 1)) { a[0] = pos; } exit : array_unref(ii, tid); if (bs) { GRN_FREE(bs); } if (u->tf != u->atf) { grn_obj *source_table; char source_table_name[GRN_TABLE_MAX_KEY_SIZE]; int source_table_name_size; char term[GRN_TABLE_MAX_KEY_SIZE]; int term_size; 
source_table = grn_ctx_at(ctx, DB_OBJ(ii)->range); if (source_table) { source_table_name_size = grn_obj_name(ctx, source_table, source_table_name, GRN_TABLE_MAX_KEY_SIZE); } else { grn_strcpy(source_table_name, GRN_TABLE_MAX_KEY_SIZE, "(null)"); source_table_name_size = strlen(source_table_name); } term_size = grn_table_get_key(ctx, ii->lexicon, tid, term, GRN_TABLE_MAX_KEY_SIZE); { DEFINE_NAME(ii); GRN_LOG(ctx, GRN_LOG_WARNING, "[ii][update][one] too many postings: " "<%.*s>: " "record:<%.*s>(%d), " "n-postings:<%d>, " "n-discarded-postings:<%d>, " "term:<%d>(<%.*s>)", name_size, name, source_table_name_size, source_table_name, u->rid, u->atf, u->atf - u->tf, tid, term_size, term); } } grn_ii_expire(ctx, ii); return ctx->rc; } grn_rc grn_ii_delete_one(grn_ctx *ctx, grn_ii *ii, grn_id tid, grn_ii_updspec *u, grn_hash *h) { buffer *b; uint8_t *bs = NULL; buffer_rec *br; buffer_term *bt; uint32_t pseg, size, *a; if (!tid) { return ctx->rc; } if (!(a = array_at(ctx, ii, tid))) { return ctx->rc; } for (;;) { if (!a[0]) { goto exit; } if (a[0] & 1) { if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { uint32_t rid = BIT31_12(a[0]); uint32_t sid = BIT11_01(a[0]); if (u->rid == rid && (!u->sid || u->sid == sid)) { a[0] = 0; lexicon_delete(ctx, ii, tid, h); } } else { uint32_t rid = a[0] >> 1; if (u->rid == rid) { a[0] = 0; lexicon_delete(ctx, ii, tid, h); } } goto exit; } if (!(bs = encode_rec(ctx, ii, u, &size, 1))) { DEFINE_NAME(ii); MERR("[ii][delete][one] failed to encode a record: " "<%.*s>: " "<%u>:<%u>:<%u>", name_size, name, u->rid, u->sid, tid); goto exit; } if ((pseg = buffer_open(ctx, ii, a[0], &bt, &b)) == GRN_II_PSEG_NOT_ASSIGNED) { DEFINE_NAME(ii); MERR("[ii][delete][one] failed to allocate a buffer: " "<%.*s>: " "<%u>:<%u><%u>: " "position:<%u>", name_size, name, u->rid, u->sid, tid, a[0]); goto exit; } if (b->header.buffer_free < size) { uint32_t _a = a[0]; GRN_LOG(ctx, GRN_LOG_DEBUG, "flushing! 
b=%p free=%d, seg(%d)", b, b->header.buffer_free, LSEG(a[0])); buffer_close(ctx, ii, pseg); buffer_flush(ctx, ii, LSEG(a[0]), h); if (ctx->rc != GRN_SUCCESS) { DEFINE_NAME(ii); ERR(ctx->rc, "[ii][delete][one] failed to flush a buffer: " "<%.*s>: " "<%u>:<%u><%u>: " "position:<%u>", name_size, name, u->rid, u->sid, tid, a[0]); goto exit; } if (a[0] != _a) { GRN_LOG(ctx, GRN_LOG_DEBUG, "grn_ii_delete_one: a[0] changed %d->%d)", a[0], _a); continue; } if ((pseg = buffer_open(ctx, ii, a[0], &bt, &b)) == GRN_II_PSEG_NOT_ASSIGNED) { DEFINE_NAME(ii); MERR("[ii][delete][one] failed to reallocate a buffer: " "<%.*s>: " "<%u>:<%u><%u>: " "position:<%u>", name_size, name, u->rid, u->sid, tid, a[0]); goto exit; } GRN_LOG(ctx, GRN_LOG_DEBUG, "flushed! b=%p free=%d, seg(%d)", b, b->header.buffer_free, LSEG(a[0])); if (b->header.buffer_free < size) { DEFINE_NAME(ii); MERR("[ii][delete][one] buffer is full: " "<%.*s>: " "<%u>:<%u><%u>: " "segment:<%u>, free:<%u>, required:<%u>", name_size, name, u->rid, u->sid, tid, a[0], b->header.buffer_free, size); buffer_close(ctx, ii, pseg); goto exit; } } b->header.buffer_free -= size; br = (buffer_rec *)(((byte *)&b->terms[b->header.nterms]) + b->header.buffer_free); buffer_put(ctx, ii, b, bt, br, bs, u, size); buffer_close(ctx, ii, pseg); break; } exit : array_unref(ii, tid); if (bs) { GRN_FREE(bs); } return ctx->rc; } #define CHUNK_USED 1 #define BUFFER_USED 2 #define SOLE_DOC_USED 4 #define SOLE_POS_USED 8 struct _grn_ii_cursor { grn_db_obj obj; grn_ctx *ctx; grn_ii *ii; grn_id id; grn_posting *post; grn_id min; /* Minimum record ID */ grn_id max; grn_posting pc; grn_posting pb; uint32_t cdf; /* Document frequency */ uint32_t *cdp; uint32_t *crp; /* Record ID */ uint32_t *csp; /* Section ID */ uint32_t *ctp; /* Term frequency */ uint32_t *cwp; /* Weight */ uint32_t *cpp; /* Position */ uint8_t *bp; int nelements; uint32_t nchunks; uint32_t curr_chunk; chunk_info *cinfo; grn_io_win iw; uint8_t *cp; uint8_t *cpe; datavec rdv[MAX_N_ELEMENTS 
+ 1];
  struct grn_ii_buffer *buf; /* Buffer filled in by buffer_open(); stays NULL
                                when the posting is stored inline */
  uint16_t stat;             /* Bit set of CHUNK_USED / BUFFER_USED /
                                SOLE_DOC_USED / SOLE_POS_USED */
  uint16_t nextb;            /* Position of the next record in buf (0: none) */
  uint32_t buffer_pseg;      /* Value returned by buffer_open() */
  int flags;
  uint32_t *ppseg;           /* Points at ii->header->binfo[LSEG(pos)] */
  int weight;
  uint32_t prev_chunk_rid;   /* pc.rid saved when moving to the next chunk */
};

/*
 * Returns GRN_TRUE when the segment currently registered for the cursor's
 * buffer (*c->ppseg) differs from the one the cursor opened
 * (c->buffer_pseg) and the opened one is not found in
 * ii->header->bgqbody[] between bgqtail and bgqhead.
 * NOTE(review): bgqbody presumably queues segments that are waiting to be
 * released -- confirm against the buffer flush code.
 */
static grn_bool
buffer_is_reused(grn_ctx *ctx, grn_ii *ii, grn_ii_cursor *c)
{
  if (*c->ppseg != c->buffer_pseg) {
    uint32_t i;
    /* The queue is a ring: the index wraps with the GRN_II_BGQSIZE mask. */
    for (i = ii->header->bgqtail; i != ii->header->bgqhead;
         i = (i + 1) & (GRN_II_BGQSIZE - 1)) {
      if (ii->header->bgqbody[i] == c->buffer_pseg) { return GRN_FALSE; }
    }
    return GRN_TRUE;
  }
  return GRN_FALSE;
}

/*
 * Returns non-zero when the chunk at `offset` with `size` bytes must be
 * treated as reused.  Only checked when the cursor's buffer segment has
 * been replaced (*c->ppseg != c->buffer_pseg).  The chunk is considered
 * still intact (0) only when `offset` is found in the garbage list
 * selected by the size class of `size`.
 */
static int
chunk_is_reused(grn_ctx *ctx, grn_ii *ii, grn_ii_cursor *c, uint32_t offset, uint32_t size)
{
  if (*c->ppseg != c->buffer_pseg) {
    uint32_t i, m, gseg;
    if (size > S_CHUNK) { return 1; }
    /* m selects the garbage list: at least GRN_II_W_LEAST_CHUNK, otherwise
       derived from the highest set bit of (size - 1). */
    if (size > (1 << GRN_II_W_LEAST_CHUNK)) {
      int es = size - 1;
      GRN_BIT_SCAN_REV(es, m);
      m++;
    } else {
      m = GRN_II_W_LEAST_CHUNK;
    }
    gseg = ii->header->garbages[m - GRN_II_W_LEAST_CHUNK];
    /* Walk the linked list of garbage segments looking for `offset`. */
    while (gseg != GRN_II_PSEG_NOT_ASSIGNED) {
      grn_io_win iw;
      grn_ii_ginfo *ginfo = WIN_MAP(ii->chunk, ctx, &iw, gseg, 0, S_GARBAGE,
                                    grn_io_rdwr);
      if (!ginfo) { break; }
      for (i = 0; i < ginfo->nrecs; i++) {
        if (ginfo->recs[i] == offset) {
          grn_io_win_unmap(&iw);
          return 0;
        }
      }
      gseg = ginfo->next;
      grn_io_win_unmap(&iw);
    }
    return 1;
  }
  return 0;
}

/* True when c1's current posting sorts after c2's, comparing
   (rid, sid, pos) lexicographically. */
#define GRN_II_CURSOR_CMP(c1,c2) \
  (((c1)->post->rid > (c2)->post->rid) || \
   (((c1)->post->rid == (c2)->post->rid) && \
    (((c1)->post->sid > (c2)->post->sid) || \
     (((c1)->post->sid == (c2)->post->sid) && \
      ((c1)->post->pos > (c2)->post->pos)))))

/*
 * Opens a cursor over the postings of term `tid`.
 *
 * Returns NULL when the term has no array entry, when its entry is empty
 * (a[0] == 0), or when an allocation / buffer open / chunk mapping fails.
 * The open is retried from scratch whenever the buffer or chunk is
 * detected as reused, or when a[0] changed while the cursor was being set
 * up.  The caller releases the cursor with grn_ii_cursor_close().
 */
grn_ii_cursor *
grn_ii_cursor_open(grn_ctx *ctx, grn_ii *ii, grn_id tid,
                   grn_id min, grn_id max, int nelements, int flags)
{
  grn_ii_cursor *c = NULL;
  uint32_t pos, *a;
  if (!(a = array_at(ctx, ii, tid))) { return NULL; }
  for (;;) {
    c = NULL;
    if (!(pos = a[0])) { goto exit; }
    if (!(c = GRN_MALLOC(sizeof(grn_ii_cursor)))) { goto exit; }
    /* Zero fill so that grn_ii_cursor_close() can be used on a partially
       initialized cursor (buf, cinfo, cp and rdv start out empty). */
    memset(c, 0, sizeof(grn_ii_cursor));
    c->ctx = ctx;
    c->ii = ii;
    c->id = tid;
    c->min = min;
    c->max = max;
    c->nelements = nelements;
    c->flags = flags;
    c->weight = 0;
    if (pos & 1) {
      /* Low bit set: the single posting is encoded in the array entry
         itself; a[1] holds its position. */
      c->stat = 0;
      if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {
        c->pb.rid = BIT31_12(pos);
        c->pb.sid = BIT11_01(pos);
      } else {
        c->pb.rid = pos >> 1;
        c->pb.sid = 1;
      }
      c->pb.tf = 1;
      c->pb.weight = 0;
      c->pb.pos = a[1];
    } else {
      uint32_t chunk;
      buffer_term *bt;
      c->buffer_pseg = buffer_open(ctx, ii, pos, &bt, &c->buf);
      if (c->buffer_pseg == GRN_II_PSEG_NOT_ASSIGNED) {
        GRN_FREE(c);
        c = NULL;
        goto exit;
      }
      c->ppseg = &ii->header->binfo[LSEG(pos)];
      if (bt->size_in_chunk && (chunk = c->buf->header.chunk) != GRN_II_PSEG_NOT_ASSIGNED) {
        if (!(c->cp = WIN_MAP(ii->chunk, ctx, &c->iw, chunk, bt->pos_in_chunk,
                              bt->size_in_chunk, grn_io_rdonly))) {
          buffer_close(ctx, ii, c->buffer_pseg);
          GRN_FREE(c);
          c = NULL;
          goto exit;
        }
        if (buffer_is_reused(ctx, ii, c)) {
          grn_ii_cursor_close(ctx, c);
          continue;
        }
        c->cpe = c->cp + bt->size_in_chunk;
        if ((bt->tid & CHUNK_SPLIT)) {
          /* Split chunk: the mapped data starts with the number of
             sub-chunks followed by (segno, size, dgap) per sub-chunk. */
          int i;
          grn_id crid;
          GRN_B_DEC(c->nchunks, c->cp);
          /* Checked before trusting nchunks and again after decoding. */
          if (chunk_is_reused(ctx, ii, c, chunk, c->buf->header.chunk_size)) {
            grn_ii_cursor_close(ctx, c);
            continue;
          }
          if (!(c->cinfo = GRN_MALLOCN(chunk_info, c->nchunks))) {
            buffer_close(ctx, ii, c->buffer_pseg);
            grn_io_win_unmap(&c->iw);
            GRN_FREE(c);
            c = NULL;
            goto exit;
          }
          for (i = 0, crid = GRN_ID_NIL; (uint) i < c->nchunks; i++) {
            GRN_B_DEC(c->cinfo[i].segno, c->cp);
            GRN_B_DEC(c->cinfo[i].size, c->cp);
            GRN_B_DEC(c->cinfo[i].dgap, c->cp);
            crid += c->cinfo[i].dgap;
            /* Sub-chunks whose accumulated record ID is below `min` are
               skipped by starting after them. */
            if (crid < min) {
              c->pc.rid = crid;
              c->curr_chunk = i + 1;
            }
          }
          if (chunk_is_reused(ctx, ii, c, chunk, c->buf->header.chunk_size)) {
            grn_ii_cursor_close(ctx, c);
            continue;
          }
        }
        if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) {
          c->rdv[ii->n_elements - 1].flags = ODD;
        }
      }
      c->nextb = bt->pos_in_buffer;
      c->stat = CHUNK_USED|BUFFER_USED;
    }
    /* Accept the cursor only if the array entry is unchanged; otherwise
       throw it away and start over. */
    if (pos == a[0]) { break; }
    grn_ii_cursor_close(ctx, c);
  }
exit :
  array_unref(ii, tid);
  return c;
}

/*
 * Raises the cursor's minimum record ID to `min` (never lowers it) and,
 * when enabled by grn_ii_cursor_set_min_enable, skips whole sub-chunks
 * whose record IDs are below the new minimum.
 */
static inline void
grn_ii_cursor_set_min(grn_ctx *ctx, grn_ii_cursor *c, grn_id min)
{
  if (c->min >= min) {
    return;
  }

  if (grn_ii_cursor_set_min_enable) {
    grn_id old_min = c->min;
    c->min = min;
    if (c->buf &&
        c->pc.rid !=
GRN_ID_NIL && c->pc.rid < c->min && c->prev_chunk_rid < c->min && c->curr_chunk < c->nchunks) { uint32_t i; uint32_t skip_chunk = 0; grn_id rid = c->prev_chunk_rid; if (c->curr_chunk > 0) { i = c->curr_chunk - 1; } else { i = 0; } for (; i < c->nchunks; i++) { rid += c->cinfo[i].dgap; if (rid < c->min) { skip_chunk = i + 1; } else { rid -= c->cinfo[i].dgap; break; } } if (skip_chunk > c->curr_chunk) { uint32_t old_chunk = c->curr_chunk; grn_bool old_chunk_used = (c->stat & CHUNK_USED); c->pc.rid = rid; c->pc.rest = 0; c->prev_chunk_rid = rid - c->cinfo[skip_chunk - 1].dgap; c->curr_chunk = skip_chunk; c->crp = c->cdp + c->cdf; c->stat |= CHUNK_USED; GRN_LOG(ctx, GRN_LOG_DEBUG, "[ii][cursor][min] skip: %p: min(%u->%u): chunk(%u->%u): " "chunk-used(%-.256s->%-.256s)", c, old_min, min, old_chunk, c->curr_chunk, old_chunk_used ? "true" : "false", (c->stat & CHUNK_USED) ? "true" : "false"); } } } } typedef struct { grn_bool include_garbage; } grn_ii_cursor_next_options; static inline grn_posting * grn_ii_cursor_next_internal(grn_ctx *ctx, grn_ii_cursor *c, grn_ii_cursor_next_options *options) { const grn_bool include_garbage = options->include_garbage; if (c->buf) { for (;;) { if (c->stat & CHUNK_USED) { for (;;) { if (c->crp < c->cdp + c->cdf) { uint32_t dgap = *c->crp++; c->pc.rid += dgap; if (dgap) { c->pc.sid = 0; } if ((c->ii->header->flags & GRN_OBJ_WITH_SECTION)) { c->pc.sid += 1 + *c->csp++; } else { c->pc.sid = 1; } c->cpp += c->pc.rest; c->pc.rest = c->pc.tf = 1 + *c->ctp++; if ((c->ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { c->pc.weight = *c->cwp++; } else { c->pc.weight = 0; } c->pc.pos = 0; /* { static int count = 0; int tf = c->pc.tf, pos = 0, *pp = (int *)c->cpp; grn_obj buf; GRN_TEXT_INIT(&buf, 0); grn_text_itoa(ctx, &buf, c->pc.rid); GRN_TEXT_PUTC(ctx, &buf, ':'); grn_text_itoa(ctx, &buf, c->pc.sid); GRN_TEXT_PUTC(ctx, &buf, ':'); grn_text_itoa(ctx, &buf, c->pc.tf); GRN_TEXT_PUTC(ctx, &buf, '('); while (tf--) { pos += *pp++; count++; grn_text_itoa(ctx, 
&buf, pos); if (tf) { GRN_TEXT_PUTC(ctx, &buf, ':'); } } GRN_TEXT_PUTC(ctx, &buf, ')'); GRN_TEXT_PUTC(ctx, &buf, '\0'); GRN_LOG(ctx, GRN_LOG_DEBUG, "posting(%d):%-.256s", count, GRN_TEXT_VALUE(&buf)); GRN_OBJ_FIN(ctx, &buf); } */ } else { if (c->curr_chunk <= c->nchunks) { if (c->curr_chunk == c->nchunks) { if (c->cp < c->cpe) { int decoded_size; decoded_size = grn_p_decv(ctx, c->cp, c->cpe - c->cp, c->rdv, c->ii->n_elements); if (decoded_size == 0) { GRN_LOG(ctx, GRN_LOG_WARNING, "[ii][cursor][next][chunk][last] " "chunk(%d) is changed by another thread " "while decoding: %p", c->cinfo[c->curr_chunk].segno, c); c->pc.rid = GRN_ID_NIL; break; } if (buffer_is_reused(ctx, c->ii, c)) { GRN_LOG(ctx, GRN_LOG_WARNING, "[ii][cursor][next][chunk][last] " "buffer is reused by another thread: %p", c); c->pc.rid = GRN_ID_NIL; break; } if (chunk_is_reused(ctx, c->ii, c, c->buf->header.chunk, c->buf->header.chunk_size)) { GRN_LOG(ctx, GRN_LOG_WARNING, "[ii][cursor][next][chunk][last] " "chunk(%d) is reused by another thread: %p", c->buf->header.chunk, c); c->pc.rid = GRN_ID_NIL; break; } } else { c->pc.rid = GRN_ID_NIL; break; } } else { uint8_t *cp; grn_io_win iw; uint32_t size = c->cinfo[c->curr_chunk].size; if (size && (cp = WIN_MAP(c->ii->chunk, ctx, &iw, c->cinfo[c->curr_chunk].segno, 0, size, grn_io_rdonly))) { int decoded_size; decoded_size = grn_p_decv(ctx, cp, size, c->rdv, c->ii->n_elements); grn_io_win_unmap(&iw); if (decoded_size == 0) { GRN_LOG(ctx, GRN_LOG_WARNING, "[ii][cursor][next][chunk] " "chunk(%d) is changed by another thread " "while decoding: %p", c->cinfo[c->curr_chunk].segno, c); c->pc.rid = GRN_ID_NIL; break; } if (chunk_is_reused(ctx, c->ii, c, c->cinfo[c->curr_chunk].segno, size)) { GRN_LOG(ctx, GRN_LOG_WARNING, "[ii][cursor][next][chunk] " "chunk(%d) is reused by another thread: %p", c->cinfo[c->curr_chunk].segno, c); c->pc.rid = GRN_ID_NIL; break; } } else { c->pc.rid = GRN_ID_NIL; break; } } { int j = 0; c->cdf = c->rdv[j].data_size; c->crp = 
c->cdp = c->rdv[j++].data; if ((c->ii->header->flags & GRN_OBJ_WITH_SECTION)) { c->csp = c->rdv[j++].data; } c->ctp = c->rdv[j++].data; if ((c->ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { c->cwp = c->rdv[j++].data; } if ((c->ii->header->flags & GRN_OBJ_WITH_POSITION)) { c->cpp = c->rdv[j].data; } } c->prev_chunk_rid = c->pc.rid; c->pc.rid = GRN_ID_NIL; c->pc.sid = 0; c->pc.rest = 0; c->curr_chunk++; continue; } else { c->pc.rid = GRN_ID_NIL; } } break; } } if (c->stat & BUFFER_USED) { for (;;) { if (c->nextb) { uint32_t lrid = c->pb.rid, lsid = c->pb.sid; /* for check */ buffer_rec *br = BUFFER_REC_AT(c->buf, c->nextb); if (buffer_is_reused(ctx, c->ii, c)) { GRN_LOG(ctx, GRN_LOG_WARNING, "[ii][cursor][next][buffer] " "buffer(%d,%d) is reused by another thread: %p", c->buffer_pseg, *c->ppseg, c); c->pb.rid = GRN_ID_NIL; break; } c->bp = GRN_NEXT_ADDR(br); GRN_B_DEC(c->pb.rid, c->bp); if ((c->ii->header->flags & GRN_OBJ_WITH_SECTION)) { GRN_B_DEC(c->pb.sid, c->bp); } else { c->pb.sid = 1; } if (lrid > c->pb.rid || (lrid == c->pb.rid && lsid >= c->pb.sid)) { DEFINE_NAME(c->ii); ERR(GRN_FILE_CORRUPT, "[ii][broken][cursor][next][buffer] " "posting in list in buffer isn't sorted: " "<%.*s>: (%d:%d) -> (%d:%d) (%d->%d)", name_size, name, lrid, lsid, c->pb.rid, c->pb.sid, c->buffer_pseg, *c->ppseg); c->pb.rid = GRN_ID_NIL; break; } if (c->pb.rid < c->min) { c->pb.rid = 0; if (br->jump > 0 && !BUFFER_REC_DELETED(br)) { buffer_rec *jump_br = BUFFER_REC_AT(c->buf, br->jump); if (BUFFER_REC_DELETED(jump_br)) { c->nextb = br->step; } else { uint8_t *jump_bp; uint32_t jump_rid; jump_bp = GRN_NEXT_ADDR(jump_br); GRN_B_DEC(jump_rid, jump_bp); if (jump_rid < c->min) { c->nextb = br->jump; } else { c->nextb = br->step; } } } else { c->nextb = br->step; } continue; } c->nextb = br->step; GRN_B_DEC(c->pb.tf, c->bp); if ((c->ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { GRN_B_DEC(c->pb.weight, c->bp); } else { c->pb.weight = 0; } c->pb.rest = c->pb.tf; c->pb.pos = 0; } else { c->pb.rid = 
0; } break; } } if (c->pb.rid) { if (c->pc.rid) { if (c->pc.rid < c->pb.rid) { c->stat = CHUNK_USED; if (include_garbage || (c->pc.tf && c->pc.sid)) { c->post = &c->pc; break; } } else { if (c->pb.rid < c->pc.rid) { c->stat = BUFFER_USED; if (include_garbage || (c->pb.tf && c->pb.sid)) { c->post = &c->pb; break; } } else { if (c->pb.sid) { if (c->pc.sid < c->pb.sid) { c->stat = CHUNK_USED; if (include_garbage || (c->pc.tf && c->pc.sid)) { c->post = &c->pc; break; } } else { c->stat = BUFFER_USED; if (c->pb.sid == c->pc.sid) { c->stat |= CHUNK_USED; } if (include_garbage || (c->pb.tf)) { c->post = &c->pb; break; } } } else { c->stat = CHUNK_USED; } } } } else { c->stat = BUFFER_USED; if (include_garbage || (c->pb.tf && c->pb.sid)) { c->post = &c->pb; break; } } } else { if (c->pc.rid) { c->stat = CHUNK_USED; if (include_garbage || (c->pc.tf && c->pc.sid)) { c->post = &c->pc; break; } } else { c->post = NULL; return NULL; } } } } else { if (c->stat & SOLE_DOC_USED) { c->post = NULL; return NULL; } else { c->post = &c->pb; c->stat |= SOLE_DOC_USED; if (c->post->rid < c->min) { c->post = NULL; return NULL; } } } return c->post; } grn_posting * grn_ii_cursor_next(grn_ctx *ctx, grn_ii_cursor *c) { grn_ii_cursor_next_options options = { .include_garbage = GRN_FALSE }; return grn_ii_cursor_next_internal(ctx, c, &options); } grn_posting * grn_ii_cursor_next_pos(grn_ctx *ctx, grn_ii_cursor *c) { uint32_t gap; if ((c->ii->header->flags & GRN_OBJ_WITH_POSITION)) { if (c->nelements == (int) c->ii->n_elements) { if (c->buf) { if (c->post == &c->pc) { if (c->pc.rest) { c->pc.rest--; c->pc.pos += *c->cpp++; } else { return NULL; } } else if (c->post == &c->pb) { if (buffer_is_reused(ctx, c->ii, c)) { GRN_LOG(ctx, GRN_LOG_WARNING, "[ii][cursor][next][pos][buffer] " "buffer(%d,%d) is reused by another thread: %p", c->buffer_pseg, *c->ppseg, c); return NULL; } if (c->pb.rest) { c->pb.rest--; GRN_B_DEC(gap, c->bp); c->pb.pos += gap; } else { return NULL; } } else { return NULL; } } 
else { if (c->stat & SOLE_POS_USED) { return NULL; } else { c->stat |= SOLE_POS_USED; } } } } else { if (c->stat & SOLE_POS_USED) { return NULL; } else { c->stat |= SOLE_POS_USED; } } return c->post; } grn_rc grn_ii_cursor_close(grn_ctx *ctx, grn_ii_cursor *c) { if (!c) { return GRN_INVALID_ARGUMENT; } datavec_fin(ctx, c->rdv); if (c->cinfo) { GRN_FREE(c->cinfo); } if (c->buf) { buffer_close(ctx, c->ii, c->buffer_pseg); } if (c->cp) { grn_io_win_unmap(&c->iw); } GRN_FREE(c); return GRN_SUCCESS; } uint32_t grn_ii_get_chunksize(grn_ctx *ctx, grn_ii *ii, grn_id tid) { uint32_t res, pos, *a; a = array_at(ctx, ii, tid); if (!a) { return 0; } if ((pos = a[0])) { if (pos & 1) { res = 0; } else { buffer *buf; uint32_t pseg; buffer_term *bt; if ((pseg = buffer_open(ctx, ii, pos, &bt, &buf)) == GRN_II_PSEG_NOT_ASSIGNED) { res = 0; } else { res = bt->size_in_chunk; buffer_close(ctx, ii, pseg); } } } else { res = 0; } array_unref(ii, tid); return res; } uint32_t grn_ii_estimate_size(grn_ctx *ctx, grn_ii *ii, grn_id tid) { uint32_t res, pos, *a; a = array_at(ctx, ii, tid); if (!a) { return 0; } if ((pos = a[0])) { if (pos & 1) { res = 1; } else { buffer *buf; uint32_t pseg; buffer_term *bt; if ((pseg = buffer_open(ctx, ii, pos, &bt, &buf)) == GRN_II_PSEG_NOT_ASSIGNED) { res = 0; } else { res = a[1] + bt->size_in_buffer + 2; buffer_close(ctx, ii, pseg); } } } else { res = 0; } array_unref(ii, tid); return res; } int grn_ii_entry_info(grn_ctx *ctx, grn_ii *ii, grn_id tid, unsigned int *a, unsigned int *chunk, unsigned int *chunk_size, unsigned int *buffer_free, unsigned int *nterms, unsigned int *nterms_void, unsigned int *bt_tid, unsigned int *size_in_chunk, unsigned int *pos_in_chunk, unsigned int *size_in_buffer, unsigned int *pos_in_buffer) { buffer *b; buffer_term *bt; uint32_t pseg, *ap; ERRCLR(NULL); ap = array_at(ctx, ii, tid); if (!ap) { return 0; } a[0] = *ap; array_unref(ii, tid); if (!a[0]) { return 1; } if (a[0] & 1) { return 2; } if ((pseg = buffer_open(ctx, ii, 
a[0], &bt, &b)) == GRN_II_PSEG_NOT_ASSIGNED) { return 3; } *chunk = b->header.chunk; *chunk_size = b->header.chunk_size; *buffer_free = b->header.buffer_free; *nterms = b->header.nterms; *bt_tid = bt->tid; *size_in_chunk = bt->size_in_chunk; *pos_in_chunk = bt->pos_in_chunk; *size_in_buffer = bt->size_in_buffer; *pos_in_buffer = bt->pos_in_buffer; buffer_close(ctx, ii, pseg); return 4; } const char * grn_ii_path(grn_ii *ii) { return grn_io_path(ii->seg); } uint32_t grn_ii_max_section(grn_ii *ii) { return ii->header->smax; } grn_obj * grn_ii_lexicon(grn_ii *ii) { return ii->lexicon; } /* private classes */ /* b-heap */ typedef struct { int n_entries; int n_bins; grn_ii_cursor **bins; } cursor_heap; static inline cursor_heap * cursor_heap_open(grn_ctx *ctx, int max) { cursor_heap *h = GRN_MALLOC(sizeof(cursor_heap)); if (!h) { return NULL; } h->bins = GRN_MALLOC(sizeof(grn_ii_cursor *) * max); if (!h->bins) { GRN_FREE(h); return NULL; } h->n_entries = 0; h->n_bins = max; return h; } static inline grn_rc cursor_heap_push(grn_ctx *ctx, cursor_heap *h, grn_ii *ii, grn_id tid, uint32_t offset2, int weight, grn_id min) { int n, n2; grn_ii_cursor *c, *c2; if (h->n_entries >= h->n_bins) { int max = h->n_bins * 2; grn_ii_cursor **bins = GRN_REALLOC(h->bins, sizeof(grn_ii_cursor *) * max); GRN_LOG(ctx, GRN_LOG_DEBUG, "expanded cursor_heap to %d,%p", max, bins); if (!bins) { return GRN_NO_MEMORY_AVAILABLE; } h->n_bins = max; h->bins = bins; } { if (!(c = grn_ii_cursor_open(ctx, ii, tid, min, GRN_ID_MAX, ii->n_elements, 0))) { GRN_LOG(ctx, GRN_LOG_ERROR, "cursor open failed"); return ctx->rc; } if (!grn_ii_cursor_next(ctx, c)) { grn_ii_cursor_close(ctx, c); return GRN_END_OF_DATA; } if (!grn_ii_cursor_next_pos(ctx, c)) { if (grn_logger_pass(ctx, GRN_LOG_ERROR)) { char token[GRN_TABLE_MAX_KEY_SIZE]; int token_size; token_size = grn_table_get_key(ctx, c->ii->lexicon, c->id, &token, GRN_TABLE_MAX_KEY_SIZE); GRN_LOG(ctx, GRN_LOG_ERROR, "[ii][cursor][heap][push] invalid cursor: " 
"%p: token:<%.*s>(%u)", c, token_size, token, c->id); } grn_ii_cursor_close(ctx, c); return GRN_END_OF_DATA; } if (weight) { c->weight = weight; } n = h->n_entries++; while (n) { n2 = (n - 1) >> 1; c2 = h->bins[n2]; if (GRN_II_CURSOR_CMP(c, c2)) { break; } h->bins[n] = c2; n = n2; } h->bins[n] = c; } return GRN_SUCCESS; } static inline grn_rc cursor_heap_push2(cursor_heap *h) { grn_rc rc = GRN_SUCCESS; return rc; } static inline grn_ii_cursor * cursor_heap_min(cursor_heap *h) { return h->n_entries ? h->bins[0] : NULL; } static inline void cursor_heap_recalc_min(cursor_heap *h) { int n = 0, n1, n2, m; if ((m = h->n_entries) > 1) { grn_ii_cursor *c = h->bins[0], *c1, *c2; for (;;) { n1 = n * 2 + 1; n2 = n1 + 1; c1 = n1 < m ? h->bins[n1] : NULL; c2 = n2 < m ? h->bins[n2] : NULL; if (c1 && GRN_II_CURSOR_CMP(c, c1)) { if (c2 && GRN_II_CURSOR_CMP(c, c2) && GRN_II_CURSOR_CMP(c1, c2)) { h->bins[n] = c2; n = n2; } else { h->bins[n] = c1; n = n1; } } else { if (c2 && GRN_II_CURSOR_CMP(c, c2)) { h->bins[n] = c2; n = n2; } else { h->bins[n] = c; break; } } } } } static inline void cursor_heap_pop(grn_ctx *ctx, cursor_heap *h, grn_id min) { if (h->n_entries) { grn_ii_cursor *c = h->bins[0]; grn_ii_cursor_set_min(ctx, c, min); if (!grn_ii_cursor_next(ctx, c)) { grn_ii_cursor_close(ctx, c); h->bins[0] = h->bins[--h->n_entries]; } else if (!grn_ii_cursor_next_pos(ctx, c)) { if (grn_logger_pass(ctx, GRN_LOG_ERROR)) { char token[GRN_TABLE_MAX_KEY_SIZE]; int token_size; token_size = grn_table_get_key(ctx, c->ii->lexicon, c->id, &token, GRN_TABLE_MAX_KEY_SIZE); GRN_LOG(ctx, GRN_LOG_ERROR, "[ii][cursor][heap][pop] invalid cursor: " "%p: token:<%.*s>(%u)", c, token_size, token, c->id); } grn_ii_cursor_close(ctx, c); h->bins[0] = h->bins[--h->n_entries]; } if (h->n_entries > 1) { cursor_heap_recalc_min(h); } } } static inline void cursor_heap_pop_pos(grn_ctx *ctx, cursor_heap *h) { if (h->n_entries) { grn_ii_cursor *c = h->bins[0]; if (!grn_ii_cursor_next_pos(ctx, c)) { if 
(!grn_ii_cursor_next(ctx, c)) { grn_ii_cursor_close(ctx, c); h->bins[0] = h->bins[--h->n_entries]; } else if (!grn_ii_cursor_next_pos(ctx, c)) { if (grn_logger_pass(ctx, GRN_LOG_ERROR)) { char token[GRN_TABLE_MAX_KEY_SIZE]; int token_size; token_size = grn_table_get_key(ctx, c->ii->lexicon, c->id, &token, GRN_TABLE_MAX_KEY_SIZE); GRN_LOG(ctx, GRN_LOG_ERROR, "[ii][cursor][heap][pop][position] invalid cursor: " "%p: token:<%.*s>(%u)", c, token_size, token, c->id); } grn_ii_cursor_close(ctx, c); h->bins[0] = h->bins[--h->n_entries]; } } if (h->n_entries > 1) { cursor_heap_recalc_min(h); } } } static inline void cursor_heap_close(grn_ctx *ctx, cursor_heap *h) { int i; if (!h) { return; } for (i = h->n_entries; i--;) { grn_ii_cursor_close(ctx, h->bins[i]); } GRN_FREE(h->bins); GRN_FREE(h); } /* update */ #ifdef USE_VGRAM inline static grn_rc index_add(grn_ctx *ctx, grn_id rid, grn_obj *lexicon, grn_ii *ii, grn_vgram *vgram, const char *value, size_t value_len) { grn_hash *h; unsigned int token_flags = 0; grn_token_cursor *token_cursor; grn_ii_updspec **u; grn_id tid, *tp; grn_rc r, rc = GRN_SUCCESS; grn_vgram_buf *sbuf = NULL; if (!rid) { return GRN_INVALID_ARGUMENT; } if (!(token_cursor = grn_token_cursor_open(ctx, lexicon, value, value_len, GRN_TOKEN_ADD, token_flags))) { return GRN_NO_MEMORY_AVAILABLE; } if (vgram) { sbuf = grn_vgram_buf_open(value_len); } h = grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(grn_ii_updspec *), GRN_HASH_TINY); if (!h) { GRN_LOG(ctx, GRN_LOG_ALERT, "grn_hash_create on index_add failed !"); grn_token_cursor_close(ctx, token_cursor); if (sbuf) { grn_vgram_buf_close(sbuf); } return GRN_NO_MEMORY_AVAILABLE; } while (!token_cursor->status) { (tid = grn_token_cursor_next(ctx, token_cursor)); if (tid) { if (!grn_hash_add(ctx, h, &tid, sizeof(grn_id), (void **) &u, NULL)) { break; } if (!*u) { if (!(*u = grn_ii_updspec_open(ctx, rid, 1))) { GRN_LOG(ctx, GRN_LOG_ERROR, "grn_ii_updspec_open on index_add failed!"); goto exit; } } if 
(grn_ii_updspec_add(ctx, *u, token_cursor->pos, 0)) { GRN_LOG(ctx, GRN_LOG_ERROR, "grn_ii_updspec_add on index_add failed!"); goto exit; } if (sbuf) { grn_vgram_buf_add(sbuf, tid); } } } grn_token_cursor_close(ctx, token_cursor); // todo : support vgram // if (sbuf) { grn_vgram_update(vgram, rid, sbuf, (grn_set *)h); } GRN_HASH_EACH(ctx, h, id, &tp, NULL, &u, { if ((r = grn_ii_update_one(ctx, ii, *tp, *u, h))) { rc = r; } grn_ii_updspec_close(ctx, *u); }); grn_hash_close(ctx, h); if (sbuf) { grn_vgram_buf_close(sbuf); } return rc; exit: grn_hash_close(ctx, h); grn_token_cursor_close(ctx, token_cursor); if (sbuf) { grn_vgram_buf_close(sbuf); } return GRN_NO_MEMORY_AVAILABLE; } inline static grn_rc index_del(grn_ctx *ctx, grn_id rid, grn_obj *lexicon, grn_ii *ii, grn_vgram *vgram, const char *value, size_t value_len) { grn_rc rc = GRN_SUCCESS; grn_hash *h; unsigned int token_flags = 0; grn_token_cursor *token_cursor; grn_ii_updspec **u; grn_id tid, *tp; if (!rid) { return GRN_INVALID_ARGUMENT; } if (!(token_cursor = grn_token_cursor_open(ctx, lexicon, value, value_len, GRN_TOKEN_DEL, token_flags))) { return GRN_NO_MEMORY_AVAILABLE; } h = grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(grn_ii_updspec *), GRN_HASH_TINY); if (!h) { GRN_LOG(ctx, GRN_LOG_ALERT, "grn_hash_create on index_del failed !"); grn_token_cursor_close(ctx, token_cursor); return GRN_NO_MEMORY_AVAILABLE; } while (!token_cursor->status) { if ((tid = grn_token_cursor_next(ctx, token_cursor))) { if (!grn_hash_add(ctx, h, &tid, sizeof(grn_id), (void **) &u, NULL)) { break; } if (!*u) { if (!(*u = grn_ii_updspec_open(ctx, rid, 0))) { GRN_LOG(ctx, GRN_LOG_ALERT, "grn_ii_updspec_open on index_del failed !"); grn_hash_close(ctx, h); grn_token_cursor_close(ctx, token_cursor); return GRN_NO_MEMORY_AVAILABLE; } } } } grn_token_cursor_close(ctx, token_cursor); GRN_HASH_EACH(ctx, h, id, &tp, NULL, &u, { if (*tp) { grn_rc r; r = grn_ii_delete_one(ctx, ii, *tp, *u, NULL); if (r) { rc = r; } } 
grn_ii_updspec_close(ctx, *u); }); grn_hash_close(ctx, h); return rc; } grn_rc grn_ii_upd(grn_ctx *ctx, grn_ii *ii, grn_id rid, grn_vgram *vgram, const char *oldvalue, unsigned int oldvalue_len, const char *newvalue, unsigned int newvalue_len) { grn_rc rc; grn_obj *lexicon = ii->lexicon; if (!rid) { return GRN_INVALID_ARGUMENT; } if (oldvalue && *oldvalue) { if ((rc = index_del(ctx, rid, lexicon, ii, vgram, oldvalue, oldvalue_len))) { GRN_LOG(ctx, GRN_LOG_ERROR, "index_del on grn_ii_upd failed !"); goto exit; } } if (newvalue && *newvalue) { rc = index_add(ctx, rid, lexicon, ii, vgram, newvalue, newvalue_len); } exit : return rc; } grn_rc grn_ii_update(grn_ctx *ctx, grn_ii *ii, grn_id rid, grn_vgram *vgram, unsigned int section, grn_values *oldvalues, grn_values *newvalues) { int j; grn_value *v; unsigned int token_flags = 0; grn_token_cursor *token_cursor; grn_rc rc = GRN_SUCCESS; grn_hash *old, *new; grn_id tid, *tp; grn_ii_updspec **u, **un; grn_obj *lexicon = ii->lexicon; if (!lexicon || !ii || !rid) { GRN_LOG(ctx, GRN_LOG_WARNING, "grn_ii_update: invalid argument"); return GRN_INVALID_ARGUMENT; } if (newvalues) { new = grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(grn_ii_updspec *), GRN_HASH_TINY); if (!new) { GRN_LOG(ctx, GRN_LOG_ALERT, "grn_hash_create on grn_ii_update failed !"); rc = GRN_NO_MEMORY_AVAILABLE; goto exit; } for (j = newvalues->n_values, v = newvalues->values; j; j--, v++) { if ((token_cursor = grn_token_cursor_open(ctx, lexicon, v->str, v->str_len, GRN_TOKEN_ADD, token_flags))) { while (!token_cursor->status) { if ((tid = grn_token_cursor_next(ctx, token_cursor))) { if (!grn_hash_add(ctx, new, &tid, sizeof(grn_id), (void **) &u, NULL)) { break; } if (!*u) { if (!(*u = grn_ii_updspec_open(ctx, rid, section))) { GRN_LOG(ctx, GRN_LOG_ALERT, "grn_ii_updspec_open on grn_ii_update failed!"); grn_token_cursor_close(ctx, token_cursor); grn_hash_close(ctx, new); rc = GRN_NO_MEMORY_AVAILABLE; goto exit; } } if (grn_ii_updspec_add(ctx, *u, 
token_cursor->pos, v->weight)) { GRN_LOG(ctx, GRN_LOG_ALERT, "grn_ii_updspec_add on grn_ii_update failed!"); grn_token_cursor_close(ctx, token_cursor); grn_hash_close(ctx, new); rc = GRN_NO_MEMORY_AVAILABLE; goto exit; } } } grn_token_cursor_close(ctx, token_cursor); } } if (!GRN_HASH_SIZE(new)) { grn_hash_close(ctx, new); new = NULL; } } else { new = NULL; } if (oldvalues) { old = grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(grn_ii_updspec *), GRN_HASH_TINY); if (!old) { GRN_LOG(ctx, GRN_LOG_ALERT, "grn_hash_create(ctx, NULL, old) on grn_ii_update failed!"); if (new) { grn_hash_close(ctx, new); } rc = GRN_NO_MEMORY_AVAILABLE; goto exit; } for (j = oldvalues->n_values, v = oldvalues->values; j; j--, v++) { if ((token_cursor = grn_token_cursor_open(ctx, lexicon, v->str, v->str_len, GRN_TOKEN_DEL, token_flags))) { while (!token_cursor->status) { if ((tid = grn_token_cursor_next(ctx, token_cursor))) { if (!grn_hash_add(ctx, old, &tid, sizeof(grn_id), (void **) &u, NULL)) { break; } if (!*u) { if (!(*u = grn_ii_updspec_open(ctx, rid, section))) { GRN_LOG(ctx, GRN_LOG_ALERT, "grn_ii_updspec_open on grn_ii_update failed!"); grn_token_cursor_close(ctx, token_cursor); if (new) { grn_hash_close(ctx, new); }; grn_hash_close(ctx, old); rc = GRN_NO_MEMORY_AVAILABLE; goto exit; } } if (grn_ii_updspec_add(ctx, *u, token_cursor->pos, v->weight)) { GRN_LOG(ctx, GRN_LOG_ALERT, "grn_ii_updspec_add on grn_ii_update failed!"); grn_token_cursor_close(ctx, token_cursor); if (new) { grn_hash_close(ctx, new); }; grn_hash_close(ctx, old); rc = GRN_NO_MEMORY_AVAILABLE; goto exit; } } } grn_token_cursor_close(ctx, token_cursor); } } } else { old = NULL; } if (old) { grn_id eid; GRN_HASH_EACH(ctx, old, id, &tp, NULL, &u, { if (new && (eid = grn_hash_get(ctx, new, tp, sizeof(grn_id), (void **) &un))) { if (!grn_ii_updspec_cmp(*u, *un)) { grn_ii_updspec_close(ctx, *un); grn_hash_delete_by_id(ctx, new, eid, NULL); } } else { grn_rc r; r = grn_ii_delete_one(ctx, ii, *tp, *u, new); if (r) { 
rc = r; } } grn_ii_updspec_close(ctx, *u); }); grn_hash_close(ctx, old); } if (new) { GRN_HASH_EACH(ctx, new, id, &tp, NULL, &u, { grn_rc r; if ((r = grn_ii_update_one(ctx, ii, *tp, *u, new))) { rc = r; } grn_ii_updspec_close(ctx, *u); }); grn_hash_close(ctx, new); } else { if (!section) { /* todo: delete key when all sections deleted */ } } exit : return rc; } #endif /* USE_VGRAM */ static grn_rc grn_vector2updspecs(grn_ctx *ctx, grn_ii *ii, grn_id rid, unsigned int section, grn_obj *in, grn_obj *out, grn_tokenize_mode mode, grn_obj *posting) { int j; grn_id tid; grn_section *v; grn_token_cursor *token_cursor; grn_ii_updspec **u; grn_hash *h = (grn_hash *)out; grn_obj *lexicon = ii->lexicon; if (in->u.v.body) { const char *head = GRN_BULK_HEAD(in->u.v.body); for (j = in->u.v.n_sections, v = in->u.v.sections; j; j--, v++) { unsigned int token_flags = 0; if (v->length && (token_cursor = grn_token_cursor_open(ctx, lexicon, head + v->offset, v->length, mode, token_flags))) { while (!token_cursor->status) { if ((tid = grn_token_cursor_next(ctx, token_cursor))) { if (posting) { GRN_RECORD_PUT(ctx, posting, tid); } if (!grn_hash_add(ctx, h, &tid, sizeof(grn_id), (void **) &u, NULL)) { break; } if (!*u) { if (!(*u = grn_ii_updspec_open(ctx, rid, section))) { DEFINE_NAME(ii); MERR("[ii][update][spec] failed to create an update spec: " "<%.*s>: " "record:<%u>:<%u>, token:<%u>:<%d>:<%u>", name_size, name, rid, section, tid, token_cursor->pos, v->weight); grn_token_cursor_close(ctx, token_cursor); return ctx->rc; } } if (grn_ii_updspec_add(ctx, *u, token_cursor->pos, v->weight)) { DEFINE_NAME(ii); MERR("[ii][update][spec] failed to add to update spec: " "<%.*s>: " "record:<%u>:<%u>, token:<%u>:<%d>:<%u>", name_size, name, rid, section, tid, token_cursor->pos, v->weight); grn_token_cursor_close(ctx, token_cursor); return ctx->rc; } } } grn_token_cursor_close(ctx, token_cursor); } } } return ctx->rc; } static grn_rc grn_uvector2updspecs_data(grn_ctx *ctx, grn_ii *ii, grn_id 
rid, unsigned int section, grn_obj *in, grn_obj *out, grn_tokenize_mode mode, grn_obj *posting) { int i, n; grn_hash *h = (grn_hash *)out; grn_obj *lexicon = ii->lexicon; unsigned int element_size; n = grn_uvector_size(ctx, in); element_size = grn_uvector_element_size(ctx, in); for (i = 0; i < n; i++) { grn_obj *tokenizer; grn_token_cursor *token_cursor; unsigned int token_flags = 0; const char *element; tokenizer = grn_obj_get_info(ctx, lexicon, GRN_INFO_DEFAULT_TOKENIZER, NULL); element = GRN_BULK_HEAD(in) + (element_size * i); token_cursor = grn_token_cursor_open(ctx, lexicon, element, element_size, mode, token_flags); if (!token_cursor) { continue; } while (!token_cursor->status) { grn_id tid; if ((tid = grn_token_cursor_next(ctx, token_cursor))) { grn_ii_updspec **u; int pos; if (posting) { GRN_RECORD_PUT(ctx, posting, tid); } if (!grn_hash_add(ctx, h, &tid, sizeof(grn_id), (void **)&u, NULL)) { break; } if (!*u) { if (!(*u = grn_ii_updspec_open(ctx, rid, section))) { GRN_LOG(ctx, GRN_LOG_ALERT, "grn_ii_updspec_open on grn_uvector2updspecs_data failed!"); grn_token_cursor_close(ctx, token_cursor); return GRN_NO_MEMORY_AVAILABLE; } } if (tokenizer) { pos = token_cursor->pos; } else { pos = i; } if (grn_ii_updspec_add(ctx, *u, pos, 0)) { GRN_LOG(ctx, GRN_LOG_ALERT, "grn_ii_updspec_add on grn_uvector2updspecs failed!"); grn_token_cursor_close(ctx, token_cursor); return GRN_NO_MEMORY_AVAILABLE; } } } grn_token_cursor_close(ctx, token_cursor); } return GRN_SUCCESS; } static grn_rc grn_uvector2updspecs_id(grn_ctx *ctx, grn_ii *ii, grn_id rid, unsigned int section, grn_obj *in, grn_obj *out) { int i, n; grn_ii_updspec **u; grn_hash *h = (grn_hash *)out; n = grn_vector_size(ctx, in); for (i = 0; i < n; i++) { grn_id id; unsigned int weight; id = grn_uvector_get_element(ctx, in, i, &weight); if (!grn_hash_add(ctx, h, &id, sizeof(grn_id), (void **)&u, NULL)) { break; } if (!*u) { if (!(*u = grn_ii_updspec_open(ctx, rid, section))) { GRN_LOG(ctx, GRN_LOG_ALERT, 
"grn_ii_updspec_open on grn_ii_update failed!"); return GRN_NO_MEMORY_AVAILABLE; } } if (grn_ii_updspec_add(ctx, *u, i, weight)) { GRN_LOG(ctx, GRN_LOG_ALERT, "grn_ii_updspec_add on grn_ii_update failed!"); return GRN_NO_MEMORY_AVAILABLE; } } return GRN_SUCCESS; } static grn_rc grn_uvector2updspecs(grn_ctx *ctx, grn_ii *ii, grn_id rid, unsigned int section, grn_obj *in, grn_obj *out, grn_tokenize_mode mode, grn_obj *posting) { if (in->header.domain < GRN_N_RESERVED_TYPES) { return grn_uvector2updspecs_data(ctx, ii, rid, section, in, out, mode, posting); } else { return grn_uvector2updspecs_id(ctx, ii, rid, section, in, out); } } grn_rc grn_ii_column_update(grn_ctx *ctx, grn_ii *ii, grn_id rid, unsigned int section, grn_obj *oldvalue, grn_obj *newvalue, grn_obj *posting) { grn_id *tp; grn_bool do_grn_ii_updspec_cmp = GRN_TRUE; grn_ii_updspec **u, **un; grn_obj *old_, *old = oldvalue, *new_, *new = newvalue, oldv, newv; grn_obj buf, *post = NULL; if (!ii) { ERR(GRN_INVALID_ARGUMENT, "[ii][column][update] ii is NULL"); return ctx->rc; } if (!ii->lexicon) { ERR(GRN_INVALID_ARGUMENT, "[ii][column][update] lexicon is NULL"); return ctx->rc; } if (rid == GRN_ID_NIL) { ERR(GRN_INVALID_ARGUMENT, "[ii][column][update] record ID is nil"); return ctx->rc; } if (old || new) { unsigned char type = GRN_VOID; if (old) { type = (ii->obj.header.domain == old->header.domain) ? GRN_UVECTOR : old->header.type; } if (new) { type = (ii->obj.header.domain == new->header.domain) ? GRN_UVECTOR : new->header.type; } if (type == GRN_VECTOR) { grn_obj *tokenizer; grn_table_get_info(ctx, ii->lexicon, NULL, NULL, &tokenizer, NULL, NULL); if (tokenizer) { grn_obj old_elem, new_elem; unsigned int i, max_n; unsigned int old_n = 0, new_n = 0; if (old) { old_n = grn_vector_size(ctx, old); } if (new) { new_n = grn_vector_size(ctx, new); } max_n = (old_n > new_n) ? 
old_n : new_n; GRN_OBJ_INIT(&old_elem, GRN_BULK, GRN_OBJ_DO_SHALLOW_COPY, old->header.domain); GRN_OBJ_INIT(&new_elem, GRN_BULK, GRN_OBJ_DO_SHALLOW_COPY, new->header.domain); for (i = 0; i < max_n; i++) { grn_rc rc; grn_obj *old_p = NULL, *new_p = NULL; if (i < old_n) { const char *str; unsigned int size = grn_vector_get_element(ctx, old, i, &str, NULL, NULL); GRN_TEXT_SET_REF(&old_elem, str, size); old_p = &old_elem; } if (i < new_n) { const char *str; unsigned int size = grn_vector_get_element(ctx, new, i, &str, NULL, NULL); GRN_TEXT_SET_REF(&new_elem, str, size); new_p = &new_elem; } rc = grn_ii_column_update(ctx, ii, rid, section + i, old_p, new_p, posting); if (rc != GRN_SUCCESS) { break; } } GRN_OBJ_FIN(ctx, &old_elem); GRN_OBJ_FIN(ctx, &new_elem); return ctx->rc; } } } if (posting) { GRN_RECORD_INIT(&buf, GRN_OBJ_VECTOR, grn_obj_id(ctx, ii->lexicon)); post = &buf; } if (grn_io_lock(ctx, ii->seg, grn_lock_timeout)) { return ctx->rc; } if (new) { unsigned char type = (ii->obj.header.domain == new->header.domain) ? 
GRN_UVECTOR : new->header.type; switch (type) { case GRN_BULK : { if (grn_bulk_is_zero(ctx, new)) { do_grn_ii_updspec_cmp = GRN_FALSE; } new_ = new; GRN_OBJ_INIT(&newv, GRN_VECTOR, GRN_OBJ_DO_SHALLOW_COPY, GRN_DB_TEXT); newv.u.v.body = new; new = &newv; grn_vector_delimit(ctx, new, 0, GRN_ID_NIL); if (new_ != newvalue) { grn_obj_close(ctx, new_); } } /* fallthru */ case GRN_VECTOR : new_ = new; new = (grn_obj *)grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(grn_ii_updspec *), GRN_HASH_TINY); if (!new) { DEFINE_NAME(ii); MERR("[ii][column][update][new][vector] failed to create a hash table: " "<%.*s>: ", name_size, name); } else { grn_vector2updspecs(ctx, ii, rid, section, new_, new, GRN_TOKEN_ADD, post); } if (new_ != newvalue) { grn_obj_close(ctx, new_); } if (ctx->rc != GRN_SUCCESS) { goto exit; } break; case GRN_UVECTOR : new_ = new; new = (grn_obj *)grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(grn_ii_updspec *), GRN_HASH_TINY); if (!new) { DEFINE_NAME(ii); MERR("[ii][column][update][new][uvector] failed to create a hash table: " "<%.*s>: ", name_size, name); } else { if (new_->header.type == GRN_UVECTOR) { grn_uvector2updspecs(ctx, ii, rid, section, new_, new, GRN_TOKEN_ADD, post); } else { grn_obj uvector; unsigned int weight = 0; GRN_VALUE_FIX_SIZE_INIT(&uvector, GRN_OBJ_VECTOR, new_->header.domain); if (new_->header.impl_flags & GRN_OBJ_WITH_WEIGHT) { uvector.header.impl_flags |= GRN_OBJ_WITH_WEIGHT; } grn_uvector_add_element(ctx, &uvector, GRN_RECORD_VALUE(new_), weight); grn_uvector2updspecs(ctx, ii, rid, section, &uvector, new, GRN_TOKEN_ADD, post); GRN_OBJ_FIN(ctx, &uvector); } } if (new_ != newvalue) { grn_obj_close(ctx, new_); } if (ctx->rc != GRN_SUCCESS) { goto exit; } break; case GRN_TABLE_HASH_KEY : break; default : { DEFINE_NAME(ii); ERR(GRN_INVALID_ARGUMENT, "[ii][column][update][new] invalid object: " "<%.*s>: " "<%-.256s>(%#x)", name_size, name, grn_obj_type_to_string(type), type); } goto exit; } } if (posting) { grn_ii_updspec *u_; 
uint32_t offset = 0; grn_id tid_ = 0, gap, tid, *tpe; grn_table_sort_optarg arg = {GRN_TABLE_SORT_ASC| GRN_TABLE_SORT_AS_NUMBER| GRN_TABLE_SORT_AS_UNSIGNED, NULL, NULL, 0, 0}; grn_array *sorted = grn_array_create(ctx, NULL, sizeof(grn_id), 0); grn_hash_sort(ctx, (grn_hash *)new, -1, sorted, &arg); GRN_TEXT_PUT(ctx, posting, ((grn_hash *)new)->n_entries, sizeof(uint32_t)); GRN_ARRAY_EACH(ctx, sorted, 0, 0, id, &tp, { grn_hash_get_key(ctx, (grn_hash *)new, *tp, &tid, sizeof(grn_id)); gap = tid - tid_; GRN_TEXT_PUT(ctx, posting, &gap, sizeof(grn_id)); tid_ = tid; }); GRN_ARRAY_EACH(ctx, sorted, 0, 0, id, &tp, { grn_hash_get_value(ctx, (grn_hash *)new, *tp, &u_); u_->offset = offset++; GRN_TEXT_PUT(ctx, posting, &u_->tf, sizeof(int32_t)); }); tpe = (grn_id *)GRN_BULK_CURR(post); for (tp = (grn_id *)GRN_BULK_HEAD(post); tp < tpe; tp++) { grn_hash_get(ctx, (grn_hash *)new, (void *)tp, sizeof(grn_id), (void **)&u); GRN_TEXT_PUT(ctx, posting, &(*u)->offset, sizeof(int32_t)); } GRN_OBJ_FIN(ctx, post); grn_array_close(ctx, sorted); } if (old) { unsigned char type = (ii->obj.header.domain == old->header.domain) ? 
GRN_UVECTOR : old->header.type; switch (type) { case GRN_BULK : { // const char *str = GRN_BULK_HEAD(old); // unsigned int str_len = GRN_BULK_VSIZE(old); old_ = old; GRN_OBJ_INIT(&oldv, GRN_VECTOR, GRN_OBJ_DO_SHALLOW_COPY, GRN_DB_TEXT); oldv.u.v.body = old; old = &oldv; grn_vector_delimit(ctx, old, 0, GRN_ID_NIL); if (old_ != oldvalue) { grn_obj_close(ctx, old_); } } /* fallthru */ case GRN_VECTOR : old_ = old; old = (grn_obj *)grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(grn_ii_updspec *), GRN_HASH_TINY); if (!old) { DEFINE_NAME(ii); MERR("[ii][column][update][old][vector] failed to create a hash table: " "<%.*s>: ", name_size, name); } else { grn_vector2updspecs(ctx, ii, rid, section, old_, old, GRN_TOKEN_DEL, NULL); } if (old_ != oldvalue) { grn_obj_close(ctx, old_); } if (ctx->rc != GRN_SUCCESS) { goto exit; } break; case GRN_UVECTOR : old_ = old; old = (grn_obj *)grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(grn_ii_updspec *), GRN_HASH_TINY); if (!old) { DEFINE_NAME(ii); MERR("[ii][column][update][old][uvector] failed to create a hash table: " "<%.*s>: ", name_size, name); } else { if (old_->header.type == GRN_UVECTOR) { grn_uvector2updspecs(ctx, ii, rid, section, old_, old, GRN_TOKEN_DEL, NULL); } else { grn_obj uvector; unsigned int weight = 0; GRN_VALUE_FIX_SIZE_INIT(&uvector, GRN_OBJ_VECTOR, old_->header.domain); if (old_->header.impl_flags & GRN_OBJ_WITH_WEIGHT) { uvector.header.impl_flags |= GRN_OBJ_WITH_WEIGHT; } grn_uvector_add_element(ctx, &uvector, GRN_RECORD_VALUE(old_), weight); grn_uvector2updspecs(ctx, ii, rid, section, &uvector, old, GRN_TOKEN_DEL, NULL); GRN_OBJ_FIN(ctx, &uvector); } } if (old_ != oldvalue) { grn_obj_close(ctx, old_); } if (ctx->rc != GRN_SUCCESS) { goto exit; } break; case GRN_TABLE_HASH_KEY : break; default : { DEFINE_NAME(ii); ERR(GRN_INVALID_ARGUMENT, "[ii][column][update][old] invalid object: " "<%.*s>: " "<%-.256s>(%#x)", name_size, name, grn_obj_type_to_string(type), type); } goto exit; } } if (old) { grn_id 
eid; grn_hash *o = (grn_hash *)old; grn_hash *n = (grn_hash *)new; GRN_HASH_EACH(ctx, o, id, &tp, NULL, &u, { if (n && (eid = grn_hash_get(ctx, n, tp, sizeof(grn_id), (void **) &un))) { if (do_grn_ii_updspec_cmp && !grn_ii_updspec_cmp(*u, *un)) { grn_ii_updspec_close(ctx, *un); grn_hash_delete_by_id(ctx, n, eid, NULL); } } else { grn_ii_delete_one(ctx, ii, *tp, *u, n); } grn_ii_updspec_close(ctx, *u); if (ctx->rc != GRN_SUCCESS) { break; } }); } if (new) { grn_hash *n = (grn_hash *)new; GRN_HASH_EACH(ctx, n, id, &tp, NULL, &u, { grn_ii_update_one(ctx, ii, *tp, *u, n); grn_ii_updspec_close(ctx, *u); if (ctx->rc != GRN_SUCCESS) { break; } }); } else { if (!section) { /* todo: delete key when all sections deleted */ } } exit : grn_io_unlock(ii->seg); if (old && old != oldvalue) { grn_obj_close(ctx, old); } if (new && new != newvalue) { grn_obj_close(ctx, new); } return ctx->rc; } /* token_info */ typedef struct { cursor_heap *cursors; int offset; int pos; int size; int ntoken; grn_posting *p; } token_info; #define EX_NONE 0 #define EX_PREFIX 1 #define EX_SUFFIX 2 #define EX_BOTH 3 #define EX_FUZZY 4 inline static void token_info_expand_both(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *key, unsigned int key_size, token_info *ti) { int s = 0; grn_hash *h, *g; uint32_t *offset2; grn_hash_cursor *c; grn_id *tp, *tq; if ((h = grn_hash_create(ctx, NULL, sizeof(grn_id), 0, 0))) { grn_table_search(ctx, lexicon, key, key_size, GRN_OP_PREFIX, (grn_obj *)h, GRN_OP_OR); if (GRN_HASH_SIZE(h)) { if ((ti->cursors = cursor_heap_open(ctx, GRN_HASH_SIZE(h) + 256))) { if ((c = grn_hash_cursor_open(ctx, h, NULL, 0, NULL, 0, 0, -1, 0))) { uint32_t key2_size; const char *key2; while (grn_hash_cursor_next(ctx, c)) { grn_hash_cursor_get_key(ctx, c, (void **) &tp); key2 = _grn_table_key(ctx, lexicon, *tp, &key2_size); if (!key2) { break; } if ((lexicon->header.type != GRN_TABLE_PAT_KEY) || !(lexicon->header.flags & GRN_OBJ_KEY_WITH_SIS) || key2_size <= 2) { // todo: refine if ((s = 
grn_ii_estimate_size(ctx, ii, *tp))) { cursor_heap_push(ctx, ti->cursors, ii, *tp, 0, 0, GRN_ID_NIL); ti->ntoken++; ti->size += s; } } else { if ((g = grn_hash_create(ctx, NULL, sizeof(grn_id), 0, GRN_HASH_TINY))) { grn_pat_suffix_search(ctx, (grn_pat *)lexicon, key2, key2_size, g); GRN_HASH_EACH(ctx, g, id, &tq, NULL, &offset2, { if ((s = grn_ii_estimate_size(ctx, ii, *tq))) { cursor_heap_push(ctx, ti->cursors, ii, *tq, /* *offset2 */ 0, 0, GRN_ID_NIL); ti->ntoken++; ti->size += s; } }); grn_hash_close(ctx, g); } } } grn_hash_cursor_close(ctx, c); } } } grn_hash_close(ctx, h); } } inline static grn_rc token_info_close(grn_ctx *ctx, token_info *ti) { cursor_heap_close(ctx, ti->cursors); GRN_FREE(ti); return GRN_SUCCESS; } inline static token_info * token_info_open(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *key, unsigned int key_size, uint32_t offset, int mode, grn_fuzzy_search_optarg *args, grn_id min) { int s = 0; grn_hash *h; token_info *ti; grn_id tid; grn_id *tp; if (!key) { return NULL; } if (!(ti = GRN_MALLOC(sizeof(token_info)))) { return NULL; } ti->cursors = NULL; ti->size = 0; ti->ntoken = 0; ti->offset = offset; switch (mode) { case EX_BOTH : token_info_expand_both(ctx, lexicon, ii, key, key_size, ti); break; case EX_NONE : if ((tid = grn_table_get(ctx, lexicon, key, key_size)) && (s = grn_ii_estimate_size(ctx, ii, tid)) && (ti->cursors = cursor_heap_open(ctx, 1))) { cursor_heap_push(ctx, ti->cursors, ii, tid, 0, 0, min); ti->ntoken++; ti->size = s; } break; case EX_PREFIX : if ((h = grn_hash_create(ctx, NULL, sizeof(grn_id), 0, 0))) { grn_table_search(ctx, lexicon, key, key_size, GRN_OP_PREFIX, (grn_obj *)h, GRN_OP_OR); if (GRN_HASH_SIZE(h)) { if ((ti->cursors = cursor_heap_open(ctx, GRN_HASH_SIZE(h)))) { GRN_HASH_EACH(ctx, h, id, &tp, NULL, NULL, { if ((s = grn_ii_estimate_size(ctx, ii, *tp))) { cursor_heap_push(ctx, ti->cursors, ii, *tp, 0, 0, min); ti->ntoken++; ti->size += s; } }); } } grn_hash_close(ctx, h); } break; case EX_SUFFIX : 
if ((h = grn_hash_create(ctx, NULL, sizeof(grn_id), 0, 0))) { grn_table_search(ctx, lexicon, key, key_size, GRN_OP_SUFFIX, (grn_obj *)h, GRN_OP_OR); if (GRN_HASH_SIZE(h)) { if ((ti->cursors = cursor_heap_open(ctx, GRN_HASH_SIZE(h)))) { uint32_t *offset2; GRN_HASH_EACH(ctx, h, id, &tp, NULL, &offset2, { if ((s = grn_ii_estimate_size(ctx, ii, *tp))) { cursor_heap_push(ctx, ti->cursors, ii, *tp, /* *offset2 */ 0, 0, min); ti->ntoken++; ti->size += s; } }); } } grn_hash_close(ctx, h); } break; case EX_FUZZY : if ((h = (grn_hash *)grn_table_create(ctx, NULL, 0, NULL, GRN_OBJ_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, grn_ctx_at(ctx, GRN_DB_UINT32), NULL))) { grn_table_fuzzy_search(ctx, lexicon, key, key_size, args, (grn_obj *)h, GRN_OP_OR); if (GRN_HASH_SIZE(h)) { if ((ti->cursors = cursor_heap_open(ctx, GRN_HASH_SIZE(h)))) { grn_rset_recinfo *ri; GRN_HASH_EACH(ctx, h, id, &tp, NULL, (void **)&ri, { if ((s = grn_ii_estimate_size(ctx, ii, *tp))) { cursor_heap_push(ctx, ti->cursors, ii, *tp, 0, ri->score - 1, min); ti->ntoken++; ti->size += s; } }); } } grn_obj_close(ctx, (grn_obj *)h); } break; } if (cursor_heap_push2(ti->cursors)) { token_info_close(ctx, ti); return NULL; } { grn_ii_cursor *ic; if (ti->cursors && (ic = cursor_heap_min(ti->cursors))) { grn_posting *p = ic->post; ti->pos = p->pos - ti->offset; ti->p = p; } else { token_info_close(ctx, ti); ti = NULL; } } return ti; } static inline grn_rc token_info_skip(grn_ctx *ctx, token_info *ti, uint32_t rid, uint32_t sid) { grn_ii_cursor *c; grn_posting *p; for (;;) { if (!(c = cursor_heap_min(ti->cursors))) { return GRN_END_OF_DATA; } p = c->post; if (p->rid > rid || (p->rid == rid && p->sid >= sid)) { break; } cursor_heap_pop(ctx, ti->cursors, rid); } ti->pos = p->pos - ti->offset; ti->p = p; return GRN_SUCCESS; } static inline grn_rc token_info_skip_pos(grn_ctx *ctx, token_info *ti, uint32_t rid, uint32_t sid, uint32_t pos) { grn_ii_cursor *c; grn_posting *p; pos += ti->offset; for (;;) { if (!(c = 
cursor_heap_min(ti->cursors))) { return GRN_END_OF_DATA; } p = c->post; if (p->rid != rid || p->sid != sid || p->pos >= pos) { break; } cursor_heap_pop_pos(ctx, ti->cursors); } ti->pos = p->pos - ti->offset; ti->p = p; return GRN_SUCCESS; } inline static int token_compare(const void *a, const void *b) { const token_info *t1 = *((token_info **)a), *t2 = *((token_info **)b); return t1->size - t2->size; } #define TOKEN_CANDIDATE_NODE_SIZE 32 #define TOKEN_CANDIDATE_ADJACENT_MAX_SIZE 16 #define TOKEN_CANDIDATE_QUEUE_SIZE 64 #define TOKEN_CANDIDATE_SIZE 16 typedef struct { grn_id tid; const unsigned char *token; uint32_t token_size; int32_t pos; grn_token_cursor_status status; int ef; uint32_t estimated_size; uint8_t adjacent[TOKEN_CANDIDATE_ADJACENT_MAX_SIZE]; /* Index of adjacent node from top */ uint8_t n_adjacent; } token_candidate_node; typedef struct { uint32_t *candidates; /* Standing bits indicate index of token_candidate_node */ int top; int rear; int size; } token_candidate_queue; inline static void token_candidate_adjacent_set(grn_ctx *ctx, grn_token_cursor *token_cursor, token_candidate_node *top, token_candidate_node *curr) { grn_bool exists_adjacent = GRN_FALSE; token_candidate_node *adj; for (adj = top; adj < curr; adj++) { if (token_cursor->curr <= adj->token + adj->token_size) { if (adj->n_adjacent < TOKEN_CANDIDATE_ADJACENT_MAX_SIZE) { adj->adjacent[adj->n_adjacent] = curr - top; adj->n_adjacent++; exists_adjacent = GRN_TRUE; } } } if (!exists_adjacent) { adj = curr - 1; if (adj->n_adjacent < TOKEN_CANDIDATE_ADJACENT_MAX_SIZE) { adj->adjacent[adj->n_adjacent] = curr - top; adj->n_adjacent++; } } } inline static grn_rc token_candidate_init(grn_ctx *ctx, grn_ii *ii, grn_token_cursor *token_cursor, grn_id tid, int ef, token_candidate_node **nodes, int *n_nodes, uint32_t *max_estimated_size) { grn_rc rc; token_candidate_node *top, *curr; int size = TOKEN_CANDIDATE_NODE_SIZE; *nodes = GRN_MALLOC(TOKEN_CANDIDATE_NODE_SIZE * sizeof(token_candidate_node)); if 
(!*nodes) { return GRN_NO_MEMORY_AVAILABLE; } top = *nodes; curr = top; #define TOKEN_CANDIDATE_NODE_SET() { \ curr->tid = tid; \ curr->token = token_cursor->curr; \ curr->token_size = token_cursor->curr_size; \ curr->pos = token_cursor->pos; \ curr->status = token_cursor->status; \ curr->ef = ef; \ curr->estimated_size = grn_ii_estimate_size(ctx, ii, tid); \ curr->n_adjacent = 0; \ } TOKEN_CANDIDATE_NODE_SET(); GRN_LOG(ctx, GRN_LOG_DEBUG, "[ii][overlap_token_skip] tid=%u pos=%d estimated_size=%u", curr->tid, curr->pos, curr->estimated_size); *max_estimated_size = curr->estimated_size; curr++; while (token_cursor->status == GRN_TOKEN_CURSOR_DOING) { if (curr - top >= size) { if (!(*nodes = GRN_REALLOC(*nodes, (curr - top + TOKEN_CANDIDATE_NODE_SIZE) * sizeof(token_candidate_node)))) { return GRN_NO_MEMORY_AVAILABLE; } top = *nodes; curr = top + size; size += TOKEN_CANDIDATE_NODE_SIZE; } tid = grn_token_cursor_next(ctx, token_cursor); if (token_cursor->status != GRN_TOKEN_CURSOR_DONE_SKIP) { if (token_cursor->force_prefix) { ef |= EX_PREFIX; } TOKEN_CANDIDATE_NODE_SET(); token_candidate_adjacent_set(ctx, token_cursor, top, curr); if (curr->estimated_size > *max_estimated_size) { *max_estimated_size = curr->estimated_size; } curr++; } } *n_nodes = curr - top; rc = GRN_SUCCESS; return rc; #undef TOKEN_CANDIDATE_NODE_SET } inline static grn_rc token_candidate_queue_init(grn_ctx *ctx, token_candidate_queue *q) { q->top = 0; q->rear = 0; q->size = TOKEN_CANDIDATE_QUEUE_SIZE; q->candidates = GRN_MALLOC(TOKEN_CANDIDATE_QUEUE_SIZE * sizeof(uint32_t)); if (!q->candidates) { q->size = 0; return GRN_NO_MEMORY_AVAILABLE; } return GRN_SUCCESS; } inline static grn_rc token_candidate_enqueue(grn_ctx *ctx, token_candidate_queue *q, uint32_t candidate) { if (q->rear >= q->size) { if (!(q->candidates = GRN_REALLOC(q->candidates, (q->rear + TOKEN_CANDIDATE_QUEUE_SIZE) * sizeof(uint32_t)))) { q->size = 0; return GRN_NO_MEMORY_AVAILABLE; } q->size += TOKEN_CANDIDATE_QUEUE_SIZE; } 
*(q->candidates + q->rear) = candidate; q->rear++; return GRN_SUCCESS; } inline static grn_rc token_candidate_dequeue(grn_ctx *ctx, token_candidate_queue *q, uint32_t *candidate) { if (q->top == q->rear) { return GRN_END_OF_DATA; } *candidate = *(q->candidates + q->top); q->top++; return GRN_SUCCESS; } inline static void token_candidate_queue_fin(grn_ctx *ctx, token_candidate_queue *q) { GRN_FREE(q->candidates); } inline static token_candidate_node* token_candidate_last_node(grn_ctx *ctx, token_candidate_node *nodes, uint32_t candidate, int offset) { int i; GRN_BIT_SCAN_REV(candidate, i); return nodes + i + offset; } inline static uint64_t token_candidate_score(grn_ctx *ctx, token_candidate_node *nodes, uint32_t candidate, int offset, uint32_t max_estimated_size) { int i, last; uint64_t score = 0; GRN_BIT_SCAN_REV(candidate, last); for (i = 0; i <= last; i++) { if (candidate & (1 << i)) { token_candidate_node *node = nodes + i + offset; if (node->estimated_size > 0) { score += max_estimated_size / node->estimated_size; } } } return score; } inline static grn_rc token_candidate_select(grn_ctx *ctx, token_candidate_node *nodes, int offset, int limit, int end, uint32_t *selected_candidate, uint32_t max_estimated_size) { grn_rc rc; token_candidate_queue q; uint32_t candidate; uint64_t max_score = 0; int i, min_n_nodes = 0; if (offset + limit > end) { limit = end - offset; } rc = token_candidate_queue_init(ctx, &q); if (rc != GRN_SUCCESS) { return rc; } rc = token_candidate_enqueue(ctx, &q, 1); if (rc != GRN_SUCCESS) { goto exit; } while (token_candidate_dequeue(ctx, &q, &candidate) != GRN_END_OF_DATA) { token_candidate_node *candidate_last_node = token_candidate_last_node(ctx, nodes, candidate, offset); for (i = 0; i < candidate_last_node->n_adjacent; i++) { int adjacent, n_nodes = 0; uint32_t new_candidate; adjacent = candidate_last_node->adjacent[i] - offset; if (adjacent > limit) { break; } new_candidate = candidate | (1 << adjacent); GET_NUM_BITS(new_candidate, 
n_nodes); if (min_n_nodes > 0 && n_nodes > min_n_nodes + 1) { goto exit; } rc = token_candidate_enqueue(ctx, &q, new_candidate); if (rc != GRN_SUCCESS) { goto exit; } if (adjacent == limit) { if (min_n_nodes == 0) { min_n_nodes = n_nodes; } if (n_nodes >= min_n_nodes && n_nodes <= min_n_nodes + 1) { uint64_t score; score = token_candidate_score(ctx, nodes, new_candidate, offset, max_estimated_size); if (score > max_score) { max_score = score; *selected_candidate = new_candidate; } } } } } rc = GRN_SUCCESS; exit : token_candidate_queue_fin(ctx, &q); return rc; } inline static grn_rc token_candidate_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, token_info **tis, uint32_t *n, token_candidate_node *nodes, uint32_t selected_candidate, int offset, grn_id min) { grn_rc rc = GRN_END_OF_DATA; token_info *ti; const char *key; uint32_t size; int i, last = 0; GRN_BIT_SCAN_REV(selected_candidate, last); for (i = 1; i <= last; i++) { if (selected_candidate & (1 << i)) { token_candidate_node *node = nodes + i + offset; switch (node->status) { case GRN_TOKEN_CURSOR_DOING : key = _grn_table_key(ctx, lexicon, node->tid, &size); ti = token_info_open(ctx, lexicon, ii, key, size, node->pos, EX_NONE, NULL, min); break; case GRN_TOKEN_CURSOR_DONE : if (node->tid) { key = _grn_table_key(ctx, lexicon, node->tid, &size); ti = token_info_open(ctx, lexicon, ii, key, size, node->pos, node->ef & EX_PREFIX, NULL, min); break; } /* else fallthru */ default : ti = token_info_open(ctx, lexicon, ii, (char *)node->token, node->token_size, node->pos, node->ef & EX_PREFIX, NULL, min); break; } if (!ti) { goto exit; } tis[(*n)++] = ti; GRN_LOG(ctx, GRN_LOG_DEBUG, "[ii][overlap_token_skip] tid=%u pos=%d estimated_size=%u", node->tid, node->pos, node->estimated_size); } } rc = GRN_SUCCESS; exit : return rc; } inline static grn_rc token_info_build_skipping_overlap(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, token_info **tis, uint32_t *n, grn_token_cursor *token_cursor, grn_id tid, int ef, grn_id min) { 
grn_rc rc; token_candidate_node *nodes = NULL; int n_nodes = 0, offset = 0, limit = TOKEN_CANDIDATE_SIZE - 1; uint32_t max_estimated_size; rc = token_candidate_init(ctx, ii, token_cursor, tid, ef, &nodes, &n_nodes, &max_estimated_size); if (rc != GRN_SUCCESS) { return rc; } while (offset < n_nodes - 1) { uint32_t selected_candidate = 0; rc = token_candidate_select(ctx, nodes, offset, limit, n_nodes - 1, &selected_candidate, max_estimated_size); if (rc != GRN_SUCCESS) { goto exit; } rc = token_candidate_build(ctx, lexicon, ii, tis, n, nodes, selected_candidate, offset, min); if (rc != GRN_SUCCESS) { goto exit; } offset += limit; } rc = GRN_SUCCESS; exit : if (nodes) { GRN_FREE(nodes); } return rc; } inline static grn_rc token_info_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *string, unsigned int string_len, token_info **tis, uint32_t *n, grn_bool *only_skip_token, grn_id min, grn_operator mode) { token_info *ti; const char *key; uint32_t size; grn_rc rc = GRN_END_OF_DATA; unsigned int token_flags = GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER; grn_token_cursor *token_cursor = grn_token_cursor_open(ctx, lexicon, string, string_len, GRN_TOKEN_GET, token_flags); *only_skip_token = GRN_FALSE; if (!token_cursor) { return GRN_NO_MEMORY_AVAILABLE; } if (mode == GRN_OP_UNSPLIT) { if ((ti = token_info_open(ctx, lexicon, ii, (char *)token_cursor->orig, token_cursor->orig_blen, 0, EX_BOTH, NULL, min))) { tis[(*n)++] = ti; rc = GRN_SUCCESS; } } else { grn_id tid; int ef; switch (mode) { case GRN_OP_PREFIX : ef = EX_PREFIX; break; case GRN_OP_SUFFIX : ef = EX_SUFFIX; break; case GRN_OP_PARTIAL : ef = EX_BOTH; break; default : ef = EX_NONE; break; } tid = grn_token_cursor_next(ctx, token_cursor); if (token_cursor->force_prefix) { ef |= EX_PREFIX; } switch (token_cursor->status) { case GRN_TOKEN_CURSOR_DOING : key = _grn_table_key(ctx, lexicon, tid, &size); ti = token_info_open(ctx, lexicon, ii, key, size, token_cursor->pos, ef & EX_SUFFIX, NULL, min); break; case 
GRN_TOKEN_CURSOR_DONE : ti = token_info_open(ctx, lexicon, ii, (const char *)token_cursor->curr, token_cursor->curr_size, 0, ef, NULL, min); /* key = _grn_table_key(ctx, lexicon, tid, &size); ti = token_info_open(ctx, lexicon, ii, token_cursor->curr, token_cursor->curr_size, token_cursor->pos, ef, NULL, GRN_ID_NIL); ti = token_info_open(ctx, lexicon, ii, (char *)token_cursor->orig, token_cursor->orig_blen, token_cursor->pos, ef, NULL, GRN_ID_NIL); */ break; case GRN_TOKEN_CURSOR_NOT_FOUND : ti = token_info_open(ctx, lexicon, ii, (char *)token_cursor->orig, token_cursor->orig_blen, 0, ef, NULL, min); break; case GRN_TOKEN_CURSOR_DONE_SKIP : *only_skip_token = GRN_TRUE; goto exit; default : goto exit; } if (!ti) { goto exit ; } tis[(*n)++] = ti; if (grn_ii_overlap_token_skip_enable) { rc = token_info_build_skipping_overlap(ctx, lexicon, ii, tis, n, token_cursor, tid, ef, min); goto exit; } while (token_cursor->status == GRN_TOKEN_CURSOR_DOING) { tid = grn_token_cursor_next(ctx, token_cursor); if (token_cursor->force_prefix) { ef |= EX_PREFIX; } switch (token_cursor->status) { case GRN_TOKEN_CURSOR_DONE_SKIP : continue; case GRN_TOKEN_CURSOR_DOING : key = _grn_table_key(ctx, lexicon, tid, &size); ti = token_info_open(ctx, lexicon, ii, key, size, token_cursor->pos, EX_NONE, NULL, min); break; case GRN_TOKEN_CURSOR_DONE : if (tid) { key = _grn_table_key(ctx, lexicon, tid, &size); ti = token_info_open(ctx, lexicon, ii, key, size, token_cursor->pos, ef & EX_PREFIX, NULL, min); break; } /* else fallthru */ default : ti = token_info_open(ctx, lexicon, ii, (char *)token_cursor->curr, token_cursor->curr_size, token_cursor->pos, ef & EX_PREFIX, NULL, min); break; } if (!ti) { goto exit; } tis[(*n)++] = ti; } rc = GRN_SUCCESS; } exit : grn_token_cursor_close(ctx, token_cursor); return rc; } inline static grn_rc token_info_build_fuzzy(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *string, unsigned int string_len, token_info **tis, uint32_t *n, grn_bool *only_skip_token, 
grn_id min, grn_operator mode, grn_fuzzy_search_optarg *args) { token_info *ti; grn_rc rc = GRN_END_OF_DATA; unsigned int token_flags = GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER; grn_token_cursor *token_cursor = grn_token_cursor_open(ctx, lexicon, string, string_len, GRN_TOKENIZE_ONLY, token_flags); *only_skip_token = GRN_FALSE; if (!token_cursor) { return GRN_NO_MEMORY_AVAILABLE; } grn_token_cursor_next(ctx, token_cursor); switch (token_cursor->status) { case GRN_TOKEN_CURSOR_DONE_SKIP : *only_skip_token = GRN_TRUE; goto exit; case GRN_TOKEN_CURSOR_DOING : case GRN_TOKEN_CURSOR_DONE : ti = token_info_open(ctx, lexicon, ii, (const char *)token_cursor->curr, token_cursor->curr_size, token_cursor->pos, EX_FUZZY, args, min); break; default : ti = NULL; break; } if (!ti) { goto exit ; } tis[(*n)++] = ti; while (token_cursor->status == GRN_TOKEN_CURSOR_DOING) { grn_token_cursor_next(ctx, token_cursor); switch (token_cursor->status) { case GRN_TOKEN_CURSOR_DONE_SKIP : continue; case GRN_TOKEN_CURSOR_DOING : case GRN_TOKEN_CURSOR_DONE : ti = token_info_open(ctx, lexicon, ii, (const char *)token_cursor->curr, token_cursor->curr_size, token_cursor->pos, EX_FUZZY, args, min); break; default : break; } if (!ti) { goto exit; } tis[(*n)++] = ti; } rc = GRN_SUCCESS; exit : grn_token_cursor_close(ctx, token_cursor); return rc; } static void token_info_clear_offset(token_info **tis, uint32_t n) { token_info **tie; for (tie = tis + n; tis < tie; tis++) { (*tis)->offset = 0; } } /* select */ inline static void res_add(grn_ctx *ctx, grn_hash *s, grn_rset_posinfo *pi, double score, grn_operator op) { grn_rset_recinfo *ri; switch (op) { case GRN_OP_OR : if (grn_hash_add(ctx, s, pi, s->key_size, (void **)&ri, NULL)) { if (s->obj.header.flags & GRN_OBJ_WITH_SUBREC) { grn_table_add_subrec((grn_obj *)s, ri, score, pi, 1); } } break; case GRN_OP_AND : if (grn_hash_get(ctx, s, pi, s->key_size, (void **)&ri)) { if (s->obj.header.flags & GRN_OBJ_WITH_SUBREC) { ri->n_subrecs |= 
GRN_RSET_UTIL_BIT; grn_table_add_subrec((grn_obj *)s, ri, score, pi, 1); } } break; case GRN_OP_AND_NOT : { grn_id id; if ((id = grn_hash_get(ctx, s, pi, s->key_size, (void **)&ri))) { grn_hash_delete_by_id(ctx, s, id, NULL); } } break; case GRN_OP_ADJUST : if (grn_hash_get(ctx, s, pi, s->key_size, (void **)&ri)) { if (s->obj.header.flags & GRN_OBJ_WITH_SUBREC) { ri->score += score; } } break; default : break; } } grn_rc grn_ii_posting_add(grn_ctx *ctx, grn_posting *pos, grn_hash *s, grn_operator op) { res_add(ctx, s, (grn_rset_posinfo *)(pos), (1 + pos->weight), op); return ctx->rc; } #ifdef USE_BHEAP /* todo */ #else /* USE_BHEAP */ struct _btr_node { struct _btr_node *car; struct _btr_node *cdr; token_info *ti; }; typedef struct _btr_node btr_node; typedef struct { int n; token_info *min; token_info *max; btr_node *root; btr_node *nodes; } btr; inline static void bt_zap(btr *bt) { bt->n = 0; bt->min = NULL; bt->max = NULL; bt->root = NULL; } inline static btr * bt_open(grn_ctx *ctx, int size) { btr *bt = GRN_MALLOC(sizeof(btr)); if (bt) { bt_zap(bt); if (!(bt->nodes = GRN_MALLOC(sizeof(btr_node) * size))) { GRN_FREE(bt); bt = NULL; } } return bt; } inline static void bt_close(grn_ctx *ctx, btr *bt) { if (!bt) { return; } GRN_FREE(bt->nodes); GRN_FREE(bt); } inline static void bt_push(btr *bt, token_info *ti) { int pos = ti->pos, minp = 1, maxp = 1; btr_node *node, *new, **last; new = bt->nodes + bt->n++; new->ti = ti; new->car = NULL; new->cdr = NULL; for (last = &bt->root; (node = *last);) { if (pos < node->ti->pos) { last = &node->car; maxp = 0; } else { last = &node->cdr; minp = 0; } } *last = new; if (minp) { bt->min = ti; } if (maxp) { bt->max = ti; } } inline static void bt_pop(btr *bt) { btr_node *node, *min, *newmin, **last; for (last = &bt->root; (min = *last) && min->car; last = &min->car) ; if (min) { int pos = min->ti->pos, minp = 1, maxp = 1; *last = min->cdr; min->cdr = NULL; for (last = &bt->root; (node = *last);) { if (pos < node->ti->pos) { last 
= &node->car; maxp = 0; } else { last = &node->cdr; minp = 0; } } *last = min; if (maxp) { bt->max = min->ti; } if (!minp) { for (newmin = bt->root; newmin->car; newmin = newmin->car) ; bt->min = newmin->ti; } } } #endif /* USE_BHEAP */ typedef enum { grn_wv_none = 0, grn_wv_static, grn_wv_dynamic, grn_wv_constant } grn_wv_mode; inline static double get_weight(grn_ctx *ctx, grn_hash *s, grn_id rid, int sid, grn_wv_mode wvm, grn_select_optarg *optarg) { switch (wvm) { case grn_wv_none : return 1; case grn_wv_static : return sid <= optarg->vector_size ? optarg->weight_vector[sid - 1] : 0; case grn_wv_dynamic : /* todo : support hash with keys if (s->keys) { uint32_t key_size; const char *key = _grn_table_key(ctx, s->keys, rid, &key_size); // todo : change grn_select_optarg return key ? optarg->func(s, key, key_size, sid, optarg->func_arg) : 0; } */ /* todo : cast */ return optarg->func(ctx, (void *)s, (void *)(intptr_t)rid, sid, optarg->func_arg); case grn_wv_constant : return optarg->vector_size; default : return 1; } } grn_rc grn_ii_similar_search(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_len, grn_hash *s, grn_operator op, grn_select_optarg *optarg) { int *w1, limit; grn_id tid, *tp, max_size; grn_rc rc = GRN_SUCCESS; grn_hash *h; grn_token_cursor *token_cursor; unsigned int token_flags = GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER; grn_obj *lexicon = ii->lexicon; if (!lexicon || !ii || !string || !string_len || !s || !optarg) { return GRN_INVALID_ARGUMENT; } if (!(h = grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(int), 0))) { return GRN_NO_MEMORY_AVAILABLE; } if (!(token_cursor = grn_token_cursor_open(ctx, lexicon, string, string_len, GRN_TOKEN_GET, token_flags))) { grn_hash_close(ctx, h); return GRN_NO_MEMORY_AVAILABLE; } if (!(max_size = optarg->max_size)) { max_size = 1048576; } while (token_cursor->status != GRN_TOKEN_CURSOR_DONE && token_cursor->status != GRN_TOKEN_CURSOR_DONE_SKIP) { if ((tid = grn_token_cursor_next(ctx, 
token_cursor))) { if (grn_hash_add(ctx, h, &tid, sizeof(grn_id), (void **)&w1, NULL)) { (*w1)++; } } if (tid && token_cursor->curr_size) { if (optarg->mode == GRN_OP_UNSPLIT) { grn_table_search(ctx, lexicon, token_cursor->curr, token_cursor->curr_size, GRN_OP_PREFIX, (grn_obj *)h, GRN_OP_OR); } if (optarg->mode == GRN_OP_PARTIAL) { grn_table_search(ctx, lexicon, token_cursor->curr, token_cursor->curr_size, GRN_OP_SUFFIX, (grn_obj *)h, GRN_OP_OR); } } } grn_token_cursor_close(ctx, token_cursor); { grn_hash_cursor *c = grn_hash_cursor_open(ctx, h, NULL, 0, NULL, 0, 0, -1, 0); if (!c) { GRN_LOG(ctx, GRN_LOG_ALERT, "grn_hash_cursor_open on grn_ii_similar_search failed !"); grn_hash_close(ctx, h); return GRN_NO_MEMORY_AVAILABLE; } while (grn_hash_cursor_next(ctx, c)) { uint32_t es; grn_hash_cursor_get_key_value(ctx, c, (void **) &tp, NULL, (void **) &w1); if ((es = grn_ii_estimate_size(ctx, ii, *tp))) { *w1 += max_size / es; } else { grn_hash_cursor_delete(ctx, c, NULL); } } grn_hash_cursor_close(ctx, c); } limit = optarg->similarity_threshold ? (optarg->similarity_threshold > (int) GRN_HASH_SIZE(h) ? GRN_HASH_SIZE(h) : optarg->similarity_threshold) : (GRN_HASH_SIZE(h) >> 3) + 1; if (GRN_HASH_SIZE(h)) { grn_id j, id; int w2, rep; grn_ii_cursor *c; grn_posting *pos; grn_wv_mode wvm = grn_wv_none; grn_table_sort_optarg arg = { GRN_TABLE_SORT_DESC|GRN_TABLE_SORT_BY_VALUE|GRN_TABLE_SORT_AS_NUMBER, NULL, NULL, 0, 0 }; grn_array *sorted = grn_array_create(ctx, NULL, sizeof(grn_id), 0); if (!sorted) { GRN_LOG(ctx, GRN_LOG_ALERT, "grn_hash_sort on grn_ii_similar_search failed !"); grn_hash_close(ctx, h); return GRN_NO_MEMORY_AVAILABLE; } grn_hash_sort(ctx, h, limit, sorted, &arg); /* todo support subrec rep = (s->record_unit == grn_rec_position || s->subrec_unit == grn_rec_position); */ rep = 0; if (optarg->func) { wvm = grn_wv_dynamic; } else if (optarg->vector_size) { wvm = optarg->weight_vector ? 
grn_wv_static : grn_wv_constant;
    }
    /* Visit the first `limit` entries of `sorted` (1-based) and merge the
       postings of each referenced term into the result set `s`. */
    for (j = 1; j <= (uint) limit; j++) {
      grn_array_get_value(ctx, sorted, j, &id);
      _grn_hash_get_key_value(ctx, h, id, (void **) &tp, (void **) &w1);
      if (!*tp || !(c = grn_ii_cursor_open(ctx, ii, *tp, GRN_ID_NIL, GRN_ID_MAX,
                                           rep
                                           ? ii->n_elements
                                           : ii->n_elements - 1, 0))) {
        /* A term whose cursor cannot be opened is logged and skipped. */
        GRN_LOG(ctx, GRN_LOG_ERROR, "cursor open failed (%d)", *tp);
        continue;
      }
      if (rep) {
        /* Position granularity: one res_add() per position.
           `rep` is hard-wired to 0 above, so this branch is currently dead. */
        while (grn_ii_cursor_next(ctx, c)) {
          pos = c->post;
          if ((w2 = get_weight(ctx, s, pos->rid, pos->sid, wvm, optarg)) > 0) {
            while (grn_ii_cursor_next_pos(ctx, c)) {
              res_add(ctx, s, (grn_rset_posinfo *) pos,
                      *w1 * w2 * (1 + pos->weight), op);
            }
          }
        }
      } else {
        /* One res_add() per posting; the score combines the term weight
           (*w1), the section weight (w2) and tf + posting weight. */
        while (grn_ii_cursor_next(ctx, c)) {
          pos = c->post;
          if ((w2 = get_weight(ctx, s, pos->rid, pos->sid, wvm, optarg)) > 0) {
            res_add(ctx, s, (grn_rset_posinfo *) pos,
                    *w1 * w2 * (pos->tf + pos->weight), op);
          }
        }
      }
      grn_ii_cursor_close(ctx, c);
    }
    grn_array_close(ctx, sorted);
  }
  grn_hash_close(ctx, h);
  grn_ii_resolve_sel_and(ctx, s, op);
  //  grn_hash_cursor_clear(r);
  return rc;
}

/* Values compared against `policy` in grn_ii_term_extract(). */
#define TERM_EXTRACT_EACH_POST 0
#define TERM_EXTRACT_EACH_TERM 1

/*
 * Scans the normalized form of `string` and, at each offset, looks up the
 * lexicon key found by grn_table_lcp_search(). The postings of every key
 * found are added to the result set `s` with operator `op`.
 *
 * `optarg->max_interval` is reused as the extraction policy:
 *   - TERM_EXTRACT_EACH_POST: the scan advances by the matched key's size
 *     and each hit is recorded with sid = byte offset in the normalized
 *     text and score = offset + 1.
 *   - otherwise: the scan advances by one character and each posting is
 *     recorded with the score returned by get_weight().
 *
 * Returns GRN_INVALID_ARGUMENT when ii, string, s or optarg is NULL, when
 * string_len is 0, or when the string object cannot be opened; otherwise
 * GRN_SUCCESS.
 */
grn_rc
grn_ii_term_extract(grn_ctx *ctx, grn_ii *ii, const char *string,
                    unsigned int string_len, grn_hash *s,
                    grn_operator op, grn_select_optarg *optarg)
{
  grn_rset_posinfo pi;
  grn_id tid;
  const char *p, *pe;
  grn_obj *nstr;
  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_ii_cursor *c;
  grn_posting *pos;
  int skip, rep, policy;
  grn_rc rc = GRN_SUCCESS;
  grn_wv_mode wvm = grn_wv_none;
  if (!ii || !string || !string_len || !s || !optarg) {
    return GRN_INVALID_ARGUMENT;
  }
  if (!(nstr = grn_string_open(ctx, string, string_len, NULL, 0))) {
    return GRN_INVALID_ARGUMENT;
  }
  policy = optarg->max_interval;
  if (optarg->func) {
    wvm = grn_wv_dynamic;
  } else if (optarg->vector_size) {
    wvm = optarg->weight_vector ? grn_wv_static : grn_wv_constant;
  }
  /* todo support subrec
  if (policy == TERM_EXTRACT_EACH_POST) {
    if ((rc = grn_records_reopen(s, grn_rec_section, grn_rec_none, 0))) { goto exit; }
  }
  rep = (s->record_unit == grn_rec_position || s->subrec_unit == grn_rec_position);
  */
  rep = 0;
  grn_string_get_normalized(ctx, nstr, &normalized, &normalized_length_in_bytes, NULL);
  /* `skip` is assigned on every path that reaches the loop increment. */
  for (p = normalized, pe = p + normalized_length_in_bytes; p < pe; p += skip) {
    if ((tid = grn_table_lcp_search(ctx, ii->lexicon, p, pe - p))) {
      if (policy == TERM_EXTRACT_EACH_POST) {
        /* Advance by the size of the matched key. */
        if (!(skip = grn_table_get_key(ctx, ii->lexicon, tid, NULL, 0))) { break; }
      } else {
        /* Advance by one character. */
        if (!(skip = (int)grn_charlen(ctx, p, pe))) { break; }
      }
      if (!(c = grn_ii_cursor_open(ctx, ii, tid, GRN_ID_NIL, GRN_ID_MAX,
                                   rep ? ii->n_elements : ii->n_elements - 1, 0))) {
        GRN_LOG(ctx, GRN_LOG_ERROR, "cursor open failed (%d)", tid);
        continue;
      }
      if (rep) {
        /* Dead while `rep` is fixed to 0. */
        while (grn_ii_cursor_next(ctx, c)) {
          pos = c->post;
          while (grn_ii_cursor_next_pos(ctx, c)) {
            res_add(ctx, s, (grn_rset_posinfo *) pos,
                    get_weight(ctx, s, pos->rid, pos->sid, wvm, optarg), op);
          }
        }
      } else {
        while (grn_ii_cursor_next(ctx, c)) {
          if (policy == TERM_EXTRACT_EACH_POST) {
            /* NOTE(review): pi.pos is never assigned here -- confirm that
               res_add() does not read it for this result set. */
            pi.rid = c->post->rid;
            pi.sid = p - normalized;
            res_add(ctx, s, &pi, pi.sid + 1, op);
          } else {
            pos = c->post;
            res_add(ctx, s, (grn_rset_posinfo *) pos,
                    get_weight(ctx, s, pos->rid, pos->sid, wvm, optarg), op);
          }
        }
      }
      grn_ii_cursor_close(ctx, c);
    } else {
      /* No lexicon key starts here: step over one character. */
      if (!(skip = (int)grn_charlen(ctx, p, pe))) { break; }
    }
  }
  grn_obj_close(ctx, nstr);
  return rc;
}

/* One match produced by a grn_ii_select_cursor. */
typedef struct {
  grn_id rid;
  uint32_t sid;
  uint32_t start_pos;
  uint32_t end_pos;
  uint32_t tf;
  uint32_t weight;
} grn_ii_select_cursor_posting;

/* State of an incremental search over one query string. */
typedef struct {
  btr *bt;
  grn_ii *ii;
  token_info **tis;
  uint32_t n_tis;
  int max_interval;
  grn_operator mode;
  grn_ii_select_cursor_posting posting;
  const char *string;
  unsigned int string_len;
  grn_bool done;
  /* A posting pushed back by grn_ii_select_cursor_unshift(); it is returned
     by the next grn_ii_select_cursor_next() call. */
  grn_ii_select_cursor_posting unshifted_posting;
  grn_bool have_unshifted_posting;
} grn_ii_select_cursor;

static grn_rc
grn_ii_select_cursor_close(grn_ctx *ctx, grn_ii_select_cursor *cursor) { token_info **tip; if (!cursor) { return GRN_SUCCESS; } for (tip = cursor->tis; tip < cursor->tis + cursor->n_tis; tip++) { if (*tip) { token_info_close(ctx, *tip); } } if (cursor->tis) { GRN_FREE(cursor->tis); } bt_close(ctx, cursor->bt); GRN_FREE(cursor); return GRN_SUCCESS; } static grn_ii_select_cursor * grn_ii_select_cursor_open(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_len, grn_select_optarg *optarg) { grn_operator mode = GRN_OP_EXACT; grn_ii_select_cursor *cursor; if (string_len == 0) { ERR(GRN_INVALID_ARGUMENT, "[ii][select][cursor][open] empty string"); return NULL; } if (optarg) { mode = optarg->mode; } switch (mode) { case GRN_OP_EXACT : case GRN_OP_FUZZY : case GRN_OP_NEAR : case GRN_OP_NEAR2 : break; default : ERR(GRN_INVALID_ARGUMENT, "[ii][select][cursor][open] " "EXACT, FUZZY, NEAR and NEAR2 are only supported mode: %-.256s", grn_operator_to_string(mode)); break; } cursor = GRN_CALLOC(sizeof(grn_ii_select_cursor)); if (!cursor) { ERR(ctx->rc, "[ii][select][cursor][open] failed to allocate cursor: %-.256s", ctx->errbuf); return NULL; } cursor->ii = ii; cursor->mode = mode; if (!(cursor->tis = GRN_MALLOC(sizeof(token_info *) * string_len * 2))) { ERR(ctx->rc, "[ii][select][cursor][open] failed to allocate token info container: %-.256s", ctx->errbuf); GRN_FREE(cursor); return NULL; } cursor->n_tis = 0; if (cursor->mode == GRN_OP_FUZZY) { grn_bool only_skip_token = GRN_FALSE; grn_id previous_min = GRN_ID_NIL; if (token_info_build_fuzzy(ctx, ii->lexicon, ii, string, string_len, cursor->tis, &(cursor->n_tis), &only_skip_token, previous_min, cursor->mode, &(optarg->fuzzy)) != GRN_SUCCESS) { grn_ii_select_cursor_close(ctx, cursor); return NULL; } } else { grn_bool only_skip_token = GRN_FALSE; grn_id previous_min = GRN_ID_NIL; if (token_info_build(ctx, ii->lexicon, ii, string, string_len, cursor->tis, &(cursor->n_tis), &only_skip_token, previous_min, cursor->mode) 
!= GRN_SUCCESS) { grn_ii_select_cursor_close(ctx, cursor); return NULL; } } if (cursor->n_tis == 0) { grn_ii_select_cursor_close(ctx, cursor); return NULL; } switch (cursor->mode) { case GRN_OP_NEAR2 : token_info_clear_offset(cursor->tis, cursor->n_tis); cursor->mode = GRN_OP_NEAR; /* fallthru */ case GRN_OP_NEAR : if (!(cursor->bt = bt_open(ctx, cursor->n_tis))) { ERR(ctx->rc, "[ii][select][cursor][open] failed to allocate btree: %-.256s", ctx->errbuf); grn_ii_select_cursor_close(ctx, cursor); return NULL; } cursor->max_interval = optarg->max_interval; break; default : break; } qsort(cursor->tis, cursor->n_tis, sizeof(token_info *), token_compare); GRN_LOG(ctx, GRN_LOG_INFO, "[ii][select][cursor][open] n=%d <%.*s>", cursor->n_tis, string_len, string); cursor->string = string; cursor->string_len = string_len; cursor->done = GRN_FALSE; cursor->have_unshifted_posting = GRN_FALSE; return cursor; } static grn_ii_select_cursor_posting * grn_ii_select_cursor_next(grn_ctx *ctx, grn_ii_select_cursor *cursor) { btr *bt = cursor->bt; token_info **tis = cursor->tis; token_info **tie = tis + cursor->n_tis; uint32_t n_tis = cursor->n_tis; int max_interval = cursor->max_interval; grn_operator mode = cursor->mode; if (cursor->have_unshifted_posting) { cursor->have_unshifted_posting = GRN_FALSE; return &(cursor->unshifted_posting); } if (cursor->done) { return NULL; } for (;;) { grn_id rid; grn_id sid; grn_id next_rid; grn_id next_sid; token_info **tip; rid = (*tis)->p->rid; sid = (*tis)->p->sid; for (tip = tis + 1, next_rid = rid, next_sid = sid + 1; tip < tie; tip++) { token_info *ti = *tip; if (token_info_skip(ctx, ti, rid, sid)) { return NULL; } if (ti->p->rid != rid || ti->p->sid != sid) { next_rid = ti->p->rid; next_sid = ti->p->sid; break; } } if (tip == tie) { int start_pos = 0; int pos = 0; int end_pos = 0; int score = 0; int tf = 0; int tscore = 0; #define SKIP_OR_BREAK(pos) {\ if (token_info_skip_pos(ctx, ti, rid, sid, pos)) { break; } \ if (ti->p->rid != rid || 
ti->p->sid != sid) { \ next_rid = ti->p->rid; \ next_sid = ti->p->sid; \ break; \ } \ } #define RETURN_POSTING() do { \ cursor->posting.rid = rid; \ cursor->posting.sid = sid; \ cursor->posting.start_pos = start_pos; \ cursor->posting.end_pos = end_pos; \ cursor->posting.tf = tf; \ cursor->posting.weight = tscore; \ if (token_info_skip_pos(ctx, *tis, rid, sid, pos) != GRN_SUCCESS) { \ if (token_info_skip(ctx, *tis, next_rid, next_sid) != GRN_SUCCESS) { \ cursor->done = GRN_TRUE; \ } \ } \ return &(cursor->posting); \ } while (GRN_FALSE) if (n_tis == 1) { start_pos = pos = end_pos = (*tis)->p->pos; pos++; tf = (*tis)->p->tf; tscore = (*tis)->p->weight + (*tis)->cursors->bins[0]->weight; RETURN_POSTING(); } else if (mode == GRN_OP_NEAR) { bt_zap(bt); for (tip = tis; tip < tie; tip++) { token_info *ti = *tip; SKIP_OR_BREAK(pos); bt_push(bt, ti); } if (tip == tie) { for (;;) { token_info *ti; int min; int max; ti = bt->min; min = ti->pos; max = bt->max->pos; if (min > max) { char ii_name[GRN_TABLE_MAX_KEY_SIZE]; int ii_name_size; ii_name_size = grn_obj_name(ctx, (grn_obj *)(cursor->ii), ii_name, GRN_TABLE_MAX_KEY_SIZE); ERR(GRN_FILE_CORRUPT, "[ii][select][cursor][near] " "max position must be larger than min position: " "min:<%d> max:<%d> ii:<%.*s> string:<%.*s>", min, max, ii_name_size, ii_name, cursor->string_len, cursor->string); return NULL; } if ((max_interval < 0) || (max - min <= max_interval)) { /* TODO: Set start_pos, pos, end_pos, tf and tscore */ RETURN_POSTING(); if (ti->pos == max + 1) { break; } SKIP_OR_BREAK(max + 1); } else { if (ti->pos == max - max_interval) { break; } SKIP_OR_BREAK(max - max_interval); } bt_pop(bt); } } } else { int count = 0; for (tip = tis; ; tip++) { token_info *ti; if (tip == tie) { tip = tis; } ti = *tip; SKIP_OR_BREAK(pos); if (ti->pos == pos) { score += ti->p->weight + ti->cursors->bins[0]->weight; count++; if ((int) ti->p->pos > end_pos) { end_pos = ti->p->pos; } } else { score = ti->p->weight + ti->cursors->bins[0]->weight; 
count = 1; start_pos = pos = ti->pos; end_pos = ti->p->pos; } if (count == (int) n_tis) { pos++; if ((int) ti->p->pos > end_pos) { end_pos = ti->p->pos; } tf = 1; tscore += score; RETURN_POSTING(); } } } #undef SKIP_OR_BREAK } if (token_info_skip(ctx, *tis, next_rid, next_sid)) { return NULL; } } } static void grn_ii_select_cursor_unshift(grn_ctx *ctx, grn_ii_select_cursor *cursor, grn_ii_select_cursor_posting *posting) { cursor->unshifted_posting = *posting; cursor->have_unshifted_posting = GRN_TRUE; } static grn_rc grn_ii_parse_regexp_query(grn_ctx *ctx, const char *log_tag, const char *string, unsigned int string_len, grn_obj *parsed_strings) { grn_bool escaping = GRN_FALSE; int nth_char = 0; const char *current = string; const char *string_end = string + string_len; grn_obj buffer; GRN_TEXT_INIT(&buffer, 0); while (current < string_end) { const char *target; int char_len; char_len = grn_charlen(ctx, current, string_end); if (char_len == 0) { GRN_OBJ_FIN(ctx, &buffer); ERR(GRN_INVALID_ARGUMENT, "%-.256s invalid encoding character: <%.*s|%#x|>", log_tag, (int)(current - string), string, *current); return ctx->rc; } target = current; current += char_len; if (escaping) { escaping = GRN_FALSE; if (char_len == 1) { switch (*target) { case 'A' : if (nth_char == 0) { target = GRN_TOKENIZER_BEGIN_MARK_UTF8; char_len = GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN; } break; case 'z' : if (current == string_end) { target = GRN_TOKENIZER_END_MARK_UTF8; char_len = GRN_TOKENIZER_END_MARK_UTF8_LEN; } break; default : break; } } } else { if (char_len == 1) { if (*target == '\\') { escaping = GRN_TRUE; continue; } else if (*target == '.' 
&& grn_charlen(ctx, current, string_end) == 1 && *current == '*') { if (GRN_TEXT_LEN(&buffer) > 0) { grn_vector_add_element(ctx, parsed_strings, GRN_TEXT_VALUE(&buffer), GRN_TEXT_LEN(&buffer), 0, GRN_DB_TEXT); GRN_BULK_REWIND(&buffer); } current++; nth_char++; continue; } } } GRN_TEXT_PUT(ctx, &buffer, target, char_len); nth_char++; } if (GRN_TEXT_LEN(&buffer) > 0) { grn_vector_add_element(ctx, parsed_strings, GRN_TEXT_VALUE(&buffer), GRN_TEXT_LEN(&buffer), 0, GRN_DB_TEXT); } GRN_OBJ_FIN(ctx, &buffer); return GRN_SUCCESS; } static grn_rc grn_ii_select_regexp(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_len, grn_hash *s, grn_operator op, grn_select_optarg *optarg) { grn_rc rc; grn_obj parsed_strings; unsigned int n_parsed_strings; GRN_TEXT_INIT(&parsed_strings, GRN_OBJ_VECTOR); rc = grn_ii_parse_regexp_query(ctx, "[ii][select][regexp]", string, string_len, &parsed_strings); if (rc != GRN_SUCCESS) { GRN_OBJ_FIN(ctx, &parsed_strings); return rc; } if (optarg) { optarg->mode = GRN_OP_EXACT; } n_parsed_strings = grn_vector_size(ctx, &parsed_strings); if (n_parsed_strings == 1) { const char *parsed_string; unsigned int parsed_string_len; parsed_string_len = grn_vector_get_element(ctx, &parsed_strings, 0, &parsed_string, NULL, NULL); rc = grn_ii_select(ctx, ii, parsed_string, parsed_string_len, s, op, optarg); } else { int i; grn_ii_select_cursor **cursors; grn_bool have_error = GRN_FALSE; cursors = GRN_CALLOC(sizeof(grn_ii_select_cursor *) * n_parsed_strings); for (i = 0; (uint) i < n_parsed_strings; i++) { const char *parsed_string; unsigned int parsed_string_len; parsed_string_len = grn_vector_get_element(ctx, &parsed_strings, i, &parsed_string, NULL, NULL); cursors[i] = grn_ii_select_cursor_open(ctx, ii, parsed_string, parsed_string_len, optarg); if (!cursors[i]) { have_error = GRN_TRUE; break; } } while (!have_error) { grn_ii_select_cursor_posting *posting; uint32_t pos; posting = grn_ii_select_cursor_next(ctx, cursors[0]); if (!posting) { break; 
} pos = posting->end_pos; for (i = 1; (uint) i < n_parsed_strings; i++) { grn_ii_select_cursor_posting *posting_i; for (;;) { posting_i = grn_ii_select_cursor_next(ctx, cursors[i]); if (!posting_i) { break; } if (posting_i->rid == posting->rid && posting_i->sid == posting->sid && posting_i->start_pos > pos) { grn_ii_select_cursor_unshift(ctx, cursors[i], posting_i); break; } if (posting_i->rid > posting->rid) { grn_ii_select_cursor_unshift(ctx, cursors[i], posting_i); break; } } if (!posting_i) { break; } if (posting_i->rid != posting->rid || posting_i->sid != posting->sid) { break; } pos = posting_i->end_pos; } if ((uint) i == n_parsed_strings) { grn_rset_posinfo pi = {posting->rid, posting->sid, pos}; double record_score = 1.0; res_add(ctx, s, &pi, record_score, op); } } for (i = 0; (uint) i < n_parsed_strings; i++) { if (cursors[i]) { grn_ii_select_cursor_close(ctx, cursors[i]); } } GRN_FREE(cursors); } GRN_OBJ_FIN(ctx, &parsed_strings); if (optarg) { optarg->mode = GRN_OP_REGEXP; } return rc; } #ifdef GRN_II_SELECT_ENABLE_SEQUENTIAL_SEARCH static grn_bool grn_ii_select_sequential_search_should_use(grn_ctx *ctx, grn_ii *ii, const char *raw_query, unsigned int raw_query_len, grn_hash *result, grn_operator op, grn_wv_mode wvm, grn_select_optarg *optarg, token_info **token_infos, uint32_t n_token_infos, double too_many_index_match_ratio) { int n_sources; if (too_many_index_match_ratio < 0.0) { return GRN_FALSE; } if (op != GRN_OP_AND) { return GRN_FALSE; } if (optarg->mode != GRN_OP_EXACT) { return GRN_FALSE; } n_sources = ii->obj.source_size / sizeof(grn_id); if (n_sources == 0) { return GRN_FALSE; } { uint32_t i; int n_existing_records; n_existing_records = GRN_HASH_SIZE(result); for (i = 0; i < n_token_infos; i++) { token_info *info = token_infos[i]; if (n_existing_records <= (info->size * too_many_index_match_ratio)) { return GRN_TRUE; } } return GRN_FALSE; } } static void grn_ii_select_sequential_search_body(grn_ctx *ctx, grn_ii *ii, grn_obj *normalizer, 
grn_encoding encoding, OnigRegex regex, grn_hash *result, grn_operator op, grn_wv_mode wvm, grn_select_optarg *optarg) { int i, n_sources; grn_id *source_ids = ii->obj.source; grn_obj buffer; GRN_TEXT_INIT(&buffer, 0); n_sources = ii->obj.source_size / sizeof(grn_id); for (i = 0; i < n_sources; i++) { grn_id source_id = source_ids[i]; grn_obj *source; grn_obj *accessor; source = grn_ctx_at(ctx, source_id); switch (source->header.type) { case GRN_TABLE_HASH_KEY : case GRN_TABLE_PAT_KEY : case GRN_TABLE_DAT_KEY : accessor = grn_obj_column(ctx, (grn_obj *)result, GRN_COLUMN_NAME_KEY, GRN_COLUMN_NAME_KEY_LEN); break; default : { char column_name[GRN_TABLE_MAX_KEY_SIZE]; int column_name_size; column_name_size = grn_column_name(ctx, source, column_name, GRN_TABLE_MAX_KEY_SIZE); accessor = grn_obj_column(ctx, (grn_obj *)result, column_name, column_name_size); } break; } { grn_hash_cursor *cursor; grn_id id; cursor = grn_hash_cursor_open(ctx, result, NULL, 0, NULL, 0, 0, -1, 0); while ((id = grn_hash_cursor_next(ctx, cursor)) != GRN_ID_NIL) { OnigPosition position; grn_obj *value; const char *normalized_value; unsigned int normalized_value_length; GRN_BULK_REWIND(&buffer); grn_obj_get_value(ctx, accessor, id, &buffer); value = grn_string_open_(ctx, GRN_TEXT_VALUE(&buffer), GRN_TEXT_LEN(&buffer), normalizer, 0, encoding); grn_string_get_normalized(ctx, value, &normalized_value, &normalized_value_length, NULL); position = onig_search(regex, normalized_value, normalized_value + normalized_value_length, normalized_value, normalized_value + normalized_value_length, NULL, 0); if (position != ONIG_MISMATCH) { grn_id *record_id; grn_rset_posinfo info; double score; grn_hash_cursor_get_key(ctx, cursor, (void **)&record_id); info.rid = *record_id; info.sid = i + 1; info.pos = 0; score = get_weight(ctx, result, info.rid, info.sid, wvm, optarg); res_add(ctx, result, &info, score, op); } grn_obj_unlink(ctx, value); } grn_hash_cursor_close(ctx, cursor); } grn_obj_unlink(ctx, accessor); 
} grn_obj_unlink(ctx, &buffer); } static grn_bool grn_ii_select_sequential_search(grn_ctx *ctx, grn_ii *ii, const char *raw_query, unsigned int raw_query_len, grn_hash *result, grn_operator op, grn_wv_mode wvm, grn_select_optarg *optarg, token_info **token_infos, uint32_t n_token_infos) { grn_bool processed = GRN_TRUE; { if (!grn_ii_select_sequential_search_should_use(ctx, ii, raw_query, raw_query_len, result, op, wvm, optarg, token_infos, n_token_infos, grn_ii_select_too_many_index_match_ratio)) { return GRN_FALSE; } } { grn_encoding encoding; grn_obj *normalizer; int nflags = 0; grn_obj *query; const char *normalized_query; unsigned int normalized_query_length; grn_table_get_info(ctx, ii->lexicon, NULL, &encoding, NULL, &normalizer, NULL); query = grn_string_open_(ctx, raw_query, raw_query_len, normalizer, nflags, encoding); grn_string_get_normalized(ctx, query, &normalized_query, &normalized_query_length, NULL); { OnigRegex regex; int onig_result; OnigErrorInfo error_info; onig_result = onig_new(®ex, normalized_query, normalized_query + normalized_query_length, ONIG_OPTION_NONE, ONIG_ENCODING_UTF8, ONIG_SYNTAX_ASIS, &error_info); if (onig_result == ONIG_NORMAL) { grn_ii_select_sequential_search_body(ctx, ii, normalizer, encoding, regex, result, op, wvm, optarg); onig_free(regex); } else { char message[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str(message, onig_result, error_info); GRN_LOG(ctx, GRN_LOG_WARNING, "[ii][select][sequential] " "failed to create regular expression object: %-.256s", message); processed = GRN_FALSE; } } grn_obj_unlink(ctx, query); } return processed; } #endif grn_rc grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_len, grn_hash *s, grn_operator op, grn_select_optarg *optarg) { btr *bt = NULL; grn_rc rc = GRN_SUCCESS; int rep, orp, weight, max_interval = 0; token_info *ti, **tis = NULL, **tip, **tie; uint32_t n = 0, rid, sid, nrid, nsid; grn_bool only_skip_token = GRN_FALSE; grn_operator mode = 
GRN_OP_EXACT; grn_wv_mode wvm = grn_wv_none; grn_obj *lexicon = ii->lexicon; grn_scorer_score_func *score_func = NULL; grn_scorer_matched_record record; grn_id previous_min = GRN_ID_NIL; grn_id current_min = GRN_ID_NIL; grn_bool set_min_enable_for_and_query = GRN_FALSE; if (!lexicon || !ii || !s) { return GRN_INVALID_ARGUMENT; } if (optarg) { mode = optarg->mode; if (optarg->func) { wvm = grn_wv_dynamic; } else if (optarg->vector_size) { wvm = optarg->weight_vector ? grn_wv_static : grn_wv_constant; } if (optarg->match_info) { if (optarg->match_info->flags & GRN_MATCH_INFO_GET_MIN_RECORD_ID) { previous_min = optarg->match_info->min; set_min_enable_for_and_query = GRN_TRUE; } } } if (mode == GRN_OP_SIMILAR) { return grn_ii_similar_search(ctx, ii, string, string_len, s, op, optarg); } if (mode == GRN_OP_TERM_EXTRACT) { return grn_ii_term_extract(ctx, ii, string, string_len, s, op, optarg); } if (mode == GRN_OP_REGEXP) { return grn_ii_select_regexp(ctx, ii, string, string_len, s, op, optarg); } /* todo : support subrec rep = (s->record_unit == grn_rec_position || s->subrec_unit == grn_rec_position); orp = (s->record_unit == grn_rec_position || op == GRN_OP_OR); */ rep = 0; orp = op == GRN_OP_OR; if (!string_len) { goto exit; } if (!(tis = GRN_MALLOC(sizeof(token_info *) * string_len * 2))) { return GRN_NO_MEMORY_AVAILABLE; } if (mode == GRN_OP_FUZZY) { if (token_info_build_fuzzy(ctx, lexicon, ii, string, string_len, tis, &n, &only_skip_token, previous_min, mode, &(optarg->fuzzy)) || !n) { goto exit; } } else { if (token_info_build(ctx, lexicon, ii, string, string_len, tis, &n, &only_skip_token, previous_min, mode) || !n) { goto exit; } } switch (mode) { case GRN_OP_NEAR2 : token_info_clear_offset(tis, n); mode = GRN_OP_NEAR; /* fallthru */ case GRN_OP_NEAR : if (!(bt = bt_open(ctx, n))) { rc = GRN_NO_MEMORY_AVAILABLE; goto exit; } max_interval = optarg->max_interval; break; default : break; } qsort(tis, n, sizeof(token_info *), token_compare); tie = tis + n; /* for 
(tip = tis; tip < tie; tip++) { ti = *tip; grn_log("o=%d n=%d s=%d r=%d", ti->offset, ti->ntoken, ti->size, ti->rid); } */ GRN_LOG(ctx, GRN_LOG_INFO, "n=%d (%.*s)", n, string_len, string); /* todo : array as result if (n == 1 && (*tis)->cursors->n_entries == 1 && op == GRN_OP_OR && !GRN_HASH_SIZE(s) && !s->garbages && s->record_unit == grn_rec_document && !s->max_n_subrecs && grn_ii_max_section(ii) == 1) { grn_ii_cursor *c = (*tis)->cursors->bins[0]; if ((rc = grn_hash_array_init(s, (*tis)->size + 32768))) { goto exit; } do { grn_rset_recinfo *ri; grn_posting *p = c->post; if ((weight = get_weight(ctx, s, p->rid, p->sid, wvm, optarg))) { GRN_HASH_INT_ADD(s, p, ri); ri->score = (p->tf + p->score) * weight; ri->n_subrecs = 1; } } while (grn_ii_cursor_next(ctx, c)); goto exit; } */ #ifdef GRN_II_SELECT_ENABLE_SEQUENTIAL_SEARCH if (grn_ii_select_sequential_search(ctx, ii, string, string_len, s, op, wvm, optarg, tis, n)) { goto exit; } #endif if (optarg && optarg->scorer) { grn_proc *scorer = (grn_proc *)(optarg->scorer); score_func = scorer->callbacks.scorer.score; record.table = grn_ctx_at(ctx, s->obj.header.domain); record.lexicon = lexicon; record.id = GRN_ID_NIL; GRN_RECORD_INIT(&(record.terms), GRN_OBJ_VECTOR, lexicon->header.domain); GRN_UINT32_INIT(&(record.term_weights), GRN_OBJ_VECTOR); record.total_term_weights = 0; record.n_documents = grn_table_size(ctx, record.table); record.n_occurrences = 0; record.n_candidates = 0; record.n_tokens = 0; record.weight = 0; record.args_expr = optarg->scorer_args_expr; record.args_expr_offset = optarg->scorer_args_expr_offset; } for (;;) { rid = (*tis)->p->rid; sid = (*tis)->p->sid; for (tip = tis + 1, nrid = rid, nsid = sid + 1; tip < tie; tip++) { ti = *tip; if (token_info_skip(ctx, ti, rid, sid)) { goto exit; } if (ti->p->rid != rid || ti->p->sid != sid) { nrid = ti->p->rid; nsid = ti->p->sid; break; } } weight = get_weight(ctx, s, rid, sid, wvm, optarg); if (tip == tie && weight != 0) { grn_rset_posinfo pi = {rid, sid, 
0}; if (orp || grn_hash_get(ctx, s, &pi, s->key_size, NULL)) { int count = 0, noccur = 0, pos = 0, score = 0, tscore = 0, min, max; if (score_func) { GRN_BULK_REWIND(&(record.terms)); GRN_BULK_REWIND(&(record.term_weights)); record.n_candidates = 0; record.n_tokens = 0; } #define SKIP_OR_BREAK(pos) {\ if (token_info_skip_pos(ctx, ti, rid, sid, pos)) { break; } \ if (ti->p->rid != rid || ti->p->sid != sid) { \ nrid = ti->p->rid; \ nsid = ti->p->sid; \ break; \ } \ } if (n == 1 && !rep) { noccur = (*tis)->p->tf; tscore = (*tis)->p->weight + (*tis)->cursors->bins[0]->weight; if (score_func) { GRN_RECORD_PUT(ctx, &(record.terms), (*tis)->cursors->bins[0]->id); GRN_UINT32_PUT(ctx, &(record.term_weights), tscore); record.n_occurrences = noccur; record.n_candidates = (*tis)->size; record.n_tokens = (*tis)->ntoken; } } else if (mode == GRN_OP_NEAR) { bt_zap(bt); for (tip = tis; tip < tie; tip++) { ti = *tip; SKIP_OR_BREAK(pos); bt_push(bt, ti); } if (tip == tie) { for (;;) { ti = bt->min; min = ti->pos; max = bt->max->pos; if (min > max) { char ii_name[GRN_TABLE_MAX_KEY_SIZE]; int ii_name_size; ii_name_size = grn_obj_name(ctx, (grn_obj *)ii, ii_name, GRN_TABLE_MAX_KEY_SIZE); ERR(GRN_FILE_CORRUPT, "[ii][select][near] " "max position must be larger than min position: " "min:<%d> max:<%d> ii:<%.*s> string:<%.*s>", min, max, ii_name_size, ii_name, string_len, string); rc = ctx->rc; goto exit; } if ((max_interval < 0) || (max - min <= max_interval)) { if (rep) { pi.pos = min; res_add(ctx, s, &pi, weight, op); } noccur++; if (ti->pos == max + 1) { break; } SKIP_OR_BREAK(max + 1); } else { if (ti->pos == max - max_interval) { break; } SKIP_OR_BREAK(max - max_interval); } bt_pop(bt); } } } else { for (tip = tis; ; tip++) { if (tip == tie) { tip = tis; } ti = *tip; SKIP_OR_BREAK(pos); if (ti->pos == pos) { score += ti->p->weight + ti->cursors->bins[0]->weight; count++; } else { score = ti->p->weight + ti->cursors->bins[0]->weight; count = 1; pos = ti->pos; if (noccur == 0 && 
score_func) { GRN_BULK_REWIND(&(record.terms)); GRN_BULK_REWIND(&(record.term_weights)); record.n_candidates = 0; record.n_tokens = 0; } } if (noccur == 0 && score_func) { GRN_RECORD_PUT(ctx, &(record.terms), ti->cursors->bins[0]->id); GRN_UINT32_PUT(ctx, &(record.term_weights), ti->p->weight + ti->cursors->bins[0]->weight); record.n_candidates += ti->size; record.n_tokens += ti->ntoken; } if ((uint) count == n) { if (rep) { pi.pos = pos; res_add(ctx, s, &pi, (score + 1) * weight, op); } tscore += score; score = 0; count = 0; pos++; noccur++; } } } if (noccur && !rep) { double record_score; if (score_func) { record.id = rid; record.weight = weight; record.n_occurrences = noccur; record.total_term_weights = tscore; record_score = score_func(ctx, &record) * weight; } else { record_score = (noccur + tscore) * weight; } if (set_min_enable_for_and_query) { if (current_min == GRN_ID_NIL) { current_min = rid; } } res_add(ctx, s, &pi, record_score, op); } #undef SKIP_OR_BREAK } } if (token_info_skip(ctx, *tis, nrid, nsid)) { goto exit; } } exit : if (score_func) { GRN_OBJ_FIN(ctx, &(record.terms)); GRN_OBJ_FIN(ctx, &(record.term_weights)); } if (set_min_enable_for_and_query) { if (current_min > previous_min) { optarg->match_info->min = current_min; } } for (tip = tis; tip < tis + n; tip++) { if (*tip) { token_info_close(ctx, *tip); } } if (tis) { GRN_FREE(tis); } if (!only_skip_token) { grn_ii_resolve_sel_and(ctx, s, op); } // grn_hash_cursor_clear(r); bt_close(ctx, bt); #ifdef DEBUG { uint32_t segno = GRN_II_MAX_LSEG, nnref = 0; grn_io_mapinfo *info = ii->seg->maps; for (; segno; segno--, info++) { if (info->nref) { nnref++; } } GRN_LOG(ctx, GRN_LOG_INFO, "nnref=%d", nnref); } #endif /* DEBUG */ return rc; } static uint32_t grn_ii_estimate_size_for_query_regexp(grn_ctx *ctx, grn_ii *ii, const char *query, unsigned int query_len, grn_search_optarg *optarg) { grn_rc rc; grn_obj parsed_query; uint32_t size; GRN_TEXT_INIT(&parsed_query, 0); rc = grn_ii_parse_regexp_query(ctx, 
"[ii][estimate-size][query][regexp]", query, query_len, &parsed_query); if (rc != GRN_SUCCESS) { GRN_OBJ_FIN(ctx, &parsed_query); return 0; } if (optarg) { optarg->mode = GRN_OP_EXACT; } size = grn_ii_estimate_size_for_query(ctx, ii, GRN_TEXT_VALUE(&parsed_query), GRN_TEXT_LEN(&parsed_query), optarg); GRN_OBJ_FIN(ctx, &parsed_query); if (optarg) { optarg->mode = GRN_OP_REGEXP; } return size; } uint32_t grn_ii_estimate_size_for_query(grn_ctx *ctx, grn_ii *ii, const char *query, unsigned int query_len, grn_search_optarg *optarg) { grn_rc rc; grn_obj *lexicon = ii->lexicon; token_info **tis = NULL; uint32_t i; uint32_t n_tis = 0; grn_bool only_skip_token = GRN_FALSE; grn_operator mode = GRN_OP_EXACT; double estimated_size = 0; double normalized_ratio = 1.0; grn_id min = GRN_ID_NIL; if (query_len == 0) { return 0; } if (optarg) { switch (optarg->mode) { case GRN_OP_NEAR : case GRN_OP_NEAR2 : mode = optarg->mode; break; case GRN_OP_SIMILAR : mode = optarg->mode; break; case GRN_OP_REGEXP : mode = optarg->mode; break; case GRN_OP_FUZZY : mode = optarg->mode; default : break; } if (optarg->match_info.flags & GRN_MATCH_INFO_GET_MIN_RECORD_ID) { min = optarg->match_info.min; } } if (mode == GRN_OP_REGEXP) { return grn_ii_estimate_size_for_query_regexp(ctx, ii, query, query_len, optarg); } tis = GRN_MALLOC(sizeof(token_info *) * query_len * 2); if (!tis) { return 0; } switch (mode) { case GRN_OP_FUZZY : rc = token_info_build_fuzzy(ctx, lexicon, ii, query, query_len, tis, &n_tis, &only_skip_token, min, mode, &(optarg->fuzzy)); break; default : rc = token_info_build(ctx, lexicon, ii, query, query_len, tis, &n_tis, &only_skip_token, min, mode); break; } if (rc != GRN_SUCCESS) { goto exit; } for (i = 0; i < n_tis; i++) { token_info *ti = tis[i]; double term_estimated_size; term_estimated_size = ((double)ti->size / ti->ntoken); if (i == 0) { estimated_size = term_estimated_size; } else { if (term_estimated_size < estimated_size) { estimated_size = term_estimated_size; } 
normalized_ratio *= grn_ii_estimate_size_for_query_reduce_ratio; } } estimated_size *= normalized_ratio; if (estimated_size > 0.0 && estimated_size < 1.0) { estimated_size = 1.0; } exit : for (i = 0; i < n_tis; i++) { token_info *ti = tis[i]; if (ti) { token_info_close(ctx, ti); } } if (tis) { GRN_FREE(tis); } return estimated_size; } uint32_t grn_ii_estimate_size_for_lexicon_cursor(grn_ctx *ctx, grn_ii *ii, grn_table_cursor *lexicon_cursor) { grn_id term_id; uint32_t estimated_size = 0; while ((term_id = grn_table_cursor_next(ctx, lexicon_cursor)) != GRN_ID_NIL) { uint32_t term_estimated_size; term_estimated_size = grn_ii_estimate_size(ctx, ii, term_id); estimated_size += term_estimated_size; } return estimated_size; } grn_rc grn_ii_sel(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_len, grn_hash *s, grn_operator op, grn_search_optarg *optarg) { ERRCLR(ctx); GRN_LOG(ctx, GRN_LOG_INFO, "grn_ii_sel > (%.*s)", string_len, string); { grn_select_optarg arg; if (!s) { return GRN_INVALID_ARGUMENT; } memset(&arg, 0, sizeof(grn_select_optarg)); arg.mode = GRN_OP_EXACT; if (optarg) { switch (optarg->mode) { case GRN_OP_NEAR : case GRN_OP_NEAR2 : arg.mode = optarg->mode; arg.max_interval = optarg->max_interval; break; case GRN_OP_SIMILAR : arg.mode = optarg->mode; arg.similarity_threshold = optarg->similarity_threshold; break; case GRN_OP_REGEXP : arg.mode = optarg->mode; break; case GRN_OP_FUZZY : arg.mode = optarg->mode; arg.fuzzy = optarg->fuzzy; break; default : break; } if (optarg->vector_size != 0) { arg.weight_vector = optarg->weight_vector; arg.vector_size = optarg->vector_size; } arg.scorer = optarg->scorer; arg.scorer_args_expr = optarg->scorer_args_expr; arg.scorer_args_expr_offset = optarg->scorer_args_expr_offset; arg.match_info = &(optarg->match_info); } /* todo : support subrec grn_rset_init(ctx, s, grn_rec_document, 0, grn_rec_none, 0, 0); */ if (grn_ii_select(ctx, ii, string, string_len, s, op, &arg)) { GRN_LOG(ctx, GRN_LOG_ERROR, 
"grn_ii_select on grn_ii_sel(1) failed !"); return ctx->rc; } GRN_LOG(ctx, GRN_LOG_INFO, "exact: %d", GRN_HASH_SIZE(s)); if (op == GRN_OP_OR) { grn_id min = GRN_ID_NIL; if ((int64_t)GRN_HASH_SIZE(s) <= ctx->impl->match_escalation_threshold) { arg.mode = GRN_OP_UNSPLIT; if (arg.match_info) { if (arg.match_info->flags & GRN_MATCH_INFO_GET_MIN_RECORD_ID) { min = arg.match_info->min; arg.match_info->min = GRN_ID_NIL; } } if (grn_ii_select(ctx, ii, string, string_len, s, op, &arg)) { GRN_LOG(ctx, GRN_LOG_ERROR, "grn_ii_select on grn_ii_sel(2) failed !"); return ctx->rc; } GRN_LOG(ctx, GRN_LOG_INFO, "unsplit: %d", GRN_HASH_SIZE(s)); if (arg.match_info) { if (arg.match_info->flags & GRN_MATCH_INFO_GET_MIN_RECORD_ID) { if (min > GRN_ID_NIL && min < arg.match_info->min) { arg.match_info->min = min; } } } } if ((int64_t)GRN_HASH_SIZE(s) <= ctx->impl->match_escalation_threshold) { arg.mode = GRN_OP_PARTIAL; if (arg.match_info) { if (arg.match_info->flags & GRN_MATCH_INFO_GET_MIN_RECORD_ID) { min = arg.match_info->min; arg.match_info->min = GRN_ID_NIL; } } if (grn_ii_select(ctx, ii, string, string_len, s, op, &arg)) { GRN_LOG(ctx, GRN_LOG_ERROR, "grn_ii_select on grn_ii_sel(3) failed !"); return ctx->rc; } GRN_LOG(ctx, GRN_LOG_INFO, "partial: %d", GRN_HASH_SIZE(s)); if (arg.match_info) { if (arg.match_info->flags & GRN_MATCH_INFO_GET_MIN_RECORD_ID) { if (min > GRN_ID_NIL && min < arg.match_info->min) { arg.match_info->min = min; } } } } } GRN_LOG(ctx, GRN_LOG_INFO, "hits=%d", GRN_HASH_SIZE(s)); return GRN_SUCCESS; } } grn_rc grn_ii_at(grn_ctx *ctx, grn_ii *ii, grn_id id, grn_hash *s, grn_operator op) { int rep = 0; grn_ii_cursor *c; grn_posting *pos; if ((c = grn_ii_cursor_open(ctx, ii, id, GRN_ID_NIL, GRN_ID_MAX, rep ? 
ii->n_elements : ii->n_elements - 1, 0))) { while ((pos = grn_ii_cursor_next(ctx, c))) { res_add(ctx, s, (grn_rset_posinfo *) pos, (1 + pos->weight), op); } grn_ii_cursor_close(ctx, c); } return ctx->rc; } void grn_ii_resolve_sel_and(grn_ctx *ctx, grn_hash *s, grn_operator op) { if (op == GRN_OP_AND && !(ctx->flags & GRN_CTX_TEMPORARY_DISABLE_II_RESOLVE_SEL_AND)) { grn_id eid; grn_rset_recinfo *ri; grn_hash_cursor *c = grn_hash_cursor_open(ctx, s, NULL, 0, NULL, 0, 0, -1, 0); if (c) { while ((eid = grn_hash_cursor_next(ctx, c))) { grn_hash_cursor_get_value(ctx, c, (void **) &ri); if ((ri->n_subrecs & GRN_RSET_UTIL_BIT)) { ri->n_subrecs &= ~GRN_RSET_UTIL_BIT; } else { grn_hash_delete_by_id(ctx, s, eid, NULL); } } grn_hash_cursor_close(ctx, c); } } } void grn_ii_cursor_inspect(grn_ctx *ctx, grn_ii_cursor *c, grn_obj *buf) { grn_obj key_buf; char key[GRN_TABLE_MAX_KEY_SIZE]; int key_size; int i = 0; grn_ii_cursor_next_options options = { .include_garbage = GRN_TRUE }; GRN_TEXT_PUTS(ctx, buf, " #<"); key_size = grn_table_get_key(ctx, c->ii->lexicon, c->id, key, GRN_TABLE_MAX_KEY_SIZE); GRN_OBJ_INIT(&key_buf, GRN_BULK, 0, c->ii->lexicon->header.domain); GRN_TEXT_SET(ctx, &key_buf, key, key_size); grn_inspect(ctx, buf, &key_buf); GRN_OBJ_FIN(ctx, &key_buf); GRN_TEXT_PUTS(ctx, buf, "\n elements:[\n "); while (grn_ii_cursor_next_internal(ctx, c, &options)) { grn_posting *pos = c->post; if (i > 0) { GRN_TEXT_PUTS(ctx, buf, ",\n "); } i++; GRN_TEXT_PUTS(ctx, buf, "{status:"); if (pos->tf && pos->sid) { GRN_TEXT_PUTS(ctx, buf, "available"); } else { GRN_TEXT_PUTS(ctx, buf, "garbage"); } GRN_TEXT_PUTS(ctx, buf, ", rid:"); grn_text_lltoa(ctx, buf, pos->rid); GRN_TEXT_PUTS(ctx, buf, ", sid:"); grn_text_lltoa(ctx, buf, pos->sid); GRN_TEXT_PUTS(ctx, buf, ", pos:"); grn_text_lltoa(ctx, buf, pos->pos); GRN_TEXT_PUTS(ctx, buf, ", tf:"); grn_text_lltoa(ctx, buf, pos->tf); GRN_TEXT_PUTS(ctx, buf, ", weight:"); grn_text_lltoa(ctx, buf, pos->weight); GRN_TEXT_PUTS(ctx, buf, ", rest:"); 
grn_text_lltoa(ctx, buf, pos->rest); GRN_TEXT_PUTS(ctx, buf, "}"); } GRN_TEXT_PUTS(ctx, buf, "\n ]\n >"); } void grn_ii_inspect_values(grn_ctx *ctx, grn_ii *ii, grn_obj *buf) { grn_table_cursor *tc; GRN_TEXT_PUTS(ctx, buf, "["); if ((tc = grn_table_cursor_open(ctx, ii->lexicon, NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_ASCENDING))) { int i = 0; grn_id tid; grn_ii_cursor *c; while ((tid = grn_table_cursor_next(ctx, tc))) { if (i > 0) { GRN_TEXT_PUTS(ctx, buf, ","); } i++; GRN_TEXT_PUTS(ctx, buf, "\n"); if ((c = grn_ii_cursor_open(ctx, ii, tid, GRN_ID_NIL, GRN_ID_MAX, ii->n_elements, GRN_OBJ_WITH_POSITION|GRN_OBJ_WITH_SECTION))) { grn_ii_cursor_inspect(ctx, c, buf); grn_ii_cursor_close(ctx, c); } } grn_table_cursor_close(ctx, tc); } GRN_TEXT_PUTS(ctx, buf, "]"); } /********************** buffered index builder ***********************/ const grn_id II_BUFFER_TYPE_MASK = 0xc0000000; #define II_BUFFER_TYPE_RID 0x80000000 #define II_BUFFER_TYPE_WEIGHT 0x40000000 #define II_BUFFER_TYPE(id) (((id) & II_BUFFER_TYPE_MASK)) #define II_BUFFER_PACK(value, type) ((value) | (type)) #define II_BUFFER_UNPACK(id, type) ((id) & ~(type)) #define II_BUFFER_ORDER GRN_CURSOR_BY_KEY const uint16_t II_BUFFER_NTERMS_PER_BUFFER = 16380; const uint32_t II_BUFFER_PACKED_BUF_SIZE = 0x4000000; const char *TMPFILE_PATH = "grn_ii_buffer_tmp"; const uint32_t II_BUFFER_NCOUNTERS_MARGIN = 0x100000; const size_t II_BUFFER_BLOCK_SIZE = 0x1000000; const uint32_t II_BUFFER_BLOCK_READ_UNIT_SIZE = 0x200000; typedef struct { unsigned int sid; /* Section ID */ unsigned int weight; /* Weight */ const char *p; /* Value address */ uint32_t len; /* Value length */ char *buf; /* Buffer address */ uint32_t cap; /* Buffer size */ } ii_buffer_value; /* ii_buffer_counter is associated with a combination of a block an a term. 
*/ typedef struct { uint32_t nrecs; /* Number of records or sections */ uint32_t nposts; /* Number of occurrences */ /* Information of the last value */ grn_id last_rid; /* Record ID */ uint32_t last_sid; /* Section ID */ uint32_t last_tf; /* Term frequency */ uint32_t last_weight; /* Total weight */ uint32_t last_pos; /* Token position */ /* Meaning of offset_* is different before/after encoding. */ /* Before encoding: size in encoded sequence */ /* After encoding: Offset in encoded sequence */ uint32_t offset_rid; /* Record ID */ uint32_t offset_sid; /* Section ID */ uint32_t offset_tf; /* Term frequency */ uint32_t offset_weight; /* Weight */ uint32_t offset_pos; /* Token position */ } ii_buffer_counter; typedef struct { off64_t head; off64_t tail; uint32_t nextsize; uint8_t *buffer; uint32_t buffersize; uint8_t *bufcur; uint32_t rest; grn_id tid; uint32_t nrecs; uint32_t nposts; grn_id *recs; uint32_t *tfs; uint32_t *posts; } ii_buffer_block; struct _grn_ii_buffer { grn_obj *lexicon; /* Global lexicon */ grn_obj *tmp_lexicon; /* Temporary lexicon for each block */ ii_buffer_block *blocks; /* Blocks */ uint32_t nblocks; /* Number of blocks */ int tmpfd; /* Descriptor of temporary file */ char tmpfpath[PATH_MAX]; /* Path of temporary file */ uint64_t update_buffer_size; // stuff for parsing off64_t filepos; /* Write position of temporary file */ grn_id *block_buf; /* Buffer for the current block */ size_t block_buf_size; /* Size of block_buf */ size_t block_pos; /* Write position of block_buf */ ii_buffer_counter *counters; /* Status of terms */ uint32_t ncounters; /* Number of counters */ size_t total_size; size_t curr_size; ii_buffer_value *values; /* Values in block */ unsigned int nvalues; /* Number of values in block */ unsigned int max_nvalues; /* Size of values */ grn_id last_rid; // stuff for merging grn_ii *ii; uint32_t lseg; uint32_t dseg; buffer *term_buffer; datavec data_vectors[MAX_N_ELEMENTS + 1]; uint8_t *packed_buf; size_t packed_buf_size; size_t 
packed_len; size_t total_chunk_size; }; /* block_new returns a new ii_buffer_block to store block information. */ static ii_buffer_block * block_new(grn_ctx *ctx, grn_ii_buffer *ii_buffer) { ii_buffer_block *block; if (!(ii_buffer->nblocks & 0x3ff)) { ii_buffer_block *blocks; if (!(blocks = GRN_REALLOC(ii_buffer->blocks, (ii_buffer->nblocks + 0x400) * sizeof(ii_buffer_block)))) { return NULL; } ii_buffer->blocks = blocks; } block = &ii_buffer->blocks[ii_buffer->nblocks]; block->head = ii_buffer->filepos; block->rest = 0; block->buffer = NULL; block->buffersize = 0; return block; } /* allocate_outbuf allocates memory to flush a block. */ static uint8_t * allocate_outbuf(grn_ctx *ctx, grn_ii_buffer *ii_buffer) { size_t bufsize = 0, bufsize_ = 0; uint32_t flags = ii_buffer->ii->header->flags; ii_buffer_counter *counter = ii_buffer->counters; grn_id tid, tid_max = grn_table_size(ctx, ii_buffer->tmp_lexicon); for (tid = 1; tid <= tid_max; counter++, tid++) { counter->offset_tf += GRN_B_ENC_SIZE(counter->last_tf - 1); counter->last_rid = 0; counter->last_tf = 0; bufsize += 5; bufsize += GRN_B_ENC_SIZE(counter->nrecs); bufsize += GRN_B_ENC_SIZE(counter->nposts); bufsize += counter->offset_rid; if ((flags & GRN_OBJ_WITH_SECTION)) { bufsize += counter->offset_sid; } bufsize += counter->offset_tf; if ((flags & GRN_OBJ_WITH_WEIGHT)) { bufsize += counter->offset_weight; } if ((flags & GRN_OBJ_WITH_POSITION)) { bufsize += counter->offset_pos; } if (bufsize_ + II_BUFFER_BLOCK_READ_UNIT_SIZE < bufsize) { bufsize += sizeof(uint32_t); bufsize_ = bufsize; } } GRN_LOG(ctx, GRN_LOG_INFO, "flushing:%d bufsize:%" GRN_FMT_SIZE, ii_buffer->nblocks, bufsize); return (uint8_t *)GRN_MALLOC(bufsize); } /* * The temporary file format is roughly as follows: * * File = Block... * Block = Unit... * Unit = TermChunk (key order) * NextUnitSize (The first unit size is kept on memory) * Chunk = Term... * Term = ID (gtid) * NumRecordsOrSections (nrecs), NumOccurrences (nposts) * RecordID... 
(rid, diff) * [SectionID... (sid, diff)] * TermFrequency... (tf, diff) * [Weight... (weight, diff)] * [Position... (pos, diff)] */ /* * encode_terms encodes terms in ii_buffer->tmp_lexicon and returns the * expected temporary file size. */ static size_t encode_terms(grn_ctx *ctx, grn_ii_buffer *ii_buffer, uint8_t *outbuf, ii_buffer_block *block) { grn_id tid; uint8_t *outbufp = outbuf; uint8_t *outbufp_ = outbuf; grn_table_cursor *tc; /* The first size is written into block->nextsize. */ uint8_t *pnext = (uint8_t *)&block->nextsize; uint32_t flags = ii_buffer->ii->header->flags; tc = grn_table_cursor_open(ctx, ii_buffer->tmp_lexicon, NULL, 0, NULL, 0, 0, -1, II_BUFFER_ORDER); while ((tid = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) { char key[GRN_TABLE_MAX_KEY_SIZE]; int key_size = grn_table_get_key(ctx, ii_buffer->tmp_lexicon, tid, key, GRN_TABLE_MAX_KEY_SIZE); /* gtid is a global term ID, not in a temporary lexicon. */ grn_id gtid = grn_table_add(ctx, ii_buffer->lexicon, key, key_size, NULL); ii_buffer_counter *counter = &ii_buffer->counters[tid - 1]; if (counter->nrecs) { uint32_t offset_rid = counter->offset_rid; uint32_t offset_sid = counter->offset_sid; uint32_t offset_tf = counter->offset_tf; uint32_t offset_weight = counter->offset_weight; uint32_t offset_pos = counter->offset_pos; GRN_B_ENC(gtid, outbufp); GRN_B_ENC(counter->nrecs, outbufp); GRN_B_ENC(counter->nposts, outbufp); ii_buffer->total_size += counter->nrecs + counter->nposts; counter->offset_rid = outbufp - outbuf; outbufp += offset_rid; if ((flags & GRN_OBJ_WITH_SECTION)) { counter->offset_sid = outbufp - outbuf; outbufp += offset_sid; } counter->offset_tf = outbufp - outbuf; outbufp += offset_tf; if ((flags & GRN_OBJ_WITH_WEIGHT)) { counter->offset_weight = outbufp - outbuf; outbufp += offset_weight; } if ((flags & GRN_OBJ_WITH_POSITION)) { counter->offset_pos = outbufp - outbuf; outbufp += offset_pos; } } if (outbufp_ + II_BUFFER_BLOCK_READ_UNIT_SIZE < outbufp) { uint32_t size = outbufp - 
outbufp_ + sizeof(uint32_t); grn_memcpy(pnext, &size, sizeof(uint32_t)); pnext = outbufp; outbufp += sizeof(uint32_t); outbufp_ = outbufp; } } grn_table_cursor_close(ctx, tc); if (outbufp_ < outbufp) { uint32_t size = outbufp - outbufp_; grn_memcpy(pnext, &size, sizeof(uint32_t)); } return outbufp - outbuf; } /* encode_postings encodes data in ii_buffer->block_buf. */ static void encode_postings(grn_ctx *ctx, grn_ii_buffer *ii_buffer, uint8_t *outbuf) { grn_id rid = 0; unsigned int sid = 1; unsigned int weight = 0; uint32_t pos = 0; uint32_t rest; grn_id *bp = ii_buffer->block_buf; uint32_t flags = ii_buffer->ii->header->flags; for (rest = ii_buffer->block_pos; rest; bp++, rest--) { grn_id id = *bp; switch (II_BUFFER_TYPE(id)) { case II_BUFFER_TYPE_RID : rid = II_BUFFER_UNPACK(id, II_BUFFER_TYPE_RID); if ((flags & GRN_OBJ_WITH_SECTION) && rest) { sid = *++bp; rest--; } weight = 0; pos = 0; break; case II_BUFFER_TYPE_WEIGHT : weight = II_BUFFER_UNPACK(id, II_BUFFER_TYPE_WEIGHT); break; default : { ii_buffer_counter *counter = &ii_buffer->counters[id - 1]; if (counter->last_rid == rid && counter->last_sid == sid) { counter->last_tf++; counter->last_weight += weight; } else { if (counter->last_tf) { uint8_t *p = outbuf + counter->offset_tf; GRN_B_ENC(counter->last_tf - 1, p); counter->offset_tf = p - outbuf; if (flags & GRN_OBJ_WITH_WEIGHT) { p = outbuf + counter->offset_weight; GRN_B_ENC(counter->last_weight, p); counter->offset_weight = p - outbuf; } } { uint8_t *p = outbuf + counter->offset_rid; GRN_B_ENC(rid - counter->last_rid, p); counter->offset_rid = p - outbuf; } if (flags & GRN_OBJ_WITH_SECTION) { uint8_t *p = outbuf + counter->offset_sid; if (counter->last_rid != rid) { GRN_B_ENC(sid - 1, p); } else { GRN_B_ENC(sid - counter->last_sid - 1, p); } counter->offset_sid = p - outbuf; } counter->last_rid = rid; counter->last_sid = sid; counter->last_tf = 1; counter->last_weight = weight; counter->last_pos = 0; } if ((flags & GRN_OBJ_WITH_POSITION) && rest) { 
uint8_t *p = outbuf + counter->offset_pos; pos = *++bp; rest--; GRN_B_ENC(pos - counter->last_pos, p); counter->offset_pos = p - outbuf; counter->last_pos = pos; } } break; } } } /* encode_last_tf encodes last_tf and last_weight in counters. */ static void encode_last_tf(grn_ctx *ctx, grn_ii_buffer *ii_buffer, uint8_t *outbuf) { ii_buffer_counter *counter = ii_buffer->counters; grn_id tid, tid_max = grn_table_size(ctx, ii_buffer->tmp_lexicon); for (tid = 1; tid <= tid_max; counter++, tid++) { uint8_t *p = outbuf + counter->offset_tf; GRN_B_ENC(counter->last_tf - 1, p); } if ((ii_buffer->ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { for (tid = 1; tid <= tid_max; counter++, tid++) { uint8_t *p = outbuf + counter->offset_weight; GRN_B_ENC(counter->last_weight, p); } } } /* * grn_ii_buffer_flush flushes the current block (ii_buffer->block_buf, * counters and tmp_lexicon) to a temporary file (ii_buffer->tmpfd). * Also, block information is stored into ii_buffer->blocks. */ static void grn_ii_buffer_flush(grn_ctx *ctx, grn_ii_buffer *ii_buffer) { size_t encsize; uint8_t *outbuf; ii_buffer_block *block; GRN_LOG(ctx, GRN_LOG_DEBUG, "flushing:%d npostings:%" GRN_FMT_SIZE, ii_buffer->nblocks, ii_buffer->block_pos); if (!(block = block_new(ctx, ii_buffer))) { return; } if (!(outbuf = allocate_outbuf(ctx, ii_buffer))) { return; } encsize = encode_terms(ctx, ii_buffer, outbuf, block); encode_postings(ctx, ii_buffer, outbuf); encode_last_tf(ctx, ii_buffer, outbuf); { ssize_t r = grn_write(ii_buffer->tmpfd, outbuf, encsize); if (r != (ssize_t) encsize) { ERR(GRN_INPUT_OUTPUT_ERROR, "write returned %" GRN_FMT_LLD " != %" GRN_FMT_LLU, (long long int)r, (unsigned long long int)encsize); GRN_FREE(outbuf); return; } ii_buffer->filepos += r; block->tail = ii_buffer->filepos; } GRN_FREE(outbuf); memset(ii_buffer->counters, 0, grn_table_size(ctx, ii_buffer->tmp_lexicon) * sizeof(ii_buffer_counter)); grn_obj_close(ctx, ii_buffer->tmp_lexicon); GRN_LOG(ctx, GRN_LOG_DEBUG, "flushed: %d 
encsize:%" GRN_FMT_SIZE, ii_buffer->nblocks, encsize); ii_buffer->tmp_lexicon = NULL; ii_buffer->nblocks++; ii_buffer->block_pos = 0; } const uint32_t PAT_CACHE_SIZE = 1<<20; /* * get_tmp_lexicon returns a temporary lexicon. * * Note that a lexicon is created for each block and ii_buffer->tmp_lexicon is * closed in grn_ii_buffer_flush. */ static grn_obj * get_tmp_lexicon(grn_ctx *ctx, grn_ii_buffer *ii_buffer) { grn_obj *tmp_lexicon = ii_buffer->tmp_lexicon; if (!tmp_lexicon) { grn_obj *domain = grn_ctx_at(ctx, ii_buffer->lexicon->header.domain); grn_obj *range = grn_ctx_at(ctx, DB_OBJ(ii_buffer->lexicon)->range); grn_obj *tokenizer; grn_obj *normalizer; grn_obj *token_filters; grn_table_flags flags; grn_table_get_info(ctx, ii_buffer->lexicon, &flags, NULL, &tokenizer, &normalizer, &token_filters); flags &= ~GRN_OBJ_PERSISTENT; tmp_lexicon = grn_table_create(ctx, NULL, 0, NULL, flags, domain, range); if (tmp_lexicon) { ii_buffer->tmp_lexicon = tmp_lexicon; grn_obj_set_info(ctx, tmp_lexicon, GRN_INFO_DEFAULT_TOKENIZER, tokenizer); grn_obj_set_info(ctx, tmp_lexicon, GRN_INFO_NORMALIZER, normalizer); grn_obj_set_info(ctx, tmp_lexicon, GRN_INFO_TOKEN_FILTERS, token_filters); if ((flags & GRN_OBJ_TABLE_TYPE_MASK) == GRN_OBJ_TABLE_PAT_KEY) { grn_pat_cache_enable(ctx, (grn_pat *)tmp_lexicon, PAT_CACHE_SIZE); } } } return tmp_lexicon; } /* get_buffer_counter returns a counter associated with tid. 
*/ static ii_buffer_counter * get_buffer_counter(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_obj *tmp_lexicon, grn_id tid) { if (tid > ii_buffer->ncounters) { ii_buffer_counter *counters; uint32_t ncounters = grn_table_size(ctx, tmp_lexicon) + II_BUFFER_NCOUNTERS_MARGIN; counters = GRN_REALLOC(ii_buffer->counters, ncounters * sizeof(ii_buffer_counter)); if (!counters) { return NULL; } memset(&counters[ii_buffer->ncounters], 0, (ncounters - ii_buffer->ncounters) * sizeof(ii_buffer_counter)); ii_buffer->ncounters = ncounters; ii_buffer->counters = counters; } return &ii_buffer->counters[tid - 1]; } /* * grn_ii_buffer_tokenize_value tokenizes a value. * * The result is written into the current block (ii_buffer->tmp_lexicon, * ii_buffer->block_buf, ii_buffer->counters, etc.). */ static void grn_ii_buffer_tokenize_value(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_id rid, const ii_buffer_value *value) { grn_obj *tmp_lexicon; if ((tmp_lexicon = get_tmp_lexicon(ctx, ii_buffer))) { unsigned int token_flags = 0; grn_token_cursor *token_cursor; grn_id *buffer = ii_buffer->block_buf; uint32_t block_pos = ii_buffer->block_pos; uint32_t ii_flags = ii_buffer->ii->header->flags; buffer[block_pos++] = II_BUFFER_PACK(rid, II_BUFFER_TYPE_RID); if (ii_flags & GRN_OBJ_WITH_SECTION) { buffer[block_pos++] = value->sid; } if (value->weight) { buffer[block_pos++] = II_BUFFER_PACK(value->weight, II_BUFFER_TYPE_WEIGHT); } if ((token_cursor = grn_token_cursor_open(ctx, tmp_lexicon, value->p, value->len, GRN_TOKEN_ADD, token_flags))) { while (!token_cursor->status) { grn_id tid; if ((tid = grn_token_cursor_next(ctx, token_cursor))) { ii_buffer_counter *counter; counter = get_buffer_counter(ctx, ii_buffer, tmp_lexicon, tid); if (!counter) { return; } buffer[block_pos++] = tid; if (ii_flags & GRN_OBJ_WITH_POSITION) { buffer[block_pos++] = token_cursor->pos; } if (counter->last_rid != rid) { counter->offset_rid += GRN_B_ENC_SIZE(rid - counter->last_rid); counter->last_rid = rid; 
counter->offset_sid += GRN_B_ENC_SIZE(value->sid - 1); counter->last_sid = value->sid; if (counter->last_tf) { counter->offset_tf += GRN_B_ENC_SIZE(counter->last_tf - 1); counter->last_tf = 0; counter->offset_weight += GRN_B_ENC_SIZE(counter->last_weight); counter->last_weight = 0; } counter->last_pos = 0; counter->nrecs++; } else if (counter->last_sid != value->sid) { counter->offset_rid += GRN_B_ENC_SIZE(0); counter->offset_sid += GRN_B_ENC_SIZE(value->sid - counter->last_sid - 1); counter->last_sid = value->sid; if (counter->last_tf) { counter->offset_tf += GRN_B_ENC_SIZE(counter->last_tf - 1); counter->last_tf = 0; counter->offset_weight += GRN_B_ENC_SIZE(counter->last_weight); counter->last_weight = 0; } counter->last_pos = 0; counter->nrecs++; } counter->offset_pos += GRN_B_ENC_SIZE(token_cursor->pos - counter->last_pos); counter->last_pos = token_cursor->pos; counter->last_tf++; counter->last_weight += value->weight; counter->nposts++; } } grn_token_cursor_close(ctx, token_cursor); } ii_buffer->block_pos = block_pos; } } /* * grn_ii_buffer_tokenize tokenizes ii_buffer->values. * * grn_ii_buffer_tokenize estimates the size of tokenized values. * If the remaining space of the current block is not enough to store the new * tokenized values, the current block is flushed. * Then, grn_ii_buffer_tokenize tokenizes values. 
*/
static void
grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_id rid)
{
  unsigned int i;
  uint32_t est_len = 0;
  /* Estimate: two entries per byte plus two entries per value. */
  for (i = 0; i < ii_buffer->nvalues; i++) {
    est_len += ii_buffer->values[i].len * 2 + 2;
  }
  if (ii_buffer->block_buf_size < ii_buffer->block_pos + est_len) {
    grn_ii_buffer_flush(ctx, ii_buffer);
  }
  /* Even an empty block is too small: enlarge the block buffer. */
  if (ii_buffer->block_buf_size < est_len) {
    grn_id *block_buf = (grn_id *)GRN_REALLOC(ii_buffer->block_buf,
                                              est_len * sizeof(grn_id));
    if (block_buf) {
      ii_buffer->block_buf = block_buf;
      ii_buffer->block_buf_size = est_len;
    }
  }

  for (i = 0; i < ii_buffer->nvalues; i++) {
    const ii_buffer_value *value = &ii_buffer->values[i];
    if (value->len) {
      uint32_t value_est_len = value->len * 2 + 2;
      /* A value that does not fit (flush or realloc failed) is skipped. */
      if (ii_buffer->block_buf_size >= ii_buffer->block_pos + value_est_len) {
        grn_ii_buffer_tokenize_value(ctx, ii_buffer, rid, value);
      }
    }
  }
  ii_buffer->nvalues = 0;
}

/*
 * grn_ii_buffer_fetch fetches the next term.
 *
 * When the current unit is used up, the next unit of the block is read from
 * the temporary file. Then the header of the next term (tid, nrecs, nposts)
 * is decoded into the block. block->tid is set to 0 when the block has no
 * more terms and also when the next unit cannot be read, so that callers
 * never see a stale or uninitialized term ID.
 */
static void
grn_ii_buffer_fetch(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
                    ii_buffer_block *block)
{
  if (!block->rest) {
    /* Read the next unit. */
    if (block->head < block->tail) {
      size_t bytesize = block->nextsize;
      if (block->buffersize < block->nextsize) {
        void *r = GRN_REALLOC(block->buffer, bytesize);
        if (r) {
          block->buffer = (uint8_t *)r;
          block->buffersize = block->nextsize;
        } else {
          GRN_LOG(ctx, GRN_LOG_WARNING, "realloc: %" GRN_FMT_LLU,
                  (unsigned long long int)bytesize);
          block->tid = 0;
          return;
        }
      }
      {
        off64_t seeked_position;
        seeked_position = grn_lseek(ii_buffer->tmpfd, block->head, SEEK_SET);
        if (seeked_position != block->head) {
          ERRNO_ERR("failed to "
                    "grn_lseek(%" GRN_FMT_OFF64_T ") -> %" GRN_FMT_OFF64_T,
                    block->head,
                    seeked_position);
          block->tid = 0;
          return;
        }
      }
      {
        size_t read_bytesize;
        read_bytesize = grn_read(ii_buffer->tmpfd, block->buffer, bytesize);
        if (read_bytesize != bytesize) {
          SERR("failed to grn_read(%" GRN_FMT_SIZE ") -> %" GRN_FMT_SIZE,
               bytesize, read_bytesize);
          block->tid = 0;
          return;
        }
      }
      block->head += bytesize;
      block->bufcur = block->buffer;
      if (block->head >= block->tail) {
        /* Last unit of the block: no next unit size is appended. */
        if (block->head > block->tail) {
          GRN_LOG(ctx, GRN_LOG_WARNING,
                  "fetch error: %" GRN_FMT_INT64D " > %" GRN_FMT_INT64D,
                  block->head, block->tail);
        }
        block->rest = block->nextsize;
        block->nextsize = 0;
      } else {
        /* The last uint32_t of the unit is the size of the next unit. */
        block->rest = block->nextsize - sizeof(uint32_t);
        grn_memcpy(&block->nextsize,
                   &block->buffer[block->rest], sizeof(uint32_t));
      }
    }
  }
  if (block->rest) {
    uint8_t *p = block->bufcur;
    GRN_B_DEC(block->tid, p);
    GRN_B_DEC(block->nrecs, p);
    GRN_B_DEC(block->nposts, p);
    block->rest -= (p - block->bufcur);
    block->bufcur = p;
  } else {
    block->tid = 0;
  }
}

/* grn_ii_buffer_chunk_flush flushes the current buffer for packed postings.
*/
static void
grn_ii_buffer_chunk_flush(grn_ctx *ctx, grn_ii_buffer *ii_buffer)
{
  grn_io_win io_win;
  uint32_t chunk_number;
  /* NOTE(review): the result of chunk_new is not checked here. */
  chunk_new(ctx, ii_buffer->ii, &chunk_number, ii_buffer->packed_len);
  GRN_LOG(ctx, GRN_LOG_INFO, "chunk:%d, packed_len:%" GRN_FMT_SIZE,
          chunk_number, ii_buffer->packed_len);
  fake_map(ctx, ii_buffer->ii->chunk, &io_win, ii_buffer->packed_buf,
           chunk_number, ii_buffer->packed_len);
  grn_io_win_unmap(&io_win);
  /* Complete the header of the term buffer that refers to the chunk. */
  ii_buffer->term_buffer->header.chunk = chunk_number;
  ii_buffer->term_buffer->header.chunk_size = ii_buffer->packed_len;
  ii_buffer->term_buffer->header.buffer_free =
    S_SEGMENT - sizeof(buffer_header) -
    ii_buffer->term_buffer->header.nterms * sizeof(buffer_term);
  ii_buffer->term_buffer->header.nterms_void = 0;
  buffer_segment_update(ii_buffer->ii, ii_buffer->lseg, ii_buffer->dseg);
  ii_buffer->ii->header->total_chunk_size += ii_buffer->packed_len;
  ii_buffer->total_chunk_size += ii_buffer->packed_len;
  GRN_LOG(ctx, GRN_LOG_DEBUG,
          "nterms=%d chunk=%d total=%" GRN_FMT_INT64U "KB",
          ii_buffer->term_buffer->header.nterms,
          ii_buffer->term_buffer->header.chunk_size,
          ii_buffer->ii->header->total_chunk_size >> 10);
  /*
   * Start over with no term buffer and no packed buffer; both are obtained
   * again on demand by the next merge.
   * NOTE(review): packed_buf is dropped without GRN_FREE here; presumably
   * fake_map/grn_io_win_unmap take care of it -- confirm.
   */
  ii_buffer->term_buffer = NULL;
  ii_buffer->packed_buf = NULL;
  ii_buffer->packed_len = 0;
  ii_buffer->packed_buf_size = 0;
  ii_buffer->curr_size = 0;
}

/*
 * merge_hit_blocks merges hit blocks into ii_buffer->data_vectors.
 * merge_hit_blocks returns the estimated maximum size in bytes.
*/
static size_t
merge_hit_blocks(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
                 ii_buffer_block *hits[], int nhits)
{
  uint64_t nrecs = 0;
  uint64_t nposts = 0;
  size_t max_size;
  uint64_t flags = ii_buffer->ii->header->flags;
  int i;
  /* Sum up the postings of the current term over all hit blocks. */
  for (i = 0; i < nhits; i++) {
    ii_buffer_block *block = hits[i];
    nrecs += block->nrecs;
    nposts += block->nposts;
  }
  ii_buffer->curr_size += nrecs + nposts;
  max_size = nrecs * (ii_buffer->ii->n_elements);
  if (flags & GRN_OBJ_WITH_POSITION) { max_size += nposts - nrecs; }
  datavec_reset(ctx, ii_buffer->data_vectors,
                ii_buffer->ii->n_elements, nrecs, max_size);
  {
    int i;
    uint32_t lr = 0; /* Last rid */
    uint64_t spos = 0;
    uint32_t *ridp, *sidp = NULL, *tfp, *weightp = NULL, *posp = NULL;
    {
      /* Get write positions in datavec. */
      int j = 0;
      ridp = ii_buffer->data_vectors[j++].data;
      if (flags & GRN_OBJ_WITH_SECTION) {
        sidp = ii_buffer->data_vectors[j++].data;
      }
      tfp = ii_buffer->data_vectors[j++].data;
      if (flags & GRN_OBJ_WITH_WEIGHT) {
        weightp = ii_buffer->data_vectors[j++].data;
      }
      if (flags & GRN_OBJ_WITH_POSITION) {
        posp = ii_buffer->data_vectors[j++].data;
      }
    }
    for (i = 0; i < nhits; i++) {
      /* Read postings from hit blocks and join the postings into datavec. */
      ii_buffer_block *block = hits[i];
      uint8_t *p = block->bufcur;
      uint32_t n = block->nrecs;
      if (n) {
        /*
         * The first value of a block is made relative to the last record ID
         * of the preceding blocks; the following values are stored as read.
         */
        GRN_B_DEC(*ridp, p);
        *ridp -= lr;
        lr += *ridp++;
        while (--n) {
          GRN_B_DEC(*ridp, p);
          lr += *ridp++;
        }
      }
      if ((flags & GRN_OBJ_WITH_SECTION)) {
        for (n = block->nrecs; n; n--) {
          GRN_B_DEC(*sidp++, p);
        }
      }
      for (n = block->nrecs; n; n--) {
        GRN_B_DEC(*tfp++, p);
      }
      if ((flags & GRN_OBJ_WITH_WEIGHT)) {
        for (n = block->nrecs; n; n--) {
          GRN_B_DEC(*weightp++, p);
        }
      }
      if ((flags & GRN_OBJ_WITH_POSITION)) {
        for (n = block->nposts; n; n--) {
          GRN_B_DEC(*posp, p);
          spos += *posp++;
        }
      }
      /* Consume the term and move the block on to its next term. */
      block->rest -= (p - block->bufcur);
      block->bufcur = p;
      grn_ii_buffer_fetch(ctx, ii_buffer, block);
    }
    {
      /* Set size and flags of datavec. */
      int j = 0;
      uint32_t f_s = (nrecs < 3) ? 0 : USE_P_ENC;
      uint32_t f_d = ((nrecs < 16) || (nrecs <= (lr >> 8))) ? 0 : USE_P_ENC;
      ii_buffer->data_vectors[j].data_size = nrecs;
      ii_buffer->data_vectors[j++].flags = f_d;
      if ((flags & GRN_OBJ_WITH_SECTION)) {
        ii_buffer->data_vectors[j].data_size = nrecs;
        ii_buffer->data_vectors[j++].flags = f_s;
      }
      ii_buffer->data_vectors[j].data_size = nrecs;
      ii_buffer->data_vectors[j++].flags = f_s;
      if ((flags & GRN_OBJ_WITH_WEIGHT)) {
        ii_buffer->data_vectors[j].data_size = nrecs;
        ii_buffer->data_vectors[j++].flags = f_s;
      }
      if ((flags & GRN_OBJ_WITH_POSITION)) {
        uint32_t f_p = (((nposts < 32) ||
                         (nposts <= (spos >> 13))) ? 0 : USE_P_ENC);
        ii_buffer->data_vectors[j].data_size = nposts;
        ii_buffer->data_vectors[j++].flags = f_p|ODD;
      }
    }
  }
  return (max_size + ii_buffer->ii->n_elements) * 4;
}

/*
 * get_term_buffer returns the term buffer which is currently filled.
 *
 * When there is none, the first logical segment whose binfo entry is
 * GRN_II_PSEG_NOT_ASSIGNED is picked, a segment is obtained with
 * segment_get and a reference to it is taken. NULL is returned (with an
 * error set) when every logical segment is in use.
 *
 * NOTE(review): the result of segment_get and of GRN_IO_SEG_REF is not
 * validated here.
 */
static buffer *
get_term_buffer(grn_ctx *ctx, grn_ii_buffer *ii_buffer)
{
  if (!ii_buffer->term_buffer) {
    uint32_t lseg;
    void *term_buffer;
    for (lseg = 0; lseg < GRN_II_MAX_LSEG; lseg++) {
      if (ii_buffer->ii->header->binfo[lseg] == GRN_II_PSEG_NOT_ASSIGNED) {
        break;
      }
    }
    if (lseg == GRN_II_MAX_LSEG) {
      DEFINE_NAME(ii_buffer->ii);
      MERR("[ii][buffer][term-buffer] couldn't find a free buffer: "
           "<%.*s>",
           name_size, name);
      return NULL;
    }
    ii_buffer->lseg = lseg;
    ii_buffer->dseg = segment_get(ctx, ii_buffer->ii);
    GRN_IO_SEG_REF(ii_buffer->ii->seg, ii_buffer->dseg, term_buffer);
    ii_buffer->term_buffer = (buffer *)term_buffer;
  }
  return ii_buffer->term_buffer;
}

/*
 * try_in_place_packing tries to pack a posting in an array element.
* * The requirements are as follows: * - nposts == 1 * - nhits == 1 && nrecs == 1 && tf == 0 * - weight == 0 * - !(flags & GRN_OBJ_WITH_SECTION) || (rid < 0x100000 && sid < 0x800) */ static grn_bool try_in_place_packing(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_id tid, ii_buffer_block *hits[], int nhits) { if (nhits == 1 && hits[0]->nrecs == 1 && hits[0]->nposts == 1) { grn_id rid; uint32_t sid = 1, tf, pos = 0, weight = 0; ii_buffer_block *block = hits[0]; uint8_t *p = block->bufcur; uint32_t flags = ii_buffer->ii->header->flags; GRN_B_DEC(rid, p); if (flags & GRN_OBJ_WITH_SECTION) { GRN_B_DEC(sid, p); sid++; } GRN_B_DEC(tf, p); if (tf != 0) { GRN_LOG(ctx, GRN_LOG_WARNING, "tf=%d", tf); } if (flags & GRN_OBJ_WITH_WEIGHT) { GRN_B_DEC(weight, p); } if (flags & GRN_OBJ_WITH_POSITION) { GRN_B_DEC(pos, p); } if (!weight) { if (flags & GRN_OBJ_WITH_SECTION) { if (rid < 0x100000 && sid < 0x800) { uint32_t *a = array_get(ctx, ii_buffer->ii, tid); a[0] = (rid << 12) + (sid << 1) + 1; a[1] = pos; array_unref(ii_buffer->ii, tid); } else { return GRN_FALSE; } } else { uint32_t *a = array_get(ctx, ii_buffer->ii, tid); a[0] = (rid << 1) + 1; a[1] = pos; array_unref(ii_buffer->ii, tid); } block->rest -= (p - block->bufcur); block->bufcur = p; grn_ii_buffer_fetch(ctx, ii_buffer, block); return GRN_TRUE; } } return GRN_FALSE; } /* grn_ii_buffer_merge merges hit blocks and pack it. */ static void grn_ii_buffer_merge(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_id tid, ii_buffer_block *hits[], int nhits) { if (!try_in_place_packing(ctx, ii_buffer, tid, hits, nhits)) { /* Merge hit blocks and reserve a buffer for packed data. */ size_t max_size = merge_hit_blocks(ctx, ii_buffer, hits, nhits); if (ii_buffer->packed_buf && ii_buffer->packed_buf_size < ii_buffer->packed_len + max_size) { grn_ii_buffer_chunk_flush(ctx, ii_buffer); } if (!ii_buffer->packed_buf) { size_t buf_size = (max_size > II_BUFFER_PACKED_BUF_SIZE) ? 
max_size : II_BUFFER_PACKED_BUF_SIZE; if ((ii_buffer->packed_buf = GRN_MALLOC(buf_size))) { ii_buffer->packed_buf_size = buf_size; } } { /* Pack postings into the current buffer. */ uint16_t nterm; size_t packed_len; buffer_term *bt; uint32_t *a; buffer *term_buffer; a = array_get(ctx, ii_buffer->ii, tid); if (!a) { DEFINE_NAME(ii_buffer->ii); MERR("[ii][buffer][merge] failed to allocate an array: " "<%.*s>: " "<%u>", name_size, name, tid); return; } term_buffer = get_term_buffer(ctx, ii_buffer); if (!term_buffer) { DEFINE_NAME(ii_buffer->ii); MERR("[ii][buffer][merge] failed to allocate a term buffer: " "<%.*s>: " "<%u>", name_size, name, tid); return; } nterm = term_buffer->header.nterms++; bt = &term_buffer->terms[nterm]; a[0] = SEG2POS(ii_buffer->lseg, (sizeof(buffer_header) + sizeof(buffer_term) * nterm)); packed_len = grn_p_encv(ctx, ii_buffer->data_vectors, ii_buffer->ii->n_elements, ii_buffer->packed_buf + ii_buffer->packed_len); a[1] = ii_buffer->data_vectors[0].data_size; bt->tid = tid; bt->size_in_buffer = 0; bt->pos_in_buffer = 0; bt->size_in_chunk = packed_len; bt->pos_in_chunk = ii_buffer->packed_len; ii_buffer->packed_len += packed_len; if (((ii_buffer->curr_size * ii_buffer->update_buffer_size) + (ii_buffer->total_size * term_buffer->header.nterms * 16)) >= (ii_buffer->total_size * II_BUFFER_NTERMS_PER_BUFFER * 16)) { grn_ii_buffer_chunk_flush(ctx, ii_buffer); } array_unref(ii_buffer->ii, tid); } } } grn_ii_buffer * grn_ii_buffer_open(grn_ctx *ctx, grn_ii *ii, long long unsigned int update_buffer_size) { if (ii && ii->lexicon) { grn_ii_buffer *ii_buffer = GRN_MALLOCN(grn_ii_buffer, 1); if (ii_buffer) { ii_buffer->ii = ii; ii_buffer->lexicon = ii->lexicon; ii_buffer->tmp_lexicon = NULL; ii_buffer->nblocks = 0; ii_buffer->blocks = NULL; ii_buffer->ncounters = II_BUFFER_NCOUNTERS_MARGIN; ii_buffer->block_pos = 0; ii_buffer->filepos = 0; ii_buffer->curr_size = 0; ii_buffer->total_size = 0; ii_buffer->update_buffer_size = update_buffer_size; 
ii_buffer->counters = GRN_CALLOC(ii_buffer->ncounters * sizeof(ii_buffer_counter)); ii_buffer->term_buffer = NULL; ii_buffer->packed_buf = NULL; ii_buffer->packed_len = 0; ii_buffer->packed_buf_size = 0; ii_buffer->total_chunk_size = 0; ii_buffer->values = NULL; ii_buffer->nvalues = 0; ii_buffer->max_nvalues = 0; ii_buffer->last_rid = 0; if (ii_buffer->counters) { ii_buffer->block_buf = GRN_MALLOCN(grn_id, II_BUFFER_BLOCK_SIZE); if (ii_buffer->block_buf) { grn_snprintf(ii_buffer->tmpfpath, PATH_MAX, PATH_MAX, "%-.256sXXXXXX", grn_io_path(ii->seg)); ii_buffer->block_buf_size = II_BUFFER_BLOCK_SIZE; ii_buffer->tmpfd = grn_mkstemp(ii_buffer->tmpfpath); if (ii_buffer->tmpfd != -1) { grn_table_flags flags; grn_table_get_info(ctx, ii->lexicon, &flags, NULL, NULL, NULL, NULL); if ((flags & GRN_OBJ_TABLE_TYPE_MASK) == GRN_OBJ_TABLE_PAT_KEY) { grn_pat_cache_enable(ctx, (grn_pat *)ii->lexicon, PAT_CACHE_SIZE); } return ii_buffer; } else { SERR("failed grn_mkstemp(%-.256s)", ii_buffer->tmpfpath); } GRN_FREE(ii_buffer->block_buf); } GRN_FREE(ii_buffer->counters); } GRN_FREE(ii_buffer); } } else { ERR(GRN_INVALID_ARGUMENT, "ii or ii->lexicon is NULL"); } return NULL; } static void ii_buffer_value_init(grn_ctx *ctx, ii_buffer_value *value) { value->sid = 0; value->weight = 0; value->p = NULL; value->len = 0; value->buf = NULL; value->cap = 0; } static void ii_buffer_value_fin(grn_ctx *ctx, ii_buffer_value *value) { if (value->buf) { GRN_FREE(value->buf); } } /* * ii_buffer_values_append appends a value to ii_buffer. * This function deep-copies the value if need_copy == GRN_TRUE. 
*/
static void
ii_buffer_values_append(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
                        unsigned int sid, unsigned weight,
                        const char *p, uint32_t len, grn_bool need_copy)
{
  /* Double the capacity of values (starting from 1) when it is full. */
  if (ii_buffer->nvalues == ii_buffer->max_nvalues) {
    unsigned int i;
    unsigned int new_max_nvalues = ii_buffer->max_nvalues * 2;
    unsigned int new_size;
    ii_buffer_value *new_values;
    if (new_max_nvalues == 0) {
      new_max_nvalues = 1;
    }
    new_size = new_max_nvalues * sizeof(ii_buffer_value);
    new_values = (ii_buffer_value *)GRN_REALLOC(ii_buffer->values, new_size);
    if (!new_values) {
      /* The value is dropped; nothing else is reported from here. */
      return;
    }
    for (i = ii_buffer->max_nvalues; i < new_max_nvalues; i++) {
      ii_buffer_value_init(ctx, &new_values[i]);
    }
    ii_buffer->values = new_values;
    ii_buffer->max_nvalues = new_max_nvalues;
  }

  {
    ii_buffer_value *value = &ii_buffer->values[ii_buffer->nvalues];
    if (need_copy) {
      /* The per-slot buffer is kept and reused for later values. */
      if (len > value->cap) {
        char *new_buf = (char *)GRN_REALLOC(value->buf, len);
        if (!new_buf) {
          return;
        }
        value->buf = new_buf;
        value->cap = len;
      }
      grn_memcpy(value->buf, p, len);
      p = value->buf;
    }
    value->sid = sid;
    value->weight = weight;
    value->p = p;
    value->len = len;
    ii_buffer->nvalues++;
  }
}

/*
 * grn_ii_buffer_append queues `value` as section `sid` of record `rid`.
 *
 * When `rid` differs from the record of the previous call, the values
 * queued for that previous record are tokenized first. The value is copied,
 * with weight 0. Returns ctx->rc.
 */
grn_rc
grn_ii_buffer_append(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
                     grn_id rid, unsigned int sid, grn_obj *value)
{
  if (rid != ii_buffer->last_rid) {
    if (ii_buffer->last_rid) {
      grn_ii_buffer_tokenize(ctx, ii_buffer, ii_buffer->last_rid);
    }
    ii_buffer->last_rid = rid;
  }
  ii_buffer_values_append(ctx, ii_buffer, sid, 0,
                          GRN_TEXT_VALUE(value), GRN_TEXT_LEN(value),
                          GRN_TRUE);
  return ctx->rc;
}

/*
 * grn_ii_buffer_commit completes tokenization and builds an inverted index
 * from data in a temporary file.
 *
 * The temporary file is closed, re-opened read-only, merged term by term in
 * II_BUFFER_ORDER and finally removed. Returns ctx->rc.
 */
grn_rc
grn_ii_buffer_commit(grn_ctx *ctx, grn_ii_buffer *ii_buffer)
{
  /* Tokenize the remaining values and free resources. */
  if (ii_buffer->last_rid && ii_buffer->nvalues) {
    grn_ii_buffer_tokenize(ctx, ii_buffer, ii_buffer->last_rid);
  }
  if (ii_buffer->block_pos) {
    grn_ii_buffer_flush(ctx, ii_buffer);
  }
  if (ii_buffer->tmpfd != -1) {
    grn_close(ii_buffer->tmpfd);
  }
  if (ii_buffer->block_buf) {
    GRN_FREE(ii_buffer->block_buf);
    ii_buffer->block_buf = NULL;
  }
  if (ii_buffer->counters) {
    GRN_FREE(ii_buffer->counters);
    ii_buffer->counters = NULL;
  }

  /*
   * A value from 1 to 19 is not used as is: it selects a shift that is
   * applied to total_size (right shift below 10, left shift from 10 on).
   */
  if (ii_buffer->update_buffer_size &&
      ii_buffer->update_buffer_size < 20) {
    if (ii_buffer->update_buffer_size < 10) {
      ii_buffer->update_buffer_size =
        ii_buffer->total_size >> (10 - ii_buffer->update_buffer_size);
    } else {
      ii_buffer->update_buffer_size =
        ii_buffer->total_size << (ii_buffer->update_buffer_size - 10);
    }
  }

  GRN_LOG(ctx, GRN_LOG_DEBUG,
          "nblocks=%d, update_buffer_size=%" GRN_FMT_INT64U,
          ii_buffer->nblocks, ii_buffer->update_buffer_size);

  datavec_init(ctx, ii_buffer->data_vectors, ii_buffer->ii->n_elements, 0, 0);
  grn_open(ii_buffer->tmpfd,
           ii_buffer->tmpfpath,
           O_RDONLY | GRN_OPEN_FLAG_BINARY);
  if (ii_buffer->tmpfd == -1) {
    ERRNO_ERR("failed to open path: <%-.256s>", ii_buffer->tmpfpath);
    return ctx->rc;
  }
  {
    /* Fetch the first term of each block. */
    uint32_t i;
    for (i = 0; i < ii_buffer->nblocks; i++) {
      grn_ii_buffer_fetch(ctx, ii_buffer, &ii_buffer->blocks[i]);
    }
  }
  {
    ii_buffer_block **hits;
    if ((hits = GRN_MALLOCN(ii_buffer_block *, ii_buffer->nblocks))) {
      grn_id tid;
      grn_table_cursor *tc;
      tc = grn_table_cursor_open(ctx, ii_buffer->lexicon,
                                 NULL, 0, NULL, 0, 0, -1, II_BUFFER_ORDER);
      if (tc) {
        while ((tid = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) {
          /*
           * Find blocks which contain the current term.
           * Then, merge the postings.
           */
          int nrests = 0;
          int nhits = 0;
          uint32_t i;
          for (i = 0; i < ii_buffer->nblocks; i++) {
            if (ii_buffer->blocks[i].tid == tid) {
              hits[nhits++] = &ii_buffer->blocks[i];
            }
            if (ii_buffer->blocks[i].tid) {
              nrests++;
            }
          }
          if (nhits) {
            grn_ii_buffer_merge(ctx, ii_buffer, tid, hits, nhits);
          }
          /* Every block is used up: no need to visit further terms. */
          if (!nrests) {
            break;
          }
        }
        if (ii_buffer->packed_len) {
          grn_ii_buffer_chunk_flush(ctx, ii_buffer);
        }
        grn_table_cursor_close(ctx, tc);
      }
      GRN_FREE(hits);
    }
  }
  datavec_fin(ctx, ii_buffer->data_vectors);
  GRN_LOG(ctx, GRN_LOG_DEBUG,
          "tmpfile_size:%" GRN_FMT_INT64D " > total_chunk_size:%" GRN_FMT_SIZE,
          ii_buffer->filepos, ii_buffer->total_chunk_size);
  grn_close(ii_buffer->tmpfd);
  if (grn_unlink(ii_buffer->tmpfpath) == 0) {
    GRN_LOG(ctx, GRN_LOG_INFO,
            "[ii][buffer][commit] removed temporary path: <%-.256s>",
            ii_buffer->tmpfpath);
  } else {
    ERRNO_ERR("[ii][buffer][commit] failed to remove temporary path: <%-.256s>",
              ii_buffer->tmpfpath);
  }
  /* Tell grn_ii_buffer_close that the temporary file is gone. */
  ii_buffer->tmpfd = -1;
  return ctx->rc;
}

/*
 * grn_ii_buffer_close releases `ii_buffer` and everything it owns.
 *
 * The cache of a patricia trie lexicon is disabled, a remaining temporary
 * lexicon is closed, a still open temporary file is closed and removed, and
 * all buffers are freed. Returns ctx->rc.
 */
grn_rc
grn_ii_buffer_close(grn_ctx *ctx, grn_ii_buffer *ii_buffer)
{
  uint32_t i;
  grn_table_flags flags;
  grn_table_get_info(ctx, ii_buffer->ii->lexicon, &flags, NULL, NULL, NULL,
                     NULL);
  if ((flags & GRN_OBJ_TABLE_TYPE_MASK) == GRN_OBJ_TABLE_PAT_KEY) {
    grn_pat_cache_disable(ctx, (grn_pat *)ii_buffer->ii->lexicon);
  }
  if (ii_buffer->tmp_lexicon) {
    grn_obj_close(ctx, ii_buffer->tmp_lexicon);
  }
  if (ii_buffer->tmpfd != -1) {
    grn_close(ii_buffer->tmpfd);
    if (grn_unlink(ii_buffer->tmpfpath) == 0) {
      GRN_LOG(ctx, GRN_LOG_INFO,
              "[ii][buffer][close] removed temporary path: <%-.256s>",
              ii_buffer->tmpfpath);
    } else {
      ERRNO_ERR("[ii][buffer][close] failed to remove temporary path: <%-.256s>",
                ii_buffer->tmpfpath);
    }
  }
  if (ii_buffer->block_buf) {
    GRN_FREE(ii_buffer->block_buf);
  }
  if (ii_buffer->counters) {
    GRN_FREE(ii_buffer->counters);
  }
  if (ii_buffer->blocks) {
    for (i = 0; i < ii_buffer->nblocks; i++) {
      if (ii_buffer->blocks[i].buffer) {
        GRN_FREE(ii_buffer->blocks[i].buffer);
      }
    }
    GRN_FREE(ii_buffer->blocks);
  }
  if (ii_buffer->values) {
    /* All max_nvalues slots may own a copy buffer, not just nvalues. */
    for (i = 0; i < ii_buffer->max_nvalues; i++) {
      ii_buffer_value_fin(ctx, &ii_buffer->values[i]);
    }
    GRN_FREE(ii_buffer->values);
  }
  GRN_FREE(ii_buffer);
  return ctx->rc;
}

/*
 * grn_ii_buffer_parse tokenizes values to be indexed.
 *
 * For each record of the target table, grn_ii_buffer_parse makes a list of
 * target values and calls grn_ii_buffer_tokenize. To make a list of target
 * values, ii_buffer_values_append is called for each value. Note that
 * ii_buffer_values_append is called for each element for a vector.
 */
static void
grn_ii_buffer_parse(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
                    grn_obj *target, int ncols, grn_obj **cols)
{
  grn_table_cursor *tc;
  grn_obj *vobjs;
  if ((vobjs = GRN_MALLOCN(grn_obj, ncols))) {
    int i;
    for (i = 0; i < ncols; i++) {
      GRN_TEXT_INIT(&vobjs[i], 0);
    }
    if ((tc = grn_table_cursor_open(ctx, target,
                                    NULL, 0, NULL, 0, 0, -1,
                                    GRN_CURSOR_BY_ID))) {
      grn_id rid;
      while ((rid = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) {
        unsigned int j;
        int sid;
        grn_obj **col;
        for (sid = 1, col = cols; sid <= ncols; sid++, col++) {
          grn_obj *rv = &vobjs[sid - 1];
          grn_obj_reinit_for(ctx, rv, *col);
          if (GRN_OBJ_TABLEP(*col)) {
            grn_table_get_key2(ctx, *col, rid, rv);
          } else {
            grn_obj_get_value(ctx, *col, rid, rv);
          }
          switch (rv->header.type) {
          case GRN_BULK :
            ii_buffer_values_append(ctx, ii_buffer, sid, 0,
                                    GRN_TEXT_VALUE(rv), GRN_TEXT_LEN(rv),
                                    GRN_FALSE);
            break;
          case GRN_UVECTOR :
            {
              unsigned int size;
              unsigned int elem_size;
              size = grn_uvector_size(ctx, rv);
              elem_size = grn_uvector_element_size(ctx, rv);
              for (j = 0; j < size; j++) {
                ii_buffer_values_append(ctx, ii_buffer, sid, 0,
                                        GRN_BULK_HEAD(rv) + (elem_size * j),
                                        elem_size, GRN_FALSE);
              }
            }
            break;
          case GRN_VECTOR :
            if (rv->u.v.body) {
              int j;
              int n_sections = rv->u.v.n_sections;
              grn_section *sections = rv->u.v.sections;
              const char *head = GRN_BULK_HEAD(rv->u.v.body);
              for (j = 0; j < n_sections; j++) {
                grn_section *section = sections + j;
                if (section->length == 0) {
                  continue;
                }
                ii_buffer_values_append(ctx, ii_buffer,
sid, section->weight, head + section->offset, section->length, GRN_FALSE); } } break; default : ERR(GRN_INVALID_ARGUMENT, "[index] invalid object assigned as value"); break; } } grn_ii_buffer_tokenize(ctx, ii_buffer, rid); } grn_table_cursor_close(ctx, tc); } for (i = 0; i < ncols; i++) { GRN_OBJ_FIN(ctx, &vobjs[i]); } GRN_FREE(vobjs); } } grn_rc grn_ii_build(grn_ctx *ctx, grn_ii *ii, uint64_t sparsity) { grn_ii_buffer *ii_buffer; { /* Do nothing if there are no targets. */ grn_obj *data_table = grn_ctx_at(ctx, DB_OBJ(ii)->range); if (!data_table) { return ctx->rc; } if (grn_table_size(ctx, data_table) == 0) { return ctx->rc; } } ii_buffer = grn_ii_buffer_open(ctx, ii, sparsity); if (ii_buffer) { grn_id *source = (grn_id *)ii->obj.source; if (ii->obj.source_size && ii->obj.source) { int ncols = ii->obj.source_size / sizeof(grn_id); grn_obj **cols = GRN_MALLOCN(grn_obj *, ncols); if (cols) { int i; for (i = 0; i < ncols; i++) { if (!(cols[i] = grn_ctx_at(ctx, source[i]))) { break; } } if (i == ncols) { /* All the source columns are available. */ grn_obj *target = cols[0]; if (!GRN_OBJ_TABLEP(target)) { target = grn_ctx_at(ctx, target->header.domain); } if (target) { grn_ii_buffer_parse(ctx, ii_buffer, target, ncols, cols); grn_ii_buffer_commit(ctx, ii_buffer); } else { ERR(GRN_INVALID_ARGUMENT, "failed to resolve the target"); } } else { ERR(GRN_INVALID_ARGUMENT, "failed to resolve a column (%d)", i); } GRN_FREE(cols); } } else { ERR(GRN_INVALID_ARGUMENT, "ii->obj.source is void"); } grn_ii_buffer_close(ctx, ii_buffer); } return ctx->rc; } /* * ========================================================================== * The following part provides constants, structures and functions for static * indexing. 
* ========================================================================== */

/*
 * Starting size of the chunk assigned to a builder buffer.
 * grn_ii_builder_buffer_assign doubles it until it is not smaller than the
 * size requested by its caller.
 */
#define GRN_II_BUILDER_BUFFER_CHUNK_SIZE      (S_CHUNK >> 2)

/*
 * Limits applied by grn_ii_builder_options_fix. lexicon_cache_size has an
 * upper limit only; every other option has both a lower and an upper limit.
 */
#define GRN_II_BUILDER_MAX_LEXICON_CACHE_SIZE (1 << 24)

#define GRN_II_BUILDER_MIN_BLOCK_THRESHOLD    1
#define GRN_II_BUILDER_MAX_BLOCK_THRESHOLD    (1 << 28)

#define GRN_II_BUILDER_MIN_FILE_BUF_SIZE      (1 << 12)
#define GRN_II_BUILDER_MAX_FILE_BUF_SIZE      (1 << 30)

#define GRN_II_BUILDER_MIN_BLOCK_BUF_SIZE     (1 << 12)
#define GRN_II_BUILDER_MAX_BLOCK_BUF_SIZE     (1 << 30)

#define GRN_II_BUILDER_MIN_CHUNK_THRESHOLD    1
#define GRN_II_BUILDER_MAX_CHUNK_THRESHOLD    (1 << 28)

#define GRN_II_BUILDER_MIN_BUFFER_MAX_N_TERMS 1
/* Number of buffer_term entries that fit in a segment after its header. */
#define GRN_II_BUILDER_MAX_BUFFER_MAX_N_TERMS \
  ((S_SEGMENT - sizeof(buffer_header)) / sizeof(buffer_term))

/* Tunable parameters of a grn_ii_builder. */
struct grn_ii_builder_options {
  uint32_t lexicon_cache_size; /* Cache size of temporary lexicon */
  /* A block is flushed if builder->n reaches this value. */
  uint32_t block_threshold;
  uint32_t file_buf_size;      /* Buffer size for buffered output */
  uint32_t block_buf_size;     /* Buffer size for buffered input */
  /* A chunk is flushed if chunk->n reaches this value. */
  uint32_t chunk_threshold;
  uint32_t buffer_max_n_terms; /* Maximum number of terms in each buffer */
};

/*
 * Default options. The initializer is positional, so the values must stay in
 * the same order as the members of struct grn_ii_builder_options.
 */
static const grn_ii_builder_options grn_ii_builder_default_options = {
  0x80000,   /* lexicon_cache_size */
  0x4000000, /* block_threshold */
  0x10000,   /* file_buf_size */
  0x10000,   /* block_buf_size */
  0x1000,    /* chunk_threshold */
  0x3000,    /* buffer_max_n_terms */
};

/* grn_ii_builder_options_init fills options with the default options. */
void
grn_ii_builder_options_init(grn_ii_builder_options *options)
{
  *options = grn_ii_builder_default_options;
}

/* grn_ii_builder_options_fix fixes out-of-range options.
*/ static void grn_ii_builder_options_fix(grn_ii_builder_options *options) { if (options->lexicon_cache_size > GRN_II_BUILDER_MAX_LEXICON_CACHE_SIZE) { options->lexicon_cache_size = GRN_II_BUILDER_MAX_LEXICON_CACHE_SIZE; } if (options->block_threshold < GRN_II_BUILDER_MIN_BLOCK_THRESHOLD) { options->block_threshold = GRN_II_BUILDER_MIN_BLOCK_THRESHOLD; } if (options->block_threshold > GRN_II_BUILDER_MAX_BLOCK_THRESHOLD) { options->block_threshold = GRN_II_BUILDER_MAX_BLOCK_THRESHOLD; } if (options->file_buf_size < GRN_II_BUILDER_MIN_FILE_BUF_SIZE) { options->file_buf_size = GRN_II_BUILDER_MIN_FILE_BUF_SIZE; } if (options->file_buf_size > GRN_II_BUILDER_MAX_FILE_BUF_SIZE) { options->file_buf_size = GRN_II_BUILDER_MAX_FILE_BUF_SIZE; } if (options->block_buf_size < GRN_II_BUILDER_MIN_BLOCK_BUF_SIZE) { options->block_buf_size = GRN_II_BUILDER_MIN_BLOCK_BUF_SIZE; } if (options->block_buf_size > GRN_II_BUILDER_MAX_BLOCK_BUF_SIZE) { options->block_buf_size = GRN_II_BUILDER_MAX_BLOCK_BUF_SIZE; } if (options->chunk_threshold < GRN_II_BUILDER_MIN_CHUNK_THRESHOLD) { options->chunk_threshold = GRN_II_BUILDER_MIN_CHUNK_THRESHOLD; } if (options->chunk_threshold > GRN_II_BUILDER_MAX_CHUNK_THRESHOLD) { options->chunk_threshold = GRN_II_BUILDER_MAX_CHUNK_THRESHOLD; } if (options->buffer_max_n_terms < GRN_II_BUILDER_MIN_BUFFER_MAX_N_TERMS) { options->buffer_max_n_terms = GRN_II_BUILDER_MIN_BUFFER_MAX_N_TERMS; } if (options->buffer_max_n_terms > GRN_II_BUILDER_MAX_BUFFER_MAX_N_TERMS) { options->buffer_max_n_terms = GRN_II_BUILDER_MAX_BUFFER_MAX_N_TERMS; } } #define GRN_II_BUILDER_TERM_INPLACE_SIZE\ (sizeof(grn_ii_builder_term) - (uintptr_t)&((grn_ii_builder_term *)0)->dummy) typedef struct { grn_id rid; /* Last record ID */ uint32_t sid; /* Last section ID */ /* Last position (GRN_OBJ_WITH_POSITION) or frequency. 
*/ uint32_t pos_or_freq; uint32_t offset; /* Buffer write offset */ uint32_t size; /* Buffer size */ uint32_t dummy; /* Padding */ uint8_t *buf; /* Buffer (to be freed) */ } grn_ii_builder_term; /* grn_ii_builder_term_is_inplace returns whether a term buffer is inplace. */ inline static grn_bool grn_ii_builder_term_is_inplace(grn_ii_builder_term *term) { return term->size == GRN_II_BUILDER_TERM_INPLACE_SIZE; } /* grn_ii_builder_term_get_buf returns a term buffer. */ inline static uint8_t * grn_ii_builder_term_get_buf(grn_ii_builder_term *term) { if (grn_ii_builder_term_is_inplace(term)) { return (uint8_t *)&term->dummy; } else { return term->buf; } } /* * grn_ii_builder_term_init initializes a term. Note that an initialized term * must be finalized by grn_ii_builder_term_fin. */ static void grn_ii_builder_term_init(grn_ctx *ctx, grn_ii_builder_term *term) { term->rid = GRN_ID_NIL; term->sid = 0; term->pos_or_freq = 0; term->offset = 0; term->size = GRN_II_BUILDER_TERM_INPLACE_SIZE; } /* grn_ii_builder_term_fin finalizes a term. */ static void grn_ii_builder_term_fin(grn_ctx *ctx, grn_ii_builder_term *term) { if (!grn_ii_builder_term_is_inplace(term)) { GRN_FREE(term->buf); } } /* grn_ii_builder_term_reinit reinitializes a term. */ static void grn_ii_builder_term_reinit(grn_ctx *ctx, grn_ii_builder_term *term) { grn_ii_builder_term_fin(ctx, term); grn_ii_builder_term_init(ctx, term); } /* grn_ii_builder_term_extend extends a term buffer. 
*/ static grn_rc grn_ii_builder_term_extend(grn_ctx *ctx, grn_ii_builder_term *term) { uint8_t *buf; uint32_t size = term->size * 2; if (grn_ii_builder_term_is_inplace(term)) { buf = (uint8_t *)GRN_MALLOC(size); if (!buf) { ERR(GRN_NO_MEMORY_AVAILABLE, "failed to allocate memory for term buffer: size = %u", size); return ctx->rc; } grn_memcpy(buf, &term->dummy, term->offset); } else { buf = (uint8_t *)GRN_REALLOC(term->buf, size); if (!buf) { ERR(GRN_NO_MEMORY_AVAILABLE, "failed to reallocate memory for term buffer: size = %u", size); return ctx->rc; } } term->buf = buf; term->size = size; return GRN_SUCCESS; } /* grn_ii_builder_term_append appends an integer to a term buffer. */ inline static grn_rc grn_ii_builder_term_append(grn_ctx *ctx, grn_ii_builder_term *term, uint64_t value) { uint8_t *p; if (value < (uint64_t)1 << 5) { if (term->offset + 1 > term->size) { grn_rc rc = grn_ii_builder_term_extend(ctx, term); if (rc != GRN_SUCCESS) { return rc; } } p = grn_ii_builder_term_get_buf(term) + term->offset; p[0] = (uint8_t)value; term->offset++; return GRN_SUCCESS; } else if (value < (uint64_t)1 << 13) { if (term->offset + 2 > term->size) { grn_rc rc = grn_ii_builder_term_extend(ctx, term); if (rc != GRN_SUCCESS) { return rc; } } p = grn_ii_builder_term_get_buf(term) + term->offset; p[0] = (uint8_t)((value & 0x1f) | (1 << 5)); p[1] = (uint8_t)(value >> 5); term->offset += 2; return GRN_SUCCESS; } else { uint8_t i, n; if (value < (uint64_t)1 << 21) { n = 3; } else if (value < (uint64_t)1 << 29) { n = 4; } else if (value < (uint64_t)1 << 37) { n = 5; } else if (value < (uint64_t)1 << 45) { n = 6; } else if (value < (uint64_t)1 << 53) { n = 7; } else { n = 8; } if (term->offset + n > term->size) { grn_rc rc = grn_ii_builder_term_extend(ctx, term); if (rc != GRN_SUCCESS) { return rc; } } p = grn_ii_builder_term_get_buf(term) + term->offset; p[0] = (uint8_t)(value & 0x1f) | ((n - 1) << 5); value >>= 5; for (i = 1; i < n; i++) { p[i] = (uint8_t)value; value >>= 8; } 
term->offset += n; return GRN_SUCCESS; } } typedef struct { uint64_t offset; /* File offset */ uint32_t rest; /* Remaining size */ uint8_t *buf; /* Buffer (to be freed) */ uint8_t *cur; /* Current pointer */ uint8_t *end; /* End pointer */ uint32_t tid; /* Term ID */ } grn_ii_builder_block; /* * grn_ii_builder_block_init initializes a block. Note that an initialized * block must be finalized by grn_ii_builder_block_fin. */ static void grn_ii_builder_block_init(grn_ctx *ctx, grn_ii_builder_block *block) { block->offset = 0; block->rest = 0; block->buf = NULL; block->cur = NULL; block->end = NULL; block->tid = GRN_ID_NIL; } /* grn_ii_builder_block_fin finalizes a block. */ static void grn_ii_builder_block_fin(grn_ctx *ctx, grn_ii_builder_block *block) { if (block->buf) { GRN_FREE(block->buf); } } /* * grn_ii_builder_block_next reads the next integer. Note that this function * returns GRN_END_OF_DATA if it reaches the end of a block. */ inline static grn_rc grn_ii_builder_block_next(grn_ctx *ctx, grn_ii_builder_block *block, uint64_t *value) { uint8_t n; if (block->cur == block->end) { return GRN_END_OF_DATA; } n = (*block->cur >> 5) + 1; if (n > block->end - block->cur) { return GRN_END_OF_DATA; } *value = 0; switch (n) { case 8 : *value |= (uint64_t)block->cur[7] << 53; case 7 : *value |= (uint64_t)block->cur[6] << 45; case 6 : *value |= (uint64_t)block->cur[5] << 37; case 5 : *value |= (uint64_t)block->cur[4] << 29; case 4 : *value |= (uint64_t)block->cur[3] << 21; case 3 : *value |= (uint64_t)block->cur[2] << 13; case 2 : *value |= (uint64_t)block->cur[1] << 5; case 1 : *value |= block->cur[0] & 0x1f; break; } block->cur += n; return GRN_SUCCESS; } typedef struct { grn_ii *ii; /* Inverted index */ uint32_t buf_id; /* Buffer ID */ uint32_t buf_seg_id; /* Buffer segment ID */ buffer *buf; /* Buffer (to be unreferenced) */ uint32_t chunk_id; /* Chunk ID */ uint32_t chunk_seg_id; /* Chunk segment ID */ uint8_t *chunk; /* Chunk (to be unreferenced) */ uint32_t 
chunk_offset; /* Chunk write position */ uint32_t chunk_size; /* Chunk size */ } grn_ii_builder_buffer; /* * grn_ii_builder_buffer_init initializes a buffer. Note that a buffer must be * finalized by grn_ii_builder_buffer_fin. */ static void grn_ii_builder_buffer_init(grn_ctx *ctx, grn_ii_builder_buffer *buf, grn_ii *ii) { buf->ii = ii; buf->buf_id = 0; buf->buf_seg_id = 0; buf->buf = NULL; buf->chunk_id = 0; buf->chunk_seg_id = 0; buf->chunk = NULL; buf->chunk_offset = 0; buf->chunk_size = 0; } /* grn_ii_builder_buffer_fin finalizes a buffer. */ static void grn_ii_builder_buffer_fin(grn_ctx *ctx, grn_ii_builder_buffer *buf) { if (buf->buf) { GRN_IO_SEG_UNREF(buf->ii->seg, buf->buf_seg_id); } if (buf->chunk) { GRN_IO_SEG_UNREF(buf->ii->chunk, buf->chunk_seg_id); } } /* grn_ii_builder_buffer_is_assigned returns whether a buffer is assigned. */ static grn_bool grn_ii_builder_buffer_is_assigned(grn_ctx *ctx, grn_ii_builder_buffer *buf) { return buf->buf != NULL; } /* grn_ii_builder_buffer_assign assigns a buffer. */ static grn_rc grn_ii_builder_buffer_assign(grn_ctx *ctx, grn_ii_builder_buffer *buf, size_t min_chunk_size) { void *seg; size_t chunk_size; grn_rc rc; /* Create a buffer. */ buf->buf_id = GRN_II_PSEG_NOT_ASSIGNED; rc = buffer_segment_new(ctx, buf->ii, &buf->buf_id); if (rc != GRN_SUCCESS) { if (ctx->rc != GRN_SUCCESS) { ERR(rc, "failed to allocate segment for buffer"); } return rc; } buf->buf_seg_id = buf->ii->header->binfo[buf->buf_id]; GRN_IO_SEG_REF(buf->ii->seg, buf->buf_seg_id, seg); if (!seg) { if (ctx->rc == GRN_SUCCESS) { ERR(GRN_UNKNOWN_ERROR, "failed access buffer segment: buf_id = %u, seg_id = %u", buf->buf_id, buf->buf_seg_id); } return ctx->rc; } buf->buf = (buffer *)seg; /* Create a chunk. 
*/ chunk_size = GRN_II_BUILDER_BUFFER_CHUNK_SIZE; while (chunk_size < min_chunk_size) { chunk_size *= 2; } rc = chunk_new(ctx, buf->ii, &buf->chunk_id, chunk_size); if (rc != GRN_SUCCESS) { return rc; } buf->chunk_seg_id = buf->chunk_id >> GRN_II_N_CHUNK_VARIATION; GRN_IO_SEG_REF(buf->ii->chunk, buf->chunk_seg_id, seg); if (!seg) { if (ctx->rc == GRN_SUCCESS) { ERR(GRN_UNKNOWN_ERROR, "failed access chunk segment: chunk_id = %u, seg_id = %u", buf->chunk_id, buf->chunk_seg_id); } return ctx->rc; } buf->chunk = (uint8_t *)seg; buf->chunk += (buf->chunk_id & ((1 << GRN_II_N_CHUNK_VARIATION) - 1)) << GRN_II_W_LEAST_CHUNK; buf->chunk_offset = 0; buf->chunk_size = chunk_size; buf->buf->header.chunk = buf->chunk_id; buf->buf->header.chunk_size = chunk_size; buf->buf->header.buffer_free = S_SEGMENT - sizeof(buffer_header); buf->buf->header.nterms = 0; buf->buf->header.nterms_void = 0; buf->ii->header->total_chunk_size += chunk_size; return GRN_SUCCESS; } /* grn_ii_builder_buffer_flush flushes a buffer. 
*/ static grn_rc grn_ii_builder_buffer_flush(grn_ctx *ctx, grn_ii_builder_buffer *buf) { grn_ii *ii; buf->buf->header.buffer_free = S_SEGMENT - sizeof(buffer_header) - buf->buf->header.nterms * sizeof(buffer_term); GRN_LOG(ctx, GRN_LOG_DEBUG, "n_terms = %u, chunk_offset = %u, chunk_size = %u, total = %" GRN_FMT_INT64U "KB", buf->buf->header.nterms, buf->chunk_offset, buf->buf->header.chunk_size, buf->ii->header->total_chunk_size >> 10); ii = buf->ii; grn_ii_builder_buffer_fin(ctx, buf); grn_ii_builder_buffer_init(ctx, buf, ii); return GRN_SUCCESS; } typedef struct { grn_id tid; /* Term ID */ uint32_t n; /* Number of integers in buffers */ grn_id rid; /* Record ID */ uint32_t rid_gap; /* Record ID gap */ uint64_t pos_sum; /* Sum of position gaps */ uint32_t offset; /* Write offset */ uint32_t size; /* Buffer size */ grn_id *rid_buf; /* Buffer for record IDs (to be freed) */ uint32_t *sid_buf; /* Buffer for section IDs (to be freed) */ uint32_t *freq_buf; /* Buffer for frequencies (to be freed) */ uint32_t *weight_buf; /* Buffer for weights (to be freed) */ uint32_t pos_offset; /* Write offset of pos_buf */ uint32_t pos_size; /* Buffer size of pos_buf */ uint32_t *pos_buf; /* Buffer for positions (to be freed) */ size_t enc_offset; /* Write offset of enc_buf */ size_t enc_size; /* Buffer size of enc_buf */ uint8_t *enc_buf; /* Buffer for encoded data (to be freed) */ } grn_ii_builder_chunk; /* * grn_ii_builder_chunk_init initializes a chunk. Note that an initialized * chunk must be finalized by grn_ii_builder_chunk_fin. 
*/
static void
grn_ii_builder_chunk_init(grn_ctx *ctx, grn_ii_builder_chunk *chunk)
{
  chunk->tid = GRN_ID_NIL;
  chunk->n = 0;
  chunk->rid = GRN_ID_NIL;
  chunk->rid_gap = 0;
  chunk->pos_sum = 0;

  chunk->offset = 0;
  chunk->size = 0;
  chunk->rid_buf = NULL;
  chunk->sid_buf = NULL;
  chunk->freq_buf = NULL;
  chunk->weight_buf = NULL;

  chunk->pos_offset = 0;
  chunk->pos_size = 0;
  chunk->pos_buf = NULL;

  chunk->enc_offset = 0;
  chunk->enc_size = 0;
  chunk->enc_buf = NULL;
}

/* grn_ii_builder_chunk_fin finalizes a chunk by freeing all of its buffers. */
static void
grn_ii_builder_chunk_fin(grn_ctx *ctx, grn_ii_builder_chunk *chunk)
{
  if (chunk->enc_buf) {
    GRN_FREE(chunk->enc_buf);
  }
  if (chunk->pos_buf) {
    GRN_FREE(chunk->pos_buf);
  }
  if (chunk->weight_buf) {
    GRN_FREE(chunk->weight_buf);
  }
  if (chunk->freq_buf) {
    GRN_FREE(chunk->freq_buf);
  }
  if (chunk->sid_buf) {
    GRN_FREE(chunk->sid_buf);
  }
  if (chunk->rid_buf) {
    GRN_FREE(chunk->rid_buf);
  }
}

/*
 * grn_ii_builder_chunk_clear clears stats except rid and buffers except
 * enc_buf.
 *
 * Only the counters and write offsets are reset; no memory is released and
 * tid is left as it is.
 */
static void
grn_ii_builder_chunk_clear(grn_ctx *ctx, grn_ii_builder_chunk *chunk)
{
  chunk->n = 0;
  chunk->rid_gap = 0;
  chunk->pos_sum = 0;
  chunk->offset = 0;
  chunk->pos_offset = 0;
}

/*
 * grn_ii_builder_chunk_extend_bufs extends buffers except pos_buf and enc_buf.
 *
 * The capacity is doubled (or set to 1 when it is 0). rid_buf and freq_buf
 * are always extended; sid_buf and weight_buf only when ii_flags has
 * GRN_OBJ_WITH_SECTION and GRN_OBJ_WITH_WEIGHT respectively. chunk->size is
 * updated only after every reallocation succeeded.
 */
static grn_rc
grn_ii_builder_chunk_extend_bufs(grn_ctx *ctx, grn_ii_builder_chunk *chunk,
                                 uint32_t ii_flags)
{
  uint32_t *buf, size = chunk->size ? chunk->size * 2 : 1;
  size_t n_bytes = size * sizeof(uint32_t);

  buf = (uint32_t *)GRN_REALLOC(chunk->rid_buf, n_bytes);
  if (!buf) {
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "failed to allocate memory for record IDs: n_bytes = %" GRN_FMT_SIZE,
        n_bytes);
    return ctx->rc;
  }
  chunk->rid_buf = buf;

  if (ii_flags & GRN_OBJ_WITH_SECTION) {
    buf = (uint32_t *)GRN_REALLOC(chunk->sid_buf, n_bytes);
    if (!buf) {
      ERR(GRN_NO_MEMORY_AVAILABLE,
          "failed to allocate memory for section IDs:"
          " n_bytes = %" GRN_FMT_SIZE,
          n_bytes);
      return ctx->rc;
    }
    chunk->sid_buf = buf;
  }

  buf = (uint32_t *)GRN_REALLOC(chunk->freq_buf, n_bytes);
  if (!buf) {
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "failed to allocate memory for frequencies: n_bytes = %" GRN_FMT_SIZE,
        n_bytes);
    return ctx->rc;
  }
  chunk->freq_buf = buf;

  if (ii_flags & GRN_OBJ_WITH_WEIGHT) {
    buf = (uint32_t *)GRN_REALLOC(chunk->weight_buf, n_bytes);
    if (!buf) {
      ERR(GRN_NO_MEMORY_AVAILABLE,
          "failed to allocate memory for weights: n_bytes = %" GRN_FMT_SIZE,
          n_bytes);
      return ctx->rc;
    }
    chunk->weight_buf = buf;
  }

  chunk->size = size;
  return GRN_SUCCESS;
}

/*
 * grn_ii_builder_chunk_extend_pos_buf extends pos_buf by doubling its
 * capacity (or setting it to 1 when it is 0).
 */
static grn_rc
grn_ii_builder_chunk_extend_pos_buf(grn_ctx *ctx, grn_ii_builder_chunk *chunk)
{
  uint32_t *buf, size = chunk->pos_size ? chunk->pos_size * 2 : 1;
  size_t n_bytes = size * sizeof(uint32_t);
  buf = (uint32_t *)GRN_REALLOC(chunk->pos_buf, n_bytes);
  if (!buf) {
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "failed to allocate memory for positions: n_bytes = %" GRN_FMT_SIZE,
        n_bytes);
    return ctx->rc;
  }
  chunk->pos_buf = buf;
  chunk->pos_size = size;
  return GRN_SUCCESS;
}

/*
 * grn_ii_builder_chunk_reserve_enc_buf estimates a size that is enough to
 * store encoded data and allocates memory to enc_buf.
 *
 * The estimate is (chunk->n + 4) 32-bit integers plus n_cinfos chunk_info
 * entries. enc_buf grows by doubling and never shrinks. enc_offset is reset
 * to 0 on success.
 */
static grn_rc
grn_ii_builder_chunk_reserve_enc_buf(grn_ctx *ctx, grn_ii_builder_chunk *chunk,
                                     uint32_t n_cinfos)
{
  size_t rich_size = (chunk->n + 4) * sizeof(uint32_t) +
                     n_cinfos * sizeof(chunk_info);
  if (chunk->enc_size < rich_size) {
    size_t size = chunk->enc_size ? chunk->enc_size * 2 : 1;
    uint8_t *buf;
    while (size < rich_size) {
      size *= 2;
    }
    buf = GRN_REALLOC(chunk->enc_buf, size);
    if (!buf) {
      ERR(GRN_NO_MEMORY_AVAILABLE,
          "failed to allocate memory for encoding: size = %" GRN_FMT_SIZE,
          size);
      return ctx->rc;
    }
    chunk->enc_buf = buf;
    chunk->enc_size = size;
  }
  chunk->enc_offset = 0;
  return GRN_SUCCESS;
}

/*
 * grn_ii_builder_chunk_encode_buf encodes a chunk buffer.
 *
 * n_values integers from values are appended to enc_buf at enc_offset, and
 * enc_offset is advanced past them. With use_p_enc the values are passed to
 * pack() in units of UNIT_SIZE (the last unit may be shorter); otherwise each
 * value is written with GRN_B_ENC. No bounds check is done here; the caller
 * is expected to have reserved enc_buf beforehand.
 */
static void
grn_ii_builder_chunk_encode_buf(grn_ctx *ctx, grn_ii_builder_chunk *chunk,
                                uint32_t *values, uint32_t n_values,
                                grn_bool use_p_enc)
{
  uint8_t *p = chunk->enc_buf + chunk->enc_offset;
  uint32_t i;
  if (use_p_enc) {
    /*
     * freq[0] counts zero values; freq[w + 1] counts values for which
     * GRN_BIT_SCAN_REV yields w.
     */
    uint8_t freq[33];
    uint32_t buf[UNIT_SIZE];
    while (n_values >= UNIT_SIZE) {
      memset(freq, 0, 33);
      for (i = 0; i < UNIT_SIZE; i++) {
        buf[i] = values[i];
        if (buf[i]) {
          uint32_t w;
          GRN_BIT_SCAN_REV(buf[i], w);
          freq[w + 1]++;
        } else {
          freq[0]++;
        }
      }
      p = pack(buf, UNIT_SIZE, freq, p);
      values += UNIT_SIZE;
      n_values -= UNIT_SIZE;
    }
    if (n_values) {
      /* The remaining values form a final, shorter unit. */
      memset(freq, 0, 33);
      for (i = 0; i < n_values; i++) {
        buf[i] = values[i];
        if (buf[i]) {
          uint32_t w;
          GRN_BIT_SCAN_REV(buf[i], w);
          freq[w + 1]++;
        } else {
          freq[0]++;
        }
      }
      p = pack(buf, n_values, freq, p);
    }
  } else {
    for (i = 0; i < n_values; i++) {
      GRN_B_ENC(values[i], p);
    }
  }
  chunk->enc_offset = p - chunk->enc_buf;
}

/*
 * grn_ii_builder_chunk_encode encodes a chunk.
 *
 * The output in enc_buf consists of:
 *   1. when n_cinfos is not 0: n_cinfos followed by segno, size and dgap of
 *      every entry of cinfos,
 *   2. a header: either (flags << 1) followed by offset and, if pos_buf
 *      exists, pos_offset - offset; or, when no buffer uses pack(),
 *      (offset << 1) | 1,
 *   3. the buffers in the order rid, sid, freq, weight, pos (sid, weight and
 *      pos only if the buffer exists).
 * The flag bits are assigned in that same buffer order, skipping buffers
 * that do not exist.
 */
static grn_rc
grn_ii_builder_chunk_encode(grn_ctx *ctx, grn_ii_builder_chunk *chunk,
                            chunk_info *cinfos, uint32_t n_cinfos)
{
  grn_rc rc;
  uint8_t *p;
  uint8_t shift = 0, use_p_enc_flags = 0;
  uint8_t rid_use_p_enc, rest_use_p_enc, pos_use_p_enc = 0;

  /* Choose an encoding. */
  rid_use_p_enc = chunk->offset >= 16 && chunk->offset > (chunk->rid >> 8);
  use_p_enc_flags |= rid_use_p_enc << shift++;
  /* Section IDs, frequencies and weights share one decision. */
  rest_use_p_enc = chunk->offset >= 3;
  if (chunk->sid_buf) {
    use_p_enc_flags |= rest_use_p_enc << shift++;
  }
  use_p_enc_flags |= rest_use_p_enc << shift++;
  if (chunk->weight_buf) {
    use_p_enc_flags |= rest_use_p_enc << shift++;
  }
  if (chunk->pos_buf) {
    pos_use_p_enc = chunk->pos_offset >= 32 &&
                    chunk->pos_offset > (chunk->pos_sum >> 13);
    use_p_enc_flags |= pos_use_p_enc << shift++;
  }

  rc = grn_ii_builder_chunk_reserve_enc_buf(ctx, chunk, n_cinfos);
  if (rc != GRN_SUCCESS) {
    return rc;
  }

  /* Encode a header. */
  p = chunk->enc_buf;
  if (n_cinfos) {
    uint32_t i;
    GRN_B_ENC(n_cinfos, p);
    for (i = 0; i < n_cinfos; i++) {
      GRN_B_ENC(cinfos[i].segno, p);
      GRN_B_ENC(cinfos[i].size, p);
      GRN_B_ENC(cinfos[i].dgap, p);
    }
  }
  if (use_p_enc_flags) {
    /* The lowest bit is 0 here and 1 in the branch below. */
    GRN_B_ENC(use_p_enc_flags << 1, p);
    GRN_B_ENC(chunk->offset, p);
    if (chunk->pos_buf) {
      GRN_B_ENC(chunk->pos_offset - chunk->offset, p);
    }
  } else {
    GRN_B_ENC((chunk->offset << 1) | 1, p);
  }
  chunk->enc_offset = p - chunk->enc_buf;

  /* Encode a body. */
  grn_ii_builder_chunk_encode_buf(ctx, chunk, chunk->rid_buf, chunk->offset,
                                  rid_use_p_enc);
  if (chunk->sid_buf) {
    grn_ii_builder_chunk_encode_buf(ctx, chunk, chunk->sid_buf, chunk->offset,
                                    rest_use_p_enc);
  }
  grn_ii_builder_chunk_encode_buf(ctx, chunk, chunk->freq_buf, chunk->offset,
                                  rest_use_p_enc);
  if (chunk->weight_buf) {
    grn_ii_builder_chunk_encode_buf(ctx, chunk, chunk->weight_buf,
                                    chunk->offset, rest_use_p_enc);
  }
  if (chunk->pos_buf) {
    grn_ii_builder_chunk_encode_buf(ctx, chunk, chunk->pos_buf,
                                    chunk->pos_offset, pos_use_p_enc);
  }

  return GRN_SUCCESS;
}

/* State of a static index build. */
typedef struct {
  grn_ii                 *ii;     /* Building inverted index */
  grn_ii_builder_options options; /* Options */

  grn_obj  *src_table; /* Source table */
  grn_obj  **srcs;     /* Source columns (to be freed) */
  uint32_t n_srcs;     /* Number of source columns */
  uint8_t  sid_bits;   /* Number of bits for section ID */
  uint64_t sid_mask;   /* Mask bits for section ID */

  grn_obj  *lexicon;    /* Block lexicon (to be closed) */
  grn_obj  *tokenizer;  /* Lexicon's tokenizer */
  grn_obj  *normalizer; /* Lexicon's normalizer */

  uint32_t n;   /* Number of integers appended to the current block */
  grn_id   rid; /* Record ID */
  uint32_t sid; /* Section ID */
  uint32_t pos; /* Position */

  grn_ii_builder_term *terms;      /* Terms (to be freed) */
  uint32_t            n_terms;     /* Number of distinct terms */
  uint32_t            max_n_terms; /* Maximum number of distinct terms */
  uint32_t            terms_size;  /* Buffer size of terms */

  /* A temporary file to save blocks. */
  char    path[PATH_MAX];   /* File path */
  int     fd;               /* File descriptor (to be closed) */
  uint8_t *file_buf;        /* File buffer for buffered output (to be freed) */
  uint32_t file_buf_offset; /* File buffer write offset */

  grn_ii_builder_block *blocks;     /* Blocks (to be freed) */
  uint32_t             n_blocks;    /* Number of blocks */
  uint32_t             blocks_size; /* Buffer size of blocks */

  grn_ii_builder_buffer buf;   /* Buffer (to be finalized) */
  grn_ii_builder_chunk  chunk; /* Chunk (to be finalized) */

  uint32_t   df;          /* Document frequency (number of sections) */
  chunk_info *cinfos;     /* Chunk headers (to be freed) */
  uint32_t   n_cinfos;    /* Number of chunks */
  uint32_t   cinfos_size; /* Size of cinfos */
} grn_ii_builder;

/*
 * grn_ii_builder_init initializes a builder. Note that an initialized builder
 * must be finalized by grn_ii_builder_fin.
 *
 * The options are copied; block_threshold is overridden by
 * grn_ii_builder_block_threshold_force when that is greater than 0, and the
 * result is passed through grn_ii_builder_options_fix. No resource is
 * acquired here. Always returns GRN_SUCCESS.
 */
static grn_rc
grn_ii_builder_init(grn_ctx *ctx, grn_ii_builder *builder,
                    grn_ii *ii, const grn_ii_builder_options *options)
{
  builder->ii = ii;
  builder->options = *options;
  if (grn_ii_builder_block_threshold_force > 0) {
    builder->options.block_threshold = grn_ii_builder_block_threshold_force;
  }
  grn_ii_builder_options_fix(&builder->options);

  builder->src_table = NULL;
  builder->srcs = NULL;
  builder->n_srcs = 0;
  builder->sid_bits = 0;
  builder->sid_mask = 0;

  builder->lexicon = NULL;
  builder->tokenizer = NULL;
  builder->normalizer = NULL;

  builder->n = 0;
  builder->rid = GRN_ID_NIL;
  builder->sid = 0;
  builder->pos = 0;

  builder->terms = NULL;
  builder->n_terms = 0;
  builder->max_n_terms = 0;
  builder->terms_size = 0;

  builder->path[0] = '\0';
  /* -1 marks the temporary file as not opened. */
  builder->fd = -1;
  builder->file_buf = NULL;
  builder->file_buf_offset = 0;

  builder->blocks = NULL;
  builder->n_blocks = 0;
  builder->blocks_size = 0;

  grn_ii_builder_buffer_init(ctx, &builder->buf, ii);
  grn_ii_builder_chunk_init(ctx, &builder->chunk);

  builder->df = 0;
  builder->cinfos = NULL;
  builder->n_cinfos = 0;
  builder->cinfos_size = 0;

  return GRN_SUCCESS;
}

/* grn_ii_builder_fin_terms finalizes terms.
*/ static void grn_ii_builder_fin_terms(grn_ctx *ctx, grn_ii_builder *builder) { if (builder->terms) { uint32_t i; for (i = 0; i < builder->max_n_terms; i++) { grn_ii_builder_term_fin(ctx, &builder->terms[i]); } GRN_FREE(builder->terms); /* To avoid double finalization. */ builder->terms = NULL; } } /* grn_ii_builder_fin finalizes a builder. */ static grn_rc grn_ii_builder_fin(grn_ctx *ctx, grn_ii_builder *builder) { if (builder->cinfos) { GRN_FREE(builder->cinfos); } grn_ii_builder_chunk_fin(ctx, &builder->chunk); grn_ii_builder_buffer_fin(ctx, &builder->buf); if (builder->blocks) { uint32_t i; for (i = 0; i < builder->n_blocks; i++) { grn_ii_builder_block_fin(ctx, &builder->blocks[i]); } GRN_FREE(builder->blocks); } if (builder->file_buf) { GRN_FREE(builder->file_buf); } if (builder->fd != -1) { grn_close(builder->fd); if (grn_unlink(builder->path) == 0) { GRN_LOG(ctx, GRN_LOG_INFO, "[ii][builder][fin] removed path: <%-.256s>", builder->path); } else { ERRNO_ERR("[ii][builder][fin] failed to remove path: <%-.256s>", builder->path); } } grn_ii_builder_fin_terms(ctx, builder); if (builder->lexicon) { grn_obj_close(ctx, builder->lexicon); } if (builder->srcs) { GRN_FREE(builder->srcs); } return GRN_SUCCESS; } /* * grn_ii_builder_open creates a builder. Note that a builder must be closed by * grn_ii_builder_close. */ static grn_rc grn_ii_builder_open(grn_ctx *ctx, grn_ii *ii, const grn_ii_builder_options *options, grn_ii_builder **builder) { grn_rc rc; grn_ii_builder *new_builder = GRN_MALLOCN(grn_ii_builder, 1); if (!new_builder) { return GRN_NO_MEMORY_AVAILABLE; } if (!options) { options = &grn_ii_builder_default_options; } rc = grn_ii_builder_init(ctx, new_builder, ii, options); if (rc != GRN_SUCCESS) { GRN_FREE(new_builder); return rc; } *builder = new_builder; return GRN_SUCCESS; } /* grn_ii_builder_close closes a builder. 
*/ static grn_rc grn_ii_builder_close(grn_ctx *ctx, grn_ii_builder *builder) { grn_rc rc; if (!builder) { ERR(GRN_INVALID_ARGUMENT, "builder is null"); return ctx->rc; } rc = grn_ii_builder_fin(ctx, builder); GRN_FREE(builder); return rc; } /* grn_ii_builder_create_lexicon creates a block lexicon. */ static grn_rc grn_ii_builder_create_lexicon(grn_ctx *ctx, grn_ii_builder *builder) { grn_table_flags flags; grn_obj *domain = grn_ctx_at(ctx, builder->ii->lexicon->header.domain); grn_obj *range = grn_ctx_at(ctx, DB_OBJ(builder->ii->lexicon)->range); grn_obj *tokenizer, *normalizer, *token_filters; grn_rc rc = grn_table_get_info(ctx, builder->ii->lexicon, &flags, NULL, &tokenizer, &normalizer, &token_filters); if (rc != GRN_SUCCESS) { return rc; } flags &= ~GRN_OBJ_PERSISTENT; builder->lexicon = grn_table_create(ctx, NULL, 0, NULL, flags, domain, range); if (!builder->lexicon) { if (ctx->rc == GRN_SUCCESS) { ERR(GRN_UNKNOWN_ERROR, "[index] failed to create a block lexicon"); } return ctx->rc; } builder->tokenizer = tokenizer; builder->normalizer = normalizer; rc = grn_obj_set_info(ctx, builder->lexicon, GRN_INFO_DEFAULT_TOKENIZER, tokenizer); if (rc == GRN_SUCCESS) { rc = grn_obj_set_info(ctx, builder->lexicon, GRN_INFO_NORMALIZER, normalizer); if (rc == GRN_SUCCESS) { rc = grn_obj_set_info(ctx, builder->lexicon, GRN_INFO_TOKEN_FILTERS, token_filters); } } if (rc != GRN_SUCCESS) { return rc; } if ((flags & GRN_OBJ_TABLE_TYPE_MASK) == GRN_OBJ_TABLE_PAT_KEY) { if (builder->options.lexicon_cache_size) { rc = grn_pat_cache_enable(ctx, (grn_pat *)builder->lexicon, builder->options.lexicon_cache_size); if (rc != GRN_SUCCESS) { return rc; } } } return GRN_SUCCESS; } /* * grn_ii_builder_extend_terms extends a buffer for terms in order to make * terms[n_terms - 1] available. 
*/ static grn_rc grn_ii_builder_extend_terms(grn_ctx *ctx, grn_ii_builder *builder, uint32_t n_terms) { if (n_terms <= builder->n_terms) { return GRN_SUCCESS; } if (n_terms > builder->max_n_terms) { uint32_t i; if (n_terms > builder->terms_size) { /* Resize builder->terms for new terms. */ size_t n_bytes; uint32_t terms_size = builder->terms_size ? builder->terms_size * 2 : 1; grn_ii_builder_term *terms; while (terms_size < n_terms) { terms_size *= 2; } n_bytes = terms_size * sizeof(grn_ii_builder_term); terms = (grn_ii_builder_term *)GRN_REALLOC(builder->terms, n_bytes); if (!terms) { ERR(GRN_NO_MEMORY_AVAILABLE, "failed to allocate memory for terms: n_bytes = %" GRN_FMT_SIZE, n_bytes); return ctx->rc; } builder->terms = terms; builder->terms_size = terms_size; } /* Initialize new terms. */ for (i = builder->max_n_terms; i < n_terms; i++) { grn_ii_builder_term_init(ctx, &builder->terms[i]); } builder->max_n_terms = n_terms; } builder->n += n_terms - builder->n_terms; builder->n_terms = n_terms; return GRN_SUCCESS; } /* grn_ii_builder_get_term gets a term associated with tid. */ inline static grn_rc grn_ii_builder_get_term(grn_ctx *ctx, grn_ii_builder *builder, grn_id tid, grn_ii_builder_term **term) { uint32_t n_terms = tid; if (n_terms > builder->n_terms) { grn_rc rc = grn_ii_builder_extend_terms(ctx, builder, n_terms); if (rc != GRN_SUCCESS) { return rc; } } *term = &builder->terms[tid - 1]; return GRN_SUCCESS; } /* grn_ii_builder_flush_file_buf flushes buffered data as a block. 
*/ static grn_rc grn_ii_builder_flush_file_buf(grn_ctx *ctx, grn_ii_builder *builder) { if (builder->file_buf_offset) { ssize_t size = grn_write(builder->fd, builder->file_buf, builder->file_buf_offset); if ((uint64_t)size != builder->file_buf_offset) { SERR("failed to write data: expected = %u, actual = %" GRN_FMT_INT64D, builder->file_buf_offset, (int64_t)size); } builder->file_buf_offset = 0; } return GRN_SUCCESS; } /* grn_ii_builder_flush_term flushes a term and clears it */ static grn_rc grn_ii_builder_flush_term(grn_ctx *ctx, grn_ii_builder *builder, grn_ii_builder_term *term) { grn_rc rc; uint8_t *term_buf; /* Append sentinels. */ if (term->rid != GRN_ID_NIL) { if (builder->ii->header->flags & GRN_OBJ_WITH_POSITION) { rc = grn_ii_builder_term_append(ctx, term, 0); } else { rc = grn_ii_builder_term_append(ctx, term, term->pos_or_freq); } if (rc != GRN_SUCCESS) { return rc; } } rc = grn_ii_builder_term_append(ctx, term, 0); if (rc != GRN_SUCCESS) { return rc; } { /* Put the global term ID. 
*/ int key_size; char key[GRN_TABLE_MAX_KEY_SIZE]; uint8_t *p; uint32_t rest, value; grn_rc rc; grn_id local_tid = term - builder->terms + 1, global_tid; key_size = grn_table_get_key(ctx, builder->lexicon, local_tid, key, GRN_TABLE_MAX_KEY_SIZE); if (!key_size) { if (ctx->rc == GRN_SUCCESS) { ERR(GRN_UNKNOWN_ERROR, "failed to get key: tid = %u", local_tid); } return ctx->rc; } global_tid = grn_table_add(ctx, builder->ii->lexicon, key, key_size, NULL); if (global_tid == GRN_ID_NIL) { if (ctx->rc == GRN_SUCCESS) { ERR(GRN_UNKNOWN_ERROR, "failed to get global term ID: tid = %u, key = \"%.*s\"", local_tid, key_size, key); } return ctx->rc; } rest = builder->options.file_buf_size - builder->file_buf_offset; if (rest < 10) { rc = grn_ii_builder_flush_file_buf(ctx, builder); if (rc != GRN_SUCCESS) { return rc; } } value = global_tid; p = builder->file_buf + builder->file_buf_offset; if (value < 1U << 5) { p[0] = (uint8_t)value; builder->file_buf_offset++; } else if (value < 1U << 13) { p[0] = (uint8_t)((value & 0x1f) | (1 << 5)); p[1] = (uint8_t)(value >> 5); builder->file_buf_offset += 2; } else { uint8_t i, n; if (value < 1U << 21) { n = 3; } else if (value < 1U << 29) { n = 4; } else { n = 5; } p[0] = (uint8_t)(value & 0x1f) | ((n - 1) << 5); value >>= 5; for (i = 1; i < n; i++) { p[i] = (uint8_t)value; value >>= 8; } builder->file_buf_offset += n; } } /* Flush a term buffer. 
*/ term_buf = grn_ii_builder_term_get_buf(term); if (term->offset > builder->options.file_buf_size) { ssize_t size; rc = grn_ii_builder_flush_file_buf(ctx, builder); if (rc != GRN_SUCCESS) { return rc; } size = grn_write(builder->fd, term_buf, term->offset); if ((uint64_t)size != term->offset) { SERR("failed to write data: expected = %u, actual = %" GRN_FMT_INT64D, term->offset, (int64_t)size); } } else { uint32_t rest = builder->options.file_buf_size - builder->file_buf_offset; if (term->offset <= rest) { grn_memcpy(builder->file_buf + builder->file_buf_offset, term_buf, term->offset); builder->file_buf_offset += term->offset; } else { grn_memcpy(builder->file_buf + builder->file_buf_offset, term_buf, rest); builder->file_buf_offset += rest; rc = grn_ii_builder_flush_file_buf(ctx, builder); if (rc != GRN_SUCCESS) { return rc; } builder->file_buf_offset = term->offset - rest; grn_memcpy(builder->file_buf, term_buf + rest, builder->file_buf_offset); } } grn_ii_builder_term_reinit(ctx, term); return GRN_SUCCESS; } /* * grn_ii_builder_create_file creates a temporary file and allocates memory for * buffered output. */ static grn_rc grn_ii_builder_create_file(grn_ctx *ctx, grn_ii_builder *builder) { grn_snprintf(builder->path, PATH_MAX, PATH_MAX, "%-.256sXXXXXX", grn_io_path(builder->ii->seg)); builder->fd = grn_mkstemp(builder->path); if (builder->fd == -1) { SERR("failed to create a temporary file: path = \"%-.256s\"", builder->path); return ctx->rc; } builder->file_buf = (uint8_t *)GRN_MALLOC(builder->options.file_buf_size); if (!builder->file_buf) { ERR(GRN_NO_MEMORY_AVAILABLE, "failed to allocate memory for buffered output: size = %u", builder->options.file_buf_size); return ctx->rc; } return GRN_SUCCESS; } /* grn_ii_builder_register_block registers a block. 
*/ static grn_rc grn_ii_builder_register_block(grn_ctx *ctx, grn_ii_builder *builder) { grn_ii_builder_block *block; uint64_t file_offset = grn_lseek(builder->fd, 0, SEEK_CUR); if (file_offset == (uint64_t)-1) { SERR("failed to get file offset"); return ctx->rc; } if (builder->n_blocks >= builder->blocks_size) { size_t n_bytes; uint32_t blocks_size = 1; grn_ii_builder_block *blocks; while (blocks_size <= builder->n_blocks) { blocks_size *= 2; } n_bytes = blocks_size * sizeof(grn_ii_builder_block); blocks = (grn_ii_builder_block *)GRN_REALLOC(builder->blocks, n_bytes); if (!blocks) { ERR(GRN_NO_MEMORY_AVAILABLE, "failed to allocate memory for block: n_bytes = %" GRN_FMT_SIZE, n_bytes); return ctx->rc; } builder->blocks = blocks; builder->blocks_size = blocks_size; } block = &builder->blocks[builder->n_blocks]; grn_ii_builder_block_init(ctx, block); if (!builder->n_blocks) { block->offset = 0; } else { grn_ii_builder_block *prev_block = &builder->blocks[builder->n_blocks - 1]; block->offset = prev_block->offset + prev_block->rest; } block->rest = (uint32_t)(file_offset - block->offset); builder->n_blocks++; return GRN_SUCCESS; } /* grn_ii_builder_flush_block flushes a block to a temporary file. */ static grn_rc grn_ii_builder_flush_block(grn_ctx *ctx, grn_ii_builder *builder) { grn_rc rc; grn_table_cursor *cursor; if (!builder->n) { /* Do nothing if there are no output data. */ return GRN_SUCCESS; } if (builder->fd == -1) { rc = grn_ii_builder_create_file(ctx, builder); if (rc != GRN_SUCCESS) { return rc; } } /* Flush terms into a temporary file. 
*/ cursor = grn_table_cursor_open(ctx, builder->lexicon, NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_KEY); for (;;) { grn_id tid = grn_table_cursor_next(ctx, cursor); if (tid == GRN_ID_NIL) { break; } rc = grn_ii_builder_flush_term(ctx, builder, &builder->terms[tid - 1]); if (rc != GRN_SUCCESS) { return rc; } } grn_table_cursor_close(ctx, cursor); rc = grn_ii_builder_flush_file_buf(ctx, builder); if (rc != GRN_SUCCESS) { return rc; } /* Register a block and clear the current data. */ rc = grn_ii_builder_register_block(ctx, builder); if (rc != GRN_SUCCESS) { return rc; } rc = grn_table_truncate(ctx, builder->lexicon); if (rc != GRN_SUCCESS) { return rc; } builder->rid = GRN_ID_NIL; builder->n_terms = 0; builder->n = 0; return GRN_SUCCESS; } /* grn_ii_builder_append_token appends a token. */ static grn_rc grn_ii_builder_append_token(grn_ctx *ctx, grn_ii_builder *builder, grn_id rid, uint32_t sid, uint32_t weight, grn_id tid, uint32_t pos) { grn_rc rc; uint32_t ii_flags = builder->ii->header->flags; grn_ii_builder_term *term; rc = grn_ii_builder_get_term(ctx, builder, tid, &term); if (rc != GRN_SUCCESS) { return rc; } if (rid != term->rid || sid != term->sid) { uint64_t rsid; if (term->rid != GRN_ID_NIL) { if (ii_flags & GRN_OBJ_WITH_POSITION) { /* Append the end of positions. */ rc = grn_ii_builder_term_append(ctx, term, 0); if (rc != GRN_SUCCESS) { return rc; } builder->n++; } else { /* Append a frequency if positions are not available. 
*/ rc = grn_ii_builder_term_append(ctx, term, term->pos_or_freq); if (rc != GRN_SUCCESS) { return rc; } builder->n++; } } rsid = ((uint64_t)(rid - term->rid) << builder->sid_bits) | (sid - 1); rc = grn_ii_builder_term_append(ctx, term, rsid); if (rc != GRN_SUCCESS) { return rc; } builder->n++; if (ii_flags & GRN_OBJ_WITH_WEIGHT) { rc = grn_ii_builder_term_append(ctx, term, weight); if (rc != GRN_SUCCESS) { return rc; } builder->n++; } term->rid = rid; term->sid = sid; term->pos_or_freq = 0; } if (ii_flags & GRN_OBJ_WITH_POSITION) { rc = grn_ii_builder_term_append(ctx, term, pos - term->pos_or_freq); if (rc != GRN_SUCCESS) { return rc; } builder->n++; term->pos_or_freq = pos; } else { term->pos_or_freq++; } return GRN_SUCCESS; } /* * grn_ii_builder_append_value appends a value. Note that values must be * appended in ascending rid and sid order. */ static grn_rc grn_ii_builder_append_value(grn_ctx *ctx, grn_ii_builder *builder, grn_id rid, uint32_t sid, uint32_t weight, const char *value, uint32_t value_size) { uint32_t pos = 0; grn_token_cursor *cursor; if (rid != builder->rid) { builder->rid = rid; builder->sid = sid; builder->pos = 1; } else if (sid != builder->sid) { builder->sid = sid; builder->pos = 1; } else { /* Insert a space between values. 
*/ builder->pos++; } if (value_size) { if (!builder->tokenizer && !builder->normalizer) { grn_id tid; switch (builder->lexicon->header.type) { case GRN_TABLE_PAT_KEY : tid = grn_pat_add(ctx, (grn_pat *)builder->lexicon, value, value_size, NULL, NULL); break; case GRN_TABLE_DAT_KEY : tid = grn_dat_add(ctx, (grn_dat *)builder->lexicon, value, value_size, NULL, NULL); break; case GRN_TABLE_HASH_KEY : tid = grn_hash_add(ctx, (grn_hash *)builder->lexicon, value, value_size, NULL, NULL); break; case GRN_TABLE_NO_KEY : tid = *(grn_id *)value; break; default : tid = GRN_ID_NIL; break; } if (tid != GRN_ID_NIL) { grn_rc rc; pos = builder->pos; rc = grn_ii_builder_append_token(ctx, builder, rid, sid, weight, tid, pos); if (rc != GRN_SUCCESS) { return rc; } } } else { cursor = grn_token_cursor_open(ctx, builder->lexicon, value, value_size, GRN_TOKEN_ADD, 0); if (!cursor) { if (ctx->rc == GRN_SUCCESS) { ERR(GRN_UNKNOWN_ERROR, "grn_token_cursor_open failed: value = <%.*s>", value_size, value); } return ctx->rc; } while (cursor->status == GRN_TOKEN_CURSOR_DOING) { grn_id tid = grn_token_cursor_next(ctx, cursor); if (tid != GRN_ID_NIL) { grn_rc rc; pos = builder->pos + cursor->pos; rc = grn_ii_builder_append_token(ctx, builder, rid, sid, weight, tid, pos); if (rc != GRN_SUCCESS) { break; } } } grn_token_cursor_close(ctx, cursor); } } builder->pos = pos + 1; return ctx->rc; } /* grn_ii_builder_append_obj appends a BULK, UVECTOR or VECTOR object. 
*/ static grn_rc grn_ii_builder_append_obj(grn_ctx *ctx, grn_ii_builder *builder, grn_id rid, uint32_t sid, grn_obj *obj) { switch (obj->header.type) { case GRN_BULK : return grn_ii_builder_append_value(ctx, builder, rid, sid, 0, GRN_TEXT_VALUE(obj), GRN_TEXT_LEN(obj)); case GRN_UVECTOR : { const char *p = GRN_BULK_HEAD(obj); uint32_t i, n_values = grn_uvector_size(ctx, obj); uint32_t value_size = grn_uvector_element_size(ctx, obj); for (i = 0; i < n_values; i++) { grn_rc rc = grn_ii_builder_append_value(ctx, builder, rid, sid, 0, p, value_size); if (rc != GRN_SUCCESS) { return rc; } p += value_size; } } return GRN_SUCCESS; case GRN_VECTOR : if (obj->u.v.body) { /* * Note that the following sections and n_sections don't correspond to * source columns. */ int i, n_secs = obj->u.v.n_sections; grn_section *secs = obj->u.v.sections; const char *head = GRN_BULK_HEAD(obj->u.v.body); for (i = 0; i < n_secs; i++) { grn_rc rc; grn_section *sec = &secs[i]; if (sec->length == 0) { continue; } if (builder->tokenizer) { sid = i + 1; } rc = grn_ii_builder_append_value(ctx, builder, rid, sid, sec->weight, head + sec->offset, sec->length); if (rc != GRN_SUCCESS) { return rc; } } } return GRN_SUCCESS; default : ERR(GRN_INVALID_ARGUMENT, "[index] invalid object assigned as value"); return ctx->rc; } } /* * grn_ii_builder_append_srcs reads values from source columns and appends the * values. */ static grn_rc grn_ii_builder_append_srcs(grn_ctx *ctx, grn_ii_builder *builder) { size_t i; grn_rc rc = GRN_SUCCESS; grn_obj *objs; grn_table_cursor *cursor; /* Allocate memory for objects to store source values. */ objs = GRN_MALLOCN(grn_obj, builder->n_srcs); if (!objs) { ERR(GRN_NO_MEMORY_AVAILABLE, "failed to allocate memory for objs: n_srcs = %u", builder->n_srcs); return ctx->rc; } /* Create a cursor to get records in the ID order. 
*/ cursor = grn_table_cursor_open(ctx, builder->src_table, NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_ID); if (!cursor) { if (ctx->rc == GRN_SUCCESS) { ERR(GRN_OBJECT_CORRUPT, "[index] failed to open table cursor"); } GRN_FREE(objs); return ctx->rc; } /* Read source values and append it. */ for (i = 0; i < builder->n_srcs; i++) { GRN_TEXT_INIT(&objs[i], 0); } while (rc == GRN_SUCCESS) { grn_id rid = grn_table_cursor_next(ctx, cursor); if (rid == GRN_ID_NIL) { break; } for (i = 0; i < builder->n_srcs; i++) { grn_obj *obj = &objs[i]; grn_obj *src = builder->srcs[i]; rc = grn_obj_reinit_for(ctx, obj, src); if (rc == GRN_SUCCESS) { if (GRN_OBJ_TABLEP(src)) { int len = grn_table_get_key2(ctx, src, rid, obj); if (len <= 0) { if (ctx->rc == GRN_SUCCESS) { ERR(GRN_UNKNOWN_ERROR, "failed to get key: rid = %u, len = %d", rid, len); } rc = ctx->rc; } } else { if (!grn_obj_get_value(ctx, src, rid, obj)) { if (ctx->rc == GRN_SUCCESS) { ERR(GRN_UNKNOWN_ERROR, "failed to get value: rid = %u", rid); } rc = ctx->rc; } } if (rc == GRN_SUCCESS) { uint32_t sid = (uint32_t)(i + 1); rc = grn_ii_builder_append_obj(ctx, builder, rid, sid, obj); } } } if (rc == GRN_SUCCESS && builder->n >= builder->options.block_threshold) { rc = grn_ii_builder_flush_block(ctx, builder); } } if (rc == GRN_SUCCESS) { rc = grn_ii_builder_flush_block(ctx, builder); } for (i = 0; i < builder->n_srcs; i++) { GRN_OBJ_FIN(ctx, &objs[i]); } grn_table_cursor_close(ctx, cursor); GRN_FREE(objs); return rc; } /* grn_ii_builder_set_src_table sets a source table. */ static grn_rc grn_ii_builder_set_src_table(grn_ctx *ctx, grn_ii_builder *builder) { builder->src_table = grn_ctx_at(ctx, DB_OBJ(builder->ii)->range); if (!builder->src_table) { if (ctx->rc == GRN_SUCCESS) { ERR(GRN_INVALID_ARGUMENT, "source table is null: range = %d", DB_OBJ(builder->ii)->range); } return ctx->rc; } return GRN_SUCCESS; } /* grn_ii_builder_set_sid_bits calculates sid_bits and sid_mask. 
*/
static grn_rc
grn_ii_builder_set_sid_bits(grn_ctx *ctx, grn_ii_builder *builder)
{
  /*
   * sid_bits is the number of bits required to represent a section ID.
   *
   * If the only source is a vector column and the index has a tokenizer,
   * the maximum sid equals the maximum number of elements in a record, so
   * the whole source table is scanned to find it.
   */
  if (builder->n_srcs == 1 && builder->tokenizer &&
      (builder->srcs[0]->header.flags & GRN_OBJ_COLUMN_VECTOR) != 0) {
    size_t max_elems = 0;
    grn_id rid;
    grn_obj value;
    grn_table_cursor *cursor;

    cursor = grn_table_cursor_open(ctx, builder->src_table, NULL, 0, NULL, 0,
                                   0, -1, GRN_CURSOR_BY_ID);
    if (!cursor) {
      if (ctx->rc == GRN_SUCCESS) {
        ERR(GRN_OBJECT_CORRUPT, "[index] failed to open table cursor");
      }
      return ctx->rc;
    }

    GRN_TEXT_INIT(&value, 0);
    while ((rid = grn_table_cursor_next(ctx, cursor)) != GRN_ID_NIL) {
      /* Records whose value cannot be read are skipped. */
      if (grn_obj_get_value(ctx, builder->srcs[0], rid, &value) &&
          value.u.v.n_sections > (int) max_elems) {
        max_elems = value.u.v.n_sections;
      }
    }
    GRN_OBJ_FIN(ctx, &value);
    grn_table_cursor_close(ctx, cursor);

    while (((uint32_t)1 << builder->sid_bits) < max_elems) {
      builder->sid_bits++;
    }
  }

  /* Otherwise, or if the scan left sid_bits at 0, use the source count. */
  if (builder->sid_bits == 0) {
    while (((uint32_t)1 << builder->sid_bits) < builder->n_srcs) {
      builder->sid_bits++;
    }
  }

  /* sid_mask selects the low sid_bits bits. */
  builder->sid_mask = ((uint64_t)1 << builder->sid_bits) - 1;
  return GRN_SUCCESS;
}

/* grn_ii_builder_set_srcs sets source columns.
*/ static grn_rc grn_ii_builder_set_srcs(grn_ctx *ctx, grn_ii_builder *builder) { size_t i; grn_id *source; builder->n_srcs = builder->ii->obj.source_size / sizeof(grn_id); source = (grn_id *)builder->ii->obj.source; if (!source || !builder->n_srcs) { ERR(GRN_INVALID_ARGUMENT, "source is not available: source = %p, source_size = %u", builder->ii->obj.source, builder->ii->obj.source_size); return ctx->rc; } builder->srcs = GRN_MALLOCN(grn_obj *, builder->n_srcs); if (!builder->srcs) { return GRN_NO_MEMORY_AVAILABLE; } for (i = 0; i < builder->n_srcs; i++) { builder->srcs[i] = grn_ctx_at(ctx, source[i]); if (!builder->srcs[i]) { if (ctx->rc == GRN_SUCCESS) { ERR(GRN_OBJECT_CORRUPT, "source not found: id = %d", source[i]); } return ctx->rc; } } return grn_ii_builder_set_sid_bits(ctx, builder); } /* grn_ii_builder_append_source appends values in source columns. */ static grn_rc grn_ii_builder_append_source(grn_ctx *ctx, grn_ii_builder *builder) { grn_rc rc = grn_ii_builder_set_src_table(ctx, builder); if (rc != GRN_SUCCESS) { return rc; } if (grn_table_size(ctx, builder->src_table) == 0) { /* Nothing to do because there are no values. */ return ctx->rc; } /* Create a block lexicon. */ rc = grn_ii_builder_create_lexicon(ctx, builder); if (rc != GRN_SUCCESS) { return rc; } rc = grn_ii_builder_set_srcs(ctx, builder); if (rc != GRN_SUCCESS) { return rc; } rc = grn_ii_builder_append_srcs(ctx, builder); if (rc != GRN_SUCCESS) { return rc; } grn_ii_builder_fin_terms(ctx, builder); return GRN_SUCCESS; } /* * grn_ii_builder_fill_block reads the next data from a temporary file and fill * a block buffer. 
*/ static grn_rc grn_ii_builder_fill_block(grn_ctx *ctx, grn_ii_builder *builder, uint32_t block_id) { ssize_t size; uint32_t buf_rest; uint64_t file_offset; grn_ii_builder_block *block = &builder->blocks[block_id]; if (!block->rest) { return GRN_END_OF_DATA; } if (!block->buf) { block->buf = (uint8_t *)GRN_MALLOC(builder->options.block_buf_size); if (!block->buf) { ERR(GRN_NO_MEMORY_AVAILABLE, "failed to allocate memory for buffered input: size = %u", builder->options.block_buf_size); return ctx->rc; } } /* Move the remaining data to the head. */ buf_rest = block->end - block->cur; if (buf_rest) { grn_memmove(block->buf, block->cur, buf_rest); } block->cur = block->buf; block->end = block->buf + buf_rest; /* Read the next data. */ file_offset = grn_lseek(builder->fd, block->offset, SEEK_SET); if (file_offset != block->offset) { SERR("failed to seek file: expected = %" GRN_FMT_INT64U ", actual = %" GRN_FMT_INT64D, block->offset, file_offset); return ctx->rc; } buf_rest = builder->options.block_buf_size - buf_rest; if (block->rest < buf_rest) { buf_rest = block->rest; } size = grn_read(builder->fd, block->end, buf_rest); if (size <= 0) { SERR("failed to read data: expected = %u, actual = %" GRN_FMT_INT64D, buf_rest, (int64_t)size); return ctx->rc; } block->offset += size; block->rest -= size; block->end += size; return GRN_SUCCESS; } /* grn_ii_builder_read_from_block reads the next value from a block. */ static grn_rc grn_ii_builder_read_from_block(grn_ctx *ctx, grn_ii_builder *builder, uint32_t block_id, uint64_t *value) { grn_ii_builder_block *block = &builder->blocks[block_id]; grn_rc rc = grn_ii_builder_block_next(ctx, block, value); if (rc == GRN_SUCCESS) { return GRN_SUCCESS; } else if (rc == GRN_END_OF_DATA) { rc = grn_ii_builder_fill_block(ctx, builder, block_id); if (rc != GRN_SUCCESS) { return rc; } return grn_ii_builder_block_next(ctx, block, value); } return rc; } /* grn_ii_builder_pack_chunk tries to pack a chunk. 
*/
/*
 * Packing stores the single posting of a term directly in the two-element
 * array entry returned by array_get() for the term, instead of writing a
 * chunk. It is done only if all of the following hold:
 *   - the chunk holds exactly one record (chunk->offset == 1),
 *   - the weight, if weights are stored, is 0,
 *   - the stored frequency is 0 (frequencies are stored as freq - 1),
 *   - if section IDs are stored, rid < 0x100000 and sid < 0x800.
 *
 * *packed is set to GRN_TRUE if the chunk has been packed (the chunk is then
 * cleared) and to GRN_FALSE otherwise. GRN_SUCCESS is returned in both cases;
 * an error is returned only if the array entry is not available.
 */
static grn_rc
grn_ii_builder_pack_chunk(grn_ctx *ctx, grn_ii_builder *builder,
                          grn_bool *packed)
{
  grn_id rid;
  uint32_t sid, pos, *a;
  grn_ii_builder_chunk *chunk = &builder->chunk;
  *packed = GRN_FALSE;
  if (chunk->offset != 1) { /* df != 1 */
    return GRN_SUCCESS;
  }
  if (chunk->weight_buf && chunk->weight_buf[0]) { /* weight != 0 */
    return GRN_SUCCESS;
  }
  if (chunk->freq_buf[0] != 0) { /* freq != 1 */
    return GRN_SUCCESS;
  }
  rid = chunk->rid_buf[0];
  if (chunk->sid_buf) {
    /* With sections: rid and sid share a[0], so both must be small enough. */
    if (rid >= 0x100000) {
      return GRN_SUCCESS;
    }
    sid = chunk->sid_buf[0] + 1;
    if (sid >= 0x800) {
      return GRN_SUCCESS;
    }
    a = array_get(ctx, builder->ii, chunk->tid);
    if (!a) {
      DEFINE_NAME(builder->ii);
      MERR("[ii][builder][chunk][pack] failed to allocate an array: "
           "<%.*s>: "
           "<%u>:<%u>:<%u>",
           name_size, name,
           rid, sid, chunk->tid);
      return ctx->rc;
    }
    /* The lowest bit of a[0] is always set for a packed entry. */
    a[0] = ((rid << 12) + (sid << 1)) | 1;
  } else {
    a = array_get(ctx, builder->ii, chunk->tid);
    if (!a) {
      DEFINE_NAME(builder->ii);
      MERR("[ii][builder][chunk][pack] failed to allocate an array: "
           "<%.*s>: "
           "<%u>:<%u>",
           name_size, name,
           rid, chunk->tid);
      return ctx->rc;
    }
    a[0] = (rid << 1) | 1;
  }
  /* a[1] holds the first stored position, or 0 if positions are not stored. */
  pos = 0;
  if (chunk->pos_buf) {
    pos = chunk->pos_buf[0];
  }
  a[1] = pos;
  array_unref(builder->ii, chunk->tid);
  *packed = GRN_TRUE;

  grn_ii_builder_chunk_clear(ctx, chunk);
  return GRN_SUCCESS;
}

/*
 * grn_ii_builder_get_cinfo returns a new cinfo.
 *
 * The next unused element of builder->cinfos is stored in *cinfo and
 * n_cinfos is incremented. The array is doubled (starting from 1 element)
 * when it is full. The returned element is not initialized here.
 */
static grn_rc
grn_ii_builder_get_cinfo(grn_ctx *ctx, grn_ii_builder *builder,
                         chunk_info **cinfo)
{
  if (builder->n_cinfos == builder->cinfos_size) {
    uint32_t size = builder->cinfos_size ? (builder->cinfos_size * 2) : 1;
    size_t n_bytes = size * sizeof(chunk_info);
    chunk_info *cinfos = (chunk_info *)GRN_REALLOC(builder->cinfos, n_bytes);
    if (!cinfos) {
      ERR(GRN_NO_MEMORY_AVAILABLE,
          "failed to allocate memory for cinfos: n_bytes = %" GRN_FMT_SIZE,
          n_bytes);
      return ctx->rc;
    }
    builder->cinfos = cinfos;
    builder->cinfos_size = size;
  }
  *cinfo = &builder->cinfos[builder->n_cinfos++];
  return GRN_SUCCESS;
}

/*
 * grn_ii_builder_flush_chunk flushes a chunk.
 *
 * The chunk is encoded, space for the encoded bytes is obtained with
 * chunk_new(), and the bytes are copied into the chunk segments of the
 * index, continuing into the following segments if they do not fit in the
 * first one. A cinfo describing the written chunk (chunk ID, size and
 * rid_gap) is then appended to builder->cinfos, the total chunk size in the
 * index header is updated, and the chunk is cleared.
 */
static grn_rc
grn_ii_builder_flush_chunk(grn_ctx *ctx, grn_ii_builder *builder)
{
  grn_rc rc;
  chunk_info *cinfo = NULL;
  grn_ii_builder_chunk *chunk = &builder->chunk;
  void *seg;
  uint8_t *in;
  uint32_t in_size, chunk_id, seg_id, seg_offset, seg_rest;

  rc = grn_ii_builder_chunk_encode(ctx, chunk, NULL, 0);
  if (rc != GRN_SUCCESS) {
    return rc;
  }
  /* in and in_size track the encoded bytes that are still to be copied. */
  in = chunk->enc_buf;
  in_size = chunk->enc_offset;

  rc = chunk_new(ctx, builder->ii, &chunk_id, chunk->enc_offset);
  if (rc != GRN_SUCCESS) {
    return rc;
  }

  /* Copy to the first segment. */
  /*
   * The upper bits of chunk_id select the segment and the low
   * GRN_II_N_CHUNK_VARIATION bits select the offset inside it, in units of
   * (1 << GRN_II_W_LEAST_CHUNK) bytes.
   */
  seg_id = chunk_id >> GRN_II_N_CHUNK_VARIATION;
  seg_offset = (chunk_id & ((1 << GRN_II_N_CHUNK_VARIATION) - 1)) <<
               GRN_II_W_LEAST_CHUNK;
  GRN_IO_SEG_REF(builder->ii->chunk, seg_id, seg);
  if (!seg) {
    if (ctx->rc == GRN_SUCCESS) {
      ERR(GRN_UNKNOWN_ERROR,
          "failed access chunk segment: chunk_id = %u, seg_id = %u",
          chunk_id, seg_id);
    }
    return ctx->rc;
  }
  seg_rest = S_CHUNK - seg_offset;
  if (in_size <= seg_rest) {
    grn_memcpy((uint8_t *)seg + seg_offset, in, in_size);
    in_size = 0;
  } else {
    grn_memcpy((uint8_t *)seg + seg_offset, in, seg_rest);
    in += seg_rest;
    in_size -= seg_rest;
  }
  GRN_IO_SEG_UNREF(builder->ii->chunk, seg_id);

  /* Copy to the next segments. */
  while (in_size) {
    seg_id++;
    GRN_IO_SEG_REF(builder->ii->chunk, seg_id, seg);
    if (!seg) {
      if (ctx->rc == GRN_SUCCESS) {
        ERR(GRN_UNKNOWN_ERROR,
            "failed access chunk segment: chunk_id = %u, seg_id = %u",
            chunk_id, seg_id);
      }
      return ctx->rc;
    }
    if (in_size <= S_CHUNK) {
      grn_memcpy(seg, in, in_size);
      in_size = 0;
    } else {
      grn_memcpy(seg, in, S_CHUNK);
      in += S_CHUNK;
      in_size -= S_CHUNK;
    }
    GRN_IO_SEG_UNREF(builder->ii->chunk, seg_id);
  }

  /* Append a cinfo. */
  rc = grn_ii_builder_get_cinfo(ctx, builder, &cinfo);
  if (rc != GRN_SUCCESS) {
    return rc;
  }
  cinfo->segno = chunk_id;
  cinfo->size = chunk->enc_offset;
  cinfo->dgap = chunk->rid_gap;

  builder->buf.ii->header->total_chunk_size += chunk->enc_offset;
  grn_ii_builder_chunk_clear(ctx, chunk);
  return GRN_SUCCESS;
}

/*
 * grn_ii_builder_read_to_chunk reads values from a block to a chunk.
 *
 * It reads the postings of the current term of the block until the
 * terminating 0 and stores them in builder->chunk. The chunk is flushed on a
 * record boundary when chunk->n has reached options.chunk_threshold. After
 * the postings, the next term ID of the block is read into
 * blocks[block_id].tid; GRN_ID_NIL is stored if the block has no more data.
 *
 * NOTE(review): the loop body declares its own `value`, which shadows the
 * function-level `value` used for the term ID read after the loop.
 */
static grn_rc
grn_ii_builder_read_to_chunk(grn_ctx *ctx, grn_ii_builder *builder,
                             uint32_t block_id)
{
  grn_rc rc;
  uint64_t value;
  uint32_t rid = GRN_ID_NIL, last_sid = 0;
  uint32_t ii_flags = builder->ii->header->flags;
  grn_ii_builder_chunk *chunk = &builder->chunk;

  for (;;) {
    uint32_t gap, freq;
    uint64_t value;
    rc = grn_ii_builder_read_from_block(ctx, builder, block_id, &value);
    if (rc != GRN_SUCCESS) {
      return rc;
    }
    /* A 0 in place of a record/section value ends the postings of the term. */
    if (!value) {
      break;
    }
    if (builder->chunk.offset == builder->chunk.size) {
      rc = grn_ii_builder_chunk_extend_bufs(ctx, chunk, ii_flags);
      if (rc != GRN_SUCCESS) {
        return rc;
      }
    }

    /* Read record ID. */
    gap = value >> builder->sid_bits; /* In-block gap */
    if (gap) {
      /* A new record starts: this is where a large chunk may be flushed. */
      if (chunk->n >= builder->options.chunk_threshold) {
        rc = grn_ii_builder_flush_chunk(ctx, builder);
        if (rc != GRN_SUCCESS) {
          return rc;
        }
      }
      last_sid = 0;
    }
    rid += gap;
    gap = rid - chunk->rid; /* Global gap */
    /* The first entry of a chunk stores rid itself, later entries the gap. */
    chunk->rid_buf[chunk->offset] = chunk->offset ? gap : rid;
    chunk->n++;
    chunk->rid = rid;
    chunk->rid_gap += gap;
    builder->df++;

    /* Read section ID. */
    if (ii_flags & GRN_OBJ_WITH_SECTION) {
      uint32_t sid = (value & builder->sid_mask) + 1;
      /* Stored as the gap from the previous sid of the record, minus 1. */
      chunk->sid_buf[chunk->offset] = sid - last_sid - 1;
      chunk->n++;
      last_sid = sid;
    }

    /* Read weight. */
    if (ii_flags & GRN_OBJ_WITH_WEIGHT) {
      uint32_t weight;
      rc = grn_ii_builder_read_from_block(ctx, builder, block_id, &value);
      if (rc != GRN_SUCCESS) {
        return rc;
      }
      weight = value;
      chunk->weight_buf[chunk->offset] = weight;
      chunk->n++;
    }

    /* Read positions or a frequency. */
    if (ii_flags & GRN_OBJ_WITH_POSITION) {
      /* (uint32_t) -1 marks that no position has been read yet. */
      uint32_t pos = (uint32_t) -1;
      freq = 0;
      for (;;) {
        rc = grn_ii_builder_read_from_block(ctx, builder, block_id, &value);
        if (rc != GRN_SUCCESS) {
          return rc;
        }
        /* A 0 ends the positions of this posting. */
        if (!value) {
          break;
        }
        if (builder->chunk.pos_offset == builder->chunk.pos_size) {
          rc = grn_ii_builder_chunk_extend_pos_buf(ctx, chunk);
          if (rc != GRN_SUCCESS) {
            return rc;
          }
        }
        /* The first position is stored minus 1, later ones as read. */
        if (pos == (uint32_t) -1) {
          chunk->pos_buf[chunk->pos_offset] = value - 1;
          chunk->pos_sum += value - 1;
        } else {
          chunk->pos_buf[chunk->pos_offset] = value;
          chunk->pos_sum += value;
        }
        chunk->n++;
        pos += value;
        chunk->pos_offset++;
        freq++;
      }
    } else {
      rc = grn_ii_builder_read_from_block(ctx, builder, block_id, &value);
      if (rc != GRN_SUCCESS) {
        return rc;
      }
      freq = value;
    }
    /* Frequencies are stored minus 1. */
    chunk->freq_buf[chunk->offset] = freq - 1;
    chunk->n++;
    chunk->offset++;
  }
  /* Read the ID of the next term in the block, if any. */
  rc = grn_ii_builder_read_from_block(ctx, builder, block_id, &value);
  if (rc == GRN_SUCCESS) {
    builder->blocks[block_id].tid = value;
  } else if (rc == GRN_END_OF_DATA) {
    builder->blocks[block_id].tid = GRN_ID_NIL;
  } else {
    return rc;
  }
  return GRN_SUCCESS;
}

/* grn_ii_builder_register_chunks registers chunks.
*/ static grn_rc grn_ii_builder_register_chunks(grn_ctx *ctx, grn_ii_builder *builder) { grn_rc rc; uint32_t buf_tid, *a; buffer_term *buf_term; rc = grn_ii_builder_chunk_encode(ctx, &builder->chunk, builder->cinfos, builder->n_cinfos); if (rc != GRN_SUCCESS) { return rc; } if (!grn_ii_builder_buffer_is_assigned(ctx, &builder->buf)) { rc = grn_ii_builder_buffer_assign(ctx, &builder->buf, builder->chunk.enc_offset); if (rc != GRN_SUCCESS) { return rc; } } buf_tid = builder->buf.buf->header.nterms; if (buf_tid >= builder->options.buffer_max_n_terms || builder->buf.chunk_size - builder->buf.chunk_offset < builder->chunk.enc_offset) { rc = grn_ii_builder_buffer_flush(ctx, &builder->buf); if (rc != GRN_SUCCESS) { return rc; } rc = grn_ii_builder_buffer_assign(ctx, &builder->buf, builder->chunk.enc_offset); if (rc != GRN_SUCCESS) { return rc; } buf_tid = 0; } buf_term = &builder->buf.buf->terms[buf_tid]; buf_term->tid = builder->chunk.tid; if (builder->n_cinfos) { buf_term->tid |= CHUNK_SPLIT; } buf_term->size_in_buffer = 0; buf_term->pos_in_buffer = 0; buf_term->size_in_chunk = builder->chunk.enc_offset; buf_term->pos_in_chunk = builder->buf.chunk_offset; grn_memcpy(builder->buf.chunk + builder->buf.chunk_offset, builder->chunk.enc_buf, builder->chunk.enc_offset); builder->buf.chunk_offset += builder->chunk.enc_offset; a = array_get(ctx, builder->ii, builder->chunk.tid); if (!a) { DEFINE_NAME(builder->ii); MERR("[ii][builder][chunk][register] " "failed to allocate an array in segment: " "<%.*s>: " "tid=<%u>: max_n_segments=<%u>", name_size, name, builder->chunk.tid, builder->ii->seg->header->max_segment); return ctx->rc; } a[0] = SEG2POS(builder->buf.buf_id, sizeof(buffer_header) + buf_tid * sizeof(buffer_term)); a[1] = builder->df; array_unref(builder->ii, builder->chunk.tid); builder->buf.buf->header.nterms++; builder->n_cinfos = 0; grn_ii_builder_chunk_clear(ctx, &builder->chunk); return GRN_SUCCESS; } static grn_rc grn_ii_builder_commit(grn_ctx *ctx, grn_ii_builder 
*builder) { uint32_t i; grn_rc rc; grn_table_cursor *cursor; for (i = 0; i < builder->n_blocks; i++) { uint64_t value; rc = grn_ii_builder_read_from_block(ctx, builder, i, &value); if (rc != GRN_SUCCESS) { return rc; } builder->blocks[i].tid = value; } cursor = grn_table_cursor_open(ctx, builder->ii->lexicon, NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_KEY); for (;;) { grn_id tid = grn_table_cursor_next(ctx, cursor); if (tid == GRN_ID_NIL) { break; } builder->chunk.tid = tid; builder->chunk.rid = GRN_ID_NIL; builder->df = 0; for (i = 0; i < builder->n_blocks; i++) { if (tid == builder->blocks[i].tid) { rc = grn_ii_builder_read_to_chunk(ctx, builder, i); if (rc != GRN_SUCCESS) { return rc; } } } if (!builder->chunk.n) { /* This term does not appear. */ continue; } if (!builder->n_cinfos) { grn_bool packed; rc = grn_ii_builder_pack_chunk(ctx, builder, &packed); if (rc != GRN_SUCCESS) { return rc; } if (packed) { continue; } } rc = grn_ii_builder_register_chunks(ctx, builder); if (rc != GRN_SUCCESS) { return rc; } } grn_table_cursor_close(ctx, cursor); if (grn_ii_builder_buffer_is_assigned(ctx, &builder->buf)) { rc = grn_ii_builder_buffer_flush(ctx, &builder->buf); if (rc != GRN_SUCCESS) { return rc; } } return GRN_SUCCESS; } grn_rc grn_ii_build2(grn_ctx *ctx, grn_ii *ii, const grn_ii_builder_options *options) { grn_rc rc, rc_close; grn_ii_builder *builder; rc = grn_ii_builder_open(ctx, ii, options, &builder); if (rc == GRN_SUCCESS) { rc = grn_ii_builder_append_source(ctx, builder); if (rc == GRN_SUCCESS) { rc = grn_ii_builder_commit(ctx, builder); } rc_close = grn_ii_builder_close(ctx, builder); if (rc == GRN_SUCCESS) { rc = rc_close; } } return rc; }