diff options
Diffstat (limited to 'contrib/intarray/_int_gist.c')
-rw-r--r-- | contrib/intarray/_int_gist.c | 634 |
1 files changed, 634 insertions, 0 deletions
diff --git a/contrib/intarray/_int_gist.c b/contrib/intarray/_int_gist.c new file mode 100644 index 0000000..ea79c4b --- /dev/null +++ b/contrib/intarray/_int_gist.c @@ -0,0 +1,634 @@ +/* + * contrib/intarray/_int_gist.c + */ +#include "postgres.h" + +#include <limits.h> + +#include "_int.h" +#include "access/gist.h" +#include "access/reloptions.h" +#include "access/stratnum.h" + +#define GETENTRY(vec,pos) ((ArrayType *) DatumGetPointer((vec)->vector[(pos)].key)) + +/* + * Control the maximum sparseness of compressed keys. + * + * The upper safe bound for this limit is half the maximum allocatable array + * size. A lower bound would give more guarantees that pathological data + * wouldn't eat excessive CPU and memory, but at the expense of breaking + * possibly working (after a fashion) indexes. + */ +#define MAXNUMELTS (Min((MaxAllocSize / sizeof(Datum)),((MaxAllocSize - ARR_OVERHEAD_NONULLS(1)) / sizeof(int)))/2) +/* or: #define MAXNUMELTS 1000000 */ + +/* +** GiST support methods +*/ +PG_FUNCTION_INFO_V1(g_int_consistent); +PG_FUNCTION_INFO_V1(g_int_compress); +PG_FUNCTION_INFO_V1(g_int_decompress); +PG_FUNCTION_INFO_V1(g_int_penalty); +PG_FUNCTION_INFO_V1(g_int_picksplit); +PG_FUNCTION_INFO_V1(g_int_union); +PG_FUNCTION_INFO_V1(g_int_same); +PG_FUNCTION_INFO_V1(g_int_options); + + +/* +** The GiST Consistent method for _intments +** Should return false if for all data items x below entry, +** the predicate x op query == false, where op is the oper +** corresponding to strategy in the pg_amop table. +*/ +Datum +g_int_consistent(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + ArrayType *query = PG_GETARG_ARRAYTYPE_P_COPY(1); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + + /* Oid subtype = PG_GETARG_OID(3); */ + bool *recheck = (bool *) PG_GETARG_POINTER(4); + bool retval; + + /* this is exact except for RTSameStrategyNumber */ + *recheck = (strategy == RTSameStrategyNumber); + + if (strategy == BooleanSearchStrategy) + { + retval = execconsistent((QUERYTYPE *) query, + (ArrayType *) DatumGetPointer(entry->key), + GIST_LEAF(entry)); + + pfree(query); + PG_RETURN_BOOL(retval); + } + + /* sort query for fast search, key is already sorted */ + CHECKARRVALID(query); + PREPAREARR(query); + + switch (strategy) + { + case RTOverlapStrategyNumber: + retval = inner_int_overlap((ArrayType *) DatumGetPointer(entry->key), + query); + break; + case RTSameStrategyNumber: + if (GIST_LEAF(entry)) + DirectFunctionCall3(g_int_same, + entry->key, + PointerGetDatum(query), + PointerGetDatum(&retval)); + else + retval = inner_int_contains((ArrayType *) DatumGetPointer(entry->key), + query); + break; + case RTContainsStrategyNumber: + case RTOldContainsStrategyNumber: + retval = inner_int_contains((ArrayType *) DatumGetPointer(entry->key), + query); + break; + case RTContainedByStrategyNumber: + case RTOldContainedByStrategyNumber: + + /* + * This code is unreachable as of intarray 1.4, because the <@ + * operator has been removed from the opclass. We keep it for now + * to support older versions of the SQL definitions. + */ + if (GIST_LEAF(entry)) + retval = inner_int_contains(query, + (ArrayType *) DatumGetPointer(entry->key)); + else + { + /* + * Unfortunately, because empty arrays could be anywhere in + * the index, we must search the whole tree. + */ + retval = true; + } + break; + default: + retval = false; + } + pfree(query); + PG_RETURN_BOOL(retval); +} + +Datum +g_int_union(PG_FUNCTION_ARGS) +{ + GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); + int *size = (int *) PG_GETARG_POINTER(1); + int32 i, + *ptr; + ArrayType *res; + int totlen = 0; + + for (i = 0; i < entryvec->n; i++) + { + ArrayType *ent = GETENTRY(entryvec, i); + + CHECKARRVALID(ent); + totlen += ARRNELEMS(ent); + } + + res = new_intArrayType(totlen); + ptr = ARRPTR(res); + + for (i = 0; i < entryvec->n; i++) + { + ArrayType *ent = GETENTRY(entryvec, i); + int nel; + + nel = ARRNELEMS(ent); + memcpy(ptr, ARRPTR(ent), nel * sizeof(int32)); + ptr += nel; + } + + QSORT(res, 1); + res = _int_unique(res); + *size = VARSIZE(res); + PG_RETURN_POINTER(res); +} + +/* +** GiST Compress and Decompress methods +*/ +Datum +g_int_compress(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + GISTENTRY *retval; + ArrayType *r; + int num_ranges = G_INT_GET_NUMRANGES(); + int len, + lenr; + int *dr; + int i, + j, + cand; + int64 min; + + if (entry->leafkey) + { + r = DatumGetArrayTypePCopy(entry->key); + CHECKARRVALID(r); + PREPAREARR(r); + + if (ARRNELEMS(r) >= 2 * num_ranges) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("input array is too big (%d maximum allowed, %d current), use gist__intbig_ops opclass instead", + 2 * num_ranges - 1, ARRNELEMS(r)))); + + retval = palloc(sizeof(GISTENTRY)); + gistentryinit(*retval, PointerGetDatum(r), + entry->rel, entry->page, entry->offset, false); + + PG_RETURN_POINTER(retval); + } + + /* + * leaf entries never compress one more time, only when entry->leafkey + * ==true, so now we work only with internal keys + */ + + r = DatumGetArrayTypeP(entry->key); + CHECKARRVALID(r); + if (ARRISEMPTY(r)) + { + if (r != (ArrayType *) DatumGetPointer(entry->key)) + pfree(r); + PG_RETURN_POINTER(entry); + } + + if ((len = ARRNELEMS(r)) >= 2 * num_ranges) + { /* compress */ + if (r == (ArrayType *) DatumGetPointer(entry->key)) + r = DatumGetArrayTypePCopy(entry->key); + r = resize_intArrayType(r, 2 * (len)); + + dr = ARRPTR(r); + + /* + * "len" at this point is the number of ranges we will construct. + * "lenr" is the number of ranges we must eventually remove by + * merging, we must be careful to remove no more than this number. + */ + lenr = len - num_ranges; + + /* + * Initially assume we can merge consecutive ints into a range. but we + * must count every value removed and stop when lenr runs out + */ + for (j = i = len - 1; i > 0 && lenr > 0; i--, j--) + { + int r_end = dr[i]; + int r_start = r_end; + + while (i > 0 && lenr > 0 && dr[i - 1] == r_start - 1) + --r_start, --i, --lenr; + dr[2 * j] = r_start; + dr[2 * j + 1] = r_end; + } + /* just copy the rest, if any, as trivial ranges */ + for (; i >= 0; i--, j--) + dr[2 * j] = dr[2 * j + 1] = dr[i]; + + if (++j) + { + /* + * shunt everything down to start at the right place + */ + memmove((void *) &dr[0], (void *) &dr[2 * j], 2 * (len - j) * sizeof(int32)); + } + + /* + * make "len" be number of array elements, not ranges + */ + len = 2 * (len - j); + cand = 1; + while (len > num_ranges * 2) + { + min = PG_INT64_MAX; + for (i = 2; i < len; i += 2) + if (min > ((int64) dr[i] - (int64) dr[i - 1])) + { + min = ((int64) dr[i] - (int64) dr[i - 1]); + cand = i; + } + memmove((void *) &dr[cand - 1], (void *) &dr[cand + 1], (len - cand - 1) * sizeof(int32)); + len -= 2; + } + + /* + * check sparseness of result + */ + lenr = internal_size(dr, len); + if (lenr < 0 || lenr > MAXNUMELTS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("data is too sparse, recreate index using gist__intbig_ops opclass instead"))); + + r = resize_intArrayType(r, len); + retval = palloc(sizeof(GISTENTRY)); + gistentryinit(*retval, PointerGetDatum(r), + entry->rel, entry->page, entry->offset, false); + PG_RETURN_POINTER(retval); + } + else + PG_RETURN_POINTER(entry); +} + +Datum +g_int_decompress(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + GISTENTRY *retval; + ArrayType *r; + int num_ranges = G_INT_GET_NUMRANGES(); + int *dr, + lenr; + ArrayType *in; + int lenin; + int *din; + int i, + j; + + in = DatumGetArrayTypeP(entry->key); + + CHECKARRVALID(in); + if (ARRISEMPTY(in)) + { + if (in != (ArrayType *) DatumGetPointer(entry->key)) + { + retval = palloc(sizeof(GISTENTRY)); + gistentryinit(*retval, PointerGetDatum(in), + entry->rel, entry->page, entry->offset, false); + PG_RETURN_POINTER(retval); + } + + PG_RETURN_POINTER(entry); + } + + lenin = ARRNELEMS(in); + + if (lenin < 2 * num_ranges) + { /* not compressed value */ + if (in != (ArrayType *) DatumGetPointer(entry->key)) + { + retval = palloc(sizeof(GISTENTRY)); + gistentryinit(*retval, PointerGetDatum(in), + entry->rel, entry->page, entry->offset, false); + + PG_RETURN_POINTER(retval); + } + PG_RETURN_POINTER(entry); + } + + din = ARRPTR(in); + lenr = internal_size(din, lenin); + if (lenr < 0 || lenr > MAXNUMELTS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("compressed array is too big, recreate index using gist__intbig_ops opclass instead"))); + + r = new_intArrayType(lenr); + dr = ARRPTR(r); + + for (i = 0; i < lenin; i += 2) + for (j = din[i]; j <= din[i + 1]; j++) + if ((!i) || *(dr - 1) != j) + *dr++ = j; + + if (in != (ArrayType *) DatumGetPointer(entry->key)) + pfree(in); + retval = palloc(sizeof(GISTENTRY)); + gistentryinit(*retval, PointerGetDatum(r), + entry->rel, entry->page, entry->offset, false); + + PG_RETURN_POINTER(retval); +} + +/* +** The GiST Penalty method for _intments +*/ +Datum +g_int_penalty(PG_FUNCTION_ARGS) +{ + GISTENTRY *origentry = (GISTENTRY *) PG_GETARG_POINTER(0); + GISTENTRY *newentry = (GISTENTRY *) PG_GETARG_POINTER(1); + float *result = (float *) PG_GETARG_POINTER(2); + ArrayType *ud; + float tmp1, + tmp2; + + ud = inner_int_union((ArrayType *) DatumGetPointer(origentry->key), + (ArrayType *) DatumGetPointer(newentry->key)); + rt__int_size(ud, &tmp1); + rt__int_size((ArrayType *) DatumGetPointer(origentry->key), &tmp2); + *result = tmp1 - tmp2; + pfree(ud); + + PG_RETURN_POINTER(result); +} + + + +Datum +g_int_same(PG_FUNCTION_ARGS) +{ + ArrayType *a = PG_GETARG_ARRAYTYPE_P(0); + ArrayType *b = PG_GETARG_ARRAYTYPE_P(1); + bool *result = (bool *) PG_GETARG_POINTER(2); + int32 n = ARRNELEMS(a); + int32 *da, + *db; + + CHECKARRVALID(a); + CHECKARRVALID(b); + + if (n != ARRNELEMS(b)) + { + *result = false; + PG_RETURN_POINTER(result); + } + *result = true; + da = ARRPTR(a); + db = ARRPTR(b); + while (n--) + { + if (*da++ != *db++) + { + *result = false; + break; + } + } + + PG_RETURN_POINTER(result); +} + +/***************************************************************** +** Common GiST Method +*****************************************************************/ + +typedef struct +{ + OffsetNumber pos; + float cost; +} SPLITCOST; + +static int +comparecost(const void *a, const void *b) +{ + if (((const SPLITCOST *) a)->cost == ((const SPLITCOST *) b)->cost) + return 0; + else + return (((const SPLITCOST *) a)->cost > ((const SPLITCOST *) b)->cost) ? 1 : -1; +} + +/* +** The GiST PickSplit method for _intments +** We use Guttman's poly time split algorithm +*/ +Datum +g_int_picksplit(PG_FUNCTION_ARGS) +{ + GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); + GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1); + OffsetNumber i, + j; + ArrayType *datum_alpha, + *datum_beta; + ArrayType *datum_l, + *datum_r; + ArrayType *union_d, + *union_dl, + *union_dr; + ArrayType *inter_d; + bool firsttime; + float size_alpha, + size_beta, + size_union, + size_inter; + float size_waste, + waste; + float size_l, + size_r; + int nbytes; + OffsetNumber seed_1 = 0, + seed_2 = 0; + OffsetNumber *left, + *right; + OffsetNumber maxoff; + SPLITCOST *costvector; + +#ifdef GIST_DEBUG + elog(DEBUG3, "--------picksplit %d", entryvec->n); +#endif + + maxoff = entryvec->n - 2; + nbytes = (maxoff + 2) * sizeof(OffsetNumber); + v->spl_left = (OffsetNumber *) palloc(nbytes); + v->spl_right = (OffsetNumber *) palloc(nbytes); + + firsttime = true; + waste = 0.0; + for (i = FirstOffsetNumber; i < maxoff; i = OffsetNumberNext(i)) + { + datum_alpha = GETENTRY(entryvec, i); + for (j = OffsetNumberNext(i); j <= maxoff; j = OffsetNumberNext(j)) + { + datum_beta = GETENTRY(entryvec, j); + + /* compute the wasted space by unioning these guys */ + /* size_waste = size_union - size_inter; */ + union_d = inner_int_union(datum_alpha, datum_beta); + rt__int_size(union_d, &size_union); + inter_d = inner_int_inter(datum_alpha, datum_beta); + rt__int_size(inter_d, &size_inter); + size_waste = size_union - size_inter; + + pfree(union_d); + pfree(inter_d); + + /* + * are these a more promising split that what we've already seen? + */ + + if (size_waste > waste || firsttime) + { + waste = size_waste; + seed_1 = i; + seed_2 = j; + firsttime = false; + } + } + } + + left = v->spl_left; + v->spl_nleft = 0; + right = v->spl_right; + v->spl_nright = 0; + if (seed_1 == 0 || seed_2 == 0) + { + seed_1 = 1; + seed_2 = 2; + } + + datum_alpha = GETENTRY(entryvec, seed_1); + datum_l = copy_intArrayType(datum_alpha); + rt__int_size(datum_l, &size_l); + datum_beta = GETENTRY(entryvec, seed_2); + datum_r = copy_intArrayType(datum_beta); + rt__int_size(datum_r, &size_r); + + maxoff = OffsetNumberNext(maxoff); + + /* + * sort entries + */ + costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff); + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + costvector[i - 1].pos = i; + datum_alpha = GETENTRY(entryvec, i); + union_d = inner_int_union(datum_l, datum_alpha); + rt__int_size(union_d, &size_alpha); + pfree(union_d); + union_d = inner_int_union(datum_r, datum_alpha); + rt__int_size(union_d, &size_beta); + pfree(union_d); + costvector[i - 1].cost = Abs((size_alpha - size_l) - (size_beta - size_r)); + } + qsort((void *) costvector, maxoff, sizeof(SPLITCOST), comparecost); + + /* + * Now split up the regions between the two seeds. An important property + * of this split algorithm is that the split vector v has the indices of + * items to be split in order in its left and right vectors. We exploit + * this property by doing a merge in the code that actually splits the + * page. + * + * For efficiency, we also place the new index tuple in this loop. This is + * handled at the very end, when we have placed all the existing tuples + * and i == maxoff + 1. + */ + + + for (j = 0; j < maxoff; j++) + { + i = costvector[j].pos; + + /* + * If we've already decided where to place this item, just put it on + * the right list. Otherwise, we need to figure out which page needs + * the least enlargement in order to store the item. + */ + + if (i == seed_1) + { + *left++ = i; + v->spl_nleft++; + continue; + } + else if (i == seed_2) + { + *right++ = i; + v->spl_nright++; + continue; + } + + /* okay, which page needs least enlargement? */ + datum_alpha = GETENTRY(entryvec, i); + union_dl = inner_int_union(datum_l, datum_alpha); + union_dr = inner_int_union(datum_r, datum_alpha); + rt__int_size(union_dl, &size_alpha); + rt__int_size(union_dr, &size_beta); + + /* pick which page to add it to */ + if (size_alpha - size_l < size_beta - size_r + WISH_F(v->spl_nleft, v->spl_nright, 0.01)) + { + pfree(datum_l); + pfree(union_dr); + datum_l = union_dl; + size_l = size_alpha; + *left++ = i; + v->spl_nleft++; + } + else + { + pfree(datum_r); + pfree(union_dl); + datum_r = union_dr; + size_r = size_beta; + *right++ = i; + v->spl_nright++; + } + } + pfree(costvector); + *right = *left = FirstOffsetNumber; + + v->spl_ldatum = PointerGetDatum(datum_l); + v->spl_rdatum = PointerGetDatum(datum_r); + + PG_RETURN_POINTER(v); +} + +Datum +g_int_options(PG_FUNCTION_ARGS) +{ + local_relopts *relopts = (local_relopts *) PG_GETARG_POINTER(0); + + init_local_reloptions(relopts, sizeof(GISTIntArrayOptions)); + add_local_int_reloption(relopts, "numranges", + "number of ranges for compression", + G_INT_NUMRANGES_DEFAULT, 1, G_INT_NUMRANGES_MAX, + offsetof(GISTIntArrayOptions, num_ranges)); + + PG_RETURN_VOID(); +} |