summaryrefslogtreecommitdiffstats
path: root/storage/maria/ma_rt_split.c
diff options
context:
space:
mode:
Diffstat (limited to 'storage/maria/ma_rt_split.c')
-rw-r--r--storage/maria/ma_rt_split.c550
1 files changed, 550 insertions, 0 deletions
diff --git a/storage/maria/ma_rt_split.c b/storage/maria/ma_rt_split.c
new file mode 100644
index 00000000..a0acb9ce
--- /dev/null
+++ b/storage/maria/ma_rt_split.c
@@ -0,0 +1,550 @@
+/* Copyright (C) 2006 MySQL AB & Alexey Botchkov & MySQL Finland AB
+ & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
+
+#include "maria_def.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+
+#ifdef HAVE_RTREE_KEYS
+
+#include "ma_rt_index.h"
+#include "ma_rt_key.h"
+#include "ma_rt_mbr.h"
+
+typedef struct
+{
+ double square;
+ int n_node;
+ const uchar *key;
+ double *coords;
+} SplitStruct;
+
+inline static double *reserve_coords(double **d_buffer, int n_dim)
+{
+ double *coords= *d_buffer;
+ (*d_buffer)+= n_dim * 2;
+ return coords;
+}
+
+static void mbr_join(double *a, const double *b, int n_dim)
+{
+ double *end= a + n_dim * 2;
+ do
+ {
+ if (a[0] > b[0])
+ a[0]= b[0];
+
+ if (a[1] < b[1])
+ a[1]= b[1];
+
+ a+= 2;
+ b+= 2;
+ } while (a != end);
+}
+
+/*
+Counts the square of mbr which is a join of a and b
+*/
+static double mbr_join_square(const double *a, const double *b, int n_dim)
+{
+ const double *end= a + n_dim * 2;
+ double square= 1.0;
+ do
+ {
+ square *=
+ ((a[1] < b[1]) ? b[1] : a[1]) - ((a[0] > b[0]) ? b[0] : a[0]);
+
+ a+= 2;
+ b+= 2;
+ } while (a != end);
+
+ return square;
+}
+
+static double count_square(const double *a, int n_dim)
+{
+ const double *end= a + n_dim * 2;
+ double square= 1.0;
+ do
+ {
+ square *= a[1] - a[0];
+ a+= 2;
+ } while (a != end);
+ return square;
+}
+
+inline static void copy_coords(double *dst, const double *src, int n_dim)
+{
+ memcpy(dst, src, sizeof(double) * (n_dim * 2));
+}
+
+/**
+ Select two nodes to collect group upon.
+
+ Note that such function uses 'double' arithmetic so may behave differently
+ on different platforms/builds. There are others in this file.
+*/
+static void pick_seeds(SplitStruct *node, int n_entries,
+ SplitStruct **seed_a, SplitStruct **seed_b, int n_dim)
+{
+ SplitStruct *cur1;
+ SplitStruct *lim1= node + (n_entries - 1);
+ SplitStruct *cur2;
+ SplitStruct *lim2= node + n_entries;
+
+ double max_d= -DBL_MAX;
+ double d;
+
+ for (cur1= node; cur1 < lim1; cur1++)
+ {
+ for (cur2=cur1 + 1; cur2 < lim2; cur2++)
+ {
+
+ d= mbr_join_square(cur1->coords, cur2->coords, n_dim) - cur1->square -
+ cur2->square;
+ if (d > max_d)
+ {
+ max_d= d;
+ *seed_a= cur1;
+ *seed_b= cur2;
+ }
+ }
+ }
+}
+
+/*
+Select next node and group where to add
+*/
+static void pick_next(SplitStruct *node, int n_entries, double *g1, double *g2,
+ SplitStruct **choice, int *n_group, int n_dim)
+{
+ SplitStruct *cur= node;
+ SplitStruct *end= node + n_entries;
+
+ double max_diff= -DBL_MAX;
+
+ for (; cur < end; cur++)
+ {
+ double diff;
+ double abs_diff;
+
+ if (cur->n_node)
+ {
+ continue;
+ }
+
+ diff= mbr_join_square(g1, cur->coords, n_dim) -
+ mbr_join_square(g2, cur->coords, n_dim);
+
+ abs_diff= fabs(diff);
+ if (abs_diff > max_diff)
+ {
+ max_diff= abs_diff;
+ *n_group= 1 + (diff > 0);
+ *choice= cur;
+ }
+ }
+}
+
+/*
+Mark not-in-group entries as n_group
+*/
+static void mark_all_entries(SplitStruct *node, int n_entries, int n_group)
+{
+ SplitStruct *cur= node;
+ SplitStruct *end= node + n_entries;
+
+ for (; cur < end; cur++)
+ {
+ if (cur->n_node)
+ {
+ continue;
+ }
+ cur->n_node= n_group;
+ }
+}
+
+static int split_maria_rtree_node(SplitStruct *node, int n_entries,
+ int all_size, /* Total key's size */
+ int key_size,
+ int min_size, /* Minimal group size */
+ int size1, int size2 /* initial group sizes */,
+ double **d_buffer, int n_dim)
+{
+ SplitStruct *cur;
+ SplitStruct *UNINIT_VAR(a);
+ SplitStruct *UNINIT_VAR(b);
+ double *g1= reserve_coords(d_buffer, n_dim);
+ double *g2= reserve_coords(d_buffer, n_dim);
+ SplitStruct *UNINIT_VAR(next);
+ int UNINIT_VAR(next_node);
+ int i;
+ SplitStruct *end= node + n_entries;
+
+ if (all_size < min_size * 2)
+ {
+ return 1;
+ }
+
+ cur= node;
+ for (; cur < end; cur++)
+ {
+ cur->square= count_square(cur->coords, n_dim);
+ cur->n_node= 0;
+ }
+
+ pick_seeds(node, n_entries, &a, &b, n_dim);
+ a->n_node= 1;
+ b->n_node= 2;
+
+
+ copy_coords(g1, a->coords, n_dim);
+ size1+= key_size;
+ copy_coords(g2, b->coords, n_dim);
+ size2+= key_size;
+
+
+ for (i=n_entries - 2; i>0; --i)
+ {
+ if (all_size - (size2 + key_size) < min_size) /* Can't write into group 2 */
+ {
+ mark_all_entries(node, n_entries, 1);
+ break;
+ }
+
+ if (all_size - (size1 + key_size) < min_size) /* Can't write into group 1 */
+ {
+ mark_all_entries(node, n_entries, 2);
+ break;
+ }
+
+ pick_next(node, n_entries, g1, g2, &next, &next_node, n_dim);
+ if (next_node == 1)
+ {
+ size1+= key_size;
+ mbr_join(g1, next->coords, n_dim);
+ }
+ else
+ {
+ size2+= key_size;
+ mbr_join(g2, next->coords, n_dim);
+ }
+ next->n_node= next_node;
+ }
+
+ return 0;
+}
+
+
+/**
+ Logs key reorganization done in a split page (new page is logged elsewhere).
+
+ The effect of a split on the split page is three changes:
+ - some piece of the page move to different places inside this page (we are
+ not interested here in the pieces which move to the new page)
+ - the key is inserted into the page or not (could be in the new page)
+ - page is shrunk
+ All this is uniquely determined by a few parameters:
+ - the key (starting at 'key-nod_flag', for 'full_length' bytes
+ (maria_rtree_split_page() seems to depend on its parameters key&key_length
+ but in fact it reads more (to the left: nod_flag, and to the right:
+ full_length)
+ - the binary content of the page
+ - some variables in the share
+ - double arithmetic, which is unpredictable from machine to machine and
+ from build to build (see pick_seeds() above: it has a comparison between
+ double-s 'if (d > max_d)' so the comparison can go differently from machine
+ to machine or build to build, it has happened in real life).
+ If one day we use precision-math instead of double-math, in GIS, then the
+ last parameter would become constant accross machines and builds and we
+ could some cheap logging: just log the few parameters above.
+ Until then, we log the list of memcpy() operations (fortunately, we often do
+ not have to log the source bytes, as they can be found in the page before
+ applying the REDO; the only source bytes to log are the key), the key if it
+ was inserted into this page, and the shrinking.
+
+ @param info table
+ @param page page's offset in the file
+ @param buff content of the page (post-split)
+ @param key_with_nod_flag pointer to key-nod_flag
+ @param full_length length of (key + (nod_flag (if node) or rowid (if
+ leaf)))
+ @param log_internal_copy encoded list of mempcy() operations done on
+ split page, having their source in the page
+ @param log_internal_copy_length length of above list, in bytes
+ @param log_key_copy operation describing the key's copy, or NULL if the
+ inserted key was not put into the page (was put in
+ new page, so does not have to be logged here)
+ @param length_diff by how much the page has shrunk during split
+*/
+
+static my_bool _ma_log_rt_split(MARIA_PAGE *page,
+ const uchar *key_with_nod_flag,
+ uint full_length,
+ const uchar *log_internal_copy,
+ uint log_internal_copy_length,
+ const uchar *log_key_copy,
+ uint length_diff)
+{
+ MARIA_HA *info= page->info;
+ MARIA_SHARE *share= info->s;
+ LSN lsn;
+ uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 1 + 2 + 1 + 2 + 2 + 7],
+ *log_pos;
+ LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 6];
+ uint translog_parts, extra_length= 0;
+ my_off_t page_pos;
+ DBUG_ENTER("_ma_log_rt_split");
+ DBUG_PRINT("enter", ("page: %p", page));
+
+ DBUG_ASSERT(share->now_transactional);
+ page_pos= page->pos / share->block_size;
+ page_store(log_data + FILEID_STORE_SIZE, page_pos);
+ log_pos= log_data+ FILEID_STORE_SIZE + PAGE_STORE_SIZE;
+ log_pos[0]= KEY_OP_DEL_SUFFIX;
+ log_pos++;
+ DBUG_ASSERT((int)length_diff > 0);
+ int2store(log_pos, length_diff);
+ log_pos+= 2;
+ log_pos[0]= KEY_OP_MULTI_COPY;
+ log_pos++;
+ int2store(log_pos, full_length);
+ log_pos+= 2;
+ int2store(log_pos, log_internal_copy_length);
+ log_pos+= 2;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
+ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data) - 7;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].str= log_internal_copy;
+ log_array[TRANSLOG_INTERNAL_PARTS + 1].length= log_internal_copy_length;
+ translog_parts= 2;
+ if (log_key_copy != NULL) /* need to store key into record */
+ {
+ log_array[TRANSLOG_INTERNAL_PARTS + 2].str= log_key_copy;
+ log_array[TRANSLOG_INTERNAL_PARTS + 2].length= 1 + 2 + 1 + 2;
+ log_array[TRANSLOG_INTERNAL_PARTS + 3].str= key_with_nod_flag;
+ log_array[TRANSLOG_INTERNAL_PARTS + 3].length= full_length;
+ extra_length= 1 + 2 + 1 + 2 + full_length;
+ translog_parts+= 2;
+ }
+
+ _ma_log_key_changes(page,
+ log_array + TRANSLOG_INTERNAL_PARTS + translog_parts,
+ log_pos, &extra_length, &translog_parts);
+ /* Remember new page length for future log entires for same page */
+ page->org_size= page->size;
+
+ if (translog_write_record(&lsn, LOGREC_REDO_INDEX,
+ info->trn, info,
+ (translog_size_t) ((log_pos - log_data) +
+ log_internal_copy_length +
+ extra_length),
+ TRANSLOG_INTERNAL_PARTS + translog_parts,
+ log_array, log_data, NULL))
+ DBUG_RETURN(1);
+ DBUG_RETURN(0);
+}
+
+/**
+ 0 ok; the created page is put into page cache; the shortened one is not (up
+ to the caller to do it)
+ 1 or -1: error.
+ If new_page_offs==NULL, won't create new page (for redo phase).
+*/
+
+int maria_rtree_split_page(const MARIA_KEY *key, MARIA_PAGE *page,
+ my_off_t *new_page_offs)
+{
+ MARIA_HA *info= page->info;
+ MARIA_SHARE *share= info->s;
+ const my_bool transactional= share->now_transactional;
+ int n1, n2; /* Number of items in groups */
+ SplitStruct *task;
+ SplitStruct *cur;
+ SplitStruct *stop;
+ double *coord_buf;
+ double *next_coord;
+ int n_dim;
+ uchar *source_cur, *cur1, *cur2;
+ uchar *new_page_buff= 0, *log_internal_copy, *log_internal_copy_ptr,
+ *log_key_copy= NULL;
+ int err_code= 0;
+ uint new_page_length;
+ uint nod_flag= page->node;
+ uint org_length= page->size;
+ uint full_length= key->data_length + (nod_flag ? nod_flag :
+ key->ref_length);
+ uint key_data_length= key->data_length;
+ int max_keys= ((org_length - share->keypage_header) / (full_length));
+ MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link;
+ MARIA_KEYDEF *keyinfo= key->keyinfo;
+ my_bool new_page_buff_alloced= 0, coord_buf_alloced= 0;
+ DBUG_ENTER("maria_rtree_split_page");
+ DBUG_PRINT("rtree", ("splitting block"));
+
+ n_dim= keyinfo->keysegs / 2;
+
+ alloc_on_stack(*info->stack_end_ptr, coord_buf, coord_buf_alloced,
+ (n_dim * 2 * sizeof(double) * (max_keys + 1 + 4) +
+ sizeof(SplitStruct) * (max_keys + 1)));
+ if (!coord_buf)
+ DBUG_RETURN(-1);
+
+ task= (SplitStruct *)(coord_buf + n_dim * 2 * (max_keys + 1 + 4));
+
+ next_coord= coord_buf;
+
+ stop= task + max_keys;
+ source_cur= rt_PAGE_FIRST_KEY(share, page->buff, nod_flag);
+
+ for (cur= task;
+ cur < stop;
+ cur++, source_cur= rt_PAGE_NEXT_KEY(share, source_cur, key_data_length,
+ nod_flag))
+ {
+ cur->coords= reserve_coords(&next_coord, n_dim);
+ cur->key= source_cur;
+ maria_rtree_d_mbr(keyinfo->seg, source_cur, key_data_length, cur->coords);
+ }
+
+ cur->coords= reserve_coords(&next_coord, n_dim);
+ maria_rtree_d_mbr(keyinfo->seg, key->data, key_data_length, cur->coords);
+ cur->key= key->data;
+
+
+ if (split_maria_rtree_node(task, max_keys + 1,
+ page->size + full_length + 2,
+ full_length,
+ rt_PAGE_MIN_SIZE(keyinfo->block_length),
+ 2, 2, &next_coord, n_dim))
+ {
+ err_code= 1;
+ goto split_err;
+ }
+
+ /* Allocate buffer for new page and piece of log record */
+ alloc_on_stack(*info->stack_end_ptr, new_page_buff, new_page_buff_alloced,
+ (keyinfo->block_length +
+ (transactional ? max_keys * (2 + 2) + 1 + 2 + 1 + 2 : 0)));
+ if (!new_page_buff)
+ {
+ err_code= -1;
+ goto split_err;
+ }
+
+ log_internal_copy= log_internal_copy_ptr= new_page_buff +
+ keyinfo->block_length;
+ bzero(new_page_buff, share->block_size);
+
+ stop= task + (max_keys + 1);
+ cur1= rt_PAGE_FIRST_KEY(share, page->buff, nod_flag);
+ cur2= rt_PAGE_FIRST_KEY(share, new_page_buff, nod_flag);
+
+ n1= n2= 0;
+ for (cur= task; cur < stop; cur++)
+ {
+ uchar *to;
+ const uchar *cur_key= cur->key;
+ my_bool log_this_change;
+ DBUG_ASSERT(log_key_copy == NULL);
+ if (cur->n_node == 1)
+ {
+ to= cur1;
+ cur1= rt_PAGE_NEXT_KEY(share, cur1, key_data_length, nod_flag);
+ n1++;
+ log_this_change= transactional;
+ }
+ else
+ {
+ to= cur2;
+ cur2= rt_PAGE_NEXT_KEY(share, cur2, key_data_length, nod_flag);
+ n2++;
+ log_this_change= FALSE;
+ }
+ if (to != cur_key)
+ {
+ uchar *to_with_nod_flag= to - nod_flag;
+ const uchar *cur_key_with_nod_flag= cur_key - nod_flag;
+ memcpy(to_with_nod_flag, cur_key_with_nod_flag, full_length);
+ if (log_this_change)
+ {
+ size_t to_with_nod_flag_offs= to_with_nod_flag - page->buff;
+ if (likely(cur_key != key->data))
+ {
+ /* this memcpy() is internal to the page (source in the page) */
+ size_t cur_key_with_nod_flag_offs= cur_key_with_nod_flag - page->buff;
+ int2store(log_internal_copy_ptr, to_with_nod_flag_offs);
+ log_internal_copy_ptr+= 2;
+ int2store(log_internal_copy_ptr, cur_key_with_nod_flag_offs);
+ log_internal_copy_ptr+= 2;
+ }
+ else
+ {
+ /* last iteration, and this involves *key: source is external */
+ log_key_copy= log_internal_copy_ptr;
+ log_key_copy[0]= KEY_OP_OFFSET;
+ int2store(log_key_copy + 1, to_with_nod_flag_offs);
+ log_key_copy[3]= KEY_OP_CHANGE;
+ int2store(log_key_copy + 4, full_length);
+ /* _ma_log_rt_split() will store *key, right after */
+ }
+ }
+ }
+ }
+ { /* verify that above loop didn't touch header bytes */
+ uint i;
+ for (i= 0; i < share->keypage_header; i++)
+ DBUG_ASSERT(new_page_buff[i]==0);
+ }
+
+ if (nod_flag)
+ _ma_store_keypage_flag(share, new_page_buff, KEYPAGE_FLAG_ISNOD);
+ _ma_store_keynr(share, new_page_buff, keyinfo->key_nr);
+ new_page_length= share->keypage_header + n2 * full_length;
+ _ma_store_page_used(share, new_page_buff, new_page_length);
+ page->size= share->keypage_header + n1 * full_length;
+ page_store_size(share, page);
+
+ if ((*new_page_offs= _ma_new(info, DFLT_INIT_HITS, &page_link)) ==
+ HA_OFFSET_ERROR)
+ err_code= -1;
+ else
+ {
+ MARIA_PAGE new_page;
+ _ma_page_setup(&new_page, info, keyinfo, *new_page_offs, new_page_buff);
+
+ if (transactional &&
+ ( /* log change to split page */
+ _ma_log_rt_split(page, key->data - nod_flag,
+ full_length, log_internal_copy,
+ (uint)(log_internal_copy_ptr - log_internal_copy),
+ log_key_copy, (uint)(org_length - page->size)) ||
+ /* and to new page */
+ _ma_log_new(&new_page, 0)))
+ err_code= -1;
+
+ if (_ma_write_keypage(&new_page, page_link->write_lock,
+ DFLT_INIT_HITS))
+ err_code= -1;
+ }
+ DBUG_PRINT("rtree", ("split new block: %lu", (ulong) *new_page_offs));
+
+split_err:
+ stack_alloc_free(new_page_buff, new_page_buff_alloced);
+ stack_alloc_free(coord_buf, coord_buf_alloced);
+ DBUG_RETURN(err_code);
+}
+
+#endif /*HAVE_RTREE_KEYS*/