diff options
Diffstat (limited to '')
-rw-r--r-- | sql/multi_range_read.cc | 2126 |
1 files changed, 2126 insertions, 0 deletions
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc new file mode 100644 index 00000000..c7c3079f --- /dev/null +++ b/sql/multi_range_read.cc @@ -0,0 +1,2126 @@ +/* Copyright (C) 2010, 2011 Monty Program Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "mariadb.h" +#include "sql_parse.h" +#include <my_bit.h> +#include "sql_select.h" +#include "key.h" +#include "sql_statistics.h" +#include "rowid_filter.h" + +/**************************************************************************** + * Default MRR implementation (MRR to non-MRR converter) + ***************************************************************************/ + +/** + Get cost and other information about MRR scan over a known list of ranges + + Calculate estimated cost and other information about an MRR scan for given + sequence of ranges. + + @param keyno Index number + @param seq Range sequence to be traversed + @param seq_init_param First parameter for seq->init() + @param n_ranges_arg Number of ranges in the sequence, or 0 if the caller + can't efficiently determine it + @param bufsz INOUT IN: Size of the buffer available for use + OUT: Size of the buffer that is expected to be actually + used, or 0 if buffer is not needed. + @param flags INOUT A combination of HA_MRR_* flags + @param cost OUT Estimated cost of MRR access + + @note + This method (or an overriding one in a derived class) must check for + thd->killed and return HA_POS_ERROR if it is not zero. This is required + for a user to be able to interrupt the calculation by killing the + connection/query. + + @retval + HA_POS_ERROR Error or the engine is unable to perform the requested + scan. Values of OUT parameters are undefined. + @retval + other OK, *cost contains cost of the scan, *bufsz and *flags + contain scan parameters. +*/ + +ha_rows +handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, + void *seq_init_param, uint n_ranges_arg, + uint *bufsz, uint *flags, + Cost_estimate *cost) +{ + KEY_MULTI_RANGE range; + range_seq_t seq_it; + ha_rows total_rows= 0; + uint n_ranges=0; + ha_rows max_rows= stats.records; + THD *thd= table->in_use; + ulonglong io_blocks; + + /* + Counter of blocks that contain range edges for those ranges + for which records_in_range() is called + */ + ulonglong edge_blocks_cnt= 0; + /* + Counter of blocks that contain index tuples for those ranges + for which records_in_range() is called + */ + ulonglong range_blocks_cnt= 0; + /* + The position of the block containing the last record of the previous range + for which the info about range position is provided + */ + ulonglong prev_range_last_block= UNUSED_PAGE_NO; + /* The counter of records the staring from prev_range_last_block */ + ulonglong prev_range_last_block_records= 0; + /* + The counter of single point ranges. + (For single point ranges we do not call records_in_range()) + */ + ulonglong single_point_ranges= 0; + /* + The counter of of single point ranges that we succeded to assign + to some blocks + */ + ulonglong assigned_single_point_ranges= 0; + /* + Counter of single point ranges for which records_in_range in not + called and that are encountered between two ranges without such property + For example, let's have a subsequence of ranges + R1,r1,....rk,R2 + where r1,...,rk are single point ranges for which records_in_range is + called while R1 and R2 are not such ranges. + Then single_point_ranges_delta will count ranges r1,...,rk. + */ + ulonglong unassigned_single_point_ranges= 0; + + uint len= table->key_info[keyno].key_length + table->file->ref_length; + if (table->file->is_clustering_key(keyno)) + len= table->s->stored_rec_length; + /* Assume block is 75 % full */ + uint avg_block_records= ((uint) (stats.block_size*3/4))/len + 1; + uint limit= thd->variables.eq_range_index_dive_limit; + bool use_statistics_for_eq_range= eq_ranges_exceeds_limit(seq, + seq_init_param, + limit); + DBUG_ENTER("multi_range_read_info_const"); + + /* Default MRR implementation doesn't need buffer */ + *bufsz= 0; + + seq_it= seq->init(seq_init_param, n_ranges, *flags); + while (!seq->next(seq_it, &range)) + { + ha_rows rows; + + if (unlikely(thd->killed != 0)) + DBUG_RETURN(HA_POS_ERROR); + + n_ranges++; + key_range *min_endp, *max_endp; + if (range.range_flag & GEOM_FLAG) + { + /* In this case tmp_min_flag contains the handler-read-function */ + range.start_key.flag= (ha_rkey_function) (range.range_flag ^ GEOM_FLAG); + min_endp= &range.start_key; + max_endp= NULL; + } + else + { + min_endp= range.start_key.length? &range.start_key : NULL; + max_endp= range.end_key.length? &range.end_key : NULL; + } + int keyparts_used= my_count_bits(range.start_key.keypart_map); + + if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE)) + { + rows= 1; + /* + In this case we do not call records_in_range() and as a result + do not get any info on the edge blocks for this range. However if it + happens that the range for which we have such info uses the same block + for its first record as the last range for which such info is + provided uses for its last record then this range can be assigned + later to one of the blocks used by other ranges. + + Note that we don't have to increment edge_blocks_cnt or + range_blocks_cnt here. + */ + single_point_ranges++; + } + else if (use_statistics_for_eq_range && + !(range.range_flag & NULL_RANGE) && + (range.range_flag & EQ_RANGE) && + table->key_info[keyno].actual_rec_per_key(keyparts_used - 1) > 0.5) + { + rows= ((ha_rows) table->key_info[keyno]. + actual_rec_per_key(keyparts_used-1)); + range_blocks_cnt+= ((MY_MAX(rows, 1) - 1) / avg_block_records + 1); + } + else + { + page_range pages= unused_page_range; + if ((rows= this->records_in_range(keyno, min_endp, max_endp, &pages)) == + HA_POS_ERROR) + { + /* Can't scan one range => can't do MRR scan at all */ + total_rows= HA_POS_ERROR; + break; + } + if (pages.first_page == UNUSED_PAGE_NO) + { + /* + The engine does not provide info on the range position. + Place the range in a new block. Note that in this case + any new range will be placed in a new block. + */ + ulonglong additional_blocks= ((MY_MAX(rows,1) - 1) / avg_block_records + + 1); + edge_blocks_cnt+= additional_blocks == 1 ? 1 : 2; + range_blocks_cnt+= additional_blocks; + } + else + { + /* The info on the range position is provided */ + if (pages.first_page == prev_range_last_block) + { + /* + The new range starts in the same block that the last range + for which the position of the range was provided. + */ + /* + First add records of single point ranges that can be placed + between these two ranges. + */ + prev_range_last_block_records+= (single_point_ranges - + assigned_single_point_ranges); + assigned_single_point_ranges= single_point_ranges; + if (pages.first_page == pages.last_page) + { + /* + All records of the current range are in the same block + Note that the prev_range_last_block_records can be much larger + than max_records_in_block as the rows can be compressed! + */ + prev_range_last_block_records+= rows; + DBUG_ASSERT(prev_range_last_block_records < + stats.block_size); + } + else + { + /* + The current range spans more than one block + + Place part of the range records in 'prev_range_last_block' + and the remaining records in additional blocks. + + We don't know where the first key was positioned in the + block, so we assume the range started in the middle of the + block. + + Note that prev_range_last_block_records > avg_block_records + can be true in case of compressed rows. + */ + ha_rows rem_rows= rows; + + if (avg_block_records > prev_range_last_block_records) + { + ha_rows space_left_in_prev_block= + (avg_block_records - prev_range_last_block_records)/2; + rem_rows= 0; + if (rows > space_left_in_prev_block) + rem_rows= rows - space_left_in_prev_block; + } + /* Calculate how many additional blocks we need for rem_rows */ + ulonglong additional_blocks= ((MY_MAX(rem_rows, 1) - 1) / + avg_block_records + 1); + edge_blocks_cnt++; + range_blocks_cnt+= additional_blocks; + prev_range_last_block= pages.last_page; + /* There is at least one row on last page */ + prev_range_last_block_records= 1; + } + } + else + { + /* + The new range does not start in the same block that the last range + for which the position of the range was provided. + Note that rows may be 0! + */ + ulonglong additional_blocks= ((MY_MAX(rows, 1) - 1) / + avg_block_records + 1); + edge_blocks_cnt+= additional_blocks == 1 ? 1 : 2; + range_blocks_cnt+= additional_blocks; + unassigned_single_point_ranges+= (single_point_ranges - + assigned_single_point_ranges); + assigned_single_point_ranges= single_point_ranges; + prev_range_last_block= pages.last_page; + /* There is at least one row on last page */ + prev_range_last_block_records= 1; + } + } + } + total_rows+= rows; + } + /* + Count the number of io_blocks that where not yet read and thus not cached. + The number of equal read blocks that where not read are: + + (single_point_ranges - assigned_single_point_ranges). + + We don't add these to io_blocks as we don't want to penalize equal + readss (if we did, a range that would read 5 rows would be + regarded as better than one equal read). + + Better to assume we have done a records_in_range() for the equal + range and it's also cached. + */ + io_blocks= (range_blocks_cnt - edge_blocks_cnt); + unassigned_single_point_ranges+= (single_point_ranges - + assigned_single_point_ranges); + + if (total_rows != HA_POS_ERROR) + { + set_if_smaller(total_rows, max_rows); + + /* The following calculation is the same as in multi_range_read_info(): */ + *flags |= HA_MRR_USE_DEFAULT_IMPL; + cost->reset(); + cost->avg_io_cost= cost->idx_avg_io_cost= avg_io_cost(); + + if (!is_clustering_key(keyno)) + { + cost->idx_io_count= (double) io_blocks; + cost->idx_cpu_cost= (keyread_time(keyno, 0, total_rows) + + n_ranges * IDX_LOOKUP_COST); + if (!(*flags & HA_MRR_INDEX_ONLY)) + cost->cpu_cost= read_time(keyno, 0, total_rows); + } + else + { + /* + Clustered index + If all index dives are to a few blocks, then limit the + ranges used by read_time to the number of dives. + */ + io_blocks+= unassigned_single_point_ranges; + cost->idx_cpu_cost= n_ranges * IDX_LOOKUP_COST; + uint limited_ranges= (uint) MY_MIN((ulonglong) n_ranges, io_blocks); + cost->cpu_cost= read_time(keyno, limited_ranges, total_rows); + } + cost->cpu_cost+= (rows2double(total_rows) / TIME_FOR_COMPARE + + MULTI_RANGE_READ_SETUP_COST); + } + DBUG_PRINT("statistics", + ("key: %s rows: %llu total_cost: %.3f io_blocks: %llu " + "idx_io_count: %.3f cpu_cost: %.3f io_count: %.3f", + table->s->keynames.type_names[keyno], + (ulonglong) total_rows, cost->total_cost(), (ulonglong) io_blocks, + cost->idx_io_count, cost->cpu_cost, cost->io_count)); + DBUG_RETURN(total_rows); +} + + +/** + Get cost and other information about MRR scan over some sequence of ranges + + Calculate estimated cost and other information about an MRR scan for some + sequence of ranges. + + The ranges themselves will be known only at execution phase. When this + function is called we only know number of ranges and a (rough) E(#records) + within those ranges. + + Currently this function is only called for "n-keypart singlepoint" ranges, + i.e. each range is "keypart1=someconst1 AND ... AND keypartN=someconstN" + + The flags parameter is a combination of those flags: HA_MRR_SORTED, + HA_MRR_INDEX_ONLY, HA_MRR_NO_ASSOCIATION, HA_MRR_LIMITS. + + @param keyno Index number + @param n_ranges Estimated number of ranges (i.e. intervals) in the + range sequence. + @param n_rows Estimated total number of records contained within all + of the ranges + @param bufsz INOUT IN: Size of the buffer available for use + OUT: Size of the buffer that will be actually used, or + 0 if buffer is not needed. + @param flags INOUT A combination of HA_MRR_* flags + @param cost OUT Estimated cost of MRR access + + @retval + 0 OK, *cost contains cost of the scan, *bufsz and *flags contain scan + parameters. + @retval + other Error or can't perform the requested scan +*/ + +ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows, + uint key_parts, uint *bufsz, + uint *flags, Cost_estimate *cost) +{ + /* + Currently we expect this function to be called only in preparation of scan + with HA_MRR_SINGLE_POINT property. + */ + DBUG_ASSERT(*flags | HA_MRR_SINGLE_POINT); + + *bufsz= 0; /* Default implementation doesn't need a buffer */ + *flags |= HA_MRR_USE_DEFAULT_IMPL; + + cost->reset(); + /* Produce the same cost as non-MRR code does */ + if (!is_clustering_key(keyno)) + { + /* + idx_io_count could potentially be increased with the number of + index leaf blocks we have to read for finding n_rows. + */ + cost->idx_io_count= n_ranges; + cost->idx_cpu_cost= (keyread_time(keyno, 0, n_rows) + + n_ranges * IDX_LOOKUP_COST); + if (!(*flags & HA_MRR_INDEX_ONLY)) + { + cost->cpu_cost= read_time(keyno, 0, n_rows); + } + } + else + { + cost->cpu_cost= read_time(keyno, n_ranges, (uint)n_rows); + } + cost->cpu_cost+= rows2double(n_rows) / TIME_FOR_COMPARE; + return 0; +} + + +/** + Initialize the MRR scan + + Initialize the MRR scan. This function may do heavyweight scan + initialization like row prefetching/sorting/etc (NOTE: but better not do + it here as we may not need it, e.g. if we never satisfy WHERE clause on + previous tables. For many implementations it would be natural to do such + initializations in the first multi_read_range_next() call) + + mode is a combination of the following flags: HA_MRR_SORTED, + HA_MRR_INDEX_ONLY, HA_MRR_NO_ASSOCIATION + + @param seq Range sequence to be traversed + @param seq_init_param First parameter for seq->init() + @param n_ranges Number of ranges in the sequence + @param mode Flags, see the description section for the details + @param buf INOUT: memory buffer to be used + + @note + One must have called index_init() before calling this function. Several + multi_range_read_init() calls may be made in course of one query. + + Buffer memory management is done according to the following scenario: + The caller allocates the buffer and provides it to the callee by filling + the members of HANDLER_BUFFER structure. + The callee consumes all or some fraction of the provided buffer space, and + sets the HANDLER_BUFFER members accordingly. + The callee may use the buffer memory until the next multi_range_read_init() + call is made, all records have been read, or until index_end() call is + made, whichever comes first. + + @retval 0 OK + @retval 1 Error +*/ + +int +handler::multi_range_read_init(RANGE_SEQ_IF *seq_funcs, void *seq_init_param, + uint n_ranges, uint mode, HANDLER_BUFFER *buf) +{ + DBUG_ENTER("handler::multi_range_read_init"); + mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode); + mrr_funcs= *seq_funcs; + mrr_is_output_sorted= MY_TEST(mode & HA_MRR_SORTED); + mrr_have_range= FALSE; + DBUG_RETURN(0); +} + +/** + Get next record in MRR scan + + Default MRR implementation: read the next record + + @param range_info OUT Undefined if HA_MRR_NO_ASSOCIATION flag is in effect + Otherwise, the opaque value associated with the range + that contains the returned record. + + @retval 0 OK + @retval other Error code +*/ + +int handler::multi_range_read_next(range_id_t *range_info) +{ + int result= HA_ERR_END_OF_FILE; + bool range_res; + DBUG_ENTER("handler::multi_range_read_next"); + + if (!mrr_have_range) + { + mrr_have_range= TRUE; + goto start; + } + + do + { + /* Save a call if there can be only one row in range. */ + if (mrr_cur_range.range_flag != (UNIQUE_RANGE | EQ_RANGE)) + { + result= read_range_next(); + /* On success or non-EOF errors jump to the end. */ + if (result != HA_ERR_END_OF_FILE) + break; + } + else + { + if (ha_was_semi_consistent_read()) + { + /* + The following assignment is redundant, but for extra safety and to + remove the compiler warning: + */ + range_res= FALSE; + goto scan_it_again; + } + /* + We need to set this for the last range only, but checking this + condition is more expensive than just setting the result code. + */ + result= HA_ERR_END_OF_FILE; + } + +start: + /* Try the next range(s) until one matches a record. */ + while (!(range_res= mrr_funcs.next(mrr_iter, &mrr_cur_range))) + { +scan_it_again: + result= read_range_first(mrr_cur_range.start_key.keypart_map ? + &mrr_cur_range.start_key : 0, + mrr_cur_range.end_key.keypart_map ? + &mrr_cur_range.end_key : 0, + MY_TEST(mrr_cur_range.range_flag & EQ_RANGE), + mrr_is_output_sorted); + if (result != HA_ERR_END_OF_FILE) + break; + } + } + while ((result == HA_ERR_END_OF_FILE) && !range_res); + + *range_info= mrr_cur_range.ptr; + DBUG_PRINT("exit",("handler::multi_range_read_next result %d", result)); + DBUG_RETURN(result); +} + +/**************************************************************************** + * Mrr_*_reader classes (building blocks for DS-MRR) + ***************************************************************************/ + +int Mrr_simple_index_reader::init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, + void *seq_init_param, uint n_ranges, + uint mode, Key_parameters *key_par_arg, + Lifo_buffer *key_buffer_arg, + Buffer_manager *buf_manager_arg) +{ + HANDLER_BUFFER no_buffer = {NULL, NULL, NULL}; + file= h_arg; + return file->handler::multi_range_read_init(seq_funcs, seq_init_param, + n_ranges, mode, &no_buffer); +} + + +int Mrr_simple_index_reader::get_next(range_id_t *range_info) +{ + int res; + while (!(res= file->handler::multi_range_read_next(range_info))) + { + KEY_MULTI_RANGE *curr_range= &file->handler::mrr_cur_range; + if (!file->mrr_funcs.skip_index_tuple || + !file->mrr_funcs.skip_index_tuple(file->mrr_iter, curr_range->ptr)) + break; + } + if (res && res != HA_ERR_END_OF_FILE && res != HA_ERR_KEY_NOT_FOUND) + file->print_error(res, MYF(0)); // Fatal error + return res; +} + + +/** + @brief Get next index record + + @param range_info OUT identifier of range that the returned record belongs to + + @note + We actually iterate over nested sequences: + - an ordered sequence of groups of identical keys + - each key group has key value, which has multiple matching records + - thus, each record matches all members of the key group + + @retval 0 OK, next record was successfully read + @retval HA_ERR_END_OF_FILE End of records + @retval Other Some other error; Error is printed +*/ + +int Mrr_ordered_index_reader::get_next(range_id_t *range_info) +{ + int res; + DBUG_ENTER("Mrr_ordered_index_reader::get_next"); + + for(;;) + { + if (!scanning_key_val_iter) + { + while ((res= kv_it.init(this))) + { + if ((res != HA_ERR_KEY_NOT_FOUND && res != HA_ERR_END_OF_FILE)) + DBUG_RETURN(res); /* Some fatal error */ + + if (key_buffer->is_empty()) + { + DBUG_RETURN(HA_ERR_END_OF_FILE); + } + } + scanning_key_val_iter= TRUE; + } + + if ((res= kv_it.get_next(range_info))) + { + scanning_key_val_iter= FALSE; + if ((res != HA_ERR_KEY_NOT_FOUND && res != HA_ERR_END_OF_FILE)) + DBUG_RETURN(res); + kv_it.move_to_next_key_value(); + continue; + } + if (!skip_index_tuple(*range_info) && + !skip_record(*range_info, NULL)) + { + break; + } + /* Go get another (record, range_id) combination */ + } /* while */ + + DBUG_RETURN(0); +} + + +/* + Supply index reader with the O(1)space it needs for scan interrupt/restore + operation +*/ + +bool Mrr_ordered_index_reader::set_interruption_temp_buffer(uint rowid_length, + uint key_len, + uint saved_pk_len, + uchar **space_start, + uchar *space_end) +{ + if (space_end - *space_start <= (ptrdiff_t)(rowid_length + key_len + saved_pk_len)) + return TRUE; + support_scan_interruptions= TRUE; + + saved_rowid= *space_start; + *space_start += rowid_length; + + if (saved_pk_len) + { + saved_primary_key= *space_start; + *space_start += saved_pk_len; + } + else + saved_primary_key= NULL; + + saved_key_tuple= *space_start; + *space_start += key_len; + + have_saved_rowid= FALSE; + read_was_interrupted= FALSE; + return FALSE; +} + +void Mrr_ordered_index_reader::set_no_interruption_temp_buffer() +{ + support_scan_interruptions= FALSE; + saved_key_tuple= saved_rowid= saved_primary_key= NULL; /* safety */ + have_saved_rowid= FALSE; + read_was_interrupted= FALSE; +} + +void Mrr_ordered_index_reader::interrupt_read() +{ + DBUG_ASSERT(support_scan_interruptions); + TABLE *table= file->get_table(); + KEY *used_index= &table->key_info[file->active_index]; + /* Save the current key value */ + key_copy(saved_key_tuple, table->record[0], + used_index, used_index->key_length); + + if (saved_primary_key) + { + key_copy(saved_primary_key, table->record[0], + &table->key_info[table->s->primary_key], + table->key_info[table->s->primary_key].key_length); + } + read_was_interrupted= TRUE; + + /* Save the last rowid */ + memcpy(saved_rowid, file->ref, file->ref_length); + have_saved_rowid= TRUE; +} + +void Mrr_ordered_index_reader::position() +{ + if (have_saved_rowid) + memcpy(file->ref, saved_rowid, file->ref_length); + else + Mrr_index_reader::position(); +} + +void Mrr_ordered_index_reader::resume_read() +{ + TABLE *table= file->get_table(); + + if (!read_was_interrupted) + return; + + KEY *used_index= &table->key_info[file->active_index]; + key_restore(table->record[0], saved_key_tuple, + used_index, used_index->key_length); + if (saved_primary_key) + { + key_restore(table->record[0], saved_primary_key, + &table->key_info[table->s->primary_key], + table->key_info[table->s->primary_key].key_length); + } +} + + +/** + Fill the buffer with (lookup_tuple, range_id) pairs and sort + + @return + 0 OK, the buffer is non-empty and sorted + HA_ERR_END_OF_FILE Source exhausted, the buffer is empty. +*/ + +int Mrr_ordered_index_reader::refill_buffer(bool initial) +{ + KEY_MULTI_RANGE cur_range; + DBUG_ENTER("Mrr_ordered_index_reader::refill_buffer"); + + DBUG_ASSERT(key_buffer->is_empty()); + + if (source_exhausted) + DBUG_RETURN(HA_ERR_END_OF_FILE); + + buf_manager->reset_buffer_sizes(buf_manager->arg); + key_buffer->reset(); + key_buffer->setup_writing(keypar.key_size_in_keybuf, + is_mrr_assoc? sizeof(range_id_t) : 0); + + while (key_buffer->can_write() && + !(source_exhausted= mrr_funcs.next(mrr_iter, &cur_range))) + { + DBUG_ASSERT(cur_range.range_flag & EQ_RANGE); + + /* Put key, or {key, range_id} pair into the buffer */ + key_buffer->write_ptr1= keypar.use_key_pointers ? + (uchar*)&cur_range.start_key.key : + (uchar*)cur_range.start_key.key; + key_buffer->write_ptr2= (uchar*)&cur_range.ptr; + key_buffer->write(); + } + + /* Force get_next() to start with kv_it.init() call: */ + scanning_key_val_iter= FALSE; + + if (source_exhausted && key_buffer->is_empty()) + DBUG_RETURN(HA_ERR_END_OF_FILE); + + if (!initial) + { + /* This is a non-initial buffer fill and we've got a non-empty buffer */ + THD *thd= current_thd; + status_var_increment(thd->status_var.ha_mrr_key_refills_count); + } + + key_buffer->sort((key_buffer->type() == Lifo_buffer::FORWARD)? + (qsort2_cmp)Mrr_ordered_index_reader::compare_keys_reverse : + (qsort2_cmp)Mrr_ordered_index_reader::compare_keys, + this); + DBUG_RETURN(0); +} + + +int Mrr_ordered_index_reader::init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, + void *seq_init_param, uint n_ranges, + uint mode, Key_parameters *key_par_arg, + Lifo_buffer *key_buffer_arg, + Buffer_manager *buf_manager_arg) +{ + file= h_arg; + key_buffer= key_buffer_arg; + buf_manager= buf_manager_arg; + keypar= *key_par_arg; + + KEY *key_info= &file->get_table()->key_info[file->active_index]; + keypar.index_ranges_unique= MY_TEST(key_info->flags & HA_NOSAME && + key_info->user_defined_key_parts == + my_count_bits(keypar.key_tuple_map)); + + mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode); + is_mrr_assoc= !MY_TEST(mode & HA_MRR_NO_ASSOCIATION); + mrr_funcs= *seq_funcs; + source_exhausted= FALSE; + read_was_interrupted= false; + have_saved_rowid= FALSE; + return 0; +} + + +static int rowid_cmp_reverse(void *file, uchar *a, uchar *b) +{ + return - ((handler*)file)->cmp_ref(a, b); +} + + +int Mrr_ordered_rndpos_reader::init(handler *h_arg, + Mrr_index_reader *index_reader_arg, + uint mode, + Lifo_buffer *buf, + Rowid_filter *filter) +{ + file= h_arg; + index_reader= index_reader_arg; + rowid_buffer= buf; + is_mrr_assoc= !MY_TEST(mode & HA_MRR_NO_ASSOCIATION); + index_reader_exhausted= FALSE; + index_reader_needs_refill= TRUE; + rowid_filter= filter; + + return 0; +} + + +/** + DS-MRR: Fill and sort the rowid buffer + + Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into + buffer. When the buffer is full or scan is completed, sort the buffer by + rowid and return. + + When this function returns, either rowid buffer is not empty, or the source + of lookup keys (i.e. ranges) is exhaused. + + @retval 0 OK, the next portion of rowids is in the buffer, + properly ordered + @retval other Error +*/ + +int Mrr_ordered_rndpos_reader::refill_buffer(bool initial) +{ + int res; + bool first_call= initial; + DBUG_ENTER("Mrr_ordered_rndpos_reader::refill_buffer"); + + if (index_reader_exhausted) + DBUG_RETURN(HA_ERR_END_OF_FILE); + + while (initial || index_reader_needs_refill || + (res= refill_from_index_reader()) == HA_ERR_END_OF_FILE) + { + if ((res= index_reader->refill_buffer(initial))) + { + if (res == HA_ERR_END_OF_FILE) + index_reader_exhausted= TRUE; + break; + } + initial= FALSE; + index_reader_needs_refill= FALSE; + } + + if (!first_call && !index_reader_exhausted) + { + /* Ok, this was a successful buffer refill operation */ + THD *thd= current_thd; + status_var_increment(thd->status_var.ha_mrr_rowid_refills_count); + } + + DBUG_RETURN(res); +} + + +void Mrr_index_reader::position() +{ + file->position(file->get_table()->record[0]); +} + + +/* + @brief Try to refill the rowid buffer without calling + index_reader->refill_buffer(). +*/ + +int Mrr_ordered_rndpos_reader::refill_from_index_reader() +{ + range_id_t range_info; + int res; + DBUG_ENTER("Mrr_ordered_rndpos_reader::refill_from_index_reader"); + + DBUG_ASSERT(rowid_buffer->is_empty()); + index_rowid= index_reader->get_rowid_ptr(); + rowid_buffer->reset(); + rowid_buffer->setup_writing(file->ref_length, + is_mrr_assoc? sizeof(range_id_t) : 0); + + last_identical_rowid= NULL; + + index_reader->resume_read(); + while (rowid_buffer->can_write()) + { + res= index_reader->get_next(&range_info); + + if (res) + { + if (res != HA_ERR_END_OF_FILE) + DBUG_RETURN(res); + index_reader_needs_refill=TRUE; + break; + } + + index_reader->position(); + + /* + If the built rowid filter cannot be used at the engine level, use it here. + */ + if (rowid_filter && !file->pushed_rowid_filter && + !rowid_filter->check((char *)index_rowid)) + continue; + + /* Put rowid, or {rowid, range_id} pair into the buffer */ + rowid_buffer->write_ptr1= index_rowid; + rowid_buffer->write_ptr2= (uchar*)&range_info; + rowid_buffer->write(); + } + + /* + When index_reader_needs_refill=TRUE, this means we've got all of index + tuples for lookups keys that index_reader had. We are not in the middle + of an index read, so there is no need to call interrupt_read. + + Actually, we must not call interrupt_read(), because it could be that we + haven't read a single row (because all index lookups returned + HA_ERR_KEY_NOT_FOUND). In this case, interrupt_read() will cause [harmless] + valgrind warnings when trying to save garbage from table->record[0]. + */ + if (!index_reader_needs_refill) + index_reader->interrupt_read(); + /* Sort the buffer contents by rowid */ + rowid_buffer->sort((qsort2_cmp)rowid_cmp_reverse, (void*)file); + + rowid_buffer->setup_reading(file->ref_length, + is_mrr_assoc ? sizeof(range_id_t) : 0); + DBUG_RETURN(rowid_buffer->is_empty()? HA_ERR_END_OF_FILE : 0); +} + + +/* + Get the next {record, range_id} using ordered array of rowid+range_id pairs + + @note + Since we have sorted rowids, we try not to make multiple rnd_pos() calls + with the same rowid value. +*/ + +int Mrr_ordered_rndpos_reader::get_next(range_id_t *range_info) +{ + int res; + + /* + First, check if rowid buffer has elements with the same rowid value as + the previous. + */ + while (last_identical_rowid) + { + /* + Current record (the one we've returned in previous call) was obtained + from a rowid that matched multiple range_ids. Return this record again, + with next matching range_id. + */ + (void)rowid_buffer->read(); + + if (rowid_buffer->read_ptr1 == last_identical_rowid) + last_identical_rowid= NULL; /* reached the last of identical rowids */ + + if (!is_mrr_assoc) + return 0; + + memcpy(range_info, rowid_buffer->read_ptr2, sizeof(range_id_t)); + if (!index_reader->skip_record(*range_info, rowid_buffer->read_ptr1)) + return 0; + } + + /* + Ok, last_identical_rowid==NULL, it's time to read next different rowid + value and get record for it. + */ + for(;;) + { + /* Return eof if there are no rowids in the buffer after re-fill attempt */ + if (rowid_buffer->read()) + return HA_ERR_END_OF_FILE; + + if (is_mrr_assoc) + { + memcpy(range_info, rowid_buffer->read_ptr2, sizeof(range_id_t)); + if (index_reader->skip_record(*range_info, rowid_buffer->read_ptr1)) + continue; + } + + res= file->ha_rnd_pos(file->get_table()->record[0], + rowid_buffer->read_ptr1); + + if (res) + return res; /* Some fatal error */ + + break; /* Got another record */ + } + + /* + Check if subsequent buffer elements have the same rowid value as this + one. If yes, remember this fact so that we don't make any more rnd_pos() + calls with this value. + + Note: this implies that SQL layer doesn't touch table->record[0] + between calls. + */ + Lifo_buffer_iterator it; + it.init(rowid_buffer); + while (!it.read()) + { + if (file->cmp_ref(it.read_ptr1, rowid_buffer->read_ptr1)) + break; + last_identical_rowid= it.read_ptr1; + } + return 0; +} + + +/**************************************************************************** + * Top-level DS-MRR implementation functions (the ones called by storage engine) + ***************************************************************************/ + +/** + DS-MRR: Initialize and start MRR scan + + Initialize and start the MRR scan. Depending on the mode parameter, this + may use default or DS-MRR implementation. + + @param h_arg Table handler to be used + @param key Index to be used + @param seq_funcs Interval sequence enumeration functions + @param seq_init_param Interval sequence enumeration parameter + @param n_ranges Number of ranges in the sequence. + @param mode HA_MRR_* modes to use + @param buf INOUT Buffer to use + + @retval 0 Ok, Scan started. + @retval other Error +*/ + +int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, + void *seq_init_param, uint n_ranges, uint mode, + HANDLER_BUFFER *buf) +{ + TABLE *table= h_arg->get_table(); + THD *thd= table->in_use; + int res; + Key_parameters keypar; + uint UNINIT_VAR(key_buff_elem_size); /* set/used when do_sort_keys==TRUE */ + handler *h_idx; + Mrr_ordered_rndpos_reader *disk_strategy= NULL; + bool do_sort_keys= FALSE; + DBUG_ENTER("DsMrr_impl::dsmrr_init"); + /* + index_merge may invoke a scan on an object for which dsmrr_info[_const] + has not been called, so set the owner handler here as well. + */ + primary_file= h_arg; + is_mrr_assoc= !MY_TEST(mode & HA_MRR_NO_ASSOCIATION); + + strategy_exhausted= FALSE; + + /* By default, have do-nothing buffer manager */ + buf_manager.arg= this; + buf_manager.reset_buffer_sizes= do_nothing; + buf_manager.redistribute_buffer_space= do_nothing; + + if (mode & (HA_MRR_USE_DEFAULT_IMPL | HA_MRR_SORTED)) + goto use_default_impl; + + /* + Determine whether we'll need to do key sorting and/or rnd_pos() scan + */ + index_strategy= NULL; + if ((mode & HA_MRR_SINGLE_POINT) && + optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS)) + { + do_sort_keys= TRUE; + index_strategy= &reader_factory.ordered_index_reader; + } + else + index_strategy= &reader_factory.simple_index_reader; + + strategy= index_strategy; + /* + We don't need a rowid-to-rndpos step if + - We're doing a scan on clustered primary key + - [In the future] We're doing an index_only read + */ + DBUG_ASSERT(primary_file->inited == handler::INDEX || + (primary_file->inited == handler::RND && + secondary_file && + secondary_file->inited == handler::INDEX)); + + h_idx= (primary_file->inited == handler::INDEX)? primary_file: secondary_file; + keyno= h_idx->active_index; + + if (! h_idx->is_clustering_key(keyno)) + { + strategy= disk_strategy= &reader_factory.ordered_rndpos_reader; + if (h_arg->pushed_rowid_filter) + { + /* + Currently usage of a rowid filter within InnoDB engine is not supported + if the table is accessed by the primary key. + With optimizer switches ''mrr' and 'mrr_sort_keys' are both enabled + any access by a secondary index is converted to the rndpos access. In + InnoDB the rndpos access is always uses the primary key. + Do not use pushed rowid filter if the table is accessed actually by the + primary key. Use the rowid filter outside the engine code (see + Mrr_ordered_rndpos_reader::refill_from_index_reader). + */ + rowid_filter= h_arg->pushed_rowid_filter; + h_arg->cancel_pushed_rowid_filter(); + } + } + + full_buf= buf->buffer; + full_buf_end= buf->buffer_end; + + if (do_sort_keys) + { + /* Pre-calculate some parameters of key sorting */ + keypar.use_key_pointers= MY_TEST(mode & HA_MRR_MATERIALIZED_KEYS); + seq_funcs->get_key_info(seq_init_param, &keypar.key_tuple_length, + &keypar.key_tuple_map); + keypar.key_size_in_keybuf= keypar.use_key_pointers? + sizeof(char*) : keypar.key_tuple_length; + key_buff_elem_size= keypar.key_size_in_keybuf + (int)is_mrr_assoc * sizeof(void*); + + /* Ordered index reader needs some space to store an index tuple */ + if (strategy != index_strategy) + { + uint saved_pk_length=0; + uint pk= h_idx->get_table()->s->primary_key; + if (h_idx->pk_is_clustering_key(pk)) + { + saved_pk_length= h_idx->get_table()->key_info[pk].key_length; + } + + KEY *used_index= &h_idx->get_table()->key_info[h_idx->active_index]; + if (reader_factory.ordered_index_reader. + set_interruption_temp_buffer(primary_file->ref_length, + used_index->key_length, + saved_pk_length, + &full_buf, full_buf_end)) + goto use_default_impl; + } + else + reader_factory.ordered_index_reader.set_no_interruption_temp_buffer(); + } + + if (strategy == index_strategy) + { + /* + Index strategy alone handles the record retrieval. Give all buffer space + to it. Key buffer should have forward orientation so we can return the + end of it. + */ + key_buffer= &forward_key_buf; + key_buffer->set_buffer_space(full_buf, full_buf_end); + + /* Safety: specify that rowid buffer has zero size: */ + rowid_buffer.set_buffer_space(full_buf_end, full_buf_end); + + if (do_sort_keys && !key_buffer->have_space_for(key_buff_elem_size)) + goto use_default_impl; + + if ((res= index_strategy->init(primary_file, seq_funcs, seq_init_param, n_ranges, + mode, &keypar, key_buffer, &buf_manager))) + goto error; + } + else + { + /* We'll have both index and rndpos strategies working together */ + if (do_sort_keys) + { + /* Both strategies will need buffer space, share the buffer */ + if (setup_buffer_sharing(keypar.key_size_in_keybuf, keypar.key_tuple_map)) + goto use_default_impl; + + buf_manager.reset_buffer_sizes= reset_buffer_sizes; + buf_manager.redistribute_buffer_space= redistribute_buffer_space; + } + else + { + /* index strategy doesn't need buffer, give all space to rowids*/ + rowid_buffer.set_buffer_space(full_buf, full_buf_end); + if (!rowid_buffer.have_space_for(primary_file->ref_length + + (int)is_mrr_assoc * sizeof(range_id_t))) + goto use_default_impl; + } + + // setup_two_handlers() will call dsmrr_close() will clears the filter. + // Save its value and restore afterwards. + Rowid_filter *tmp = rowid_filter; + if ((res= setup_two_handlers())) + goto error; + rowid_filter= tmp; + + if ((res= index_strategy->init(secondary_file, seq_funcs, seq_init_param, + n_ranges, mode, &keypar, key_buffer, + &buf_manager)) || + (res= disk_strategy->init(primary_file, index_strategy, mode, + &rowid_buffer, rowid_filter))) + { + goto error; + } + } + + /* + At this point, we're sure that we're running a native MRR scan (i.e. we + didnt fall back to default implementation for some reason). + */ + status_var_increment(thd->status_var.ha_mrr_init_count); + + res= strategy->refill_buffer(TRUE); + if (res) + { + if (res != HA_ERR_END_OF_FILE) + goto error; + strategy_exhausted= TRUE; + } + + /* + If we have scanned through all intervals in *seq, then adjust *buf to + indicate that the remaining buffer space will not be used. + */ +// if (dsmrr_eof) +// buf->end_of_used_area= rowid_buffer.end_of_space(); + + + DBUG_RETURN(0); +error: + close_second_handler(); + /* Safety, not really needed but: */ + strategy= NULL; + DBUG_RETURN(res); + +use_default_impl: + if (primary_file->inited != handler::INDEX) + { + /* We can get here when + - we've previously successfully done a DS-MRR scan (and so have + secondary_file!= NULL, secondary_file->inited= INDEX, + primary_file->inited=RND) + - for this invocation, we haven't got enough buffer space, and so we + have to use the default MRR implementation. + + note: primary_file->ha_index_end() will call dsmrr_close() which will + close/destroy the secondary_file, this is intentional. + (Yes this is slow, but one can't expect performance with join buffer + so small that it can accomodate one rowid and one index tuple) + */ + if ((res= primary_file->ha_rnd_end()) || + (res= primary_file->ha_index_init(keyno, MY_TEST(mode & HA_MRR_SORTED)))) + { + DBUG_RETURN(res); + } + } + /* Call correct init function and assign to top level object */ + Mrr_simple_index_reader *s= &reader_factory.simple_index_reader; + res= s->init(primary_file, seq_funcs, seq_init_param, n_ranges, mode, NULL, + NULL, NULL); + strategy= s; + DBUG_RETURN(res); +} + + +/* + Whatever the current state is, make it so that we have two handler objects: + - primary_file - initialized for rnd_pos() scan + - secondary_file - initialized for scanning the index specified in + this->keyno + RETURN + 0 OK + HA_XXX Error code +*/ + +int DsMrr_impl::setup_two_handlers() +{ + int res; + THD *thd= primary_file->get_table()->in_use; + DBUG_ENTER("DsMrr_impl::setup_two_handlers"); + if (!secondary_file) + { + handler *new_h2; + Item *pushed_cond= NULL; + DBUG_ASSERT(primary_file->inited == handler::INDEX); + /* Create a separate handler object to do rnd_pos() calls. */ + /* + ::clone() takes up a lot of stack, especially on 64 bit platforms. + The constant 5 is an empiric result. + */ + if (check_stack_overrun(thd, 5*STACK_MIN_SIZE, (uchar*) &new_h2)) + DBUG_RETURN(1); + + /* Create a separate handler object to do rnd_pos() calls. */ + if (!(new_h2= primary_file->clone(primary_file->get_table()->s-> + normalized_path.str, + thd->mem_root)) || + new_h2->ha_external_lock(thd, F_RDLCK)) + { + delete new_h2; + DBUG_RETURN(1); + } + + if (keyno == primary_file->pushed_idx_cond_keyno) + pushed_cond= primary_file->pushed_idx_cond; + + Mrr_reader *save_strategy= strategy; + strategy= NULL; + /* + Caution: this call will invoke this->dsmrr_close(). Do not put the + created secondary table handler new_h2 into this->secondary_file or it + will delete it. Also, save the picked strategy + */ + res= primary_file->ha_index_end(); + + strategy= save_strategy; + secondary_file= new_h2; + + if (res || (res= (primary_file->ha_rnd_init(FALSE)))) + goto error; + + table->prepare_for_position(); + secondary_file->extra(HA_EXTRA_KEYREAD); + secondary_file->mrr_iter= primary_file->mrr_iter; + + if ((res= secondary_file->ha_index_init(keyno, FALSE))) + goto error; + + if (pushed_cond) + secondary_file->idx_cond_push(keyno, pushed_cond); + } + else + { + DBUG_ASSERT(secondary_file && secondary_file->inited==handler::INDEX); + /* + We get here when the access alternates betwen MRR scan(s) and non-MRR + scans. + + Calling primary_file->index_end() will invoke dsmrr_close() for this + object, which will delete secondary_file. We need to keep it, so put it + away and don't let it be deleted: + */ + if (primary_file->inited == handler::INDEX) + { + handler *save_h2= secondary_file; + Mrr_reader *save_strategy= strategy; + secondary_file= NULL; + strategy= NULL; + res= primary_file->ha_index_end(); + secondary_file= save_h2; + strategy= save_strategy; + if (res) + goto error; + } + if ((primary_file->inited != handler::RND) && + (res= primary_file->ha_rnd_init(FALSE))) + goto error; + } + DBUG_RETURN(0); + +error: + DBUG_RETURN(res); +} + + +void DsMrr_impl::close_second_handler() +{ + if (secondary_file) + { + secondary_file->extra(HA_EXTRA_NO_KEYREAD); + secondary_file->ha_index_or_rnd_end(); + secondary_file->ha_external_unlock(current_thd); + secondary_file->ha_close(); + delete secondary_file; + secondary_file= NULL; + } +} + + +void DsMrr_impl::dsmrr_close() +{ + DBUG_ENTER("DsMrr_impl::dsmrr_close"); + rowid_filter= NULL; + close_second_handler(); + strategy= NULL; + DBUG_VOID_RETURN; +} + + +/* + my_qsort2-compatible static member function to compare key tuples +*/ + +int Mrr_ordered_index_reader::compare_keys(void* arg, uchar* key1_arg, + uchar* key2_arg) +{ + Mrr_ordered_index_reader *reader= (Mrr_ordered_index_reader*)arg; + TABLE *table= reader->file->get_table(); + KEY_PART_INFO *part= table->key_info[reader->file->active_index].key_part; + uchar *key1, *key2; + + if (reader->keypar.use_key_pointers) + { + /* the buffer stores pointers to keys, get to the keys */ + memcpy(&key1, key1_arg, sizeof(char*)); + memcpy(&key2, key2_arg, sizeof(char*)); + } + else + { + key1= key1_arg; + key2= key2_arg; + } + + return key_tuple_cmp(part, key1, key2, reader->keypar.key_tuple_length); +} + + +int Mrr_ordered_index_reader::compare_keys_reverse(void* arg, uchar* key1, + uchar* key2) +{ + return -compare_keys(arg, key1, key2); +} + + +/** + Set the buffer space to be shared between rowid and key buffer + + @return FALSE ok + @return TRUE There is so little buffer space that we won't be able to use + the strategy. + This happens when we don't have enough space for one rowid + element and one key element so this is mainly targeted at + testing. +*/ + +bool DsMrr_impl::setup_buffer_sharing(uint key_size_in_keybuf, + key_part_map key_tuple_map) +{ + long key_buff_elem_size= key_size_in_keybuf + + (int)is_mrr_assoc * sizeof(range_id_t); + + KEY *key_info= &primary_file->get_table()->key_info[keyno]; + /* + Ok if we got here we need to allocate one part of the buffer + for keys and another part for rowids. + */ + ulonglong rowid_buf_elem_size= primary_file->ref_length + + (int)is_mrr_assoc * sizeof(range_id_t); + + /* + Use rec_per_key statistics as a basis to find out how many rowids + we'll get for each key value. + TODO: what should be the default value to use when there is no + statistics? + */ + uint parts= my_count_bits(key_tuple_map); + ha_rows rpc; + ulonglong rowids_size= rowid_buf_elem_size; + if ((rpc= (ha_rows) key_info->actual_rec_per_key(parts - 1))) + rowids_size= rowid_buf_elem_size * rpc; + + double fraction_for_rowids= + (ulonglong2double(rowids_size) / + (ulonglong2double(rowids_size) + key_buff_elem_size)); + + ptrdiff_t bytes_for_rowids= + (ptrdiff_t)floor(0.5 + fraction_for_rowids * (full_buf_end - full_buf)); + + ptrdiff_t bytes_for_keys= (full_buf_end - full_buf) - bytes_for_rowids; + + if (bytes_for_keys < key_buff_elem_size + 1 || + bytes_for_rowids < (ptrdiff_t)rowid_buf_elem_size + 1) + return TRUE; /* Failed to provide minimum space for one of the buffers */ + + rowid_buffer_end= full_buf + bytes_for_rowids; + rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end); + key_buffer= &backward_key_buf; + key_buffer->set_buffer_space(rowid_buffer_end, full_buf_end); + + /* The above code guarantees that the buffers are big enough */ + DBUG_ASSERT(key_buffer->have_space_for(key_buff_elem_size) && + rowid_buffer.have_space_for((size_t)rowid_buf_elem_size)); + + return FALSE; +} + + +void DsMrr_impl::do_nothing(void *dsmrr_arg) +{ + /* Do nothing */ +} + + +void DsMrr_impl::reset_buffer_sizes(void *dsmrr_arg) +{ + DsMrr_impl *dsmrr= (DsMrr_impl*)dsmrr_arg; + dsmrr->rowid_buffer.set_buffer_space(dsmrr->full_buf, + dsmrr->rowid_buffer_end); + dsmrr->key_buffer->set_buffer_space(dsmrr->rowid_buffer_end, + dsmrr->full_buf_end); +} + + +/* + Take unused space from the key buffer and give it to the rowid buffer +*/ + +void DsMrr_impl::redistribute_buffer_space(void *dsmrr_arg) +{ + DsMrr_impl *dsmrr= (DsMrr_impl*)dsmrr_arg; + uchar *unused_start, *unused_end; + dsmrr->key_buffer->remove_unused_space(&unused_start, &unused_end); + dsmrr->rowid_buffer.grow(unused_start, unused_end); +} + + +/* + @brief Initialize the iterator + + @note + Initialize the iterator to produce matches for the key of the first element + in owner_arg->key_buffer + + @retval 0 OK + @retval HA_ERR_END_OF_FILE Either the owner->key_buffer is empty or + no matches for the key we've tried (check + key_buffer->is_empty() to tell these apart) + @retval other code Fatal error +*/ + +int Key_value_records_iterator::init(Mrr_ordered_index_reader *owner_arg) +{ + int res; + owner= owner_arg; + + identical_key_it.init(owner->key_buffer); + owner->key_buffer->setup_reading(owner->keypar.key_size_in_keybuf, + owner->is_mrr_assoc ? sizeof(void*) : 0); + + if (identical_key_it.read()) + return HA_ERR_END_OF_FILE; + + uchar *key_in_buf= last_identical_key_ptr= identical_key_it.read_ptr1; + + uchar *index_tuple= key_in_buf; + if (owner->keypar.use_key_pointers) + memcpy(&index_tuple, key_in_buf, sizeof(char*)); + + /* Check out how many more identical keys are following */ + while (!identical_key_it.read()) + { + if (Mrr_ordered_index_reader::compare_keys(owner, key_in_buf, + identical_key_it.read_ptr1)) + break; + last_identical_key_ptr= identical_key_it.read_ptr1; + } + identical_key_it.init(owner->key_buffer); + res= owner->file->ha_index_read_map(owner->file->get_table()->record[0], + index_tuple, + owner->keypar.key_tuple_map, + HA_READ_KEY_EXACT); + + if (res) + { + /* Failed to find any matching records */ + move_to_next_key_value(); + return res; + } + owner->have_saved_rowid= FALSE; + get_next_row= FALSE; + return 0; +} + + +int Key_value_records_iterator::get_next(range_id_t *range_info) +{ + int res; + + if (get_next_row) + { + if (owner->keypar.index_ranges_unique) + { + /* We're using a full unique key, no point to call index_next_same */ + return HA_ERR_END_OF_FILE; + } + + handler *h= owner->file; + uchar *lookup_key; + if (owner->keypar.use_key_pointers) + memcpy(&lookup_key, identical_key_it.read_ptr1, sizeof(void*)); + else + lookup_key= identical_key_it.read_ptr1; + + if ((res= h->ha_index_next_same(h->get_table()->record[0], + lookup_key, + owner->keypar.key_tuple_length))) + { + /* It's either HA_ERR_END_OF_FILE or some other error */ + return res; + } + identical_key_it.init(owner->key_buffer); + owner->have_saved_rowid= FALSE; + get_next_row= FALSE; + } + + identical_key_it.read(); /* This gets us next range_id */ + memcpy(range_info, identical_key_it.read_ptr2, sizeof(range_id_t)); + + if (!last_identical_key_ptr || + (identical_key_it.read_ptr1 == last_identical_key_ptr)) + { + /* + We've reached the last of the identical keys that current record is a + match for. Set get_next_row=TRUE so that we read the next index record + on the next call to this function. + */ + get_next_row= TRUE; + } + return 0; +} + + +void Key_value_records_iterator::move_to_next_key_value() +{ + while (!owner->key_buffer->read() && + (owner->key_buffer->read_ptr1 != last_identical_key_ptr)) {} +} + + +/** + DS-MRR implementation: multi_range_read_next() function. + + Calling convention is like multi_range_read_next() has. +*/ + +int DsMrr_impl::dsmrr_next(range_id_t *range_info) +{ + int res; + if (strategy_exhausted) + return HA_ERR_END_OF_FILE; + + while ((res= strategy->get_next(range_info)) == HA_ERR_END_OF_FILE) + { + if ((res= strategy->refill_buffer(FALSE))) + break; /* EOF or error */ + } + return res; +} + + +/** + DS-MRR implementation: multi_range_read_info() function +*/ +ha_rows DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows, + uint key_parts, + uint *bufsz, uint *flags, Cost_estimate *cost) +{ + ha_rows res __attribute__((unused)); + uint def_flags= *flags; + uint def_bufsz= *bufsz; + + /* Get cost/flags/mem_usage of default MRR implementation */ + res= primary_file->handler::multi_range_read_info(keyno, n_ranges, rows, + key_parts, &def_bufsz, + &def_flags, cost); + DBUG_ASSERT(!res); + + if ((*flags & HA_MRR_USE_DEFAULT_IMPL) || + choose_mrr_impl(keyno, rows, flags, bufsz, cost)) + { + /* Default implementation is chosen */ + DBUG_PRINT("info", ("Default MRR implementation chosen")); + *flags= def_flags; + *bufsz= def_bufsz; + } + else + { + /* *flags and *bufsz were set by choose_mrr_impl */ + DBUG_PRINT("info", ("DS-MRR implementation chosen")); + } + return 0; +} + + +/** + DS-MRR Implementation: multi_range_read_info_const() function +*/ + +ha_rows DsMrr_impl::dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq, + void *seq_init_param, uint n_ranges, + uint *bufsz, uint *flags, Cost_estimate *cost) +{ + ha_rows rows; + uint def_flags= *flags; + uint def_bufsz= *bufsz; + /* Get cost/flags/mem_usage of default MRR implementation */ + rows= primary_file->handler::multi_range_read_info_const(keyno, seq, + seq_init_param, + n_ranges, + &def_bufsz, + &def_flags, cost); + if (rows == HA_POS_ERROR) + { + /* Default implementation can't perform MRR scan => we can't either */ + return rows; + } + + /* + If HA_MRR_USE_DEFAULT_IMPL has been passed to us, that is an order to + use the default MRR implementation (we need it for UPDATE/DELETE). + Otherwise, make a choice based on cost and @@optimizer_switch settings + */ + if ((*flags & HA_MRR_USE_DEFAULT_IMPL) || + choose_mrr_impl(keyno, rows, flags, bufsz, cost)) + { + DBUG_PRINT("info", ("Default MRR implementation chosen")); + *flags= def_flags; + *bufsz= def_bufsz; + } + else + { + /* *flags and *bufsz were set by choose_mrr_impl */ + DBUG_PRINT("info", ("DS-MRR implementation chosen")); + } + return rows; +} + + +/** + Check if key has partially-covered columns + + We can't use DS-MRR to perform range scans when the ranges are over + partially-covered keys, because we'll not have full key part values + (we'll have their prefixes from the index) and will not be able to check + if we've reached the end the range. + + @param keyno Key to check + + @todo + Allow use of DS-MRR in cases where the index has partially-covered + components but they are not used for scanning. + + @retval TRUE Yes + @retval FALSE No +*/ + +bool key_uses_partial_cols(TABLE_SHARE *share, uint keyno) +{ + KEY_PART_INFO *kp= share->key_info[keyno].key_part; + KEY_PART_INFO *kp_end= kp + share->key_info[keyno].user_defined_key_parts; + for (; kp != kp_end; kp++) + { + if (!kp->field->part_of_key.is_set(keyno)) + return TRUE; + } + return FALSE; +} + + +/* + Check if key/flags allow DS-MRR/CPK strategy to be used + + @param thd + @param keyno Index that will be used + @param mrr_flags + + @retval TRUE DS-MRR/CPK should be used + @retval FALSE Otherwise +*/ + +bool DsMrr_impl::check_cpk_scan(THD *thd, TABLE_SHARE *share, uint keyno, + uint mrr_flags) +{ + return MY_TEST((mrr_flags & HA_MRR_SINGLE_POINT) && + primary_file->is_clustering_key(keyno) && + optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS)); +} + + +/* + DS-MRR Internals: Choose between Default MRR implementation and DS-MRR + + Make the choice between using Default MRR implementation and DS-MRR. + This function contains common functionality factored out of dsmrr_info() + and dsmrr_info_const(). The function assumes that the default MRR + implementation's applicability requirements are satisfied. + + @param keyno Index number + @param rows E(full rows to be retrieved) + @param flags IN MRR flags provided by the MRR user + OUT If DS-MRR is chosen, flags of DS-MRR implementation + else the value is not modified + @param bufsz IN If DS-MRR is chosen, buffer use of DS-MRR implementation + else the value is not modified + @param cost IN Cost of default MRR implementation + OUT If DS-MRR is chosen, cost of DS-MRR scan + else the value is not modified + + @retval TRUE Default MRR implementation should be used + @retval FALSE DS-MRR implementation should be used +*/ + + +bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, + uint *bufsz, Cost_estimate *cost) +{ + Cost_estimate dsmrr_cost; + bool res; + THD *thd= primary_file->get_table()->in_use; + TABLE_SHARE *share= primary_file->get_table_share(); + + bool doing_cpk_scan= check_cpk_scan(thd, share, keyno, *flags); + bool using_cpk= primary_file->is_clustering_key(keyno); + *flags &= ~HA_MRR_IMPLEMENTATION_FLAGS; + if (!optimizer_flag(thd, OPTIMIZER_SWITCH_MRR) || + *flags & HA_MRR_INDEX_ONLY || + (using_cpk && !doing_cpk_scan) || key_uses_partial_cols(share, keyno)) + { + /* Use the default implementation */ + *flags |= HA_MRR_USE_DEFAULT_IMPL; + *flags &= ~HA_MRR_IMPLEMENTATION_FLAGS; + return TRUE; + } + + uint add_len= share->key_info[keyno].key_length + primary_file->ref_length; + if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, add_len, + &dsmrr_cost)) + return TRUE; + + bool force_dsmrr; + /* + If mrr_cost_based flag is not set, then set cost of DS-MRR to be minimum of + DS-MRR and Default implementations cost. This allows one to force use of + DS-MRR whenever it is applicable without affecting other cost-based + choices. + */ + if ((force_dsmrr= !optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_COST_BASED)) && + dsmrr_cost.total_cost() > cost->total_cost()) + dsmrr_cost= *cost; + + if (force_dsmrr || dsmrr_cost.total_cost() <= cost->total_cost()) + { + *flags &= ~HA_MRR_USE_DEFAULT_IMPL; /* Use the DS-MRR implementation */ + *flags &= ~HA_MRR_SORTED; /* We will return unordered output */ + *cost= dsmrr_cost; + res= FALSE; + + + if ((using_cpk && doing_cpk_scan) || + (optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS) && + *flags & HA_MRR_SINGLE_POINT)) + { + *flags |= DSMRR_IMPL_SORT_KEYS; + } + + if (!(using_cpk && doing_cpk_scan) && + !(*flags & HA_MRR_INDEX_ONLY)) + { + *flags |= DSMRR_IMPL_SORT_ROWIDS; + } + /* + if ((*flags & HA_MRR_SINGLE_POINT) && + optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS)) + *flags |= HA_MRR_MATERIALIZED_KEYS; + */ + } + else + { + /* Use the default MRR implementation */ + res= TRUE; + } + return res; +} + +/* + Take the flags we've returned previously and print one of + - Key-ordered scan + - Rowid-ordered scan + - Key-ordered Rowid-ordered scan +*/ + +int DsMrr_impl::dsmrr_explain_info(uint mrr_mode, char *str, size_t size) +{ + const char *key_ordered= "Key-ordered scan"; + const char *rowid_ordered= "Rowid-ordered scan"; + const char *both_ordered= "Key-ordered Rowid-ordered scan"; + const char *used_str=""; + const uint BOTH_FLAGS= (DSMRR_IMPL_SORT_KEYS | DSMRR_IMPL_SORT_ROWIDS); + + if (!(mrr_mode & HA_MRR_USE_DEFAULT_IMPL)) + { + if ((mrr_mode & BOTH_FLAGS) == BOTH_FLAGS) + used_str= both_ordered; + else if (mrr_mode & DSMRR_IMPL_SORT_KEYS) + used_str= key_ordered; + else if (mrr_mode & DSMRR_IMPL_SORT_ROWIDS) + used_str= rowid_ordered; + + size_t used_str_len= strlen(used_str); + size_t copy_len= MY_MIN(used_str_len, size); + memcpy(str, used_str, copy_len); + return (int)copy_len; + } + return 0; +} + + +static void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, Cost_estimate *cost); + + +/** + Get cost of DS-MRR scan + + @param keynr Index to be used + @param rows E(Number of rows to be scanned) + @param flags Scan parameters (HA_MRR_* flags) + @param buffer_size INOUT Buffer size + IN: Buffer of size 0 means the function + will determine the best size and return it. + @param extra_mem_overhead Extra memory overhead of the MRR implementation + (the function assumes this many bytes of buffer + space will not be usable by DS-MRR) + @param cost OUT The cost + + @retval FALSE OK + @retval TRUE Error, DS-MRR cannot be used (the buffer is too small + for even 1 rowid) +*/ + +bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, + uint *buffer_size, + uint extra_mem_overhead, + Cost_estimate *cost) +{ + ulong max_buff_entries, elem_size; + ha_rows rows_in_full_step; + ha_rows rows_in_last_step; + uint n_full_steps; + double index_read_cost; + + elem_size= primary_file->ref_length + + sizeof(void*) * (!MY_TEST(flags & HA_MRR_NO_ASSOCIATION)); + + if (!*buffer_size) + { + /* + We are requested to determine how much memory we need. + Request memory to finish the scan in one pass but do not request + more than @@mrr_buff_size. + */ + *buffer_size= (uint) MY_MIN(extra_mem_overhead + elem_size*(ulong)rows, + MY_MAX(table->in_use->variables.mrr_buff_size, + extra_mem_overhead)); + } + + if (elem_size + extra_mem_overhead > *buffer_size) + return TRUE; /* Buffer has not enough space for even 1 rowid */ + + max_buff_entries = (*buffer_size - extra_mem_overhead) / elem_size; + + /* Number of iterations we'll make with full buffer */ + n_full_steps= (uint)floor(rows2double(rows) / max_buff_entries); + + /* + Get numbers of rows we'll be processing in + - non-last sweep, with full buffer + - last iteration, with non-full buffer + */ + rows_in_full_step= max_buff_entries; + rows_in_last_step= rows % max_buff_entries; + + /* Adjust buffer size if we expect to use only part of the buffer */ + if (n_full_steps) + { + get_sort_and_sweep_cost(table, rows_in_full_step, cost); + cost->multiply(n_full_steps); + } + else + { + cost->reset(); + *buffer_size= (uint)MY_MAX(*buffer_size, + (size_t)(1.2*rows_in_last_step) * elem_size + + primary_file->ref_length + table->key_info[keynr].key_length); + } + + Cost_estimate last_step_cost; + get_sort_and_sweep_cost(table, rows_in_last_step, &last_step_cost); + cost->add(&last_step_cost); + + if (n_full_steps != 0) + cost->mem_cost= *buffer_size; + else + cost->mem_cost= (double)rows_in_last_step * elem_size; + + /* Total cost of all index accesses */ + index_read_cost= primary_file->keyread_time(keynr, 1, rows); + cost->add_io(index_read_cost, 1 /* Random seeks */); + return FALSE; +} + + +/* + Get cost of one sort-and-sweep step + + It consists of two parts: + - sort an array of #nrows ROWIDs using qsort + - read #nrows records from table in a sweep. + + @param table Table being accessed + @param nrows Number of rows to be sorted and retrieved + @param cost OUT The cost of scan +*/ + +static +void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, Cost_estimate *cost) +{ + if (nrows) + { + get_sweep_read_cost(table, nrows, FALSE, cost); + /* Add cost of qsort call: n * log2(n) * cost(rowid_comparison) */ + double cmp_op= rows2double(nrows) * (1.0 / TIME_FOR_COMPARE_ROWID); + if (cmp_op < 3) + cmp_op= 3; + cost->cpu_cost += cmp_op * log2(cmp_op); + } + else + cost->reset(); +} + + +/** + Get cost of reading nrows table records in a "disk sweep" + + A disk sweep read is a sequence of handler->rnd_pos(rowid) calls that made + for an ordered sequence of rowids. + + We assume hard disk IO. The read is performed as follows: + + 1. The disk head is moved to the needed cylinder + 2. The controller waits for the plate to rotate + 3. The data is transferred + + Time to do #3 is insignificant compared to #2+#1. + + Time to move the disk head is proportional to head travel distance. + + Time to wait for the plate to rotate depends on whether the disk head + was moved or not. + + If disk head wasn't moved, the wait time is proportional to distance + between the previous block and the block we're reading. + + If the head was moved, we don't know how much we'll need to wait for the + plate to rotate. We assume the wait time to be a variate with a mean of + 0.5 of full rotation time. + + Our cost units are "random disk seeks". The cost of random disk seek is + actually not a constant, it depends one range of cylinders we're going + to access. We make it constant by introducing a fuzzy concept of "typical + datafile length" (it's fuzzy as it's hard to tell whether it should + include index file, temp.tables etc). Then random seek cost is: + + 1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length + + We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9. + + If handler::avg_io_cost() < 1.0, then we will trust the handler + when it comes to the average cost (this is for example true for HEAP). + + @param table Table to be accessed + @param nrows Number of rows to retrieve + @param interrupted TRUE <=> Assume that the disk sweep will be + interrupted by other disk IO. FALSE - otherwise. + @param cost OUT The cost. +*/ + +void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted, + Cost_estimate *cost) +{ + DBUG_ENTER("get_sweep_read_cost"); + + cost->reset(); + if (table->file->pk_is_clustering_key(table->s->primary_key)) + { + cost->cpu_cost= table->file->read_time(table->s->primary_key, + (uint) nrows, nrows); + } + else if ((cost->avg_io_cost= table->file->avg_io_cost()) >= 0.999) + { + double n_blocks= + ceil(ulonglong2double(table->file->stats.data_file_length) / IO_SIZE); + double busy_blocks= + n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, rows2double(nrows))); + if (busy_blocks < 1.0) + busy_blocks= 1.0; + + DBUG_PRINT("info",("sweep: nblocks=%g, busy_blocks=%g", n_blocks, + busy_blocks)); + cost->io_count= busy_blocks; + + if (!interrupted) + { + /* Assume reading is done in one 'sweep' */ + cost->avg_io_cost= (DISK_SEEK_BASE_COST + + DISK_SEEK_PROP_COST*n_blocks/busy_blocks); + } + } + DBUG_PRINT("info",("returning cost=%g", cost->total_cost())); + DBUG_VOID_RETURN; +} + + +/* ************************************************************************** + * DS-MRR implementation ends + ***************************************************************************/ |