Diffstat (limited to 'sql/sql_digest.cc')
-rw-r--r-- | sql/sql_digest.cc | 688
1 file changed, 688 insertions, 0 deletions
diff --git a/sql/sql_digest.cc b/sql/sql_digest.cc
new file mode 100644
index 00000000..36a6b398
--- /dev/null
+++ b/sql/sql_digest.cc
@@ -0,0 +1,688 @@
+/* Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+   Copyright (c) 2017, MariaDB Corporation.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
+
+/*
+  This code needs extra visibility in the lexer structures
+*/
+
+#include "mariadb.h"
+#include "my_md5.h"
+#include "unireg.h"
+
+#include "sql_string.h"
+#include "sql_class.h"
+#include "sql_lex.h"
+#include "sp_pcontext.h"
+#include "sql_digest.h"
+#include "sql_digest_stream.h"
+
+#include "sql_get_diagnostics.h"
+
+/* Generated code */
+#include "yy_mariadb.hh"
+#define LEX_TOKEN_WITH_DEFINITION
+#include "lex_token.h"
+
+/* Name pollution from sql/sql_lex.h */
+#ifdef LEX_YYSTYPE
+#undef LEX_YYSTYPE
+#endif
+
+#define LEX_YYSTYPE YYSTYPE*
+
+#define SIZE_OF_A_TOKEN 2
+
+/**
+  Read a single token from the token array.
+*/
+inline uint read_token(const sql_digest_storage *digest_storage,
+                       uint index, uint *tok)
+{
+  uint safe_byte_count= digest_storage->m_byte_count;
+
+  if (index + SIZE_OF_A_TOKEN <= safe_byte_count &&
+      safe_byte_count <= digest_storage->m_token_array_length)
+  {
+    const unsigned char *src= & digest_storage->m_token_array[index];
+    *tok= src[0] | (src[1] << 8);
+    return index + SIZE_OF_A_TOKEN;
+  }
+
+  /* The input byte stream is exhausted. */
+  *tok= 0;
+  return MAX_DIGEST_STORAGE_SIZE + 1;
+}
+
+/**
+  Store a single token in the token array.
+*/
+inline void store_token(sql_digest_storage* digest_storage, uint token)
+{
+  DBUG_ASSERT(digest_storage->m_byte_count <= digest_storage->m_token_array_length);
+
+  if (digest_storage->m_byte_count + SIZE_OF_A_TOKEN <= digest_storage->m_token_array_length)
+  {
+    unsigned char* dest= & digest_storage->m_token_array[digest_storage->m_byte_count];
+    dest[0]= token & 0xff;
+    dest[1]= (token >> 8) & 0xff;
+    digest_storage->m_byte_count+= SIZE_OF_A_TOKEN;
+  }
+  else
+  {
+    digest_storage->m_full= true;
+  }
+}
+
+/**
+  Read an identifier from the token array.
+*/
+inline uint read_identifier(const sql_digest_storage* digest_storage,
+                            uint index, char **id_string, int *id_length)
+{
+  uint new_index;
+  uint safe_byte_count= digest_storage->m_byte_count;
+
+  DBUG_ASSERT(index <= safe_byte_count);
+  DBUG_ASSERT(safe_byte_count <= digest_storage->m_token_array_length);
+
+  /*
+    token + length + string are written in an atomic way,
+    so we always expect a length + string here
+  */
+
+  uint bytes_needed= SIZE_OF_A_TOKEN;
+  /* If we can read token and identifier length */
+  if ((index + bytes_needed) <= safe_byte_count)
+  {
+    const unsigned char *src= & digest_storage->m_token_array[index];
+    /* Read the length of identifier */
+    uint length= src[0] | (src[1] << 8);
+    bytes_needed+= length;
+    /* If we can read entire identifier from token array */
+    if ((index + bytes_needed) <= safe_byte_count)
+    {
+      *id_string= (char *) (src + 2);
+      *id_length= length;
+
+      new_index= index + bytes_needed;
+      DBUG_ASSERT(new_index <= safe_byte_count);
+      return new_index;
+    }
+  }
+
+  /* The input byte stream is exhausted. */
+  return MAX_DIGEST_STORAGE_SIZE + 1;
+}
+
+/**
+  Store an identifier in the token array.
+*/
+inline void store_token_identifier(sql_digest_storage* digest_storage,
+                                   uint token,
+                                   size_t id_length, const char *id_name)
+{
+  DBUG_ASSERT(digest_storage->m_byte_count <= digest_storage->m_token_array_length);
+
+  size_t bytes_needed= 2 * SIZE_OF_A_TOKEN + id_length;
+  if (digest_storage->m_byte_count + bytes_needed <= (unsigned int)digest_storage->m_token_array_length)
+  {
+    unsigned char* dest= & digest_storage->m_token_array[digest_storage->m_byte_count];
+    /* Write the token */
+    dest[0]= token & 0xff;
+    dest[1]= (token >> 8) & 0xff;
+    /* Write the string length */
+    dest[2]= id_length & 0xff;
+    dest[3]= (id_length >> 8) & 0xff;
+    /* Write the string data */
+    if (id_length > 0)
+      memcpy((char *)(dest + 4), id_name, id_length);
+    digest_storage->m_byte_count+= (uint)bytes_needed;
+  }
+  else
+  {
+    digest_storage->m_full= true;
+  }
+}
+
+void compute_digest_md5(const sql_digest_storage *digest_storage, unsigned char *md5)
+{
+  compute_md5_hash(md5,
+                   (const char *) digest_storage->m_token_array,
+                   digest_storage->m_byte_count);
+}
+
+/*
+  Iterate the token array and update digest_text.
+*/
+void compute_digest_text(const sql_digest_storage* digest_storage,
+                         String *digest_text)
+{
+  DBUG_ASSERT(digest_storage != NULL);
+  uint byte_count= digest_storage->m_byte_count;
+  String *digest_output= digest_text;
+  uint tok= 0;
+  uint current_byte= 0;
+  lex_token_string *tok_data;
+
+  /* Reset existing data */
+  digest_output->length(0);
+
+  if (byte_count > digest_storage->m_token_array_length)
+  {
+    digest_output->append('\0');
+    return;
+  }
+
+  /* Convert text to utf8 */
+  const CHARSET_INFO *from_cs= get_charset(digest_storage->m_charset_number, MYF(0));
+  const CHARSET_INFO *to_cs= &my_charset_utf8mb3_bin;
+
+  if (from_cs == NULL)
+  {
+    /*
+      Can happen, as we do dirty reads on digest_storage,
+      which can be written to in another thread.
+    */
+    digest_output->append('\0');
+    return;
+  }
+
+  char id_buffer[NAME_LEN + 1]= {'\0'};
+  char *id_string;
+  size_t id_length;
+  bool convert_text= !my_charset_same(from_cs, to_cs);
+
+  while (current_byte < byte_count)
+  {
+    current_byte= read_token(digest_storage, current_byte, &tok);
+
+    if (tok <= 0 || tok >= array_elements(lex_token_array)
+        || current_byte > max_digest_length)
+      return;
+
+    tok_data= &lex_token_array[tok];
+
+    switch (tok)
+    {
+    /* All identifiers are printed with their name. */
+    case IDENT:
+    case IDENT_QUOTED:
+    case TOK_IDENT:
+    {
+      char *id_ptr= NULL;
+      int id_len= 0;
+      uint err_cs= 0;
+
+      /* Get the next identifier from the storage buffer. */
+      current_byte= read_identifier(digest_storage, current_byte,
+                                    &id_ptr, &id_len);
+      if (current_byte > max_digest_length)
+        return;
+
+      if (convert_text)
+      {
+        /* Verify that the converted text will fit. */
+        if (to_cs->mbmaxlen*id_len > NAME_LEN)
+        {
+          digest_output->append("...", 3);
+          break;
+        }
+        /* Convert identifier string into the storage character set. */
+        id_length= my_convert(id_buffer, NAME_LEN, to_cs,
+                              id_ptr, id_len, from_cs, &err_cs);
+        id_string= id_buffer;
+      }
+      else
+      {
+        id_string= id_ptr;
+        id_length= id_len;
+      }
+
+      if (id_length == 0 || err_cs != 0)
+      {
+        break;
+      }
+      /* Copy the converted identifier into the digest string. */
+      digest_output->append('`');
+      if (id_length > 0)
+        digest_output->append(id_string, id_length);
+      digest_output->append("` ", 2);
+    }
+    break;
+
+    /* Everything else is printed as is. */
+    default:
+      /*
+        Make sure not to overflow the digest_text buffer.
+        +1 is to reserve extra space for ' '.
+      */
+      int tok_length= tok_data->m_token_length;
+
+      digest_output->append(tok_data->m_token_string, tok_length);
+      if (tok_data->m_append_space)
+        digest_output->append(' ');
+      break;
+    }
+  }
+}
+
+static inline uint peek_token(const sql_digest_storage *digest, uint index)
+{
+  uint token;
+  DBUG_ASSERT(index + SIZE_OF_A_TOKEN <= digest->m_byte_count);
+  DBUG_ASSERT(digest->m_byte_count <= digest->m_token_array_length);
+
+  token= ((digest->m_token_array[index + 1])<<8) | digest->m_token_array[index];
+  return token;
+}
+
+/**
+  Read the last two tokens from the token array. If an identifier
+  is found, do not look at tokens before it.
+*/
+static inline void peek_last_two_tokens(const sql_digest_storage* digest_storage,
+                                        uint last_id_index, uint *t1, uint *t2)
+{
+  uint byte_count= digest_storage->m_byte_count;
+  uint peek_index= byte_count;
+
+  if (last_id_index + SIZE_OF_A_TOKEN <= peek_index)
+  {
+    /* Take last token. */
+    peek_index-= SIZE_OF_A_TOKEN;
+    *t1= peek_token(digest_storage, peek_index);
+
+    if (last_id_index + SIZE_OF_A_TOKEN <= peek_index)
+    {
+      /* Take 2nd token from last. */
+      peek_index-= SIZE_OF_A_TOKEN;
+      *t2= peek_token(digest_storage, peek_index);
+    }
+    else
+    {
+      *t2= TOK_UNUSED;
+    }
+  }
+  else
+  {
+    *t1= TOK_UNUSED;
+    *t2= TOK_UNUSED;
+  }
+}
+
+/**
+  Read the last three tokens from the token array. If an identifier
+  is found, do not look at tokens before it.
+*/
+static inline void peek_last_three_tokens(const sql_digest_storage* digest_storage,
+                                          uint last_id_index, uint *t1, uint *t2, uint *t3)
+{
+  uint byte_count= digest_storage->m_byte_count;
+  uint peek_index= byte_count;
+
+  if (last_id_index + SIZE_OF_A_TOKEN <= peek_index)
+  {
+    /* Take last token. */
+    peek_index-= SIZE_OF_A_TOKEN;
+    *t1= peek_token(digest_storage, peek_index);
+
+    if (last_id_index + SIZE_OF_A_TOKEN <= peek_index)
+    {
+      /* Take 2nd token from last. */
+      peek_index-= SIZE_OF_A_TOKEN;
+      *t2= peek_token(digest_storage, peek_index);
+
+      if (last_id_index + SIZE_OF_A_TOKEN <= peek_index)
+      {
+        /* Take 3rd token from last. */
+        peek_index-= SIZE_OF_A_TOKEN;
+        *t3= peek_token(digest_storage, peek_index);
+      }
+      else
+      {
+        *t3= TOK_UNUSED;
+      }
+    }
+    else
+    {
+      *t2= TOK_UNUSED;
+      *t3= TOK_UNUSED;
+    }
+  }
+  else
+  {
+    *t1= TOK_UNUSED;
+    *t2= TOK_UNUSED;
+    *t3= TOK_UNUSED;
+  }
+}
+
+sql_digest_state* digest_add_token(sql_digest_state *state,
+                                   uint token,
+                                   LEX_YYSTYPE yylval)
+{
+  sql_digest_storage *digest_storage= NULL;
+
+  digest_storage= &state->m_digest_storage;
+
+  /*
+    Stop collecting further tokens if digest storage is full or
+    if the END token is received.
+  */
+  if (digest_storage->m_full || token == END_OF_INPUT)
+    return NULL;
+
+  /*
+    Take the last two tokens collected so far. These tokens are used
+    by the reduce rules for normalisation. Make sure not to consider
+    ID tokens in the reduce.
+  */
+  uint last_token;
+  uint last_token2;
+
+  switch (token)
+  {
+    case NUM:
+    case LONG_NUM:
+    case ULONGLONG_NUM:
+    case DECIMAL_NUM:
+    case FLOAT_NUM:
+    case BIN_NUM:
+    case HEX_NUM:
+    {
+      bool found_unary;
+      do
+      {
+        found_unary= false;
+        peek_last_two_tokens(digest_storage, state->m_last_id_index,
+                             &last_token, &last_token2);
+
+        if ((last_token == '-') || (last_token == '+'))
+        {
+          /*
+            We need to differentiate:
+            - a <unary minus> operator
+            - a <unary plus> operator
+            from
+            - a <binary minus> operator
+            - a <binary plus> operator
+            to only reduce "a = -1" to "a = ?", and not change "b - 1" to "b ?"
+
+            Binary operators are found inside an expression,
+            while unary operators are found at the beginning of an expression,
+            or after operators.
+
+            To achieve this, every token that is followed by an <expr> expression
+            in the SQL grammar is flagged.
+            See sql/sql_yacc.yy
+            See sql/gen_lex_token.cc
+
+            For example,
+            "(-1)" is parsed as "(", "-", NUM, ")", and lex_token_array["("].m_start_expr is true,
+            so reduction of the "-" NUM is done, the result is "(?)".
+            "(a-1)" is parsed as "(", ID, "-", NUM, ")", and lex_token_array[ID].m_start_expr is false,
+            so the operator is binary, no reduction is done, and the result is "(a-?)".
+          */
+          if (lex_token_array[last_token2].m_start_expr)
+          {
+            /*
+              REDUCE:
+              TOK_GENERIC_VALUE := (UNARY_PLUS | UNARY_MINUS) (NUM | LONG_NUM | ... | FLOAT_NUM)
+
+              REDUCE:
+              TOK_GENERIC_VALUE := (UNARY_PLUS | UNARY_MINUS) TOK_GENERIC_VALUE
+            */
+            token= TOK_GENERIC_VALUE;
+            digest_storage->m_byte_count-= SIZE_OF_A_TOKEN;
+            found_unary= true;
+          }
+        }
+      } while (found_unary);
+    }
+    /* for case NULL_SYM below */
+    /* fall through */
+    case LEX_HOSTNAME:
+    case TEXT_STRING:
+    case NCHAR_STRING:
+    case PARAM_MARKER:
+    {
+      /*
+        REDUCE:
+        TOK_GENERIC_VALUE := BIN_NUM | DECIMAL_NUM | ... | ULONGLONG_NUM
+      */
+      token= TOK_GENERIC_VALUE;
+
+      peek_last_two_tokens(digest_storage, state->m_last_id_index,
+                           &last_token, &last_token2);
+
+      if ((last_token2 == TOK_GENERIC_VALUE ||
+           last_token2 == TOK_GENERIC_VALUE_LIST) &&
+          (last_token == ','))
+      {
+        /*
+          REDUCE:
+          TOK_GENERIC_VALUE_LIST :=
+            TOK_GENERIC_VALUE ',' TOK_GENERIC_VALUE
+
+          REDUCE:
+          TOK_GENERIC_VALUE_LIST :=
+            TOK_GENERIC_VALUE_LIST ',' TOK_GENERIC_VALUE
+        */
+        digest_storage->m_byte_count-= 2*SIZE_OF_A_TOKEN;
+        token= TOK_GENERIC_VALUE_LIST;
+      }
+      /*
+        Add this token or the resulting reduce to digest storage.
+      */
+      store_token(digest_storage, token);
+      break;
+    }
+    case ')':
+    {
+      peek_last_two_tokens(digest_storage, state->m_last_id_index,
+                           &last_token, &last_token2);
+
+      if (last_token == TOK_GENERIC_VALUE &&
+          last_token2 == '(')
+      {
+        /*
+          REDUCE:
+          TOK_ROW_SINGLE_VALUE :=
+            '(' TOK_GENERIC_VALUE ')'
+        */
+        digest_storage->m_byte_count-= 2*SIZE_OF_A_TOKEN;
+        token= TOK_ROW_SINGLE_VALUE;
+
+        /* Read last two tokens again */
+        peek_last_two_tokens(digest_storage, state->m_last_id_index,
+                             &last_token, &last_token2);
+
+        if ((last_token2 == TOK_ROW_SINGLE_VALUE ||
+             last_token2 == TOK_ROW_SINGLE_VALUE_LIST) &&
+            (last_token == ','))
+        {
+          /*
+            REDUCE:
+            TOK_ROW_SINGLE_VALUE_LIST :=
+              TOK_ROW_SINGLE_VALUE ',' TOK_ROW_SINGLE_VALUE
+
+            REDUCE:
+            TOK_ROW_SINGLE_VALUE_LIST :=
+              TOK_ROW_SINGLE_VALUE_LIST ',' TOK_ROW_SINGLE_VALUE
+          */
+          digest_storage->m_byte_count-= 2*SIZE_OF_A_TOKEN;
+          token= TOK_ROW_SINGLE_VALUE_LIST;
+        }
+      }
+      else if (last_token == TOK_GENERIC_VALUE_LIST &&
+               last_token2 == '(')
+      {
+        /*
+          REDUCE:
+          TOK_ROW_MULTIPLE_VALUE :=
+            '(' TOK_GENERIC_VALUE_LIST ')'
+        */
+        digest_storage->m_byte_count-= 2*SIZE_OF_A_TOKEN;
+        token= TOK_ROW_MULTIPLE_VALUE;
+
+        /* Read last two tokens again */
+        peek_last_two_tokens(digest_storage, state->m_last_id_index,
+                             &last_token, &last_token2);
+
+        if ((last_token2 == TOK_ROW_MULTIPLE_VALUE ||
+             last_token2 == TOK_ROW_MULTIPLE_VALUE_LIST) &&
+            (last_token == ','))
+        {
+          /*
+            REDUCE:
+            TOK_ROW_MULTIPLE_VALUE_LIST :=
+              TOK_ROW_MULTIPLE_VALUE ',' TOK_ROW_MULTIPLE_VALUE
+
+            REDUCE:
+            TOK_ROW_MULTIPLE_VALUE_LIST :=
+              TOK_ROW_MULTIPLE_VALUE_LIST ',' TOK_ROW_MULTIPLE_VALUE
+          */
+          digest_storage->m_byte_count-= 2*SIZE_OF_A_TOKEN;
+          token= TOK_ROW_MULTIPLE_VALUE_LIST;
+        }
+      }
+      /*
+        Add this token or the resulting reduce to digest storage.
+      */
+      store_token(digest_storage, token);
+      break;
+    }
+    case IDENT:
+    case IDENT_QUOTED:
+    {
+      YYSTYPE *lex_token= yylval;
+      const char *yytext= lex_token->lex_str.str;
+      size_t yylen= lex_token->lex_str.length;
+
+      /*
+        REDUCE:
+        TOK_IDENT := IDENT | IDENT_QUOTED
+        The parser gives IDENT or IDENT_QUOTED for the same text,
+        depending on the character set used.
+        We unify both to always print the same digest text,
+        and always have the same digest hash.
+      */
+      token= TOK_IDENT;
+      /* Add this token and identifier string to digest storage. */
+      store_token_identifier(digest_storage, token, yylen, yytext);
+
+      /* Update the index of last identifier found. */
+      state->m_last_id_index= digest_storage->m_byte_count;
+      break;
+    }
+    default:
+    {
+      /* Add this token to digest storage. */
+      store_token(digest_storage, token);
+      break;
+    }
+  }
+
+  return state;
+}
+
+sql_digest_state* digest_reduce_token(sql_digest_state *state,
+                                      uint token_left, uint token_right)
+{
+  sql_digest_storage *digest_storage= NULL;
+
+  digest_storage= &state->m_digest_storage;
+
+  /*
+    Stop collecting further tokens if digest storage is full.
+  */
+  if (digest_storage->m_full)
+    return NULL;
+
+  uint last_token;
+  uint last_token2;
+  uint last_token3;
+  uint token_to_push= TOK_UNUSED;
+
+  peek_last_two_tokens(digest_storage, state->m_last_id_index,
+                       &last_token, &last_token2);
+
+  /*
+    There is only one caller of digest_reduce_token(),
+    see sql/sql_yacc.yy, rule literal := NULL_SYM.
+    REDUCE:
+      token_left := token_right
+    Used for:
+      TOK_GENERIC_VALUE := NULL_SYM
+  */
+
+  if (last_token == token_right)
+  {
+    /*
+      Current stream is like:
+        TOKEN_X TOKEN_RIGHT .
+      REDUCE to
+        TOKEN_X TOKEN_LEFT .
+    */
+    digest_storage->m_byte_count-= SIZE_OF_A_TOKEN;
+    store_token(digest_storage, token_left);
+  }
+  else
+  {
+    /*
+      Current stream is like:
+        TOKEN_X TOKEN_RIGHT TOKEN_Y .
+      Pop TOKEN_Y
+        TOKEN_X TOKEN_RIGHT . TOKEN_Y
+      REDUCE to
+        TOKEN_X TOKEN_LEFT . TOKEN_Y
+    */
+    DBUG_ASSERT(last_token2 == token_right);
+    digest_storage->m_byte_count-= 2 * SIZE_OF_A_TOKEN;
+    store_token(digest_storage, token_left);
+    token_to_push= last_token;
+  }
+
+  peek_last_three_tokens(digest_storage, state->m_last_id_index,
+                         &last_token, &last_token2, &last_token3);
+
+  if ((last_token3 == TOK_GENERIC_VALUE ||
+       last_token3 == TOK_GENERIC_VALUE_LIST) &&
+      (last_token2 == ',') &&
+      (last_token == TOK_GENERIC_VALUE))
+  {
+    /*
+      REDUCE:
+      TOK_GENERIC_VALUE_LIST :=
+        TOK_GENERIC_VALUE ',' TOK_GENERIC_VALUE
+
+      REDUCE:
+      TOK_GENERIC_VALUE_LIST :=
+        TOK_GENERIC_VALUE_LIST ',' TOK_GENERIC_VALUE
+    */
+    digest_storage->m_byte_count-= 3*SIZE_OF_A_TOKEN;
+    store_token(digest_storage, TOK_GENERIC_VALUE_LIST);
+  }
+
+  if (token_to_push != TOK_UNUSED)
+  {
+    /*
+      Push TOKEN_Y
+    */
+    store_token(digest_storage, token_to_push);
+  }
+
+  return state;
+}
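
For readers following the storage helpers above (store_token(), store_token_identifier(), read_token(), read_identifier()): the digest is just a flat byte array of little-endian 16-bit token codes, with identifiers written inline as token, length, then raw bytes; compute_digest_md5() hashes exactly those bytes. The standalone sketch below mirrors that encoding. The DigestBuffer type and the token codes are invented for illustration and are not the MariaDB API.

#include <cstdio>
#include <vector>

/* Hypothetical stand-in for sql_digest_storage: a byte array of
   little-endian 16-bit token codes; identifiers stored inline as
   <token><length><bytes>. Token codes are made up for illustration. */
struct DigestBuffer
{
  std::vector<unsigned char> bytes;

  void put_token(unsigned token)
  {
    bytes.push_back(token & 0xff);        /* low byte first */
    bytes.push_back((token >> 8) & 0xff); /* then high byte */
  }

  void put_identifier(unsigned token, const char *name, size_t len)
  {
    put_token(token);          /* the TOK_IDENT-style code */
    put_token((unsigned)len);  /* identifier length, same 2-byte encoding */
    bytes.insert(bytes.end(), name, name + len);
  }

  /* Read a 2-byte token at pos; returns the position after it. */
  size_t get_token(size_t pos, unsigned *token) const
  {
    *token= bytes[pos] | (bytes[pos + 1] << 8);
    return pos + 2;
  }
};

int main()
{
  enum { TOK_SELECT= 1, TOK_IDENT= 2, TOK_GENERIC_VALUE= 3 };

  DigestBuffer buf;
  buf.put_token(TOK_SELECT);
  buf.put_identifier(TOK_IDENT, "t1", 2);
  buf.put_token(TOK_GENERIC_VALUE);  /* a literal, normalised to "?" */

  unsigned tok;
  buf.get_token(0, &tok);
  std::printf("first token: %u, total bytes: %zu\n", tok, buf.bytes.size());
  /* Hashing buf.bytes (MD5 in the patch) yields the statement digest. */
  return 0;
}

Because only these raw bytes are hashed, two statements that differ only in literal values or identifier quoting normalise to the same byte stream, and therefore to the same digest.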
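The reduce rules in digest_add_token() and digest_reduce_token() are what collapse literal lists such as "IN (1, 2, 3)" into a single token, so the digest does not depend on how many values the list contains. The toy model below (invented token codes, not the real gen_lex_token output) mirrors the VALUE ',' VALUE and VALUE_LIST ',' VALUE reductions driven by peek_last_two_tokens():

#include <cstdio>
#include <vector>

/* Toy re-implementation of the value-list reduce in digest_add_token():
     VALUE ',' VALUE      -> VALUE_LIST
     VALUE_LIST ',' VALUE -> VALUE_LIST */
enum Tok { COMMA= ',', VALUE= 1000, VALUE_LIST= 1001 };

static void add_value(std::vector<int> &out)
{
  size_t n= out.size();
  if (n >= 2 && out[n - 1] == COMMA &&
      (out[n - 2] == VALUE || out[n - 2] == VALUE_LIST))
  {
    out.resize(n - 2);         /* pop "VALUE ','" (or "VALUE_LIST ','") */
    out.push_back(VALUE_LIST); /* and push the reduced token */
  }
  else
    out.push_back(VALUE);
}

int main()
{
  std::vector<int> digest;
  /* Feed the literal list "1, 2, 3"; each literal arrives as VALUE. */
  add_value(digest);        /* 1  -> VALUE */
  digest.push_back(COMMA);  /* ,  */
  add_value(digest);        /* 2  -> VALUE_LIST */
  digest.push_back(COMMA);  /* ,  */
  add_value(digest);        /* 3  -> still one VALUE_LIST */
  std::printf("tokens left: %zu\n", digest.size()); /* prints 1 */
  return 0;
}

The real code performs the same pop-and-replace directly on the byte array by rewinding m_byte_count, instead of using a separate stack.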