diff options
Diffstat (limited to '')
-rw-r--r-- | storage/maria/ma_ftdefs.h | 156 |
1 files changed, 156 insertions, 0 deletions
diff --git a/storage/maria/ma_ftdefs.h b/storage/maria/ma_ftdefs.h new file mode 100644 index 00000000..90ca6feb --- /dev/null +++ b/storage/maria/ma_ftdefs.h @@ -0,0 +1,156 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Written by Sergei A. Golubchik, who has a shared copyright to this code */ + +/* some definitions for full-text indices */ + +#include "ma_fulltext.h" +#include <m_ctype.h> +#include <my_tree.h> +#include <queues.h> +#include <mysql/plugin.h> + +#define true_word_char(ctype, character) \ + ((ctype) & (_MY_U | _MY_L | _MY_NMR) || \ + (character) == '_') +#define misc_word_char(X) 0 + +#define FT_MAX_WORD_LEN_FOR_SORT 31 + +#define FTPARSER_MEMROOT_ALLOC_SIZE 65536 + +#define COMPILE_STOPWORDS_IN + +/* Interested readers may consult SMART + (ftp://ftp.cs.cornell.edu/pub/smart/smart.11.0.tar.Z) + for an excellent implementation of vector space model we use. + It also demonstrate the usage of different weghting techniques. + This code, though, is completely original and is not based on the + SMART code but was in some cases inspired by it. + + NORM_PIVOT was taken from the article + A.Singhal, C.Buckley, M.Mitra, "Pivoted Document Length Normalization", + ACM SIGIR'96, 21-29, 1996 + */ + +#define LWS_FOR_QUERY LWS_TF +#define LWS_IN_USE LWS_LOG +#define PRENORM_IN_USE PRENORM_AVG +#define NORM_IN_USE NORM_PIVOT +#define GWS_IN_USE GWS_PROB +/*==============================================================*/ +#define LWS_TF (count) +#define LWS_BINARY (count>0) +#define LWS_SQUARE (count*count) +#define LWS_LOG (count?(log( (double) count)+1):0) +/*--------------------------------------------------------------*/ +#define PRENORM_NONE (p->weight) +#define PRENORM_MAX (p->weight/docstat.max) +#define PRENORM_AUG (0.4+0.6*p->weight/docstat.max) +#define PRENORM_AVG (p->weight/docstat.sum*docstat.uniq) +#define PRENORM_AVGLOG ((1+log(p->weight))/(1+log(docstat.sum/docstat.uniq))) +/*--------------------------------------------------------------*/ +#define NORM_NONE (1) +#define NORM_SUM (docstat.nsum) +#define NORM_COS (sqrt(docstat.nsum2)) + +#define PIVOT_VAL (0.0115) +#define NORM_PIVOT (1+PIVOT_VAL*docstat.uniq) +/*---------------------------------------------------------------*/ +#define GWS_NORM (1/sqrt(sum2)) +#define GWS_GFIDF (sum/doc_cnt) +/* Mysterious, but w/o (double) GWS_IDF performs better :-o */ +#define GWS_IDF log(aio->info->state->records/doc_cnt) +#define GWS_IDF1 log((double)aio->info->state->records/doc_cnt) +#define GWS_PROB ((aio->info->state->records > doc_cnt) ? log(((double)(aio->info->state->records-doc_cnt))/doc_cnt) : 0 ) +#define GWS_FREQ (1.0/doc_cnt) +#define GWS_SQUARED pow(log((double)aio->info->state->records/doc_cnt),2) +#define GWS_CUBIC pow(log((double)aio->info->state->records/doc_cnt),3) +#define GWS_ENTROPY (1-(suml/sum-log(sum))/log(aio->info->state->records)) +/*=================================================================*/ + +/* Boolean search operators */ +#define FTB_YES (ft_boolean_syntax[0]) +#define FTB_EGAL (ft_boolean_syntax[1]) +#define FTB_NO (ft_boolean_syntax[2]) +#define FTB_INC (ft_boolean_syntax[3]) +#define FTB_DEC (ft_boolean_syntax[4]) +#define FTB_LBR (ft_boolean_syntax[5]) +#define FTB_RBR (ft_boolean_syntax[6]) +#define FTB_NEG (ft_boolean_syntax[7]) +#define FTB_TRUNC (ft_boolean_syntax[8]) +#define FTB_LQUOT (ft_boolean_syntax[10]) +#define FTB_RQUOT (ft_boolean_syntax[11]) + +typedef struct st_maria_ft_word { + const uchar * pos; + uint len; + double weight; +} FT_WORD; + +int is_stopword(const char *word, size_t len); + +MARIA_KEY *_ma_ft_make_key(MARIA_HA *, MARIA_KEY *, uint , uchar *, FT_WORD *, + my_off_t); + +uchar maria_ft_get_word(CHARSET_INFO *, const uchar **, const uchar *, + FT_WORD *, MYSQL_FTPARSER_BOOLEAN_INFO *); +uchar maria_ft_simple_get_word(CHARSET_INFO *, uchar **, const uchar *, + FT_WORD *, my_bool); + +typedef struct _st_maria_ft_seg_iterator { + uint num, len; + HA_KEYSEG *seg; + const uchar *rec, *pos; +} FT_SEG_ITERATOR; + +void _ma_ft_segiterator_init(MARIA_HA *, uint, const uchar *, FT_SEG_ITERATOR *); +void _ma_ft_segiterator_dummy_init(const uchar *, uint, FT_SEG_ITERATOR *); +uint _ma_ft_segiterator(FT_SEG_ITERATOR *); + +void maria_ft_parse_init(TREE *, CHARSET_INFO *); +int maria_ft_parse(TREE *, uchar *, int, struct st_mysql_ftparser *parser, + MYSQL_FTPARSER_PARAM *, MEM_ROOT *); +FT_WORD * maria_ft_linearize(TREE *, MEM_ROOT *); +FT_WORD * _ma_ft_parserecord(MARIA_HA *, uint, const uchar *, MEM_ROOT *); +uint _ma_ft_parse(TREE *, MARIA_HA *, uint, const uchar *, + MYSQL_FTPARSER_PARAM *, MEM_ROOT *); + +FT_INFO *maria_ft_init_nlq_search(MARIA_HA *, uint, uchar *, uint, uint, + uchar *); +FT_INFO *maria_ft_init_boolean_search(MARIA_HA *, uint, uchar *, uint, + CHARSET_INFO *); + +extern const struct _ft_vft _ma_ft_vft_nlq; +int maria_ft_nlq_read_next(FT_INFO *, char *); +float maria_ft_nlq_find_relevance(FT_INFO *, uchar *, uint); +void maria_ft_nlq_close_search(FT_INFO *); +float maria_ft_nlq_get_relevance(FT_INFO *); +my_off_t maria_ft_nlq_get_docid(FT_INFO *); +void maria_ft_nlq_reinit_search(FT_INFO *); + +extern const struct _ft_vft _ma_ft_vft_boolean; +int maria_ft_boolean_read_next(FT_INFO *, char *); +float maria_ft_boolean_find_relevance(FT_INFO *, uchar *, uint); +void maria_ft_boolean_close_search(FT_INFO *); +float maria_ft_boolean_get_relevance(FT_INFO *); +my_off_t maria_ft_boolean_get_docid(FT_INFO *); +void maria_ft_boolean_reinit_search(FT_INFO *); +MYSQL_FTPARSER_PARAM* maria_ftparser_alloc_param(MARIA_HA *info); +extern MYSQL_FTPARSER_PARAM *maria_ftparser_call_initializer(MARIA_HA *info, + uint keynr, + uint paramnr); +extern void maria_ftparser_call_deinitializer(MARIA_HA *info); |