summaryrefslogtreecommitdiffstats
path: root/storage/maria/ma_ftdefs.h
blob: 90ca6feb086ebf402cce5a4bfcc84d8793428091 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */

/* Written by Sergei A. Golubchik, who has a shared copyright to this code */

/* some definitions for full-text indices */

#include "ma_fulltext.h"
#include <m_ctype.h>
#include <my_tree.h>
#include <queues.h>
#include <mysql/plugin.h>

#define true_word_char(ctype, character) \
                      ((ctype) & (_MY_U | _MY_L | _MY_NMR) || \
                       (character) == '_')
#define misc_word_char(X)	0

#define FT_MAX_WORD_LEN_FOR_SORT 31

#define FTPARSER_MEMROOT_ALLOC_SIZE 65536

#define COMPILE_STOPWORDS_IN

/* Interested readers may consult SMART
   (ftp://ftp.cs.cornell.edu/pub/smart/smart.11.0.tar.Z)
   for an excellent implementation of vector space model we use.
   It also demonstrate the usage of different weghting techniques.
   This code, though, is completely original and is not based on the
   SMART code but was in some cases inspired by it.

   NORM_PIVOT was taken from the article
   A.Singhal, C.Buckley, M.Mitra, "Pivoted Document Length Normalization",
   ACM SIGIR'96, 21-29, 1996
 */

#define LWS_FOR_QUERY					  LWS_TF
#define LWS_IN_USE					 LWS_LOG
#define PRENORM_IN_USE				     PRENORM_AVG
#define NORM_IN_USE				      NORM_PIVOT
#define GWS_IN_USE					GWS_PROB
/*==============================================================*/
#define LWS_TF						  (count)
#define LWS_BINARY					(count>0)
#define LWS_SQUARE				    (count*count)
#define LWS_LOG				 (count?(log( (double) count)+1):0)
/*--------------------------------------------------------------*/
#define PRENORM_NONE				      (p->weight)
#define PRENORM_MAX			  (p->weight/docstat.max)
#define PRENORM_AUG		  (0.4+0.6*p->weight/docstat.max)
#define PRENORM_AVG	     (p->weight/docstat.sum*docstat.uniq)
#define PRENORM_AVGLOG ((1+log(p->weight))/(1+log(docstat.sum/docstat.uniq)))
/*--------------------------------------------------------------*/
#define NORM_NONE					      (1)
#define NORM_SUM				   (docstat.nsum)
#define NORM_COS			    (sqrt(docstat.nsum2))

#define PIVOT_VAL (0.0115)
#define NORM_PIVOT  (1+PIVOT_VAL*docstat.uniq)
/*---------------------------------------------------------------*/
#define GWS_NORM				     (1/sqrt(sum2))
#define GWS_GFIDF				      (sum/doc_cnt)
/* Mysterious, but w/o (double) GWS_IDF performs better :-o */
#define GWS_IDF		   log(aio->info->state->records/doc_cnt)
#define GWS_IDF1	   log((double)aio->info->state->records/doc_cnt)
#define GWS_PROB ((aio->info->state->records > doc_cnt) ? log(((double)(aio->info->state->records-doc_cnt))/doc_cnt) : 0 )
#define GWS_FREQ					(1.0/doc_cnt)
#define GWS_SQUARED pow(log((double)aio->info->state->records/doc_cnt),2)
#define GWS_CUBIC   pow(log((double)aio->info->state->records/doc_cnt),3)
#define GWS_ENTROPY (1-(suml/sum-log(sum))/log(aio->info->state->records))
/*=================================================================*/

/* Boolean search operators */
#define FTB_YES   (ft_boolean_syntax[0])
#define FTB_EGAL  (ft_boolean_syntax[1])
#define FTB_NO    (ft_boolean_syntax[2])
#define FTB_INC   (ft_boolean_syntax[3])
#define FTB_DEC   (ft_boolean_syntax[4])
#define FTB_LBR   (ft_boolean_syntax[5])
#define FTB_RBR   (ft_boolean_syntax[6])
#define FTB_NEG   (ft_boolean_syntax[7])
#define FTB_TRUNC (ft_boolean_syntax[8])
#define FTB_LQUOT (ft_boolean_syntax[10])
#define FTB_RQUOT (ft_boolean_syntax[11])

typedef struct st_maria_ft_word {
  const uchar * pos;
  uint	 len;
  double weight;
} FT_WORD;

int is_stopword(const char *word, size_t len);

MARIA_KEY *_ma_ft_make_key(MARIA_HA *, MARIA_KEY *, uint , uchar *, FT_WORD *,
                           my_off_t);

uchar maria_ft_get_word(CHARSET_INFO *, const uchar **, const uchar *,
                        FT_WORD *, MYSQL_FTPARSER_BOOLEAN_INFO *);
uchar maria_ft_simple_get_word(CHARSET_INFO *, uchar **, const uchar *,
                               FT_WORD *, my_bool);

typedef struct _st_maria_ft_seg_iterator {
  uint        num, len;
  HA_KEYSEG  *seg;
  const uchar *rec, *pos;
} FT_SEG_ITERATOR;

void _ma_ft_segiterator_init(MARIA_HA *, uint, const uchar *, FT_SEG_ITERATOR *);
void _ma_ft_segiterator_dummy_init(const uchar *, uint, FT_SEG_ITERATOR *);
uint _ma_ft_segiterator(FT_SEG_ITERATOR *);

void maria_ft_parse_init(TREE *, CHARSET_INFO *);
int maria_ft_parse(TREE *, uchar *, int, struct st_mysql_ftparser *parser,
             MYSQL_FTPARSER_PARAM *, MEM_ROOT *);
FT_WORD * maria_ft_linearize(TREE *, MEM_ROOT *);
FT_WORD * _ma_ft_parserecord(MARIA_HA *, uint, const uchar *, MEM_ROOT *);
uint _ma_ft_parse(TREE *, MARIA_HA *, uint, const uchar *,
                  MYSQL_FTPARSER_PARAM *, MEM_ROOT *);

FT_INFO *maria_ft_init_nlq_search(MARIA_HA *, uint, uchar *, uint, uint,
                                  uchar *);
FT_INFO *maria_ft_init_boolean_search(MARIA_HA *, uint, uchar *, uint,
                                      CHARSET_INFO *);

extern const struct _ft_vft _ma_ft_vft_nlq;
int maria_ft_nlq_read_next(FT_INFO *, char *);
float maria_ft_nlq_find_relevance(FT_INFO *, uchar *, uint);
void maria_ft_nlq_close_search(FT_INFO *);
float maria_ft_nlq_get_relevance(FT_INFO *);
my_off_t maria_ft_nlq_get_docid(FT_INFO *);
void maria_ft_nlq_reinit_search(FT_INFO *);

extern const struct _ft_vft _ma_ft_vft_boolean;
int maria_ft_boolean_read_next(FT_INFO *, char *);
float maria_ft_boolean_find_relevance(FT_INFO *, uchar *, uint);
void maria_ft_boolean_close_search(FT_INFO *);
float maria_ft_boolean_get_relevance(FT_INFO *);
my_off_t maria_ft_boolean_get_docid(FT_INFO *);
void maria_ft_boolean_reinit_search(FT_INFO *);
MYSQL_FTPARSER_PARAM* maria_ftparser_alloc_param(MARIA_HA *info);
extern MYSQL_FTPARSER_PARAM *maria_ftparser_call_initializer(MARIA_HA *info,
                                                             uint keynr,
                                                             uint paramnr);
extern void maria_ftparser_call_deinitializer(MARIA_HA *info);