1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
|
/* Copyright (c) 2005 MySQL AB, 2009 Sun Microsystems, Inc.
Use is subject to license terms.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */
#ifndef _my_plugin_ftparser_h
#define _my_plugin_ftparser_h
#include "plugin.h"
#ifdef __cplusplus
extern "C" {
#endif
/*************************************************************************
API for Full-text parser plugin. (MYSQL_FTPARSER_PLUGIN)
*/
#define MYSQL_FTPARSER_INTERFACE_VERSION 0x0100
/* Parsing modes. Set in MYSQL_FTPARSER_PARAM::mode */
enum enum_ftparser_mode
{
/*
Fast and simple mode. This mode is used for indexing, and natural
language queries.
The parser is expected to return only those words that go into the
index. Stopwords or too short/long words should not be returned. The
'boolean_info' argument of mysql_add_word() does not have to be set.
*/
MYSQL_FTPARSER_SIMPLE_MODE= 0,
/*
Parse with stopwords mode. This mode is used in boolean searches for
"phrase matching."
The parser is not allowed to ignore words in this mode. Every word
should be returned, including stopwords and words that are too short
or long. The 'boolean_info' argument of mysql_add_word() does not
have to be set.
*/
MYSQL_FTPARSER_WITH_STOPWORDS= 1,
/*
Parse in boolean mode. This mode is used to parse a boolean query string.
The parser should provide a valid MYSQL_FTPARSER_BOOLEAN_INFO
structure in the 'boolean_info' argument to mysql_add_word().
Usually that means that the parser should recognize boolean operators
in the parsing stream and set appropriate fields in
MYSQL_FTPARSER_BOOLEAN_INFO structure accordingly. As for
MYSQL_FTPARSER_WITH_STOPWORDS mode, no word should be ignored.
Instead, use FT_TOKEN_STOPWORD for the token type of such a word.
*/
MYSQL_FTPARSER_FULL_BOOLEAN_INFO= 2
};
/*
Token types for boolean mode searching (used for the type member of
MYSQL_FTPARSER_BOOLEAN_INFO struct)
FT_TOKEN_EOF: End of data.
FT_TOKEN_WORD: Regular word.
FT_TOKEN_LEFT_PAREN: Left parenthesis (start of group/sub-expression).
FT_TOKEN_RIGHT_PAREN: Right parenthesis (end of group/sub-expression).
FT_TOKEN_STOPWORD: Stopword.
*/
enum enum_ft_token_type
{
FT_TOKEN_EOF= 0,
FT_TOKEN_WORD= 1,
FT_TOKEN_LEFT_PAREN= 2,
FT_TOKEN_RIGHT_PAREN= 3,
FT_TOKEN_STOPWORD= 4
};
/*
This structure is used in boolean search mode only. It conveys
boolean-mode metadata to the MySQL search engine for every word in
the search query. A valid instance of this structure must be filled
in by the plugin parser and passed as an argument in the call to
mysql_add_word (the callback function in the MYSQL_FTPARSER_PARAM
structure) when a query is parsed in boolean mode.
type: The token type. Should be one of the enum_ft_token_type values.
yesno: Whether the word must be present for a match to occur:
>0 Must be present
<0 Must not be present
0 Neither; the word is optional but its presence increases the relevance
With the default settings of the ft_boolean_syntax system variable,
>0 corresponds to the '+' operator, <0 corresponds to the '-' operator,
and 0 means neither operator was used.
weight_adjust: A weighting factor that determines how much a match
for the word counts. Positive values increase, negative - decrease the
relative word's importance in the query.
wasign: The sign of the word's weight in the query. If it's non-negative
the match for the word will increase document relevance, if it's
negative - decrease (the word becomes a "noise word", the less of it the
better).
trunc: Corresponds to the '*' operator in the default setting of the
ft_boolean_syntax system variable.
*/
typedef struct st_mysql_ftparser_boolean_info
{
enum enum_ft_token_type type;
int yesno;
int weight_adjust;
char wasign;
char trunc;
/* These are parser state and must be removed. */
char prev;
char *quot;
} MYSQL_FTPARSER_BOOLEAN_INFO;
/*
The following flag means that buffer with a string (document, word)
may be overwritten by the caller before the end of the parsing (that is
before st_mysql_ftparser::deinit() call). If one needs the string
to survive between two successive calls of the parsing function, she
needs to save a copy of it. The flag may be set by MySQL before calling
st_mysql_ftparser::parse(), or it may be set by a plugin before calling
st_mysql_ftparser_param::mysql_parse() or
st_mysql_ftparser_param::mysql_add_word().
*/
#define MYSQL_FTFLAGS_NEED_COPY 1
/*
An argument of the full-text parser plugin. This structure is
filled in by MySQL server and passed to the parsing function of the
plugin as an in/out parameter.
mysql_parse: A pointer to the built-in parser implementation of the
server. It's set by the server and can be used by the parser plugin
to invoke the MySQL default parser. If plugin's role is to extract
textual data from .doc, .pdf or .xml content, it might extract
plaintext from the content, and then pass the text to the default
MySQL parser to be parsed.
mysql_add_word: A server callback to add a new word. When parsing
a document, the server sets this to point at a function that adds
the word to MySQL full-text index. When parsing a search query,
this function will add the new word to the list of words to search
for. The boolean_info argument can be NULL for all cases except
when mode is MYSQL_FTPARSER_FULL_BOOLEAN_INFO. A plugin can replace this
callback to post-process every parsed word before passing it to the original
mysql_add_word function.
ftparser_state: A generic pointer. The plugin can set it to point
to information to be used internally for its own purposes.
mysql_ftparam: This is set by the server. It is used by MySQL functions
called via mysql_parse() and mysql_add_word() callback. The plugin
should not modify it.
cs: Information about the character set of the document or query string.
doc: A pointer to the document or query string to be parsed.
length: Length of the document or query string, in bytes.
flags: See MYSQL_FTFLAGS_* constants above.
mode: The parsing mode. With boolean operators, with stopwords, or
nothing. See enum_ftparser_mode above.
*/
typedef struct st_mysql_ftparser_param
{
int (*mysql_parse)(struct st_mysql_ftparser_param *,
const char *doc, int doc_len);
int (*mysql_add_word)(struct st_mysql_ftparser_param *,
const char *word, int word_len,
MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info);
void *ftparser_state;
void *mysql_ftparam;
const struct charset_info_st *cs;
const char *doc;
int length;
unsigned int flags;
enum enum_ftparser_mode mode;
} MYSQL_FTPARSER_PARAM;
/*
Full-text parser descriptor.
interface_version is, e.g., MYSQL_FTPARSER_INTERFACE_VERSION.
The parsing, initialization, and deinitialization functions are
invoked per SQL statement for which the parser is used.
*/
struct st_mysql_ftparser
{
int interface_version;
int (*parse)(MYSQL_FTPARSER_PARAM *param);
int (*init)(MYSQL_FTPARSER_PARAM *param);
int (*deinit)(MYSQL_FTPARSER_PARAM *param);
};
#ifdef __cplusplus
}
#endif
#endif
|