src/backend/tsearch/ts_selfuncs.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453

/*-------------------------------------------------------------------------
 *
 * ts_selfuncs.c
 *	  Selectivity estimation functions for text search operators.
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  src/backend/tsearch/ts_selfuncs.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/htup_details.h"
#include "catalog/pg_statistic.h"
#include "catalog/pg_type.h"
#include "miscadmin.h"
#include "nodes/nodes.h"
#include "tsearch/ts_type.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/selfuncs.h"
#include "utils/syscache.h"


/*
 * The default text search selectivity is chosen to be small enough to
 * encourage indexscans for typical table densities.  See selfuncs.h and
 * DEFAULT_EQ_SEL for details.
 */
#define DEFAULT_TS_MATCH_SEL 0.005

/* lookup table type for binary searching through MCELEMs */
typedef struct
{
	text	   *element;
	float4		frequency;
} TextFreq;

/* type of keys for bsearch'ing through an array of TextFreqs */
typedef struct
{
	char	   *lexeme;
	int			length;
} LexemeKey;

static Selectivity tsquerysel(VariableStatData *vardata, Datum constval);
static Selectivity mcelem_tsquery_selec(TSQuery query,
										Datum *mcelem, int nmcelem,
										float4 *numbers, int nnumbers);
static Selectivity tsquery_opr_selec(QueryItem *item, char *operand,
									 TextFreq *lookup, int length, float4 minfreq);
static int	compare_lexeme_textfreq(const void *e1, const void *e2);

#define tsquery_opr_selec_no_stats(query) \
	tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), NULL, 0, 0)


/*
 *	tsmatchsel -- Selectivity of "@@"
 *
 * restriction selectivity function for tsvector @@ tsquery and
 * tsquery @@ tsvector
 */
Datum
tsmatchsel(PG_FUNCTION_ARGS)
{
	PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);

#ifdef NOT_USED
	Oid			operator = PG_GETARG_OID(1);
#endif
	List	   *args = (List *) PG_GETARG_POINTER(2);
	int			varRelid = PG_GETARG_INT32(3);
	VariableStatData vardata;
	Node	   *other;
	bool		varonleft;
	Selectivity selec;

	/*
	 * If expression is not variable = something or something = variable, then
	 * punt and return a default estimate.
	 */
	if (!get_restriction_variable(root, args, varRelid,
								  &vardata, &other, &varonleft))
		PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);

	/*
	 * Can't do anything useful if the something is not a constant, either.
	 */
	if (!IsA(other, Const))
	{
		ReleaseVariableStats(vardata);
		PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
	}

	/*
	 * The "@@" operator is strict, so we can cope with NULL right away
	 */
	if (((Const *) other)->constisnull)
	{
		ReleaseVariableStats(vardata);
		PG_RETURN_FLOAT8(0.0);
	}

	/*
	 * OK, there's a Var and a Const we're dealing with here.  We need the
	 * Const to be a TSQuery, else we can't do anything useful.  We have to
	 * check this because the Var might be the TSQuery not the TSVector.
	 */
	if (((Const *) other)->consttype == TSQUERYOID)
	{
		/* tsvector @@ tsquery or the other way around */
		Assert(vardata.vartype == TSVECTOROID);

		selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
	}
	else
	{
		/* If we can't see the query structure, must punt */
		selec = DEFAULT_TS_MATCH_SEL;
	}

	ReleaseVariableStats(vardata);

	CLAMP_PROBABILITY(selec);

	PG_RETURN_FLOAT8((float8) selec);
}


/*
 *	tsmatchjoinsel -- join selectivity of "@@"
 *
 * join selectivity function for tsvector @@ tsquery and tsquery @@ tsvector
 */
Datum
tsmatchjoinsel(PG_FUNCTION_ARGS)
{
	/* for the moment we just punt */
	PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
}


/*
 * @@ selectivity for tsvector var vs tsquery constant
 */
static Selectivity
tsquerysel(VariableStatData *vardata, Datum constval)
{
	Selectivity selec;
	TSQuery		query;

	/* The caller made sure the const is a TSQuery, so get it now */
	query = DatumGetTSQuery(constval);

	/* Empty query matches nothing */
	if (query->size == 0)
		return (Selectivity) 0.0;

	if (HeapTupleIsValid(vardata->statsTuple))
	{
		Form_pg_statistic stats;
		AttStatsSlot sslot;

		stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);

		/* MCELEM will be an array of TEXT elements for a tsvector column */
		if (get_attstatsslot(&sslot, vardata->statsTuple,
							 STATISTIC_KIND_MCELEM, InvalidOid,
							 ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
		{
			/*
			 * There is a most-common-elements slot for the tsvector Var, so
			 * use that.
			 */
			selec = mcelem_tsquery_selec(query, sslot.values, sslot.nvalues,
										 sslot.numbers, sslot.nnumbers);
			free_attstatsslot(&sslot);
		}
		else
		{
			/* No most-common-elements info, so do without */
			selec = tsquery_opr_selec_no_stats(query);
		}

		/*
		 * MCE stats count only non-null rows, so adjust for null rows.
		 */
		selec *= (1.0 - stats->stanullfrac);
	}
	else
	{
		/* No stats at all, so do without */
		selec = tsquery_opr_selec_no_stats(query);
		/* we assume no nulls here, so no stanullfrac correction */
	}

	return selec;
}

/*
 * Extract data from the pg_statistic arrays into useful format.
 */
static Selectivity
mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
					 float4 *numbers, int nnumbers)
{
	float4		minfreq;
	TextFreq   *lookup;
	Selectivity selec;
	int			i;

	/*
	 * There should be two more Numbers than Values, because the last two
	 * cells are taken for minimal and maximal frequency.  Punt if not.
	 *
	 * (Note: the MCELEM statistics slot definition allows for a third extra
	 * number containing the frequency of nulls, but we're not expecting that
	 * to appear for a tsvector column.)
	 */
	if (nnumbers != nmcelem + 2)
		return tsquery_opr_selec_no_stats(query);

	/*
	 * Transpose the data into a single array so we can use bsearch().
	 */
	lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
	for (i = 0; i < nmcelem; i++)
	{
		/*
		 * The text Datums came from an array, so it cannot be compressed or
		 * stored out-of-line -- it's safe to use VARSIZE_ANY*.
		 */
		Assert(!VARATT_IS_COMPRESSED(mcelem[i]) && !VARATT_IS_EXTERNAL(mcelem[i]));
		lookup[i].element = (text *) DatumGetPointer(mcelem[i]);
		lookup[i].frequency = numbers[i];
	}

	/*
	 * Grab the lowest frequency. compute_tsvector_stats() stored it for us in
	 * the one before the last cell of the Numbers array. See ts_typanalyze.c
	 */
	minfreq = numbers[nnumbers - 2];

	selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
							  nmcelem, minfreq);

	pfree(lookup);

	return selec;
}

/*
 * Traverse the tsquery in preorder, calculating selectivity as:
 *
 *	 selec(left_oper) * selec(right_oper) in AND & PHRASE nodes,
 *
 *	 selec(left_oper) + selec(right_oper) -
 *		selec(left_oper) * selec(right_oper) in OR nodes,
 *
 *	 1 - select(oper) in NOT nodes
 *
 *	 histogram-based estimation in prefix VAL nodes
 *
 *	 freq[val] in exact VAL nodes, if the value is in MCELEM
 *	 min(freq[MCELEM]) / 2 in VAL nodes, if it is not
 *
 * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
 * binary search for determining freq[MCELEM].
 *
 * If we don't have stats for the tsvector, we still use this logic,
 * except we use default estimates for VAL nodes.  This case is signaled
 * by lookup == NULL.
 */
static Selectivity
tsquery_opr_selec(QueryItem *item, char *operand,
				  TextFreq *lookup, int length, float4 minfreq)
{
	Selectivity selec;

	/* since this function recurses, it could be driven to stack overflow */
	check_stack_depth();

	if (item->type == QI_VAL)
	{
		QueryOperand *oper = (QueryOperand *) item;
		LexemeKey	key;

		/*
		 * Prepare the key for bsearch().
		 */
		key.lexeme = operand + oper->distance;
		key.length = oper->length;

		if (oper->prefix)
		{
			/* Prefix match, ie the query item is lexeme:* */
			Selectivity matched,
						allmces;
			int			i,
						n_matched;

			/*
			 * Our strategy is to scan through the MCELEM list and combine the
			 * frequencies of the ones that match the prefix.  We then
			 * extrapolate the fraction of matching MCELEMs to the remaining
			 * rows, assuming that the MCELEMs are representative of the whole
			 * lexeme population in this respect.  (Compare
			 * histogram_selectivity().)  Note that these are most common
			 * elements not most common values, so they're not mutually
			 * exclusive.  We treat occurrences as independent events.
			 *
			 * This is only a good plan if we have a pretty fair number of
			 * MCELEMs available; we set the threshold at 100.  If no stats or
			 * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
			 */
			if (lookup == NULL || length < 100)
				return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);

			matched = allmces = 0;
			n_matched = 0;
			for (i = 0; i < length; i++)
			{
				TextFreq   *t = lookup + i;
				int			tlen = VARSIZE_ANY_EXHDR(t->element);

				if (tlen >= key.length &&
					strncmp(key.lexeme, VARDATA_ANY(t->element),
							key.length) == 0)
				{
					matched += t->frequency - matched * t->frequency;
					n_matched++;
				}
				allmces += t->frequency - allmces * t->frequency;
			}

			/* Clamp to ensure sanity in the face of roundoff error */
			CLAMP_PROBABILITY(matched);
			CLAMP_PROBABILITY(allmces);

			selec = matched + (1.0 - allmces) * ((double) n_matched / length);

			/*
			 * In any case, never believe that a prefix match has selectivity
			 * less than we would assign for a non-MCELEM lexeme.  This
			 * preserves the property that "word:*" should be estimated to
			 * match at least as many rows as "word" would be.
			 */
			selec = Max(Min(DEFAULT_TS_MATCH_SEL, minfreq / 2), selec);
		}
		else
		{
			/* Regular exact lexeme match */
			TextFreq   *searchres;

			/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
			if (lookup == NULL)
				return (Selectivity) DEFAULT_TS_MATCH_SEL;

			searchres = (TextFreq *) bsearch(&key, lookup, length,
											 sizeof(TextFreq),
											 compare_lexeme_textfreq);

			if (searchres)
			{
				/*
				 * The element is in MCELEM.  Return precise selectivity (or
				 * at least as precise as ANALYZE could find out).
				 */
				selec = searchres->frequency;
			}
			else
			{
				/*
				 * The element is not in MCELEM.  Punt, but assume that the
				 * selectivity cannot be more than minfreq / 2.
				 */
				selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
			}
		}
	}
	else
	{
		/* Current TSQuery node is an operator */
		Selectivity s1,
					s2;

		switch (item->qoperator.oper)
		{
			case OP_NOT:
				selec = 1.0 - tsquery_opr_selec(item + 1, operand,
												lookup, length, minfreq);
				break;

			case OP_PHRASE:
			case OP_AND:
				s1 = tsquery_opr_selec(item + 1, operand,
									   lookup, length, minfreq);
				s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
									   lookup, length, minfreq);
				selec = s1 * s2;
				break;

			case OP_OR:
				s1 = tsquery_opr_selec(item + 1, operand,
									   lookup, length, minfreq);
				s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
									   lookup, length, minfreq);
				selec = s1 + s2 - s1 * s2;
				break;

			default:
				elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
				selec = 0;		/* keep compiler quiet */
				break;
		}
	}

	/* Clamp intermediate results to stay sane despite roundoff error */
	CLAMP_PROBABILITY(selec);

	return selec;
}

/*
 * bsearch() comparator for a lexeme (non-NULL terminated string with length)
 * and a TextFreq. Use length, then byte-for-byte comparison, because that's
 * how ANALYZE code sorted data before storing it in a statistic tuple.
 * See ts_typanalyze.c for details.
 */
static int
compare_lexeme_textfreq(const void *e1, const void *e2)
{
	const LexemeKey *key = (const LexemeKey *) e1;
	const TextFreq *t = (const TextFreq *) e2;
	int			len1,
				len2;

	len1 = key->length;
	len2 = VARSIZE_ANY_EXHDR(t->element);

	/* Compare lengths first, possibly avoiding a strncmp call */
	if (len1 > len2)
		return 1;
	else if (len1 < len2)
		return -1;

	/* Fall back on byte-for-byte comparison */
	return strncmp(key->lexeme, VARDATA_ANY(t->element), len1);
}