summaryrefslogtreecommitdiffstats
path: root/comm/mailnews/extensions/fts3/fts3_porter.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 17:32:43 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 17:32:43 +0000
commit6bf0a5cb5034a7e684dcc3500e841785237ce2dd (patch)
treea68f146d7fa01f0134297619fbe7e33db084e0aa /comm/mailnews/extensions/fts3/fts3_porter.c
parentInitial commit. (diff)
downloadthunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.tar.xz
thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.zip
Adding upstream version 1:115.7.0.upstream/1%115.7.0upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'comm/mailnews/extensions/fts3/fts3_porter.c')
-rw-r--r--comm/mailnews/extensions/fts3/fts3_porter.c1140
1 files changed, 1140 insertions, 0 deletions
diff --git a/comm/mailnews/extensions/fts3/fts3_porter.c b/comm/mailnews/extensions/fts3/fts3_porter.c
new file mode 100644
index 0000000000..e0e5d5ddaf
--- /dev/null
+++ b/comm/mailnews/extensions/fts3/fts3_porter.c
@@ -0,0 +1,1140 @@
+/*
+** 2006 September 30
+**
+** The author disclaims copyright to this source code. In place of
+** a legal notice, here is a blessing:
+**
+** May you do good and not evil.
+** May you find forgiveness for yourself and forgive others.
+** May you share freely, never taking more than you give.
+**
+*************************************************************************
+** Implementation of the full-text-search tokenizer that implements
+** a Porter stemmer.
+**
+*/
+
+/*
+ * This file is based on the SQLite FTS3 Porter Stemmer implementation.
+ *
+ * This is an attempt to provide some level of full-text search to users of
+ * Thunderbird who use languages that are not space/punctuation delimited.
+ * This is accomplished by performing bi-gram indexing of characters fall
+ * into the unicode space occupied by character sets used in such languages.
+ *
+ * Bi-gram indexing means that given the string "12345" we would index the
+ * pairs "12", "23", "34", and "45" (with position information). We do this
+ * because we are not sure where the word/semantic boundaries are in that
+ * string. Then, when a user searches for "234" the FTS3 engine tokenizes the
+ * search query into "23" and "34". Using special phrase-logic FTS3 requires
+ * the matches to have the tokens "23" and "34" adjacent to each other and in
+ * that order. In theory if the user searched for "2345" we we could just
+ * search for "23 NEAR/2 34". Unfortunately, NEAR does not imply ordering,
+ * so even though that would be more efficient, we would lose correctness
+ * and cannot do it.
+ *
+ * The efficiency and usability of bi-gram search assumes that the character
+ * space is large enough and actually observed bi-grams sufficiently
+ * distributed throughout the potential space so that the search bi-grams
+ * generated when the user issues a query find a 'reasonable' number of
+ * documents for each bi-gram match.
+ *
+ * Mozilla contributors:
+ * Makoto Kato <m_kato@ga2.so-net.ne.jp>
+ * Andrew Sutherland <asutherland@asutherland.org>
+ */
+
+/*
+** The code in this file is only compiled if:
+**
+** * The FTS3 module is being built as an extension
+** (in which case SQLITE_CORE is not defined), or
+**
+** * The FTS3 module is being built into the core of
+** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
+*/
+#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
+
+# include <assert.h>
+# include <stdlib.h>
+# include <stdio.h>
+# include <string.h>
+# include <ctype.h>
+
+# include "fts3_tokenizer.h"
+
+/* need some defined to compile without sqlite3 code */
+
+# define sqlite3_malloc malloc
+# define sqlite3_free free
+# define sqlite3_realloc realloc
+
+static const unsigned char sqlite3Utf8Trans1[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
+ 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
+ 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x00,
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
+ 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
+ 0x07, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
+};
+
+typedef unsigned char u8;
+
+/**
+ * SQLite helper macro from sqlite3.c (really utf.c) to encode a unicode
+ * character into utf8.
+ *
+ * @param zOut A pointer to the current write position that is updated by
+ * the routine. At entry it should point to one-past the last valid
+ * encoded byte. The same holds true at exit.
+ * @param c The character to encode; this should be an unsigned int.
+ */
+# define WRITE_UTF8(zOut, c) \
+ { \
+ if (c < 0x0080) { \
+ *zOut++ = (u8)(c & 0xff); \
+ } else if (c < 0x0800) { \
+ *zOut++ = 0xC0 + (u8)((c >> 6) & 0x1F); \
+ *zOut++ = 0x80 + (u8)(c & 0x3F); \
+ } else if (c < 0x10000) { \
+ *zOut++ = 0xE0 + (u8)((c >> 12) & 0x0F); \
+ *zOut++ = 0x80 + (u8)((c >> 6) & 0x3F); \
+ *zOut++ = 0x80 + (u8)(c & 0x3F); \
+ } else { \
+ *zOut++ = 0xf0 + (u8)((c >> 18) & 0x07); \
+ *zOut++ = 0x80 + (u8)((c >> 12) & 0x3F); \
+ *zOut++ = 0x80 + (u8)((c >> 6) & 0x3F); \
+ *zOut++ = 0x80 + (u8)(c & 0x3F); \
+ } \
+ }
+
+/**
+ * Fudge factor to avoid buffer overwrites when WRITE_UTF8 is involved.
+ *
+ * Our normalization table includes entries that may result in a larger
+ * utf-8 encoding. Namely, 023a maps to 2c65. This is a growth from 2 bytes
+ * as utf-8 encoded to 3 bytes. This is currently the only transition possible
+ * because 1-byte encodings are known to stay 1-byte and our normalization
+ * table is 16-bit and so can't generate a 4-byte encoded output.
+ *
+ * For simplicity, we just multiple by 2 which covers the current case and
+ * potential growth for 2-byte to 4-byte growth. We can afford to do this
+ * because we're not talking about a lot of memory here as a rule.
+ */
+# define MAX_UTF8_GROWTH_FACTOR 2
+
+/**
+ * Helper from sqlite3.c to read a single UTF8 character.
+ *
+ * The clever bit with multi-byte reading is that you keep going until you find
+ * a byte whose top bits are not '10'. A single-byte UTF8 character will have
+ * '00' or '01', and a multi-byte UTF8 character must start with '11'.
+ *
+ * In the event of illegal UTF-8 this macro may read an arbitrary number of
+ * characters but will never read past zTerm. The resulting character value
+ * of illegal UTF-8 can be anything, although efforts are made to return the
+ * illegal character (0xfffd) for UTF-16 surrogates.
+ *
+ * @param zIn A pointer to the current position that is updated by the routine,
+ * pointing at the start of the next character when the routine returns.
+ * @param zTerm A pointer one past the end of the buffer.
+ * @param c The 'unsigned int' to hold the resulting character value. Do not
+ * use a short or a char.
+ */
+# define READ_UTF8(zIn, zTerm, c) \
+ { \
+ c = *(zIn++); \
+ if (c >= 0xc0) { \
+ c = sqlite3Utf8Trans1[c - 0xc0]; \
+ while (zIn != zTerm && (*zIn & 0xc0) == 0x80) { \
+ c = (c << 6) + (0x3f & *(zIn++)); \
+ } \
+ if (c < 0x80 || (c & 0xFFFFF800) == 0xD800 || \
+ (c & 0xFFFFFFFE) == 0xFFFE) { \
+ c = 0xFFFD; \
+ } \
+ } \
+ }
+
+/* end of compatible block to complie codes */
+
+/*
+** Class derived from sqlite3_tokenizer
+*/
+typedef struct porter_tokenizer {
+ sqlite3_tokenizer base; /* Base class */
+} porter_tokenizer;
+
+/*
+** Class derived from sqlit3_tokenizer_cursor
+*/
+typedef struct porter_tokenizer_cursor {
+ sqlite3_tokenizer_cursor base;
+ const char* zInput; /* input we are tokenizing */
+ int nInput; /* size of the input */
+ int iOffset; /* current position in zInput */
+ int iToken; /* index of next token to be returned */
+ unsigned char* zToken; /* storage for current token */
+ int nAllocated; /* space allocated to zToken buffer */
+ /**
+ * Store the offset of the second character in the bi-gram pair that we just
+ * emitted so that we can consider it being the first character in a bi-gram
+ * pair.
+ * The value 0 indicates that there is no previous such character. This is
+ * an acceptable sentinel value because the 0th offset can never be the
+ * offset of the second in a bi-gram pair.
+ *
+ * For example, let us say we are tokenizing a string of 4 CJK characters
+ * represented by the byte-string "11223344" where each repeated digit
+ * indicates 2-bytes of storage used to encode the character in UTF-8.
+ * (It actually takes 3, btw.) Then on the passes to emit each token,
+ * the iOffset and iPrevGigramOffset values at entry will be:
+ *
+ * 1122: iOffset = 0, iPrevBigramOffset = 0
+ * 2233: iOffset = 4, iPrevBigramOffset = 2
+ * 3344: iOffset = 6, iPrevBigramOffset = 4
+ * (nothing will be emitted): iOffset = 8, iPrevBigramOffset = 6
+ */
+ int iPrevBigramOffset; /* previous result was bi-gram */
+} porter_tokenizer_cursor;
+
+/* Forward declaration */
+static const sqlite3_tokenizer_module porterTokenizerModule;
+
+/* from normalize.c */
+extern unsigned int normalize_character(const unsigned int c);
+
+/*
+** Create a new tokenizer instance.
+*/
+static int porterCreate(int argc, const char* const* argv,
+ sqlite3_tokenizer** ppTokenizer) {
+ porter_tokenizer* t;
+ t = (porter_tokenizer*)sqlite3_malloc(sizeof(*t));
+ if (t == NULL) return SQLITE_NOMEM;
+ memset(t, 0, sizeof(*t));
+ *ppTokenizer = &t->base;
+ return SQLITE_OK;
+}
+
+/*
+** Destroy a tokenizer
+*/
+static int porterDestroy(sqlite3_tokenizer* pTokenizer) {
+ sqlite3_free(pTokenizer);
+ return SQLITE_OK;
+}
+
+/*
+** Prepare to begin tokenizing a particular string. The input
+** string to be tokenized is zInput[0..nInput-1]. A cursor
+** used to incrementally tokenize this string is returned in
+** *ppCursor.
+*/
+static int porterOpen(
+ sqlite3_tokenizer* pTokenizer, /* The tokenizer */
+ const char* zInput, int nInput, /* String to be tokenized */
+ sqlite3_tokenizer_cursor** ppCursor /* OUT: Tokenization cursor */
+) {
+ porter_tokenizer_cursor* c;
+
+ c = (porter_tokenizer_cursor*)sqlite3_malloc(sizeof(*c));
+ if (c == NULL) return SQLITE_NOMEM;
+
+ c->zInput = zInput;
+ if (zInput == 0) {
+ c->nInput = 0;
+ } else if (nInput < 0) {
+ c->nInput = (int)strlen(zInput);
+ } else {
+ c->nInput = nInput;
+ }
+ c->iOffset = 0; /* start tokenizing at the beginning */
+ c->iToken = 0;
+ c->zToken = NULL; /* no space allocated, yet. */
+ c->nAllocated = 0;
+ c->iPrevBigramOffset = 0;
+
+ *ppCursor = &c->base;
+ return SQLITE_OK;
+}
+
+/*
+** Close a tokenization cursor previously opened by a call to
+** porterOpen() above.
+*/
+static int porterClose(sqlite3_tokenizer_cursor* pCursor) {
+ porter_tokenizer_cursor* c = (porter_tokenizer_cursor*)pCursor;
+ sqlite3_free(c->zToken);
+ sqlite3_free(c);
+ return SQLITE_OK;
+}
+/*
+** Vowel or consonant
+*/
+static const char cType[] = {0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
+ 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 1};
+
+/*
+** isConsonant() and isVowel() determine if their first character in
+** the string they point to is a consonant or a vowel, according
+** to Porter ruls.
+**
+** A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'.
+** 'Y' is a consonant unless it follows another consonant,
+** in which case it is a vowel.
+**
+** In these routine, the letters are in reverse order. So the 'y' rule
+** is that 'y' is a consonant unless it is followed by another
+** consonant.
+*/
+static int isVowel(const char*);
+static int isConsonant(const char* z) {
+ int j;
+ char x = *z;
+ if (x == 0) return 0;
+ assert(x >= 'a' && x <= 'z');
+ j = cType[x - 'a'];
+ if (j < 2) return j;
+ return z[1] == 0 || isVowel(z + 1);
+}
+static int isVowel(const char* z) {
+ int j;
+ char x = *z;
+ if (x == 0) return 0;
+ assert(x >= 'a' && x <= 'z');
+ j = cType[x - 'a'];
+ if (j < 2) return 1 - j;
+ return isConsonant(z + 1);
+}
+
+/*
+** Let any sequence of one or more vowels be represented by V and let
+** C be sequence of one or more consonants. Then every word can be
+** represented as:
+**
+** [C] (VC){m} [V]
+**
+** In prose: A word is an optional consonant followed by zero or
+** vowel-consonant pairs followed by an optional vowel. "m" is the
+** number of vowel consonant pairs. This routine computes the value
+** of m for the first i bytes of a word.
+**
+** Return true if the m-value for z is 1 or more. In other words,
+** return true if z contains at least one vowel that is followed
+** by a consonant.
+**
+** In this routine z[] is in reverse order. So we are really looking
+** for an instance of of a consonant followed by a vowel.
+*/
+static int m_gt_0(const char* z) {
+ while (isVowel(z)) {
+ z++;
+ }
+ if (*z == 0) return 0;
+ while (isConsonant(z)) {
+ z++;
+ }
+ return *z != 0;
+}
+
+/* Like mgt0 above except we are looking for a value of m which is
+** exactly 1
+*/
+static int m_eq_1(const char* z) {
+ while (isVowel(z)) {
+ z++;
+ }
+ if (*z == 0) return 0;
+ while (isConsonant(z)) {
+ z++;
+ }
+ if (*z == 0) return 0;
+ while (isVowel(z)) {
+ z++;
+ }
+ if (*z == 0) return 1;
+ while (isConsonant(z)) {
+ z++;
+ }
+ return *z == 0;
+}
+
+/* Like mgt0 above except we are looking for a value of m>1 instead
+** or m>0
+*/
+static int m_gt_1(const char* z) {
+ while (isVowel(z)) {
+ z++;
+ }
+ if (*z == 0) return 0;
+ while (isConsonant(z)) {
+ z++;
+ }
+ if (*z == 0) return 0;
+ while (isVowel(z)) {
+ z++;
+ }
+ if (*z == 0) return 0;
+ while (isConsonant(z)) {
+ z++;
+ }
+ return *z != 0;
+}
+
+/*
+** Return TRUE if there is a vowel anywhere within z[0..n-1]
+*/
+static int hasVowel(const char* z) {
+ while (isConsonant(z)) {
+ z++;
+ }
+ return *z != 0;
+}
+
+/*
+** Return TRUE if the word ends in a double consonant.
+**
+** The text is reversed here. So we are really looking at
+** the first two characters of z[].
+*/
+static int doubleConsonant(const char* z) {
+ return isConsonant(z) && z[0] == z[1] && isConsonant(z + 1);
+}
+
+/*
+** Return TRUE if the word ends with three letters which
+** are consonant-vowel-consonent and where the final consonant
+** is not 'w', 'x', or 'y'.
+**
+** The word is reversed here. So we are really checking the
+** first three letters and the first one cannot be in [wxy].
+*/
+static int star_oh(const char* z) {
+ return z[0] != 0 && isConsonant(z) && z[0] != 'w' && z[0] != 'x' &&
+ z[0] != 'y' && z[1] != 0 && isVowel(z + 1) && z[2] != 0 &&
+ isConsonant(z + 2);
+}
+
+/*
+** If the word ends with zFrom and xCond() is true for the stem
+** of the word that precedes the zFrom ending, then change the
+** ending to zTo.
+**
+** The input word *pz and zFrom are both in reverse order. zTo
+** is in normal order.
+**
+** Return TRUE if zFrom matches. Return FALSE if zFrom does not
+** match. Not that TRUE is returned even if xCond() fails and
+** no substitution occurs.
+*/
+static int stem(
+ char** pz, /* The word being stemmed (Reversed) */
+ const char* zFrom, /* If the ending matches this... (Reversed) */
+ const char* zTo, /* ... change the ending to this (not reversed) */
+ int (*xCond)(const char*) /* Condition that must be true */
+) {
+ char* z = *pz;
+ while (*zFrom && *zFrom == *z) {
+ z++;
+ zFrom++;
+ }
+ if (*zFrom != 0) return 0;
+ if (xCond && !xCond(z)) return 1;
+ while (*zTo) {
+ *(--z) = *(zTo++);
+ }
+ *pz = z;
+ return 1;
+}
+
+/**
+ * Voiced sound mark is only on Japanese. It is like accent. It combines with
+ * previous character. Example, "サ" (Katakana) with "゛" (voiced sound mark)
+ * is "ザ". Although full-width character mapping has combined character like
+ * "ザ", there is no combined character on half-width Katanaka character
+ * mapping.
+ */
+static int isVoicedSoundMark(const unsigned int c) {
+ if (c == 0xff9e || c == 0xff9f || c == 0x3099 || c == 0x309a) return 1;
+ return 0;
+}
+
+/**
+ * How many unicode characters to take from the front and back of a term in
+ * |copy_stemmer|.
+ */
+# define COPY_STEMMER_COPY_HALF_LEN 10
+
+/**
+ * Normalizing but non-stemming term copying.
+ *
+ * The original function would take 10 bytes from the front and 10 bytes from
+ * the back if there were no digits in the string and it was more than 20
+ * bytes long. If there were digits involved that would decrease to 3 bytes
+ * from the front and 3 from the back. This would potentially corrupt utf-8
+ * encoded characters, which is fine from the perspective of the FTS3 logic.
+ *
+ * In our revised form we now operate on a unicode character basis rather than
+ * a byte basis. Additionally we use the same length limit even if there are
+ * digits involved because it's not clear digit token-space reduction is saving
+ * us from anything and could be hurting. Specifically, if no one is ever
+ * going to search on things with digits, then we should just remove them.
+ * Right now, the space reduction is going to increase false positives when
+ * people do search on them and increase the number of collisions sufficiently
+ * to make it really expensive. The caveat is there will be some increase in
+ * index size which could be meaningful if people are receiving lots of emails
+ * full of distinct numbers.
+ *
+ * In order to do the copy-from-the-front and copy-from-the-back trick, once
+ * we reach N characters in, we set zFrontEnd to the current value of zOut
+ * (which represents the termination of the first part of the result string)
+ * and set zBackStart to the value of zOutStart. We then advanced zBackStart
+ * along a character at a time as we write more characters. Once we have
+ * traversed the entire string, if zBackStart > zFrontEnd, then we know
+ * the string should be shrunk using the characters in the two ranges.
+ *
+ * (It would be faster to scan from the back with specialized logic but that
+ * particular logic seems easy to screw up and we don't have unit tests in here
+ * to the extent required.)
+ *
+ * @param zIn Input string to normalize and potentially shrink.
+ * @param nBytesIn The number of bytes in zIn, distinct from the number of
+ * unicode characters encoded in zIn.
+ * @param zOut The string to write our output into. This must have at least
+ * nBytesIn * MAX_UTF8_GROWTH_FACTOR in order to compensate for
+ * normalization that results in a larger utf-8 encoding.
+ * @param pnBytesOut Integer to write the number of bytes in zOut into.
+ */
+static void copy_stemmer(const unsigned char* zIn, const int nBytesIn,
+ unsigned char* zOut, int* pnBytesOut) {
+ const unsigned char* zInTerm = zIn + nBytesIn;
+ unsigned char* zOutStart = zOut;
+ unsigned int c;
+ unsigned int charCount = 0;
+ unsigned char *zFrontEnd = NULL, *zBackStart = NULL;
+ unsigned int trashC;
+
+ /* copy normalized character */
+ while (zIn < zInTerm) {
+ READ_UTF8(zIn, zInTerm, c);
+ c = normalize_character(c);
+
+ /* ignore voiced/semi-voiced sound mark */
+ if (!isVoicedSoundMark(c)) {
+ /* advance one non-voiced sound mark character. */
+ if (zBackStart) READ_UTF8(zBackStart, zOut, trashC);
+
+ WRITE_UTF8(zOut, c);
+ charCount++;
+ if (charCount == COPY_STEMMER_COPY_HALF_LEN) {
+ zFrontEnd = zOut;
+ zBackStart = zOutStart;
+ }
+ }
+ }
+
+ /* if we need to shrink the string, transplant the back bytes */
+ if (zBackStart > zFrontEnd) { /* this handles when both are null too */
+ size_t backBytes = zOut - zBackStart;
+ memmove(zFrontEnd, zBackStart, backBytes);
+ zOut = zFrontEnd + backBytes;
+ }
+ *zOut = 0;
+ *pnBytesOut = zOut - zOutStart;
+}
+
+/*
+** Stem the input word zIn[0..nIn-1]. Store the output in zOut.
+** zOut is at least big enough to hold nIn bytes. Write the actual
+** size of the output word (exclusive of the '\0' terminator) into *pnOut.
+**
+** Any upper-case characters in the US-ASCII character set ([A-Z])
+** are converted to lower case. Upper-case UTF characters are
+** unchanged.
+**
+** Words that are longer than about 20 bytes are stemmed by retaining
+** a few bytes from the beginning and the end of the word. If the
+** word contains digits, 3 bytes are taken from the beginning and
+** 3 bytes from the end. For long words without digits, 10 bytes
+** are taken from each end. US-ASCII case folding still applies.
+**
+** If the input word contains not digits but does characters not
+** in [a-zA-Z] then no stemming is attempted and this routine just
+** copies the input into the input into the output with US-ASCII
+** case folding.
+**
+** Stemming never increases the length of the word. So there is
+** no chance of overflowing the zOut buffer.
+*/
+static void porter_stemmer(const unsigned char* zIn, unsigned int nIn,
+ unsigned char* zOut, int* pnOut) {
+ unsigned int i, j, c;
+ char zReverse[28];
+ char *z, *z2;
+ const unsigned char* zTerm = zIn + nIn;
+ const unsigned char* zTmp = zIn;
+
+ if (nIn < 3 || nIn >= sizeof(zReverse) - 7) {
+ /* The word is too big or too small for the porter stemmer.
+ ** Fallback to the copy stemmer */
+ copy_stemmer(zIn, nIn, zOut, pnOut);
+ return;
+ }
+ for (j = sizeof(zReverse) - 6; zTmp < zTerm; j--) {
+ READ_UTF8(zTmp, zTerm, c);
+ c = normalize_character(c);
+ if (c >= 'a' && c <= 'z') {
+ zReverse[j] = c;
+ } else {
+ /* The use of a character not in [a-zA-Z] means that we fallback
+ ** to the copy stemmer */
+ copy_stemmer(zIn, nIn, zOut, pnOut);
+ return;
+ }
+ }
+ memset(&zReverse[sizeof(zReverse) - 5], 0, 5);
+ z = &zReverse[j + 1];
+
+ /* Step 1a */
+ if (z[0] == 's') {
+ if (!stem(&z, "sess", "ss", 0) && !stem(&z, "sei", "i", 0) &&
+ !stem(&z, "ss", "ss", 0)) {
+ z++;
+ }
+ }
+
+ /* Step 1b */
+ z2 = z;
+ if (stem(&z, "dee", "ee", m_gt_0)) {
+ /* Do nothing. The work was all in the test */
+ } else if ((stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel)) &&
+ z != z2) {
+ if (stem(&z, "ta", "ate", 0) || stem(&z, "lb", "ble", 0) ||
+ stem(&z, "zi", "ize", 0)) {
+ /* Do nothing. The work was all in the test */
+ } else if (doubleConsonant(z) && (*z != 'l' && *z != 's' && *z != 'z')) {
+ z++;
+ } else if (m_eq_1(z) && star_oh(z)) {
+ *(--z) = 'e';
+ }
+ }
+
+ /* Step 1c */
+ if (z[0] == 'y' && hasVowel(z + 1)) {
+ z[0] = 'i';
+ }
+
+ /* Step 2 */
+ switch (z[1]) {
+ case 'a':
+ (void)(stem(&z, "lanoita", "ate", m_gt_0) ||
+ stem(&z, "lanoit", "tion", m_gt_0));
+ break;
+ case 'c':
+ (void)(stem(&z, "icne", "ence", m_gt_0) ||
+ stem(&z, "icna", "ance", m_gt_0));
+ break;
+ case 'e':
+ (void)(stem(&z, "rezi", "ize", m_gt_0));
+ break;
+ case 'g':
+ (void)(stem(&z, "igol", "log", m_gt_0));
+ break;
+ case 'l':
+ (void)(stem(&z, "ilb", "ble", m_gt_0) || stem(&z, "illa", "al", m_gt_0) ||
+ stem(&z, "iltne", "ent", m_gt_0) || stem(&z, "ile", "e", m_gt_0) ||
+ stem(&z, "ilsuo", "ous", m_gt_0));
+ break;
+ case 'o':
+ (void)(stem(&z, "noitazi", "ize", m_gt_0) ||
+ stem(&z, "noita", "ate", m_gt_0) ||
+ stem(&z, "rota", "ate", m_gt_0));
+ break;
+ case 's':
+ (void)(stem(&z, "msila", "al", m_gt_0) ||
+ stem(&z, "ssenevi", "ive", m_gt_0) ||
+ stem(&z, "ssenluf", "ful", m_gt_0) ||
+ stem(&z, "ssensuo", "ous", m_gt_0));
+ break;
+ case 't':
+ (void)(stem(&z, "itila", "al", m_gt_0) ||
+ stem(&z, "itivi", "ive", m_gt_0) ||
+ stem(&z, "itilib", "ble", m_gt_0));
+ break;
+ }
+
+ /* Step 3 */
+ switch (z[0]) {
+ case 'e':
+ (void)(stem(&z, "etaci", "ic", m_gt_0) || stem(&z, "evita", "", m_gt_0) ||
+ stem(&z, "ezila", "al", m_gt_0));
+ break;
+ case 'i':
+ (void)(stem(&z, "itici", "ic", m_gt_0));
+ break;
+ case 'l':
+ (void)(stem(&z, "laci", "ic", m_gt_0) || stem(&z, "luf", "", m_gt_0));
+ break;
+ case 's':
+ (void)(stem(&z, "ssen", "", m_gt_0));
+ break;
+ }
+
+ /* Step 4 */
+ switch (z[1]) {
+ case 'a':
+ if (z[0] == 'l' && m_gt_1(z + 2)) {
+ z += 2;
+ }
+ break;
+ case 'c':
+ if (z[0] == 'e' && z[2] == 'n' && (z[3] == 'a' || z[3] == 'e') &&
+ m_gt_1(z + 4)) {
+ z += 4;
+ }
+ break;
+ case 'e':
+ if (z[0] == 'r' && m_gt_1(z + 2)) {
+ z += 2;
+ }
+ break;
+ case 'i':
+ if (z[0] == 'c' && m_gt_1(z + 2)) {
+ z += 2;
+ }
+ break;
+ case 'l':
+ if (z[0] == 'e' && z[2] == 'b' && (z[3] == 'a' || z[3] == 'i') &&
+ m_gt_1(z + 4)) {
+ z += 4;
+ }
+ break;
+ case 'n':
+ if (z[0] == 't') {
+ if (z[2] == 'a') {
+ if (m_gt_1(z + 3)) {
+ z += 3;
+ }
+ } else if (z[2] == 'e') {
+ (void)(stem(&z, "tneme", "", m_gt_1) ||
+ stem(&z, "tnem", "", m_gt_1) || stem(&z, "tne", "", m_gt_1));
+ }
+ }
+ break;
+ case 'o':
+ if (z[0] == 'u') {
+ if (m_gt_1(z + 2)) {
+ z += 2;
+ }
+ } else if (z[3] == 's' || z[3] == 't') {
+ (void)(stem(&z, "noi", "", m_gt_1));
+ }
+ break;
+ case 's':
+ if (z[0] == 'm' && z[2] == 'i' && m_gt_1(z + 3)) {
+ z += 3;
+ }
+ break;
+ case 't':
+ (void)(stem(&z, "eta", "", m_gt_1) || stem(&z, "iti", "", m_gt_1));
+ break;
+ case 'u':
+ if (z[0] == 's' && z[2] == 'o' && m_gt_1(z + 3)) {
+ z += 3;
+ }
+ break;
+ case 'v':
+ case 'z':
+ if (z[0] == 'e' && z[2] == 'i' && m_gt_1(z + 3)) {
+ z += 3;
+ }
+ break;
+ }
+
+ /* Step 5a */
+ if (z[0] == 'e') {
+ if (m_gt_1(z + 1)) {
+ z++;
+ } else if (m_eq_1(z + 1) && !star_oh(z + 1)) {
+ z++;
+ }
+ }
+
+ /* Step 5b */
+ if (m_gt_1(z) && z[0] == 'l' && z[1] == 'l') {
+ z++;
+ }
+
+ /* z[] is now the stemmed word in reverse order. Flip it back
+ ** around into forward order and return.
+ */
+ *pnOut = i = strlen(z);
+ zOut[i] = 0;
+ while (*z) {
+ zOut[--i] = *(z++);
+ }
+}
+
+/**
+ * Indicate whether characters in the 0x30 - 0x7f region can be part of a token.
+ * Letters and numbers can; punctuation (and 'del') can't.
+ */
+static const char porterIdChar[] = {
+ /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */
+};
+
+/**
+ * Test whether a character is a (non-ascii) space character or not. isDelim
+ * uses the existing porter stemmer logic for anything in the ASCII (< 0x80)
+ * space which covers 0x20.
+ *
+ * 0x2000-0x206F is the general punctuation table. 0x2000 - 0x200b are spaces.
+ * The spaces 0x2000 - 0x200a are all defined as roughly equivalent to a
+ * standard 0x20 space. 0x200b is a "zero width space" (ZWSP) and not like an
+ * 0x20 space. 0x202f is a narrow no-break space and roughly equivalent to an
+ * 0x20 space. 0x205f is a "medium mathematical space" and defined as roughly
+ * equivalent to an 0x20 space.
+ */
+# define IS_UNI_SPACE(x) \
+ (((x) >= 0x2000 && (x) <= 0x200a) || (x) == 0x202f || (x) == 0x205f)
+/**
+ * What we are checking for:
+ * - 0x3001: Ideographic comma (-> 0x2c ',')
+ * - 0x3002: Ideographic full stop (-> 0x2e '.')
+ * - 0xff0c: fullwidth comma (~ wide 0x2c ',')
+ * - 0xff0e: fullwidth full stop (~ wide 0x2e '.')
+ * - 0xff61: halfwidth ideographic full stop (~ narrow 0x3002)
+ * - 0xff64: halfwidth ideographic comma (~ narrow 0x3001)
+ *
+ * It is possible we should be treating other things as delimiters!
+ */
+# define IS_JA_DELIM(x) \
+ (((x) == 0x3001) || ((x) == 0xFF64) || ((x) == 0xFF0E) || \
+ ((x) == 0x3002) || ((x) == 0xFF61) || ((x) == 0xFF0C))
+
+/**
+ * The previous character was a delimiter (which includes the start of the
+ * string).
+ */
+# define BIGRAM_RESET 0
+/**
+ * The previous character was a CJK character and we have only seen one of them.
+ * If we had seen more than one in a row it would be the BIGRAM_USE state.
+ */
+# define BIGRAM_UNKNOWN 1
+/**
+ * We have seen two or more CJK characters in a row.
+ */
+# define BIGRAM_USE 2
+/**
+ * The previous character was ASCII or something in the unicode general scripts
+ * area that we do not believe is a delimiter. We call it 'alpha' as in
+ * alphabetic/alphanumeric and something that should be tokenized based on
+ * delimiters rather than on a bi-gram basis.
+ */
+# define BIGRAM_ALPHA 3
+
+static int isDelim(
+ const unsigned char* zCur, /* IN: current pointer of token */
+ const unsigned char* zTerm, /* IN: one character beyond end of token */
+ int* len, /* OUT: analyzed bytes in this token */
+ int* state /* IN/OUT: analyze state */
+) {
+ const unsigned char* zIn = zCur;
+ unsigned int c;
+ int delim;
+
+ /* get the unicode character to analyze */
+ READ_UTF8(zIn, zTerm, c);
+ c = normalize_character(c);
+ *len = zIn - zCur;
+
+ /* ASCII character range has rule */
+ if (c < 0x80) {
+ // This is original porter stemmer isDelim logic.
+ // 0x0 - 0x1f are all control characters, 0x20 is space, 0x21-0x2f are
+ // punctuation.
+ delim = (c < 0x30 || !porterIdChar[c - 0x30]);
+ // cases: "&a", "&."
+ if (*state == BIGRAM_USE || *state == BIGRAM_UNKNOWN) {
+ /* previous maybe CJK and current is ascii */
+ *state = BIGRAM_ALPHA; /*ascii*/
+ delim = 1; /* must break */
+ } else if (delim == 1) {
+ // cases: "a.", ".."
+ /* this is delimiter character */
+ *state = BIGRAM_RESET; /*reset*/
+ } else {
+ // cases: "aa", ".a"
+ *state = BIGRAM_ALPHA; /*ascii*/
+ }
+ return delim;
+ }
+
+ // (at this point we must be a non-ASCII character)
+
+ /* voiced/semi-voiced sound mark is ignore */
+ if (isVoicedSoundMark(c) && *state != BIGRAM_ALPHA) {
+ /* ignore this because it is combined with previous char */
+ return 0;
+ }
+
+ /* this isn't CJK range, so return as no delim */
+ // Anything less than 0x2000 (except to U+0E00-U+0EFF and U+1780-U+17FF)
+ // is the general scripts area and should not be bi-gram indexed.
+ // 0xa000 - 0a4cf is the Yi area. It is apparently a phonetic language whose
+ // usage does not appear to have simple delimiter rules, so we're leaving it
+ // as bigram processed. This is a guess, if you know better, let us know.
+ // (We previously bailed on this range too.)
+ // Addition, U+0E00-U+0E7F is Thai, U+0E80-U+0EFF is Laos,
+ // and U+1780-U+17FF is Khmer. It is no easy way to break each word.
+ // So these should use bi-gram too.
+ // cases: "aa", ".a", "&a"
+ if (c < 0xe00 || (c >= 0xf00 && c < 0x1780) || (c >= 0x1800 && c < 0x2000)) {
+ *state = BIGRAM_ALPHA; /* not really ASCII but same idea; tokenize it */
+ return 0;
+ }
+
+ // (at this point we must be a bi-grammable char or delimiter)
+
+ /* this is space character or delim character */
+ // cases: "a.", "..", "&."
+ if (IS_UNI_SPACE(c) || IS_JA_DELIM(c)) {
+ *state = BIGRAM_RESET; /* reset */
+ return 1; /* it actually is a delimiter; report as such */
+ }
+
+ // (at this point we must be a bi-grammable char)
+
+ // cases: "a&"
+ if (*state == BIGRAM_ALPHA) {
+ /* Previous is ascii and current maybe CJK */
+ *state = BIGRAM_UNKNOWN; /* mark as unknown */
+ return 1; /* break to emit the ASCII token*/
+ }
+
+ /* We have no rule for CJK!. use bi-gram */
+ // cases: "&&"
+ if (*state == BIGRAM_UNKNOWN || *state == BIGRAM_USE) {
+ /* previous state is unknown. mark as bi-gram */
+ *state = BIGRAM_USE;
+ return 1; /* break to emit the digram */
+ }
+
+ // cases: ".&" (*state == BIGRAM_RESET)
+ *state = BIGRAM_UNKNOWN; /* mark as unknown */
+ return 0; /* no need to break; nothing to emit */
+}
+
+/**
+ * Generate a new token. There are basically three types of token we can
+ * generate:
+ * - A porter stemmed token. This is a word entirely comprised of ASCII
+ * characters. We run the porter stemmer algorithm against the word.
+ * Because we have no way to know what is and is not an English word
+ * (the only language for which the porter stemmer was designed), this
+ * could theoretically map multiple words that are not variations of the
+ * same word down to the same root, resulting in potentially unexpected
+ * result inclusions in the search results. We accept this result because
+ * there's not a lot we can do about it and false positives are much
+ * better than false negatives.
+ * - A copied token; case/accent-folded but not stemmed. We call the porter
+ * stemmer for all non-CJK cases and it diverts to the copy stemmer if it
+ * sees any non-ASCII characters (after folding) or if the string is too
+ * long. The copy stemmer will shrink the string if it is deemed too long.
+ * - A bi-gram token; two CJK-ish characters. For query reasons we generate a
+ * series of overlapping bi-grams. (We can't require the user to start their
+ * search based on the arbitrary context of the indexed documents.)
+ *
+ * It may be useful to think of this function as operating at the points between
+ * characters. While we are considering the 'current' character (the one after
+ * the 'point'), we are also interested in the 'previous' character (the one
+ * preceding the point).
+ * At any 'point', there are a number of possible situations which I will
+ * illustrate with pairs of characters. 'a' means alphanumeric ASCII or a
+ * non-ASCII character that is not bi-grammable or a delimiter, '.'
+ * means a delimiter (space or punctuation), '&' means a bi-grammable
+ * character.
+ * - aa: We are in the midst of a token. State remains BIGRAM_ALPHA.
+ * - a.: We will generate a porter stemmed or copied token. State was
+ * BIGRAM_ALPHA, gets set to BIGRAM_RESET.
+ * - a&: We will generate a porter stemmed or copied token; we will set our
+ * state to BIGRAM_UNKNOWN to indicate we have seen one bigram character
+ * but that it is not yet time to emit a bigram.
+ * - .a: We are starting a token. State was BIGRAM_RESET, gets set to
+ * BIGRAM_ALPHA.
+ * - ..: We skip/eat the delimiters. State stays BIGRAM_RESET.
+ * - .&: State set to BIGRAM_UNKNOWN to indicate we have seen one bigram char.
+ * - &a: If the state was BIGRAM_USE, we generate a bi-gram token. If the state
+ * was BIGRAM_UNKNOWN we had only seen one CJK character and so don't do
+ * anything. State is set to BIGRAM_ALPHA.
+ * - &.: Same as the "&a" case, but state is set to BIGRAM_RESET.
+ * - &&: We will generate a bi-gram token. State was either BIGRAM_UNKNOWN or
+ * BIGRAM_USE, gets set to BIGRAM_USE.
+ */
+static int porterNext(
+ sqlite3_tokenizer_cursor* pCursor, /* Cursor returned by porterOpen */
+ const char** pzToken, /* OUT: *pzToken is the token text */
+ int* pnBytes, /* OUT: Number of bytes in token */
+ int* piStartOffset, /* OUT: Starting offset of token */
+ int* piEndOffset, /* OUT: Ending offset of token */
+ int* piPosition /* OUT: Position integer of token */
+) {
+ porter_tokenizer_cursor* c = (porter_tokenizer_cursor*)pCursor;
+ const unsigned char* z = (unsigned char*)c->zInput;
+ int len = 0;
+ int state;
+
+ while (c->iOffset < c->nInput) {
+ int iStartOffset, numChars;
+
+ /*
+ * This loop basically has two modes of operation:
+ * - general processing (iPrevBigramOffset == 0 here)
+ * - CJK processing (iPrevBigramOffset != 0 here)
+ *
+ * In an general processing pass we skip over all the delimiters, leaving us
+ * at a character that promises to produce a token. This could be a CJK
+ * token (state == BIGRAM_USE) or an ALPHA token (state == BIGRAM_ALPHA).
+ * If it was a CJK token, we transition into CJK state for the next loop.
+ * If it was an alpha token, our current offset is pointing at a delimiter
+ * (which could be a CJK character), so it is good that our next pass
+ * through the function and loop will skip over any delimiters. If the
+ * delimiter we hit was a CJK character, the next time through we will
+ * not treat it as a delimiter though; the entry state for that scan is
+ * BIGRAM_RESET so the transition is not treated as a delimiter!
+ *
+ * The CJK pass always starts with the second character in a bi-gram emitted
+ * as a token in the previous step. No delimiter skipping is required
+ * because we know that first character might produce a token for us. It
+ * only 'might' produce a token because the previous pass performed no
+ * lookahead and cannot be sure it is followed by another CJK character.
+ * This is why
+ */
+
+ // If we have a previous bigram offset
+ if (c->iPrevBigramOffset == 0) {
+ /* Scan past delimiter characters */
+ state = BIGRAM_RESET; /* reset */
+ while (c->iOffset < c->nInput &&
+ isDelim(z + c->iOffset, z + c->nInput, &len, &state)) {
+ c->iOffset += len;
+ }
+
+ } else {
+ /* for bigram indexing, use previous offset */
+ c->iOffset = c->iPrevBigramOffset;
+ }
+
+ /* Count non-delimiter characters. */
+ iStartOffset = c->iOffset;
+ numChars = 0;
+
+ // Start from a reset state. This means the first character we see
+ // (which will not be a delimiter) determines which of ALPHA or CJK modes
+ // we are operating in. (It won't be a delimiter because in a 'general'
+ // pass as defined above, we will have eaten all the delimiters, and in
+ // a CJK pass we are guaranteed that the first character is CJK.)
+ state = BIGRAM_RESET; /* state is reset */
+ // Advance until it is time to emit a token.
+ // For ALPHA characters, this means advancing until we encounter a delimiter
+ // or a CJK character. iOffset will be pointing at the delimiter or CJK
+ // character, aka one beyond the last ALPHA character.
+ // For CJK characters this means advancing until we encounter an ALPHA
+ // character, a delimiter, or we have seen two consecutive CJK
+ // characters. iOffset points at the ALPHA/delimiter in the first 2 cases
+ // and the second of two CJK characters in the last case.
+ // Because of the way this loop is structured, iOffset is only updated
+ // when we don't terminate. However, if we terminate, len still contains
+ // the number of bytes in the character found at iOffset. (This is useful
+ // in the CJK case.)
+ while (c->iOffset < c->nInput &&
+ !isDelim(z + c->iOffset, z + c->nInput, &len, &state)) {
+ c->iOffset += len;
+ numChars++;
+ }
+
+ if (state == BIGRAM_USE) {
+ /* Split word by bigram */
+ // Right now iOffset is pointing at the second character in a pair.
+ // Save this offset so next-time through we start with that as the
+ // first character.
+ c->iPrevBigramOffset = c->iOffset;
+ // And now advance so that iOffset is pointing at the character after
+ // the second character in the bi-gram pair. Also count the char.
+ c->iOffset += len;
+ numChars++;
+ } else {
+ /* Reset bigram offset */
+ c->iPrevBigramOffset = 0;
+ }
+
+ /* We emit a token if:
+ * - there are two ideograms together,
+ * - there are three chars or more,
+ * - we think this is a query and wildcard magic is desired.
+ * We think is a wildcard query when we have a single character, it starts
+ * at the start of the buffer, it's CJK, our current offset is one shy of
+ * nInput and the character at iOffset is '*'. Because the state gets
+ * clobbered by the incidence of '*' our requirement for CJK is that the
+ * implied character length is at least 3 given that it takes at least 3
+ * bytes to encode to 0x2000.
+ */
+ // It is possible we have no token to emit here if iPrevBigramOffset was not
+ // 0 on entry and there was no second CJK character. iPrevBigramOffset
+ // will now be 0 if that is the case (and c->iOffset == iStartOffset).
+ if ( // allow two-character words only if in bigram
+ (numChars == 2 && state == BIGRAM_USE) ||
+ // otherwise, drop two-letter words (considered stop-words)
+ (numChars >= 3) ||
+ // wildcard case:
+ (numChars == 1 && iStartOffset == 0 && (c->iOffset >= 3) &&
+ (c->iOffset == c->nInput - 1) && (z[c->iOffset] == '*'))) {
+ /* figure out the number of bytes to copy/stem */
+ int n = c->iOffset - iStartOffset;
+ /* make sure there is enough buffer space */
+ if (n * MAX_UTF8_GROWTH_FACTOR > c->nAllocated) {
+ c->nAllocated = n * MAX_UTF8_GROWTH_FACTOR + 20;
+ c->zToken = sqlite3_realloc(c->zToken, c->nAllocated);
+ if (c->zToken == NULL) return SQLITE_NOMEM;
+ }
+
+ if (state == BIGRAM_USE) {
+ /* This is by bigram. So it is unnecessary to convert word */
+ copy_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
+ } else {
+ porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
+ }
+ *pzToken = (const char*)c->zToken;
+ *piStartOffset = iStartOffset;
+ *piEndOffset = c->iOffset;
+ *piPosition = c->iToken++;
+ return SQLITE_OK;
+ }
+ }
+ return SQLITE_DONE;
+}
+
+/*
+** The set of routines that implement the porter-stemmer tokenizer
+*/
+static const sqlite3_tokenizer_module porterTokenizerModule = {
+ 0, porterCreate, porterDestroy, porterOpen, porterClose, porterNext,
+};
+
+/*
+** Allocate a new porter tokenizer. Return a pointer to the new
+** tokenizer in *ppModule
+*/
+void sqlite3Fts3PorterTokenizerModule(
+ sqlite3_tokenizer_module const** ppModule) {
+ *ppModule = &porterTokenizerModule;
+}
+
+#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */