1 files changed, 1415 insertions, 0 deletions
diff --git a/ext/fts5/fts5_tokenize.c b/ext/fts5/fts5_tokenize.c
new file mode 100644
index 0000000..e61d6b1
--- /dev/null
+++ b/ext/fts5/fts5_tokenize.c
@@ -0,0 +1,1415 @@
+/*
+** 2014 May 31
+**
+** The author disclaims copyright to this source code.  In place of
+** a legal notice, here is a blessing:
+**
+**    May you do good and not evil.
+**    May you find forgiveness for yourself and forgive others.
+**    May you share freely, never taking more than you give.
+**
+******************************************************************************
+*/
+
+
+#include "fts5Int.h"
+
+/**************************************************************************
+** Start of ascii tokenizer implementation.
+*/
+
+/*
+** For tokenizers with no "unicode" modifier, the set of token characters
+** is the same as the set of ASCII range alphanumeric characters. 
+*/
+static unsigned char aAsciiTokenChar[128] = {
+  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x00..0x0F */
+  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x10..0x1F */
+  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x20..0x2F */
+  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 0, 0, 0, 0, 0, 0,   /* 0x30..0x3F */
+  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40..0x4F */
+  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x50..0x5F */
+  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60..0x6F */
+  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x70..0x7F */
+};
+
+typedef struct AsciiTokenizer AsciiTokenizer;
+struct AsciiTokenizer {
+  unsigned char aTokenChar[128];
+};
+
+static void fts5AsciiAddExceptions(
+  AsciiTokenizer *p, 
+  const char *zArg, 
+  int bTokenChars
+){
+  int i;
+  for(i=0; zArg[i]; i++){
+    if( (zArg[i] & 0x80)==0 ){
+      p->aTokenChar[(int)zArg[i]] = (unsigned char)bTokenChars;
+    }
+  }
+}
+
+/*
+** Delete a "ascii" tokenizer.
+*/
+static void fts5AsciiDelete(Fts5Tokenizer *p){
+  sqlite3_free(p);
+}
+
+/*
+** Create an "ascii" tokenizer.
+*/
+static int fts5AsciiCreate(
+  void *pUnused, 
+  const char **azArg, int nArg,
+  Fts5Tokenizer **ppOut
+){
+  int rc = SQLITE_OK;
+  AsciiTokenizer *p = 0;
+  UNUSED_PARAM(pUnused);
+  if( nArg%2 ){
+    rc = SQLITE_ERROR;
+  }else{
+    p = sqlite3_malloc(sizeof(AsciiTokenizer));
+    if( p==0 ){
+      rc = SQLITE_NOMEM;
+    }else{
+      int i;
+      memset(p, 0, sizeof(AsciiTokenizer));
+      memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
+      for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
+        const char *zArg = azArg[i+1];
+        if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
+          fts5AsciiAddExceptions(p, zArg, 1);
+        }else
+        if( 0==sqlite3_stricmp(azArg[i], "separators") ){
+          fts5AsciiAddExceptions(p, zArg, 0);
+        }else{
+          rc = SQLITE_ERROR;
+        }
+      }
+      if( rc!=SQLITE_OK ){
+        fts5AsciiDelete((Fts5Tokenizer*)p);
+        p = 0;
+      }
+    }
+  }
+
+  *ppOut = (Fts5Tokenizer*)p;
+  return rc;
+}
+
+
+static void asciiFold(char *aOut, const char *aIn, int nByte){
+  int i;
+  for(i=0; i<nByte; i++){
+    char c = aIn[i];
+    if( c>='A' && c<='Z' ) c += 32;
+    aOut[i] = c;
+  }
+}
+
+/*
+** Tokenize some text using the ascii tokenizer.
+*/
+static int fts5AsciiTokenize(
+  Fts5Tokenizer *pTokenizer,
+  void *pCtx,
+  int iUnused,
+  const char *pText, int nText,
+  int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
+){
+  AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
+  int rc = SQLITE_OK;
+  int ie;
+  int is = 0;
+
+  char aFold[64];
+  int nFold = sizeof(aFold);
+  char *pFold = aFold;
+  unsigned char *a = p->aTokenChar;
+
+  UNUSED_PARAM(iUnused);
+
+  while( is<nText && rc==SQLITE_OK ){
+    int nByte;
+
+    /* Skip any leading divider characters. */
+    while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){
+      is++;
+    }
+    if( is==nText ) break;
+
+    /* Count the token characters */
+    ie = is+1;
+    while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){
+      ie++;
+    }
+
+    /* Fold to lower case */
+    nByte = ie-is;
+    if( nByte>nFold ){
+      if( pFold!=aFold ) sqlite3_free(pFold);
+      pFold = sqlite3_malloc64((sqlite3_int64)nByte*2);
+      if( pFold==0 ){
+        rc = SQLITE_NOMEM;
+        break;
+      }
+      nFold = nByte*2;
+    }
+    asciiFold(pFold, &pText[is], nByte);
+
+    /* Invoke the token callback */
+    rc = xToken(pCtx, 0, pFold, nByte, is, ie);
+    is = ie+1;
+  }
+  
+  if( pFold!=aFold ) sqlite3_free(pFold);
+  if( rc==SQLITE_DONE ) rc = SQLITE_OK;
+  return rc;
+}
+
+/**************************************************************************
+** Start of unicode61 tokenizer implementation.
+*/
+
+
+/*
+** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
+** from the sqlite3 source file utf.c. If this file is compiled as part
+** of the amalgamation, they are not required.
+*/
+#ifndef SQLITE_AMALGAMATION
+
+static const unsigned char sqlite3Utf8Trans1[] = {
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
+};
+
+#define READ_UTF8(zIn, zTerm, c)                           \
+  c = *(zIn++);                                            \
+  if( c>=0xc0 ){                                           \
+    c = sqlite3Utf8Trans1[c-0xc0];                         \
+    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
+      c = (c<<6) + (0x3f & *(zIn++));                      \
+    }                                                      \
+    if( c<0x80                                             \
+        || (c&0xFFFFF800)==0xD800                          \
+        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
+  }
+
+
+#define WRITE_UTF8(zOut, c) {                          \
+  if( c<0x00080 ){                                     \
+    *zOut++ = (unsigned char)(c&0xFF);                 \
+  }                                                    \
+  else if( c<0x00800 ){                                \
+    *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F);     \
+    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
+  }                                                    \
+  else if( c<0x10000 ){                                \
+    *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F);    \
+    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
+    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
+  }else{                                               \
+    *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07);  \
+    *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F);  \
+    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
+    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
+  }                                                    \
+}
+
+#endif /* ifndef SQLITE_AMALGAMATION */
+
+typedef struct Unicode61Tokenizer Unicode61Tokenizer;
+struct Unicode61Tokenizer {
+  unsigned char aTokenChar[128];  /* ASCII range token characters */
+  char *aFold;                    /* Buffer to fold text into */
+  int nFold;                      /* Size of aFold[] in bytes */
+  int eRemoveDiacritic;           /* True if remove_diacritics=1 is set */
+  int nException;
+  int *aiException;
+
+  unsigned char aCategory[32];    /* True for token char categories */
+};
+
+/* Values for eRemoveDiacritic (must match internals of fts5_unicode2.c) */
+#define FTS5_REMOVE_DIACRITICS_NONE    0
+#define FTS5_REMOVE_DIACRITICS_SIMPLE  1
+#define FTS5_REMOVE_DIACRITICS_COMPLEX 2
+
+static int fts5UnicodeAddExceptions(
+  Unicode61Tokenizer *p,          /* Tokenizer object */
+  const char *z,                  /* Characters to treat as exceptions */
+  int bTokenChars                 /* 1 for 'tokenchars', 0 for 'separators' */
+){
+  int rc = SQLITE_OK;
+  int n = (int)strlen(z);
+  int *aNew;
+
+  if( n>0 ){
+    aNew = (int*)sqlite3_realloc64(p->aiException,
+                                   (n+p->nException)*sizeof(int));
+    if( aNew ){
+      int nNew = p->nException;
+      const unsigned char *zCsr = (const unsigned char*)z;
+      const unsigned char *zTerm = (const unsigned char*)&z[n];
+      while( zCsr<zTerm ){
+        u32 iCode;
+        int bToken;
+        READ_UTF8(zCsr, zTerm, iCode);
+        if( iCode<128 ){
+          p->aTokenChar[iCode] = (unsigned char)bTokenChars;
+        }else{
+          bToken = p->aCategory[sqlite3Fts5UnicodeCategory(iCode)];
+          assert( (bToken==0 || bToken==1) ); 
+          assert( (bTokenChars==0 || bTokenChars==1) );
+          if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
+            int i;
+            for(i=0; i<nNew; i++){
+              if( (u32)aNew[i]>iCode ) break;
+            }
+            memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int));
+            aNew[i] = iCode;
+            nNew++;
+          }
+        }
+      }
+      p->aiException = aNew;
+      p->nException = nNew;
+    }else{
+      rc = SQLITE_NOMEM;
+    }
+  }
+
+  return rc;
+}
+
+/*
+** Return true if the p->aiException[] array contains the value iCode.
+*/
+static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){
+  if( p->nException>0 ){
+    int *a = p->aiException;
+    int iLo = 0;
+    int iHi = p->nException-1;
+
+    while( iHi>=iLo ){
+      int iTest = (iHi + iLo) / 2;
+      if( iCode==a[iTest] ){
+        return 1;
+      }else if( iCode>a[iTest] ){
+        iLo = iTest+1;
+      }else{
+        iHi = iTest-1;
+      }
+    }
+  }
+
+  return 0;
+}
+
+/*
+** Delete a "unicode61" tokenizer.
+*/
+static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
+  if( pTok ){
+    Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
+    sqlite3_free(p->aiException);
+    sqlite3_free(p->aFold);
+    sqlite3_free(p);
+  }
+  return;
+}
+
+static int unicodeSetCategories(Unicode61Tokenizer *p, const char *zCat){
+  const char *z = zCat;
+
+  while( *z ){
+    while( *z==' ' || *z=='\t' ) z++;
+    if( *z && sqlite3Fts5UnicodeCatParse(z, p->aCategory) ){
+      return SQLITE_ERROR;
+    }
+    while( *z!=' ' && *z!='\t' && *z!='\0' ) z++;
+  }
+
+  sqlite3Fts5UnicodeAscii(p->aCategory, p->aTokenChar);
+  return SQLITE_OK;
+}
+
+/*
+** Create a "unicode61" tokenizer.
+*/
+static int fts5UnicodeCreate(
+  void *pUnused, 
+  const char **azArg, int nArg,
+  Fts5Tokenizer **ppOut
+){
+  int rc = SQLITE_OK;             /* Return code */
+  Unicode61Tokenizer *p = 0;      /* New tokenizer object */ 
+
+  UNUSED_PARAM(pUnused);
+
+  if( nArg%2 ){
+    rc = SQLITE_ERROR;
+  }else{
+    p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer));
+    if( p ){
+      const char *zCat = "L* N* Co";
+      int i;
+      memset(p, 0, sizeof(Unicode61Tokenizer));
+
+      p->eRemoveDiacritic = FTS5_REMOVE_DIACRITICS_SIMPLE;
+      p->nFold = 64;
+      p->aFold = sqlite3_malloc64(p->nFold * sizeof(char));
+      if( p->aFold==0 ){
+        rc = SQLITE_NOMEM;
+      }
+
+      /* Search for a "categories" argument */
+      for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
+        if( 0==sqlite3_stricmp(azArg[i], "categories") ){
+          zCat = azArg[i+1];
+        }
+      }
+
+      if( rc==SQLITE_OK ){
+        rc = unicodeSetCategories(p, zCat);
+      }
+
+      for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
+        const char *zArg = azArg[i+1];
+        if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
+          if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
+            rc = SQLITE_ERROR;
+          }else{
+            p->eRemoveDiacritic = (zArg[0] - '0');
+            assert( p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_NONE
+                 || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_SIMPLE
+                 || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_COMPLEX
+            );
+          }
+        }else
+        if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
+          rc = fts5UnicodeAddExceptions(p, zArg, 1);
+        }else
+        if( 0==sqlite3_stricmp(azArg[i], "separators") ){
+          rc = fts5UnicodeAddExceptions(p, zArg, 0);
+        }else
+        if( 0==sqlite3_stricmp(azArg[i], "categories") ){
+          /* no-op */
+        }else{
+          rc = SQLITE_ERROR;
+        }
+      }
+
+    }else{
+      rc = SQLITE_NOMEM;
+    }
+    if( rc!=SQLITE_OK ){
+      fts5UnicodeDelete((Fts5Tokenizer*)p);
+      p = 0;
+    }
+    *ppOut = (Fts5Tokenizer*)p;
+  }
+  return rc;
+}
+
+/*
+** Return true if, for the purposes of tokenizing with the tokenizer
+** passed as the first argument, codepoint iCode is considered a token 
+** character (not a separator).
+*/
+static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){
+  return (
+    p->aCategory[sqlite3Fts5UnicodeCategory((u32)iCode)]
+    ^ fts5UnicodeIsException(p, iCode)
+  );
+}
+
+static int fts5UnicodeTokenize(
+  Fts5Tokenizer *pTokenizer,
+  void *pCtx,
+  int iUnused,
+  const char *pText, int nText,
+  int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
+){
+  Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
+  int rc = SQLITE_OK;
+  unsigned char *a = p->aTokenChar;
+
+  unsigned char *zTerm = (unsigned char*)&pText[nText];
+  unsigned char *zCsr = (unsigned char *)pText;
+
+  /* Output buffer */
+  char *aFold = p->aFold;
+  int nFold = p->nFold;
+  const char *pEnd = &aFold[nFold-6];
+
+  UNUSED_PARAM(iUnused);
+
+  /* Each iteration of this loop gobbles up a contiguous run of separators,
+  ** then the next token.  */
+  while( rc==SQLITE_OK ){
+    u32 iCode;                    /* non-ASCII codepoint read from input */
+    char *zOut = aFold;
+    int is;
+    int ie;
+
+    /* Skip any separator characters. */
+    while( 1 ){
+      if( zCsr>=zTerm ) goto tokenize_done;
+      if( *zCsr & 0x80 ) {
+        /* A character outside of the ascii range. Skip past it if it is
+        ** a separator character. Or break out of the loop if it is not. */
+        is = zCsr - (unsigned char*)pText;
+        READ_UTF8(zCsr, zTerm, iCode);
+        if( fts5UnicodeIsAlnum(p, iCode) ){
+          goto non_ascii_tokenchar;
+        }
+      }else{
+        if( a[*zCsr] ){
+          is = zCsr - (unsigned char*)pText;
+          goto ascii_tokenchar;
+        }
+        zCsr++;
+      }
+    }
+
+    /* Run through the tokenchars. Fold them into the output buffer along
+    ** the way.  */
+    while( zCsr<zTerm ){
+
+      /* Grow the output buffer so that there is sufficient space to fit the
+      ** largest possible utf-8 character.  */
+      if( zOut>pEnd ){
+        aFold = sqlite3_malloc64((sqlite3_int64)nFold*2);
+        if( aFold==0 ){
+          rc = SQLITE_NOMEM;
+          goto tokenize_done;
+        }
+        zOut = &aFold[zOut - p->aFold];
+        memcpy(aFold, p->aFold, nFold);
+        sqlite3_free(p->aFold);
+        p->aFold = aFold;
+        p->nFold = nFold = nFold*2;
+        pEnd = &aFold[nFold-6];
+      }
+
+      if( *zCsr & 0x80 ){
+        /* An non-ascii-range character. Fold it into the output buffer if
+        ** it is a token character, or break out of the loop if it is not. */
+        READ_UTF8(zCsr, zTerm, iCode);
+        if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
+ non_ascii_tokenchar:
+          iCode = sqlite3Fts5UnicodeFold(iCode, p->eRemoveDiacritic);
+          if( iCode ) WRITE_UTF8(zOut, iCode);
+        }else{
+          break;
+        }
+      }else if( a[*zCsr]==0 ){
+        /* An ascii-range separator character. End of token. */
+        break; 
+      }else{
+ ascii_tokenchar:
+        if( *zCsr>='A' && *zCsr<='Z' ){
+          *zOut++ = *zCsr + 32;
+        }else{
+          *zOut++ = *zCsr;
+        }
+        zCsr++;
+      }
+      ie = zCsr - (unsigned char*)pText;
+    }
+
+    /* Invoke the token callback */
+    rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie); 
+  }
+  
+ tokenize_done:
+  if( rc==SQLITE_DONE ) rc = SQLITE_OK;
+  return rc;
+}
+
+/**************************************************************************
+** Start of porter stemmer implementation.
+*/
+
+/* Any tokens larger than this (in bytes) are passed through without
+** stemming. */
+#define FTS5_PORTER_MAX_TOKEN 64
+
+typedef struct PorterTokenizer PorterTokenizer;
+struct PorterTokenizer {
+  fts5_tokenizer tokenizer;       /* Parent tokenizer module */
+  Fts5Tokenizer *pTokenizer;      /* Parent tokenizer instance */
+  char aBuf[FTS5_PORTER_MAX_TOKEN + 64];
+};
+
+/*
+** Delete a "porter" tokenizer.
+*/
+static void fts5PorterDelete(Fts5Tokenizer *pTok){
+  if( pTok ){
+    PorterTokenizer *p = (PorterTokenizer*)pTok;
+    if( p->pTokenizer ){
+      p->tokenizer.xDelete(p->pTokenizer);
+    }
+    sqlite3_free(p);
+  }
+}
+
+/*
+** Create a "porter" tokenizer.
+*/
+static int fts5PorterCreate(
+  void *pCtx, 
+  const char **azArg, int nArg,
+  Fts5Tokenizer **ppOut
+){
+  fts5_api *pApi = (fts5_api*)pCtx;
+  int rc = SQLITE_OK;
+  PorterTokenizer *pRet;
+  void *pUserdata = 0;
+  const char *zBase = "unicode61";
+
+  if( nArg>0 ){
+    zBase = azArg[0];
+  }
+
+  pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
+  if( pRet ){
+    memset(pRet, 0, sizeof(PorterTokenizer));
+    rc = pApi->xFindTokenizer(pApi, zBase, &pUserdata, &pRet->tokenizer);
+  }else{
+    rc = SQLITE_NOMEM;
+  }
+  if( rc==SQLITE_OK ){
+    int nArg2 = (nArg>0 ? nArg-1 : 0);
+    const char **azArg2 = (nArg2 ? &azArg[1] : 0);
+    rc = pRet->tokenizer.xCreate(pUserdata, azArg2, nArg2, &pRet->pTokenizer);
+  }
+
+  if( rc!=SQLITE_OK ){
+    fts5PorterDelete((Fts5Tokenizer*)pRet);
+    pRet = 0;
+  }
+  *ppOut = (Fts5Tokenizer*)pRet;
+  return rc;
+}
+
+typedef struct PorterContext PorterContext;
+struct PorterContext {
+  void *pCtx;
+  int (*xToken)(void*, int, const char*, int, int, int);
+  char *aBuf;
+};
+
+typedef struct PorterRule PorterRule;
+struct PorterRule {
+  const char *zSuffix;
+  int nSuffix;
+  int (*xCond)(char *zStem, int nStem);
+  const char *zOutput;
+  int nOutput;
+};
+
+#if 0
+static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
+  int ret = -1;
+  int nBuf = *pnBuf;
+  PorterRule *p;
+
+  for(p=aRule; p->zSuffix; p++){
+    assert( strlen(p->zSuffix)==p->nSuffix );
+    assert( strlen(p->zOutput)==p->nOutput );
+    if( nBuf<p->nSuffix ) continue;
+    if( 0==memcmp(&aBuf[nBuf - p->nSuffix], p->zSuffix, p->nSuffix) ) break;
+  }
+
+  if( p->zSuffix ){
+    int nStem = nBuf - p->nSuffix;
+    if( p->xCond==0 || p->xCond(aBuf, nStem) ){
+      memcpy(&aBuf[nStem], p->zOutput, p->nOutput);
+      *pnBuf = nStem + p->nOutput;
+      ret = p - aRule;
+    }
+  }
+
+  return ret;
+}
+#endif
+
+static int fts5PorterIsVowel(char c, int bYIsVowel){
+  return (
+      c=='a' || c=='e' || c=='i' || c=='o' || c=='u' || (bYIsVowel && c=='y')
+  );
+}
+
+static int fts5PorterGobbleVC(char *zStem, int nStem, int bPrevCons){
+  int i;
+  int bCons = bPrevCons;
+
+  /* Scan for a vowel */
+  for(i=0; i<nStem; i++){
+    if( 0==(bCons = !fts5PorterIsVowel(zStem[i], bCons)) ) break;
+  }
+
+  /* Scan for a consonent */
+  for(i++; i<nStem; i++){
+    if( (bCons = !fts5PorterIsVowel(zStem[i], bCons)) ) return i+1;
+  }
+  return 0;
+}
+
+/* porter rule condition: (m > 0) */
+static int fts5Porter_MGt0(char *zStem, int nStem){
+  return !!fts5PorterGobbleVC(zStem, nStem, 0);
+}
+
+/* porter rule condition: (m > 1) */
+static int fts5Porter_MGt1(char *zStem, int nStem){
+  int n;
+  n = fts5PorterGobbleVC(zStem, nStem, 0);
+  if( n && fts5PorterGobbleVC(&zStem[n], nStem-n, 1) ){
+    return 1;
+  }
+  return 0;
+}
+
+/* porter rule condition: (m = 1) */
+static int fts5Porter_MEq1(char *zStem, int nStem){
+  int n;
+  n = fts5PorterGobbleVC(zStem, nStem, 0);
+  if( n && 0==fts5PorterGobbleVC(&zStem[n], nStem-n, 1) ){
+    return 1;
+  }
+  return 0;
+}
+
+/* porter rule condition: (*o) */
+static int fts5Porter_Ostar(char *zStem, int nStem){
+  if( zStem[nStem-1]=='w' || zStem[nStem-1]=='x' || zStem[nStem-1]=='y' ){
+    return 0;
+  }else{
+    int i;
+    int mask = 0;
+    int bCons = 0;
+    for(i=0; i<nStem; i++){
+      bCons = !fts5PorterIsVowel(zStem[i], bCons);
+      assert( bCons==0 || bCons==1 );
+      mask = (mask << 1) + bCons;
+    }
+    return ((mask & 0x0007)==0x0005);
+  }
+}
+
+/* porter rule condition: (m > 1 and (*S or *T)) */
+static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
+  assert( nStem>0 );
+  return (zStem[nStem-1]=='s' || zStem[nStem-1]=='t') 
+      && fts5Porter_MGt1(zStem, nStem);
+}
+
+/* porter rule condition: (*v*) */
+static int fts5Porter_Vowel(char *zStem, int nStem){
+  int i;
+  for(i=0; i<nStem; i++){
+    if( fts5PorterIsVowel(zStem[i], i>0) ){
+      return 1;
+    }
+  }
+  return 0;
+}
+
+
+/**************************************************************************
+***************************************************************************
+** GENERATED CODE STARTS HERE (mkportersteps.tcl)
+*/
+
+static int fts5PorterStep4(char *aBuf, int *pnBuf){
+  int ret = 0;
+  int nBuf = *pnBuf;
+  switch( aBuf[nBuf-2] ){
+    
+    case 'a': 
+      if( nBuf>2 && 0==memcmp("al", &aBuf[nBuf-2], 2) ){
+        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
+          *pnBuf = nBuf - 2;
+        }
+      }
+      break;
+  
+    case 'c': 
+      if( nBuf>4 && 0==memcmp("ance", &aBuf[nBuf-4], 4) ){
+        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
+          *pnBuf = nBuf - 4;
+        }
+      }else if( nBuf>4 && 0==memcmp("ence", &aBuf[nBuf-4], 4) ){
+        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
+          *pnBuf = nBuf - 4;
+        }
+      }
+      break;
+  
+    case 'e': 
+      if( nBuf>2 && 0==memcmp("er", &aBuf[nBuf-2], 2) ){
+        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
+          *pnBuf = nBuf - 2;
+        }
+      }
+      break;
+  
+    case 'i': 
+      if( nBuf>2 && 0==memcmp("ic", &aBuf[nBuf-2], 2) ){
+        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
+          *pnBuf = nBuf - 2;
+        }
+      }
+      break;
+  
+    case 'l': 
+      if( nBuf>4 && 0==memcmp("able", &aBuf[nBuf-4], 4) ){
+        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
+          *pnBuf = nBuf - 4;
+        }
+      }else if( nBuf>4 && 0==memcmp("ible", &aBuf[nBuf-4], 4) ){
+        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
+          *pnBuf = nBuf - 4;
+        }
+      }
+      break;
+  
+    case 'n': 
+      if( nBuf>3 && 0==memcmp("ant", &aBuf[nBuf-3], 3) ){
+        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
+          *pnBuf = nBuf - 3;
+        }
+      }else if( nBuf>5 && 0==memcmp("ement", &aBuf[nBuf-5], 5) ){
+        if( fts5Porter_MGt1(aBuf, nBuf-5) ){
+          *pnBuf = nBuf - 5;
+        }
+      }else if( nBuf>4 && 0==memcmp("ment", &aBuf[nBuf-4], 4) ){
+        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
+          *pnBuf = nBuf - 4;
+        }
+      }else if( nBuf>3 && 0==memcmp("ent", &aBuf[nBuf-3], 3) ){
+        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
+          *pnBuf = nBuf - 3;
+        }
+      }
+      break;
+  
+    case 'o': 
+      if( nBuf>3 && 0==memcmp("ion", &aBuf[nBuf-3], 3) ){
+        if( fts5Porter_MGt1_and_S_or_T(aBuf, nBuf-3) ){
+          *pnBuf = nBuf - 3;
+        }
+      }else if( nBuf>2 && 0==memcmp("ou", &aBuf[nBuf-2], 2) ){
+        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
+          *pnBuf = nBuf - 2;
+        }
+      }
+      break;
+  
+    case 's': 
+      if( nBuf>3 && 0==memcmp("ism", &aBuf[nBuf-3], 3) ){
+        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
+          *pnBuf = nBuf - 3;
+        }
+      }
+      break;
+  
+    case 't': 
+      if( nBuf>3 && 0==memcmp("ate", &aBuf[nBuf-3], 3) ){
+        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
+          *pnBuf = nBuf - 3;
+        }
+      }else if( nBuf>3 && 0==memcmp("iti", &aBuf[nBuf-3], 3) ){
+        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
+          *pnBuf = nBuf - 3;
+        }
+      }
+      break;
+  
+    case 'u': 
+      if( nBuf>3 && 0==memcmp("ous", &aBuf[nBuf-3], 3) ){
+        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
+          *pnBuf = nBuf - 3;
+        }
+      }
+      break;
+  
+    case 'v': 
+      if( nBuf>3 && 0==memcmp("ive", &aBuf[nBuf-3], 3) ){
+        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
+          *pnBuf = nBuf - 3;
+        }
+      }
+      break;
+  
+    case 'z': 
+      if( nBuf>3 && 0==memcmp("ize", &aBuf[nBuf-3], 3) ){
+        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
+          *pnBuf = nBuf - 3;
+        }
+      }
+      break;
+  
+  }
+  return ret;
+}
+  
+
+static int fts5PorterStep1B2(char *aBuf, int *pnBuf){
+  int ret = 0;
+  int nBuf = *pnBuf;
+  switch( aBuf[nBuf-2] ){
+    
+    case 'a': 
+      if( nBuf>2 && 0==memcmp("at", &aBuf[nBuf-2], 2) ){
+        memcpy(&aBuf[nBuf-2], "ate", 3);
+        *pnBuf = nBuf - 2 + 3;
+        ret = 1;
+      }
+      break;
+  
+    case 'b': 
+      if( nBuf>2 && 0==memcmp("bl", &aBuf[nBuf-2], 2) ){
+        memcpy(&aBuf[nBuf-2], "ble", 3);
+        *pnBuf = nBuf - 2 + 3;
+        ret = 1;
+      }
+      break;
+  
+    case 'i': 
+      if( nBuf>2 && 0==memcmp("iz", &aBuf[nBuf-2], 2) ){
+        memcpy(&aBuf[nBuf-2], "ize", 3);
+        *pnBuf = nBuf - 2 + 3;
+        ret = 1;
+      }
+      break;
+  
+  }
+  return ret;
+}
+  
+
+static int fts5PorterStep2(char *aBuf, int *pnBuf){
+  int ret = 0;
+  int nBuf = *pnBuf;
+  switch( aBuf[nBuf-2] ){
+    
+    case 'a': 
+      if( nBuf>7 && 0==memcmp("ational", &aBuf[nBuf-7], 7) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
+          memcpy(&aBuf[nBuf-7], "ate", 3);
+          *pnBuf = nBuf - 7 + 3;
+        }
+      }else if( nBuf>6 && 0==memcmp("tional", &aBuf[nBuf-6], 6) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-6) ){
+          memcpy(&aBuf[nBuf-6], "tion", 4);
+          *pnBuf = nBuf - 6 + 4;
+        }
+      }
+      break;
+  
+    case 'c': 
+      if( nBuf>4 && 0==memcmp("enci", &aBuf[nBuf-4], 4) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
+          memcpy(&aBuf[nBuf-4], "ence", 4);
+          *pnBuf = nBuf - 4 + 4;
+        }
+      }else if( nBuf>4 && 0==memcmp("anci", &aBuf[nBuf-4], 4) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
+          memcpy(&aBuf[nBuf-4], "ance", 4);
+          *pnBuf = nBuf - 4 + 4;
+        }
+      }
+      break;
+  
+    case 'e': 
+      if( nBuf>4 && 0==memcmp("izer", &aBuf[nBuf-4], 4) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
+          memcpy(&aBuf[nBuf-4], "ize", 3);
+          *pnBuf = nBuf - 4 + 3;
+        }
+      }
+      break;
+  
+    case 'g': 
+      if( nBuf>4 && 0==memcmp("logi", &aBuf[nBuf-4], 4) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
+          memcpy(&aBuf[nBuf-4], "log", 3);
+          *pnBuf = nBuf - 4 + 3;
+        }
+      }
+      break;
+  
+    case 'l': 
+      if( nBuf>3 && 0==memcmp("bli", &aBuf[nBuf-3], 3) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
+          memcpy(&aBuf[nBuf-3], "ble", 3);
+          *pnBuf = nBuf - 3 + 3;
+        }
+      }else if( nBuf>4 && 0==memcmp("alli", &aBuf[nBuf-4], 4) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
+          memcpy(&aBuf[nBuf-4], "al", 2);
+          *pnBuf = nBuf - 4 + 2;
+        }
+      }else if( nBuf>5 && 0==memcmp("entli", &aBuf[nBuf-5], 5) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
+          memcpy(&aBuf[nBuf-5], "ent", 3);
+          *pnBuf = nBuf - 5 + 3;
+        }
+      }else if( nBuf>3 && 0==memcmp("eli", &aBuf[nBuf-3], 3) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
+          memcpy(&aBuf[nBuf-3], "e", 1);
+          *pnBuf = nBuf - 3 + 1;
+        }
+      }else if( nBuf>5 && 0==memcmp("ousli", &aBuf[nBuf-5], 5) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
+          memcpy(&aBuf[nBuf-5], "ous", 3);
+          *pnBuf = nBuf - 5 + 3;
+        }
+      }
+      break;
+  
+    case 'o': 
+      if( nBuf>7 && 0==memcmp("ization", &aBuf[nBuf-7], 7) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
+          memcpy(&aBuf[nBuf-7], "ize", 3);
+          *pnBuf = nBuf - 7 + 3;
+        }
+      }else if( nBuf>5 && 0==memcmp("ation", &aBuf[nBuf-5], 5) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
+          memcpy(&aBuf[nBuf-5], "ate", 3);
+          *pnBuf = nBuf - 5 + 3;
+        }
+      }else if( nBuf>4 && 0==memcmp("ator", &aBuf[nBuf-4], 4) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
+          memcpy(&aBuf[nBuf-4], "ate", 3);
+          *pnBuf = nBuf - 4 + 3;
+        }
+      }
+      break;
+  
+    case 's': 
+      if( nBuf>5 && 0==memcmp("alism", &aBuf[nBuf-5], 5) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
+          memcpy(&aBuf[nBuf-5], "al", 2);
+          *pnBuf = nBuf - 5 + 2;
+        }
+      }else if( nBuf>7 && 0==memcmp("iveness", &aBuf[nBuf-7], 7) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
+          memcpy(&aBuf[nBuf-7], "ive", 3);
+          *pnBuf = nBuf - 7 + 3;
+        }
+      }else if( nBuf>7 && 0==memcmp("fulness", &aBuf[nBuf-7], 7) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
+          memcpy(&aBuf[nBuf-7], "ful", 3);
+          *pnBuf = nBuf - 7 + 3;
+        }
+      }else if( nBuf>7 && 0==memcmp("ousness", &aBuf[nBuf-7], 7) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
+          memcpy(&aBuf[nBuf-7], "ous", 3);
+          *pnBuf = nBuf - 7 + 3;
+        }
+      }
+      break;
+  
+    case 't': 
+      if( nBuf>5 && 0==memcmp("aliti", &aBuf[nBuf-5], 5) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
+          memcpy(&aBuf[nBuf-5], "al", 2);
+          *pnBuf = nBuf - 5 + 2;
+        }
+      }else if( nBuf>5 && 0==memcmp("iviti", &aBuf[nBuf-5], 5) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
+          memcpy(&aBuf[nBuf-5], "ive", 3);
+          *pnBuf = nBuf - 5 + 3;
+        }
+      }else if( nBuf>6 && 0==memcmp("biliti", &aBuf[nBuf-6], 6) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-6) ){
+          memcpy(&aBuf[nBuf-6], "ble", 3);
+          *pnBuf = nBuf - 6 + 3;
+        }
+      }
+      break;
+  
+  }
+  return ret;
+}
+  
+
+static int fts5PorterStep3(char *aBuf, int *pnBuf){
+  int ret = 0;
+  int nBuf = *pnBuf;
+  switch( aBuf[nBuf-2] ){
+    
+    case 'a': 
+      if( nBuf>4 && 0==memcmp("ical", &aBuf[nBuf-4], 4) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
+          memcpy(&aBuf[nBuf-4], "ic", 2);
+          *pnBuf = nBuf - 4 + 2;
+        }
+      }
+      break;
+  
+    case 's': 
+      if( nBuf>4 && 0==memcmp("ness", &aBuf[nBuf-4], 4) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
+          *pnBuf = nBuf - 4;
+        }
+      }
+      break;
+  
+    case 't': 
+      if( nBuf>5 && 0==memcmp("icate", &aBuf[nBuf-5], 5) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
+          memcpy(&aBuf[nBuf-5], "ic", 2);
+          *pnBuf = nBuf - 5 + 2;
+        }
+      }else if( nBuf>5 && 0==memcmp("iciti", &aBuf[nBuf-5], 5) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
+          memcpy(&aBuf[nBuf-5], "ic", 2);
+          *pnBuf = nBuf - 5 + 2;
+        }
+      }
+      break;
+  
+    case 'u': 
+      if( nBuf>3 && 0==memcmp("ful", &aBuf[nBuf-3], 3) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
+          *pnBuf = nBuf - 3;
+        }
+      }
+      break;
+  
+    case 'v': 
+      if( nBuf>5 && 0==memcmp("ative", &aBuf[nBuf-5], 5) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
+          *pnBuf = nBuf - 5;
+        }
+      }
+      break;
+  
+    case 'z': 
+      if( nBuf>5 && 0==memcmp("alize", &aBuf[nBuf-5], 5) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
+          memcpy(&aBuf[nBuf-5], "al", 2);
+          *pnBuf = nBuf - 5 + 2;
+        }
+      }
+      break;
+  
+  }
+  return ret;
+}
+  
+
+static int fts5PorterStep1B(char *aBuf, int *pnBuf){
+  int ret = 0;
+  int nBuf = *pnBuf;
+  switch( aBuf[nBuf-2] ){
+    
+    case 'e': 
+      if( nBuf>3 && 0==memcmp("eed", &aBuf[nBuf-3], 3) ){
+        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
+          memcpy(&aBuf[nBuf-3], "ee", 2);
+          *pnBuf = nBuf - 3 + 2;
+        }
+      }else if( nBuf>2 && 0==memcmp("ed", &aBuf[nBuf-2], 2) ){
+        if( fts5Porter_Vowel(aBuf, nBuf-2) ){
+          *pnBuf = nBuf - 2;
+          ret = 1;
+        }
+      }
+      break;
+  
+    case 'n': 
+      if( nBuf>3 && 0==memcmp("ing", &aBuf[nBuf-3], 3) ){
+        if( fts5Porter_Vowel(aBuf, nBuf-3) ){
+          *pnBuf = nBuf - 3;
+          ret = 1;
+        }
+      }
+      break;
+  
+  }
+  return ret;
+}
+  
+/* 
+** GENERATED CODE ENDS HERE (mkportersteps.tcl)
+***************************************************************************
+**************************************************************************/
+
+static void fts5PorterStep1A(char *aBuf, int *pnBuf){
+  int nBuf = *pnBuf;
+  if( aBuf[nBuf-1]=='s' ){
+    if( aBuf[nBuf-2]=='e' ){
+      if( (nBuf>4 && aBuf[nBuf-4]=='s' && aBuf[nBuf-3]=='s') 
+       || (nBuf>3 && aBuf[nBuf-3]=='i' )
+      ){
+        *pnBuf = nBuf-2;
+      }else{
+        *pnBuf = nBuf-1;
+      }
+    }
+    else if( aBuf[nBuf-2]!='s' ){
+      *pnBuf = nBuf-1;
+    }
+  }
+}
+
+static int fts5PorterCb(
+  void *pCtx, 
+  int tflags,
+  const char *pToken, 
+  int nToken, 
+  int iStart, 
+  int iEnd
+){
+  PorterContext *p = (PorterContext*)pCtx;
+
+  char *aBuf;
+  int nBuf;
+
+  if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through;
+  aBuf = p->aBuf;
+  nBuf = nToken;
+  memcpy(aBuf, pToken, nBuf);
+
+  /* Step 1. */
+  fts5PorterStep1A(aBuf, &nBuf);
+  if( fts5PorterStep1B(aBuf, &nBuf) ){
+    if( fts5PorterStep1B2(aBuf, &nBuf)==0 ){
+      char c = aBuf[nBuf-1];
+      if( fts5PorterIsVowel(c, 0)==0 
+       && c!='l' && c!='s' && c!='z' && c==aBuf[nBuf-2] 
+      ){
+        nBuf--;
+      }else if( fts5Porter_MEq1(aBuf, nBuf) && fts5Porter_Ostar(aBuf, nBuf) ){
+        aBuf[nBuf++] = 'e';
+      }
+    }
+  }
+
+  /* Step 1C. */
+  if( aBuf[nBuf-1]=='y' && fts5Porter_Vowel(aBuf, nBuf-1) ){
+    aBuf[nBuf-1] = 'i';
+  }
+
+  /* Steps 2 through 4. */
+  fts5PorterStep2(aBuf, &nBuf);
+  fts5PorterStep3(aBuf, &nBuf);
+  fts5PorterStep4(aBuf, &nBuf);
+
+  /* Step 5a. */
+  assert( nBuf>0 );
+  if( aBuf[nBuf-1]=='e' ){
+    if( fts5Porter_MGt1(aBuf, nBuf-1) 
+     || (fts5Porter_MEq1(aBuf, nBuf-1) && !fts5Porter_Ostar(aBuf, nBuf-1))
+    ){
+      nBuf--;
+    }
+  }
+
+  /* Step 5b. */
+  if( nBuf>1 && aBuf[nBuf-1]=='l' 
+   && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1) 
+  ){
+    nBuf--;
+  }
+
+  return p->xToken(p->pCtx, tflags, aBuf, nBuf, iStart, iEnd);
+
+ pass_through:
+  return p->xToken(p->pCtx, tflags, pToken, nToken, iStart, iEnd);
+}
+
+/*
+** Tokenize using the porter tokenizer.
+*/
+static int fts5PorterTokenize(
+  Fts5Tokenizer *pTokenizer,
+  void *pCtx,
+  int flags,
+  const char *pText, int nText,
+  int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
+){
+  PorterTokenizer *p = (PorterTokenizer*)pTokenizer;
+  PorterContext sCtx;
+  sCtx.xToken = xToken;
+  sCtx.pCtx = pCtx;
+  sCtx.aBuf = p->aBuf;
+  return p->tokenizer.xTokenize(
+      p->pTokenizer, (void*)&sCtx, flags, pText, nText, fts5PorterCb
+  );
+}
+
+/**************************************************************************
+** Start of trigram implementation.
+*/
+typedef struct TrigramTokenizer TrigramTokenizer;
+struct TrigramTokenizer {
+  int bFold;                      /* True to fold to lower-case */
+};
+
+/*
+** Free a trigram tokenizer.
+*/
+static void fts5TriDelete(Fts5Tokenizer *p){
+  sqlite3_free(p);
+}
+
+/*
+** Allocate a trigram tokenizer.
+*/
+static int fts5TriCreate(
+  void *pUnused,
+  const char **azArg,
+  int nArg,
+  Fts5Tokenizer **ppOut
+){
+  int rc = SQLITE_OK;
+  TrigramTokenizer *pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew));
+  UNUSED_PARAM(pUnused);
+  if( pNew==0 ){
+    rc = SQLITE_NOMEM;
+  }else{
+    int i;
+    pNew->bFold = 1;
+    for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
+      const char *zArg = azArg[i+1];
+      if( 0==sqlite3_stricmp(azArg[i], "case_sensitive") ){
+        if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
+          rc = SQLITE_ERROR;
+        }else{
+          pNew->bFold = (zArg[0]=='0');
+        }
+      }else{
+        rc = SQLITE_ERROR;
+      }
+    }
+    if( rc!=SQLITE_OK ){
+      fts5TriDelete((Fts5Tokenizer*)pNew);
+      pNew = 0;
+    }
+  }
+  *ppOut = (Fts5Tokenizer*)pNew;
+  return rc;
+}
+
+/*
+** Trigram tokenizer tokenize routine.
+*/
+static int fts5TriTokenize(
+  Fts5Tokenizer *pTok,
+  void *pCtx,
+  int unusedFlags,
+  const char *pText, int nText,
+  int (*xToken)(void*, int, const char*, int, int, int)
+){
+  TrigramTokenizer *p = (TrigramTokenizer*)pTok;
+  int rc = SQLITE_OK;
+  char aBuf[32];
+  const unsigned char *zIn = (const unsigned char*)pText;
+  const unsigned char *zEof = &zIn[nText];
+  u32 iCode;
+
+  UNUSED_PARAM(unusedFlags);
+  while( 1 ){
+    char *zOut = aBuf;
+    int iStart = zIn - (const unsigned char*)pText;
+    const unsigned char *zNext; 
+
+    READ_UTF8(zIn, zEof, iCode);
+    if( iCode==0 ) break;
+    zNext = zIn;
+    if( zIn<zEof ){
+      if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0);
+      WRITE_UTF8(zOut, iCode);
+      READ_UTF8(zIn, zEof, iCode);
+      if( iCode==0 ) break;
+    }else{
+      break;
+    }
+    if( zIn<zEof ){
+      if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0);
+      WRITE_UTF8(zOut, iCode);
+      READ_UTF8(zIn, zEof, iCode);
+      if( iCode==0 ) break;
+      if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0);
+      WRITE_UTF8(zOut, iCode);
+    }else{
+      break;
+    }
+    rc = xToken(pCtx, 0, aBuf, zOut-aBuf, iStart, iStart + zOut-aBuf);
+    if( rc!=SQLITE_OK ) break;
+    zIn = zNext;
+  }
+
+  return rc;
+}
+
+/*
+** Argument xCreate is a pointer to a constructor function for a tokenizer.
+** pTok is a tokenizer previously created using the same method. This function
+** returns one of FTS5_PATTERN_NONE, FTS5_PATTERN_LIKE or FTS5_PATTERN_GLOB
+** indicating the style of pattern matching that the tokenizer can support.
+** In practice, this is:
+**
+**     "trigram" tokenizer, case_sensitive=1 - FTS5_PATTERN_GLOB
+**     "trigram" tokenizer, case_sensitive=0 (the default) - FTS5_PATTERN_LIKE
+**     all other tokenizers - FTS5_PATTERN_NONE
+*/
+int sqlite3Fts5TokenizerPattern(
+    int (*xCreate)(void*, const char**, int, Fts5Tokenizer**),
+    Fts5Tokenizer *pTok
+){
+  if( xCreate==fts5TriCreate ){
+    TrigramTokenizer *p = (TrigramTokenizer*)pTok;
+    return p->bFold ? FTS5_PATTERN_LIKE : FTS5_PATTERN_GLOB;
+  }
+  return FTS5_PATTERN_NONE;
+}
+
+/*
+** Register all built-in tokenizers with FTS5.
+*/
+int sqlite3Fts5TokenizerInit(fts5_api *pApi){
+  struct BuiltinTokenizer {
+    const char *zName;
+    fts5_tokenizer x;
+  } aBuiltin[] = {
+    { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
+    { "ascii",     {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }},
+    { "porter",    {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
+    { "trigram",   {fts5TriCreate, fts5TriDelete, fts5TriTokenize}},
+  };
+  
+  int rc = SQLITE_OK;             /* Return code */
+  int i;                          /* To iterate through builtin functions */
+
+  for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){
+    rc = pApi->xCreateTokenizer(pApi,
+        aBuiltin[i].zName,
+        (void*)pApi,
+        &aBuiltin[i].x,
+        0
+    );
+  }
+
+  return rc;
+}