diff options
Diffstat (limited to 'ext/fts2/fts2_tokenizer.c')
-rw-r--r-- | ext/fts2/fts2_tokenizer.c | 375 |
1 files changed, 375 insertions, 0 deletions
diff --git a/ext/fts2/fts2_tokenizer.c b/ext/fts2/fts2_tokenizer.c new file mode 100644 index 0000000..dda33a7 --- /dev/null +++ b/ext/fts2/fts2_tokenizer.c @@ -0,0 +1,375 @@ +/* +** 2007 June 22 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +****************************************************************************** +** +** This is part of an SQLite module implementing full-text search. +** This particular file implements the generic tokenizer interface. +*/ + +/* +** The code in this file is only compiled if: +** +** * The FTS2 module is being built as an extension +** (in which case SQLITE_CORE is not defined), or +** +** * The FTS2 module is being built into the core of +** SQLite (in which case SQLITE_ENABLE_FTS2 is defined). +*/ +#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) + + +#include "sqlite3.h" +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT3 + +#include "fts2_hash.h" +#include "fts2_tokenizer.h" +#include <assert.h> + +/* +** Implementation of the SQL scalar function for accessing the underlying +** hash table. This function may be called as follows: +** +** SELECT <function-name>(<key-name>); +** SELECT <function-name>(<key-name>, <pointer>); +** +** where <function-name> is the name passed as the second argument +** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer'). +** +** If the <pointer> argument is specified, it must be a blob value +** containing a pointer to be stored as the hash data corresponding +** to the string <key-name>. If <pointer> is not specified, then +** the string <key-name> must already exist in the has table. Otherwise, +** an error is returned. +** +** Whether or not the <pointer> argument is specified, the value returned +** is a blob containing the pointer stored as the hash data corresponding +** to string <key-name> (after the hash-table is updated, if applicable). +*/ +static void scalarFunc( + sqlite3_context *context, + int argc, + sqlite3_value **argv +){ + fts2Hash *pHash; + void *pPtr = 0; + const unsigned char *zName; + int nName; + + assert( argc==1 || argc==2 ); + + pHash = (fts2Hash *)sqlite3_user_data(context); + + zName = sqlite3_value_text(argv[0]); + nName = sqlite3_value_bytes(argv[0])+1; + + if( argc==2 ){ + void *pOld; + int n = sqlite3_value_bytes(argv[1]); + if( n!=sizeof(pPtr) ){ + sqlite3_result_error(context, "argument type mismatch", -1); + return; + } + pPtr = *(void **)sqlite3_value_blob(argv[1]); + pOld = sqlite3Fts2HashInsert(pHash, (void *)zName, nName, pPtr); + if( pOld==pPtr ){ + sqlite3_result_error(context, "out of memory", -1); + return; + } + }else{ + pPtr = sqlite3Fts2HashFind(pHash, zName, nName); + if( !pPtr ){ + char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName); + sqlite3_result_error(context, zErr, -1); + sqlite3_free(zErr); + return; + } + } + + sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT); +} + +#ifdef SQLITE_TEST + +#if defined(INCLUDE_SQLITE_TCL_H) +# include "sqlite_tcl.h" +#else +# include "tcl.h" +#endif +#include <string.h> + +/* +** Implementation of a special SQL scalar function for testing tokenizers +** designed to be used in concert with the Tcl testing framework. This +** function must be called with two arguments: +** +** SELECT <function-name>(<key-name>, <input-string>); +** SELECT <function-name>(<key-name>, <pointer>); +** +** where <function-name> is the name passed as the second argument +** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer') +** concatenated with the string '_test' (e.g. 'fts2_tokenizer_test'). +** +** The return value is a string that may be interpreted as a Tcl +** list. For each token in the <input-string>, three elements are +** added to the returned list. The first is the token position, the +** second is the token text (folded, stemmed, etc.) and the third is the +** substring of <input-string> associated with the token. For example, +** using the built-in "simple" tokenizer: +** +** SELECT fts_tokenizer_test('simple', 'I don't see how'); +** +** will return the string: +** +** "{0 i I 1 dont don't 2 see see 3 how how}" +** +*/ +static void testFunc( + sqlite3_context *context, + int argc, + sqlite3_value **argv +){ + fts2Hash *pHash; + sqlite3_tokenizer_module *p; + sqlite3_tokenizer *pTokenizer = 0; + sqlite3_tokenizer_cursor *pCsr = 0; + + const char *zErr = 0; + + const char *zName; + int nName; + const char *zInput; + int nInput; + + const char *zArg = 0; + + const char *zToken; + int nToken; + int iStart; + int iEnd; + int iPos; + + Tcl_Obj *pRet; + + assert( argc==2 || argc==3 ); + + nName = sqlite3_value_bytes(argv[0]); + zName = (const char *)sqlite3_value_text(argv[0]); + nInput = sqlite3_value_bytes(argv[argc-1]); + zInput = (const char *)sqlite3_value_text(argv[argc-1]); + + if( argc==3 ){ + zArg = (const char *)sqlite3_value_text(argv[1]); + } + + pHash = (fts2Hash *)sqlite3_user_data(context); + p = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zName, nName+1); + + if( !p ){ + char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName); + sqlite3_result_error(context, zErr, -1); + sqlite3_free(zErr); + return; + } + + pRet = Tcl_NewObj(); + Tcl_IncrRefCount(pRet); + + if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){ + zErr = "error in xCreate()"; + goto finish; + } + pTokenizer->pModule = p; + if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){ + zErr = "error in xOpen()"; + goto finish; + } + pCsr->pTokenizer = pTokenizer; + + while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){ + Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos)); + Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken)); + zToken = &zInput[iStart]; + nToken = iEnd-iStart; + Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken)); + } + + if( SQLITE_OK!=p->xClose(pCsr) ){ + zErr = "error in xClose()"; + goto finish; + } + if( SQLITE_OK!=p->xDestroy(pTokenizer) ){ + zErr = "error in xDestroy()"; + goto finish; + } + +finish: + if( zErr ){ + sqlite3_result_error(context, zErr, -1); + }else{ + sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT); + } + Tcl_DecrRefCount(pRet); +} + +static +int registerTokenizer( + sqlite3 *db, + char *zName, + const sqlite3_tokenizer_module *p +){ + int rc; + sqlite3_stmt *pStmt; + const char zSql[] = "SELECT fts2_tokenizer(?, ?)"; + + rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0); + if( rc!=SQLITE_OK ){ + return rc; + } + + sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC); + sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC); + sqlite3_step(pStmt); + + return sqlite3_finalize(pStmt); +} + +static +int queryFts2Tokenizer( + sqlite3 *db, + char *zName, + const sqlite3_tokenizer_module **pp +){ + int rc; + sqlite3_stmt *pStmt; + const char zSql[] = "SELECT fts2_tokenizer(?)"; + + *pp = 0; + rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0); + if( rc!=SQLITE_OK ){ + return rc; + } + + sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC); + if( SQLITE_ROW==sqlite3_step(pStmt) ){ + if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){ + memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp)); + } + } + + return sqlite3_finalize(pStmt); +} + +void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule); + +/* +** Implementation of the scalar function fts2_tokenizer_internal_test(). +** This function is used for testing only, it is not included in the +** build unless SQLITE_TEST is defined. +** +** The purpose of this is to test that the fts2_tokenizer() function +** can be used as designed by the C-code in the queryFts2Tokenizer and +** registerTokenizer() functions above. These two functions are repeated +** in the README.tokenizer file as an example, so it is important to +** test them. +** +** To run the tests, evaluate the fts2_tokenizer_internal_test() scalar +** function with no arguments. An assert() will fail if a problem is +** detected. i.e.: +** +** SELECT fts2_tokenizer_internal_test(); +** +*/ +static void intTestFunc( + sqlite3_context *context, + int argc, + sqlite3_value **argv +){ + int rc; + const sqlite3_tokenizer_module *p1; + const sqlite3_tokenizer_module *p2; + sqlite3 *db = (sqlite3 *)sqlite3_user_data(context); + + /* Test the query function */ + sqlite3Fts2SimpleTokenizerModule(&p1); + rc = queryFts2Tokenizer(db, "simple", &p2); + assert( rc==SQLITE_OK ); + assert( p1==p2 ); + rc = queryFts2Tokenizer(db, "nosuchtokenizer", &p2); + assert( rc==SQLITE_ERROR ); + assert( p2==0 ); + assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") ); + + /* Test the storage function */ + rc = registerTokenizer(db, "nosuchtokenizer", p1); + assert( rc==SQLITE_OK ); + rc = queryFts2Tokenizer(db, "nosuchtokenizer", &p2); + assert( rc==SQLITE_OK ); + assert( p2==p1 ); + + sqlite3_result_text(context, "ok", -1, SQLITE_STATIC); +} + +#endif + +/* +** Set up SQL objects in database db used to access the contents of +** the hash table pointed to by argument pHash. The hash table must +** been initialized to use string keys, and to take a private copy +** of the key when a value is inserted. i.e. by a call similar to: +** +** sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1); +** +** This function adds a scalar function (see header comment above +** scalarFunc() in this file for details) and, if ENABLE_TABLE is +** defined at compilation time, a temporary virtual table (see header +** comment above struct HashTableVtab) to the database schema. Both +** provide read/write access to the contents of *pHash. +** +** The third argument to this function, zName, is used as the name +** of both the scalar and, if created, the virtual table. +*/ +int sqlite3Fts2InitHashTable( + sqlite3 *db, + fts2Hash *pHash, + const char *zName +){ + int rc = SQLITE_OK; + void *p = (void *)pHash; + const int any = SQLITE_ANY; + char *zTest = 0; + char *zTest2 = 0; + +#ifdef SQLITE_TEST + void *pdb = (void *)db; + zTest = sqlite3_mprintf("%s_test", zName); + zTest2 = sqlite3_mprintf("%s_internal_test", zName); + if( !zTest || !zTest2 ){ + rc = SQLITE_NOMEM; + } +#endif + + if( rc!=SQLITE_OK + || (rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0)) + || (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0)) +#ifdef SQLITE_TEST + || (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0)) + || (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0)) + || (rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0)) +#endif + ); + + sqlite3_free(zTest); + sqlite3_free(zTest2); + return rc; +} + +#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */ |