diff options
Diffstat (limited to 'ext/fts3/fts3_test.c')
-rw-r--r-- | ext/fts3/fts3_test.c | 623 |
1 files changed, 623 insertions, 0 deletions
diff --git a/ext/fts3/fts3_test.c b/ext/fts3/fts3_test.c new file mode 100644 index 0000000..49a8476 --- /dev/null +++ b/ext/fts3/fts3_test.c @@ -0,0 +1,623 @@ +/* +** 2011 Jun 13 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +****************************************************************************** +** +** This file is not part of the production FTS code. It is only used for +** testing. It contains a Tcl command that can be used to test if a document +** matches an FTS NEAR expression. +** +** As of March 2012, it also contains a version 1 tokenizer used for testing +** that the sqlite3_tokenizer_module.xLanguage() method is invoked correctly. +*/ + +#if defined(INCLUDE_SQLITE_TCL_H) +# include "sqlite_tcl.h" +#else +# include "tcl.h" +# ifndef SQLITE_TCLAPI +# define SQLITE_TCLAPI +# endif +#endif +#include <string.h> +#include <assert.h> + +#if defined(SQLITE_TEST) +#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) + +/* Required so that the "ifdef SQLITE_ENABLE_FTS3" below works */ +#include "fts3Int.h" + +#define NM_MAX_TOKEN 12 + +typedef struct NearPhrase NearPhrase; +typedef struct NearDocument NearDocument; +typedef struct NearToken NearToken; + +struct NearDocument { + int nToken; /* Length of token in bytes */ + NearToken *aToken; /* Token array */ +}; + +struct NearToken { + int n; /* Length of token in bytes */ + const char *z; /* Pointer to token string */ +}; + +struct NearPhrase { + int nNear; /* Preceding NEAR value */ + int nToken; /* Number of tokens in this phrase */ + NearToken aToken[NM_MAX_TOKEN]; /* Array of tokens in this phrase */ +}; + +static int nm_phrase_match( + NearPhrase *p, + NearToken *aToken +){ + int ii; + + for(ii=0; ii<p->nToken; ii++){ + NearToken *pToken = &p->aToken[ii]; + if( pToken->n>0 && pToken->z[pToken->n-1]=='*' ){ + if( aToken[ii].n<(pToken->n-1) ) return 0; + if( memcmp(aToken[ii].z, pToken->z, pToken->n-1) ) return 0; + }else{ + if( aToken[ii].n!=pToken->n ) return 0; + if( memcmp(aToken[ii].z, pToken->z, pToken->n) ) return 0; + } + } + + return 1; +} + +static int nm_near_chain( + int iDir, /* Direction to iterate through aPhrase[] */ + NearDocument *pDoc, /* Document to match against */ + int iPos, /* Position at which iPhrase was found */ + int nPhrase, /* Size of phrase array */ + NearPhrase *aPhrase, /* Phrase array */ + int iPhrase /* Index of phrase found */ +){ + int iStart; + int iStop; + int ii; + int nNear; + int iPhrase2; + NearPhrase *p; + NearPhrase *pPrev; + + assert( iDir==1 || iDir==-1 ); + + if( iDir==1 ){ + if( (iPhrase+1)==nPhrase ) return 1; + nNear = aPhrase[iPhrase+1].nNear; + }else{ + if( iPhrase==0 ) return 1; + nNear = aPhrase[iPhrase].nNear; + } + pPrev = &aPhrase[iPhrase]; + iPhrase2 = iPhrase+iDir; + p = &aPhrase[iPhrase2]; + + iStart = iPos - nNear - p->nToken; + iStop = iPos + nNear + pPrev->nToken; + + if( iStart<0 ) iStart = 0; + if( iStop > pDoc->nToken - p->nToken ) iStop = pDoc->nToken - p->nToken; + + for(ii=iStart; ii<=iStop; ii++){ + if( nm_phrase_match(p, &pDoc->aToken[ii]) ){ + if( nm_near_chain(iDir, pDoc, ii, nPhrase, aPhrase, iPhrase2) ) return 1; + } + } + + return 0; +} + +static int nm_match_count( + NearDocument *pDoc, /* Document to match against */ + int nPhrase, /* Size of phrase array */ + NearPhrase *aPhrase, /* Phrase array */ + int iPhrase /* Index of phrase to count matches for */ +){ + int nOcc = 0; + int ii; + NearPhrase *p = &aPhrase[iPhrase]; + + for(ii=0; ii<(pDoc->nToken + 1 - p->nToken); ii++){ + if( nm_phrase_match(p, &pDoc->aToken[ii]) ){ + /* Test forward NEAR chain (i>iPhrase) */ + if( 0==nm_near_chain(1, pDoc, ii, nPhrase, aPhrase, iPhrase) ) continue; + + /* Test reverse NEAR chain (i<iPhrase) */ + if( 0==nm_near_chain(-1, pDoc, ii, nPhrase, aPhrase, iPhrase) ) continue; + + /* This is a real match. Increment the counter. */ + nOcc++; + } + } + + return nOcc; +} + +/* +** Tclcmd: fts3_near_match DOCUMENT EXPR ?OPTIONS? +*/ +static int SQLITE_TCLAPI fts3_near_match_cmd( + ClientData clientData, + Tcl_Interp *interp, + int objc, + Tcl_Obj *CONST objv[] +){ + int nTotal = 0; + int rc; + int ii; + int nPhrase; + NearPhrase *aPhrase = 0; + NearDocument doc = {0, 0}; + Tcl_Obj **apDocToken; + Tcl_Obj *pRet; + Tcl_Obj *pPhrasecount = 0; + + Tcl_Obj **apExprToken; + int nExprToken; + + UNUSED_PARAMETER(clientData); + + /* Must have 3 or more arguments. */ + if( objc<3 || (objc%2)==0 ){ + Tcl_WrongNumArgs(interp, 1, objv, "DOCUMENT EXPR ?OPTION VALUE?..."); + rc = TCL_ERROR; + goto near_match_out; + } + + for(ii=3; ii<objc; ii+=2){ + enum NM_enum { NM_PHRASECOUNTS }; + struct TestnmSubcmd { + char *zName; + enum NM_enum eOpt; + } aOpt[] = { + { "-phrasecountvar", NM_PHRASECOUNTS }, + { 0, 0 } + }; + int iOpt; + if( Tcl_GetIndexFromObjStruct( + interp, objv[ii], aOpt, sizeof(aOpt[0]), "option", 0, &iOpt) + ){ + return TCL_ERROR; + } + + switch( aOpt[iOpt].eOpt ){ + case NM_PHRASECOUNTS: + pPhrasecount = objv[ii+1]; + break; + } + } + + rc = Tcl_ListObjGetElements(interp, objv[1], &doc.nToken, &apDocToken); + if( rc!=TCL_OK ) goto near_match_out; + doc.aToken = (NearToken *)ckalloc(doc.nToken*sizeof(NearToken)); + for(ii=0; ii<doc.nToken; ii++){ + doc.aToken[ii].z = Tcl_GetStringFromObj(apDocToken[ii], &doc.aToken[ii].n); + } + + rc = Tcl_ListObjGetElements(interp, objv[2], &nExprToken, &apExprToken); + if( rc!=TCL_OK ) goto near_match_out; + + nPhrase = (nExprToken + 1) / 2; + aPhrase = (NearPhrase *)ckalloc(nPhrase * sizeof(NearPhrase)); + memset(aPhrase, 0, nPhrase * sizeof(NearPhrase)); + for(ii=0; ii<nPhrase; ii++){ + Tcl_Obj *pPhrase = apExprToken[ii*2]; + Tcl_Obj **apToken; + int nToken; + int jj; + + rc = Tcl_ListObjGetElements(interp, pPhrase, &nToken, &apToken); + if( rc!=TCL_OK ) goto near_match_out; + if( nToken>NM_MAX_TOKEN ){ + Tcl_AppendResult(interp, "Too many tokens in phrase", 0); + rc = TCL_ERROR; + goto near_match_out; + } + for(jj=0; jj<nToken; jj++){ + NearToken *pT = &aPhrase[ii].aToken[jj]; + pT->z = Tcl_GetStringFromObj(apToken[jj], &pT->n); + } + aPhrase[ii].nToken = nToken; + } + for(ii=1; ii<nPhrase; ii++){ + Tcl_Obj *pNear = apExprToken[2*ii-1]; + int nNear; + rc = Tcl_GetIntFromObj(interp, pNear, &nNear); + if( rc!=TCL_OK ) goto near_match_out; + aPhrase[ii].nNear = nNear; + } + + pRet = Tcl_NewObj(); + Tcl_IncrRefCount(pRet); + for(ii=0; ii<nPhrase; ii++){ + int nOcc = nm_match_count(&doc, nPhrase, aPhrase, ii); + Tcl_ListObjAppendElement(interp, pRet, Tcl_NewIntObj(nOcc)); + nTotal += nOcc; + } + if( pPhrasecount ){ + Tcl_ObjSetVar2(interp, pPhrasecount, 0, pRet, 0); + } + Tcl_DecrRefCount(pRet); + Tcl_SetObjResult(interp, Tcl_NewBooleanObj(nTotal>0)); + + near_match_out: + ckfree((char *)aPhrase); + ckfree((char *)doc.aToken); + return rc; +} + +/* +** Tclcmd: fts3_configure_incr_load ?CHUNKSIZE THRESHOLD? +** +** Normally, FTS uses hard-coded values to determine the minimum doclist +** size eligible for incremental loading, and the size of the chunks loaded +** when a doclist is incrementally loaded. This command allows the built-in +** values to be overridden for testing purposes. +** +** If present, the first argument is the chunksize in bytes to load doclists +** in. The second argument is the minimum doclist size in bytes to use +** incremental loading with. +** +** Whether or not the arguments are present, this command returns a list of +** two integers - the initial chunksize and threshold when the command is +** invoked. This can be used to restore the default behavior after running +** tests. For example: +** +** # Override incr-load settings for testing: +** set cfg [fts3_configure_incr_load $new_chunksize $new_threshold] +** +** .... run tests .... +** +** # Restore initial incr-load settings: +** eval fts3_configure_incr_load $cfg +*/ +static int SQLITE_TCLAPI fts3_configure_incr_load_cmd( + ClientData clientData, + Tcl_Interp *interp, + int objc, + Tcl_Obj *CONST objv[] +){ +#ifdef SQLITE_ENABLE_FTS3 + extern int test_fts3_node_chunksize; + extern int test_fts3_node_chunk_threshold; + Tcl_Obj *pRet; + + if( objc!=1 && objc!=3 ){ + Tcl_WrongNumArgs(interp, 1, objv, "?CHUNKSIZE THRESHOLD?"); + return TCL_ERROR; + } + + pRet = Tcl_NewObj(); + Tcl_IncrRefCount(pRet); + Tcl_ListObjAppendElement( + interp, pRet, Tcl_NewIntObj(test_fts3_node_chunksize)); + Tcl_ListObjAppendElement( + interp, pRet, Tcl_NewIntObj(test_fts3_node_chunk_threshold)); + + if( objc==3 ){ + int iArg1; + int iArg2; + if( Tcl_GetIntFromObj(interp, objv[1], &iArg1) + || Tcl_GetIntFromObj(interp, objv[2], &iArg2) + ){ + Tcl_DecrRefCount(pRet); + return TCL_ERROR; + } + test_fts3_node_chunksize = iArg1; + test_fts3_node_chunk_threshold = iArg2; + } + + Tcl_SetObjResult(interp, pRet); + Tcl_DecrRefCount(pRet); +#endif + UNUSED_PARAMETER(clientData); + return TCL_OK; +} + +#ifdef SQLITE_ENABLE_FTS3 +/************************************************************************** +** Beginning of test tokenizer code. +** +** For language 0, this tokenizer is similar to the default 'simple' +** tokenizer. For other languages L, the following: +** +** * Odd numbered languages are case-sensitive. Even numbered +** languages are not. +** +** * Language ids 100 or greater are considered an error. +** +** The implementation assumes that the input contains only ASCII characters +** (i.e. those that may be encoded in UTF-8 using a single byte). +*/ +typedef struct test_tokenizer { + sqlite3_tokenizer base; +} test_tokenizer; + +typedef struct test_tokenizer_cursor { + sqlite3_tokenizer_cursor base; + const char *aInput; /* Input being tokenized */ + int nInput; /* Size of the input in bytes */ + int iInput; /* Current offset in aInput */ + int iToken; /* Index of next token to be returned */ + char *aBuffer; /* Buffer containing current token */ + int nBuffer; /* Number of bytes allocated at pToken */ + int iLangid; /* Configured language id */ +} test_tokenizer_cursor; + +static int testTokenizerCreate( + int argc, const char * const *argv, + sqlite3_tokenizer **ppTokenizer +){ + test_tokenizer *pNew; + UNUSED_PARAMETER(argc); + UNUSED_PARAMETER(argv); + + pNew = sqlite3_malloc(sizeof(test_tokenizer)); + if( !pNew ) return SQLITE_NOMEM; + memset(pNew, 0, sizeof(test_tokenizer)); + + *ppTokenizer = (sqlite3_tokenizer *)pNew; + return SQLITE_OK; +} + +static int testTokenizerDestroy(sqlite3_tokenizer *pTokenizer){ + test_tokenizer *p = (test_tokenizer *)pTokenizer; + sqlite3_free(p); + return SQLITE_OK; +} + +static int testTokenizerOpen( + sqlite3_tokenizer *pTokenizer, /* The tokenizer */ + const char *pInput, int nBytes, /* String to be tokenized */ + sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ +){ + int rc = SQLITE_OK; /* Return code */ + test_tokenizer_cursor *pCsr; /* New cursor object */ + + UNUSED_PARAMETER(pTokenizer); + + pCsr = (test_tokenizer_cursor *)sqlite3_malloc(sizeof(test_tokenizer_cursor)); + if( pCsr==0 ){ + rc = SQLITE_NOMEM; + }else{ + memset(pCsr, 0, sizeof(test_tokenizer_cursor)); + pCsr->aInput = pInput; + if( nBytes<0 ){ + pCsr->nInput = (int)strlen(pInput); + }else{ + pCsr->nInput = nBytes; + } + } + + *ppCursor = (sqlite3_tokenizer_cursor *)pCsr; + return rc; +} + +static int testTokenizerClose(sqlite3_tokenizer_cursor *pCursor){ + test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor; + sqlite3_free(pCsr->aBuffer); + sqlite3_free(pCsr); + return SQLITE_OK; +} + +static int testIsTokenChar(char c){ + return (c>='a' && c<='z') || (c>='A' && c<='Z'); +} +static int testTolower(char c){ + char ret = c; + if( ret>='A' && ret<='Z') ret = ret - ('A'-'a'); + return ret; +} + +static int testTokenizerNext( + sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by testTokenizerOpen */ + const char **ppToken, /* OUT: *ppToken is the token text */ + int *pnBytes, /* OUT: Number of bytes in token */ + int *piStartOffset, /* OUT: Starting offset of token */ + int *piEndOffset, /* OUT: Ending offset of token */ + int *piPosition /* OUT: Position integer of token */ +){ + test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor; + int rc = SQLITE_OK; + const char *p; + const char *pEnd; + + p = &pCsr->aInput[pCsr->iInput]; + pEnd = &pCsr->aInput[pCsr->nInput]; + + /* Skip past any white-space */ + assert( p<=pEnd ); + while( p<pEnd && testIsTokenChar(*p)==0 ) p++; + + if( p==pEnd ){ + rc = SQLITE_DONE; + }else{ + /* Advance to the end of the token */ + const char *pToken = p; + sqlite3_int64 nToken; + while( p<pEnd && testIsTokenChar(*p) ) p++; + nToken = (sqlite3_int64)(p-pToken); + + /* Copy the token into the buffer */ + if( nToken>pCsr->nBuffer ){ + sqlite3_free(pCsr->aBuffer); + pCsr->aBuffer = sqlite3_malloc64(nToken); + } + if( pCsr->aBuffer==0 ){ + rc = SQLITE_NOMEM; + }else{ + int i; + + if( pCsr->iLangid & 0x00000001 ){ + for(i=0; i<nToken; i++) pCsr->aBuffer[i] = pToken[i]; + }else{ + for(i=0; i<nToken; i++) pCsr->aBuffer[i] = (char)testTolower(pToken[i]); + } + pCsr->iToken++; + pCsr->iInput = (int)(p - pCsr->aInput); + + *ppToken = pCsr->aBuffer; + *pnBytes = (int)nToken; + *piStartOffset = (int)(pToken - pCsr->aInput); + *piEndOffset = (int)(p - pCsr->aInput); + *piPosition = pCsr->iToken; + } + } + + return rc; +} + +static int testTokenizerLanguage( + sqlite3_tokenizer_cursor *pCursor, + int iLangid +){ + int rc = SQLITE_OK; + test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor; + pCsr->iLangid = iLangid; + if( pCsr->iLangid>=100 ){ + rc = SQLITE_ERROR; + } + return rc; +} +#endif + +static int SQLITE_TCLAPI fts3_test_tokenizer_cmd( + ClientData clientData, + Tcl_Interp *interp, + int objc, + Tcl_Obj *CONST objv[] +){ +#ifdef SQLITE_ENABLE_FTS3 + static const sqlite3_tokenizer_module testTokenizerModule = { + 1, + testTokenizerCreate, + testTokenizerDestroy, + testTokenizerOpen, + testTokenizerClose, + testTokenizerNext, + testTokenizerLanguage + }; + const sqlite3_tokenizer_module *pPtr = &testTokenizerModule; + if( objc!=1 ){ + Tcl_WrongNumArgs(interp, 1, objv, ""); + return TCL_ERROR; + } + Tcl_SetObjResult(interp, Tcl_NewByteArrayObj( + (const unsigned char *)&pPtr, sizeof(sqlite3_tokenizer_module *) + )); +#endif + UNUSED_PARAMETER(clientData); + return TCL_OK; +} + +static int SQLITE_TCLAPI fts3_test_varint_cmd( + ClientData clientData, + Tcl_Interp *interp, + int objc, + Tcl_Obj *CONST objv[] +){ +#ifdef SQLITE_ENABLE_FTS3 + char aBuf[24]; + int rc; + Tcl_WideInt w; + sqlite3_int64 w2; + int nByte, nByte2; + + if( objc!=2 ){ + Tcl_WrongNumArgs(interp, 1, objv, "INTEGER"); + return TCL_ERROR; + } + + rc = Tcl_GetWideIntFromObj(interp, objv[1], &w); + if( rc!=TCL_OK ) return rc; + + nByte = sqlite3Fts3PutVarint(aBuf, w); + nByte2 = sqlite3Fts3GetVarint(aBuf, &w2); + if( w!=w2 || nByte!=nByte2 ){ + char *zErr = sqlite3_mprintf("error testing %lld", w); + Tcl_ResetResult(interp); + Tcl_AppendResult(interp, zErr, 0); + return TCL_ERROR; + } + + if( w<=2147483647 && w>=0 ){ + int i; + nByte2 = fts3GetVarint32(aBuf, &i); + if( (int)w!=i || nByte!=nByte2 ){ + char *zErr = sqlite3_mprintf("error testing %lld (32-bit)", w); + Tcl_ResetResult(interp); + Tcl_AppendResult(interp, zErr, 0); + return TCL_ERROR; + } + } + +#endif + UNUSED_PARAMETER(clientData); + return TCL_OK; +} + +/* +** End of tokenizer code. +**************************************************************************/ + +/* +** sqlite3_fts3_may_be_corrupt BOOLEAN +** +** Set or clear the global "may-be-corrupt" flag. Return the old value. +*/ +static int SQLITE_TCLAPI fts3_may_be_corrupt( + void * clientData, + Tcl_Interp *interp, + int objc, + Tcl_Obj *CONST objv[] +){ +#ifdef SQLITE_DEBUG + int bOld = sqlite3_fts3_may_be_corrupt; + + if( objc!=2 && objc!=1 ){ + Tcl_WrongNumArgs(interp, 1, objv, "?BOOLEAN?"); + return TCL_ERROR; + } + if( objc==2 ){ + int bNew; + if( Tcl_GetBooleanFromObj(interp, objv[1], &bNew) ) return TCL_ERROR; + sqlite3_fts3_may_be_corrupt = bNew; + } + + Tcl_SetObjResult(interp, Tcl_NewIntObj(bOld)); +#endif + return TCL_OK; +} + +int Sqlitetestfts3_Init(Tcl_Interp *interp){ + Tcl_CreateObjCommand(interp, "fts3_near_match", fts3_near_match_cmd, 0, 0); + Tcl_CreateObjCommand(interp, + "fts3_configure_incr_load", fts3_configure_incr_load_cmd, 0, 0 + ); + Tcl_CreateObjCommand( + interp, "fts3_test_tokenizer", fts3_test_tokenizer_cmd, 0, 0 + ); + Tcl_CreateObjCommand( + interp, "fts3_test_varint", fts3_test_varint_cmd, 0, 0 + ); + Tcl_CreateObjCommand( + interp, "sqlite3_fts3_may_be_corrupt", fts3_may_be_corrupt, 0, 0 + ); + return TCL_OK; +} +#endif /* SQLITE_ENABLE_FTS3 || SQLITE_ENABLE_FTS4 */ +#endif /* ifdef SQLITE_TEST */ |