/* ** 2014 May 31 ** ** The author disclaims copyright to this source code. In place of ** a legal notice, here is a blessing: ** ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ****************************************************************************** */ #include "fts5Int.h" #include /* amalgamator: keep */ /* ** Object used to iterate through all "coalesced phrase instances" in ** a single column of the current row. If the phrase instances in the ** column being considered do not overlap, this object simply iterates ** through them. Or, if they do overlap (share one or more tokens in ** common), each set of overlapping instances is treated as a single ** match. See documentation for the highlight() auxiliary function for ** details. ** ** Usage is: ** ** for(rc = fts5CInstIterNext(pApi, pFts, iCol, &iter); ** (rc==SQLITE_OK && 0==fts5CInstIterEof(&iter); ** rc = fts5CInstIterNext(&iter) ** ){ ** printf("instance starts at %d, ends at %d\n", iter.iStart, iter.iEnd); ** } ** */ typedef struct CInstIter CInstIter; struct CInstIter { const Fts5ExtensionApi *pApi; /* API offered by current FTS version */ Fts5Context *pFts; /* First arg to pass to pApi functions */ int iCol; /* Column to search */ int iInst; /* Next phrase instance index */ int nInst; /* Total number of phrase instances */ /* Output variables */ int iStart; /* First token in coalesced phrase instance */ int iEnd; /* Last token in coalesced phrase instance */ }; /* ** Advance the iterator to the next coalesced phrase instance. Return ** an SQLite error code if an error occurs, or SQLITE_OK otherwise. */ static int fts5CInstIterNext(CInstIter *pIter){ int rc = SQLITE_OK; pIter->iStart = -1; pIter->iEnd = -1; while( rc==SQLITE_OK && pIter->iInstnInst ){ int ip; int ic; int io; rc = pIter->pApi->xInst(pIter->pFts, pIter->iInst, &ip, &ic, &io); if( rc==SQLITE_OK ){ if( ic==pIter->iCol ){ int iEnd = io - 1 + pIter->pApi->xPhraseSize(pIter->pFts, ip); if( pIter->iStart<0 ){ pIter->iStart = io; pIter->iEnd = iEnd; }else if( io<=pIter->iEnd ){ if( iEnd>pIter->iEnd ) pIter->iEnd = iEnd; }else{ break; } } pIter->iInst++; } } return rc; } /* ** Initialize the iterator object indicated by the final parameter to ** iterate through coalesced phrase instances in column iCol. */ static int fts5CInstIterInit( const Fts5ExtensionApi *pApi, Fts5Context *pFts, int iCol, CInstIter *pIter ){ int rc; memset(pIter, 0, sizeof(CInstIter)); pIter->pApi = pApi; pIter->pFts = pFts; pIter->iCol = iCol; rc = pApi->xInstCount(pFts, &pIter->nInst); if( rc==SQLITE_OK ){ rc = fts5CInstIterNext(pIter); } return rc; } /************************************************************************* ** Start of highlight() implementation. */ typedef struct HighlightContext HighlightContext; struct HighlightContext { /* Constant parameters to fts5HighlightCb() */ int iRangeStart; /* First token to include */ int iRangeEnd; /* If non-zero, last token to include */ const char *zOpen; /* Opening highlight */ const char *zClose; /* Closing highlight */ const char *zIn; /* Input text */ int nIn; /* Size of input text in bytes */ /* Variables modified by fts5HighlightCb() */ CInstIter iter; /* Coalesced Instance Iterator */ int iPos; /* Current token offset in zIn[] */ int iOff; /* Have copied up to this offset in zIn[] */ int bOpen; /* True if highlight is open */ char *zOut; /* Output value */ }; /* ** Append text to the HighlightContext output string - p->zOut. Argument ** z points to a buffer containing n bytes of text to append. If n is ** negative, everything up until the first '\0' is appended to the output. ** ** If *pRc is set to any value other than SQLITE_OK when this function is ** called, it is a no-op. If an error (i.e. an OOM condition) is encountered, ** *pRc is set to an error code before returning. */ static void fts5HighlightAppend( int *pRc, HighlightContext *p, const char *z, int n ){ if( *pRc==SQLITE_OK && z ){ if( n<0 ) n = (int)strlen(z); p->zOut = sqlite3_mprintf("%z%.*s", p->zOut, n, z); if( p->zOut==0 ) *pRc = SQLITE_NOMEM; } } /* ** Tokenizer callback used by implementation of highlight() function. */ static int fts5HighlightCb( void *pContext, /* Pointer to HighlightContext object */ int tflags, /* Mask of FTS5_TOKEN_* flags */ const char *pToken, /* Buffer containing token */ int nToken, /* Size of token in bytes */ int iStartOff, /* Start byte offset of token */ int iEndOff /* End byte offset of token */ ){ HighlightContext *p = (HighlightContext*)pContext; int rc = SQLITE_OK; int iPos; UNUSED_PARAM2(pToken, nToken); if( tflags & FTS5_TOKEN_COLOCATED ) return SQLITE_OK; iPos = p->iPos++; if( p->iRangeEnd>=0 ){ if( iPosiRangeStart || iPos>p->iRangeEnd ) return SQLITE_OK; if( p->iRangeStart && iPos==p->iRangeStart ) p->iOff = iStartOff; } /* If the parenthesis is open, and this token is not part of the current ** phrase, and the starting byte offset of this token is past the point ** that has currently been copied into the output buffer, close the ** parenthesis. */ if( p->bOpen && (iPos<=p->iter.iStart || p->iter.iStart<0) && iStartOff>p->iOff ){ fts5HighlightAppend(&rc, p, p->zClose, -1); p->bOpen = 0; } /* If this is the start of a new phrase, and the highlight is not open: ** ** * copy text from the input up to the start of the phrase, and ** * open the highlight. */ if( iPos==p->iter.iStart && p->bOpen==0 ){ fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iStartOff - p->iOff); fts5HighlightAppend(&rc, p, p->zOpen, -1); p->iOff = iStartOff; p->bOpen = 1; } if( iPos==p->iter.iEnd ){ if( p->bOpen==0 ){ assert( p->iRangeEnd>=0 ); fts5HighlightAppend(&rc, p, p->zOpen, -1); p->bOpen = 1; } fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff); p->iOff = iEndOff; if( rc==SQLITE_OK ){ rc = fts5CInstIterNext(&p->iter); } } if( iPos==p->iRangeEnd ){ if( p->bOpen ){ if( p->iter.iStart>=0 && iPos>=p->iter.iStart ){ fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff); p->iOff = iEndOff; } fts5HighlightAppend(&rc, p, p->zClose, -1); p->bOpen = 0; } fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff); p->iOff = iEndOff; } return rc; } /* ** Implementation of highlight() function. */ static void fts5HighlightFunction( const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ Fts5Context *pFts, /* First arg to pass to pApi functions */ sqlite3_context *pCtx, /* Context for returning result/error */ int nVal, /* Number of values in apVal[] array */ sqlite3_value **apVal /* Array of trailing arguments */ ){ HighlightContext ctx; int rc; int iCol; if( nVal!=3 ){ const char *zErr = "wrong number of arguments to function highlight()"; sqlite3_result_error(pCtx, zErr, -1); return; } iCol = sqlite3_value_int(apVal[0]); memset(&ctx, 0, sizeof(HighlightContext)); ctx.zOpen = (const char*)sqlite3_value_text(apVal[1]); ctx.zClose = (const char*)sqlite3_value_text(apVal[2]); ctx.iRangeEnd = -1; rc = pApi->xColumnText(pFts, iCol, &ctx.zIn, &ctx.nIn); if( rc==SQLITE_RANGE ){ sqlite3_result_text(pCtx, "", -1, SQLITE_STATIC); rc = SQLITE_OK; }else if( ctx.zIn ){ if( rc==SQLITE_OK ){ rc = fts5CInstIterInit(pApi, pFts, iCol, &ctx.iter); } if( rc==SQLITE_OK ){ rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb); } if( ctx.bOpen ){ fts5HighlightAppend(&rc, &ctx, ctx.zClose, -1); } fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff); if( rc==SQLITE_OK ){ sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT); } sqlite3_free(ctx.zOut); } if( rc!=SQLITE_OK ){ sqlite3_result_error_code(pCtx, rc); } } /* ** End of highlight() implementation. **************************************************************************/ /* ** Context object passed to the fts5SentenceFinderCb() function. */ typedef struct Fts5SFinder Fts5SFinder; struct Fts5SFinder { int iPos; /* Current token position */ int nFirstAlloc; /* Allocated size of aFirst[] */ int nFirst; /* Number of entries in aFirst[] */ int *aFirst; /* Array of first token in each sentence */ const char *zDoc; /* Document being tokenized */ }; /* ** Add an entry to the Fts5SFinder.aFirst[] array. Grow the array if ** necessary. Return SQLITE_OK if successful, or SQLITE_NOMEM if an ** error occurs. */ static int fts5SentenceFinderAdd(Fts5SFinder *p, int iAdd){ if( p->nFirstAlloc==p->nFirst ){ int nNew = p->nFirstAlloc ? p->nFirstAlloc*2 : 64; int *aNew; aNew = (int*)sqlite3_realloc64(p->aFirst, nNew*sizeof(int)); if( aNew==0 ) return SQLITE_NOMEM; p->aFirst = aNew; p->nFirstAlloc = nNew; } p->aFirst[p->nFirst++] = iAdd; return SQLITE_OK; } /* ** This function is an xTokenize() callback used by the auxiliary snippet() ** function. Its job is to identify tokens that are the first in a sentence. ** For each such token, an entry is added to the SFinder.aFirst[] array. */ static int fts5SentenceFinderCb( void *pContext, /* Pointer to HighlightContext object */ int tflags, /* Mask of FTS5_TOKEN_* flags */ const char *pToken, /* Buffer containing token */ int nToken, /* Size of token in bytes */ int iStartOff, /* Start offset of token */ int iEndOff /* End offset of token */ ){ int rc = SQLITE_OK; UNUSED_PARAM2(pToken, nToken); UNUSED_PARAM(iEndOff); if( (tflags & FTS5_TOKEN_COLOCATED)==0 ){ Fts5SFinder *p = (Fts5SFinder*)pContext; if( p->iPos>0 ){ int i; char c = 0; for(i=iStartOff-1; i>=0; i--){ c = p->zDoc[i]; if( c!=' ' && c!='\t' && c!='\n' && c!='\r' ) break; } if( i!=iStartOff-1 && (c=='.' || c==':') ){ rc = fts5SentenceFinderAdd(p, p->iPos); } }else{ rc = fts5SentenceFinderAdd(p, 0); } p->iPos++; } return rc; } static int fts5SnippetScore( const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ Fts5Context *pFts, /* First arg to pass to pApi functions */ int nDocsize, /* Size of column in tokens */ unsigned char *aSeen, /* Array with one element per query phrase */ int iCol, /* Column to score */ int iPos, /* Starting offset to score */ int nToken, /* Max tokens per snippet */ int *pnScore, /* OUT: Score */ int *piPos /* OUT: Adjusted offset */ ){ int rc; int i; int ip = 0; int ic = 0; int iOff = 0; int iFirst = -1; int nInst; int nScore = 0; int iLast = 0; sqlite3_int64 iEnd = (sqlite3_int64)iPos + nToken; rc = pApi->xInstCount(pFts, &nInst); for(i=0; ixInst(pFts, i, &ip, &ic, &iOff); if( rc==SQLITE_OK && ic==iCol && iOff>=iPos && iOffxPhraseSize(pFts, ip); } } *pnScore = nScore; if( piPos ){ sqlite3_int64 iAdj = iFirst - (nToken - (iLast-iFirst)) / 2; if( (iAdj+nToken)>nDocsize ) iAdj = nDocsize - nToken; if( iAdj<0 ) iAdj = 0; *piPos = (int)iAdj; } return rc; } /* ** Return the value in pVal interpreted as utf-8 text. Except, if pVal ** contains a NULL value, return a pointer to a static string zero ** bytes in length instead of a NULL pointer. */ static const char *fts5ValueToText(sqlite3_value *pVal){ const char *zRet = (const char*)sqlite3_value_text(pVal); return zRet ? zRet : ""; } /* ** Implementation of snippet() function. */ static void fts5SnippetFunction( const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ Fts5Context *pFts, /* First arg to pass to pApi functions */ sqlite3_context *pCtx, /* Context for returning result/error */ int nVal, /* Number of values in apVal[] array */ sqlite3_value **apVal /* Array of trailing arguments */ ){ HighlightContext ctx; int rc = SQLITE_OK; /* Return code */ int iCol; /* 1st argument to snippet() */ const char *zEllips; /* 4th argument to snippet() */ int nToken; /* 5th argument to snippet() */ int nInst = 0; /* Number of instance matches this row */ int i; /* Used to iterate through instances */ int nPhrase; /* Number of phrases in query */ unsigned char *aSeen; /* Array of "seen instance" flags */ int iBestCol; /* Column containing best snippet */ int iBestStart = 0; /* First token of best snippet */ int nBestScore = 0; /* Score of best snippet */ int nColSize = 0; /* Total size of iBestCol in tokens */ Fts5SFinder sFinder; /* Used to find the beginnings of sentences */ int nCol; if( nVal!=5 ){ const char *zErr = "wrong number of arguments to function snippet()"; sqlite3_result_error(pCtx, zErr, -1); return; } nCol = pApi->xColumnCount(pFts); memset(&ctx, 0, sizeof(HighlightContext)); iCol = sqlite3_value_int(apVal[0]); ctx.zOpen = fts5ValueToText(apVal[1]); ctx.zClose = fts5ValueToText(apVal[2]); ctx.iRangeEnd = -1; zEllips = fts5ValueToText(apVal[3]); nToken = sqlite3_value_int(apVal[4]); iBestCol = (iCol>=0 ? iCol : 0); nPhrase = pApi->xPhraseCount(pFts); aSeen = sqlite3_malloc(nPhrase); if( aSeen==0 ){ rc = SQLITE_NOMEM; } if( rc==SQLITE_OK ){ rc = pApi->xInstCount(pFts, &nInst); } memset(&sFinder, 0, sizeof(Fts5SFinder)); for(i=0; ixColumnText(pFts, i, &sFinder.zDoc, &nDoc); if( rc!=SQLITE_OK ) break; rc = pApi->xTokenize(pFts, sFinder.zDoc, nDoc, (void*)&sFinder,fts5SentenceFinderCb ); if( rc!=SQLITE_OK ) break; rc = pApi->xColumnSize(pFts, i, &nDocsize); if( rc!=SQLITE_OK ) break; for(ii=0; rc==SQLITE_OK && iixInst(pFts, ii, &ip, &ic, &io); if( ic!=i ) continue; if( io>nDocsize ) rc = FTS5_CORRUPT; if( rc!=SQLITE_OK ) continue; memset(aSeen, 0, nPhrase); rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i, io, nToken, &nScore, &iAdj ); if( rc==SQLITE_OK && nScore>nBestScore ){ nBestScore = nScore; iBestCol = i; iBestStart = iAdj; nColSize = nDocsize; } if( rc==SQLITE_OK && sFinder.nFirst && nDocsize>nToken ){ for(jj=0; jj<(sFinder.nFirst-1); jj++){ if( sFinder.aFirst[jj+1]>io ) break; } if( sFinder.aFirst[jj]nBestScore ){ nBestScore = nScore; iBestCol = i; iBestStart = sFinder.aFirst[jj]; nColSize = nDocsize; } } } } } } if( rc==SQLITE_OK ){ rc = pApi->xColumnText(pFts, iBestCol, &ctx.zIn, &ctx.nIn); } if( rc==SQLITE_OK && nColSize==0 ){ rc = pApi->xColumnSize(pFts, iBestCol, &nColSize); } if( ctx.zIn ){ if( rc==SQLITE_OK ){ rc = fts5CInstIterInit(pApi, pFts, iBestCol, &ctx.iter); } ctx.iRangeStart = iBestStart; ctx.iRangeEnd = iBestStart + nToken - 1; if( iBestStart>0 ){ fts5HighlightAppend(&rc, &ctx, zEllips, -1); } /* Advance iterator ctx.iter so that it points to the first coalesced ** phrase instance at or following position iBestStart. */ while( ctx.iter.iStart>=0 && ctx.iter.iStartxTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb); } if( ctx.bOpen ){ fts5HighlightAppend(&rc, &ctx, ctx.zClose, -1); } if( ctx.iRangeEnd>=(nColSize-1) ){ fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff); }else{ fts5HighlightAppend(&rc, &ctx, zEllips, -1); } } if( rc==SQLITE_OK ){ sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT); }else{ sqlite3_result_error_code(pCtx, rc); } sqlite3_free(ctx.zOut); sqlite3_free(aSeen); sqlite3_free(sFinder.aFirst); } /************************************************************************/ /* ** The first time the bm25() function is called for a query, an instance ** of the following structure is allocated and populated. */ typedef struct Fts5Bm25Data Fts5Bm25Data; struct Fts5Bm25Data { int nPhrase; /* Number of phrases in query */ double avgdl; /* Average number of tokens in each row */ double *aIDF; /* IDF for each phrase */ double *aFreq; /* Array used to calculate phrase freq. */ }; /* ** Callback used by fts5Bm25GetData() to count the number of rows in the ** table matched by each individual phrase within the query. */ static int fts5CountCb( const Fts5ExtensionApi *pApi, Fts5Context *pFts, void *pUserData /* Pointer to sqlite3_int64 variable */ ){ sqlite3_int64 *pn = (sqlite3_int64*)pUserData; UNUSED_PARAM2(pApi, pFts); (*pn)++; return SQLITE_OK; } /* ** Set *ppData to point to the Fts5Bm25Data object for the current query. ** If the object has not already been allocated, allocate and populate it ** now. */ static int fts5Bm25GetData( const Fts5ExtensionApi *pApi, Fts5Context *pFts, Fts5Bm25Data **ppData /* OUT: bm25-data object for this query */ ){ int rc = SQLITE_OK; /* Return code */ Fts5Bm25Data *p; /* Object to return */ p = (Fts5Bm25Data*)pApi->xGetAuxdata(pFts, 0); if( p==0 ){ int nPhrase; /* Number of phrases in query */ sqlite3_int64 nRow = 0; /* Number of rows in table */ sqlite3_int64 nToken = 0; /* Number of tokens in table */ sqlite3_int64 nByte; /* Bytes of space to allocate */ int i; /* Allocate the Fts5Bm25Data object */ nPhrase = pApi->xPhraseCount(pFts); nByte = sizeof(Fts5Bm25Data) + nPhrase*2*sizeof(double); p = (Fts5Bm25Data*)sqlite3_malloc64(nByte); if( p==0 ){ rc = SQLITE_NOMEM; }else{ memset(p, 0, (size_t)nByte); p->nPhrase = nPhrase; p->aIDF = (double*)&p[1]; p->aFreq = &p->aIDF[nPhrase]; } /* Calculate the average document length for this FTS5 table */ if( rc==SQLITE_OK ) rc = pApi->xRowCount(pFts, &nRow); assert( rc!=SQLITE_OK || nRow>0 ); if( rc==SQLITE_OK ) rc = pApi->xColumnTotalSize(pFts, -1, &nToken); if( rc==SQLITE_OK ) p->avgdl = (double)nToken / (double)nRow; /* Calculate an IDF for each phrase in the query */ for(i=0; rc==SQLITE_OK && ixQueryPhrase(pFts, i, (void*)&nHit, fts5CountCb); if( rc==SQLITE_OK ){ /* Calculate the IDF (Inverse Document Frequency) for phrase i. ** This is done using the standard BM25 formula as found on wikipedia: ** ** IDF = log( (N - nHit + 0.5) / (nHit + 0.5) ) ** ** where "N" is the total number of documents in the set and nHit ** is the number that contain at least one instance of the phrase ** under consideration. ** ** The problem with this is that if (N < 2*nHit), the IDF is ** negative. Which is undesirable. So the mimimum allowable IDF is ** (1e-6) - roughly the same as a term that appears in just over ** half of set of 5,000,000 documents. */ double idf = log( (nRow - nHit + 0.5) / (nHit + 0.5) ); if( idf<=0.0 ) idf = 1e-6; p->aIDF[i] = idf; } } if( rc!=SQLITE_OK ){ sqlite3_free(p); }else{ rc = pApi->xSetAuxdata(pFts, p, sqlite3_free); } if( rc!=SQLITE_OK ) p = 0; } *ppData = p; return rc; } /* ** Implementation of bm25() function. */ static void fts5Bm25Function( const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ Fts5Context *pFts, /* First arg to pass to pApi functions */ sqlite3_context *pCtx, /* Context for returning result/error */ int nVal, /* Number of values in apVal[] array */ sqlite3_value **apVal /* Array of trailing arguments */ ){ const double k1 = 1.2; /* Constant "k1" from BM25 formula */ const double b = 0.75; /* Constant "b" from BM25 formula */ int rc; /* Error code */ double score = 0.0; /* SQL function return value */ Fts5Bm25Data *pData; /* Values allocated/calculated once only */ int i; /* Iterator variable */ int nInst = 0; /* Value returned by xInstCount() */ double D = 0.0; /* Total number of tokens in row */ double *aFreq = 0; /* Array of phrase freq. for current row */ /* Calculate the phrase frequency (symbol "f(qi,D)" in the documentation) ** for each phrase in the query for the current row. */ rc = fts5Bm25GetData(pApi, pFts, &pData); if( rc==SQLITE_OK ){ aFreq = pData->aFreq; memset(aFreq, 0, sizeof(double) * pData->nPhrase); rc = pApi->xInstCount(pFts, &nInst); } for(i=0; rc==SQLITE_OK && ixInst(pFts, i, &ip, &ic, &io); if( rc==SQLITE_OK ){ double w = (nVal > ic) ? sqlite3_value_double(apVal[ic]) : 1.0; aFreq[ip] += w; } } /* Figure out the total size of the current row in tokens. */ if( rc==SQLITE_OK ){ int nTok; rc = pApi->xColumnSize(pFts, -1, &nTok); D = (double)nTok; } /* Determine and return the BM25 score for the current row. Or, if an ** error has occurred, throw an exception. */ if( rc==SQLITE_OK ){ for(i=0; inPhrase; i++){ score += pData->aIDF[i] * ( ( aFreq[i] * (k1 + 1.0) ) / ( aFreq[i] + k1 * (1 - b + b * D / pData->avgdl) ) ); } sqlite3_result_double(pCtx, -1.0 * score); }else{ sqlite3_result_error_code(pCtx, rc); } } int sqlite3Fts5AuxInit(fts5_api *pApi){ struct Builtin { const char *zFunc; /* Function name (nul-terminated) */ void *pUserData; /* User-data pointer */ fts5_extension_function xFunc;/* Callback function */ void (*xDestroy)(void*); /* Destructor function */ } aBuiltin [] = { { "snippet", 0, fts5SnippetFunction, 0 }, { "highlight", 0, fts5HighlightFunction, 0 }, { "bm25", 0, fts5Bm25Function, 0 }, }; int rc = SQLITE_OK; /* Return code */ int i; /* To iterate through builtin functions */ for(i=0; rc==SQLITE_OK && ixCreateFunction(pApi, aBuiltin[i].zFunc, aBuiltin[i].pUserData, aBuiltin[i].xFunc, aBuiltin[i].xDestroy ); } return rc; }