diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-13 14:07:11 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-13 14:07:11 +0000 |
commit | 63847496f14c813a5d80efd5b7de0f1294ffe1e3 (patch) | |
tree | 01c7571c7c762ceee70638549a99834fdd7c411b /ext/fts5/fts5_aux.c | |
parent | Initial commit. (diff) | |
download | sqlite3-63847496f14c813a5d80efd5b7de0f1294ffe1e3.tar.xz sqlite3-63847496f14c813a5d80efd5b7de0f1294ffe1e3.zip |
Adding upstream version 3.45.1.upstream/3.45.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'ext/fts5/fts5_aux.c')
-rw-r--r-- | ext/fts5/fts5_aux.c | 753 |
1 files changed, 753 insertions, 0 deletions
diff --git a/ext/fts5/fts5_aux.c b/ext/fts5/fts5_aux.c new file mode 100644 index 0000000..30101fb --- /dev/null +++ b/ext/fts5/fts5_aux.c @@ -0,0 +1,753 @@ +/* +** 2014 May 31 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +****************************************************************************** +*/ + + +#include "fts5Int.h" +#include <math.h> /* amalgamator: keep */ + +/* +** Object used to iterate through all "coalesced phrase instances" in +** a single column of the current row. If the phrase instances in the +** column being considered do not overlap, this object simply iterates +** through them. Or, if they do overlap (share one or more tokens in +** common), each set of overlapping instances is treated as a single +** match. See documentation for the highlight() auxiliary function for +** details. +** +** Usage is: +** +** for(rc = fts5CInstIterNext(pApi, pFts, iCol, &iter); +** (rc==SQLITE_OK && 0==fts5CInstIterEof(&iter); +** rc = fts5CInstIterNext(&iter) +** ){ +** printf("instance starts at %d, ends at %d\n", iter.iStart, iter.iEnd); +** } +** +*/ +typedef struct CInstIter CInstIter; +struct CInstIter { + const Fts5ExtensionApi *pApi; /* API offered by current FTS version */ + Fts5Context *pFts; /* First arg to pass to pApi functions */ + int iCol; /* Column to search */ + int iInst; /* Next phrase instance index */ + int nInst; /* Total number of phrase instances */ + + /* Output variables */ + int iStart; /* First token in coalesced phrase instance */ + int iEnd; /* Last token in coalesced phrase instance */ +}; + +/* +** Advance the iterator to the next coalesced phrase instance. Return +** an SQLite error code if an error occurs, or SQLITE_OK otherwise. +*/ +static int fts5CInstIterNext(CInstIter *pIter){ + int rc = SQLITE_OK; + pIter->iStart = -1; + pIter->iEnd = -1; + + while( rc==SQLITE_OK && pIter->iInst<pIter->nInst ){ + int ip; int ic; int io; + rc = pIter->pApi->xInst(pIter->pFts, pIter->iInst, &ip, &ic, &io); + if( rc==SQLITE_OK ){ + if( ic==pIter->iCol ){ + int iEnd = io - 1 + pIter->pApi->xPhraseSize(pIter->pFts, ip); + if( pIter->iStart<0 ){ + pIter->iStart = io; + pIter->iEnd = iEnd; + }else if( io<=pIter->iEnd ){ + if( iEnd>pIter->iEnd ) pIter->iEnd = iEnd; + }else{ + break; + } + } + pIter->iInst++; + } + } + + return rc; +} + +/* +** Initialize the iterator object indicated by the final parameter to +** iterate through coalesced phrase instances in column iCol. +*/ +static int fts5CInstIterInit( + const Fts5ExtensionApi *pApi, + Fts5Context *pFts, + int iCol, + CInstIter *pIter +){ + int rc; + + memset(pIter, 0, sizeof(CInstIter)); + pIter->pApi = pApi; + pIter->pFts = pFts; + pIter->iCol = iCol; + rc = pApi->xInstCount(pFts, &pIter->nInst); + + if( rc==SQLITE_OK ){ + rc = fts5CInstIterNext(pIter); + } + + return rc; +} + + + +/************************************************************************* +** Start of highlight() implementation. +*/ +typedef struct HighlightContext HighlightContext; +struct HighlightContext { + /* Constant parameters to fts5HighlightCb() */ + int iRangeStart; /* First token to include */ + int iRangeEnd; /* If non-zero, last token to include */ + const char *zOpen; /* Opening highlight */ + const char *zClose; /* Closing highlight */ + const char *zIn; /* Input text */ + int nIn; /* Size of input text in bytes */ + + /* Variables modified by fts5HighlightCb() */ + CInstIter iter; /* Coalesced Instance Iterator */ + int iPos; /* Current token offset in zIn[] */ + int iOff; /* Have copied up to this offset in zIn[] */ + int bOpen; /* True if highlight is open */ + char *zOut; /* Output value */ +}; + +/* +** Append text to the HighlightContext output string - p->zOut. Argument +** z points to a buffer containing n bytes of text to append. If n is +** negative, everything up until the first '\0' is appended to the output. +** +** If *pRc is set to any value other than SQLITE_OK when this function is +** called, it is a no-op. If an error (i.e. an OOM condition) is encountered, +** *pRc is set to an error code before returning. +*/ +static void fts5HighlightAppend( + int *pRc, + HighlightContext *p, + const char *z, int n +){ + if( *pRc==SQLITE_OK && z ){ + if( n<0 ) n = (int)strlen(z); + p->zOut = sqlite3_mprintf("%z%.*s", p->zOut, n, z); + if( p->zOut==0 ) *pRc = SQLITE_NOMEM; + } +} + +/* +** Tokenizer callback used by implementation of highlight() function. +*/ +static int fts5HighlightCb( + void *pContext, /* Pointer to HighlightContext object */ + int tflags, /* Mask of FTS5_TOKEN_* flags */ + const char *pToken, /* Buffer containing token */ + int nToken, /* Size of token in bytes */ + int iStartOff, /* Start byte offset of token */ + int iEndOff /* End byte offset of token */ +){ + HighlightContext *p = (HighlightContext*)pContext; + int rc = SQLITE_OK; + int iPos; + + UNUSED_PARAM2(pToken, nToken); + + if( tflags & FTS5_TOKEN_COLOCATED ) return SQLITE_OK; + iPos = p->iPos++; + + if( p->iRangeEnd>=0 ){ + if( iPos<p->iRangeStart || iPos>p->iRangeEnd ) return SQLITE_OK; + if( p->iRangeStart && iPos==p->iRangeStart ) p->iOff = iStartOff; + } + + /* If the parenthesis is open, and this token is not part of the current + ** phrase, and the starting byte offset of this token is past the point + ** that has currently been copied into the output buffer, close the + ** parenthesis. */ + if( p->bOpen + && (iPos<=p->iter.iStart || p->iter.iStart<0) + && iStartOff>p->iOff + ){ + fts5HighlightAppend(&rc, p, p->zClose, -1); + p->bOpen = 0; + } + + /* If this is the start of a new phrase, and the highlight is not open: + ** + ** * copy text from the input up to the start of the phrase, and + ** * open the highlight. + */ + if( iPos==p->iter.iStart && p->bOpen==0 ){ + fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iStartOff - p->iOff); + fts5HighlightAppend(&rc, p, p->zOpen, -1); + p->iOff = iStartOff; + p->bOpen = 1; + } + + if( iPos==p->iter.iEnd ){ + if( p->bOpen==0 ){ + assert( p->iRangeEnd>=0 ); + fts5HighlightAppend(&rc, p, p->zOpen, -1); + p->bOpen = 1; + } + fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff); + p->iOff = iEndOff; + + if( rc==SQLITE_OK ){ + rc = fts5CInstIterNext(&p->iter); + } + } + + if( iPos==p->iRangeEnd ){ + if( p->bOpen ){ + if( p->iter.iStart>=0 && iPos>=p->iter.iStart ){ + fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff); + p->iOff = iEndOff; + } + fts5HighlightAppend(&rc, p, p->zClose, -1); + p->bOpen = 0; + } + fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff); + p->iOff = iEndOff; + } + + return rc; +} + +/* +** Implementation of highlight() function. +*/ +static void fts5HighlightFunction( + const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ + Fts5Context *pFts, /* First arg to pass to pApi functions */ + sqlite3_context *pCtx, /* Context for returning result/error */ + int nVal, /* Number of values in apVal[] array */ + sqlite3_value **apVal /* Array of trailing arguments */ +){ + HighlightContext ctx; + int rc; + int iCol; + + if( nVal!=3 ){ + const char *zErr = "wrong number of arguments to function highlight()"; + sqlite3_result_error(pCtx, zErr, -1); + return; + } + + iCol = sqlite3_value_int(apVal[0]); + memset(&ctx, 0, sizeof(HighlightContext)); + ctx.zOpen = (const char*)sqlite3_value_text(apVal[1]); + ctx.zClose = (const char*)sqlite3_value_text(apVal[2]); + ctx.iRangeEnd = -1; + rc = pApi->xColumnText(pFts, iCol, &ctx.zIn, &ctx.nIn); + if( rc==SQLITE_RANGE ){ + sqlite3_result_text(pCtx, "", -1, SQLITE_STATIC); + rc = SQLITE_OK; + }else if( ctx.zIn ){ + if( rc==SQLITE_OK ){ + rc = fts5CInstIterInit(pApi, pFts, iCol, &ctx.iter); + } + + if( rc==SQLITE_OK ){ + rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb); + } + if( ctx.bOpen ){ + fts5HighlightAppend(&rc, &ctx, ctx.zClose, -1); + } + fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff); + + if( rc==SQLITE_OK ){ + sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT); + } + sqlite3_free(ctx.zOut); + } + if( rc!=SQLITE_OK ){ + sqlite3_result_error_code(pCtx, rc); + } +} +/* +** End of highlight() implementation. +**************************************************************************/ + +/* +** Context object passed to the fts5SentenceFinderCb() function. +*/ +typedef struct Fts5SFinder Fts5SFinder; +struct Fts5SFinder { + int iPos; /* Current token position */ + int nFirstAlloc; /* Allocated size of aFirst[] */ + int nFirst; /* Number of entries in aFirst[] */ + int *aFirst; /* Array of first token in each sentence */ + const char *zDoc; /* Document being tokenized */ +}; + +/* +** Add an entry to the Fts5SFinder.aFirst[] array. Grow the array if +** necessary. Return SQLITE_OK if successful, or SQLITE_NOMEM if an +** error occurs. +*/ +static int fts5SentenceFinderAdd(Fts5SFinder *p, int iAdd){ + if( p->nFirstAlloc==p->nFirst ){ + int nNew = p->nFirstAlloc ? p->nFirstAlloc*2 : 64; + int *aNew; + + aNew = (int*)sqlite3_realloc64(p->aFirst, nNew*sizeof(int)); + if( aNew==0 ) return SQLITE_NOMEM; + p->aFirst = aNew; + p->nFirstAlloc = nNew; + } + p->aFirst[p->nFirst++] = iAdd; + return SQLITE_OK; +} + +/* +** This function is an xTokenize() callback used by the auxiliary snippet() +** function. Its job is to identify tokens that are the first in a sentence. +** For each such token, an entry is added to the SFinder.aFirst[] array. +*/ +static int fts5SentenceFinderCb( + void *pContext, /* Pointer to HighlightContext object */ + int tflags, /* Mask of FTS5_TOKEN_* flags */ + const char *pToken, /* Buffer containing token */ + int nToken, /* Size of token in bytes */ + int iStartOff, /* Start offset of token */ + int iEndOff /* End offset of token */ +){ + int rc = SQLITE_OK; + + UNUSED_PARAM2(pToken, nToken); + UNUSED_PARAM(iEndOff); + + if( (tflags & FTS5_TOKEN_COLOCATED)==0 ){ + Fts5SFinder *p = (Fts5SFinder*)pContext; + if( p->iPos>0 ){ + int i; + char c = 0; + for(i=iStartOff-1; i>=0; i--){ + c = p->zDoc[i]; + if( c!=' ' && c!='\t' && c!='\n' && c!='\r' ) break; + } + if( i!=iStartOff-1 && (c=='.' || c==':') ){ + rc = fts5SentenceFinderAdd(p, p->iPos); + } + }else{ + rc = fts5SentenceFinderAdd(p, 0); + } + p->iPos++; + } + return rc; +} + +static int fts5SnippetScore( + const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ + Fts5Context *pFts, /* First arg to pass to pApi functions */ + int nDocsize, /* Size of column in tokens */ + unsigned char *aSeen, /* Array with one element per query phrase */ + int iCol, /* Column to score */ + int iPos, /* Starting offset to score */ + int nToken, /* Max tokens per snippet */ + int *pnScore, /* OUT: Score */ + int *piPos /* OUT: Adjusted offset */ +){ + int rc; + int i; + int ip = 0; + int ic = 0; + int iOff = 0; + int iFirst = -1; + int nInst; + int nScore = 0; + int iLast = 0; + sqlite3_int64 iEnd = (sqlite3_int64)iPos + nToken; + + rc = pApi->xInstCount(pFts, &nInst); + for(i=0; i<nInst && rc==SQLITE_OK; i++){ + rc = pApi->xInst(pFts, i, &ip, &ic, &iOff); + if( rc==SQLITE_OK && ic==iCol && iOff>=iPos && iOff<iEnd ){ + nScore += (aSeen[ip] ? 1 : 1000); + aSeen[ip] = 1; + if( iFirst<0 ) iFirst = iOff; + iLast = iOff + pApi->xPhraseSize(pFts, ip); + } + } + + *pnScore = nScore; + if( piPos ){ + sqlite3_int64 iAdj = iFirst - (nToken - (iLast-iFirst)) / 2; + if( (iAdj+nToken)>nDocsize ) iAdj = nDocsize - nToken; + if( iAdj<0 ) iAdj = 0; + *piPos = (int)iAdj; + } + + return rc; +} + +/* +** Return the value in pVal interpreted as utf-8 text. Except, if pVal +** contains a NULL value, return a pointer to a static string zero +** bytes in length instead of a NULL pointer. +*/ +static const char *fts5ValueToText(sqlite3_value *pVal){ + const char *zRet = (const char*)sqlite3_value_text(pVal); + return zRet ? zRet : ""; +} + +/* +** Implementation of snippet() function. +*/ +static void fts5SnippetFunction( + const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ + Fts5Context *pFts, /* First arg to pass to pApi functions */ + sqlite3_context *pCtx, /* Context for returning result/error */ + int nVal, /* Number of values in apVal[] array */ + sqlite3_value **apVal /* Array of trailing arguments */ +){ + HighlightContext ctx; + int rc = SQLITE_OK; /* Return code */ + int iCol; /* 1st argument to snippet() */ + const char *zEllips; /* 4th argument to snippet() */ + int nToken; /* 5th argument to snippet() */ + int nInst = 0; /* Number of instance matches this row */ + int i; /* Used to iterate through instances */ + int nPhrase; /* Number of phrases in query */ + unsigned char *aSeen; /* Array of "seen instance" flags */ + int iBestCol; /* Column containing best snippet */ + int iBestStart = 0; /* First token of best snippet */ + int nBestScore = 0; /* Score of best snippet */ + int nColSize = 0; /* Total size of iBestCol in tokens */ + Fts5SFinder sFinder; /* Used to find the beginnings of sentences */ + int nCol; + + if( nVal!=5 ){ + const char *zErr = "wrong number of arguments to function snippet()"; + sqlite3_result_error(pCtx, zErr, -1); + return; + } + + nCol = pApi->xColumnCount(pFts); + memset(&ctx, 0, sizeof(HighlightContext)); + iCol = sqlite3_value_int(apVal[0]); + ctx.zOpen = fts5ValueToText(apVal[1]); + ctx.zClose = fts5ValueToText(apVal[2]); + ctx.iRangeEnd = -1; + zEllips = fts5ValueToText(apVal[3]); + nToken = sqlite3_value_int(apVal[4]); + + iBestCol = (iCol>=0 ? iCol : 0); + nPhrase = pApi->xPhraseCount(pFts); + aSeen = sqlite3_malloc(nPhrase); + if( aSeen==0 ){ + rc = SQLITE_NOMEM; + } + if( rc==SQLITE_OK ){ + rc = pApi->xInstCount(pFts, &nInst); + } + + memset(&sFinder, 0, sizeof(Fts5SFinder)); + for(i=0; i<nCol; i++){ + if( iCol<0 || iCol==i ){ + int nDoc; + int nDocsize; + int ii; + sFinder.iPos = 0; + sFinder.nFirst = 0; + rc = pApi->xColumnText(pFts, i, &sFinder.zDoc, &nDoc); + if( rc!=SQLITE_OK ) break; + rc = pApi->xTokenize(pFts, + sFinder.zDoc, nDoc, (void*)&sFinder,fts5SentenceFinderCb + ); + if( rc!=SQLITE_OK ) break; + rc = pApi->xColumnSize(pFts, i, &nDocsize); + if( rc!=SQLITE_OK ) break; + + for(ii=0; rc==SQLITE_OK && ii<nInst; ii++){ + int ip, ic, io; + int iAdj; + int nScore; + int jj; + + rc = pApi->xInst(pFts, ii, &ip, &ic, &io); + if( ic!=i ) continue; + if( io>nDocsize ) rc = FTS5_CORRUPT; + if( rc!=SQLITE_OK ) continue; + memset(aSeen, 0, nPhrase); + rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i, + io, nToken, &nScore, &iAdj + ); + if( rc==SQLITE_OK && nScore>nBestScore ){ + nBestScore = nScore; + iBestCol = i; + iBestStart = iAdj; + nColSize = nDocsize; + } + + if( rc==SQLITE_OK && sFinder.nFirst && nDocsize>nToken ){ + for(jj=0; jj<(sFinder.nFirst-1); jj++){ + if( sFinder.aFirst[jj+1]>io ) break; + } + + if( sFinder.aFirst[jj]<io ){ + memset(aSeen, 0, nPhrase); + rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i, + sFinder.aFirst[jj], nToken, &nScore, 0 + ); + + nScore += (sFinder.aFirst[jj]==0 ? 120 : 100); + if( rc==SQLITE_OK && nScore>nBestScore ){ + nBestScore = nScore; + iBestCol = i; + iBestStart = sFinder.aFirst[jj]; + nColSize = nDocsize; + } + } + } + } + } + } + + if( rc==SQLITE_OK ){ + rc = pApi->xColumnText(pFts, iBestCol, &ctx.zIn, &ctx.nIn); + } + if( rc==SQLITE_OK && nColSize==0 ){ + rc = pApi->xColumnSize(pFts, iBestCol, &nColSize); + } + if( ctx.zIn ){ + if( rc==SQLITE_OK ){ + rc = fts5CInstIterInit(pApi, pFts, iBestCol, &ctx.iter); + } + + ctx.iRangeStart = iBestStart; + ctx.iRangeEnd = iBestStart + nToken - 1; + + if( iBestStart>0 ){ + fts5HighlightAppend(&rc, &ctx, zEllips, -1); + } + + /* Advance iterator ctx.iter so that it points to the first coalesced + ** phrase instance at or following position iBestStart. */ + while( ctx.iter.iStart>=0 && ctx.iter.iStart<iBestStart && rc==SQLITE_OK ){ + rc = fts5CInstIterNext(&ctx.iter); + } + + if( rc==SQLITE_OK ){ + rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb); + } + if( ctx.bOpen ){ + fts5HighlightAppend(&rc, &ctx, ctx.zClose, -1); + } + if( ctx.iRangeEnd>=(nColSize-1) ){ + fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff); + }else{ + fts5HighlightAppend(&rc, &ctx, zEllips, -1); + } + } + if( rc==SQLITE_OK ){ + sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT); + }else{ + sqlite3_result_error_code(pCtx, rc); + } + sqlite3_free(ctx.zOut); + sqlite3_free(aSeen); + sqlite3_free(sFinder.aFirst); +} + +/************************************************************************/ + +/* +** The first time the bm25() function is called for a query, an instance +** of the following structure is allocated and populated. +*/ +typedef struct Fts5Bm25Data Fts5Bm25Data; +struct Fts5Bm25Data { + int nPhrase; /* Number of phrases in query */ + double avgdl; /* Average number of tokens in each row */ + double *aIDF; /* IDF for each phrase */ + double *aFreq; /* Array used to calculate phrase freq. */ +}; + +/* +** Callback used by fts5Bm25GetData() to count the number of rows in the +** table matched by each individual phrase within the query. +*/ +static int fts5CountCb( + const Fts5ExtensionApi *pApi, + Fts5Context *pFts, + void *pUserData /* Pointer to sqlite3_int64 variable */ +){ + sqlite3_int64 *pn = (sqlite3_int64*)pUserData; + UNUSED_PARAM2(pApi, pFts); + (*pn)++; + return SQLITE_OK; +} + +/* +** Set *ppData to point to the Fts5Bm25Data object for the current query. +** If the object has not already been allocated, allocate and populate it +** now. +*/ +static int fts5Bm25GetData( + const Fts5ExtensionApi *pApi, + Fts5Context *pFts, + Fts5Bm25Data **ppData /* OUT: bm25-data object for this query */ +){ + int rc = SQLITE_OK; /* Return code */ + Fts5Bm25Data *p; /* Object to return */ + + p = (Fts5Bm25Data*)pApi->xGetAuxdata(pFts, 0); + if( p==0 ){ + int nPhrase; /* Number of phrases in query */ + sqlite3_int64 nRow = 0; /* Number of rows in table */ + sqlite3_int64 nToken = 0; /* Number of tokens in table */ + sqlite3_int64 nByte; /* Bytes of space to allocate */ + int i; + + /* Allocate the Fts5Bm25Data object */ + nPhrase = pApi->xPhraseCount(pFts); + nByte = sizeof(Fts5Bm25Data) + nPhrase*2*sizeof(double); + p = (Fts5Bm25Data*)sqlite3_malloc64(nByte); + if( p==0 ){ + rc = SQLITE_NOMEM; + }else{ + memset(p, 0, (size_t)nByte); + p->nPhrase = nPhrase; + p->aIDF = (double*)&p[1]; + p->aFreq = &p->aIDF[nPhrase]; + } + + /* Calculate the average document length for this FTS5 table */ + if( rc==SQLITE_OK ) rc = pApi->xRowCount(pFts, &nRow); + assert( rc!=SQLITE_OK || nRow>0 ); + if( rc==SQLITE_OK ) rc = pApi->xColumnTotalSize(pFts, -1, &nToken); + if( rc==SQLITE_OK ) p->avgdl = (double)nToken / (double)nRow; + + /* Calculate an IDF for each phrase in the query */ + for(i=0; rc==SQLITE_OK && i<nPhrase; i++){ + sqlite3_int64 nHit = 0; + rc = pApi->xQueryPhrase(pFts, i, (void*)&nHit, fts5CountCb); + if( rc==SQLITE_OK ){ + /* Calculate the IDF (Inverse Document Frequency) for phrase i. + ** This is done using the standard BM25 formula as found on wikipedia: + ** + ** IDF = log( (N - nHit + 0.5) / (nHit + 0.5) ) + ** + ** where "N" is the total number of documents in the set and nHit + ** is the number that contain at least one instance of the phrase + ** under consideration. + ** + ** The problem with this is that if (N < 2*nHit), the IDF is + ** negative. Which is undesirable. So the mimimum allowable IDF is + ** (1e-6) - roughly the same as a term that appears in just over + ** half of set of 5,000,000 documents. */ + double idf = log( (nRow - nHit + 0.5) / (nHit + 0.5) ); + if( idf<=0.0 ) idf = 1e-6; + p->aIDF[i] = idf; + } + } + + if( rc!=SQLITE_OK ){ + sqlite3_free(p); + }else{ + rc = pApi->xSetAuxdata(pFts, p, sqlite3_free); + } + if( rc!=SQLITE_OK ) p = 0; + } + *ppData = p; + return rc; +} + +/* +** Implementation of bm25() function. +*/ +static void fts5Bm25Function( + const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ + Fts5Context *pFts, /* First arg to pass to pApi functions */ + sqlite3_context *pCtx, /* Context for returning result/error */ + int nVal, /* Number of values in apVal[] array */ + sqlite3_value **apVal /* Array of trailing arguments */ +){ + const double k1 = 1.2; /* Constant "k1" from BM25 formula */ + const double b = 0.75; /* Constant "b" from BM25 formula */ + int rc; /* Error code */ + double score = 0.0; /* SQL function return value */ + Fts5Bm25Data *pData; /* Values allocated/calculated once only */ + int i; /* Iterator variable */ + int nInst = 0; /* Value returned by xInstCount() */ + double D = 0.0; /* Total number of tokens in row */ + double *aFreq = 0; /* Array of phrase freq. for current row */ + + /* Calculate the phrase frequency (symbol "f(qi,D)" in the documentation) + ** for each phrase in the query for the current row. */ + rc = fts5Bm25GetData(pApi, pFts, &pData); + if( rc==SQLITE_OK ){ + aFreq = pData->aFreq; + memset(aFreq, 0, sizeof(double) * pData->nPhrase); + rc = pApi->xInstCount(pFts, &nInst); + } + for(i=0; rc==SQLITE_OK && i<nInst; i++){ + int ip; int ic; int io; + rc = pApi->xInst(pFts, i, &ip, &ic, &io); + if( rc==SQLITE_OK ){ + double w = (nVal > ic) ? sqlite3_value_double(apVal[ic]) : 1.0; + aFreq[ip] += w; + } + } + + /* Figure out the total size of the current row in tokens. */ + if( rc==SQLITE_OK ){ + int nTok; + rc = pApi->xColumnSize(pFts, -1, &nTok); + D = (double)nTok; + } + + /* Determine and return the BM25 score for the current row. Or, if an + ** error has occurred, throw an exception. */ + if( rc==SQLITE_OK ){ + for(i=0; i<pData->nPhrase; i++){ + score += pData->aIDF[i] * ( + ( aFreq[i] * (k1 + 1.0) ) / + ( aFreq[i] + k1 * (1 - b + b * D / pData->avgdl) ) + ); + } + sqlite3_result_double(pCtx, -1.0 * score); + }else{ + sqlite3_result_error_code(pCtx, rc); + } +} + +int sqlite3Fts5AuxInit(fts5_api *pApi){ + struct Builtin { + const char *zFunc; /* Function name (nul-terminated) */ + void *pUserData; /* User-data pointer */ + fts5_extension_function xFunc;/* Callback function */ + void (*xDestroy)(void*); /* Destructor function */ + } aBuiltin [] = { + { "snippet", 0, fts5SnippetFunction, 0 }, + { "highlight", 0, fts5HighlightFunction, 0 }, + { "bm25", 0, fts5Bm25Function, 0 }, + }; + int rc = SQLITE_OK; /* Return code */ + int i; /* To iterate through builtin functions */ + + for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){ + rc = pApi->xCreateFunction(pApi, + aBuiltin[i].zFunc, + aBuiltin[i].pUserData, + aBuiltin[i].xFunc, + aBuiltin[i].xDestroy + ); + } + + return rc; +} |