diff options
Diffstat (limited to 'ext/fts5/fts5_hash.c')
-rw-r--r-- | ext/fts5/fts5_hash.c | 560 |
1 files changed, 560 insertions, 0 deletions
diff --git a/ext/fts5/fts5_hash.c b/ext/fts5/fts5_hash.c new file mode 100644 index 0000000..bc9244f --- /dev/null +++ b/ext/fts5/fts5_hash.c @@ -0,0 +1,560 @@ +/* +** 2014 August 11 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +****************************************************************************** +** +*/ + + + +#include "fts5Int.h" + +typedef struct Fts5HashEntry Fts5HashEntry; + +/* +** This file contains the implementation of an in-memory hash table used +** to accumuluate "term -> doclist" content before it is flused to a level-0 +** segment. +*/ + + +struct Fts5Hash { + int eDetail; /* Copy of Fts5Config.eDetail */ + int *pnByte; /* Pointer to bytes counter */ + int nEntry; /* Number of entries currently in hash */ + int nSlot; /* Size of aSlot[] array */ + Fts5HashEntry *pScan; /* Current ordered scan item */ + Fts5HashEntry **aSlot; /* Array of hash slots */ +}; + +/* +** Each entry in the hash table is represented by an object of the +** following type. Each object, its key (a nul-terminated string) and +** its current data are stored in a single memory allocation. The +** key immediately follows the object in memory. The position list +** data immediately follows the key data in memory. +** +** The data that follows the key is in a similar, but not identical format +** to the doclist data stored in the database. It is: +** +** * Rowid, as a varint +** * Position list, without 0x00 terminator. +** * Size of previous position list and rowid, as a 4 byte +** big-endian integer. +** +** iRowidOff: +** Offset of last rowid written to data area. Relative to first byte of +** structure. +** +** nData: +** Bytes of data written since iRowidOff. +*/ +struct Fts5HashEntry { + Fts5HashEntry *pHashNext; /* Next hash entry with same hash-key */ + Fts5HashEntry *pScanNext; /* Next entry in sorted order */ + + int nAlloc; /* Total size of allocation */ + int iSzPoslist; /* Offset of space for 4-byte poslist size */ + int nData; /* Total bytes of data (incl. structure) */ + int nKey; /* Length of key in bytes */ + u8 bDel; /* Set delete-flag @ iSzPoslist */ + u8 bContent; /* Set content-flag (detail=none mode) */ + i16 iCol; /* Column of last value written */ + int iPos; /* Position of last value written */ + i64 iRowid; /* Rowid of last value written */ +}; + +/* +** Eqivalent to: +** +** char *fts5EntryKey(Fts5HashEntry *pEntry){ return zKey; } +*/ +#define fts5EntryKey(p) ( ((char *)(&(p)[1])) ) + + +/* +** Allocate a new hash table. +*/ +int sqlite3Fts5HashNew(Fts5Config *pConfig, Fts5Hash **ppNew, int *pnByte){ + int rc = SQLITE_OK; + Fts5Hash *pNew; + + *ppNew = pNew = (Fts5Hash*)sqlite3_malloc(sizeof(Fts5Hash)); + if( pNew==0 ){ + rc = SQLITE_NOMEM; + }else{ + sqlite3_int64 nByte; + memset(pNew, 0, sizeof(Fts5Hash)); + pNew->pnByte = pnByte; + pNew->eDetail = pConfig->eDetail; + + pNew->nSlot = 1024; + nByte = sizeof(Fts5HashEntry*) * pNew->nSlot; + pNew->aSlot = (Fts5HashEntry**)sqlite3_malloc64(nByte); + if( pNew->aSlot==0 ){ + sqlite3_free(pNew); + *ppNew = 0; + rc = SQLITE_NOMEM; + }else{ + memset(pNew->aSlot, 0, (size_t)nByte); + } + } + return rc; +} + +/* +** Free a hash table object. +*/ +void sqlite3Fts5HashFree(Fts5Hash *pHash){ + if( pHash ){ + sqlite3Fts5HashClear(pHash); + sqlite3_free(pHash->aSlot); + sqlite3_free(pHash); + } +} + +/* +** Empty (but do not delete) a hash table. +*/ +void sqlite3Fts5HashClear(Fts5Hash *pHash){ + int i; + for(i=0; i<pHash->nSlot; i++){ + Fts5HashEntry *pNext; + Fts5HashEntry *pSlot; + for(pSlot=pHash->aSlot[i]; pSlot; pSlot=pNext){ + pNext = pSlot->pHashNext; + sqlite3_free(pSlot); + } + } + memset(pHash->aSlot, 0, pHash->nSlot * sizeof(Fts5HashEntry*)); + pHash->nEntry = 0; +} + +static unsigned int fts5HashKey(int nSlot, const u8 *p, int n){ + int i; + unsigned int h = 13; + for(i=n-1; i>=0; i--){ + h = (h << 3) ^ h ^ p[i]; + } + return (h % nSlot); +} + +static unsigned int fts5HashKey2(int nSlot, u8 b, const u8 *p, int n){ + int i; + unsigned int h = 13; + for(i=n-1; i>=0; i--){ + h = (h << 3) ^ h ^ p[i]; + } + h = (h << 3) ^ h ^ b; + return (h % nSlot); +} + +/* +** Resize the hash table by doubling the number of slots. +*/ +static int fts5HashResize(Fts5Hash *pHash){ + int nNew = pHash->nSlot*2; + int i; + Fts5HashEntry **apNew; + Fts5HashEntry **apOld = pHash->aSlot; + + apNew = (Fts5HashEntry**)sqlite3_malloc64(nNew*sizeof(Fts5HashEntry*)); + if( !apNew ) return SQLITE_NOMEM; + memset(apNew, 0, nNew*sizeof(Fts5HashEntry*)); + + for(i=0; i<pHash->nSlot; i++){ + while( apOld[i] ){ + unsigned int iHash; + Fts5HashEntry *p = apOld[i]; + apOld[i] = p->pHashNext; + iHash = fts5HashKey(nNew, (u8*)fts5EntryKey(p), + (int)strlen(fts5EntryKey(p))); + p->pHashNext = apNew[iHash]; + apNew[iHash] = p; + } + } + + sqlite3_free(apOld); + pHash->nSlot = nNew; + pHash->aSlot = apNew; + return SQLITE_OK; +} + +static int fts5HashAddPoslistSize( + Fts5Hash *pHash, + Fts5HashEntry *p, + Fts5HashEntry *p2 +){ + int nRet = 0; + if( p->iSzPoslist ){ + u8 *pPtr = p2 ? (u8*)p2 : (u8*)p; + int nData = p->nData; + if( pHash->eDetail==FTS5_DETAIL_NONE ){ + assert( nData==p->iSzPoslist ); + if( p->bDel ){ + pPtr[nData++] = 0x00; + if( p->bContent ){ + pPtr[nData++] = 0x00; + } + } + }else{ + int nSz = (nData - p->iSzPoslist - 1); /* Size in bytes */ + int nPos = nSz*2 + p->bDel; /* Value of nPos field */ + + assert( p->bDel==0 || p->bDel==1 ); + if( nPos<=127 ){ + pPtr[p->iSzPoslist] = (u8)nPos; + }else{ + int nByte = sqlite3Fts5GetVarintLen((u32)nPos); + memmove(&pPtr[p->iSzPoslist + nByte], &pPtr[p->iSzPoslist + 1], nSz); + sqlite3Fts5PutVarint(&pPtr[p->iSzPoslist], nPos); + nData += (nByte-1); + } + } + + nRet = nData - p->nData; + if( p2==0 ){ + p->iSzPoslist = 0; + p->bDel = 0; + p->bContent = 0; + p->nData = nData; + } + } + return nRet; +} + +/* +** Add an entry to the in-memory hash table. The key is the concatenation +** of bByte and (pToken/nToken). The value is (iRowid/iCol/iPos). +** +** (bByte || pToken) -> (iRowid,iCol,iPos) +** +** Or, if iCol is negative, then the value is a delete marker. +*/ +int sqlite3Fts5HashWrite( + Fts5Hash *pHash, + i64 iRowid, /* Rowid for this entry */ + int iCol, /* Column token appears in (-ve -> delete) */ + int iPos, /* Position of token within column */ + char bByte, /* First byte of token */ + const char *pToken, int nToken /* Token to add or remove to or from index */ +){ + unsigned int iHash; + Fts5HashEntry *p; + u8 *pPtr; + int nIncr = 0; /* Amount to increment (*pHash->pnByte) by */ + int bNew; /* If non-delete entry should be written */ + + bNew = (pHash->eDetail==FTS5_DETAIL_FULL); + + /* Attempt to locate an existing hash entry */ + iHash = fts5HashKey2(pHash->nSlot, (u8)bByte, (const u8*)pToken, nToken); + for(p=pHash->aSlot[iHash]; p; p=p->pHashNext){ + char *zKey = fts5EntryKey(p); + if( zKey[0]==bByte + && p->nKey==nToken + && memcmp(&zKey[1], pToken, nToken)==0 + ){ + break; + } + } + + /* If an existing hash entry cannot be found, create a new one. */ + if( p==0 ){ + /* Figure out how much space to allocate */ + char *zKey; + sqlite3_int64 nByte = sizeof(Fts5HashEntry) + (nToken+1) + 1 + 64; + if( nByte<128 ) nByte = 128; + + /* Grow the Fts5Hash.aSlot[] array if necessary. */ + if( (pHash->nEntry*2)>=pHash->nSlot ){ + int rc = fts5HashResize(pHash); + if( rc!=SQLITE_OK ) return rc; + iHash = fts5HashKey2(pHash->nSlot, (u8)bByte, (const u8*)pToken, nToken); + } + + /* Allocate new Fts5HashEntry and add it to the hash table. */ + p = (Fts5HashEntry*)sqlite3_malloc64(nByte); + if( !p ) return SQLITE_NOMEM; + memset(p, 0, sizeof(Fts5HashEntry)); + p->nAlloc = (int)nByte; + zKey = fts5EntryKey(p); + zKey[0] = bByte; + memcpy(&zKey[1], pToken, nToken); + assert( iHash==fts5HashKey(pHash->nSlot, (u8*)zKey, nToken+1) ); + p->nKey = nToken; + zKey[nToken+1] = '\0'; + p->nData = nToken+1 + 1 + sizeof(Fts5HashEntry); + p->pHashNext = pHash->aSlot[iHash]; + pHash->aSlot[iHash] = p; + pHash->nEntry++; + + /* Add the first rowid field to the hash-entry */ + p->nData += sqlite3Fts5PutVarint(&((u8*)p)[p->nData], iRowid); + p->iRowid = iRowid; + + p->iSzPoslist = p->nData; + if( pHash->eDetail!=FTS5_DETAIL_NONE ){ + p->nData += 1; + p->iCol = (pHash->eDetail==FTS5_DETAIL_FULL ? 0 : -1); + } + + }else{ + + /* Appending to an existing hash-entry. Check that there is enough + ** space to append the largest possible new entry. Worst case scenario + ** is: + ** + ** + 9 bytes for a new rowid, + ** + 4 byte reserved for the "poslist size" varint. + ** + 1 byte for a "new column" byte, + ** + 3 bytes for a new column number (16-bit max) as a varint, + ** + 5 bytes for the new position offset (32-bit max). + */ + if( (p->nAlloc - p->nData) < (9 + 4 + 1 + 3 + 5) ){ + sqlite3_int64 nNew = p->nAlloc * 2; + Fts5HashEntry *pNew; + Fts5HashEntry **pp; + pNew = (Fts5HashEntry*)sqlite3_realloc64(p, nNew); + if( pNew==0 ) return SQLITE_NOMEM; + pNew->nAlloc = (int)nNew; + for(pp=&pHash->aSlot[iHash]; *pp!=p; pp=&(*pp)->pHashNext); + *pp = pNew; + p = pNew; + } + nIncr -= p->nData; + } + assert( (p->nAlloc - p->nData) >= (9 + 4 + 1 + 3 + 5) ); + + pPtr = (u8*)p; + + /* If this is a new rowid, append the 4-byte size field for the previous + ** entry, and the new rowid for this entry. */ + if( iRowid!=p->iRowid ){ + u64 iDiff = (u64)iRowid - (u64)p->iRowid; + fts5HashAddPoslistSize(pHash, p, 0); + p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iDiff); + p->iRowid = iRowid; + bNew = 1; + p->iSzPoslist = p->nData; + if( pHash->eDetail!=FTS5_DETAIL_NONE ){ + p->nData += 1; + p->iCol = (pHash->eDetail==FTS5_DETAIL_FULL ? 0 : -1); + p->iPos = 0; + } + } + + if( iCol>=0 ){ + if( pHash->eDetail==FTS5_DETAIL_NONE ){ + p->bContent = 1; + }else{ + /* Append a new column value, if necessary */ + assert_nc( iCol>=p->iCol ); + if( iCol!=p->iCol ){ + if( pHash->eDetail==FTS5_DETAIL_FULL ){ + pPtr[p->nData++] = 0x01; + p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iCol); + p->iCol = (i16)iCol; + p->iPos = 0; + }else{ + bNew = 1; + p->iCol = (i16)(iPos = iCol); + } + } + + /* Append the new position offset, if necessary */ + if( bNew ){ + p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iPos - p->iPos + 2); + p->iPos = iPos; + } + } + }else{ + /* This is a delete. Set the delete flag. */ + p->bDel = 1; + } + + nIncr += p->nData; + *pHash->pnByte += nIncr; + return SQLITE_OK; +} + + +/* +** Arguments pLeft and pRight point to linked-lists of hash-entry objects, +** each sorted in key order. This function merges the two lists into a +** single list and returns a pointer to its first element. +*/ +static Fts5HashEntry *fts5HashEntryMerge( + Fts5HashEntry *pLeft, + Fts5HashEntry *pRight +){ + Fts5HashEntry *p1 = pLeft; + Fts5HashEntry *p2 = pRight; + Fts5HashEntry *pRet = 0; + Fts5HashEntry **ppOut = &pRet; + + while( p1 || p2 ){ + if( p1==0 ){ + *ppOut = p2; + p2 = 0; + }else if( p2==0 ){ + *ppOut = p1; + p1 = 0; + }else{ + int i = 0; + char *zKey1 = fts5EntryKey(p1); + char *zKey2 = fts5EntryKey(p2); + while( zKey1[i]==zKey2[i] ) i++; + + if( ((u8)zKey1[i])>((u8)zKey2[i]) ){ + /* p2 is smaller */ + *ppOut = p2; + ppOut = &p2->pScanNext; + p2 = p2->pScanNext; + }else{ + /* p1 is smaller */ + *ppOut = p1; + ppOut = &p1->pScanNext; + p1 = p1->pScanNext; + } + *ppOut = 0; + } + } + + return pRet; +} + +/* +** Extract all tokens from hash table iHash and link them into a list +** in sorted order. The hash table is cleared before returning. It is +** the responsibility of the caller to free the elements of the returned +** list. +*/ +static int fts5HashEntrySort( + Fts5Hash *pHash, + const char *pTerm, int nTerm, /* Query prefix, if any */ + Fts5HashEntry **ppSorted +){ + const int nMergeSlot = 32; + Fts5HashEntry **ap; + Fts5HashEntry *pList; + int iSlot; + int i; + + *ppSorted = 0; + ap = sqlite3_malloc64(sizeof(Fts5HashEntry*) * nMergeSlot); + if( !ap ) return SQLITE_NOMEM; + memset(ap, 0, sizeof(Fts5HashEntry*) * nMergeSlot); + + for(iSlot=0; iSlot<pHash->nSlot; iSlot++){ + Fts5HashEntry *pIter; + for(pIter=pHash->aSlot[iSlot]; pIter; pIter=pIter->pHashNext){ + if( pTerm==0 + || (pIter->nKey+1>=nTerm && 0==memcmp(fts5EntryKey(pIter), pTerm, nTerm)) + ){ + Fts5HashEntry *pEntry = pIter; + pEntry->pScanNext = 0; + for(i=0; ap[i]; i++){ + pEntry = fts5HashEntryMerge(pEntry, ap[i]); + ap[i] = 0; + } + ap[i] = pEntry; + } + } + } + + pList = 0; + for(i=0; i<nMergeSlot; i++){ + pList = fts5HashEntryMerge(pList, ap[i]); + } + + pHash->nEntry = 0; + sqlite3_free(ap); + *ppSorted = pList; + return SQLITE_OK; +} + +/* +** Query the hash table for a doclist associated with term pTerm/nTerm. +*/ +int sqlite3Fts5HashQuery( + Fts5Hash *pHash, /* Hash table to query */ + int nPre, + const char *pTerm, int nTerm, /* Query term */ + void **ppOut, /* OUT: Pointer to new object */ + int *pnDoclist /* OUT: Size of doclist in bytes */ +){ + unsigned int iHash = fts5HashKey(pHash->nSlot, (const u8*)pTerm, nTerm); + char *zKey = 0; + Fts5HashEntry *p; + + for(p=pHash->aSlot[iHash]; p; p=p->pHashNext){ + zKey = fts5EntryKey(p); + assert( p->nKey+1==(int)strlen(zKey) ); + if( nTerm==p->nKey+1 && memcmp(zKey, pTerm, nTerm)==0 ) break; + } + + if( p ){ + int nHashPre = sizeof(Fts5HashEntry) + nTerm + 1; + int nList = p->nData - nHashPre; + u8 *pRet = (u8*)(*ppOut = sqlite3_malloc64(nPre + nList + 10)); + if( pRet ){ + Fts5HashEntry *pFaux = (Fts5HashEntry*)&pRet[nPre-nHashPre]; + memcpy(&pRet[nPre], &((u8*)p)[nHashPre], nList); + nList += fts5HashAddPoslistSize(pHash, p, pFaux); + *pnDoclist = nList; + }else{ + *pnDoclist = 0; + return SQLITE_NOMEM; + } + }else{ + *ppOut = 0; + *pnDoclist = 0; + } + + return SQLITE_OK; +} + +int sqlite3Fts5HashScanInit( + Fts5Hash *p, /* Hash table to query */ + const char *pTerm, int nTerm /* Query prefix */ +){ + return fts5HashEntrySort(p, pTerm, nTerm, &p->pScan); +} + +void sqlite3Fts5HashScanNext(Fts5Hash *p){ + assert( !sqlite3Fts5HashScanEof(p) ); + p->pScan = p->pScan->pScanNext; +} + +int sqlite3Fts5HashScanEof(Fts5Hash *p){ + return (p->pScan==0); +} + +void sqlite3Fts5HashScanEntry( + Fts5Hash *pHash, + const char **pzTerm, /* OUT: term (nul-terminated) */ + const u8 **ppDoclist, /* OUT: pointer to doclist */ + int *pnDoclist /* OUT: size of doclist in bytes */ +){ + Fts5HashEntry *p; + if( (p = pHash->pScan) ){ + char *zKey = fts5EntryKey(p); + int nTerm = (int)strlen(zKey); + fts5HashAddPoslistSize(pHash, p, 0); + *pzTerm = zKey; + *ppDoclist = (const u8*)&zKey[nTerm+1]; + *pnDoclist = p->nData - (sizeof(Fts5HashEntry) + nTerm + 1); + }else{ + *pzTerm = 0; + *ppDoclist = 0; + *pnDoclist = 0; + } +} |