1 files changed, 1976 insertions, 0 deletions
diff --git a/ext/lsm1/lsm_shared.c b/ext/lsm1/lsm_shared.c
new file mode 100644
index 0000000..2fdacf1
--- /dev/null
+++ b/ext/lsm1/lsm_shared.c
@@ -0,0 +1,1976 @@
+/*
+** 2012-01-23
+**
+** The author disclaims copyright to this source code.  In place of
+** a legal notice, here is a blessing:
+**
+**    May you do good and not evil.
+**    May you find forgiveness for yourself and forgive others.
+**    May you share freely, never taking more than you give.
+**
+*************************************************************************
+**
+** Utilities used to help multiple LSM clients to coexist within the
+** same process space.
+*/
+#include "lsmInt.h"
+
+/*
+** Global data. All global variables used by code in this file are grouped
+** into the following structure instance.
+**
+** pDatabase:
+**   Linked list of all Database objects allocated within this process.
+**   This list may not be traversed without holding the global mutex (see
+**   functions enterGlobalMutex() and leaveGlobalMutex()).
+*/
+static struct SharedData {
+  Database *pDatabase;            /* Linked list of all Database objects */
+} gShared;
+
+/*
+** Database structure. There is one such structure for each distinct 
+** database accessed by this process. They are stored in the singly linked 
+** list starting at global variable gShared.pDatabase. Database objects are 
+** reference counted. Once the number of connections to the associated
+** database drops to zero, they are removed from the linked list and deleted.
+**
+** pFile:
+**   In multi-process mode, this file descriptor is used to obtain locks 
+**   and to access shared-memory. In single process mode, its only job is
+**   to hold the exclusive lock on the file.
+**   
+*/
+struct Database {
+  /* Protected by the global mutex (enterGlobalMutex/leaveGlobalMutex): */
+  char *zName;                    /* Canonical path to database file */
+  int nName;                      /* strlen(zName) */
+  int nDbRef;                     /* Number of associated lsm_db handles */
+  Database *pDbNext;              /* Next Database structure in global list */
+
+  /* Protected by the local mutex (pClientMutex) */
+  int bReadonly;                  /* True if Database.pFile is read-only */
+  int bMultiProc;                 /* True if running in multi-process mode */
+  lsm_file *pFile;                /* Used for locks/shm in multi-proc mode */
+  LsmFile *pLsmFile;              /* List of deferred closes */
+  lsm_mutex *pClientMutex;        /* Protects the apShmChunk[] and pConn */
+  int nShmChunk;                  /* Number of entries in apShmChunk[] array */
+  void **apShmChunk;              /* Array of "shared" memory regions */
+  lsm_db *pConn;                  /* List of connections to this db. */
+};
+
+/*
+** Functions to enter and leave the global mutex. This mutex is used
+** to protect the global linked-list headed at gShared.pDatabase.
+*/
+static int enterGlobalMutex(lsm_env *pEnv){
+  lsm_mutex *p;
+  int rc = lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &p);
+  if( rc==LSM_OK ) lsmMutexEnter(pEnv, p);
+  return rc;
+}
+static void leaveGlobalMutex(lsm_env *pEnv){
+  lsm_mutex *p;
+  lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &p);
+  lsmMutexLeave(pEnv, p);
+}
+
+#ifdef LSM_DEBUG
+static int holdingGlobalMutex(lsm_env *pEnv){
+  lsm_mutex *p;
+  lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &p);
+  return lsmMutexHeld(pEnv, p);
+}
+#endif
+
+#if 0
+static void assertNotInFreelist(Freelist *p, int iBlk){
+  int i; 
+  for(i=0; i<p->nEntry; i++){
+    assert( p->aEntry[i].iBlk!=iBlk );
+  }
+}
+#else
+# define assertNotInFreelist(x,y)
+#endif
+
+/*
+** Append an entry to the free-list. If (iId==-1), this is a delete.
+*/
+int freelistAppend(lsm_db *db, u32 iBlk, i64 iId){
+  lsm_env *pEnv = db->pEnv;
+  Freelist *p;
+  int i; 
+
+  assert( iId==-1 || iId>=0 );
+  p = db->bUseFreelist ? db->pFreelist : &db->pWorker->freelist;
+
+  /* Extend the space allocated for the freelist, if required */
+  assert( p->nAlloc>=p->nEntry );
+  if( p->nAlloc==p->nEntry ){
+    int nNew; 
+    int nByte; 
+    FreelistEntry *aNew;
+
+    nNew = (p->nAlloc==0 ? 4 : p->nAlloc*2);
+    nByte = sizeof(FreelistEntry) * nNew;
+    aNew = (FreelistEntry *)lsmRealloc(pEnv, p->aEntry, nByte);
+    if( !aNew ) return LSM_NOMEM_BKPT;
+    p->nAlloc = nNew;
+    p->aEntry = aNew;
+  }
+
+  for(i=0; i<p->nEntry; i++){
+    assert( i==0 || p->aEntry[i].iBlk > p->aEntry[i-1].iBlk );
+    if( p->aEntry[i].iBlk>=iBlk ) break;
+  }
+
+  if( i<p->nEntry && p->aEntry[i].iBlk==iBlk ){
+    /* Clobber an existing entry */
+    p->aEntry[i].iId = iId;
+  }else{
+    /* Insert a new entry into the list */
+    int nByte = sizeof(FreelistEntry)*(p->nEntry-i);
+    memmove(&p->aEntry[i+1], &p->aEntry[i], nByte);
+    p->aEntry[i].iBlk = iBlk;
+    p->aEntry[i].iId = iId;
+    p->nEntry++;
+  }
+
+  return LSM_OK;
+}
+
+/*
+** This function frees all resources held by the Database structure passed
+** as the only argument.
+*/
+static void freeDatabase(lsm_env *pEnv, Database *p){
+  assert( holdingGlobalMutex(pEnv) );
+  if( p ){
+    /* Free the mutexes */
+    lsmMutexDel(pEnv, p->pClientMutex);
+
+    if( p->pFile ){
+      lsmEnvClose(pEnv, p->pFile);
+    }
+
+    /* Free the array of shm pointers */
+    lsmFree(pEnv, p->apShmChunk);
+
+    /* Free the memory allocated for the Database struct itself */
+    lsmFree(pEnv, p);
+  }
+}
+
+typedef struct DbTruncateCtx DbTruncateCtx;
+struct DbTruncateCtx {
+  int nBlock;
+  i64 iInUse;
+};
+
+static int dbTruncateCb(void *pCtx, int iBlk, i64 iSnapshot){
+  DbTruncateCtx *p = (DbTruncateCtx *)pCtx;
+  if( iBlk!=p->nBlock || (p->iInUse>=0 && iSnapshot>=p->iInUse) ) return 1;
+  p->nBlock--;
+  return 0;
+}
+
+static int dbTruncate(lsm_db *pDb, i64 iInUse){
+  int rc = LSM_OK;
+#if 0
+  int i;
+  DbTruncateCtx ctx;
+
+  assert( pDb->pWorker );
+  ctx.nBlock = pDb->pWorker->nBlock;
+  ctx.iInUse = iInUse;
+
+  rc = lsmWalkFreelist(pDb, 1, dbTruncateCb, (void *)&ctx);
+  for(i=ctx.nBlock+1; rc==LSM_OK && i<=pDb->pWorker->nBlock; i++){
+    rc = freelistAppend(pDb, i, -1);
+  }
+
+  if( rc==LSM_OK ){
+#ifdef LSM_LOG_FREELIST
+    if( ctx.nBlock!=pDb->pWorker->nBlock ){
+      lsmLogMessage(pDb, 0, 
+          "dbTruncate(): truncated db to %d blocks",ctx.nBlock
+      );
+    }
+#endif
+    pDb->pWorker->nBlock = ctx.nBlock;
+  }
+#endif
+  return rc;
+}
+
+
+/*
+** This function is called during database shutdown (when the number of
+** connections drops from one to zero). It truncates the database file
+** to as small a size as possible without truncating away any blocks that
+** contain data.
+*/
+static int dbTruncateFile(lsm_db *pDb){
+  int rc;
+
+  assert( pDb->pWorker==0 );
+  assert( lsmShmAssertLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL) );
+  rc = lsmCheckpointLoadWorker(pDb);
+
+  if( rc==LSM_OK ){
+    DbTruncateCtx ctx;
+
+    /* Walk the database free-block-list in reverse order. Set ctx.nBlock
+    ** to the block number of the last block in the database that actually
+    ** contains data. */
+    ctx.nBlock = pDb->pWorker->nBlock;
+    ctx.iInUse = -1;
+    rc = lsmWalkFreelist(pDb, 1, dbTruncateCb, (void *)&ctx);
+
+    /* If the last block that contains data is not already the last block in
+    ** the database file, truncate the database file so that it is. */
+    if( rc==LSM_OK ){
+      rc = lsmFsTruncateDb(
+          pDb->pFS, (i64)ctx.nBlock*lsmFsBlockSize(pDb->pFS)
+      );
+    }
+  }
+
+  lsmFreeSnapshot(pDb->pEnv, pDb->pWorker);
+  pDb->pWorker = 0;
+  return rc;
+}
+
+static void doDbDisconnect(lsm_db *pDb){
+  int rc;
+
+  if( pDb->bReadonly ){
+    lsmShmLock(pDb, LSM_LOCK_DMS3, LSM_LOCK_UNLOCK, 0);
+  }else{
+    /* Block for an exclusive lock on DMS1. This lock serializes all calls
+    ** to doDbConnect() and doDbDisconnect() across all processes.  */
+    rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1);
+    if( rc==LSM_OK ){
+
+      lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_UNLOCK, 0);
+
+      /* Try an exclusive lock on DMS2. If successful, this is the last
+      ** connection to the database. In this case flush the contents of the
+      ** in-memory tree to disk and write a checkpoint.  */
+      rc = lsmShmTestLock(pDb, LSM_LOCK_DMS2, 1, LSM_LOCK_EXCL);
+      if( rc==LSM_OK ){
+        rc = lsmShmTestLock(pDb, LSM_LOCK_CHECKPOINTER, 1, LSM_LOCK_EXCL);
+      }
+      if( rc==LSM_OK ){
+        int bReadonly = 0;        /* True if there exist read-only conns. */
+
+        /* Flush the in-memory tree, if required. If there is data to flush,
+        ** this will create a new client snapshot in Database.pClient. The
+        ** checkpoint (serialization) of this snapshot may be written to disk
+        ** by the following block.  
+        **
+        ** There is no need to take a WRITER lock here. That there are no 
+        ** other locks on DMS2 guarantees that there are no other read-write
+        ** connections at this time (and the lock on DMS1 guarantees that
+        ** no new ones may appear).
+        */
+        rc = lsmTreeLoadHeader(pDb, 0);
+        if( rc==LSM_OK && (lsmTreeHasOld(pDb) || lsmTreeSize(pDb)>0) ){
+          rc = lsmFlushTreeToDisk(pDb);
+        }
+
+        /* Now check if there are any read-only connections. If there are,
+        ** then do not truncate the db file or unlink the shared-memory 
+        ** region.  */
+        if( rc==LSM_OK ){
+          rc = lsmShmTestLock(pDb, LSM_LOCK_DMS3, 1, LSM_LOCK_EXCL);
+          if( rc==LSM_BUSY ){
+            bReadonly = 1;
+            rc = LSM_OK;
+          }
+        }
+
+        /* Write a checkpoint to disk. */
+        if( rc==LSM_OK ){
+          rc = lsmCheckpointWrite(pDb, 0);
+        }
+
+        /* If the checkpoint was written successfully, delete the log file
+        ** and, if possible, truncate the database file.  */
+        if( rc==LSM_OK ){
+          int bRotrans = 0;
+          Database *p = pDb->pDatabase;
+
+          /* The log file may only be deleted if there are no clients 
+          ** read-only clients running rotrans transactions.  */
+          rc = lsmDetectRoTrans(pDb, &bRotrans);
+          if( rc==LSM_OK && bRotrans==0 ){
+            lsmFsCloseAndDeleteLog(pDb->pFS);
+          }
+
+          /* The database may only be truncated if there exist no read-only
+          ** clients - either connected or running rotrans transactions. */
+          if( bReadonly==0 && bRotrans==0 ){
+            lsmFsUnmap(pDb->pFS);
+            dbTruncateFile(pDb);
+            if( p->pFile && p->bMultiProc ){
+              lsmEnvShmUnmap(pDb->pEnv, p->pFile, 1);
+            }
+          }
+        }
+      }
+    }
+
+    if( pDb->iRwclient>=0 ){
+      lsmShmLock(pDb, LSM_LOCK_RWCLIENT(pDb->iRwclient), LSM_LOCK_UNLOCK, 0);
+      pDb->iRwclient = -1;
+    }
+
+    lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
+  }
+  pDb->pShmhdr = 0;
+}
+
+static int doDbConnect(lsm_db *pDb){
+  const int nUsMax = 100000;      /* Max value for nUs */
+  int nUs = 1000;                 /* us to wait between DMS1 attempts */
+  int rc;
+
+  /* Obtain a pointer to the shared-memory header */
+  assert( pDb->pShmhdr==0 );
+  assert( pDb->bReadonly==0 );
+
+  /* Block for an exclusive lock on DMS1. This lock serializes all calls
+  ** to doDbConnect() and doDbDisconnect() across all processes.  */
+  while( 1 ){
+    rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1);
+    if( rc!=LSM_BUSY ) break;
+    lsmEnvSleep(pDb->pEnv, nUs);
+    nUs = nUs * 2;
+    if( nUs>nUsMax ) nUs = nUsMax;
+  }
+  if( rc==LSM_OK ){
+    rc = lsmShmCacheChunks(pDb, 1);
+  }
+  if( rc!=LSM_OK ) return rc;
+  pDb->pShmhdr = (ShmHeader *)pDb->apShm[0];
+
+  /* Try an exclusive lock on DMS2/DMS3. If successful, this is the first 
+  ** and only connection to the database. In this case initialize the 
+  ** shared-memory and run log file recovery.  */
+  assert( LSM_LOCK_DMS3==1+LSM_LOCK_DMS2 );
+  rc = lsmShmTestLock(pDb, LSM_LOCK_DMS2, 2, LSM_LOCK_EXCL);
+  if( rc==LSM_OK ){
+    memset(pDb->pShmhdr, 0, sizeof(ShmHeader));
+    rc = lsmCheckpointRecover(pDb);
+    if( rc==LSM_OK ){
+      rc = lsmLogRecover(pDb);
+    }
+    if( rc==LSM_OK ){
+      ShmHeader *pShm = pDb->pShmhdr;
+      pShm->aReader[0].iLsmId = lsmCheckpointId(pShm->aSnap1, 0);
+      pShm->aReader[0].iTreeId = pDb->treehdr.iUsedShmid;
+    }
+  }else if( rc==LSM_BUSY ){
+    rc = LSM_OK;
+  }
+
+  /* Take a shared lock on DMS2. In multi-process mode this lock "cannot" 
+  ** fail, as connections may only hold an exclusive lock on DMS2 if they 
+  ** first hold an exclusive lock on DMS1. And this connection is currently 
+  ** holding the exclusive lock on DSM1. 
+  **
+  ** However, if some other connection has the database open in single-process
+  ** mode, this operation will fail. In this case, return the error to the
+  ** caller - the attempt to connect to the db has failed.
+  */
+  if( rc==LSM_OK ){
+    rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_SHARED, 0);
+  }
+
+  /* If anything went wrong, unlock DMS2. Otherwise, try to take an exclusive
+  ** lock on one of the LSM_LOCK_RWCLIENT() locks. Unlock DMS1 in any case. */
+  if( rc!=LSM_OK ){
+    pDb->pShmhdr = 0;
+  }else{
+    int i;
+    for(i=0; i<LSM_LOCK_NRWCLIENT; i++){
+      int rc2 = lsmShmLock(pDb, LSM_LOCK_RWCLIENT(i), LSM_LOCK_EXCL, 0);
+      if( rc2==LSM_OK ) pDb->iRwclient = i;
+      if( rc2!=LSM_BUSY ){
+        rc = rc2;
+        break;
+      }
+    }
+  }
+  lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
+
+  return rc;
+}
+
+static int dbOpenSharedFd(lsm_env *pEnv, Database *p, int bRoOk){
+  int rc;
+
+  rc = lsmEnvOpen(pEnv, p->zName, 0, &p->pFile);
+  if( rc==LSM_IOERR && bRoOk ){
+    rc = lsmEnvOpen(pEnv, p->zName, LSM_OPEN_READONLY, &p->pFile);
+    p->bReadonly = 1;
+  }
+
+  return rc;
+}
+
+/*
+** Return a reference to the shared Database handle for the database 
+** identified by canonical path zName. If this is the first connection to
+** the named database, a new Database object is allocated. Otherwise, a
+** pointer to an existing object is returned.
+**
+** If successful, *ppDatabase is set to point to the shared Database 
+** structure and LSM_OK returned. Otherwise, *ppDatabase is set to NULL
+** and and LSM error code returned.
+**
+** Each successful call to this function should be (eventually) matched
+** by a call to lsmDbDatabaseRelease().
+*/
+int lsmDbDatabaseConnect(
+  lsm_db *pDb,                    /* Database handle */
+  const char *zName               /* Full-path to db file */
+){
+  lsm_env *pEnv = pDb->pEnv;
+  int rc;                         /* Return code */
+  Database *p = 0;                /* Pointer returned via *ppDatabase */
+  int nName = lsmStrlen(zName);
+
+  assert( pDb->pDatabase==0 );
+  rc = enterGlobalMutex(pEnv);
+  if( rc==LSM_OK ){
+
+    /* Search the global list for an existing object. TODO: Need something
+    ** better than the memcmp() below to figure out if a given Database
+    ** object represents the requested file.  */
+    for(p=gShared.pDatabase; p; p=p->pDbNext){
+      if( nName==p->nName && 0==memcmp(zName, p->zName, nName) ) break;
+    }
+
+    /* If no suitable Database object was found, allocate a new one. */
+    if( p==0 ){
+      p = (Database *)lsmMallocZeroRc(pEnv, sizeof(Database)+nName+1, &rc);
+
+      /* If the allocation was successful, fill in other fields and
+      ** allocate the client mutex. */ 
+      if( rc==LSM_OK ){
+        p->bMultiProc = pDb->bMultiProc;
+        p->zName = (char *)&p[1];
+        p->nName = nName;
+        memcpy((void *)p->zName, zName, nName+1);
+        rc = lsmMutexNew(pEnv, &p->pClientMutex);
+      }
+
+      /* If nothing has gone wrong so far, open the shared fd. And if that
+      ** succeeds and this connection requested single-process mode, 
+      ** attempt to take the exclusive lock on DMS2.  */
+      if( rc==LSM_OK ){
+        int bReadonly = (pDb->bReadonly && pDb->bMultiProc);
+        rc = dbOpenSharedFd(pDb->pEnv, p, bReadonly);
+      }
+
+      if( rc==LSM_OK && p->bMultiProc==0 ){
+        /* Hold an exclusive lock DMS1 while grabbing DMS2. This ensures
+        ** that any ongoing call to doDbDisconnect() (even one in another
+        ** process) is finished before proceeding.  */
+        assert( p->bReadonly==0 );
+        rc = lsmEnvLock(pDb->pEnv, p->pFile, LSM_LOCK_DMS1, LSM_LOCK_EXCL);
+        if( rc==LSM_OK ){
+          rc = lsmEnvLock(pDb->pEnv, p->pFile, LSM_LOCK_DMS2, LSM_LOCK_EXCL);
+          lsmEnvLock(pDb->pEnv, p->pFile, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK);
+        }
+      }
+
+      if( rc==LSM_OK ){
+        p->pDbNext = gShared.pDatabase;
+        gShared.pDatabase = p;
+      }else{
+        freeDatabase(pEnv, p);
+        p = 0;
+      }
+    }
+
+    if( p ){
+      p->nDbRef++;
+    }
+    leaveGlobalMutex(pEnv);
+
+    if( p ){
+      lsmMutexEnter(pDb->pEnv, p->pClientMutex);
+      pDb->pNext = p->pConn;
+      p->pConn = pDb;
+      lsmMutexLeave(pDb->pEnv, p->pClientMutex);
+    }
+  }
+
+  pDb->pDatabase = p;
+  if( rc==LSM_OK ){
+    assert( p );
+    rc = lsmFsOpen(pDb, zName, p->bReadonly);
+  }
+
+  /* If the db handle is read-write, then connect to the system now. Run
+  ** recovery as necessary. Or, if this is a read-only database handle,
+  ** defer attempting to connect to the system until a read-transaction
+  ** is opened.  */
+  if( rc==LSM_OK ){
+    rc = lsmFsConfigure(pDb);
+  }
+  if( rc==LSM_OK && pDb->bReadonly==0 ){
+    rc = doDbConnect(pDb);
+  }
+
+  return rc;
+}
+
+static void dbDeferClose(lsm_db *pDb){
+  if( pDb->pFS ){
+    LsmFile *pLsmFile;
+    Database *p = pDb->pDatabase;
+    pLsmFile = lsmFsDeferClose(pDb->pFS);
+    pLsmFile->pNext = p->pLsmFile;
+    p->pLsmFile = pLsmFile;
+  }
+}
+
+LsmFile *lsmDbRecycleFd(lsm_db *db){
+  LsmFile *pRet;
+  Database *p = db->pDatabase;
+  lsmMutexEnter(db->pEnv, p->pClientMutex);
+  if( (pRet = p->pLsmFile)!=0 ){
+    p->pLsmFile = pRet->pNext;
+  }
+  lsmMutexLeave(db->pEnv, p->pClientMutex);
+  return pRet;
+}
+
+/*
+** Release a reference to a Database object obtained from 
+** lsmDbDatabaseConnect(). There should be exactly one call to this function 
+** for each successful call to Find().
+*/
+void lsmDbDatabaseRelease(lsm_db *pDb){
+  Database *p = pDb->pDatabase;
+  if( p ){
+    lsm_db **ppDb;
+
+    if( pDb->pShmhdr ){
+      doDbDisconnect(pDb);
+    }
+
+    lsmFsUnmap(pDb->pFS);
+    lsmMutexEnter(pDb->pEnv, p->pClientMutex);
+    for(ppDb=&p->pConn; *ppDb!=pDb; ppDb=&((*ppDb)->pNext));
+    *ppDb = pDb->pNext;
+    dbDeferClose(pDb);
+    lsmMutexLeave(pDb->pEnv, p->pClientMutex);
+
+    enterGlobalMutex(pDb->pEnv);
+    p->nDbRef--;
+    if( p->nDbRef==0 ){
+      LsmFile *pIter;
+      LsmFile *pNext;
+      Database **pp;
+
+      /* Remove the Database structure from the linked list. */
+      for(pp=&gShared.pDatabase; *pp!=p; pp=&((*pp)->pDbNext));
+      *pp = p->pDbNext;
+
+      /* If they were allocated from the heap, free the shared memory chunks */
+      if( p->bMultiProc==0 ){
+        int i;
+        for(i=0; i<p->nShmChunk; i++){
+          lsmFree(pDb->pEnv, p->apShmChunk[i]);
+        }
+      }
+
+      /* Close any outstanding file descriptors */
+      for(pIter=p->pLsmFile; pIter; pIter=pNext){
+        pNext = pIter->pNext;
+        lsmEnvClose(pDb->pEnv, pIter->pFile);
+        lsmFree(pDb->pEnv, pIter);
+      }
+      freeDatabase(pDb->pEnv, p);
+    }
+    leaveGlobalMutex(pDb->pEnv);
+  }
+}
+
+Level *lsmDbSnapshotLevel(Snapshot *pSnapshot){
+  return pSnapshot->pLevel;
+}
+
+void lsmDbSnapshotSetLevel(Snapshot *pSnap, Level *pLevel){
+  pSnap->pLevel = pLevel;
+}
+
+/* TODO: Shuffle things around to get rid of this */
+static int firstSnapshotInUse(lsm_db *, i64 *);
+
+/* 
+** Context object used by the lsmWalkFreelist() utility. 
+*/
+typedef struct WalkFreelistCtx WalkFreelistCtx;
+struct WalkFreelistCtx {
+  lsm_db *pDb;
+  int bReverse;
+  Freelist *pFreelist;
+  int iFree;
+  int (*xUsr)(void *, int, i64);  /* User callback function */
+  void *pUsrctx;                  /* User callback context */
+  int bDone;                      /* Set to true after xUsr() returns true */
+};
+
+/* 
+** Callback used by lsmWalkFreelist().
+*/
+static int walkFreelistCb(void *pCtx, int iBlk, i64 iSnapshot){
+  WalkFreelistCtx *p = (WalkFreelistCtx *)pCtx;
+  const int iDir = (p->bReverse ? -1 : 1);
+  Freelist *pFree = p->pFreelist;
+
+  assert( p->bDone==0 );
+  assert( iBlk>=0 );
+  if( pFree ){
+    while( (p->iFree < pFree->nEntry) && p->iFree>=0 ){
+      FreelistEntry *pEntry = &pFree->aEntry[p->iFree];
+      if( (p->bReverse==0 && pEntry->iBlk>(u32)iBlk)
+       || (p->bReverse!=0 && pEntry->iBlk<(u32)iBlk)
+      ){
+        break;
+      }else{
+        p->iFree += iDir;
+        if( pEntry->iId>=0 
+            && p->xUsr(p->pUsrctx, pEntry->iBlk, pEntry->iId) 
+          ){
+          p->bDone = 1;
+          return 1;
+        }
+        if( pEntry->iBlk==(u32)iBlk ) return 0;
+      }
+    }
+  }
+
+  if( p->xUsr(p->pUsrctx, iBlk, iSnapshot) ){
+    p->bDone = 1;
+    return 1;
+  }
+  return 0;
+}
+
+/*
+** The database handle passed as the first argument must be the worker
+** connection. This function iterates through the contents of the current
+** free block list, invoking the supplied callback once for each list
+** element.
+**
+** The difference between this function and lsmSortedWalkFreelist() is
+** that lsmSortedWalkFreelist() only considers those free-list elements
+** stored within the LSM. This function also merges in any in-memory 
+** elements.
+*/
+int lsmWalkFreelist(
+  lsm_db *pDb,                    /* Database handle (must be worker) */
+  int bReverse,                   /* True to iterate from largest to smallest */
+  int (*x)(void *, int, i64),     /* Callback function */
+  void *pCtx                      /* First argument to pass to callback */
+){
+  const int iDir = (bReverse ? -1 : 1);
+  int rc;
+  int iCtx;
+
+  WalkFreelistCtx ctx[2];
+
+  ctx[0].pDb = pDb;
+  ctx[0].bReverse = bReverse;
+  ctx[0].pFreelist = &pDb->pWorker->freelist;
+  if( ctx[0].pFreelist && bReverse ){
+    ctx[0].iFree = ctx[0].pFreelist->nEntry-1;
+  }else{
+    ctx[0].iFree = 0;
+  }
+  ctx[0].xUsr = walkFreelistCb;
+  ctx[0].pUsrctx = (void *)&ctx[1];
+  ctx[0].bDone = 0;
+
+  ctx[1].pDb = pDb;
+  ctx[1].bReverse = bReverse;
+  ctx[1].pFreelist = pDb->pFreelist;
+  if( ctx[1].pFreelist && bReverse ){
+    ctx[1].iFree = ctx[1].pFreelist->nEntry-1;
+  }else{
+    ctx[1].iFree = 0;
+  }
+  ctx[1].xUsr = x;
+  ctx[1].pUsrctx = pCtx;
+  ctx[1].bDone = 0;
+
+  rc = lsmSortedWalkFreelist(pDb, bReverse, walkFreelistCb, (void *)&ctx[0]);
+
+  if( ctx[0].bDone==0 ){
+    for(iCtx=0; iCtx<2; iCtx++){
+      int i;
+      WalkFreelistCtx *p = &ctx[iCtx];
+      for(i=p->iFree; 
+          p->pFreelist && rc==LSM_OK && i<p->pFreelist->nEntry && i>=0;
+          i += iDir
+         ){
+        FreelistEntry *pEntry = &p->pFreelist->aEntry[i];
+        if( pEntry->iId>=0 && p->xUsr(p->pUsrctx, pEntry->iBlk, pEntry->iId) ){
+          return LSM_OK;
+        }
+      }
+    }
+  }
+
+  return rc;
+}
+
+
+typedef struct FindFreeblockCtx FindFreeblockCtx;
+struct FindFreeblockCtx {
+  i64 iInUse;
+  int iRet;
+  int bNotOne;
+};
+
+static int findFreeblockCb(void *pCtx, int iBlk, i64 iSnapshot){
+  FindFreeblockCtx *p = (FindFreeblockCtx *)pCtx;
+  if( iSnapshot<p->iInUse && (iBlk!=1 || p->bNotOne==0) ){
+    p->iRet = iBlk;
+    return 1;
+  }
+  return 0;
+}
+
+static int findFreeblock(lsm_db *pDb, i64 iInUse, int bNotOne, int *piRet){
+  int rc;                         /* Return code */
+  FindFreeblockCtx ctx;           /* Context object */
+
+  ctx.iInUse = iInUse;
+  ctx.iRet = 0;
+  ctx.bNotOne = bNotOne;
+  rc = lsmWalkFreelist(pDb, 0, findFreeblockCb, (void *)&ctx);
+  *piRet = ctx.iRet;
+
+  return rc;
+}
+
+/*
+** Allocate a new database file block to write data to, either by extending
+** the database file or by recycling a free-list entry. The worker snapshot 
+** must be held in order to call this function.
+**
+** If successful, *piBlk is set to the block number allocated and LSM_OK is
+** returned. Otherwise, *piBlk is zeroed and an lsm error code returned.
+*/
+int lsmBlockAllocate(lsm_db *pDb, int iBefore, int *piBlk){
+  Snapshot *p = pDb->pWorker;
+  int iRet = 0;                   /* Block number of allocated block */
+  int rc = LSM_OK;
+  i64 iInUse = 0;                 /* Snapshot id still in use */
+  i64 iSynced = 0;                /* Snapshot id synced to disk */
+
+  assert( p );
+
+#ifdef LSM_LOG_FREELIST
+  {
+    static int nCall = 0;
+    char *zFree = 0;
+    nCall++;
+    rc = lsmInfoFreelist(pDb, &zFree);
+    if( rc!=LSM_OK ) return rc;
+    lsmLogMessage(pDb, 0, "lsmBlockAllocate(): %d freelist: %s", nCall, zFree);
+    lsmFree(pDb->pEnv, zFree);
+  }
+#endif
+
+  /* Set iInUse to the smallest snapshot id that is either:
+  **
+  **   * Currently in use by a database client,
+  **   * May be used by a database client in the future, or
+  **   * Is the most recently checkpointed snapshot (i.e. the one that will
+  **     be used following recovery if a failure occurs at this point).
+  */
+  rc = lsmCheckpointSynced(pDb, &iSynced, 0, 0);
+  if( rc==LSM_OK && iSynced==0 ) iSynced = p->iId;
+  iInUse = iSynced;
+  if( rc==LSM_OK && pDb->iReader>=0 ){
+    assert( pDb->pClient );
+    iInUse = LSM_MIN(iInUse, pDb->pClient->iId);
+  }
+  if( rc==LSM_OK ) rc = firstSnapshotInUse(pDb, &iInUse);
+
+#ifdef LSM_LOG_FREELIST
+  {
+    lsmLogMessage(pDb, 0, "lsmBlockAllocate(): "
+        "snapshot-in-use: %lld (iSynced=%lld) (client-id=%lld)", 
+        iInUse, iSynced, (pDb->iReader>=0 ? pDb->pClient->iId : 0)
+    );
+  }
+#endif
+
+
+  /* Unless there exists a read-only transaction (which prevents us from
+  ** recycling any blocks regardless, query the free block list for a 
+  ** suitable block to reuse. 
+  **
+  ** It might seem more natural to check for a read-only transaction at
+  ** the start of this function. However, it is better do wait until after
+  ** the call to lsmCheckpointSynced() to do so.
+  */
+  if( rc==LSM_OK ){
+    int bRotrans;
+    rc = lsmDetectRoTrans(pDb, &bRotrans);
+
+    if( rc==LSM_OK && bRotrans==0 ){
+      rc = findFreeblock(pDb, iInUse, (iBefore>0), &iRet);
+    }
+  }
+
+  if( iBefore>0 && (iRet<=0 || iRet>=iBefore) ){
+    iRet = 0;
+
+  }else if( rc==LSM_OK ){
+    /* If a block was found in the free block list, use it and remove it from 
+    ** the list. Otherwise, if no suitable block was found, allocate one from
+    ** the end of the file.  */
+    if( iRet>0 ){
+#ifdef LSM_LOG_FREELIST
+      lsmLogMessage(pDb, 0, 
+          "reusing block %d (snapshot-in-use=%lld)", iRet, iInUse);
+#endif
+      rc = freelistAppend(pDb, iRet, -1);
+      if( rc==LSM_OK ){
+        rc = dbTruncate(pDb, iInUse);
+      }
+    }else{
+      iRet = ++(p->nBlock);
+#ifdef LSM_LOG_FREELIST
+      lsmLogMessage(pDb, 0, "extending file to %d blocks", iRet);
+#endif
+    }
+  }
+
+  assert( iBefore>0 || iRet>0 || rc!=LSM_OK );
+  *piBlk = iRet;
+  return rc;
+}
+
+/*
+** Free a database block. The worker snapshot must be held in order to call 
+** this function.
+**
+** If successful, LSM_OK is returned. Otherwise, an lsm error code (e.g. 
+** LSM_NOMEM).
+*/
+int lsmBlockFree(lsm_db *pDb, int iBlk){
+  Snapshot *p = pDb->pWorker;
+  assert( lsmShmAssertWorker(pDb) );
+
+#ifdef LSM_LOG_FREELIST
+  lsmLogMessage(pDb, LSM_OK, "lsmBlockFree(): Free block %d", iBlk);
+#endif
+
+  return freelistAppend(pDb, iBlk, p->iId);
+}
+
+/*
+** Refree a database block. The worker snapshot must be held in order to call 
+** this function.
+**
+** Refreeing is required when a block is allocated using lsmBlockAllocate()
+** but then not used. This function is used to push the block back onto
+** the freelist. Refreeing a block is different from freeing is, as a refreed
+** block may be reused immediately. Whereas a freed block can not be reused 
+** until (at least) after the next checkpoint.
+*/
+int lsmBlockRefree(lsm_db *pDb, int iBlk){
+  int rc = LSM_OK;                /* Return code */
+
+#ifdef LSM_LOG_FREELIST
+  lsmLogMessage(pDb, LSM_OK, "lsmBlockRefree(): Refree block %d", iBlk);
+#endif
+
+  rc = freelistAppend(pDb, iBlk, 0);
+  return rc;
+}
+
+/*
+** If required, copy a database checkpoint from shared memory into the
+** database itself.
+**
+** The WORKER lock must not be held when this is called. This is because
+** this function may indirectly call fsync(). And the WORKER lock should
+** not be held that long (in case it is required by a client flushing an
+** in-memory tree to disk).
+*/
+int lsmCheckpointWrite(lsm_db *pDb, u32 *pnWrite){
+  int rc;                         /* Return Code */
+  u32 nWrite = 0;
+
+  assert( pDb->pWorker==0 );
+  assert( 1 || pDb->pClient==0 );
+  assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK) );
+
+  rc = lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_EXCL, 0);
+  if( rc!=LSM_OK ) return rc;
+
+  rc = lsmCheckpointLoad(pDb, 0);
+  if( rc==LSM_OK ){
+    int nBlock = lsmCheckpointNBlock(pDb->aSnapshot);
+    ShmHeader *pShm = pDb->pShmhdr;
+    int bDone = 0;                /* True if checkpoint is already stored */
+
+    /* Check if this checkpoint has already been written to the database
+    ** file. If so, set variable bDone to true.  */
+    if( pShm->iMetaPage ){
+      MetaPage *pPg;              /* Meta page */
+      u8 *aData;                  /* Meta-page data buffer */
+      int nData;                  /* Size of aData[] in bytes */
+      i64 iCkpt;                  /* Id of checkpoint just loaded */
+      i64 iDisk = 0;              /* Id of checkpoint already stored in db */
+      iCkpt = lsmCheckpointId(pDb->aSnapshot, 0);
+      rc = lsmFsMetaPageGet(pDb->pFS, 0, pShm->iMetaPage, &pPg);
+      if( rc==LSM_OK ){
+        aData = lsmFsMetaPageData(pPg, &nData);
+        iDisk = lsmCheckpointId((u32 *)aData, 1);
+        nWrite = lsmCheckpointNWrite((u32 *)aData, 1);
+        lsmFsMetaPageRelease(pPg);
+      }
+      bDone = (iDisk>=iCkpt);
+    }
+
+    if( rc==LSM_OK && bDone==0 ){
+      int iMeta = (pShm->iMetaPage % 2) + 1;
+      if( pDb->eSafety!=LSM_SAFETY_OFF ){
+        rc = lsmFsSyncDb(pDb->pFS, nBlock);
+      }
+      if( rc==LSM_OK ) rc = lsmCheckpointStore(pDb, iMeta);
+      if( rc==LSM_OK && pDb->eSafety!=LSM_SAFETY_OFF){
+        rc = lsmFsSyncDb(pDb->pFS, 0);
+      }
+      if( rc==LSM_OK ){
+        pShm->iMetaPage = iMeta;
+        nWrite = lsmCheckpointNWrite(pDb->aSnapshot, 0) - nWrite;
+      }
+#ifdef LSM_LOG_WORK
+      lsmLogMessage(pDb, 0, "finish checkpoint %d", 
+          (int)lsmCheckpointId(pDb->aSnapshot, 0)
+      );
+#endif
+    }
+  }
+
+  lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_UNLOCK, 0);
+  if( pnWrite && rc==LSM_OK ) *pnWrite = nWrite;
+  return rc;
+}
+
+int lsmBeginWork(lsm_db *pDb){
+  int rc;
+
+  /* Attempt to take the WORKER lock */
+  rc = lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL, 0);
+
+  /* Deserialize the current worker snapshot */
+  if( rc==LSM_OK ){
+    rc = lsmCheckpointLoadWorker(pDb);
+  }
+  return rc;
+}
+
+void lsmFreeSnapshot(lsm_env *pEnv, Snapshot *p){
+  if( p ){
+    lsmSortedFreeLevel(pEnv, p->pLevel);
+    lsmFree(pEnv, p->freelist.aEntry);
+    lsmFree(pEnv, p->redirect.a);
+    lsmFree(pEnv, p);
+  }
+}
+
+/*
+** Attempt to populate one of the read-lock slots to contain lock values
+** iLsm/iShm. Or, if such a slot exists already, this function is a no-op.
+**
+** It is not an error if no slot can be populated because the write-lock
+** cannot be obtained. If any other error occurs, return an LSM error code.
+** Otherwise, LSM_OK.
+**
+** This function is called at various points to try to ensure that there
+** always exists at least one read-lock slot that can be used by a read-only
+** client. And so that, in the usual case, there is an "exact match" available
+** whenever a read transaction is opened by any client. At present this
+** function is called when:
+**
+**    * A write transaction that called lsmTreeDiscardOld() is committed, and
+**    * Whenever the working snapshot is updated (i.e. lsmFinishWork()).
+*/
+static int dbSetReadLock(lsm_db *db, i64 iLsm, u32 iShm){
+  int rc = LSM_OK;
+  ShmHeader *pShm = db->pShmhdr;
+  int i;
+
+  /* Check if there is already a slot containing the required values. */
+  for(i=0; i<LSM_LOCK_NREADER; i++){
+    ShmReader *p = &pShm->aReader[i];
+    if( p->iLsmId==iLsm && p->iTreeId==iShm ) return LSM_OK;
+  }
+
+  /* Iterate through all read-lock slots, attempting to take a write-lock
+  ** on each of them. If a write-lock succeeds, populate the locked slot
+  ** with the required values and break out of the loop.  */
+  for(i=0; rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
+    rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
+    if( rc==LSM_BUSY ){
+      rc = LSM_OK;
+    }else{
+      ShmReader *p = &pShm->aReader[i];
+      p->iLsmId = iLsm;
+      p->iTreeId = iShm;
+      lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0);
+      break;
+    }
+  }
+
+  return rc;
+}
+
+/*
+** Release the read-lock currently held by connection db.
+*/
+int dbReleaseReadlock(lsm_db *db){
+  int rc = LSM_OK;
+  if( db->iReader>=0 ){
+    rc = lsmShmLock(db, LSM_LOCK_READER(db->iReader), LSM_LOCK_UNLOCK, 0);
+    db->iReader = -1;
+  }
+  db->bRoTrans = 0;
+  return rc;
+}
+
+
+/*
+** Argument bFlush is true if the contents of the in-memory tree has just
+** been flushed to disk. The significance of this is that once the snapshot
+** created to hold the updated state of the database is synced to disk, log
+** file space can be recycled.
+*/
+void lsmFinishWork(lsm_db *pDb, int bFlush, int *pRc){
+  int rc = *pRc;
+  assert( rc!=0 || pDb->pWorker );
+  if( pDb->pWorker ){
+    /* If no error has occurred, serialize the worker snapshot and write
+    ** it to shared memory.  */
+    if( rc==LSM_OK ){
+      rc = lsmSaveWorker(pDb, bFlush);
+    }
+
+    /* Assuming no error has occurred, update a read lock slot with the
+    ** new snapshot id (see comments above function dbSetReadLock()).  */
+    if( rc==LSM_OK ){
+      if( pDb->iReader<0 ){
+        rc = lsmTreeLoadHeader(pDb, 0);
+      }
+      if( rc==LSM_OK ){
+        rc = dbSetReadLock(pDb, pDb->pWorker->iId, pDb->treehdr.iUsedShmid);
+      }
+    }
+
+    /* Free the snapshot object. */
+    lsmFreeSnapshot(pDb->pEnv, pDb->pWorker);
+    pDb->pWorker = 0;
+  }
+
+  lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK, 0);
+  *pRc = rc;
+}
+
+/*
+** Called when recovery is finished.
+*/
+int lsmFinishRecovery(lsm_db *pDb){
+  lsmTreeEndTransaction(pDb, 1);
+  return LSM_OK;
+}
+
+/*
+** Check if the currently configured compression functions
+** (LSM_CONFIG_SET_COMPRESSION) are compatible with a database that has its
+** compression id set to iReq. Compression routines are compatible if iReq
+** is zero (indicating the database is empty), or if it is equal to the 
+** compression id of the configured compression routines.
+**
+** If the check shows that the current compression are incompatible and there
+** is a compression factory registered, give it a chance to install new
+** compression routines.
+**
+** If, after any registered factory is invoked, the compression functions
+** are still incompatible, return LSM_MISMATCH. Otherwise, LSM_OK.
+*/
+int lsmCheckCompressionId(lsm_db *pDb, u32 iReq){
+  if( iReq!=LSM_COMPRESSION_EMPTY && pDb->compress.iId!=iReq ){
+    if( pDb->factory.xFactory ){
+      pDb->bInFactory = 1;
+      pDb->factory.xFactory(pDb->factory.pCtx, pDb, iReq);
+      pDb->bInFactory = 0;
+    }
+    if( pDb->compress.iId!=iReq ){
+      /* Incompatible */
+      return LSM_MISMATCH;
+    }
+  }
+  /* Compatible */
+  return LSM_OK;
+}
+
+/*
+** Begin a read transaction. This function is a no-op if the connection
+** passed as the only argument already has an open read transaction.
+*/
+int lsmBeginReadTrans(lsm_db *pDb){
+  const int MAX_READLOCK_ATTEMPTS = 10;
+  const int nMaxAttempt = (pDb->bRoTrans ? 1 : MAX_READLOCK_ATTEMPTS);
+
+  int rc = LSM_OK;                /* Return code */
+  int iAttempt = 0;
+
+  assert( pDb->pWorker==0 );
+
+  while( rc==LSM_OK && pDb->iReader<0 && (iAttempt++)<nMaxAttempt ){
+    int iTreehdr = 0;
+    int iSnap = 0;
+    assert( pDb->pCsr==0 && pDb->nTransOpen==0 );
+
+    /* Load the in-memory tree header. */
+    rc = lsmTreeLoadHeader(pDb, &iTreehdr);
+
+    /* Load the database snapshot */
+    if( rc==LSM_OK ){
+      if( lsmCheckpointClientCacheOk(pDb)==0 ){
+        lsmFreeSnapshot(pDb->pEnv, pDb->pClient);
+        pDb->pClient = 0;
+        lsmMCursorFreeCache(pDb);
+        lsmFsPurgeCache(pDb->pFS);
+        rc = lsmCheckpointLoad(pDb, &iSnap);
+      }else{
+        iSnap = 1;
+      }
+    }
+
+    /* Take a read-lock on the tree and snapshot just loaded. Then check
+    ** that the shared-memory still contains the same values. If so, proceed.
+    ** Otherwise, relinquish the read-lock and retry the whole procedure
+    ** (starting with loading the in-memory tree header).  */
+    if( rc==LSM_OK ){
+      u32 iShmMax = pDb->treehdr.iUsedShmid;
+      u32 iShmMin = pDb->treehdr.iNextShmid+1-LSM_MAX_SHMCHUNKS;
+      rc = lsmReadlock(
+          pDb, lsmCheckpointId(pDb->aSnapshot, 0), iShmMin, iShmMax
+      );
+      if( rc==LSM_OK ){
+        if( lsmTreeLoadHeaderOk(pDb, iTreehdr)
+         && lsmCheckpointLoadOk(pDb, iSnap)
+        ){
+          /* Read lock has been successfully obtained. Deserialize the 
+          ** checkpoint just loaded. TODO: This will be removed after 
+          ** lsm_sorted.c is changed to work directly from the serialized
+          ** version of the snapshot.  */
+          if( pDb->pClient==0 ){
+            rc = lsmCheckpointDeserialize(pDb, 0, pDb->aSnapshot,&pDb->pClient);
+          }
+          assert( (rc==LSM_OK)==(pDb->pClient!=0) );
+          assert( pDb->iReader>=0 );
+
+          /* Check that the client has the right compression hooks loaded.
+          ** If not, set rc to LSM_MISMATCH.  */
+          if( rc==LSM_OK ){
+            rc = lsmCheckCompressionId(pDb, pDb->pClient->iCmpId);
+          }
+        }else{
+          rc = dbReleaseReadlock(pDb);
+        }
+      }
+
+      if( rc==LSM_BUSY ){
+        rc = LSM_OK;
+      }
+    }
+#if 0
+if( rc==LSM_OK && pDb->pClient ){
+  fprintf(stderr, 
+      "reading %p: snapshot:%d used-shmid:%d trans-id:%d iOldShmid=%d\n",
+      (void *)pDb,
+      (int)pDb->pClient->iId, (int)pDb->treehdr.iUsedShmid, 
+      (int)pDb->treehdr.root.iTransId,
+      (int)pDb->treehdr.iOldShmid
+  );
+}
+#endif
+  }
+
+  if( rc==LSM_OK ){
+    rc = lsmShmCacheChunks(pDb, pDb->treehdr.nChunk);
+  }
+  if( rc!=LSM_OK ){
+    dbReleaseReadlock(pDb);
+  }
+  if( pDb->pClient==0 && rc==LSM_OK ) rc = LSM_BUSY;
+  return rc;
+}
+
+/*
+** This function is used by a read-write connection to determine if there
+** are currently one or more read-only transactions open on the database
+** (in this context a read-only transaction is one opened by a read-only
+** connection on a non-live database).
+**
+** If no error occurs, LSM_OK is returned and *pbExists is set to true if
+** some other connection has a read-only transaction open, or false 
+** otherwise. If an error occurs an LSM error code is returned and the final
+** value of *pbExist is undefined.
+*/
+int lsmDetectRoTrans(lsm_db *db, int *pbExist){
+  int rc;
+
+  /* Only a read-write connection may use this function. */
+  assert( db->bReadonly==0 );
+
+  rc = lsmShmTestLock(db, LSM_LOCK_ROTRANS, 1, LSM_LOCK_EXCL);
+  if( rc==LSM_BUSY ){
+    *pbExist = 1;
+    rc = LSM_OK;
+  }else{
+    *pbExist = 0;
+  }
+
+  return rc;
+}
+
+/*
+** db is a read-only database handle in the disconnected state. This function
+** attempts to open a read-transaction on the database. This may involve
+** connecting to the database system (opening shared memory etc.).
+*/
+int lsmBeginRoTrans(lsm_db *db){
+  int rc = LSM_OK;
+
+  assert( db->bReadonly && db->pShmhdr==0 );
+  assert( db->iReader<0 );
+
+  if( db->bRoTrans==0 ){
+
+    /* Attempt a shared-lock on DMS1. */
+    rc = lsmShmLock(db, LSM_LOCK_DMS1, LSM_LOCK_SHARED, 0);
+    if( rc!=LSM_OK ) return rc;
+
+    rc = lsmShmTestLock(
+        db, LSM_LOCK_RWCLIENT(0), LSM_LOCK_NREADER, LSM_LOCK_SHARED
+    );
+    if( rc==LSM_OK ){
+      /* System is not live. Take a SHARED lock on the ROTRANS byte and
+      ** release DMS1. Locking ROTRANS tells all read-write clients that they
+      ** may not recycle any disk space from within the database or log files,
+      ** as a read-only client may be using it.  */
+      rc = lsmShmLock(db, LSM_LOCK_ROTRANS, LSM_LOCK_SHARED, 0);
+      lsmShmLock(db, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
+
+      if( rc==LSM_OK ){
+        db->bRoTrans = 1;
+        rc = lsmShmCacheChunks(db, 1);
+        if( rc==LSM_OK ){
+          db->pShmhdr = (ShmHeader *)db->apShm[0];
+          memset(db->pShmhdr, 0, sizeof(ShmHeader));
+          rc = lsmCheckpointRecover(db);
+          if( rc==LSM_OK ){
+            rc = lsmLogRecover(db);
+          }
+        }
+      }
+    }else if( rc==LSM_BUSY ){
+      /* System is live! */
+      rc = lsmShmLock(db, LSM_LOCK_DMS3, LSM_LOCK_SHARED, 0);
+      lsmShmLock(db, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
+      if( rc==LSM_OK ){
+        rc = lsmShmCacheChunks(db, 1);
+        if( rc==LSM_OK ){
+          db->pShmhdr = (ShmHeader *)db->apShm[0];
+        }
+      }
+    }
+
+    if( rc==LSM_OK ){
+      rc = lsmBeginReadTrans(db);
+    }
+  }
+
+  return rc;
+}
+
+/*
+** Close the currently open read transaction.
+*/
+void lsmFinishReadTrans(lsm_db *pDb){
+
+  /* Worker connections should not be closing read transactions. And
+  ** read transactions should only be closed after all cursors and write
+  ** transactions have been closed. Finally pClient should be non-NULL
+  ** only iff pDb->iReader>=0.  */
+  assert( pDb->pWorker==0 );
+  assert( pDb->pCsr==0 && pDb->nTransOpen==0 );
+
+  if( pDb->bRoTrans ){
+    int i;
+    for(i=0; i<pDb->nShm; i++){
+      lsmFree(pDb->pEnv, pDb->apShm[i]);
+    }
+    lsmFree(pDb->pEnv, pDb->apShm);
+    pDb->apShm = 0;
+    pDb->nShm = 0;
+    pDb->pShmhdr = 0;
+
+    lsmShmLock(pDb, LSM_LOCK_ROTRANS, LSM_LOCK_UNLOCK, 0);
+  }
+  dbReleaseReadlock(pDb);
+}
+
+/*
+** Open a write transaction.
+*/
+int lsmBeginWriteTrans(lsm_db *pDb){
+  int rc = LSM_OK;                /* Return code */
+  ShmHeader *pShm = pDb->pShmhdr; /* Shared memory header */
+
+  assert( pDb->nTransOpen==0 );
+  assert( pDb->bDiscardOld==0 );
+  assert( pDb->bReadonly==0 );
+
+  /* If there is no read-transaction open, open one now. */
+  if( pDb->iReader<0 ){
+    rc = lsmBeginReadTrans(pDb);
+  }
+
+  /* Attempt to take the WRITER lock */
+  if( rc==LSM_OK ){
+    rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL, 0);
+  }
+
+  /* If the previous writer failed mid-transaction, run emergency rollback. */
+  if( rc==LSM_OK && pShm->bWriter ){
+    rc = lsmTreeRepair(pDb);
+    if( rc==LSM_OK ) pShm->bWriter = 0;
+  }
+
+  /* Check that this connection is currently reading from the most recent
+  ** version of the database. If not, return LSM_BUSY.  */
+  if( rc==LSM_OK && memcmp(&pShm->hdr1, &pDb->treehdr, sizeof(TreeHeader)) ){
+    rc = LSM_BUSY;
+  }
+
+  if( rc==LSM_OK ){
+    rc = lsmLogBegin(pDb);
+  }
+
+  /* If everything was successful, set the "transaction-in-progress" flag
+  ** and return LSM_OK. Otherwise, if some error occurred, relinquish the 
+  ** WRITER lock and return an error code.  */
+  if( rc==LSM_OK ){
+    TreeHeader *p = &pDb->treehdr;
+    pShm->bWriter = 1;
+    p->root.iTransId++;
+    if( lsmTreeHasOld(pDb) && p->iOldLog==pDb->pClient->iLogOff ){
+      lsmTreeDiscardOld(pDb);
+      pDb->bDiscardOld = 1;
+    }
+  }else{
+    lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0);
+    if( pDb->pCsr==0 ) lsmFinishReadTrans(pDb);
+  }
+  return rc;
+}
+
+/*
+** End the current write transaction. The connection is left with an open
+** read transaction. It is an error to call this if there is no open write 
+** transaction.
+**
+** If the transaction was committed, then a commit record has already been
+** written into the log file when this function is called. Or, if the
+** transaction was rolled back, both the log file and in-memory tree 
+** structure have already been restored. In either case, this function 
+** merely releases locks and other resources held by the write-transaction.
+**
+** LSM_OK is returned if successful, or an LSM error code otherwise.
+*/
+int lsmFinishWriteTrans(lsm_db *pDb, int bCommit){
+  int rc = LSM_OK;
+  int bFlush = 0;
+
+  lsmLogEnd(pDb, bCommit);
+  if( rc==LSM_OK && bCommit && lsmTreeSize(pDb)>pDb->nTreeLimit ){
+    bFlush = 1;
+    lsmTreeMakeOld(pDb);
+  }
+  lsmTreeEndTransaction(pDb, bCommit);
+
+  if( rc==LSM_OK ){
+    if( bFlush && pDb->bAutowork ){
+      rc = lsmSortedAutoWork(pDb, 1);
+    }else if( bCommit && pDb->bDiscardOld ){
+      rc = dbSetReadLock(pDb, pDb->pClient->iId, pDb->treehdr.iUsedShmid);
+    }
+  }
+  pDb->bDiscardOld = 0;
+  lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0);
+
+  if( bFlush && pDb->bAutowork==0 && pDb->xWork ){
+    pDb->xWork(pDb, pDb->pWorkCtx);
+  }
+  return rc;
+}
+
+
+/*
+** Return non-zero if the caller is holding the client mutex.
+*/
+#ifdef LSM_DEBUG
+int lsmHoldingClientMutex(lsm_db *pDb){
+  return lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pClientMutex);
+}
+#endif
+
+static int slotIsUsable(ShmReader *p, i64 iLsm, u32 iShmMin, u32 iShmMax){
+  return( 
+      p->iLsmId && p->iLsmId<=iLsm 
+      && shm_sequence_ge(iShmMax, p->iTreeId)
+      && shm_sequence_ge(p->iTreeId, iShmMin)
+  );
+}
+
+/*
+** Obtain a read-lock on database version identified by the combination
+** of snapshot iLsm and tree iTree. Return LSM_OK if successful, or
+** an LSM error code otherwise.
+*/
+int lsmReadlock(lsm_db *db, i64 iLsm, u32 iShmMin, u32 iShmMax){
+  int rc = LSM_OK;
+  ShmHeader *pShm = db->pShmhdr;
+  int i;
+
+  assert( db->iReader<0 );
+  assert( shm_sequence_ge(iShmMax, iShmMin) );
+
+  /* This is a no-op if the read-only transaction flag is set. */
+  if( db->bRoTrans ){
+    db->iReader = 0;
+    return LSM_OK;
+  }
+
+  /* Search for an exact match. */
+  for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
+    ShmReader *p = &pShm->aReader[i];
+    if( p->iLsmId==iLsm && p->iTreeId==iShmMax ){
+      rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
+      if( rc==LSM_OK && p->iLsmId==iLsm && p->iTreeId==iShmMax ){
+        db->iReader = i;
+      }else if( rc==LSM_BUSY ){
+        rc = LSM_OK;
+      }
+    }
+  }
+
+  /* Try to obtain a write-lock on each slot, in order. If successful, set
+  ** the slot values to iLsm/iTree.  */
+  for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
+    rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
+    if( rc==LSM_BUSY ){
+      rc = LSM_OK;
+    }else{
+      ShmReader *p = &pShm->aReader[i];
+      p->iLsmId = iLsm;
+      p->iTreeId = iShmMax;
+      rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
+      assert( rc!=LSM_BUSY );
+      if( rc==LSM_OK ) db->iReader = i;
+    }
+  }
+
+  /* Search for any usable slot */
+  for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
+    ShmReader *p = &pShm->aReader[i];
+    if( slotIsUsable(p, iLsm, iShmMin, iShmMax) ){
+      rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
+      if( rc==LSM_OK && slotIsUsable(p, iLsm, iShmMin, iShmMax) ){
+        db->iReader = i;
+      }else if( rc==LSM_BUSY ){
+        rc = LSM_OK;
+      }
+    }
+  }
+
+  if( rc==LSM_OK && db->iReader<0 ){
+    rc = LSM_BUSY;
+  }
+  return rc;
+}
+
+/*
+** This is used to check if there exists a read-lock locking a particular
+** version of either the in-memory tree or database file. 
+**
+** If iLsmId is non-zero, then it is a snapshot id. If there exists a 
+** read-lock using this snapshot or newer, set *pbInUse to true. Or,
+** if there is no such read-lock, set it to false.
+**
+** Or, if iLsmId is zero, then iShmid is a shared-memory sequence id.
+** Search for a read-lock using this sequence id or newer. etc.
+*/
+static int isInUse(lsm_db *db, i64 iLsmId, u32 iShmid, int *pbInUse){
+  ShmHeader *pShm = db->pShmhdr;
+  int i;
+  int rc = LSM_OK;
+
+  for(i=0; rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
+    ShmReader *p = &pShm->aReader[i];
+    if( p->iLsmId ){
+      if( (iLsmId!=0 && p->iLsmId!=0 && iLsmId>=p->iLsmId) 
+       || (iLsmId==0 && shm_sequence_ge(p->iTreeId, iShmid))
+      ){
+        rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
+        if( rc==LSM_OK ){
+          p->iLsmId = 0;
+          lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0);
+        }
+      }
+    }
+  }
+
+  if( rc==LSM_BUSY ){
+    *pbInUse = 1;
+    return LSM_OK;
+  }
+  *pbInUse = 0;
+  return rc;
+}
+
+/*
+** This function is called by worker connections to determine the smallest
+** snapshot id that is currently in use by a database client. The worker
+** connection uses this result to determine whether or not it is safe to
+** recycle a database block.
+*/
+static int firstSnapshotInUse(
+  lsm_db *db,                     /* Database handle */
+  i64 *piInUse                    /* IN/OUT: Smallest snapshot id in use */
+){
+  ShmHeader *pShm = db->pShmhdr;
+  i64 iInUse = *piInUse;
+  int i;
+
+  assert( iInUse>0 );
+  for(i=0; i<LSM_LOCK_NREADER; i++){
+    ShmReader *p = &pShm->aReader[i];
+    if( p->iLsmId ){
+      i64 iThis = p->iLsmId;
+      if( iThis!=0 && iInUse>iThis ){
+        int rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
+        if( rc==LSM_OK ){
+          p->iLsmId = 0;
+          lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0);
+        }else if( rc==LSM_BUSY ){
+          iInUse = iThis;
+        }else{
+          /* Some error other than LSM_BUSY. Return the error code to
+          ** the caller in this case.  */
+          return rc;
+        }
+      }
+    }
+  }
+
+  *piInUse = iInUse;
+  return LSM_OK;
+}
+
+int lsmTreeInUse(lsm_db *db, u32 iShmid, int *pbInUse){
+  if( db->treehdr.iUsedShmid==iShmid ){
+    *pbInUse = 1;
+    return LSM_OK;
+  }
+  return isInUse(db, 0, iShmid, pbInUse);
+}
+
+int lsmLsmInUse(lsm_db *db, i64 iLsmId, int *pbInUse){
+  if( db->pClient && db->pClient->iId<=iLsmId ){
+    *pbInUse = 1;
+    return LSM_OK;
+  }
+  return isInUse(db, iLsmId, 0, pbInUse);
+}
+
+/*
+** This function may only be called after a successful call to
+** lsmDbDatabaseConnect(). It returns true if the connection is in
+** multi-process mode, or false otherwise.
+*/
+int lsmDbMultiProc(lsm_db *pDb){
+  return pDb->pDatabase && pDb->pDatabase->bMultiProc;
+}
+
+
+/*************************************************************************
+**************************************************************************
+**************************************************************************
+**************************************************************************
+**************************************************************************
+*************************************************************************/
+
+/*
+** Ensure that database connection db has cached pointers to at least the 
+** first nChunk chunks of shared memory.
+*/
+int lsmShmCacheChunks(lsm_db *db, int nChunk){
+  int rc = LSM_OK;
+  if( nChunk>db->nShm ){
+    static const int NINCR = 16;
+    Database *p = db->pDatabase;
+    lsm_env *pEnv = db->pEnv;
+    int nAlloc;
+    int i;
+
+    /* Ensure that the db->apShm[] array is large enough. If an attempt to
+    ** allocate memory fails, return LSM_NOMEM immediately. The apShm[] array
+    ** is always extended in multiples of 16 entries - so the actual allocated
+    ** size can be inferred from nShm.  */ 
+    nAlloc = ((db->nShm + NINCR - 1) / NINCR) * NINCR;
+    while( nChunk>=nAlloc ){
+      void **apShm;
+      nAlloc += NINCR;
+      apShm = lsmRealloc(pEnv, db->apShm, sizeof(void*)*nAlloc);
+      if( !apShm ) return LSM_NOMEM_BKPT;
+      db->apShm = apShm;
+    }
+
+    if( db->bRoTrans ){
+      for(i=db->nShm; rc==LSM_OK && i<nChunk; i++){
+        db->apShm[i] = lsmMallocZeroRc(pEnv, LSM_SHM_CHUNK_SIZE, &rc);
+        db->nShm++;
+      }
+
+    }else{
+
+      /* Enter the client mutex */
+      lsmMutexEnter(pEnv, p->pClientMutex);
+
+      /* Extend the Database objects apShmChunk[] array if necessary. Using the
+       ** same pattern as for the lsm_db.apShm[] array above.  */
+      nAlloc = ((p->nShmChunk + NINCR - 1) / NINCR) * NINCR;
+      while( nChunk>=nAlloc ){
+        void **apShm;
+        nAlloc +=  NINCR;
+        apShm = lsmRealloc(pEnv, p->apShmChunk, sizeof(void*)*nAlloc);
+        if( !apShm ){
+          rc = LSM_NOMEM_BKPT;
+          break;
+        }
+        p->apShmChunk = apShm;
+      }
+
+      for(i=db->nShm; rc==LSM_OK && i<nChunk; i++){
+        if( i>=p->nShmChunk ){
+          void *pChunk = 0;
+          if( p->bMultiProc==0 ){
+            /* Single process mode */
+            pChunk = lsmMallocZeroRc(pEnv, LSM_SHM_CHUNK_SIZE, &rc);
+          }else{
+            /* Multi-process mode */
+            rc = lsmEnvShmMap(pEnv, p->pFile, i, LSM_SHM_CHUNK_SIZE, &pChunk);
+          }
+          if( rc==LSM_OK ){
+            p->apShmChunk[i] = pChunk;
+            p->nShmChunk++;
+          }
+        }
+        if( rc==LSM_OK ){
+          db->apShm[i] = p->apShmChunk[i];
+          db->nShm++;
+        }
+      }
+
+      /* Release the client mutex */
+      lsmMutexLeave(pEnv, p->pClientMutex);
+    }
+  }
+
+  return rc;
+}
+
+static int lockSharedFile(lsm_env *pEnv, Database *p, int iLock, int eOp){
+  int rc = LSM_OK;
+  if( p->bMultiProc ){
+    rc = lsmEnvLock(pEnv, p->pFile, iLock, eOp);
+  }
+  return rc;
+}
+
+/*
+** Test if it would be possible for connection db to obtain a lock of type
+** eType on the nLock locks starting at iLock. If so, return LSM_OK. If it
+** would not be possible to obtain the lock due to a lock held by another
+** connection, return LSM_BUSY. If an IO or other error occurs (i.e. in the 
+** lsm_env.xTestLock function), return some other LSM error code.
+**
+** Note that this function never actually locks the database - it merely
+** queries the system to see if there exists a lock that would prevent
+** it from doing so.
+*/
+int lsmShmTestLock(
+  lsm_db *db,
+  int iLock,
+  int nLock,
+  int eOp
+){
+  int rc = LSM_OK;
+  lsm_db *pIter;
+  Database *p = db->pDatabase;
+  int i;
+  u64 mask = 0;
+
+  for(i=iLock; i<(iLock+nLock); i++){
+    mask |= ((u64)1 << (iLock-1));
+    if( eOp==LSM_LOCK_EXCL ) mask |= ((u64)1 << (iLock+32-1));
+  }
+
+  lsmMutexEnter(db->pEnv, p->pClientMutex);
+  for(pIter=p->pConn; pIter; pIter=pIter->pNext){
+    if( pIter!=db && (pIter->mLock & mask) ){
+      assert( pIter!=db );
+      break;
+    }
+  }
+
+  if( pIter ){
+    rc = LSM_BUSY;
+  }else if( p->bMultiProc ){
+    rc = lsmEnvTestLock(db->pEnv, p->pFile, iLock, nLock, eOp);
+  }
+
+  lsmMutexLeave(db->pEnv, p->pClientMutex);
+  return rc;
+}
+
+/*
+** Attempt to obtain the lock identified by the iLock and bExcl parameters.
+** If successful, return LSM_OK. If the lock cannot be obtained because 
+** there exists some other conflicting lock, return LSM_BUSY. If some other
+** error occurs, return an LSM error code.
+**
+** Parameter iLock must be one of LSM_LOCK_WRITER, WORKER or CHECKPOINTER,
+** or else a value returned by the LSM_LOCK_READER macro.
+*/
+int lsmShmLock(
+  lsm_db *db, 
+  int iLock,
+  int eOp,                        /* One of LSM_LOCK_UNLOCK, SHARED or EXCL */
+  int bBlock                      /* True for a blocking lock */
+){
+  lsm_db *pIter;
+  const u64 me = ((u64)1 << (iLock-1));
+  const u64 ms = ((u64)1 << (iLock+32-1));
+  int rc = LSM_OK;
+  Database *p = db->pDatabase;
+
+  assert( eOp!=LSM_LOCK_EXCL || p->bReadonly==0 );
+  assert( iLock>=1 && iLock<=LSM_LOCK_RWCLIENT(LSM_LOCK_NRWCLIENT-1) );
+  assert( LSM_LOCK_RWCLIENT(LSM_LOCK_NRWCLIENT-1)<=32 );
+  assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL );
+
+  /* Check for a no-op. Proceed only if this is not one of those. */
+  if( (eOp==LSM_LOCK_UNLOCK && (db->mLock & (me|ms))!=0)
+   || (eOp==LSM_LOCK_SHARED && (db->mLock & (me|ms))!=ms)
+   || (eOp==LSM_LOCK_EXCL   && (db->mLock & me)==0)
+  ){
+    int nExcl = 0;                /* Number of connections holding EXCLUSIVE */
+    int nShared = 0;              /* Number of connections holding SHARED */
+    lsmMutexEnter(db->pEnv, p->pClientMutex);
+
+    /* Figure out the locks currently held by this process on iLock, not
+    ** including any held by connection db.  */
+    for(pIter=p->pConn; pIter; pIter=pIter->pNext){
+      assert( (pIter->mLock & me)==0 || (pIter->mLock & ms)!=0 );
+      if( pIter!=db ){
+        if( pIter->mLock & me ){
+          nExcl++;
+        }else if( pIter->mLock & ms ){
+          nShared++;
+        }
+      }
+    }
+    assert( nExcl==0 || nExcl==1 );
+    assert( nExcl==0 || nShared==0 );
+    assert( nExcl==0 || (db->mLock & (me|ms))==0 );
+
+    switch( eOp ){
+      case LSM_LOCK_UNLOCK:
+        if( nShared==0 ){
+          lockSharedFile(db->pEnv, p, iLock, LSM_LOCK_UNLOCK);
+        }
+        db->mLock &= ~(me|ms);
+        break;
+
+      case LSM_LOCK_SHARED:
+        if( nExcl ){
+          rc = LSM_BUSY;
+        }else{
+          if( nShared==0 ){
+            rc = lockSharedFile(db->pEnv, p, iLock, LSM_LOCK_SHARED);
+          }
+          if( rc==LSM_OK ){
+            db->mLock |= ms;
+            db->mLock &= ~me;
+          }
+        }
+        break;
+
+      default:
+        assert( eOp==LSM_LOCK_EXCL );
+        if( nExcl || nShared ){
+          rc = LSM_BUSY;
+        }else{
+          rc = lockSharedFile(db->pEnv, p, iLock, LSM_LOCK_EXCL);
+          if( rc==LSM_OK ){
+            db->mLock |= (me|ms);
+          }
+        }
+        break;
+    }
+
+    lsmMutexLeave(db->pEnv, p->pClientMutex);
+  }
+
+  return rc;
+}
+
+#ifdef LSM_DEBUG
+
+int shmLockType(lsm_db *db, int iLock){
+  const u64 me = ((u64)1 << (iLock-1));
+  const u64 ms = ((u64)1 << (iLock+32-1));
+
+  if( db->mLock & me ) return LSM_LOCK_EXCL;
+  if( db->mLock & ms ) return LSM_LOCK_SHARED;
+  return LSM_LOCK_UNLOCK;
+}
+
+/*
+** The arguments passed to this function are similar to those passed to
+** the lsmShmLock() function. However, instead of obtaining a new lock 
+** this function returns true if the specified connection already holds 
+** (or does not hold) such a lock, depending on the value of eOp. As
+** follows:
+**
+**   (eOp==LSM_LOCK_UNLOCK) -> true if db has no lock on iLock
+**   (eOp==LSM_LOCK_SHARED) -> true if db has at least a SHARED lock on iLock.
+**   (eOp==LSM_LOCK_EXCL)   -> true if db has an EXCLUSIVE lock on iLock.
+*/
+int lsmShmAssertLock(lsm_db *db, int iLock, int eOp){
+  int ret = 0;
+  int eHave;
+
+  assert( iLock>=1 && iLock<=LSM_LOCK_READER(LSM_LOCK_NREADER-1) );
+  assert( iLock<=16 );
+  assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL );
+
+  eHave = shmLockType(db, iLock);
+
+  switch( eOp ){
+    case LSM_LOCK_UNLOCK:
+      ret = (eHave==LSM_LOCK_UNLOCK);
+      break;
+    case LSM_LOCK_SHARED:
+      ret = (eHave!=LSM_LOCK_UNLOCK);
+      break;
+    case LSM_LOCK_EXCL:
+      ret = (eHave==LSM_LOCK_EXCL);
+      break;
+    default:
+      assert( !"bad eOp value passed to lsmShmAssertLock()" );
+      break;
+  }
+
+  return ret;
+}
+
+int lsmShmAssertWorker(lsm_db *db){
+  return lsmShmAssertLock(db, LSM_LOCK_WORKER, LSM_LOCK_EXCL) && db->pWorker;
+}
+
+/*
+** This function does not contribute to library functionality, and is not
+** included in release builds. It is intended to be called from within
+** an interactive debugger.
+**
+** When called, this function prints a single line of human readable output
+** to stdout describing the locks currently held by the connection. For 
+** example:
+**
+**     (gdb) call print_db_locks(pDb)
+**     (shared on dms2) (exclusive on writer) 
+*/
+void print_db_locks(lsm_db *db){
+  int iLock;
+  for(iLock=0; iLock<16; iLock++){
+    int bOne = 0;
+    const char *azLock[] = {0, "shared", "exclusive"};
+    const char *azName[] = {
+      0, "dms1", "dms2", "writer", "worker", "checkpointer",
+      "reader0", "reader1", "reader2", "reader3", "reader4", "reader5"
+    };
+    int eHave = shmLockType(db, iLock);
+    if( azLock[eHave] ){
+      printf("%s(%s on %s)", (bOne?" ":""), azLock[eHave], azName[iLock]);
+      bOne = 1;
+    }
+  }
+  printf("\n");
+}
+void print_all_db_locks(lsm_db *db){
+  lsm_db *p;
+  for(p=db->pDatabase->pConn; p; p=p->pNext){
+    printf("%s connection %p ", ((p==db)?"*":""), p);
+    print_db_locks(p);
+  }
+}
+#endif
+
+void lsmShmBarrier(lsm_db *db){
+  lsmEnvShmBarrier(db->pEnv);
+}
+
+int lsm_checkpoint(lsm_db *pDb, int *pnKB){
+  int rc;                         /* Return code */
+  u32 nWrite = 0;                 /* Number of pages checkpointed */
+
+  /* Attempt the checkpoint. If successful, nWrite is set to the number of
+  ** pages written between this and the previous checkpoint.  */
+  rc = lsmCheckpointWrite(pDb, &nWrite);
+
+  /* If required, calculate the output variable (KB of data checkpointed). 
+  ** Set it to zero if an error occured.  */
+  if( pnKB ){
+    int nKB = 0;
+    if( rc==LSM_OK && nWrite ){
+      nKB = (((i64)nWrite * lsmFsPageSize(pDb->pFS)) + 1023) / 1024;
+    }
+    *pnKB = nKB;
+  }
+
+  return rc;
+}