ADDED doc/wal2.md Index: doc/wal2.md ================================================================== --- /dev/null +++ doc/wal2.md @@ -0,0 +1,98 @@ + +Wal2 Mode Notes +=============== + +## Activating/Deactivating Wal2 Mode + +"Wal2" mode is very similar to "wal" mode. To change a database to wal2 mode, +use the command: + +> + PRAGMA journal_mode = wal2; + +It is not possible to change a database directly from "wal" mode to "wal2" +mode. Instead, it must first be changed to rollback mode. So, to change a wal +mode database to wal2 mode, the following two commands may be used: + +> + PRAGMA journal_mode = delete; + PRAGMA journal_mode = wal2; + +A database in wal2 mode may only be accessed by versions of SQLite compiled +from this branch. Attempting to use any other version of SQLite results in an +SQLITE_NOTADB error. A wal2 mode database may be changed back to rollback mode +(making it accessible by all versions of SQLite) using: + +> + PRAGMA journal_mode = delete; + +## The Advantage of Wal2 Mode + +In legacy wal mode, when a writer writes data to the database, it doesn't +modify the database file directly. Instead, it appends new data to the +"<database>-wal" file. Readers read data from both the original database +file and the "<database>-wal" file. At some point, data is copied from the +"<database>-wal" file into the database file, after which the wal file can +be deleted or overwritten. Copying data from the wal file into the database +file is called a "checkpoint", and may be done explicitly (either by "PRAGMA +wal_checkpoint" or sqlite3_wal_checkpoint_v2()), or +automatically (by configuring "PRAGMA wal_autocheckpoint" - this is the +default). + +Checkpointers do not block writers, and writers do not block checkpointers. +However, if a writer writes to the database while a checkpoint is ongoing, +then the new data is appended to the end of the wal file. 
This means that, +even following the checkpoint, the wal file cannot be overwritten or deleted, +and so all subsequent transactions must also be appended to the wal file. The +work of the checkpointer is not wasted - SQLite remembers which parts of the +wal file have already been copied into the db file so that the next checkpoint +does not have to do so again - but it does mean that the wal file may grow +indefinitely if the checkpointer never gets a chance to finish without a +writer appending to the wal file. There are also circumstances in which +long-running readers may prevent a checkpointer from checkpointing the entire +wal file - also causing the wal file to grow indefinitely in a busy system. + +Wal2 mode does not have this problem. In wal2 mode, wal files do not grow +indefinitely even if the checkpointer never has a chance to finish +uninterrupted. + +In wal2 mode, the system uses two wal files instead of one. The files are named +"<database>-wal" and "<database>-wal2", where "<database>" is of +course the name of the database file. When data is written to the database, the +writer begins by appending the new data to the first wal file. Once the first +wal file has grown large enough, writers switch to appending data to the second +wal file. At this point the first wal file can be checkpointed (after which it +can be overwritten). Then, once the second wal file has grown large enough and +the first wal file has been checkpointed, writers switch back to the first wal +file. And so on. + +## Application Programming + +From the point of view of the user, the main differences between wal and +wal2 mode are to do with checkpointing: + + * In wal mode, a checkpoint may be attempted at any time. In wal2 + mode, the checkpointer has to wait until writers have switched + to the "other" wal file before a checkpoint can take place. 
+ + * In wal mode, the wal-hook (callback registered using + sqlite3_wal_hook()) is invoked after a transaction is committed + with the total number of pages in the wal file as an argument. In wal2 + mode, the argument is either the total number of uncheckpointed pages in + both wal files, or - if the "other" wal file is empty or already + checkpointed - 0. + +Clients are recommended to use the same strategies for checkpointing wal2 mode +databases as for wal databases - by registering a wal-hook using +sqlite3_wal_hook() and attempting a checkpoint when the parameter +exceeds a certain threshold. + +However, it should be noted that although the wal-hook is invoked after each +transaction is committed to disk and database locks released, it is still +invoked from within the sqlite3_step() call used to execute the "COMMIT" +command. In BEGIN CONCURRENT systems, where the "COMMIT" is often protected by +an application mutex, this may reduce concurrency. In such systems, instead of +executing a checkpoint from within the wal-hook, a thread might defer this +action until after the application mutex has been released. + + ADDED ext/misc/bgckpt.c Index: ext/misc/bgckpt.c ================================================================== --- /dev/null +++ ext/misc/bgckpt.c @@ -0,0 +1,244 @@ +/* +** 2017-10-11 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +****************************************************************************** +** +*/ + +#if !defined(SQLITE_TEST) || defined(SQLITE_OS_UNIX) + +#include "sqlite3.h" +#include <string.h> +#include <pthread.h> + +/* +** API declarations. 
+*/ +typedef struct Checkpointer Checkpointer; +int sqlite3_bgckpt_create(const char *zFilename, Checkpointer **pp); +int sqlite3_bgckpt_checkpoint(Checkpointer *p, int bBlock); +void sqlite3_bgckpt_destroy(Checkpointer *p); + + +struct Checkpointer { + sqlite3 *db; /* Database handle */ + + pthread_t thread; /* Background thread */ + pthread_mutex_t mutex; + pthread_cond_t cond; + + int rc; /* Error from "PRAGMA wal_checkpoint" */ + int bCkpt; /* True if checkpoint requested */ + int bExit; /* True if exit requested */ +}; + +static void *bgckptThreadMain(void *pCtx){ + int rc = SQLITE_OK; + Checkpointer *p = (Checkpointer*)pCtx; + + while( rc==SQLITE_OK ){ + int bExit; + + pthread_mutex_lock(&p->mutex); + if( p->bCkpt==0 && p->bExit==0 ){ + pthread_cond_wait(&p->cond, &p->mutex); + } + p->bCkpt = 0; + bExit = p->bExit; + pthread_mutex_unlock(&p->mutex); + + if( bExit ) break; + rc = sqlite3_exec(p->db, "PRAGMA wal_checkpoint", 0, 0, 0); + if( rc==SQLITE_BUSY ){ + rc = SQLITE_OK; + } + } + + pthread_mutex_lock(&p->mutex); + p->rc = rc; + pthread_mutex_unlock(&p->mutex); + return 0; +} + +void sqlite3_bgckpt_destroy(Checkpointer *p){ + if( p ){ + void *ret = 0; + + /* Signal the background thread to exit */ + pthread_mutex_lock(&p->mutex); + p->bExit = 1; + pthread_cond_broadcast(&p->cond); + pthread_mutex_unlock(&p->mutex); + + pthread_join(p->thread, &ret); + sqlite3_close(p->db); + sqlite3_free(p); + } +} + + +int sqlite3_bgckpt_create(const char *zFilename, Checkpointer **pp){ + Checkpointer *pNew = 0; + int rc; + + pNew = (Checkpointer*)sqlite3_malloc(sizeof(Checkpointer)); + if( pNew==0 ){ + rc = SQLITE_NOMEM; + }else{ + memset(pNew, 0, sizeof(Checkpointer)); + rc = sqlite3_open(zFilename, &pNew->db); + } + + if( rc==SQLITE_OK ){ + pthread_mutex_init(&pNew->mutex, 0); + pthread_cond_init(&pNew->cond, 0); + pthread_create(&pNew->thread, 0, bgckptThreadMain, (void*)pNew); + } + + if( rc!=SQLITE_OK ){ + sqlite3_bgckpt_destroy(pNew); + pNew = 0; + } + *pp = pNew; 
+ return rc; +} + +int sqlite3_bgckpt_checkpoint(Checkpointer *p, int bBlock){ + int rc; + pthread_mutex_lock(&p->mutex); + rc = p->rc; + if( rc==SQLITE_OK ){ + p->bCkpt = 1; + pthread_cond_broadcast(&p->cond); + } + pthread_mutex_unlock(&p->mutex); + return rc; +} + +#ifdef SQLITE_TEST + +#if defined(INCLUDE_SQLITE_TCL_H) +# include "sqlite_tcl.h" +#else +# include "tcl.h" +# ifndef SQLITE_TCLAPI +# define SQLITE_TCLAPI +# endif +#endif + +const char *sqlite3ErrName(int rc); + +static void SQLITE_TCLAPI bgckpt_del(void * clientData){ + Checkpointer *pCkpt = (Checkpointer*)clientData; + sqlite3_bgckpt_destroy(pCkpt); +} + +/* +** Tclcmd: $ckpt SUBCMD ... +*/ +static int SQLITE_TCLAPI bgckpt_obj_cmd( + void * clientData, + Tcl_Interp *interp, + int objc, + Tcl_Obj *CONST objv[] +){ + Checkpointer *pCkpt = (Checkpointer*)clientData; + const char *aCmd[] = { "checkpoint", "destroy", 0 }; + int iCmd; + + if( objc<2 ){ + Tcl_WrongNumArgs(interp, 1, objv, "SUBCMD ..."); + return TCL_ERROR; + } + + if( Tcl_GetIndexFromObj(interp, objv[1], aCmd, "sub-command", 0, &iCmd) ){ + return TCL_ERROR; + } + + switch( iCmd ){ + case 0: { + int rc; + int bBlock = 0; + + if( objc>3 ){ + Tcl_WrongNumArgs(interp, 2, objv, "?BLOCKING?"); + return TCL_ERROR; + } + if( objc==3 && Tcl_GetBooleanFromObj(interp, objv[2], &bBlock) ){ + return TCL_ERROR; + } + + rc = sqlite3_bgckpt_checkpoint(pCkpt, bBlock); + if( rc!=SQLITE_OK ){ + Tcl_SetObjResult(interp, Tcl_NewStringObj(sqlite3ErrName(rc), -1)); + return TCL_ERROR; + } + break; + } + + case 1: { + Tcl_DeleteCommand(interp, Tcl_GetString(objv[0])); + break; + } + } + + return TCL_OK; +} + +/* +** Tclcmd: bgckpt CMDNAME FILENAME +*/ +static int SQLITE_TCLAPI bgckpt_cmd( + void * clientData, + Tcl_Interp *interp, + int objc, + Tcl_Obj *CONST objv[] +){ + const char *zCmd; + const char *zFilename; + int rc; + Checkpointer *pCkpt; + + if( objc!=3 ){ + Tcl_WrongNumArgs(interp, 1, objv, "CMDNAME FILENAME"); + return TCL_ERROR; + } + zCmd = 
Tcl_GetString(objv[1]); + zFilename = Tcl_GetString(objv[2]); + + rc = sqlite3_bgckpt_create(zFilename, &pCkpt); + if( rc!=SQLITE_OK ){ + Tcl_SetObjResult(interp, Tcl_NewStringObj(sqlite3ErrName(rc), -1)); + return TCL_ERROR; + } + + Tcl_CreateObjCommand(interp, zCmd, bgckpt_obj_cmd, (void*)pCkpt, bgckpt_del); + Tcl_SetObjResult(interp, objv[1]); + return TCL_OK; +} + +int Bgckpt_Init(Tcl_Interp *interp){ + Tcl_CreateObjCommand(interp, "bgckpt", bgckpt_cmd, 0, 0); + return TCL_OK; +} +#endif /* SQLITE_TEST */ + +#else +#if defined(INCLUDE_SQLITE_TCL_H) +# include "sqlite_tcl.h" +#else +# include "tcl.h" +# ifndef SQLITE_TCLAPI +# define SQLITE_TCLAPI +# endif +#endif +int Bgckpt_Init(Tcl_Interp *interp){ return TCL_OK; } +#endif + Index: ext/session/sqlite3session.h ================================================================== --- ext/session/sqlite3session.h +++ ext/session/sqlite3session.h @@ -316,10 +316,23 @@ ** Or, if one field of a row is updated while a session is disabled, and ** another field of the same row is updated while the session is enabled, the ** resulting changeset will contain an UPDATE change that updates both fields. */ int sqlite3session_changeset( + sqlite3_session *pSession, /* Session object */ + int *pnChangeset, /* OUT: Size of buffer at *ppChangeset */ + void **ppChangeset /* OUT: Buffer containing changeset */ +); + +/* +** CAPI3REF: Generate A Full Changeset From A Session Object +** +** This function is similar to sqlite3session_changeset(), except that for +** each row affected by an UPDATE statement, all old.* values are recorded +** as part of the changeset, not just those modified. 
+*/ +int sqlite3session_fullchangeset( sqlite3_session *pSession, /* Session object */ int *pnChangeset, /* OUT: Size of buffer at *ppChangeset */ void **ppChangeset /* OUT: Buffer containing changeset */ ); Index: main.mk ================================================================== --- main.mk +++ main.mk @@ -355,10 +355,11 @@ # Extensions to be statically loaded. # TESTSRC += \ $(TOP)/ext/misc/amatch.c \ + $(TOP)/ext/misc/bgckpt.c \ $(TOP)/ext/misc/carray.c \ $(TOP)/ext/misc/closure.c \ $(TOP)/ext/misc/csv.c \ $(TOP)/ext/misc/eval.c \ $(TOP)/ext/misc/explain.c \ Index: src/btree.c ================================================================== --- src/btree.c +++ src/btree.c @@ -3295,14 +3295,14 @@ } if( page1[19]>1 ){ goto page1_init_failed; } #else - if( page1[18]>2 ){ + if( page1[18]>3 ){ pBt->btsFlags |= BTS_READ_ONLY; } - if( page1[19]>2 ){ + if( page1[19]>3 ){ goto page1_init_failed; } /* If the write version is set to 2, this database should be accessed ** in WAL mode. If the log is not already open, open it now. Then @@ -3310,13 +3310,13 @@ ** The caller detects this and calls this function again. This is ** required as the version of page 1 currently in the page1 buffer ** may not be the latest version - there may be a newer one in the log ** file. 
*/ - if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){ + if( page1[19]>=2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){ int isOpen = 0; - rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen); + rc = sqlite3PagerOpenWal(pBt->pPager, (page1[19]==3), &isOpen); if( rc!=SQLITE_OK ){ goto page1_init_failed; }else{ setDefaultSyncFlag(pBt, SQLITE_DEFAULT_WAL_SYNCHRONOUS+1); if( isOpen==0 ){ @@ -10615,11 +10615,11 @@ */ int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){ BtShared *pBt = pBtree->pBt; int rc; /* Return code */ - assert( iVersion==1 || iVersion==2 ); + assert( iVersion==1 || iVersion==2 || iVersion==3 ); /* If setting the version fields to 1, do not automatically open the ** WAL connection, even if the version fields are currently set to 2. */ pBt->btsFlags &= ~BTS_NO_WAL; Index: src/pager.c ================================================================== --- src/pager.c +++ src/pager.c @@ -812,24 +812,10 @@ /* ** The maximum legal page number is (2^31 - 1). */ #define PAGER_MAX_PGNO 2147483647 -/* -** The argument to this macro is a file descriptor (type sqlite3_file*). -** Return 0 if it is not open, or non-zero (but not 1) if it is. -** -** This is so that expressions can be written as: -** -** if( isOpen(pPager->jfd) ){ ... -** -** instead of -** -** if( pPager->jfd->pMethods ){ ... -*/ -#define isOpen(pFd) ((pFd)->pMethods!=0) - #ifdef SQLITE_DIRECT_OVERFLOW_READ /* ** Return true if page pgno can be read directly from the database file ** by the b-tree layer. 
This is the case if: ** @@ -959,10 +945,11 @@ */ assert( p->eLock>=RESERVED_LOCK ); assert( isOpen(p->jfd) || p->journalMode==PAGER_JOURNALMODE_OFF || p->journalMode==PAGER_JOURNALMODE_WAL + || p->journalMode==PAGER_JOURNALMODE_WAL2 ); } assert( pPager->dbOrigSize==pPager->dbFileSize ); assert( pPager->dbOrigSize==pPager->dbHintSize ); break; @@ -973,10 +960,11 @@ assert( !pagerUseWal(pPager) ); assert( p->eLock>=EXCLUSIVE_LOCK ); assert( isOpen(p->jfd) || p->journalMode==PAGER_JOURNALMODE_OFF || p->journalMode==PAGER_JOURNALMODE_WAL + || p->journalMode==PAGER_JOURNALMODE_WAL2 || (sqlite3OsDeviceCharacteristics(p->fd)&SQLITE_IOCAP_BATCH_ATOMIC) ); assert( pPager->dbOrigSize<=pPager->dbHintSize ); break; @@ -985,10 +973,11 @@ assert( pPager->errCode==SQLITE_OK ); assert( !pagerUseWal(pPager) ); assert( isOpen(p->jfd) || p->journalMode==PAGER_JOURNALMODE_OFF || p->journalMode==PAGER_JOURNALMODE_WAL + || p->journalMode==PAGER_JOURNALMODE_WAL2 || (sqlite3OsDeviceCharacteristics(p->fd)&SQLITE_IOCAP_BATCH_ATOMIC) ); break; case PAGER_ERROR: @@ -2129,11 +2118,11 @@ rc = sqlite3OsSync(pPager->jfd, pPager->syncFlags); } } pPager->journalOff = 0; }else if( pPager->journalMode==PAGER_JOURNALMODE_PERSIST - || (pPager->exclusiveMode && pPager->journalMode!=PAGER_JOURNALMODE_WAL) + || (pPager->exclusiveMode && pPager->journalModetempFile); pPager->journalOff = 0; }else{ /* This branch may be executed with Pager.journalMode==MEMORY if @@ -2143,11 +2132,12 @@ */ int bDelete = !pPager->tempFile; assert( sqlite3JournalIsInMemory(pPager->jfd)==0 ); assert( pPager->journalMode==PAGER_JOURNALMODE_DELETE || pPager->journalMode==PAGER_JOURNALMODE_MEMORY - || pPager->journalMode==PAGER_JOURNALMODE_WAL + || pPager->journalMode==PAGER_JOURNALMODE_WAL + || pPager->journalMode==PAGER_JOURNALMODE_WAL2 ); sqlite3OsClose(pPager->jfd); if( bDelete ){ rc = sqlite3OsDelete(pPager->pVfs, pPager->zJournal, pPager->extraSync); } @@ -3334,10 +3324,14 @@ rc = sqlite3WalBeginReadTransaction(pPager->pWal, 
&changed); if( rc!=SQLITE_OK || changed ){ pager_reset(pPager); if( USEFETCH(pPager) ) sqlite3OsUnfetch(pPager->fd, 0, 0); + assert( pPager->journalMode==PAGER_JOURNALMODE_WAL + || pPager->journalMode==PAGER_JOURNALMODE_WAL2 + ); + pPager->journalMode = sqlite3WalJournalMode(pPager->pWal); } return rc; } #endif @@ -3429,13 +3423,13 @@ if( rc ) return rc; if( nPage==0 ){ rc = sqlite3OsDelete(pPager->pVfs, pPager->zWal, 0); }else{ testcase( sqlite3PcachePagecount(pPager->pPCache)==0 ); - rc = sqlite3PagerOpenWal(pPager, 0); + rc = sqlite3PagerOpenWal(pPager, 0, 0); } - }else if( pPager->journalMode==PAGER_JOURNALMODE_WAL ){ + }else if( pPager->journalMode>=PAGER_JOURNALMODE_WAL ){ pPager->journalMode = PAGER_JOURNALMODE_DELETE; } } } return rc; @@ -7462,10 +7456,11 @@ assert( eMode==PAGER_JOURNALMODE_DELETE || eMode==PAGER_JOURNALMODE_TRUNCATE || eMode==PAGER_JOURNALMODE_PERSIST || eMode==PAGER_JOURNALMODE_OFF || eMode==PAGER_JOURNALMODE_WAL + || eMode==PAGER_JOURNALMODE_WAL2 || eMode==PAGER_JOURNALMODE_MEMORY ); /* This routine is only called from the OP_JournalMode opcode, and ** the logic there will never allow a temporary file to be changed ** to WAL mode. @@ -7496,13 +7491,16 @@ assert( (PAGER_JOURNALMODE_PERSIST & 5)==1 ); assert( (PAGER_JOURNALMODE_DELETE & 5)==0 ); assert( (PAGER_JOURNALMODE_MEMORY & 5)==4 ); assert( (PAGER_JOURNALMODE_OFF & 5)==0 ); assert( (PAGER_JOURNALMODE_WAL & 5)==5 ); + assert( (PAGER_JOURNALMODE_WAL2 & 5)==4 ); assert( isOpen(pPager->fd) || pPager->exclusiveMode ); - if( !pPager->exclusiveMode && (eOld & 5)==1 && (eMode & 1)==0 ){ + if( !pPager->exclusiveMode && (eOld & 5)==1 && (eMode & 1)==0 + && eMode!=PAGER_JOURNALMODE_WAL2 /* TODO: fix this if possible */ + ){ /* In this case we would like to delete the journal file. If it is ** not possible, then that is not a problem. Deleting the journal file ** here is an optimization only. ** @@ -7661,11 +7659,11 @@ ** Call sqlite3WalOpen() to open the WAL handle. 
If the pager is in ** exclusive-locking mode when this function is called, take an EXCLUSIVE ** lock on the database file and use heap-memory to store the wal-index ** in. Otherwise, use the normal shared-memory. */ -static int pagerOpenWal(Pager *pPager){ +static int pagerOpenWal(Pager *pPager, int bWal2){ int rc = SQLITE_OK; assert( pPager->pWal==0 && pPager->tempFile==0 ); assert( pPager->eLock==SHARED_LOCK || pPager->eLock==EXCLUSIVE_LOCK ); @@ -7682,11 +7680,11 @@ ** (e.g. due to malloc() failure), return an error code. */ if( rc==SQLITE_OK ){ rc = sqlite3WalOpen(pPager->pVfs, pPager->fd, pPager->zWal, pPager->exclusiveMode, - pPager->journalSizeLimit, &pPager->pWal + pPager->journalSizeLimit, bWal2, &pPager->pWal ); } pagerFixMaplimit(pPager); return rc; @@ -7708,10 +7706,11 @@ ** the WAL file is already open, set *pbOpen to 1 and return SQLITE_OK ** without doing anything. */ int sqlite3PagerOpenWal( Pager *pPager, /* Pager object */ + int bWal2, /* Open in wal2 mode if not already open */ int *pbOpen /* OUT: Set to true if call is a no-op */ ){ int rc = SQLITE_OK; /* Return code */ assert( assert_pager_state(pPager) ); @@ -7724,13 +7723,13 @@ if( !sqlite3PagerWalSupported(pPager) ) return SQLITE_CANTOPEN; /* Close any rollback journal previously open */ sqlite3OsClose(pPager->jfd); - rc = pagerOpenWal(pPager); + rc = pagerOpenWal(pPager, bWal2); if( rc==SQLITE_OK ){ - pPager->journalMode = PAGER_JOURNALMODE_WAL; + pPager->journalMode = bWal2?PAGER_JOURNALMODE_WAL2:PAGER_JOURNALMODE_WAL; pPager->eState = PAGER_OPEN; } }else{ *pbOpen = 1; } @@ -7748,11 +7747,13 @@ ** If successful, the EXCLUSIVE lock is not released before returning. 
*/ int sqlite3PagerCloseWal(Pager *pPager, sqlite3 *db){ int rc = SQLITE_OK; - assert( pPager->journalMode==PAGER_JOURNALMODE_WAL ); + assert( pPager->journalMode==PAGER_JOURNALMODE_WAL + || pPager->journalMode==PAGER_JOURNALMODE_WAL2 + ); /* If the log file is not already open, but does exist in the file-system, ** it may need to be checkpointed before the connection can switch to ** rollback mode. Open it now so this can happen. */ @@ -7763,11 +7764,11 @@ rc = sqlite3OsAccess( pPager->pVfs, pPager->zWal, SQLITE_ACCESS_EXISTS, &logexists ); } if( rc==SQLITE_OK && logexists ){ - rc = pagerOpenWal(pPager); + rc = pagerOpenWal(pPager, 0); } } /* Checkpoint and close the log. Because an EXCLUSIVE lock is held on ** the database file, the log and log-summary files will be deleted. Index: src/pager.h ================================================================== --- src/pager.h +++ src/pager.h @@ -79,10 +79,27 @@ #define PAGER_JOURNALMODE_PERSIST 1 /* Commit by zeroing journal header */ #define PAGER_JOURNALMODE_OFF 2 /* Journal omitted. */ #define PAGER_JOURNALMODE_TRUNCATE 3 /* Commit by truncating journal */ #define PAGER_JOURNALMODE_MEMORY 4 /* In-memory journal file */ #define PAGER_JOURNALMODE_WAL 5 /* Use write-ahead logging */ +#define PAGER_JOURNALMODE_WAL2 6 /* Use write-ahead logging mode 2 */ + +#define isWalMode(x) ((x)==PAGER_JOURNALMODE_WAL || (x)==PAGER_JOURNALMODE_WAL2) + +/* +** The argument to this macro is a file descriptor (type sqlite3_file*). +** Return 0 if it is not open, or non-zero (but not 1) if it is. +** +** This is so that expressions can be written as: +** +** if( isOpen(pPager->jfd) ){ ... +** +** instead of +** +** if( pPager->jfd->pMethods ){ ... +*/ +#define isOpen(pFd) ((pFd)->pMethods!=0) /* ** Flags that make up the mask passed to sqlite3PagerGet(). 
*/ #define PAGER_GET_NOCONTENT 0x01 /* Do not load data from disk */ @@ -176,11 +193,11 @@ #ifndef SQLITE_OMIT_WAL int sqlite3PagerCheckpoint(Pager *pPager, sqlite3*, int, int*, int*); int sqlite3PagerWalSupported(Pager *pPager); int sqlite3PagerWalCallback(Pager *pPager); - int sqlite3PagerOpenWal(Pager *pPager, int *pisOpen); + int sqlite3PagerOpenWal(Pager *pPager, int, int *pisOpen); int sqlite3PagerCloseWal(Pager *pPager, sqlite3*); # ifdef SQLITE_ENABLE_SNAPSHOT int sqlite3PagerSnapshotGet(Pager *pPager, sqlite3_snapshot **ppSnapshot); int sqlite3PagerSnapshotOpen(Pager *pPager, sqlite3_snapshot *pSnapshot); int sqlite3PagerSnapshotRecover(Pager *pPager); Index: src/pragma.c ================================================================== --- src/pragma.c +++ src/pragma.c @@ -258,19 +258,20 @@ */ const char *sqlite3JournalModename(int eMode){ static char * const azModeName[] = { "delete", "persist", "off", "truncate", "memory" #ifndef SQLITE_OMIT_WAL - , "wal" + , "wal", "wal2" #endif }; assert( PAGER_JOURNALMODE_DELETE==0 ); assert( PAGER_JOURNALMODE_PERSIST==1 ); assert( PAGER_JOURNALMODE_OFF==2 ); assert( PAGER_JOURNALMODE_TRUNCATE==3 ); assert( PAGER_JOURNALMODE_MEMORY==4 ); assert( PAGER_JOURNALMODE_WAL==5 ); + assert( PAGER_JOURNALMODE_WAL2==6 ); assert( eMode>=0 && eMode<=ArraySize(azModeName) ); if( eMode==ArraySize(azModeName) ) return 0; return azModeName[eMode]; } Index: src/test_hexio.c ================================================================== --- src/test_hexio.c +++ src/test_hexio.c @@ -188,11 +188,11 @@ Tcl_SetObjResult(interp, Tcl_NewIntObj(written)); return TCL_OK; } /* -** USAGE: hexio_get_int HEXDATA +** USAGE: hexio_get_int [-littleendian] HEXDATA ** ** Interpret the HEXDATA argument as a big-endian integer. Return ** the value of that integer. HEXDATA can contain between 2 and 8 ** hexadecimal digits. 
*/ @@ -205,16 +205,24 @@ int val; int nIn, nOut; const unsigned char *zIn; unsigned char *aOut; unsigned char aNum[4]; + int bLittle = 0; - if( objc!=2 ){ - Tcl_WrongNumArgs(interp, 1, objv, "HEXDATA"); + if( objc==3 ){ + int n; + char *z = Tcl_GetStringFromObj(objv[1], &n); + if( n>=2 && n<=13 && memcmp(z, "-littleendian", n)==0 ){ + bLittle = 1; + } + } + if( (objc-bLittle)!=2 ){ + Tcl_WrongNumArgs(interp, 1, objv, "[-littleendian] HEXDATA"); return TCL_ERROR; } - zIn = (const unsigned char *)Tcl_GetStringFromObj(objv[1], &nIn); + zIn = (const unsigned char *)Tcl_GetStringFromObj(objv[1+bLittle], &nIn); aOut = sqlite3_malloc( nIn/2 ); if( aOut==0 ){ return TCL_ERROR; } nOut = sqlite3TestHexToBin(zIn, nIn, aOut); @@ -223,11 +231,15 @@ }else{ memset(aNum, 0, sizeof(aNum)); memcpy(&aNum[4-nOut], aOut, nOut); } sqlite3_free(aOut); - val = (aNum[0]<<24) | (aNum[1]<<16) | (aNum[2]<<8) | aNum[3]; + if( bLittle ){ + val = (aNum[3]<<24) | (aNum[2]<<16) | (aNum[1]<<8) | aNum[0]; + }else{ + val = (aNum[0]<<24) | (aNum[1]<<16) | (aNum[2]<<8) | aNum[3]; + } Tcl_SetObjResult(interp, Tcl_NewIntObj(val)); return TCL_OK; } Index: src/test_tclsh.c ================================================================== --- src/test_tclsh.c +++ src/test_tclsh.c @@ -96,10 +96,11 @@ #endif extern int Md5_Init(Tcl_Interp*); extern int Fts5tcl_Init(Tcl_Interp *); extern int SqliteRbu_Init(Tcl_Interp*); extern int Sqlitetesttcl_Init(Tcl_Interp*); + extern int Bgckpt_Init(Tcl_Interp*); #if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) extern int Sqlitetestfts3_Init(Tcl_Interp *interp); #endif #ifdef SQLITE_ENABLE_ZIPVFS extern int Zipvfs_Init(Tcl_Interp*); @@ -163,10 +164,12 @@ TestSession_Init(interp); #endif Fts5tcl_Init(interp); SqliteRbu_Init(interp); Sqlitetesttcl_Init(interp); + Bgckpt_Init(interp); + #if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) Sqlitetestfts3_Init(interp); #endif TestExpert_Init(interp); Index: src/vdbe.c 
================================================================== --- src/vdbe.c +++ src/vdbe.c @@ -6655,10 +6655,11 @@ || eNew==PAGER_JOURNALMODE_TRUNCATE || eNew==PAGER_JOURNALMODE_PERSIST || eNew==PAGER_JOURNALMODE_OFF || eNew==PAGER_JOURNALMODE_MEMORY || eNew==PAGER_JOURNALMODE_WAL + || eNew==PAGER_JOURNALMODE_WAL2 || eNew==PAGER_JOURNALMODE_QUERY ); assert( pOp->p1>=0 && pOp->p1<db->nDb ); assert( p->readOnly==0 ); @@ -6672,52 +6673,63 @@ zFilename = sqlite3PagerFilename(pPager, 1); /* Do not allow a transition to journal_mode=WAL for a database ** in temporary storage or if the VFS does not support shared memory */ - if( eNew==PAGER_JOURNALMODE_WAL + if( isWalMode(eNew) && (sqlite3Strlen30(zFilename)==0 /* Temp file */ || !sqlite3PagerWalSupported(pPager)) /* No shared-memory support */ ){ eNew = eOld; } - if( (eNew!=eOld) - && (eOld==PAGER_JOURNALMODE_WAL || eNew==PAGER_JOURNALMODE_WAL) - ){ + if( eNew!=eOld && (isWalMode(eNew) || isWalMode(eOld)) ){ + + /* Prevent changing directly to wal2 from wal mode. And vice versa. */ + if( isWalMode(eNew) && isWalMode(eOld) ){ + rc = SQLITE_ERROR; + sqlite3VdbeError(p, "cannot change from %s to %s mode", + sqlite3JournalModename(eOld), sqlite3JournalModename(eNew) + ); + goto abort_due_to_error; + } + + /* Prevent switching into or out of wal/wal2 mode mid-transaction */ if( !db->autoCommit || db->nVdbeRead>1 ){ rc = SQLITE_ERROR; sqlite3VdbeError(p, "cannot change %s wal mode from within a transaction", (eNew==PAGER_JOURNALMODE_WAL ? "into" : "out of") ); goto abort_due_to_error; - }else{ - - if( eOld==PAGER_JOURNALMODE_WAL ){ - /* If leaving WAL mode, close the log file. If successful, the call - ** to PagerCloseWal() checkpoints and deletes the write-ahead-log - ** file. An EXCLUSIVE lock may still be held on the database file - ** after a successful return. 
- */ - rc = sqlite3PagerCloseWal(pPager, db); - if( rc==SQLITE_OK ){ - sqlite3PagerSetJournalMode(pPager, eNew); - } - }else if( eOld==PAGER_JOURNALMODE_MEMORY ){ - /* Cannot transition directly from MEMORY to WAL. Use mode OFF - ** as an intermediate */ - sqlite3PagerSetJournalMode(pPager, PAGER_JOURNALMODE_OFF); - } - - /* Open a transaction on the database file. Regardless of the journal - ** mode, this transaction always uses a rollback journal. - */ - assert( sqlite3BtreeIsInTrans(pBt)==0 ); - if( rc==SQLITE_OK ){ - rc = sqlite3BtreeSetVersion(pBt, (eNew==PAGER_JOURNALMODE_WAL ? 2 : 1)); - } + } + + if( isWalMode(eOld) ){ + /* If leaving WAL mode, close the log file. If successful, the call + ** to PagerCloseWal() checkpoints and deletes the write-ahead-log + ** file. An EXCLUSIVE lock may still be held on the database file + ** after a successful return. + */ + rc = sqlite3PagerCloseWal(pPager, db); + if( rc==SQLITE_OK ){ + sqlite3PagerSetJournalMode(pPager, eNew); + } + }else if( eOld==PAGER_JOURNALMODE_MEMORY ){ + /* Cannot transition directly from MEMORY to WAL. Use mode OFF + ** as an intermediate */ + sqlite3PagerSetJournalMode(pPager, PAGER_JOURNALMODE_OFF); + } + + /* Open a transaction on the database file. Regardless of the journal + ** mode, this transaction always uses a rollback journal. + */ + assert( sqlite3BtreeIsInTrans(pBt)==0 ); + if( rc==SQLITE_OK ){ + /* 1==rollback, 2==wal, 3==wal2 */ + rc = sqlite3BtreeSetVersion(pBt, + 1 + isWalMode(eNew) + (eNew==PAGER_JOURNALMODE_WAL2) + ); } } #endif /* ifndef SQLITE_OMIT_WAL */ if( rc ) eNew = eOld; Index: src/wal.c ================================================================== --- src/wal.c +++ src/wal.c @@ -99,11 +99,11 @@ ** ** READER ALGORITHM ** ** To read a page from the database (call it page number P), a reader ** first checks the WAL to see if it contains page P. 
If so, then the -** last valid instance of page P that is a followed by a commit frame +** last valid instance of page P that is followed by a commit frame ** or is a commit frame itself becomes the value read. If the WAL ** contains no copies of page P that are valid and which are a commit ** frame or are followed by a commit frame, then page P is read from ** the database file. ** @@ -231,11 +231,11 @@ ** reader might be using some value K0 and a second reader that started ** at a later time (after additional transactions were added to the WAL ** and to the wal-index) might be using a different value K1, where K1>K0. ** Both readers can use the same hash table and mapping section to get ** the correct result. There may be entries in the hash table with -** K>K0 but to the first reader, those entries will appear to be unused +** K>K0, but to the first reader those entries will appear to be unused ** slots in the hash table and so the first reader will get an answer as ** if no values greater than K0 had ever been inserted into the hash table ** in the first place - which is what reader one wants. Meanwhile, the ** second reader using K1 will see additional values that were inserted ** later, which is exactly what reader two wants. @@ -242,10 +242,195 @@ ** ** When a rollback occurs, the value of K is decreased. Hash table entries ** that correspond to frames greater than the new K value are removed ** from the hash table at this point. */ + +/* +** WAL2 NOTES +** +** This file also contains the implementation of "wal2" mode - activated +** using "PRAGMA journal_mode = wal2". Wal2 mode is very similar to wal +** mode, except that it uses two wal files instead of one. Under some +** circumstances, wal2 mode provides more concurrency than legacy wal +** mode. +** +** THE PROBLEM WAL2 SOLVES: +** +** In legacy wal mode, if a writer wishes to write to the database while +** a checkpoint is ongoing, it may append frames to the existing wal file. 
+** This means that after the checkpoint has finished, the wal file consists +** of a large block of checkpointed frames, followed by a block of +** uncheckpointed frames. In a deployment that features a high volume of +** write traffic, this may mean that the wal file is never completely +** checkpointed. And so grows indefinitely. +** +** An alternative is to use "PRAGMA wal_checkpoint=RESTART" or similar to +** force a complete checkpoint of the wal file. But this must: +** +** 1) Wait on all existing readers to finish, +** 2) Wait on any existing writer, and then block all new writers, +** 3) Do the checkpoint, +** 4) Wait on any new readers that started during steps 2 and 3. Writers +** are still blocked during this step. +** +** This means that in order to avoid the wal file growing indefinitely +** in a busy system, writers must periodically pause to allow a checkpoint +** to complete. In a system with long running readers, such pauses may be +** for a non-trivial amount of time. +** +** OVERVIEW OF SOLUTION +** +** Wal2 mode uses two wal files. After writers have grown the first wal +** file to a pre-configured size, they begin appending transactions to +** the second wal file. Once all existing readers are reading snapshots +** new enough to include the entire first wal file, a checkpointer can +** checkpoint it. +** +** Meanwhile, writers are writing transactions to the second wal file. +** Once that wal file has grown larger than the pre-configured size, each +** new writer checks if: +** +** * the first wal file has been checkpointed, and if so, if +** * there are no readers still reading from the first wal file (once +** it has been checkpointed, new readers read only from the second +** wal file). +** +** If both these conditions are true, the writer may switch back to the +** first wal file. Eventually, a checkpointer can checkpoint the second +** wal file, and so on. 
+** +** The wal file that writers are currently appending to (the one they +** don't have to check the above two criteria before writing to) is called +** the "current" wal file. +** +** The first wal file takes the same name as the wal file in legacy wal +** mode systems - "<database>-wal". The second is named "<database>-wal2". + +** +** CHECKPOINTS +** +** The "pre-configured size" mentioned above is the value set by +** "PRAGMA journal_size_limit". Or, if journal_size_limit is not set, +** 1000 pages. +** +** There is only a single type of checkpoint in wal2 mode (no "truncate", +** "restart" etc.), and it always checkpoints the entire contents of a single +** wal file. A wal file cannot be checkpointed until after a writer has written +** the first transaction into the other wal file and all readers are reading a +** snapshot that includes at least one transaction from the other wal file. +** +** The wal-hook, if one is registered, is invoked after a write-transaction +** is committed, just as it is in legacy wal mode. The integer parameter +** passed to the wal-hook is the total number of uncheckpointed frames in both +** wal files. Except, the parameter is set to zero if there are no frames +** that may be checkpointed. This happens in two scenarios: +** +** 1. The "other" wal file (the one that the writer did not just append to) +** is completely empty, or +** +** 2. The "other" wal file (the one that the writer did not just append to) +** has already been checkpointed. +** +** +** WAL FILE FORMAT +** +** The file format used for each wal file in wal2 mode is the same as for +** legacy wal mode. Except, the file format field is set to 3021000 +** instead of 3007000. +** +** WAL-INDEX FORMAT +** +** The wal-index format is also very similar. Even though there are two +** wal files, there is still a single wal-index shared-memory area (*-shm +** file with the default unix or win32 VFS). 
The wal-index header is the +** same size and, with the following exceptions, has the same format: +** +** * The version field is set to 3021000 instead of 3007000. +** +** * An unused 32-bit field in the legacy wal-index header is +** now used to store (a) a single bit indicating which of the +** two wal files writers should append to and (b) the number +** of frames in the second wal file (31 bits). +** +** The first hash table in the wal-index contains entries corresponding +** to the first HASHTABLE_NPAGE_ONE frames stored in the first wal file. +** The second hash table in the wal-index contains entries indexing the +** first HASHTABLE_NPAGE frames in the second wal file. The third hash +** table contains the next HASHTABLE_NPAGE frames in the first wal file, +** and so on. +** +** LOCKS +** +** Read-locks are simpler than for legacy wal mode. There are no locking +** slots that contain frame numbers. Instead, there are four distinct +** combinations of read locks a reader may hold: +** +** WAL_LOCK_PART1: "part" lock on first wal, none of second. +** WAL_LOCK_PART1_FULL2: "part" lock on first wal, "full" of second. +** WAL_LOCK_PART2: no lock on first wal, "part" lock on second. +** WAL_LOCK_PART2_FULL1: "full" lock on first wal, "part" lock on second. +** +** When a reader reads the wal-index header as part of opening a read +** transaction, it takes a "part" lock on the current wal file. "Part" +** because the wal file may grow while the read transaction is active, in +** which case the reader would be reading only part of the wal file. +** A part lock prevents a checkpointer from checkpointing the wal file +** on which it is held. +** +** If there is data in the non-current wal file that has not been +** checkpointed, the reader takes a "full" lock on that wal file. A +** "full" lock indicates that the reader is using the entire wal file. 
+** A full lock prevents a writer from overwriting the wal file on which +** it is held, but does not prevent a checkpointer from checkpointing +** it. +** +** There is still a single WRITER and a single CHECKPOINTER lock. The +** recovery procedure still takes the same exclusive lock on the entire +** range of SQLITE_SHM_NLOCK shm-locks. This works because the read-locks +** above use four of the six read-locking slots used by legacy wal mode. +** +** STARTUP/RECOVERY +** +** The read and write version fields of the database header in a wal2 +** database are set to 0x03, instead of 0x02 as in legacy wal mode. +** +** The wal file format used in wal2 mode is the same as the format used +** in legacy wal mode. However, in order to support recovery, there are two +** differences in the way wal file header fields are populated, as follows: +** +** * When the first wal file is first created, the "nCkpt" field in +** the wal file header is set to 0. Thereafter, each time the writer +** switches wal file, it sets the nCkpt field in the new wal file +** header to ((nCkpt0 + 1) & 0x0F), where nCkpt0 is the value in +** the previous wal file header. This means that the first wal file +** always has an even value in the nCkpt field, and the second wal +** file always has an odd value. +** +** * When a writer switches wal file, it sets the salt values in the +** new wal file to a copy of the checksum for the final frame in +** the previous wal file. +** +** Recovery proceeds as follows: +** +** 1. Each wal file is recovered separately. Except, if the first wal +** file does not exist or is zero bytes in size, the second wal file +** is truncated to zero bytes before it is "recovered". +** +** 2. If both wal files contain valid headers, then the nCkpt fields +** are compared to see which of the two wal files is older. If the +** salt keys in the second wal file match the final frame checksum +** in the older wal file, then both wal files are used. 
Otherwise, +** the newer wal file is ignored. +** +** 3. Or, if only one or neither of the wal files has a valid header, +** then only a single or no wal files are recovered into the +** reconstructed wal-index. +** +** Refer to header comments for walIndexRecover() for further details. +*/ + #ifndef SQLITE_OMIT_WAL #include "wal.h" /* @@ -269,24 +454,23 @@ # define AtomicLoad(PTR) (*(PTR)) # define AtomicStore(PTR,VAL) (*(PTR) = (VAL)) #endif /* -** The maximum (and only) versions of the wal and wal-index formats -** that may be interpreted by this version of SQLite. -** -** If a client begins recovering a WAL file and finds that (a) the checksum -** values in the wal-header are correct and (b) the version field is not -** WAL_MAX_VERSION, recovery fails and SQLite returns SQLITE_CANTOPEN. -** -** Similarly, if a client successfully reads a wal-index header (i.e. the -** checksum test is successful) and finds that the version field is not -** WALINDEX_MAX_VERSION, then no read-transaction is opened and SQLite -** returns SQLITE_CANTOPEN. -*/ -#define WAL_MAX_VERSION 3007000 -#define WALINDEX_MAX_VERSION 3007000 +** Both the wal-file and the wal-index contain version fields +** indicating the current version of the system. If a client +** reads the header of a wal file (as part of recovery), or the +** wal-index (as part of opening a read transaction) and (a) the +** header checksum is correct but (b) the version field is not +** recognized, the operation fails with SQLITE_CANTOPEN. +** +** Currently, clients support both version-1 ("journal_mode=wal") and +** version-2 ("journal_mode=wal2"). Legacy clients may support version-1 +** only. +*/ +#define WAL_VERSION1 3007000 /* For "journal_mode=wal" */ +#define WAL_VERSION2 3021000 /* For "journal_mode=wal2" */ /* ** Index numbers for various locking bytes. WAL_NREADER is the number ** of available reader locks and should be at least 3. The default ** is SQLITE_SHM_NLOCK==8 and WAL_NREADER==5. 
@@ -305,10 +489,43 @@ #define WAL_CKPT_LOCK 1 #define WAL_RECOVER_LOCK 2 #define WAL_READ_LOCK(I) (3+(I)) #define WAL_NREADER (SQLITE_SHM_NLOCK-3) +/* +** Values that may be stored in Wal.readLock in wal2 mode. +** +** In wal mode, the Wal.readLock member is set to -1 when no read-lock +** is held, or else is the index of the read-mark on which a lock is +** held. +** +** In wal2 mode, a value of -1 still indicates that no read-lock is held. +** And a non-zero value still represents the index of the read-mark on +** which a lock is held. There are two differences: +** +** 1. wal2 mode never uses read-mark 0. +** +** 2. locks on each read-mark have a different interpretation, as +** indicated by the symbolic names below. +*/ +#define WAL_LOCK_NONE -1 +#define WAL_LOCK_PART1 1 +#define WAL_LOCK_PART1_FULL2 2 +#define WAL_LOCK_PART2_FULL1 3 +#define WAL_LOCK_PART2 4 + +/* +** This constant is used in wal2 mode only. +** +** In wal2 mode, when committing a transaction, if the current wal file +** is sufficiently large and there are no conflicting locks held, the +** writer writes the new transaction into the start of the other wal +** file. Usually, "sufficiently large" is defined by the value configured +** using "PRAGMA journal_size_limit". However, if no such value has been +** configured, sufficiently large defaults to WAL_DEFAULT_WALSIZE frames. +*/ +#define WAL_DEFAULT_WALSIZE 1000 /* Object declarations */ typedef struct WalIndexHdr WalIndexHdr; typedef struct WalIterator WalIterator; typedef struct WalCkptInfo WalCkptInfo; @@ -324,25 +541,68 @@ ** the total header size is 136 bytes. ** ** The szPage value can be any power of 2 between 512 and 32768, inclusive. ** Or it can be 1 to represent a 65536-byte page. The latter case was ** added in 3.7.1 when support for 64K pages was added. +** +** WAL2 mode notes: Member variable mxFrame2 is only used in wal2 mode +** (when iVersion is set to WAL_VERSION2). 
The lower 31 bits store +** the maximum frame number in file *-wal2. The most significant bit +** is a flag - set if clients are currently appending to *-wal2, clear +** otherwise. */ struct WalIndexHdr { u32 iVersion; /* Wal-index version */ - u32 unused; /* Unused (padding) field */ + u32 mxFrame2; /* See "WAL2 mode notes" above */ u32 iChange; /* Counter incremented each transaction */ u8 isInit; /* 1 when initialized */ u8 bigEndCksum; /* True if checksums in WAL are big-endian */ u16 szPage; /* Database page size in bytes. 1==64K */ - u32 mxFrame; /* Index of last valid frame in the WAL */ + u32 mxFrame; /* Index of last valid frame in each WAL */ u32 nPage; /* Size of database in pages */ u32 aFrameCksum[2]; /* Checksum of last frame in log */ u32 aSalt[2]; /* Two salt values copied from WAL header */ u32 aCksum[2]; /* Checksum over all prior fields */ }; +/* +** The following macros and functions are get/set methods for the maximum +** frame numbers and current wal file values stored in the WalIndexHdr +** structure. These are helpful because of the unorthodox way in which +** the values are stored in wal2 mode (see above). They are equivalent +** to functions with the following signatures. +** +** u32 walidxGetMxFrame(WalIndexHdr*, int iWal); // get mxFrame +** void walidxSetMxFrame(WalIndexHdr*, int iWal, u32 val); // set mxFrame +** int walidxGetFile(WalIndexHdr*) // get file +** void walidxSetFile(WalIndexHdr*, int val); // set file +*/ +#define walidxGetMxFrame(pHdr, iWal) \ + ((iWal) ? 
((pHdr)->mxFrame2 & 0x7FFFFFFF) : (pHdr)->mxFrame) + +static void walidxSetMxFrame(WalIndexHdr *pHdr, int iWal, u32 mxFrame){ + if( iWal ){ + pHdr->mxFrame2 = (pHdr->mxFrame2 & 0x80000000) | mxFrame; + }else{ + pHdr->mxFrame = mxFrame; + } + assert( walidxGetMxFrame(pHdr, iWal)==mxFrame ); +} + +#define walidxGetFile(pHdr) ((pHdr)->mxFrame2 >> 31) + +#define walidxSetFile(pHdr, iWal) ( \ + (pHdr)->mxFrame2 = ((pHdr)->mxFrame2 & 0x7FFFFFFF) | ((iWal)<<31) \ +) + +/* +** Argument is a pointer to a Wal structure. Return true if the current +** cache of the wal-index header indicates "journal_mode=wal2" mode, or +** false otherwise. +*/ +#define isWalMode2(pWal) ((pWal)->hdr.iVersion==WAL_VERSION2) + /* ** A copy of the following object occurs in the wal-index immediately ** following the second copy of the WalIndexHdr. This object stores ** information used by checkpoint. ** @@ -449,11 +709,11 @@ ** following object. */ struct Wal { sqlite3_vfs *pVfs; /* The VFS used to create pDbFd */ sqlite3_file *pDbFd; /* File handle for the database file */ - sqlite3_file *pWalFd; /* File handle for WAL file */ + sqlite3_file *apWalFd[2]; /* File handle for "*-wal" and "*-wal2" */ u32 iCallback; /* Value to pass to log callback (or 0) */ i64 mxWalSize; /* Truncate WAL to this size upon reset */ int nWiData; /* Size of array apWiData */ int szFirstBlock; /* Size of first block written to WAL file */ volatile u32 **apWiData; /* Pointer to wal-index content in memory */ @@ -471,17 +731,19 @@ WalIndexHdr hdr; /* Wal-index header for current transaction */ u32 minFrame; /* Ignore wal frames before this one */ u32 iReCksum; /* On commit, recalculate checksums from here */ u32 nPriorFrame; /* For sqlite3WalInfo() */ const char *zWalName; /* Name of WAL file */ + char *zWalName2; /* Name of second WAL file */ u32 nCkpt; /* Checkpoint sequence counter in the wal-header */ #ifdef SQLITE_DEBUG u8 lockError; /* True if a locking error has occurred */ #endif #ifdef SQLITE_ENABLE_SNAPSHOT 
WalIndexHdr *pSnapshot; /* Start transaction here if not NULL */ #endif + int bWal2; /* bWal2 flag passed to WalOpen() */ }; /* ** Candidate values for Wal.exclusiveMode. */ @@ -713,11 +975,11 @@ volatile WalIndexHdr *aHdr = walIndexHdr(pWal); const int nCksum = offsetof(WalIndexHdr, aCksum); assert( pWal->writeLock ); pWal->hdr.isInit = 1; - pWal->hdr.iVersion = WALINDEX_MAX_VERSION; + assert( pWal->hdr.iVersion==WAL_VERSION1||pWal->hdr.iVersion==WAL_VERSION2 ); walChecksumBytes(1, (u8*)&pWal->hdr, nCksum, 0, pWal->hdr.aCksum); memcpy((void*)&aHdr[1], (const void*)&pWal->hdr, sizeof(WalIndexHdr)); walShmBarrier(pWal); memcpy((void*)&aHdr[0], (const void*)&pWal->hdr, sizeof(WalIndexHdr)); } @@ -791,11 +1053,11 @@ if( pgno==0 ){ return 0; } /* A frame is only valid if a checksum of the WAL header, - ** all prior frams, the first 16 bytes of this frame-header, + ** all prior frames, the first 16 bytes of this frame-header, ** and the frame-data matches the checksum in the last 8 ** bytes of this frame-header. */ nativeCksum = (pWal->hdr.bigEndCksum==SQLITE_BIGENDIAN); walChecksumBytes(nativeCksum, aFrame, 8, aCksum, aCksum); @@ -939,10 +1201,42 @@ } pLoc->aPgno = &pLoc->aPgno[-1]; } return rc; } + +static u32 walExternalEncode(int iWal, u32 iFrame){ + u32 iRet; + if( iWal ){ + iRet = HASHTABLE_NPAGE_ONE + iFrame; + iRet += ((iFrame-1) / HASHTABLE_NPAGE) * HASHTABLE_NPAGE; + }else{ + iRet = iFrame; + iFrame += HASHTABLE_NPAGE - HASHTABLE_NPAGE_ONE; + iRet += ((iFrame-1) / HASHTABLE_NPAGE) * HASHTABLE_NPAGE; + } + return iRet; +} + +/* +** Parameter iExternal is an external frame identifier. This function +** transforms it to a wal file number (0 or 1) and frame number within +** this wal file (reported via output parameter *piRead). 
+*/ +static int walExternalDecode(u32 iExternal, u32 *piRead){ + int iHash = (iExternal+HASHTABLE_NPAGE-HASHTABLE_NPAGE_ONE-1)/HASHTABLE_NPAGE; + + if( 0==(iHash & 0x01) ){ + /* A frame in wal file 0 */ + *piRead = (iExternal <= HASHTABLE_NPAGE_ONE) ? iExternal : + iExternal - (iHash/2) * HASHTABLE_NPAGE; + return 0; + } + + *piRead = iExternal - HASHTABLE_NPAGE_ONE - ((iHash-1)/2) * HASHTABLE_NPAGE; + return 1; +} /* ** Return the number of the wal-index page that contains the hash-table ** and page-number array that contain entries corresponding to WAL frame ** iFrame. The wal-index is broken up into 32KB pages. Wal-index pages @@ -956,10 +1250,26 @@ && (iHash>=2 || iFrame<=HASHTABLE_NPAGE_ONE+HASHTABLE_NPAGE) && (iHash<=2 || iFrame>(HASHTABLE_NPAGE_ONE+2*HASHTABLE_NPAGE)) ); return iHash; } + +/* +** Return the index of the hash-table corresponding to frame iFrame of wal +** file iWal. +*/ +static int walFramePage2(int iWal, u32 iFrame){ + int iRet; + assert( iWal==0 || iWal==1 ); + assert( iFrame>0 ); + if( iWal==0 ){ + iRet = 2*((iFrame+HASHTABLE_NPAGE-HASHTABLE_NPAGE_ONE-1)/HASHTABLE_NPAGE); + }else{ + iRet = 1 + 2 * ((iFrame-1) / HASHTABLE_NPAGE); + } + return iRet; +} /* ** Return the page number associated with frame iFrame in this WAL. */ static u32 walFramePgno(Wal *pWal, u32 iFrame){ @@ -967,10 +1277,14 @@ if( iHash==0 ){ return pWal->apWiData[0][WALINDEX_HDR_SIZE/sizeof(u32) + iFrame - 1]; } return pWal->apWiData[iHash][(iFrame-1-HASHTABLE_NPAGE_ONE)%HASHTABLE_NPAGE]; } + +static u32 walFramePgno2(Wal *pWal, int iWal, u32 iFrame){ + return walFramePgno(pWal, walExternalEncode(iWal, iFrame)); +} /* ** Remove entries from the hash table that point to WAL slots greater ** than pWal->hdr.mxFrame. 
** @@ -985,40 +1299,47 @@ static void walCleanupHash(Wal *pWal){ WalHashLoc sLoc; /* Hash table location */ int iLimit = 0; /* Zero values greater than this */ int nByte; /* Number of bytes to zero in aPgno[] */ int i; /* Used to iterate through aHash[] */ + int iWal = walidxGetFile(&pWal->hdr); + u32 mxFrame = walidxGetMxFrame(&pWal->hdr, iWal); + + u32 iExternal; + if( isWalMode2(pWal) ){ + iExternal = walExternalEncode(iWal, mxFrame); + }else{ + assert( iWal==0 ); + iExternal = mxFrame; + } assert( pWal->writeLock ); - testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE_ONE-1 ); - testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE_ONE ); - testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE_ONE+1 ); + testcase( mxFrame==HASHTABLE_NPAGE_ONE-1 ); + testcase( mxFrame==HASHTABLE_NPAGE_ONE ); + testcase( mxFrame==HASHTABLE_NPAGE_ONE+1 ); - if( pWal->hdr.mxFrame==0 ) return; + if( mxFrame==0 ) return; /* Obtain pointers to the hash-table and page-number array containing - ** the entry that corresponds to frame pWal->hdr.mxFrame. It is guaranteed - ** that the page said hash-table and array reside on is already mapped. - */ - assert( pWal->nWiData>walFramePage(pWal->hdr.mxFrame) ); - assert( pWal->apWiData[walFramePage(pWal->hdr.mxFrame)] ); - walHashGet(pWal, walFramePage(pWal->hdr.mxFrame), &sLoc); + ** the entry that corresponds to frame pWal->hdr.mxFrame. */ + assert( pWal->nWiData>walFramePage(iExternal) ); + assert( pWal->apWiData[walFramePage(iExternal)] ); + walHashGet(pWal, walFramePage(iExternal), &sLoc); /* Zero all hash-table entries that correspond to frame numbers greater ** than pWal->hdr.mxFrame. */ - iLimit = pWal->hdr.mxFrame - sLoc.iZero; + iLimit = iExternal - sLoc.iZero; assert( iLimit>0 ); for(i=0; iiLimit ){ sLoc.aHash[i] = 0; } } /* Zero the entries in the aPgno array that correspond to frames with - ** frame numbers greater than pWal->hdr.mxFrame. - */ + ** frame numbers greater than pWal->hdr.mxFrame. 
*/ nByte = (int)((char *)sLoc.aHash - (char *)&sLoc.aPgno[iLimit+1]); memset((void *)&sLoc.aPgno[iLimit+1], 0, nByte); #ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT /* Verify that the every entry in the mapping region is still reachable @@ -1035,30 +1356,37 @@ } } #endif /* SQLITE_ENABLE_EXPENSIVE_ASSERT */ } - /* ** Set an entry in the wal-index that will map database page number ** pPage into WAL frame iFrame. */ -static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){ +static int walIndexAppend(Wal *pWal, int iWal, u32 iFrame, u32 iPage){ int rc; /* Return code */ WalHashLoc sLoc; /* Wal-index hash table location */ + u32 iExternal; + + if( isWalMode2(pWal) ){ + iExternal = walExternalEncode(iWal, iFrame); + }else{ + assert( iWal==0 ); + iExternal = iFrame; + } - rc = walHashGet(pWal, walFramePage(iFrame), &sLoc); + rc = walHashGet(pWal, walFramePage(iExternal), &sLoc); /* Assuming the wal-index file was successfully mapped, populate the ** page number array and hash table entry. */ if( rc==SQLITE_OK ){ int iKey; /* Hash table key */ int idx; /* Value to write to hash-table slot */ int nCollide; /* Number of hash collisions */ - idx = iFrame - sLoc.iZero; + idx = iExternal - sLoc.iZero; assert( idx <= HASHTABLE_NSLOT/2 + 1 ); /* If this is the first entry to be added to this hash-table, zero the ** entire hash table and aPgno[] array before proceeding. */ @@ -1119,10 +1447,152 @@ return rc; } +/* +** Recover a single wal file - *-wal if iWal==0, or *-wal2 if iWal==1. 
+*/ +static int walIndexRecoverOne(Wal *pWal, int iWal, u32 *pnCkpt, int *pbZero){ + i64 nSize; /* Size of log file */ + u32 aFrameCksum[2] = {0, 0}; + int rc; + sqlite3_file *pWalFd = pWal->apWalFd[iWal]; + + assert( iWal==0 || iWal==1 ); + + memset(&pWal->hdr, 0, sizeof(WalIndexHdr)); + sqlite3_randomness(8, pWal->hdr.aSalt); + + rc = sqlite3OsFileSize(pWalFd, &nSize); + if( rc==SQLITE_OK ){ + if( nSize>WAL_HDRSIZE ){ + u8 aBuf[WAL_HDRSIZE]; /* Buffer to load WAL header into */ + u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */ + int szFrame; /* Number of bytes in buffer aFrame[] */ + u8 *aData; /* Pointer to data part of aFrame buffer */ + int iFrame; /* Index of last frame read */ + i64 iOffset; /* Next offset to read from log file */ + int szPage; /* Page size according to the log */ + u32 magic; /* Magic value read from WAL header */ + u32 version; /* Magic value read from WAL header */ + int isValid; /* True if this frame is valid */ + + /* Read in the WAL header. */ + rc = sqlite3OsRead(pWalFd, aBuf, WAL_HDRSIZE, 0); + if( rc!=SQLITE_OK ){ + return rc; + } + + /* If the database page size is not a power of two, or is greater than + ** SQLITE_MAX_PAGE_SIZE, conclude that the WAL file contains no valid + ** data. Similarly, if the 'magic' value is invalid, ignore the whole + ** WAL file. 
+ */ + magic = sqlite3Get4byte(&aBuf[0]); + szPage = sqlite3Get4byte(&aBuf[8]); + if( (magic&0xFFFFFFFE)!=WAL_MAGIC + || szPage&(szPage-1) + || szPage>SQLITE_MAX_PAGE_SIZE + || szPage<512 + ){ + return SQLITE_OK; + } + pWal->hdr.bigEndCksum = (u8)(magic&0x00000001); + pWal->szPage = szPage; + + /* Verify that the WAL header checksum is correct */ + walChecksumBytes(pWal->hdr.bigEndCksum==SQLITE_BIGENDIAN, + aBuf, WAL_HDRSIZE-2*4, 0, pWal->hdr.aFrameCksum + ); + if( pWal->hdr.aFrameCksum[0]!=sqlite3Get4byte(&aBuf[24]) + || pWal->hdr.aFrameCksum[1]!=sqlite3Get4byte(&aBuf[28]) + ){ + return SQLITE_OK; + } + + memcpy(&pWal->hdr.aSalt, &aBuf[16], 8); + *pnCkpt = sqlite3Get4byte(&aBuf[12]); + + /* Verify that the version number on the WAL format is one that + ** are able to understand */ + version = sqlite3Get4byte(&aBuf[4]); + if( version!=WAL_VERSION1 && version!=WAL_VERSION2 ){ + return SQLITE_CANTOPEN_BKPT; + } + pWal->hdr.iVersion = version; + + /* Malloc a buffer to read frames into. */ + szFrame = szPage + WAL_FRAME_HDRSIZE; + aFrame = (u8 *)sqlite3_malloc64(szFrame); + if( !aFrame ){ + return SQLITE_NOMEM_BKPT; + } + aData = &aFrame[WAL_FRAME_HDRSIZE]; + + /* Read all frames from the log file. */ + iFrame = 0; + for(iOffset=WAL_HDRSIZE; (iOffset+szFrame)<=nSize; iOffset+=szFrame){ + u32 pgno; /* Database page number for frame */ + u32 nTruncate; /* dbsize field from frame header */ + + /* Read and decode the next log frame. */ + iFrame++; + rc = sqlite3OsRead(pWalFd, aFrame, szFrame, iOffset); + if( rc!=SQLITE_OK ) break; + isValid = walDecodeFrame(pWal, &pgno, &nTruncate, aData, aFrame); + if( !isValid ) break; + rc = walIndexAppend(pWal, iWal, iFrame, pgno); + if( rc!=SQLITE_OK ) break; + + /* If nTruncate is non-zero, this is a commit record. 
*/ + if( nTruncate ){ + pWal->hdr.mxFrame = iFrame; + pWal->hdr.nPage = nTruncate; + pWal->hdr.szPage = (u16)((szPage&0xff00) | (szPage>>16)); + testcase( szPage<=32768 ); + testcase( szPage>=65536 ); + aFrameCksum[0] = pWal->hdr.aFrameCksum[0]; + aFrameCksum[1] = pWal->hdr.aFrameCksum[1]; + } + } + + sqlite3_free(aFrame); + }else if( pbZero ){ + *pbZero = 1; + } + } + + pWal->hdr.aFrameCksum[0] = aFrameCksum[0]; + pWal->hdr.aFrameCksum[1] = aFrameCksum[1]; + + return rc; +} + +static int walOpenWal2(Wal *pWal){ + int rc = SQLITE_OK; + if( !isOpen(pWal->apWalFd[1]) ){ + int f = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_WAL); + rc = sqlite3OsOpen(pWal->pVfs, pWal->zWalName2, pWal->apWalFd[1], f, &f); + } + return rc; +} + +static int walTruncateWal2(Wal *pWal){ + int bIs; + int rc; + assert( !isOpen(pWal->apWalFd[1]) ); + rc = sqlite3OsAccess(pWal->pVfs, pWal->zWalName2, SQLITE_ACCESS_EXISTS, &bIs); + if( rc==SQLITE_OK && bIs ){ + rc = walOpenWal2(pWal); + if( rc==SQLITE_OK ){ + rc = sqlite3OsTruncate(pWal->apWalFd[1], 0); + sqlite3OsClose(pWal->apWalFd[1]); + } + } + return rc; +} /* ** Recover the wal-index by reading the write-ahead log file. ** ** This routine first tries to establish an exclusive lock on the @@ -1132,13 +1602,15 @@ ** that this thread is running recovery. If unable to establish ** the necessary locks, this routine returns SQLITE_BUSY. */ static int walIndexRecover(Wal *pWal){ int rc; /* Return Code */ - i64 nSize; /* Size of log file */ - u32 aFrameCksum[2] = {0, 0}; int iLock; /* Lock offset to lock for checkpoint */ + u32 nCkpt1 = 0xFFFFFFFF; + u32 nCkpt2 = 0xFFFFFFFF; + int bZero = 0; + WalIndexHdr hdr; /* Obtain an exclusive lock on all byte in the locking range not already ** locked by the caller. The caller is guaranteed to have locked the ** WAL_WRITE_LOCK byte, and may have also locked the WAL_CKPT_LOCK byte. 
** If successful, the same bytes that are locked here are concurrent before @@ -1160,152 +1632,137 @@ return rc; } WALTRACE(("WAL%p: recovery begin...\n", pWal)); - memset(&pWal->hdr, 0, sizeof(WalIndexHdr)); - - rc = sqlite3OsFileSize(pWal->pWalFd, &nSize); - if( rc!=SQLITE_OK ){ - goto recovery_error; - } - - if( nSize>WAL_HDRSIZE ){ - u8 aBuf[WAL_HDRSIZE]; /* Buffer to load WAL header into */ - u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */ - int szFrame; /* Number of bytes in buffer aFrame[] */ - u8 *aData; /* Pointer to data part of aFrame buffer */ - int iFrame; /* Index of last frame read */ - i64 iOffset; /* Next offset to read from log file */ - int szPage; /* Page size according to the log */ - u32 magic; /* Magic value read from WAL header */ - u32 version; /* Magic value read from WAL header */ - int isValid; /* True if this frame is valid */ - - /* Read in the WAL header. */ - rc = sqlite3OsRead(pWal->pWalFd, aBuf, WAL_HDRSIZE, 0); - if( rc!=SQLITE_OK ){ - goto recovery_error; - } - - /* If the database page size is not a power of two, or is greater than - ** SQLITE_MAX_PAGE_SIZE, conclude that the WAL file contains no valid - ** data. Similarly, if the 'magic' value is invalid, ignore the whole - ** WAL file. 
- */ - magic = sqlite3Get4byte(&aBuf[0]); - szPage = sqlite3Get4byte(&aBuf[8]); - if( (magic&0xFFFFFFFE)!=WAL_MAGIC - || szPage&(szPage-1) - || szPage>SQLITE_MAX_PAGE_SIZE - || szPage<512 - ){ - goto finished; - } - pWal->hdr.bigEndCksum = (u8)(magic&0x00000001); - pWal->szPage = szPage; - pWal->nCkpt = sqlite3Get4byte(&aBuf[12]); - memcpy(&pWal->hdr.aSalt, &aBuf[16], 8); - - /* Verify that the WAL header checksum is correct */ - walChecksumBytes(pWal->hdr.bigEndCksum==SQLITE_BIGENDIAN, - aBuf, WAL_HDRSIZE-2*4, 0, pWal->hdr.aFrameCksum - ); - if( pWal->hdr.aFrameCksum[0]!=sqlite3Get4byte(&aBuf[24]) - || pWal->hdr.aFrameCksum[1]!=sqlite3Get4byte(&aBuf[28]) - ){ - goto finished; - } - - /* Verify that the version number on the WAL format is one that - ** are able to understand */ - version = sqlite3Get4byte(&aBuf[4]); - if( version!=WAL_MAX_VERSION ){ - rc = SQLITE_CANTOPEN_BKPT; - goto finished; - } - - /* Malloc a buffer to read frames into. */ - szFrame = szPage + WAL_FRAME_HDRSIZE; - aFrame = (u8 *)sqlite3_malloc64(szFrame); - if( !aFrame ){ - rc = SQLITE_NOMEM_BKPT; - goto recovery_error; - } - aData = &aFrame[WAL_FRAME_HDRSIZE]; - - /* Read all frames from the log file. */ - iFrame = 0; - for(iOffset=WAL_HDRSIZE; (iOffset+szFrame)<=nSize; iOffset+=szFrame){ - u32 pgno; /* Database page number for frame */ - u32 nTruncate; /* dbsize field from frame header */ - - /* Read and decode the next log frame. */ - iFrame++; - rc = sqlite3OsRead(pWal->pWalFd, aFrame, szFrame, iOffset); - if( rc!=SQLITE_OK ) break; - isValid = walDecodeFrame(pWal, &pgno, &nTruncate, aData, aFrame); - if( !isValid ) break; - rc = walIndexAppend(pWal, iFrame, pgno); - if( rc!=SQLITE_OK ) break; - - /* If nTruncate is non-zero, this is a commit record. 
*/ - if( nTruncate ){ - pWal->hdr.mxFrame = iFrame; - pWal->hdr.nPage = nTruncate; - pWal->hdr.szPage = (u16)((szPage&0xff00) | (szPage>>16)); - testcase( szPage<=32768 ); - testcase( szPage>=65536 ); - aFrameCksum[0] = pWal->hdr.aFrameCksum[0]; - aFrameCksum[1] = pWal->hdr.aFrameCksum[1]; - } - } - - sqlite3_free(aFrame); - } - -finished: + /* Recover the *-wal file. If a valid version-1 header is recovered + ** from it, do not open the *-wal2 file. Even if it exists. + ** + ** Otherwise, if the *-wal2 file exists or if the "wal2" flag was + ** specified when sqlite3WalOpen() was called, open and recover + ** the *-wal2 file. Except, if the *-wal file was zero bytes in size, + ** truncate the *-wal2 to zero bytes in size. + ** + ** After this block has run, if the *-wal2 file is open the system + ** starts up in VERSION2 mode. In this case pWal->hdr contains the + ** wal-index header considering only *-wal2. Stack variable hdr + ** contains the wal-index header considering only *-wal. The hash + ** tables are populated for both. + ** + ** Or, if the *-wal2 file is not open, start up in VERSION1 mode. + ** pWal->hdr is already populated. 
+ */ + rc = walIndexRecoverOne(pWal, 0, &nCkpt1, &bZero); + assert( pWal->hdr.iVersion==0 + || pWal->hdr.iVersion==WAL_VERSION1 + || pWal->hdr.iVersion==WAL_VERSION2 + ); + if( rc==SQLITE_OK && bZero ){ + rc = walTruncateWal2(pWal); + } + if( rc==SQLITE_OK && pWal->hdr.iVersion!=WAL_VERSION1 ){ + int bOpen = 1; + sqlite3_vfs *pVfs = pWal->pVfs; + if( pWal->hdr.iVersion==0 && pWal->bWal2==0 ){ + rc = sqlite3OsAccess(pVfs, pWal->zWalName2, SQLITE_ACCESS_EXISTS, &bOpen); + } + if( rc==SQLITE_OK && bOpen ){ + rc = walOpenWal2(pWal); + if( rc==SQLITE_OK ){ + hdr = pWal->hdr; + rc = walIndexRecoverOne(pWal, 1, &nCkpt2, 0); + } + } + } + if( rc==SQLITE_OK ){ volatile WalCkptInfo *pInfo; - int i; - pWal->hdr.aFrameCksum[0] = aFrameCksum[0]; - pWal->hdr.aFrameCksum[1] = aFrameCksum[1]; + + if( isOpen(pWal->apWalFd[1]) ){ + /* The case where *-wal2 may follow *-wal */ + if( nCkpt2<=0x0F && nCkpt2==nCkpt1+1 ){ + if( sqlite3Get4byte((u8*)(&pWal->hdr.aSalt[0]))==hdr.aFrameCksum[0] + && sqlite3Get4byte((u8*)(&pWal->hdr.aSalt[1]))==hdr.aFrameCksum[1] + ){ + walidxSetFile(&pWal->hdr, 1); + walidxSetMxFrame(&pWal->hdr, 1, pWal->hdr.mxFrame); + walidxSetMxFrame(&pWal->hdr, 0, hdr.mxFrame); + }else{ + pWal->hdr = hdr; + } + }else + + /* When *-wal may follow *-wal2 */ + if( (nCkpt2==0x0F && nCkpt1==0) || (nCkpt2<0x0F && nCkpt2==nCkpt1-1) ){ + if( sqlite3Get4byte((u8*)(&hdr.aSalt[0]))==pWal->hdr.aFrameCksum[0] + && sqlite3Get4byte((u8*)(&hdr.aSalt[1]))==pWal->hdr.aFrameCksum[1] + ){ + SWAP(WalIndexHdr, pWal->hdr, hdr); + walidxSetMxFrame(&pWal->hdr, 1, hdr.mxFrame); + }else{ + walidxSetFile(&pWal->hdr, 1); + walidxSetMxFrame(&pWal->hdr, 1, pWal->hdr.mxFrame); + walidxSetMxFrame(&pWal->hdr, 0, 0); + } + }else + + /* Fallback */ + if( nCkpt1<=nCkpt2 ){ + pWal->hdr = hdr; + }else{ + walidxSetFile(&pWal->hdr, 1); + walidxSetMxFrame(&pWal->hdr, 1, pWal->hdr.mxFrame); + walidxSetMxFrame(&pWal->hdr, 0, 0); + } + pWal->hdr.iVersion = WAL_VERSION2; + }else{ + pWal->hdr.iVersion = WAL_VERSION1; 
+ } + walIndexWriteHdr(pWal); /* Reset the checkpoint-header. This is safe because this thread is ** currently holding locks that exclude all other readers, writers and - ** checkpointers. - */ + ** checkpointers. */ pInfo = walCkptInfo(pWal); - pInfo->nBackfill = 0; - pInfo->nBackfillAttempted = pWal->hdr.mxFrame; - pInfo->aReadMark[0] = 0; - for(i=1; iaReadMark[i] = READMARK_NOT_USED; - if( pWal->hdr.mxFrame ) pInfo->aReadMark[1] = pWal->hdr.mxFrame; + memset((void*)pInfo, 0, sizeof(WalCkptInfo)); + if( 0==isWalMode2(pWal) ){ + int i; + pInfo->nBackfillAttempted = pWal->hdr.mxFrame; + pInfo->aReadMark[0] = 0; + for(i=1; iaReadMark[i] = READMARK_NOT_USED; + if( pWal->hdr.mxFrame ) pInfo->aReadMark[1] = pWal->hdr.mxFrame; + } /* If more than one frame was recovered from the log file, report an ** event via sqlite3_log(). This is to help with identifying performance ** problems caused by applications routinely shutting down without - ** checkpointing the log file. - */ + ** checkpointing the log file. */ if( pWal->hdr.nPage ){ - sqlite3_log(SQLITE_NOTICE_RECOVER_WAL, - "recovered %d frames from WAL file %s", - pWal->hdr.mxFrame, pWal->zWalName - ); + if( isWalMode2(pWal) ){ + sqlite3_log(SQLITE_NOTICE_RECOVER_WAL, + "recovered (%d,%d) frames from WAL files %s[2] (wal2 mode)", + walidxGetMxFrame(&pWal->hdr, 0), walidxGetMxFrame(&pWal->hdr, 1), + pWal->zWalName + ); + }else{ + sqlite3_log(SQLITE_NOTICE_RECOVER_WAL, + "recovered %d frames from WAL file %s", + pWal->hdr.mxFrame, pWal->zWalName + ); + } } } -recovery_error: WALTRACE(("WAL%p: recovery %s\n", pWal, rc ? "failed" : "ok")); walUnlockExclusive(pWal, iLock, WAL_READ_LOCK(0)-iLock); walUnlockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1); return rc; } /* -** Close an open wal-index. +** Close an open wal-index and wal files. 
*/ static void walIndexClose(Wal *pWal, int isDelete){ if( pWal->exclusiveMode==WAL_HEAPMEMORY_MODE || pWal->bShmUnreliable ){ int i; for(i=0; i<pWal->nWiData; i++){ @@ -1314,10 +1771,12 @@ } } if( pWal->exclusiveMode!=WAL_HEAPMEMORY_MODE ){ sqlite3OsShmUnmap(pWal->pDbFd, isDelete); } + sqlite3OsClose(pWal->apWalFd[0]); + sqlite3OsClose(pWal->apWalFd[1]); } /* ** Open a connection to the WAL file zWalName. The database file must ** already be opened on connection pDbFd. The buffer that zWalName points @@ -1337,15 +1796,18 @@ sqlite3_vfs *pVfs, /* vfs module to open wal and wal-index */ sqlite3_file *pDbFd, /* The open database file */ const char *zWalName, /* Name of the WAL file */ int bNoShm, /* True to run in heap-memory mode */ i64 mxWalSize, /* Truncate WAL to this size on reset */ + int bWal2, /* True to open in wal2 mode */ Wal **ppWal /* OUT: Allocated Wal handle */ ){ int rc; /* Return Code */ Wal *pRet; /* Object to allocate and return */ int flags; /* Flags passed to OsOpen() */ + int nWalName; /* Length of zWalName in bytes */ + int nByte; /* Bytes of space to allocate */ assert( zWalName && zWalName[0] ); assert( pDbFd ); /* In the amalgamation, the os_unix.c and os_win.c source files come before @@ -1361,38 +1823,46 @@ #endif #ifdef UNIX_SHM_BASE assert( UNIX_SHM_BASE==WALINDEX_LOCK_OFFSET ); #endif + nWalName = sqlite3Strlen30(zWalName); + nByte = sizeof(Wal) + pVfs->szOsFile*2 + nWalName+2; /* Allocate an instance of struct Wal to return.
*/ *ppWal = 0; - pRet = (Wal*)sqlite3MallocZero(sizeof(Wal) + pVfs->szOsFile); + pRet = (Wal*)sqlite3MallocZero(nByte); if( !pRet ){ return SQLITE_NOMEM_BKPT; } pRet->pVfs = pVfs; - pRet->pWalFd = (sqlite3_file *)&pRet[1]; + pRet->apWalFd[0] = (sqlite3_file*)((char*)pRet+sizeof(Wal)); + pRet->apWalFd[1] = (sqlite3_file*)((char*)pRet+sizeof(Wal)+pVfs->szOsFile); pRet->pDbFd = pDbFd; - pRet->readLock = -1; + pRet->readLock = WAL_LOCK_NONE; pRet->mxWalSize = mxWalSize; pRet->zWalName = zWalName; pRet->syncHeader = 1; pRet->padToSectorBoundary = 1; pRet->exclusiveMode = (bNoShm ? WAL_HEAPMEMORY_MODE: WAL_NORMAL_MODE); + pRet->bWal2 = bWal2; - /* Open file handle on the write-ahead log file. */ + pRet->zWalName2 = (char*)pRet + sizeof(Wal) + 2*pVfs->szOsFile; + memcpy(pRet->zWalName2, zWalName, nWalName); + pRet->zWalName2[nWalName] = '2'; + pRet->zWalName2[nWalName+1] = '\0'; + + /* Open a file handle on the first write-ahead log file. */ flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_WAL); - rc = sqlite3OsOpen(pVfs, zWalName, pRet->pWalFd, flags, &flags); + rc = sqlite3OsOpen(pVfs, zWalName, pRet->apWalFd[0], flags, &flags); if( rc==SQLITE_OK && flags&SQLITE_OPEN_READONLY ){ pRet->readOnly = WAL_RDONLY; } if( rc!=SQLITE_OK ){ walIndexClose(pRet, 0); - sqlite3OsClose(pRet->pWalFd); sqlite3_free(pRet); }else{ int iDC = sqlite3OsDeviceCharacteristics(pDbFd); if( iDC & SQLITE_IOCAP_SEQUENTIAL ){ pRet->syncHeader = 0; } if( iDC & SQLITE_IOCAP_POWERSAFE_OVERWRITE ){ @@ -1598,38 +2068,54 @@ sqlite3_free(p); } /* ** Construct a WalInterator object that can be used to loop over all -** pages in the WAL following frame nBackfill in ascending order. Frames +** pages in wal file iWal following frame nBackfill in ascending order. Frames ** nBackfill or earlier may be included - excluding them is an optimization ** only. The caller must hold the checkpoint lock. ** -** On success, make *pp point to the newly allocated WalInterator object -** return SQLITE_OK. 
Otherwise, return an error code. If this routine -** returns an error, the value of *pp is undefined. +** On success, make *pp point to the newly allocated WalIterator object +** and return SQLITE_OK. Otherwise, return an error code. If this routine +** returns an error, the final value of *pp is undefined. ** ** The calling routine should invoke walIteratorFree() to destroy the ** WalIterator object when it has finished with it. */ -static int walIteratorInit(Wal *pWal, u32 nBackfill, WalIterator **pp){ +static int walIteratorInit( + Wal *pWal, + int iWal, + u32 nBackfill, + WalIterator **pp +){ WalIterator *p; /* Return value */ int nSegment; /* Number of segments to merge */ u32 iLast; /* Last frame in log */ int nByte; /* Number of bytes to allocate */ int i; /* Iterator variable */ + int iLastSeg; /* Last hash table to iterate through */ ht_slot *aTmp; /* Temp space used by merge-sort */ int rc = SQLITE_OK; /* Return Code */ + int iMode = isWalMode2(pWal) ? 2 : 1; + + assert( isWalMode2(pWal) || iWal==0 ); + assert( 0==isWalMode2(pWal) || nBackfill==0 ); /* This routine only runs while holding the checkpoint lock. And ** it only runs if there is actually content in the log (mxFrame>0). */ - assert( pWal->ckptLock && pWal->hdr.mxFrame>0 ); - iLast = pWal->hdr.mxFrame; + iLast = walidxGetMxFrame(&pWal->hdr, iWal); + assert( pWal->ckptLock && iLast>0 ); + + if( iMode==2 ){ + iLastSeg = walFramePage2(iWal, iLast); + }else{ + iLastSeg = walFramePage(iLast); + } + nSegment = 1 + (iLastSeg/iMode); /* Allocate space for the WalIterator object.
*/ - nSegment = walFramePage(iLast) + 1; nByte = sizeof(WalIterator) + (nSegment-1)*sizeof(struct WalSegment) + iLast*sizeof(ht_slot); p = (WalIterator *)sqlite3_malloc64(nByte); if( !p ){ @@ -1646,36 +2132,46 @@ ); if( !aTmp ){ rc = SQLITE_NOMEM_BKPT; } - for(i=walFramePage(nBackfill+1); rc==SQLITE_OK && i=2 ); + }else{ + iZero = sLoc.iZero; + } sLoc.aPgno++; - if( (i+1)==nSegment ){ - nEntry = (int)(iLast - sLoc.iZero); + if( i==iLastSeg ){ + nEntry = (int)(iLast - iZero); }else{ nEntry = (int)((u32*)sLoc.aHash - (u32*)sLoc.aPgno); } - aIndex = &((ht_slot *)&p->aSegment[p->nSegment])[sLoc.iZero]; - sLoc.iZero++; + aIndex = &((ht_slot *)&p->aSegment[p->nSegment])[iZero]; + iZero++; for(j=0; jaSegment[i].iZero = sLoc.iZero; - p->aSegment[i].nEntry = nEntry; - p->aSegment[i].aIndex = aIndex; - p->aSegment[i].aPgno = (u32 *)sLoc.aPgno; + walMergesort((u32*)sLoc.aPgno, aTmp, aIndex, &nEntry); + p->aSegment[i/iMode].iZero = iZero; + p->aSegment[i/iMode].nEntry = nEntry; + p->aSegment[i/iMode].aIndex = aIndex; + p->aSegment[i/iMode].aPgno = (u32*)sLoc.aPgno; } } sqlite3_free(aTmp); if( rc!=SQLITE_OK ){ @@ -1733,10 +2229,11 @@ */ static void walRestartHdr(Wal *pWal, u32 salt1){ volatile WalCkptInfo *pInfo = walCkptInfo(pWal); int i; /* Loop counter */ u32 *aSalt = pWal->hdr.aSalt; /* Big-endian salt values */ + assert( isWalMode2(pWal)==0 ); pWal->nCkpt++; pWal->hdr.mxFrame = 0; sqlite3Put4byte((u8*)&aSalt[0], 1 + sqlite3Get4byte((u8*)&aSalt[0])); memcpy(&pWal->hdr.aSalt[1], &salt1, 4); walIndexWriteHdr(pWal); @@ -1744,10 +2241,72 @@ pInfo->nBackfillAttempted = 0; pInfo->aReadMark[1] = 0; for(i=2; iaReadMark[i] = READMARK_NOT_USED; assert( pInfo->aReadMark[0]==0 ); } + +/* +** This function is used in wal2 mode. +** +** This function is called when writer pWal is just about to start +** writing out frames. Parameter iApp is the current wal file. The "other" wal +** file (wal file !iApp) has been fully checkpointed. 
This function returns +** SQLITE_OK if there are no readers preventing the writer from switching to +** the other wal file. Or SQLITE_BUSY if there are. +*/ +static int wal2RestartOk(Wal *pWal, int iApp){ + /* The other wal file (wal file !iApp) can be overwritten if there + ** are no readers reading from it - no "full" or "partial" locks. + ** Technically speaking it is not possible for any reader to hold + ** a "part" lock, as this would have prevented the file from being + ** checkpointed. But checking anyway doesn't hurt. The following + ** is equivalent to: + ** + ** if( iApp==0 ) eLock = WAL_LOCK_PART1_FULL2; + ** if( iApp==1 ) eLock = WAL_LOCK_PART1; + */ + int eLock = 1 + (iApp==0); + + assert( WAL_LOCK_PART1==1 ); + assert( WAL_LOCK_PART1_FULL2==2 ); + assert( WAL_LOCK_PART2_FULL1==3 ); + assert( WAL_LOCK_PART2==4 ); + + assert( iApp!=0 || eLock==WAL_LOCK_PART1_FULL2 ); + assert( iApp!=1 || eLock==WAL_LOCK_PART1 ); + + return walLockExclusive(pWal, WAL_READ_LOCK(eLock), 3); +} +static void wal2RestartFinished(Wal *pWal, int iApp){ + walUnlockExclusive(pWal, WAL_READ_LOCK(1 + (iApp==0)), 3); +} + +/* +** This function is used in wal2 mode. +** +** This function is called when a checkpointer wishes to checkpoint wal +** file iCkpt. It takes the required lock and, if successful, returns +** SQLITE_OK. Otherwise, an SQLite error code (e.g. SQLITE_BUSY). If this +** function returns SQLITE_OK, it is the responsibility of the caller +** to invoke wal2CheckpointFinished() to release the lock. 
+*/ +static int wal2CheckpointOk(Wal *pWal, int iCkpt){ + int eLock = 1 + (iCkpt*2); + + assert( WAL_LOCK_PART1==1 ); + assert( WAL_LOCK_PART1_FULL2==2 ); + assert( WAL_LOCK_PART2_FULL1==3 ); + assert( WAL_LOCK_PART2==4 ); + + assert( iCkpt!=0 || eLock==WAL_LOCK_PART1 ); + assert( iCkpt!=1 || eLock==WAL_LOCK_PART2_FULL1 ); + + return walLockExclusive(pWal, WAL_READ_LOCK(eLock), 2); +} +static void wal2CheckpointFinished(Wal *pWal, int iCkpt){ + walUnlockExclusive(pWal, WAL_READ_LOCK(1 + (iCkpt*2)), 2); +} /* ** Copy as much content as we can from the WAL back into the database file ** in response to an sqlite3_wal_checkpoint() request or the equivalent. ** @@ -1794,135 +2353,163 @@ u32 iFrame = 0; /* Wal frame containing data for iDbpage */ u32 mxSafeFrame; /* Max frame that can be backfilled */ u32 mxPage; /* Max database page to write */ int i; /* Loop counter */ volatile WalCkptInfo *pInfo; /* The checkpoint status information */ + int bWal2 = isWalMode2(pWal); /* True for wal2 connections */ + int iCkpt = bWal2 ? !walidxGetFile(&pWal->hdr) : 0; + mxSafeFrame = walidxGetMxFrame(&pWal->hdr, iCkpt); szPage = walPagesize(pWal); testcase( szPage<=32768 ); testcase( szPage>=65536 ); pInfo = walCkptInfo(pWal); - if( pInfo->nBackfillhdr.mxFrame ){ + if( (bWal2==1 && pInfo->nBackfill==0 && mxSafeFrame) + || (bWal2==0 && pInfo->nBackfillapWalFd[iCkpt]; + mxPage = pWal->hdr.nPage; + + /* If this is a wal2 system, check for a reader holding a lock + ** preventing this checkpoint operation. If one is found, return + ** early. */ + if( bWal2 ){ + rc = wal2CheckpointOk(pWal, iCkpt); + if( rc!=SQLITE_OK ) return rc; + } /* EVIDENCE-OF: R-62920-47450 The busy-handler callback is never invoked ** in the SQLITE_CHECKPOINT_PASSIVE mode. */ assert( eMode!=SQLITE_CHECKPOINT_PASSIVE || xBusy==0 ); - /* Compute in mxSafeFrame the index of the last frame of the WAL that is - ** safe to write into the database. 
Frames beyond mxSafeFrame might - ** overwrite database pages that are in use by active readers and thus - ** cannot be backfilled from the WAL. + /* If this is a wal system (not wal2), compute in mxSafeFrame the index + ** of the last frame of the WAL that is safe to write into the database. + ** Frames beyond mxSafeFrame might overwrite database pages that are in + ** use by active readers and thus cannot be backfilled from the WAL. */ - mxSafeFrame = pWal->hdr.mxFrame; - mxPage = pWal->hdr.nPage; - for(i=1; iaReadMark[i]; - if( mxSafeFrame>y ){ - assert( y<=pWal->hdr.mxFrame ); - rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_READ_LOCK(i), 1); - if( rc==SQLITE_OK ){ - pInfo->aReadMark[i] = (i==1 ? mxSafeFrame : READMARK_NOT_USED); - walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1); - }else if( rc==SQLITE_BUSY ){ - mxSafeFrame = y; - xBusy = 0; - }else{ - goto walcheckpoint_out; + if( bWal2==0 ){ + mxSafeFrame = pWal->hdr.mxFrame; + mxPage = pWal->hdr.nPage; + for(i=1; iaReadMark[i]; + if( mxSafeFrame>y ){ + assert( y<=pWal->hdr.mxFrame ); + rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_READ_LOCK(i), 1); + if( rc==SQLITE_OK ){ + pInfo->aReadMark[i] = (i==1 ? 
mxSafeFrame : READMARK_NOT_USED); + walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1); + }else if( rc==SQLITE_BUSY ){ + mxSafeFrame = y; + xBusy = 0; + }else{ + goto walcheckpoint_out; + } } } } /* Allocate the iterator */ - if( pInfo->nBackfillnBackfill, &pIter); + if( bWal2 || pInfo->nBackfillnBackfill==0 ); + rc = walIteratorInit(pWal, iCkpt, pInfo->nBackfill, &pIter); assert( rc==SQLITE_OK || pIter==0 ); } - if( pIter - && (rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_READ_LOCK(0),1))==SQLITE_OK - ){ + if( pIter && (bWal2 + || (rc = walBusyLock(pWal, xBusy, pBusyArg,WAL_READ_LOCK(0),1))==SQLITE_OK + )){ u32 nBackfill = pInfo->nBackfill; + assert( bWal2==0 || nBackfill==0 ); pInfo->nBackfillAttempted = mxSafeFrame; - /* Sync the WAL to disk */ - rc = sqlite3OsSync(pWal->pWalFd, CKPT_SYNC_FLAGS(sync_flags)); + /* Sync the wal file being checkpointed to disk */ + rc = sqlite3OsSync(pWalFd, CKPT_SYNC_FLAGS(sync_flags)); /* If the database may grow as a result of this checkpoint, hint - ** about the eventual size of the db file to the VFS layer. - */ + ** about the eventual size of the db file to the VFS layer. */ if( rc==SQLITE_OK ){ i64 nReq = ((i64)mxPage * szPage); i64 nSize; /* Current size of database file */ rc = sqlite3OsFileSize(pWal->pDbFd, &nSize); if( rc==SQLITE_OK && nSizepDbFd, SQLITE_FCNTL_SIZE_HINT, &nReq); } } - /* Iterate through the contents of the WAL, copying data to the db file */ while( rc==SQLITE_OK && 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){ i64 iOffset; - assert( walFramePgno(pWal, iFrame)==iDbpage ); + + assert( bWal2==1 || walFramePgno(pWal, iFrame)==iDbpage ); + assert( bWal2==0 || walFramePgno2(pWal, iCkpt, iFrame)==iDbpage ); + if( db->u1.isInterrupted ){ rc = db->mallocFailed ? 
SQLITE_NOMEM_BKPT : SQLITE_INTERRUPT; break; } if( iFrame<=nBackfill || iFrame>mxSafeFrame || iDbpage>mxPage ){ + assert( bWal2==0 || iDbpage>mxPage ); continue; } iOffset = walFrameOffset(iFrame, szPage) + WAL_FRAME_HDRSIZE; + WALTRACE(("WAL%p: checkpoint frame %d of wal %d to db page %d\n", + pWal, (int)iFrame, iCkpt, (int)iDbpage + )); /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL file */ - rc = sqlite3OsRead(pWal->pWalFd, zBuf, szPage, iOffset); + rc = sqlite3OsRead(pWalFd, zBuf, szPage, iOffset); if( rc!=SQLITE_OK ) break; iOffset = (iDbpage-1)*(i64)szPage; testcase( IS_BIG_INT(iOffset) ); rc = sqlite3OsWrite(pWal->pDbFd, zBuf, szPage, iOffset); if( rc!=SQLITE_OK ) break; } - /* If work was actually accomplished... */ - if( rc==SQLITE_OK ){ - if( mxSafeFrame==walIndexHdr(pWal)->mxFrame ){ + /* If work was actually accomplished, truncate the db file, sync the wal + ** file and set WalCkptInfo.nBackfill to indicate so. */ + if( rc==SQLITE_OK && (bWal2 || mxSafeFrame==walIndexHdr(pWal)->mxFrame) ){ + if( !bWal2 ){ i64 szDb = pWal->hdr.nPage*(i64)szPage; testcase( IS_BIG_INT(szDb) ); rc = sqlite3OsTruncate(pWal->pDbFd, szDb); - if( rc==SQLITE_OK ){ - rc = sqlite3OsSync(pWal->pDbFd, CKPT_SYNC_FLAGS(sync_flags)); - } } if( rc==SQLITE_OK ){ - pInfo->nBackfill = mxSafeFrame; + rc = sqlite3OsSync(pWal->pDbFd, CKPT_SYNC_FLAGS(sync_flags)); } } + if( rc==SQLITE_OK ){ + pInfo->nBackfill = bWal2 ? 1 : mxSafeFrame; + } /* Release the reader lock held while backfilling */ - walUnlockExclusive(pWal, WAL_READ_LOCK(0), 1); + if( bWal2==0 ){ + walUnlockExclusive(pWal, WAL_READ_LOCK(0), 1); + } } if( rc==SQLITE_BUSY ){ /* Reset the return code so as not to report a checkpoint failure ** just because there are active readers. 
*/ rc = SQLITE_OK; } + if( bWal2 ) wal2CheckpointFinished(pWal, iCkpt); } /* If this is an SQLITE_CHECKPOINT_RESTART or TRUNCATE operation, and the ** entire wal file has been copied into the database file, then block ** until all readers have finished using the wal file. This ensures that ** the next process to write to the database restarts the wal file. */ - if( rc==SQLITE_OK && eMode!=SQLITE_CHECKPOINT_PASSIVE ){ + if( bWal2==0 && rc==SQLITE_OK && eMode!=SQLITE_CHECKPOINT_PASSIVE ){ assert( pWal->writeLock ); if( pInfo->nBackfillhdr.mxFrame ){ rc = SQLITE_BUSY; }else if( eMode>=SQLITE_CHECKPOINT_RESTART ){ u32 salt1; @@ -1943,11 +2530,11 @@ ** as it would leave the system in a state where the contents of ** the wal-index header do not match the contents of the ** file-system. To avoid this, update the wal-index header to ** indicate that the log file contains zero valid frames. */ walRestartHdr(pWal, salt1); - rc = sqlite3OsTruncate(pWal->pWalFd, 0); + rc = sqlite3OsTruncate(pWal->apWalFd[0], 0); } walUnlockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1); } } } @@ -1960,20 +2547,22 @@ /* ** If the WAL file is currently larger than nMax bytes in size, truncate ** it to exactly nMax bytes. If an error occurs while doing so, ignore it. */ static void walLimitSize(Wal *pWal, i64 nMax){ - i64 sz; - int rx; - sqlite3BeginBenignMalloc(); - rx = sqlite3OsFileSize(pWal->pWalFd, &sz); - if( rx==SQLITE_OK && (sz > nMax ) ){ - rx = sqlite3OsTruncate(pWal->pWalFd, nMax); - } - sqlite3EndBenignMalloc(); - if( rx ){ - sqlite3_log(rx, "cannot limit WAL size: %s", pWal->zWalName); + if( isWalMode2(pWal)==0 ){ + i64 sz; + int rx; + sqlite3BeginBenignMalloc(); + rx = sqlite3OsFileSize(pWal->apWalFd[0], &sz); + if( rx==SQLITE_OK && (sz > nMax ) ){ + rx = sqlite3OsTruncate(pWal->apWalFd[0], nMax); + } + sqlite3EndBenignMalloc(); + if( rx ){ + sqlite3_log(rx, "cannot limit WAL size: %s", pWal->zWalName); + } } } /* ** Close a connection to a log file. 
@@ -1998,43 +2587,54 @@ ** The EXCLUSIVE lock is not released before returning. */ if( zBuf!=0 && SQLITE_OK==(rc = sqlite3OsLock(pWal->pDbFd, SQLITE_LOCK_EXCLUSIVE)) ){ + int i; if( pWal->exclusiveMode==WAL_NORMAL_MODE ){ pWal->exclusiveMode = WAL_EXCLUSIVE_MODE; } - rc = sqlite3WalCheckpoint(pWal, db, - SQLITE_CHECKPOINT_PASSIVE, 0, 0, sync_flags, nBuf, zBuf, 0, 0 - ); - if( rc==SQLITE_OK ){ - int bPersist = -1; - sqlite3OsFileControlHint( - pWal->pDbFd, SQLITE_FCNTL_PERSIST_WAL, &bPersist - ); - if( bPersist!=1 ){ - /* Try to delete the WAL file if the checkpoint completed and - ** fsyned (rc==SQLITE_OK) and if we are not in persistent-wal - ** mode (!bPersist) */ - isDelete = 1; - }else if( pWal->mxWalSize>=0 ){ - /* Try to truncate the WAL file to zero bytes if the checkpoint - ** completed and fsynced (rc==SQLITE_OK) and we are in persistent - ** WAL mode (bPersist) and if the PRAGMA journal_size_limit is a - ** non-negative value (pWal->mxWalSize>=0). Note that we truncate - ** to zero bytes as truncating to the journal_size_limit might - ** leave a corrupt WAL file on disk. */ - walLimitSize(pWal, 0); - } + for(i=0; rc==SQLITE_OK && i<2; i++){ + rc = sqlite3WalCheckpoint(pWal, db, + SQLITE_CHECKPOINT_PASSIVE, 0, 0, sync_flags, nBuf, zBuf, 0, 0 + ); + if( rc==SQLITE_OK ){ + int bPersist = -1; + sqlite3OsFileControlHint( + pWal->pDbFd, SQLITE_FCNTL_PERSIST_WAL, &bPersist + ); + if( bPersist!=1 ){ + /* Try to delete the WAL file if the checkpoint completed and + ** fsynced (rc==SQLITE_OK) and if we are not in persistent-wal + ** mode (!bPersist) */ + isDelete = 1; + }else if( pWal->mxWalSize>=0 ){ + /* Try to truncate the WAL file to zero bytes if the checkpoint + ** completed and fsynced (rc==SQLITE_OK) and we are in persistent + ** WAL mode (bPersist) and if the PRAGMA journal_size_limit is a + ** non-negative value (pWal->mxWalSize>=0).
Note that we truncate + ** to zero bytes as truncating to the journal_size_limit might + ** leave a corrupt WAL file on disk. */ + walLimitSize(pWal, 0); + } + } + + if( isWalMode2(pWal)==0 ) break; + + walCkptInfo(pWal)->nBackfill = 0; + walidxSetFile(&pWal->hdr, !walidxGetFile(&pWal->hdr)); + pWal->writeLock = 1; + walIndexWriteHdr(pWal); + pWal->writeLock = 0; } } walIndexClose(pWal, isDelete); - sqlite3OsClose(pWal->pWalFd); if( isDelete ){ sqlite3BeginBenignMalloc(); sqlite3OsDelete(pWal->pVfs, pWal->zWalName, 0); + sqlite3OsDelete(pWal->pVfs, pWal->zWalName2, 0); sqlite3EndBenignMalloc(); } WALTRACE(("WAL%p: closed\n", pWal)); sqlite3_free((void *)pWal->apWiData); sqlite3_free(pWal); @@ -2211,11 +2811,13 @@ /* If the header is read successfully, check the version number to make ** sure the wal-index was not constructed with some future format that ** this version of SQLite cannot understand. */ - if( badHdr==0 && pWal->hdr.iVersion!=WALINDEX_MAX_VERSION ){ + if( badHdr==0 + && pWal->hdr.iVersion!=WAL_VERSION1 && pWal->hdr.iVersion!=WAL_VERSION2 + ){ rc = SQLITE_CANTOPEN_BKPT; } if( pWal->bShmUnreliable ){ if( rc!=SQLITE_OK ){ walIndexClose(pWal, 0); @@ -2303,11 +2905,11 @@ ** returned any SQLITE_READONLY value, it must return only SQLITE_READONLY ** or SQLITE_READONLY_CANTINIT or some error for all subsequent invocations, ** even if some external agent does a "chmod" to make the shared-memory ** writable by us, until sqlite3OsShmUnmap() has been called. ** This is a requirement on the VFS implementation. - */ + */ rc = sqlite3OsShmMap(pWal->pDbFd, 0, WALINDEX_PGSZ, 0, &pDummy); assert( rc!=SQLITE_OK ); /* SQLITE_OK not possible for read-only connection */ if( rc!=SQLITE_READONLY_CANTINIT ){ rc = (rc==SQLITE_READONLY ? 
WAL_RETRY : rc); goto begin_unreliable_shm_out; @@ -2320,11 +2922,11 @@ memcpy(&pWal->hdr, (void*)walIndexHdr(pWal), sizeof(WalIndexHdr)); /* Make sure some writer hasn't come in and changed the WAL file out ** from under us, then disconnected, while we were not looking. */ - rc = sqlite3OsFileSize(pWal->pWalFd, &szWal); + rc = sqlite3OsFileSize(pWal->apWalFd[0], &szWal); if( rc!=SQLITE_OK ){ goto begin_unreliable_shm_out; } if( szWalhdr.mxFrame==0 ? SQLITE_OK : WAL_RETRY); goto begin_unreliable_shm_out; } /* Check the salt keys at the start of the wal file still match. */ - rc = sqlite3OsRead(pWal->pWalFd, aBuf, WAL_HDRSIZE, 0); + rc = sqlite3OsRead(pWal->apWalFd[0], aBuf, WAL_HDRSIZE, 0); if( rc!=SQLITE_OK ){ goto begin_unreliable_shm_out; } if( memcmp(&pWal->hdr.aSalt, &aBuf[16], 8) ){ /* Some writer has wrapped the WAL file while we were not looking. @@ -2372,11 +2974,11 @@ ){ u32 pgno; /* Database page number for frame */ u32 nTruncate; /* dbsize field from frame header */ /* Read and decode the next log frame. */ - rc = sqlite3OsRead(pWal->pWalFd, aFrame, szFrame, iOffset); + rc = sqlite3OsRead(pWal->apWalFd[0], aFrame, szFrame, iOffset); if( rc!=SQLITE_OK ) break; if( !walDecodeFrame(pWal, &pgno, &nTruncate, aData, aFrame) ) break; /* If nTruncate is non-zero, then a complete transaction has been ** appended to this wal file. Set rc to WAL_RETRY and break out of @@ -2454,17 +3056,13 @@ ** so it takes care to hold an exclusive lock on the corresponding ** WAL_READ_LOCK() while changing values. 
*/ static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal, int cnt){ volatile WalCkptInfo *pInfo; /* Checkpoint information in wal-index */ - u32 mxReadMark; /* Largest aReadMark[] value */ - int mxI; /* Index of largest aReadMark[] value */ - int i; /* Loop counter */ int rc = SQLITE_OK; /* Return code */ - u32 mxFrame; /* Wal frame to lock to */ - assert( pWal->readLock<0 ); /* Not currently locked */ + assert( pWal->readLock==WAL_LOCK_NONE ); /* Not currently locked */ /* useWal may only be set for read/write connections */ assert( (pWal->readOnly & WAL_SHM_RDONLY)==0 || useWal==0 ); /* Take steps to avoid spinning forever if there is a protocol error. @@ -2533,134 +3131,167 @@ } assert( pWal->nWiData>0 ); assert( pWal->apWiData[0]!=0 ); pInfo = walCkptInfo(pWal); - if( !useWal && pInfo->nBackfill==pWal->hdr.mxFrame -#ifdef SQLITE_ENABLE_SNAPSHOT - && (pWal->pSnapshot==0 || pWal->hdr.mxFrame==0) -#endif - ){ - /* The WAL has been completely backfilled (or it is empty). - ** and can be safely ignored. - */ - rc = walLockShared(pWal, WAL_READ_LOCK(0)); - walShmBarrier(pWal); - if( rc==SQLITE_OK ){ - if( memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) ){ - /* It is not safe to allow the reader to continue here if frames - ** may have been appended to the log before READ_LOCK(0) was obtained. - ** When holding READ_LOCK(0), the reader ignores the entire log file, - ** which implies that the database file contains a trustworthy - ** snapshot. Since holding READ_LOCK(0) prevents a checkpoint from - ** happening, this is usually correct. - ** - ** However, if frames have been appended to the log (or if the log - ** is wrapped and written for that matter) before the READ_LOCK(0) - ** is obtained, that is not necessarily true. A checkpointer may - ** have started to backfill the appended frames but crashed before - ** it finished. Leaving a corrupt image in the database file. 
- */ - walUnlockShared(pWal, WAL_READ_LOCK(0)); - return WAL_RETRY; - } - pWal->readLock = 0; - return SQLITE_OK; - }else if( rc!=SQLITE_BUSY ){ - return rc; - } - } - - /* If we get this far, it means that the reader will want to use - ** the WAL to get at content from recent commits. The job now is - ** to select one of the aReadMark[] entries that is closest to - ** but not exceeding pWal->hdr.mxFrame and lock that entry. - */ - mxReadMark = 0; - mxI = 0; - mxFrame = pWal->hdr.mxFrame; -#ifdef SQLITE_ENABLE_SNAPSHOT - if( pWal->pSnapshot && pWal->pSnapshot->mxFramepSnapshot->mxFrame; - } -#endif - for(i=1; iaReadMark+i); - if( mxReadMark<=thisMark && thisMark<=mxFrame ){ - assert( thisMark!=READMARK_NOT_USED ); - mxReadMark = thisMark; - mxI = i; - } - } - if( (pWal->readOnly & WAL_SHM_RDONLY)==0 - && (mxReadMarkaReadMark+i,mxFrame); - mxI = i; - walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1); - break; - }else if( rc!=SQLITE_BUSY ){ - return rc; - } - } - } - if( mxI==0 ){ - assert( rc==SQLITE_BUSY || (pWal->readOnly & WAL_SHM_RDONLY)!=0 ); - return rc==SQLITE_BUSY ? WAL_RETRY : SQLITE_READONLY_CANTINIT; - } - - rc = walLockShared(pWal, WAL_READ_LOCK(mxI)); - if( rc ){ - return rc==SQLITE_BUSY ? WAL_RETRY : rc; - } - /* Now that the read-lock has been obtained, check that neither the - ** value in the aReadMark[] array or the contents of the wal-index - ** header have changed. - ** - ** It is necessary to check that the wal-index header did not change - ** between the time it was read and when the shared-lock was obtained - ** on WAL_READ_LOCK(mxI) was obtained to account for the possibility - ** that the log file may have been wrapped by a writer, or that frames - ** that occur later in the log than pWal->hdr.mxFrame may have been - ** copied into the database by a checkpointer. If either of these things - ** happened, then reading the database with the current value of - ** pWal->hdr.mxFrame risks reading a corrupted snapshot. So, retry - ** instead. 
- ** - ** Before checking that the live wal-index header has not changed - ** since it was read, set Wal.minFrame to the first frame in the wal - ** file that has not yet been checkpointed. This client will not need - ** to read any frames earlier than minFrame from the wal file - they - ** can be safely read directly from the database file. - ** - ** Because a ShmBarrier() call is made between taking the copy of - ** nBackfill and checking that the wal-header in shared-memory still - ** matches the one cached in pWal->hdr, it is guaranteed that the - ** checkpointer that set nBackfill was not working with a wal-index - ** header newer than that cached in pWal->hdr. If it were, that could - ** cause a problem. The checkpointer could omit to checkpoint - ** a version of page X that lies before pWal->minFrame (call that version - ** A) on the basis that there is a newer version (version B) of the same - ** page later in the wal file. But if version B happens to like past - ** frame pWal->hdr.mxFrame - then the client would incorrectly assume - ** that it can read version A from the database file. However, since - ** we can guarantee that the checkpointer that set nBackfill could not - ** see any pages past pWal->hdr.mxFrame, this problem does not come up. - */ - pWal->minFrame = AtomicLoad(&pInfo->nBackfill)+1; - walShmBarrier(pWal); - if( AtomicLoad(pInfo->aReadMark+mxI)!=mxReadMark - || memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) - ){ - walUnlockShared(pWal, WAL_READ_LOCK(mxI)); - return WAL_RETRY; - }else{ - assert( mxReadMark<=pWal->hdr.mxFrame ); - pWal->readLock = (i16)mxI; + if( isWalMode2(pWal) ){ + /* This connection needs a "part" lock on the current wal file and, + ** unless pInfo->nBackfill is set to indicate that it has already been + ** checkpointed, a "full" lock on the other wal file. 
*/ + int iWal = walidxGetFile(&pWal->hdr); + int nBackfill = pInfo->nBackfill || walidxGetMxFrame(&pWal->hdr, !iWal)==0; + int eLock = 1 + (iWal*2) + (nBackfill==iWal); + + assert( nBackfill==0 || nBackfill==1 ); + assert( iWal==0 || iWal==1 ); + assert( iWal!=0 || nBackfill!=1 || eLock==WAL_LOCK_PART1 ); + assert( iWal!=0 || nBackfill!=0 || eLock==WAL_LOCK_PART1_FULL2 ); + assert( iWal!=1 || nBackfill!=1 || eLock==WAL_LOCK_PART2 ); + assert( iWal!=1 || nBackfill!=0 || eLock==WAL_LOCK_PART2_FULL1 ); + + rc = walLockShared(pWal, WAL_READ_LOCK(eLock)); + if( rc!=SQLITE_OK ){ + return (rc==SQLITE_BUSY ? WAL_RETRY : rc); + } + walShmBarrier(pWal); + if( memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) ){ + walUnlockShared(pWal, WAL_READ_LOCK(eLock)); + return WAL_RETRY; + }else{ + pWal->readLock = eLock; + } + assert( pWal->minFrame==0 && walFramePage(pWal->minFrame)==0 ); + }else{ + u32 mxReadMark; /* Largest aReadMark[] value */ + int mxI; /* Index of largest aReadMark[] value */ + int i; /* Loop counter */ + u32 mxFrame; /* Wal frame to lock to */ + if( !useWal && pInfo->nBackfill==pWal->hdr.mxFrame + #ifdef SQLITE_ENABLE_SNAPSHOT + && (pWal->pSnapshot==0 || pWal->hdr.mxFrame==0) + #endif + ){ + /* The WAL has been completely backfilled (or it is empty). + ** and can be safely ignored. + */ + rc = walLockShared(pWal, WAL_READ_LOCK(0)); + walShmBarrier(pWal); + if( rc==SQLITE_OK ){ + if( memcmp((void *)walIndexHdr(pWal), &pWal->hdr,sizeof(WalIndexHdr)) ){ + /* It is not safe to allow the reader to continue here if frames + ** may have been appended to the log before READ_LOCK(0) was obtained. + ** When holding READ_LOCK(0), the reader ignores the entire log file, + ** which implies that the database file contains a trustworthy + ** snapshot. Since holding READ_LOCK(0) prevents a checkpoint from + ** happening, this is usually correct. 
+ ** + ** However, if frames have been appended to the log (or if the log + ** is wrapped and written for that matter) before the READ_LOCK(0) + ** is obtained, that is not necessarily true. A checkpointer may + ** have started to backfill the appended frames but crashed before + ** it finished. Leaving a corrupt image in the database file. + */ + walUnlockShared(pWal, WAL_READ_LOCK(0)); + return WAL_RETRY; + } + pWal->readLock = 0; + return SQLITE_OK; + }else if( rc!=SQLITE_BUSY ){ + return rc; + } + } + + /* If we get this far, it means that the reader will want to use + ** the WAL to get at content from recent commits. The job now is + ** to select one of the aReadMark[] entries that is closest to + ** but not exceeding pWal->hdr.mxFrame and lock that entry. + */ + mxReadMark = 0; + mxI = 0; + mxFrame = pWal->hdr.mxFrame; + #ifdef SQLITE_ENABLE_SNAPSHOT + if( pWal->pSnapshot && pWal->pSnapshot->mxFrame<mxFrame ){ + mxFrame = pWal->pSnapshot->mxFrame; + } + #endif + for(i=1; i<WAL_NREADER; i++){ + u32 thisMark = AtomicLoad(pInfo->aReadMark+i); + if( mxReadMark<=thisMark && thisMark<=mxFrame ){ + assert( thisMark!=READMARK_NOT_USED ); + mxReadMark = thisMark; + mxI = i; + } + } + if( (pWal->readOnly & WAL_SHM_RDONLY)==0 + && (mxReadMark<mxFrame || mxI==0) + ){ + for(i=1; i<WAL_NREADER; i++){ + rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1); + if( rc==SQLITE_OK ){ + mxReadMark = AtomicStore(pInfo->aReadMark+i,mxFrame); + mxI = i; + walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1); + break; + }else if( rc!=SQLITE_BUSY ){ + return rc; + } + } + } + if( mxI==0 ){ + assert( rc==SQLITE_BUSY || (pWal->readOnly & WAL_SHM_RDONLY)!=0 ); + return rc==SQLITE_BUSY ? WAL_RETRY : SQLITE_READONLY_CANTINIT; + } + + rc = walLockShared(pWal, WAL_READ_LOCK(mxI)); + if( rc ){ + return rc==SQLITE_BUSY ? WAL_RETRY : rc; + } + /* Now that the read-lock has been obtained, check that neither the + ** value in the aReadMark[] array or the contents of the wal-index + ** header have changed.
+ ** + ** It is necessary to check that the wal-index header did not change + ** between the time it was read and when the shared-lock + ** on WAL_READ_LOCK(mxI) was obtained, to account for the possibility + ** that the log file may have been wrapped by a writer, or that frames + ** that occur later in the log than pWal->hdr.mxFrame may have been + ** copied into the database by a checkpointer. If either of these things + ** happened, then reading the database with the current value of + ** pWal->hdr.mxFrame risks reading a corrupted snapshot. So, retry + ** instead. + ** + ** Before checking that the live wal-index header has not changed + ** since it was read, set Wal.minFrame to the first frame in the wal + ** file that has not yet been checkpointed. This client will not need + ** to read any frames earlier than minFrame from the wal file - they + ** can be safely read directly from the database file. + ** + ** Because a ShmBarrier() call is made between taking the copy of + ** nBackfill and checking that the wal-header in shared-memory still + ** matches the one cached in pWal->hdr, it is guaranteed that the + ** checkpointer that set nBackfill was not working with a wal-index + ** header newer than that cached in pWal->hdr. If it were, that could + ** cause a problem. The checkpointer could omit to checkpoint + ** a version of page X that lies before pWal->minFrame (call that version + ** A) on the basis that there is a newer version (version B) of the same + ** page later in the wal file. But if version B happens to lie past + ** frame pWal->hdr.mxFrame - then the client would incorrectly assume + ** that it can read version A from the database file. However, since + ** we can guarantee that the checkpointer that set nBackfill could not + ** see any pages past pWal->hdr.mxFrame, this problem does not come up.
+ */ + pWal->minFrame = AtomicLoad(&pInfo->nBackfill)+1; + walShmBarrier(pWal); + if( AtomicLoad(pInfo->aReadMark+mxI)!=mxReadMark + || memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) + ){ + walUnlockShared(pWal, WAL_READ_LOCK(mxI)); + return WAL_RETRY; + }else{ + assert( mxReadMark<=pWal->hdr.mxFrame ); + pWal->readLock = (i16)mxI; + } } return rc; } #ifdef SQLITE_ENABLE_SNAPSHOT @@ -2683,10 +3314,13 @@ ** error occurs. It is not an error if nBackfillAttempted cannot be ** decreased at all. */ int sqlite3WalSnapshotRecover(Wal *pWal){ int rc; + + /* Snapshots may not be used with wal2 mode databases. */ + if( isWalMode2(pWal) ) return SQLITE_ERROR; assert( pWal->readLock>=0 ); rc = walLockExclusive(pWal, WAL_CKPT_LOCK, 1); if( rc==SQLITE_OK ){ volatile WalCkptInfo *pInfo = walCkptInfo(pWal); @@ -2712,11 +3346,11 @@ pgno = sLoc.aPgno[i-sLoc.iZero]; iDbOff = (i64)(pgno-1) * szPage; if( iDbOff+szPage<=szDb ){ iWalOff = walFrameOffset(i, szPage) + WAL_FRAME_HDRSIZE; - rc = sqlite3OsRead(pWal->pWalFd, pBuf1, szPage, iWalOff); + rc = sqlite3OsRead(pWal->apWalFd[0], pBuf1, szPage, iWalOff); if( rc==SQLITE_OK ){ rc = sqlite3OsRead(pWal->pDbFd, pBuf2, szPage, iDbOff); } @@ -2758,10 +3392,11 @@ int cnt = 0; /* Number of TryBeginRead attempts */ #ifdef SQLITE_ENABLE_SNAPSHOT int bChanged = 0; WalIndexHdr *pSnapshot = pWal->pSnapshot; + if( pSnapshot && isWalMode2(pWal) ) return SQLITE_ERROR; if( pSnapshot && memcmp(pSnapshot, &pWal->hdr, sizeof(WalIndexHdr))!=0 ){ bChanged = 1; } #endif @@ -2770,10 +3405,14 @@ }while( rc==WAL_RETRY ); testcase( (rc&0xff)==SQLITE_BUSY ); testcase( (rc&0xff)==SQLITE_IOERR ); testcase( rc==SQLITE_PROTOCOL ); testcase( rc==SQLITE_OK ); + + if( rc==SQLITE_OK && pWal->hdr.iVersion==WAL_VERSION2 ){ + rc = walOpenWal2(pWal); + } pWal->nPriorFrame = pWal->hdr.mxFrame; #ifdef SQLITE_ENABLE_SNAPSHOT if( rc==SQLITE_OK ){ if( pSnapshot && memcmp(pSnapshot, &pWal->hdr, sizeof(WalIndexHdr))!=0 ){ @@ -2845,14 +3484,98 @@ ** Finish with a 
read transaction. All this does is release the ** read-lock. */ void sqlite3WalEndReadTransaction(Wal *pWal){ sqlite3WalEndWriteTransaction(pWal); - if( pWal->readLock>=0 ){ + if( pWal->readLock!=WAL_LOCK_NONE ){ walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock)); - pWal->readLock = -1; + pWal->readLock = WAL_LOCK_NONE; + } +} + +/* Search hash table iHash for an entry matching page number +** pgno. Each call to this function searches a single hash table +** (each hash table indexes up to HASHTABLE_NPAGE frames). +** +** This code might run concurrently with the code in walIndexAppend() +** that adds entries to the wal-index (and possibly to this hash +** table). This means the value just read from the hash +** slot (aHash[iKey]) may have been added before or after the +** current read transaction was opened. Values added after the +** read transaction was opened may have been written incorrectly - +** i.e. these slots may contain garbage data. However, we assume +** that any slots written before the current read transaction was +** opened remain unmodified. +** +** For the reasons above, the if(...) condition featured in the inner +** loop of the following block is more stringent than would be required +** if we had exclusive access to the hash-table: +** +** (aPgno[iFrame]==pgno): +** This condition filters out normal hash-table collisions. +** +** (iFrame<=iLast): +** This condition filters out entries that were added to the hash +** table after the current read-transaction had started. 
+*/ +static int walSearchHash( + Wal *pWal, + u32 iLast, + int iHash, + Pgno pgno, + u32 *piRead +){ + WalHashLoc sLoc; /* Hash table location */ + int iKey; /* Hash slot index */ + int nCollide; /* Number of hash collisions remaining */ + int rc; /* Error code */ + + rc = walHashGet(pWal, iHash, &sLoc); + if( rc!=SQLITE_OK ){ + return rc; + } + nCollide = HASHTABLE_NSLOT; + for(iKey=walHash(pgno); sLoc.aHash[iKey]; iKey=walNextHash(iKey)){ + u32 iFrame = sLoc.aHash[iKey] + sLoc.iZero; + if( iFrame<=iLast + && iFrame>=pWal->minFrame + && sLoc.aPgno[sLoc.aHash[iKey]]==pgno + ){ + assert( iFrame>*piRead || CORRUPT_DB ); + *piRead = iFrame; + } + if( (nCollide--)==0 ){ + return SQLITE_CORRUPT_BKPT; + } + } + + return SQLITE_OK; +} + +static int walSearchWal( + Wal *pWal, + int iWal, + Pgno pgno, + u32 *piRead +){ + int rc = SQLITE_OK; + int bWal2 = isWalMode2(pWal); + u32 iLast = walidxGetMxFrame(&pWal->hdr, iWal); + if( iLast ){ + int iHash; + int iMinHash = walFramePage(pWal->minFrame); + u32 iExternal = bWal2 ? walExternalEncode(iWal, iLast) : iLast; + assert( bWal2==0 || pWal->minFrame==0 ); + for(iHash=walFramePage(iExternal); + iHash>=iMinHash && *piRead==0; + iHash-=(1+bWal2) + ){ + rc = walSearchHash(pWal, iExternal, iHash, pgno, piRead); + if( rc!=SQLITE_OK ) break; + } } + return rc; } /* ** Search the wal file for page pgno. If found, set *piRead to the frame that ** contains the page. 
Otherwise, if pgno is not in the wal file, set *piRead @@ -2864,84 +3587,80 @@ int sqlite3WalFindFrame( Wal *pWal, /* WAL handle */ Pgno pgno, /* Database page number to read data for */ u32 *piRead /* OUT: Frame number (or zero) */ ){ + int bWal2 = isWalMode2(pWal); + int iApp = walidxGetFile(&pWal->hdr); + int rc = SQLITE_OK; u32 iRead = 0; /* If !=0, WAL frame to return data from */ - u32 iLast = pWal->hdr.mxFrame; /* Last page in WAL for this reader */ - int iHash; /* Used to loop through N hash tables */ - int iMinHash; - - /* This routine is only be called from within a read transaction. */ - assert( pWal->readLock>=0 || pWal->lockError ); - - /* If the "last page" field of the wal-index header snapshot is 0, then - ** no data will be read from the wal under any circumstances. Return early - ** in this case as an optimization. Likewise, if pWal->readLock==0, - ** then the WAL is ignored by the reader so return early, as if the - ** WAL were empty. - */ - if( iLast==0 || (pWal->readLock==0 && pWal->bShmUnreliable==0) ){ + + /* This routine is only called from within a read transaction. Or, + ** sometimes, as part of a rollback that occurs after an error reacquiring + ** a read-lock in walRestartLog(). */ + assert( pWal->readLock!=WAL_LOCK_NONE || pWal->writeLock ); + + /* If this is a regular wal system, then iApp must be set to 0 (there is + ** only one wal file, after all). Or, if this is a wal2 system and the + ** write-lock is not held, the client must have a partial-wal lock on wal + ** file iApp. This is not always true if the write-lock is held and this + ** function is being called after WalLockForCommit() as part of committing + ** a CONCURRENT transaction. 
*/ +#ifdef SQLITE_DEBUG + if( bWal2 ){ + if( pWal->writeLock==0 ){ + int l = pWal->readLock; + assert( iApp==1 || l==WAL_LOCK_PART1 || l==WAL_LOCK_PART1_FULL2 ); + assert( iApp==0 || l==WAL_LOCK_PART2 || l==WAL_LOCK_PART2_FULL1 ); + } + }else{ + assert( iApp==0 ); + } +#endif + + /* Return early if read-lock 0 is held. */ + if( (pWal->readLock==0 && pWal->bShmUnreliable==0) ){ + assert( !bWal2 ); *piRead = 0; return SQLITE_OK; } - /* Each iteration of the following for() loop searches one - ** hash table (each hash table indexes up to HASHTABLE_NPAGE frames). - ** - ** This code might run concurrently to the code in walIndexAppend() - ** that adds entries to the wal-index (and possibly to this hash - ** table). This means the value just read from the hash - ** slot (aHash[iKey]) may have been added before or after the - ** current read transaction was opened. Values added after the - ** read transaction was opened may have been written incorrectly - - ** i.e. these slots may contain garbage data. However, we assume - ** that any slots written before the current read transaction was - ** opened remain unmodified. - ** - ** For the reasons above, the if(...) condition featured in the inner - ** loop of the following block is more stringent that would be required - ** if we had exclusive access to the hash-table: - ** - ** (aPgno[iFrame]==pgno): - ** This condition filters out normal hash-table collisions. - ** - ** (iFrame<=iLast): - ** This condition filters out entries that were added to the hash - ** table after the current read-transaction had started. 
- */ - iMinHash = walFramePage(pWal->minFrame); - for(iHash=walFramePage(iLast); iHash>=iMinHash; iHash--){ - WalHashLoc sLoc; /* Hash table location */ - int iKey; /* Hash slot index */ - int nCollide; /* Number of hash collisions remaining */ - int rc; /* Error code */ - - rc = walHashGet(pWal, iHash, &sLoc); - if( rc!=SQLITE_OK ){ - return rc; - } - nCollide = HASHTABLE_NSLOT; - for(iKey=walHash(pgno); sLoc.aHash[iKey]; iKey=walNextHash(iKey)){ - u32 iFrame = sLoc.aHash[iKey] + sLoc.iZero; - if( iFrame<=iLast && iFrame>=pWal->minFrame - && sLoc.aPgno[sLoc.aHash[iKey]]==pgno ){ - assert( iFrame>iRead || CORRUPT_DB ); - iRead = iFrame; - } - if( (nCollide--)==0 ){ - return SQLITE_CORRUPT_BKPT; - } - } - if( iRead ) break; - } + /* Search the wal file that the client holds a partial lock on first. */ + rc = walSearchWal(pWal, iApp, pgno, &iRead); + + /* If the requested page was not found, no error has occurred, and + ** the client holds a full-wal lock on the other wal file, search it + ** too. */ + if( rc==SQLITE_OK && bWal2 && iRead==0 && ( + pWal->readLock==WAL_LOCK_PART1_FULL2 + || pWal->readLock==WAL_LOCK_PART2_FULL1 +#ifndef SQLITE_OMIT_CONCURRENT + || (pWal->readLock==WAL_LOCK_PART1 && iApp==1) + || (pWal->readLock==WAL_LOCK_PART2 && iApp==0) +#endif + )){ + rc = walSearchWal(pWal, !iApp, pgno, &iRead); + } + +#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) + if( iRead ){ + u32 iFrame; + int iWal = walExternalDecode(iRead, &iFrame); + WALTRACE(("WAL%p: page %d @ frame %d wal %d\n",pWal,(int)pgno,iFrame,iWal)); + }else{ + WALTRACE(("WAL%p: page %d not found\n", pWal, (int)pgno)); + } +#endif #ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT /* If expensive assert() statements are available, do a linear search ** of the wal-index file content. Make sure the results agree with the - ** result obtained using the hash indexes above. */ - { + ** result obtained using the hash indexes above. + ** + ** TODO: This is broken for wal2. 
+ */ + if( rc==SQLITE_OK ){ u32 iRead2 = 0; u32 iTest; assert( pWal->bShmUnreliable || pWal->minFrame>0 ); for(iTest=iLast; iTest>=pWal->minFrame && iTest>0; iTest--){ if( walFramePgno(pWal, iTest)==pgno ){ @@ -2962,30 +3681,44 @@ ** (which is nOut bytes in size). Return SQLITE_OK if successful, or an ** error code otherwise. */ int sqlite3WalReadFrame( Wal *pWal, /* WAL handle */ - u32 iRead, /* Frame to read */ + u32 iExternal, /* Frame to read */ int nOut, /* Size of buffer pOut in bytes */ u8 *pOut /* Buffer to write page data to */ ){ int sz; + int iWal = 0; + u32 iRead; i64 iOffset; + + /* Figure out the page size */ sz = pWal->hdr.szPage; sz = (sz&0xfe00) + ((sz&0x0001)<<16); testcase( sz<=32768 ); testcase( sz>=65536 ); + + if( isWalMode2(pWal) ){ + /* Figure out which of the two wal files, and the frame within, that + ** iExternal refers to. */ + iWal = walExternalDecode(iExternal, &iRead); + }else{ + iRead = iExternal; + } + + WALTRACE(("WAL%p: reading frame %d wal %d\n", pWal, iRead, iWal)); iOffset = walFrameOffset(iRead, sz) + WAL_FRAME_HDRSIZE; /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */ - return sqlite3OsRead(pWal->pWalFd, pOut, (nOut>sz ? sz : nOut), iOffset); + return sqlite3OsRead(pWal->apWalFd[iWal], pOut, (nOut>sz?sz:nOut), iOffset); } /* ** Return the size of the database in pages (or zero, if unknown). */ Pgno sqlite3WalDbsize(Wal *pWal){ - if( pWal && ALWAYS(pWal->readLock>=0) ){ + if( pWal && ALWAYS(pWal->readLock!=WAL_LOCK_NONE) ){ return pWal->hdr.nPage; } return 0; } @@ -3057,10 +3790,11 @@ */ static int walUpgradeReadlock(Wal *pWal){ int cnt; int rc; assert( pWal->writeLock && pWal->readLock==0 ); + assert( isWalMode2(pWal)==0 ); walUnlockShared(pWal, WAL_READ_LOCK(0)); pWal->readLock = -1; cnt = 0; do{ int notUsed; @@ -3130,71 +3864,112 @@ ** occurs if some other writer has crashed while committing a ** transaction to this database since the current concurrent transaction ** was opened. 
*/ rc = SQLITE_BUSY_SNAPSHOT; }else if( memcmp(&pWal->hdr, (void*)&head, sizeof(WalIndexHdr))!=0 ){ - int iHash; - int iLastHash = walFramePage(head.mxFrame); - u32 iFirst = pWal->hdr.mxFrame+1; /* First wal frame to check */ - if( memcmp(pWal->hdr.aSalt, (u32*)head.aSalt, sizeof(u32)*2) ){ - assert( pWal->readLock==0 ); - iFirst = 1; - } - for(iHash=walFramePage(iFirst); iHash<=iLastHash; iHash++){ - WalHashLoc sLoc; - - rc = walHashGet(pWal, iHash, &sLoc); - if( rc==SQLITE_OK ){ - u32 i, iMin, iMax; - assert( head.mxFrame>=sLoc.iZero ); - iMin = (sLoc.iZero >= iFirst) ? 1 : (iFirst - sLoc.iZero); - iMax = (iHash==0) ? HASHTABLE_NPAGE_ONE : HASHTABLE_NPAGE; - if( iMax>(head.mxFrame-sLoc.iZero) ) iMax = (head.mxFrame-sLoc.iZero); - for(i=iMin; rc==SQLITE_OK && i<=iMax; i++){ - PgHdr *pPg; - if( sLoc.aPgno[i]==1 ){ - /* Check that the schema cookie has not been modified. If - ** it has not, the commit can proceed. */ - u8 aNew[4]; - u8 *aOld = &((u8*)pPage1->pData)[40]; - int sz; - i64 iOffset; - sz = pWal->hdr.szPage; - sz = (sz&0xfe00) + ((sz&0x0001)<<16); - iOffset = walFrameOffset(i+sLoc.iZero, sz) + WAL_FRAME_HDRSIZE+40; - rc = sqlite3OsRead(pWal->pWalFd, aNew, sizeof(aNew), iOffset); - if( rc==SQLITE_OK && memcmp(aOld, aNew, sizeof(aNew)) ){ - rc = SQLITE_BUSY_SNAPSHOT; - } - }else if( sqlite3BitvecTestNotNull(pAllRead, sLoc.aPgno[i]) ){ - *piConflict = sLoc.aPgno[i]; - rc = SQLITE_BUSY_SNAPSHOT; - }else if( (pPg = sqlite3PagerLookup(pPager, sLoc.aPgno[i])) ){ - /* Page aPgno[i], which is present in the pager cache, has been - ** modified since the current CONCURRENT transaction was started. - ** However it was not read by the current transaction, so is not - ** a conflict. There are two possibilities: (a) the page was - ** allocated at the of the file by the current transaction or - ** (b) was present in the cache at the start of the transaction. - ** - ** For case (a), do nothing. 
This page will be moved within the - ** database file by the commit code to avoid the conflict. The - ** call to PagerUnref() is to release the reference grabbed by - ** the sqlite3PagerLookup() above. - ** - ** In case (b), drop the page from the cache - otherwise - ** following the snapshot upgrade the cache would be inconsistent - ** with the database as stored on disk. */ - if( sqlite3PagerIswriteable(pPg) ){ - sqlite3PagerUnref(pPg); - }else{ - sqlite3PcacheDrop(pPg); - } - } - } - } - if( rc!=SQLITE_OK ) break; + int bWal2 = isWalMode2(pWal); + int iHash; + int nLoop = 1+(bWal2 && walidxGetFile(&head)!=walidxGetFile(&pWal->hdr)); + int iLoop; + + + assert( nLoop==1 || nLoop==2 ); + for(iLoop=0; rc==SQLITE_OK && iLoophdr.mxFrame (which will be + ** set to the size of the old, now overwritten, wal file). This + ** doesn't come up in wal2 mode, as in wal2 mode the client always + ** has a PART lock on one of the wal files, preventing it from being + ** checkpointed or overwritten. */ + iFirst = pWal->hdr.mxFrame+1; + if( memcmp(pWal->hdr.aSalt, (u32*)head.aSalt, sizeof(u32)*2) ){ + assert( pWal->readLock==0 ); + iFirst = 1; + } + mxFrame = head.mxFrame; + }else{ + int iA = walidxGetFile(&pWal->hdr); + if( iLoop==0 ){ + iFirst = walExternalEncode(iA, 1+walidxGetMxFrame(&pWal->hdr, iA)); + mxFrame = walExternalEncode(iA, walidxGetMxFrame(&head, iA)); + }else{ + iFirst = walExternalEncode(!iA, 1); + mxFrame = walExternalEncode(!iA, walidxGetMxFrame(&head, !iA)); + } + } + iLastHash = walFramePage(mxFrame); + + for(iHash=walFramePage(iFirst); iHash<=iLastHash; iHash += (1+bWal2)){ + WalHashLoc sLoc; + + rc = walHashGet(pWal, iHash, &sLoc); + if( rc==SQLITE_OK ){ + u32 i, iMin, iMax; + assert( mxFrame>=sLoc.iZero ); + iMin = (sLoc.iZero >= iFirst) ? 1 : (iFirst - sLoc.iZero); + iMax = (iHash==0) ? 
HASHTABLE_NPAGE_ONE : HASHTABLE_NPAGE; + if( iMax>(mxFrame-sLoc.iZero) ) iMax = (mxFrame-sLoc.iZero); + for(i=iMin; rc==SQLITE_OK && i<=iMax; i++){ + PgHdr *pPg; + if( sLoc.aPgno[i]==1 ){ + /* Check that the schema cookie has not been modified. If + ** it has not, the commit can proceed. */ + u8 aNew[4]; + u8 *aOld = &((u8*)pPage1->pData)[40]; + int sz; + i64 iOff; + u32 iFrame = sLoc.iZero + i; + int iWal = 0; + if( bWal2 ){ + iWal = walExternalDecode(iFrame, &iFrame); + } + sz = pWal->hdr.szPage; + sz = (sz&0xfe00) + ((sz&0x0001)<<16); + iOff = walFrameOffset(iFrame, sz) + WAL_FRAME_HDRSIZE + 40; + rc = sqlite3OsRead(pWal->apWalFd[iWal],aNew,sizeof(aNew),iOff); + if( rc==SQLITE_OK && memcmp(aOld, aNew, sizeof(aNew)) ){ + rc = SQLITE_BUSY_SNAPSHOT; + } + }else if( sqlite3BitvecTestNotNull(pAllRead, sLoc.aPgno[i]) ){ + *piConflict = sLoc.aPgno[i]; + rc = SQLITE_BUSY_SNAPSHOT; + }else if( (pPg = sqlite3PagerLookup(pPager, sLoc.aPgno[i])) ){ + /* Page aPgno[i], which is present in the pager cache, has been + ** modified since the current CONCURRENT transaction was + ** started. However it was not read by the current + ** transaction, so is not a conflict. There are two + ** possibilities: (a) the page was allocated at the end of the file + ** by the current transaction or (b) was present in the cache + ** at the start of the transaction. + ** + ** For case (a), do nothing. This page will be moved within the + ** database file by the commit code to avoid the conflict. The + ** call to PagerUnref() is to release the reference grabbed by + ** the sqlite3PagerLookup() above. + ** + ** In case (b), drop the page from the cache - otherwise + ** following the snapshot upgrade the cache would be + ** inconsistent with the database as stored on disk. 
*/ + if( sqlite3PagerIswriteable(pPg) ){ + sqlite3PagerUnref(pPg); + }else{ + sqlite3PcacheDrop(pPg); + } + } + } + } + if( rc!=SQLITE_OK ) break; + } } } } pWal->nPriorFrame = pWal->hdr.mxFrame; @@ -3222,10 +3997,11 @@ /* If this client has its read-lock on slot aReadmark[0] and the entire ** wal has not been checkpointed, switch it to a different slot. Otherwise ** any reads performed between now and committing the transaction will ** read from the old snapshot - not the one just upgraded to. */ if( pWal->readLock==0 && pWal->hdr.mxFrame!=walCkptInfo(pWal)->nBackfill ){ + assert( isWalMode2(pWal)==0 ); rc = walUpgradeReadlock(pWal); } return rc; } #endif /* SQLITE_OMIT_CONCURRENT */ @@ -3262,29 +4038,49 @@ void *pUndoCtx, int bConcurrent /* True if this is a CONCURRENT transaction */ ){ int rc = SQLITE_OK; if( pWal->writeLock ){ - Pgno iMax = pWal->hdr.mxFrame; + int iWal = walidxGetFile(&pWal->hdr); + Pgno iMax = walidxGetMxFrame(&pWal->hdr, iWal); + Pgno iNew; Pgno iFrame; - + + assert( isWalMode2(pWal) || iWal==0 ); + /* Restore the clients cache of the wal-index header to the state it ** was in before the client began writing to the database. */ memcpy(&pWal->hdr, (void *)walIndexHdr(pWal), sizeof(WalIndexHdr)); + iNew = walidxGetMxFrame(&pWal->hdr, walidxGetFile(&pWal->hdr)); + + /* BEGIN CONCURRENT transactions are different, as the header just + ** memcpy()d into pWal->hdr may not be the same as the current header + ** when the transaction was started. Instead, pWal->hdr now contains + ** the header written by the most recent successful COMMIT. Because + ** Wal.writeLock is set, if this is a BEGIN CONCURRENT transaction, + ** the rollback must be taking place because an error occurred during + ** a COMMIT. + ** + ** The code below is still valid. All frames between (iNew+1) and iMax + ** must have been written by this transaction before the error occurred. 
+ ** The exception is in wal2 mode - if the current wal file at the time + ** of the last COMMIT is not wal file iWal, then the error must have + ** occurred in WalLockForCommit(), before any pages were written + ** to the database file. In this case return early. */ #ifndef SQLITE_OMIT_CONCURRENT if( bConcurrent ){ pWal->hdr.aCksum[0]++; } -#else - UNUSED_PARAMETER(bConcurrent); + if( walidxGetFile(&pWal->hdr)!=iWal ){ + assert( bConcurrent && isWalMode2(pWal) ); + return SQLITE_OK; + } #endif + assert( walidxGetFile(&pWal->hdr)==iWal ); - for(iFrame=pWal->hdr.mxFrame+1; - ALWAYS(rc==SQLITE_OK) && iFrame<=iMax; - iFrame++ - ){ + for(iFrame=iNew+1; ALWAYS(rc==SQLITE_OK) && iFrame<=iMax; iFrame++){ /* This call cannot fail. Unless the page for which the page number ** is passed as the second argument is (a) in the cache and ** (b) has an outstanding reference, then xUndo is either a no-op ** (if (a) is false) or simply expels the page from the cache (if (b) ** is false). @@ -3292,14 +4088,20 @@ ** If the upper layer is doing a rollback, it is guaranteed that there ** are no outstanding references to any page other than page 1. And ** page 1 is never written to the log until the transaction is ** committed. As a result, the call to xUndo may not fail. */ - assert( walFramePgno(pWal, iFrame)!=1 ); - rc = xUndo(pUndoCtx, walFramePgno(pWal, iFrame)); + Pgno pgno; + if( isWalMode2(pWal) ){ + pgno = walFramePgno2(pWal, iWal, iFrame); + }else{ + pgno = walFramePgno(pWal, iFrame); + } + assert( pgno!=1 ); + rc = xUndo(pUndoCtx, pgno); } - if( iMax!=pWal->hdr.mxFrame ) walCleanupHash(pWal); + if( iMax!=iNew ) walCleanupHash(pWal); } return rc; } /* @@ -3307,14 +4109,16 @@ ** values. This function populates the array with values required to ** "rollback" the write position of the WAL handle back to the current ** point in the event of a savepoint rollback (via WalSavepointUndo()). 
*/ void sqlite3WalSavepoint(Wal *pWal, u32 *aWalData){ - aWalData[0] = pWal->hdr.mxFrame; + int iWal = walidxGetFile(&pWal->hdr); + assert( isWalMode2(pWal) || iWal==0 ); + aWalData[0] = walidxGetMxFrame(&pWal->hdr, iWal); aWalData[1] = pWal->hdr.aFrameCksum[0]; aWalData[2] = pWal->hdr.aFrameCksum[1]; - aWalData[3] = pWal->nCkpt; + aWalData[3] = isWalMode2(pWal) ? iWal : pWal->nCkpt; } /* ** Move the write position of the WAL back to the point identified by ** the values in the aWalData[] array. aWalData must point to an array @@ -3321,25 +4125,28 @@ ** of WAL_SAVEPOINT_NDATA u32 values that has been previously populated ** by a call to WalSavepoint(). */ int sqlite3WalSavepointUndo(Wal *pWal, u32 *aWalData){ int rc = SQLITE_OK; + int iWal = walidxGetFile(&pWal->hdr); + int iCmp = isWalMode2(pWal) ? iWal : pWal->nCkpt; assert( pWal->writeLock || aWalData[0]==pWal->hdr.mxFrame ); - assert( aWalData[3]!=pWal->nCkpt || aWalData[0]<=pWal->hdr.mxFrame ); + assert( isWalMode2(pWal) || iWal==0 ); + assert( aWalData[3]!=iCmp || aWalData[0]<=walidxGetMxFrame(&pWal->hdr,iWal) ); - if( aWalData[3]!=pWal->nCkpt ){ + if( aWalData[3]!=iCmp ){ /* This savepoint was opened immediately after the write-transaction ** was started. Right after that, the writer decided to wrap around ** to the start of the log. Update the savepoint values to match. */ aWalData[0] = 0; - aWalData[3] = pWal->nCkpt; + aWalData[3] = iCmp; } - if( aWalData[0]hdr.mxFrame ){ - pWal->hdr.mxFrame = aWalData[0]; + if( aWalData[0]hdr, iWal) ){ + walidxSetMxFrame(&pWal->hdr, iWal, aWalData[0]); pWal->hdr.aFrameCksum[0] = aWalData[1]; pWal->hdr.aFrameCksum[1] = aWalData[2]; walCleanupHash(pWal); } @@ -3347,23 +4154,55 @@ } /* ** This function is called just before writing a set of frames to the log ** file (see sqlite3WalFrames()). It checks to see if, instead of appending -** to the current log file, it is possible to overwrite the start of the -** existing log file with the new frames (i.e. "reset" the log). 
If so, -** it sets pWal->hdr.mxFrame to 0. Otherwise, pWal->hdr.mxFrame is left -** unchanged. +** to the current log file, it is possible and desirable to switch to the +** other log file and write the new transaction to the start of it. +** If so, the wal-index header is updated accordingly - both in heap memory +** and in the *-shm file. ** ** SQLITE_OK is returned if no error is encountered (regardless of whether -** or not pWal->hdr.mxFrame is modified). An SQLite error code is returned +** or not the wal-index header is modified). An SQLite error code is returned ** if an error occurs. */ static int walRestartLog(Wal *pWal){ int rc = SQLITE_OK; - if( pWal->readLock==0 ){ + if( isWalMode2(pWal) ){ + int iApp = walidxGetFile(&pWal->hdr); + int nWalSize = WAL_DEFAULT_WALSIZE; + if( pWal->mxWalSize>0 ){ + nWalSize = (pWal->mxWalSize-WAL_HDRSIZE+pWal->szPage+WAL_FRAME_HDRSIZE-1) + / (pWal->szPage+WAL_FRAME_HDRSIZE); + nWalSize = MAX(nWalSize, 1); + } + + if( walidxGetMxFrame(&pWal->hdr, iApp)>=nWalSize ){ + volatile WalCkptInfo *pInfo = walCkptInfo(pWal); + u32 mxFrame = walidxGetMxFrame(&pWal->hdr, !iApp); + if( mxFrame==0 || pInfo->nBackfill ){ + rc = wal2RestartOk(pWal, iApp); + if( rc==SQLITE_OK ){ + int iNew = !iApp; + pWal->nCkpt++; + walidxSetFile(&pWal->hdr, iNew); + walidxSetMxFrame(&pWal->hdr, iNew, 0); + sqlite3Put4byte((u8*)&pWal->hdr.aSalt[0], pWal->hdr.aFrameCksum[0]); + sqlite3Put4byte((u8*)&pWal->hdr.aSalt[1], pWal->hdr.aFrameCksum[1]); + walIndexWriteHdr(pWal); + pInfo->nBackfill = 0; + wal2RestartFinished(pWal, iApp); + walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock)); + pWal->readLock = iNew ? 
WAL_LOCK_PART2_FULL1 : WAL_LOCK_PART1_FULL2; + rc = walLockShared(pWal, WAL_READ_LOCK(pWal->readLock)); + }else if( rc==SQLITE_BUSY ){ + rc = SQLITE_OK; + } + } + } + }else if( pWal->readLock==0 ){ volatile WalCkptInfo *pInfo = walCkptInfo(pWal); assert( pInfo->nBackfill==pWal->hdr.mxFrame ); if( pInfo->nBackfill>0 ){ u32 salt1; sqlite3_randomness(4, &salt1); @@ -3394,10 +4233,11 @@ ** read-transaction, subsequent read operations would read directly from ** the database file - ignoring the new pages just appended ** to the wal file. */ rc = walUpgradeReadlock(pWal); } + return rc; } /* ** Information about the current state of the WAL file and where @@ -3452,10 +4292,22 @@ sqlite3_int64 iOffset /* Byte offset at which to write */ ){ int rc; /* Result code from subfunctions */ void *pData; /* Data actually written */ u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */ + +#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) + { + int iWal = walidxGetFile(&p->pWal->hdr); + int iFrame = 1 + (iOffset / (WAL_FRAME_HDRSIZE + p->pWal->szPage)); + assert( p->pWal->apWalFd[iWal]==p->pFd ); + WALTRACE(("WAL%p: page %d written to frame %d of wal %d\n", + p->pWal, (int)pPage->pgno, iFrame, iWal + )); + } +#endif + #if defined(SQLITE_HAS_CODEC) if( (pData = sqlite3PagerCodec(pPage))==0 ) return SQLITE_NOMEM_BKPT; #else pData = pPage->pData; #endif @@ -3474,16 +4326,17 @@ ** with the earliest to have been overwritten. ** ** SQLITE_OK is returned if successful, or an SQLite error code otherwise. 
*/ static int walRewriteChecksums(Wal *pWal, u32 iLast){ - const int szPage = pWal->szPage;/* Database page size */ int rc = SQLITE_OK; /* Return code */ + const int szPage = pWal->szPage;/* Database page size */ u8 *aBuf; /* Buffer to load data from wal file into */ u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-headers in */ u32 iRead; /* Next frame to read from wal file */ i64 iCksumOff; + sqlite3_file *pWalFd = pWal->apWalFd[walidxGetFile(&pWal->hdr)]; aBuf = sqlite3_malloc(szPage + WAL_FRAME_HDRSIZE); if( aBuf==0 ) return SQLITE_NOMEM_BKPT; /* Find the checksum values to use as input for the recalculating the @@ -3495,26 +4348,26 @@ if( pWal->iReCksum==1 ){ iCksumOff = 24; }else{ iCksumOff = walFrameOffset(pWal->iReCksum-1, szPage) + 16; } - rc = sqlite3OsRead(pWal->pWalFd, aBuf, sizeof(u32)*2, iCksumOff); + rc = sqlite3OsRead(pWalFd, aBuf, sizeof(u32)*2, iCksumOff); pWal->hdr.aFrameCksum[0] = sqlite3Get4byte(aBuf); pWal->hdr.aFrameCksum[1] = sqlite3Get4byte(&aBuf[sizeof(u32)]); iRead = pWal->iReCksum; pWal->iReCksum = 0; for(; rc==SQLITE_OK && iRead<=iLast; iRead++){ i64 iOff = walFrameOffset(iRead, szPage); - rc = sqlite3OsRead(pWal->pWalFd, aBuf, szPage+WAL_FRAME_HDRSIZE, iOff); + rc = sqlite3OsRead(pWalFd, aBuf, szPage+WAL_FRAME_HDRSIZE, iOff); if( rc==SQLITE_OK ){ u32 iPgno, nDbSize; iPgno = sqlite3Get4byte(aBuf); nDbSize = sqlite3Get4byte(&aBuf[4]); walEncodeFrame(pWal, iPgno, nDbSize, &aBuf[WAL_FRAME_HDRSIZE], aFrame); - rc = sqlite3OsWrite(pWal->pWalFd, aFrame, sizeof(aFrame), iOff); + rc = sqlite3OsWrite(pWalFd, aFrame, sizeof(aFrame), iOff); } } sqlite3_free(aBuf); return rc; @@ -3540,63 +4393,81 @@ int szFrame; /* The size of a single frame */ i64 iOffset; /* Next byte to write in WAL file */ WalWriter w; /* The writer */ u32 iFirst = 0; /* First frame that may be overwritten */ WalIndexHdr *pLive; /* Pointer to shared header */ + int iApp; + int bWal2 = isWalMode2(pWal); assert( pList ); assert( pWal->writeLock ); /* If this frame set 
completes a transaction, then nTruncate>0. If ** nTruncate==0 then this frame set does not complete the transaction. */ assert( (isCommit!=0)==(nTruncate!=0) ); -#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) - { int cnt; for(cnt=0, p=pList; p; p=p->pDirty, cnt++){} - WALTRACE(("WAL%p: frame write begin. %d frames. mxFrame=%d. %s\n", - pWal, cnt, pWal->hdr.mxFrame, isCommit ? "Commit" : "Spill")); - } -#endif - pLive = (WalIndexHdr*)walIndexHdr(pWal); if( memcmp(&pWal->hdr, (void *)pLive, sizeof(WalIndexHdr))!=0 ){ - iFirst = pLive->mxFrame+1; + /* if( isWalMode2(pWal)==0 ) */ + iFirst = walidxGetMxFrame(pLive, walidxGetFile(pLive))+1; } /* See if it is possible to write these frames into the start of the ** log file, instead of appending to it at pWal->hdr.mxFrame. */ - if( SQLITE_OK!=(rc = walRestartLog(pWal)) ){ + else if( SQLITE_OK!=(rc = walRestartLog(pWal)) ){ return rc; } /* If this is the first frame written into the log, write the WAL ** header to the start of the WAL file. See comments at the top of ** this source file for a description of the WAL header format. */ - iFrame = pWal->hdr.mxFrame; + iApp = walidxGetFile(&pWal->hdr); + iFrame = walidxGetMxFrame(&pWal->hdr, iApp); + assert( iApp==0 || bWal2 ); + +#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) + { int cnt; for(cnt=0, p=pList; p; p=p->pDirty, cnt++){} + WALTRACE(("WAL%p: frame write begin. %d frames. iWal=%d. mxFrame=%d. %s\n", + pWal, cnt, iApp, iFrame, isCommit ? 
"Commit" : "Spill")); + } +#endif + if( iFrame==0 ){ + u32 iCkpt = 0; u8 aWalHdr[WAL_HDRSIZE]; /* Buffer to assemble wal-header in */ u32 aCksum[2]; /* Checksum for wal-header */ sqlite3Put4byte(&aWalHdr[0], (WAL_MAGIC | SQLITE_BIGENDIAN)); - sqlite3Put4byte(&aWalHdr[4], WAL_MAX_VERSION); + sqlite3Put4byte(&aWalHdr[4], pWal->hdr.iVersion); sqlite3Put4byte(&aWalHdr[8], szPage); - sqlite3Put4byte(&aWalHdr[12], pWal->nCkpt); - if( pWal->nCkpt==0 ) sqlite3_randomness(8, pWal->hdr.aSalt); + if( bWal2 ){ + if( walidxGetMxFrame(&pWal->hdr, !iApp)>0 ){ + u8 aPrev[4]; + rc = sqlite3OsRead(pWal->apWalFd[!iApp], aPrev, 4, 12); + if( rc!=SQLITE_OK ){ + return rc; + } + iCkpt = (sqlite3Get4byte(aPrev) + 1) & 0x0F; + } + }else{ + iCkpt = pWal->nCkpt; + } + sqlite3Put4byte(&aWalHdr[12], iCkpt); memcpy(&aWalHdr[16], pWal->hdr.aSalt, 8); walChecksumBytes(1, aWalHdr, WAL_HDRSIZE-2*4, 0, aCksum); sqlite3Put4byte(&aWalHdr[24], aCksum[0]); sqlite3Put4byte(&aWalHdr[28], aCksum[1]); - + pWal->szPage = szPage; pWal->hdr.bigEndCksum = SQLITE_BIGENDIAN; pWal->hdr.aFrameCksum[0] = aCksum[0]; pWal->hdr.aFrameCksum[1] = aCksum[1]; pWal->truncateOnCommit = 1; - rc = sqlite3OsWrite(pWal->pWalFd, aWalHdr, sizeof(aWalHdr), 0); + rc = sqlite3OsWrite(pWal->apWalFd[iApp], aWalHdr, sizeof(aWalHdr), 0); WALTRACE(("WAL%p: wal-header write %s\n", pWal, rc ? "failed" : "ok")); if( rc!=SQLITE_OK ){ return rc; } @@ -3606,19 +4477,19 @@ ** database corruption. 
See the ticket: ** ** https://sqlite.org/src/info/ff5be73dee */ if( pWal->syncHeader ){ - rc = sqlite3OsSync(pWal->pWalFd, CKPT_SYNC_FLAGS(sync_flags)); + rc = sqlite3OsSync(pWal->apWalFd[iApp], CKPT_SYNC_FLAGS(sync_flags)); if( rc ) return rc; } } assert( (int)pWal->szPage==szPage ); /* Setup information needed to write frames into the WAL */ w.pWal = pWal; - w.pFd = pWal->pWalFd; + w.pFd = pWal->apWalFd[iApp]; w.iSyncPoint = 0; w.syncFlags = sync_flags; w.szPage = szPage; iOffset = walFrameOffset(iFrame+1, szPage); szFrame = szPage + WAL_FRAME_HDRSIZE; @@ -3631,12 +4502,15 @@ ** the current transaction. If so, overwrite the existing frame and ** set Wal.writeLock to WAL_WRITELOCK_RECKSUM - indicating that ** checksums must be recomputed when the transaction is committed. */ if( iFirst && (p->pDirty || isCommit==0) ){ u32 iWrite = 0; - VVA_ONLY(rc =) sqlite3WalFindFrame(pWal, p->pgno, &iWrite); + VVA_ONLY(rc =) walSearchWal(pWal, iApp, p->pgno, &iWrite); assert( rc==SQLITE_OK || iWrite==0 ); + if( iWrite && bWal2 ){ + walExternalDecode(iWrite, &iWrite); + } if( iWrite>=iFirst ){ i64 iOff = walFrameOffset(iWrite, szPage) + WAL_FRAME_HDRSIZE; void *pData; if( pWal->iReCksum==0 || iWriteiReCksum ){ pWal->iReCksum = iWrite; @@ -3644,11 +4518,11 @@ #if defined(SQLITE_HAS_CODEC) if( (pData = sqlite3PagerCodec(p))==0 ) return SQLITE_NOMEM; #else pData = p->pData; #endif - rc = sqlite3OsWrite(pWal->pWalFd, pData, szPage, iOff); + rc = sqlite3OsWrite(pWal->apWalFd[iApp], pData, szPage, iOff); if( rc ) return rc; p->flags &= ~PGHDR_WAL_APPEND; continue; } } @@ -3685,11 +4559,11 @@ ** past the sector boundary is written after the sync. 
*/ if( isCommit && WAL_SYNC_FLAGS(sync_flags)!=0 ){ int bSync = 1; if( pWal->padToSectorBoundary ){ - int sectorSize = sqlite3SectorSize(pWal->pWalFd); + int sectorSize = sqlite3SectorSize(w.pFd); w.iSyncPoint = ((iOffset+sectorSize-1)/sectorSize)*sectorSize; bSync = (w.iSyncPoint==iOffset); testcase( bSync ); while( iOffsethdr.mxFrame; + iFrame = walidxGetMxFrame(&pWal->hdr, iApp); for(p=pList; p && rc==SQLITE_OK; p=p->pDirty){ if( (p->flags & PGHDR_WAL_APPEND)==0 ) continue; iFrame++; - rc = walIndexAppend(pWal, iFrame, p->pgno); + rc = walIndexAppend(pWal, iApp, iFrame, p->pgno); } while( rc==SQLITE_OK && nExtra>0 ){ iFrame++; nExtra--; - rc = walIndexAppend(pWal, iFrame, pLast->pgno); + rc = walIndexAppend(pWal, iApp, iFrame, pLast->pgno); } if( rc==SQLITE_OK ){ /* Update the private copy of the header. */ pWal->hdr.szPage = (u16)((szPage&0xff00) | (szPage>>16)); testcase( szPage<=32768 ); testcase( szPage>=65536 ); - pWal->hdr.mxFrame = iFrame; + walidxSetMxFrame(&pWal->hdr, iApp, iFrame); if( isCommit ){ pWal->hdr.iChange++; pWal->hdr.nPage = nTruncate; } /* If this is a commit, update the wal-index header too. */ if( isCommit ){ walIndexWriteHdr(pWal); - pWal->iCallback = iFrame; + if( bWal2 ){ + int iOther = !walidxGetFile(&pWal->hdr); + if( walidxGetMxFrame(&pWal->hdr, iOther) + && !walCkptInfo(pWal)->nBackfill + ){ + pWal->iCallback = walidxGetMxFrame(&pWal->hdr, 0); + pWal->iCallback += walidxGetMxFrame(&pWal->hdr, 1); + } + }else{ + pWal->iCallback = iFrame; + } } } WALTRACE(("WAL%p: frame write %s\n", pWal, rc ? "failed" : "ok")); return rc; @@ -3836,20 +4720,34 @@ } } /* Copy data from the log to the database file. */ if( rc==SQLITE_OK ){ - if( pWal->hdr.mxFrame && walPagesize(pWal)!=nBuf ){ + if( (walPagesize(pWal)!=nBuf) + && ((pWal->hdr.mxFrame2 & 0x7FFFFFFF) || pWal->hdr.mxFrame) + ){ rc = SQLITE_CORRUPT_BKPT; }else{ rc = walCheckpoint(pWal, db, eMode2, xBusy2, pBusyArg, sync_flags, zBuf); } /* If no error occurred, set the output variables. 
*/ if( rc==SQLITE_OK || rc==SQLITE_BUSY ){ - if( pnLog ) *pnLog = (int)pWal->hdr.mxFrame; - if( pnCkpt ) *pnCkpt = (int)(walCkptInfo(pWal)->nBackfill); + if( pnLog ){ + *pnLog = walidxGetMxFrame(&pWal->hdr,0)+walidxGetMxFrame(&pWal->hdr,1); + } + if( pnCkpt ){ + if( isWalMode2(pWal) ){ + if( (int)(walCkptInfo(pWal)->nBackfill) ){ + *pnCkpt = walidxGetMxFrame(&pWal->hdr, !walidxGetFile(&pWal->hdr)); + }else{ + *pnCkpt = 0; + } + }else{ + *pnCkpt = walCkptInfo(pWal)->nBackfill; + } + } } } if( isChanged ){ /* If a new wal-index header was loaded before the checkpoint was @@ -3907,26 +4805,28 @@ ** should acquire the database exclusive lock prior to invoking ** the op==1 case. */ int sqlite3WalExclusiveMode(Wal *pWal, int op){ int rc; + assert( pWal->writeLock==0 ); assert( pWal->exclusiveMode!=WAL_HEAPMEMORY_MODE || op==-1 ); /* pWal->readLock is usually set, but might be -1 if there was a ** prior error while attempting to acquire are read-lock. This cannot ** happen if the connection is actually in exclusive mode (as no xShmLock ** locks are taken in this case). Nor should the pager attempt to ** upgrade to exclusive-mode following such an error. 
*/ - assert( pWal->readLock>=0 || pWal->lockError ); - assert( pWal->readLock>=0 || (op<=0 && pWal->exclusiveMode==0) ); + assert( pWal->readLock!=WAL_LOCK_NONE || pWal->lockError ); + assert( pWal->readLock!=WAL_LOCK_NONE || (op<=0 && pWal->exclusiveMode==0) ); if( op==0 ){ - if( pWal->exclusiveMode!=WAL_NORMAL_MODE ){ + if( pWal->exclusiveMode ){ pWal->exclusiveMode = WAL_NORMAL_MODE; - if( walLockShared(pWal, WAL_READ_LOCK(pWal->readLock))!=SQLITE_OK ){ + rc = walLockShared(pWal, WAL_READ_LOCK(pWal->readLock)); + if( rc!=SQLITE_OK ){ pWal->exclusiveMode = WAL_EXCLUSIVE_MODE; } rc = pWal->exclusiveMode==WAL_NORMAL_MODE; }else{ /* Already in locking_mode=NORMAL */ @@ -3960,14 +4860,17 @@ */ int sqlite3WalSnapshotGet(Wal *pWal, sqlite3_snapshot **ppSnapshot){ int rc = SQLITE_OK; WalIndexHdr *pRet; static const u32 aZero[4] = { 0, 0, 0, 0 }; + + /* Snapshots may not be used with wal2 mode databases. */ + if( isWalMode2(pWal) ) return SQLITE_ERROR; assert( pWal->readLock>=0 && pWal->writeLock==0 ); - if( memcmp(&pWal->hdr.aFrameCksum[0],aZero,16)==0 ){ + if( memcmp(&pWal->hdr.aFrameCksum[0],aZero,8)==0 ){ *ppSnapshot = 0; return SQLITE_ERROR; } pRet = (WalIndexHdr*)sqlite3_malloc(sizeof(WalIndexHdr)); if( pRet==0 ){ @@ -4014,10 +4917,14 @@ ** occurs (any value other than SQLITE_OK is returned), the CHECKPOINTER ** lock is released before returning. */ int sqlite3WalSnapshotCheck(Wal *pWal, sqlite3_snapshot *pSnapshot){ int rc; + + /* Snapshots may not be used with wal2 mode databases. 
*/ + if( isWalMode2(pWal) ) return SQLITE_ERROR; + rc = walLockShared(pWal, WAL_CKPT_LOCK); if( rc==SQLITE_OK ){ WalIndexHdr *pNew = (WalIndexHdr*)pSnapshot; if( memcmp(pNew->aSalt, pWal->hdr.aSalt, sizeof(pWal->hdr.aSalt)) || pNew->mxFrame<walCkptInfo(pWal)->nBackfillAttempted @@ -4054,11 +4961,11 @@ #endif /* Return the sqlite3_file object for the WAL file */ sqlite3_file *sqlite3WalFile(Wal *pWal){ - return pWal->pWalFd; + return pWal->apWalFd[0]; } /* ** Return the values required by sqlite3_wal_info(). */ @@ -4068,7 +4975,15 @@ *pnFrame = pWal->hdr.mxFrame; *pnPrior = pWal->nPriorFrame; } return rc; } + +/* +** Return the journal mode used by this Wal object. +*/ +int sqlite3WalJournalMode(Wal *pWal){ + assert( pWal ); + return (isWalMode2(pWal) ? PAGER_JOURNALMODE_WAL2 : PAGER_JOURNALMODE_WAL); +} #endif /* #ifndef SQLITE_OMIT_WAL */ Index: src/wal.h ================================================================== --- src/wal.h +++ src/wal.h @@ -24,11 +24,11 @@ */ #define WAL_SYNC_FLAGS(X) ((X)&0x03) #define CKPT_SYNC_FLAGS(X) (((X)>>2)&0x03) #ifdef SQLITE_OMIT_WAL -# define sqlite3WalOpen(x,y,z) 0 +# define sqlite3WalOpen(w,x,y,z) 0 # define sqlite3WalLimit(x,y) # define sqlite3WalClose(v,w,x,y,z) 0 # define sqlite3WalBeginReadTransaction(y,z) 0 # define sqlite3WalEndReadTransaction(z) # define sqlite3WalDbsize(y) 0 @@ -43,10 +43,11 @@ # define sqlite3WalExclusiveMode(y,z) 0 # define sqlite3WalHeapMemory(z) 0 # define sqlite3WalFramesize(z) 0 # define sqlite3WalFindFrame(x,y,z) 0 # define sqlite3WalFile(x) 0 +# define sqlite3WalJournalMode(x) 0 #else #define WAL_SAVEPOINT_NDATA 4 /* Connection to a write-ahead log (WAL) file. @@ -53,11 +54,11 @@ ** There is one object of this type for each pager. */ typedef struct Wal Wal; /* Open and close a connection to a write-ahead log.
*/ -int sqlite3WalOpen(sqlite3_vfs*, sqlite3_file*, const char *, int, i64, Wal**); +int sqlite3WalOpen(sqlite3_vfs*, sqlite3_file*, const char *,int,i64,int,Wal**); int sqlite3WalClose(Wal *pWal, sqlite3*, int sync_flags, int, u8 *); /* Set the limiting size of a WAL file. */ void sqlite3WalLimit(Wal*, i64); @@ -152,11 +153,17 @@ int sqlite3WalFramesize(Wal *pWal); #endif /* Return the sqlite3_file object for the WAL file */ sqlite3_file *sqlite3WalFile(Wal *pWal); + +/* Return the journal mode (WAL or WAL2) used by this Wal object. */ +int sqlite3WalJournalMode(Wal *pWal); + +/* sqlite3_wal_info() data */ +int sqlite3WalInfo(Wal *pWal, u32 *pnPrior, u32 *pnFrame); /* sqlite3_wal_info() data */ int sqlite3WalInfo(Wal *pWal, u32 *pnPrior, u32 *pnFrame); #endif /* ifndef SQLITE_OMIT_WAL */ #endif /* SQLITE_WAL_H */ Index: test/concfault.test ================================================================== --- test/concfault.test +++ test/concfault.test @@ -80,48 +80,7 @@ faultsim_test_result {0 {}} catchsql { ROLLBACK } faultsim_integrity_check } - -#------------------------------------------------------------------------- -reset_db - -do_execsql_test 2.0 { - PRAGMA auto_vacuum = 0; - PRAGMA journal_mode = wal; - CREATE TABLE t1(a PRIMARY KEY, b); - CREATE TABLE t2(a PRIMARY KEY, b); - INSERT INTO t1 VALUES(randomblob(1000), randomblob(100)); - INSERT INTO t1 SELECT randomblob(1000), randomblob(1000) FROM t1; - INSERT INTO t1 SELECT randomblob(1000), randomblob(1000) FROM t1; - INSERT INTO t1 SELECT randomblob(1000), randomblob(1000) FROM t1; - INSERT INTO t1 SELECT randomblob(1000), randomblob(1000) FROM t1; - DELETE FROM t1 WHERE rowid%2; -} {wal} - -faultsim_save_and_close -do_faultsim_test 1 -prep { - faultsim_restore_and_reopen - execsql { - SELECT * FROM t1; - BEGIN CONCURRENT; - INSERT INTO t2 VALUES(1, 2); - } - sqlite3 db2 test.db - execsql { - PRAGMA journal_size_limit = 10000; - INSERT INTO t1 VALUES(randomblob(1000), randomblob(1000)); - } db2 - db2 
close -} -body { - execsql { COMMIT } -} -test { - faultsim_test_result {0 {}} - catchsql { ROLLBACK } - set res [catchsql { SELECT count(*) FROM t1 }] - if {$res!="0 9"} { error "expected {0 9} got {$res}" } - faultsim_integrity_check -} - finish_test ADDED test/concfault2.test Index: test/concfault2.test ================================================================== --- /dev/null +++ test/concfault2.test @@ -0,0 +1,69 @@ +# 2018 Dec 28 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# +# This file contains fault injection tests designed to test the concurrent +# transactions feature. +# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/malloc_common.tcl +set testprefix concfault2 + +ifcapable !concurrent { + finish_test + return +} + +do_execsql_test 1.0 { + PRAGMA auto_vacuum = 0; + PRAGMA journal_mode = wal2; + CREATE TABLE t1(a PRIMARY KEY, b); + CREATE TABLE t2(a PRIMARY KEY, b); + INSERT INTO t1 VALUES(randomblob(1000), randomblob(100)); + INSERT INTO t1 SELECT randomblob(1000), randomblob(1000) FROM t1; + INSERT INTO t1 SELECT randomblob(1000), randomblob(1000) FROM t1; + INSERT INTO t1 SELECT randomblob(1000), randomblob(1000) FROM t1; + INSERT INTO t1 SELECT randomblob(1000), randomblob(1000) FROM t1; + DELETE FROM t1 WHERE rowid%2; +} {wal2} + +do_test 1.1 { + list [expr [file size test.db-wal]>75000] [file size test.db-shm] +} {1 32768} + +faultsim_save_and_close + +do_faultsim_test 1 -prep { + faultsim_restore_and_reopen + execsql { + SELECT * FROM t1; + BEGIN CONCURRENT; + INSERT INTO t2 VALUES(1, 2); + } + sqlite3 db2 test.db + execsql { + PRAGMA journal_size_limit = 10000; + INSERT INTO t1 VALUES(randomblob(1000), 
randomblob(1000)); + } db2 + db2 close +} -body { + execsql { COMMIT } +} -test { + faultsim_test_result {0 {}} + catchsql { ROLLBACK } + set res [catchsql { SELECT count(*) FROM t1 }] + if {$res!="0 9"} { error "expected {0 9} got {$res}" } + faultsim_integrity_check +} + +finish_test + Index: test/concurrent2.test ================================================================== --- test/concurrent2.test +++ test/concurrent2.test @@ -20,10 +20,14 @@ ifcapable !concurrent { finish_test return } + +do_test 0.1 { + llength [sqlite3_wal_info db main] +} {2} do_multiclient_test tn { do_test 1.$tn.1 { sql1 { Index: test/corruptA.test ================================================================== --- test/corruptA.test +++ test/corruptA.test @@ -45,11 +45,11 @@ # db close forcecopy test.db test.db-template set unreadable_version 02 -ifcapable wal { set unreadable_version 03 } +ifcapable wal { set unreadable_version 04 } do_test corruptA-2.1 { forcecopy test.db-template test.db hexio_write test.db 19 $unreadable_version ;# the read format number sqlite3 db test.db catchsql {SELECT * FROM t1} Index: test/permutations.test ================================================================== --- test/permutations.test +++ test/permutations.test @@ -428,22 +428,32 @@ # coverage-wal # test_suite "coverage-wal" -description { Coverage tests for file wal.c. 
} -files { - wal.test wal2.test wal3.test wal4.test wal5.test - wal64k.test wal6.test wal7.test wal8.test wal9.test - walbak.test walbig.test walblock.test walcksum.test walcrash2.test - walcrash3.test walcrash4.test walcrash.test walfault.test walhook.test - walmode.test walnoshm.test waloverwrite.test walpersist.test - walprotocol2.test walprotocol.test walro2.test walrofault.test - walro.test walshared.test walslow.test walvfs.test - walfault2.test - nockpt.test - +wal2big.test wal2recover.test wal2rewrite.test +wal2simple.test wal2snapshot.test wal2.test +wal3.test wal4.test wal5.test +wal64k.test wal6.test wal7.test wal8.test wal9.test +walbak.test walbig.test walblock.test walcksum.test +walfault.test walhook.test walmode.test walnoshm.test +waloverwrite.test walpersist.test walprotocol2.test +walprotocol.test walro2.test walrofault.test walro.test +walshared.test walslow.test wal.test +wal2savepoint.test wal2lock.test wal2recover2.test + + wal2concurrent.test + concurrent.test concurrent2.test concurrent3.test + concurrent4.test concurrent5.test concurrent6.test + concurrent7.test + concfault.test concfault2.test + + walvfs.test walfault2.test nockpt.test snapshot2.test snapshot3.test snapshot4.test snapshot_fault.test snapshot.test snapshot_up.test + walcrash2.test walcrash3.test walcrash4.test walcrash.test + wal2fault.test } test_suite "coverage-pager" -description { Coverage tests for file pager.c. 
} -files { @@ -982,10 +992,27 @@ } } test_suite "wal" -description { Run tests with journal_mode=WAL +} -initialize { + set ::G(savepoint6_iterations) 100 +} -shutdown { + unset -nocomplain ::G(savepoint6_iterations) +} -files { + savepoint.test savepoint2.test savepoint6.test + trans.test avtrans.test + + fts3aa.test fts3ab.test fts3ac.test fts3ad.test + fts3ae.test fts3af.test fts3ag.test fts3ah.test + fts3ai.test fts3aj.test fts3ak.test fts3al.test + fts3am.test fts3an.test fts3ao.test fts3b.test + fts3c.test fts3d.test fts3e.test fts3query.test +} + +test_suite "wal2" -description { + Run tests with journal_mode=WAL2 } -initialize { set ::G(savepoint6_iterations) 100 } -shutdown { unset -nocomplain ::G(savepoint6_iterations) } -files { Index: test/rdonly.test ================================================================== --- test/rdonly.test +++ test/rdonly.test @@ -39,19 +39,19 @@ # do_test rdonly-1.1.1 { sqlite3_db_readonly db main } {0} -# Changes the write version from 1 to 3. Verify that the database +# Changes the write version from 1 to 4. Verify that the database # can be read but not written. # do_test rdonly-1.2 { db close hexio_get_int [hexio_read test.db 18 1] } 1 do_test rdonly-1.3 { - hexio_write test.db 18 03 + hexio_write test.db 18 04 sqlite3 db test.db execsql { SELECT * FROM t1; } } {1} @@ -81,15 +81,15 @@ # write-version of the file (and the change-counter, so that the # write-version is reloaded). This way, SQLite does not discover that # the database is read-only until after it is locked. 
# set ro_version 02 -ifcapable wal { set ro_version 03 } +ifcapable wal { set ro_version 04 } do_test rdonly-1.6 { hexio_write test.db 18 $ro_version ; # write-version hexio_write test.db 24 11223344 ; # change-counter catchsql { INSERT INTO t1 VALUES(2); } } {1 {attempt to write a readonly database}} finish_test Index: test/savepoint.test ================================================================== --- test/savepoint.test +++ test/savepoint.test @@ -26,10 +26,11 @@ execsql { SAVEPOINT sp1; RELEASE sp1; } } {} +wal_check_journal_mode savepoint-1.1 do_test savepoint-1.2 { execsql { SAVEPOINT sp1; ROLLBACK TO sp1; } @@ -803,11 +804,12 @@ } } {} integrity_check savepoint-11.7 do_test savepoint-11.8 { execsql { ROLLBACK } - execsql { PRAGMA wal_checkpoint } + db close + sqlite3 db test.db file size test.db } {8192} do_test savepoint-11.9 { execsql { Index: test/savepoint6.test ================================================================== --- test/savepoint6.test +++ test/savepoint6.test @@ -13,10 +13,14 @@ set testdir [file dirname $argv0] source $testdir/tester.tcl proc sql {zSql} { + if {0 && $::debug_op} { + puts stderr "$zSql ;" + flush stderr + } uplevel db eval [list $zSql] #puts stderr "$zSql ;" } set DATABASE_SCHEMA { @@ -65,15 +69,17 @@ # # insert_rows XVALUES # delete_rows XVALUES # proc savepoint {zName} { + if {$::debug_op} { puts stderr "savepoint $zName" ; flush stderr } catch { sql "SAVEPOINT $zName" } lappend ::lSavepoint [list $zName [array get ::aEntry]] } proc rollback {zName} { + if {$::debug_op} { puts stderr "rollback $zName" ; flush stderr } catch { sql "ROLLBACK TO $zName" } for {set i [expr {[llength $::lSavepoint]-1}]} {$i>=0} {incr i -1} { set zSavepoint [lindex $::lSavepoint $i 0] if {$zSavepoint eq $zName} { unset -nocomplain ::aEntry @@ -87,10 +93,11 @@ } } } proc release {zName} { + if {$::debug_op} { puts stderr "release $zName" ; flush stderr } catch { sql "RELEASE $zName" } for {set i [expr {[llength $::lSavepoint]-1}]} 
{$i>=0} {incr i -1} { set zSavepoint [lindex $::lSavepoint $i 0] if {$zSavepoint eq $zName} { set ::lSavepoint [lreplace $::lSavepoint $i end] @@ -102,10 +109,11 @@ #puts stderr "-- End of transaction!!!!!!!!!!!!!" } } proc insert_rows {lX} { + if {$::debug_op} { puts stderr "insert_rows $lX" ; flush stderr } foreach x $lX { set y [x_to_y $x] # Update database [db] sql "INSERT OR REPLACE INTO t1 VALUES($x, '$y')" @@ -114,10 +122,11 @@ set ::aEntry($x) $y } } proc delete_rows {lX} { + if {$::debug_op} { puts stderr "delete_rows $lX" ; flush stderr } foreach x $lX { # Update database [db] sql "DELETE FROM t1 WHERE x = $x" # Update the Tcl database. @@ -161,10 +170,15 @@ lappend ret [expr int(rand()*$nRange)] } return $ret } #------------------------------------------------------------------------- + +set ::debug_op 0 +proc debug_ops {} { + set ::debug_op 1 +} proc database_op {} { set i [expr int(rand()*2)] if {$i==0} { insert_rows [random_integers 100 1000] @@ -183,13 +197,10 @@ set cmds {savepoint savepoint savepoint savepoint release rollback} set C [lindex $cmds [expr int(rand()*6)]] set N [lindex $names [expr int(rand()*5)]] - #puts stderr " $C $N ; " - #flush stderr - $C $N return ok } expr srand(0) Index: test/tester.tcl ================================================================== --- test/tester.tcl +++ test/tester.tcl @@ -589,10 +589,11 @@ proc reset_db {} { catch {db close} forcedelete test.db forcedelete test.db-journal forcedelete test.db-wal + forcedelete test.db-wal2 sqlite3 db ./test.db set ::DB [sqlite3_connection_pointer db] if {[info exists ::SETUP_SQL]} { db eval $::SETUP_SQL } @@ -2138,21 +2139,36 @@ # wal_is_wal_mode # # Returns true if this test should be run in WAL mode. False otherwise. 
# proc wal_is_wal_mode {} { - expr {[permutation] eq "wal"} + if {[permutation] eq "wal"} { return 1 } + if {[permutation] eq "wal2"} { return 2 } + return 0 } proc wal_set_journal_mode {{db db}} { - if { [wal_is_wal_mode] } { - $db eval "PRAGMA journal_mode = WAL" + switch -- [wal_is_wal_mode] { + 0 { + } + + 1 { + $db eval "PRAGMA journal_mode = WAL" + } + + 2 { + $db eval "PRAGMA journal_mode = WAL2" + } } } proc wal_check_journal_mode {testname {db db}} { if { [wal_is_wal_mode] } { $db eval { SELECT * FROM sqlite_master } - do_test $testname [list $db eval "PRAGMA main.journal_mode"] {wal} + set expected "wal" + if {[wal_is_wal_mode]==2} { + set expected "wal2" + } + do_test $testname [list $db eval "PRAGMA main.journal_mode"] $expected } } proc wal_is_capable {} { ifcapable !wal { return 0 } Index: test/uri.test ================================================================== --- test/uri.test +++ test/uri.test @@ -278,15 +278,15 @@ PRAGMA aux.journal_mode = WAL; INSERT INTO t1 VALUES('x', 'y'); INSERT INTO t2 VALUES('x', 'y'); } lsort [array names ::T1] - } {test.db1 test.db1-journal test.db1-wal} + } {test.db1 test.db1-journal test.db1-wal test.db1-wal2} do_test 5.1.2 { lsort [array names ::T2] - } {test.db2 test.db2-journal test.db2-wal} + } {test.db2 test.db2-journal test.db2-wal test.db2-wal2} db close tvfs1 delete tvfs2 delete } Index: test/wal.test ================================================================== --- test/wal.test +++ test/wal.test @@ -1172,28 +1172,28 @@ 7 8192 1 8 16384 1 9 32768 1 10 65536 1 11 131072 0 - 11 1016 0 + 12 1016 0 } { if {$::SQLITE_MAX_PAGE_SIZE < $pgsz} { set works 0 } for {set pg 1} {$pg <= 3} {incr pg} { forcecopy testX.db test.db forcedelete test.db-wal - + # Check that the database now exists and consists of three pages. And # that there is no associated wal file. 
# do_test wal-18.2.$tn.$pg.1 { file exists test.db-wal } 0 do_test wal-18.2.$tn.$pg.2 { file exists test.db } 1 do_test wal-18.2.$tn.$pg.3 { file size test.db } [expr 1024*3] - + do_test wal-18.2.$tn.$pg.4 { # Create a wal file that contains a single frame (database page # number $pg) with the commit flag set. The frame checksum is # correct, but the contents of the database page are corrupt. @@ -1221,20 +1221,20 @@ fconfigure $fd -encoding binary -translation binary puts -nonewline $fd $walhdr puts -nonewline $fd $framehdr puts -nonewline $fd $framebody close $fd - + file size test.db-wal } [wal_file_size 1 $pgsz] - + do_test wal-18.2.$tn.$pg.5 { sqlite3 db test.db set rc [catch { db one {PRAGMA integrity_check} } msg] expr { $rc!=0 || $msg!="ok" } } $works - + db close } } #------------------------------------------------------------------------- ADDED test/wal2big.test Index: test/wal2big.test ================================================================== --- /dev/null +++ test/wal2big.test @@ -0,0 +1,71 @@ +# 2017 September 19 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# This file implements regression tests for SQLite library. The +# focus of this file is testing the operation of the library in +# "PRAGMA journal_mode=WAL2" mode. 
+# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/lock_common.tcl +source $testdir/malloc_common.tcl +source $testdir/wal_common.tcl + +set testprefix wal2big +ifcapable !wal {finish_test ; return } + +do_execsql_test 1.0 { + CREATE TABLE t1(a, b, c); + CREATE INDEX t1a ON t1(a); + CREATE INDEX t1b ON t1(b); + CREATE INDEX t1c ON t1(c); + PRAGMA journal_mode = wal2; + PRAGMA journal_size_limit = 10000000; + + WITH s(i) AS ( + SELECT 1 UNION ALL SELECT i+1 FROM s WHERE i<200000 + ) + INSERT INTO t1 SELECT random(), random(), random() FROM s; +} {wal2 10000000} + +do_execsql_test 1.1 { + WITH s(i) AS ( + SELECT 1 UNION ALL SELECT i+1 FROM s WHERE i<200000 + ) + INSERT INTO t1 SELECT random(), random(), random() FROM s; +} + +do_test 1.2 { + list [expr [file size test.db-wal]>10000000] \ + [expr [file size test.db-wal2]>10000000] +} {1 1} + +do_test 1.3 { + sqlite3 db2 test.db + execsql { + SELECT count(*) FROM t1; + PRAGMA integrity_check; + } db2 +} {400000 ok} + +do_test 1.4 { + db2 close + forcecopy test.db test.db2 + forcecopy test.db-wal test.db2-wal + forcecopy test.db-wal2 test.db2-wal2 + sqlite3 db2 test.db2 + execsql { + SELECT count(*) FROM t1; + PRAGMA integrity_check; + } +} {400000 ok} + +finish_test ADDED test/wal2concurrent.test Index: test/wal2concurrent.test ================================================================== --- /dev/null +++ test/wal2concurrent.test @@ -0,0 +1,164 @@ +# 2018 December 6 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. 
+# +#*********************************************************************** +# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/lock_common.tcl +set ::testprefix wal2concurrent + +ifcapable !concurrent { + finish_test + return +} + + +#------------------------------------------------------------------------- +# Warm-body test. +# +foreach tn {1 2} { + reset_db + sqlite3 db2 test.db + do_execsql_test 1.0 { + PRAGMA page_size = 1024; + CREATE TABLE t1(x); + CREATE TABLE t2(y); + PRAGMA journal_size_limit = 5000; + PRAGMA journal_mode = wal2; + } {5000 wal2} + + do_execsql_test 1.1 { + INSERT INTO t1 VALUES(1); + BEGIN CONCURRENT; + INSERT INTO t1 VALUES(2); + } {} + + do_test 1.2 { + execsql { + PRAGMA journal_size_limit = 5000; + INSERT INTO t1 VALUES(3) + } db2 + catchsql { COMMIT } + } {1 {database is locked}} + + do_catchsql_test 1.3 { COMMIT } {1 {database is locked}} + do_catchsql_test 1.4 { ROLLBACK } {0 {}} + + do_test 1.5 { + list [file size test.db-wal] [file size test.db-wal2] + } {2128 0} + + do_execsql_test 1.6 { + BEGIN CONCURRENT; + INSERT INTO t1 VALUES(2); + } {} + + do_test 1.7 { + execsql { INSERT INTO t2 VALUES(randomblob(4000)) } db2 + list [file size test.db-wal] [file size test.db-wal2] + } {7368 0} + + if {$tn==1} { + do_test 1.8 { + execsql { + INSERT INTO t2 VALUES(1); + INSERT INTO t1 VALUES(5); + } db2 + list [file size test.db-wal] [file size test.db-wal2] + } {7368 2128} + + do_catchsql_test 1.9 { COMMIT } {1 {database is locked}} + do_catchsql_test 1.10 { ROLLBACK } {0 {}} + db close + sqlite3 db test.db + do_execsql_test 1.11 { SELECT * FROM t1 } {1 3 5} + do_execsql_test 1.12 { SELECT count(*) FROM t2 } {2} + } else { + do_test 1.8 { + execsql { + INSERT INTO t2 VALUES(1); + } db2 + list [file size test.db-wal] [file size test.db-wal2] + } {7368 1080} + + do_catchsql_test 1.9 { COMMIT } {0 {}} + db close + sqlite3 db test.db + do_execsql_test 1.11 { SELECT * FROM t1 } {1 3 2} + do_execsql_test 1.12 { 
SELECT count(*) FROM t2 } {2} + + do_test 1.13 { + list [file size test.db-wal] [file size test.db-wal2] + } {7368 2128} + } +} + +do_multiclient_test tn { + do_test 2.$tn.1 { + sql1 { + PRAGMA auto_vacuum = OFF; + CREATE TABLE t1(x UNIQUE); + CREATE TABLE t2(x UNIQUE); + PRAGMA journal_mode = wal2; + PRAGMA journal_size_limit = 15000; + } + } {wal2 15000} + + do_test 2.$tn.2 { + sql1 { + WITH s(i) AS ( + SELECT 1 UNION ALL SELECT i+1 FROM s WHERE i<=10 + ) + INSERT INTO t1 SELECT randomblob(800) FROM s; + } + } {} + + do_test 2.$tn.3 { + sql1 { DELETE FROM t1 WHERE (rowid%4)==0 } + list [expr [file size test.db-wal]>15000] \ + [expr [file size test.db-wal2]>15000] + } {1 0} + + do_test 2.$tn.4 { + sql1 { PRAGMA wal_checkpoint; } + sql1 { + BEGIN CONCURRENT; + INSERT INTO t1 VALUES(randomblob(800)); + } + } {} + + do_test 2.$tn.5 { + sql2 { + PRAGMA journal_size_limit = 15000; + INSERT INTO t2 VALUES(randomblob(800)); + INSERT INTO t2 VALUES(randomblob(800)); + INSERT INTO t2 VALUES(randomblob(800)); + INSERT INTO t2 VALUES(randomblob(800)); + INSERT INTO t2 VALUES(randomblob(800)); + DELETE FROM t2; + } + list [expr [file size test.db-wal]>15000] \ + [expr [file size test.db-wal2]>15000] + } {1 1} + + do_test 2.$tn.6 { + sql1 { + INSERT INTO t1 VALUES(randomblob(800)); + COMMIT; + PRAGMA integrity_check; + } + } {ok} +} + + + +finish_test + ADDED test/wal2fault.test Index: test/wal2fault.test ================================================================== --- /dev/null +++ test/wal2fault.test @@ -0,0 +1,52 @@ +# 2010 May 03 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# This file implements regression tests for SQLite library. 
The +# focus of this file is testing the operation of the library in +# "PRAGMA journal_mode=WAL2" mode. +# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/malloc_common.tcl +source $testdir/lock_common.tcl + +ifcapable !wal {finish_test ; return } +set testprefix wal2fault + +do_execsql_test 1.0 { + CREATE TABLE t1(x,y); + PRAGMA journal_mode = wal2; + WITH s(i) AS ( SELECT 100 UNION ALL SELECT i-1 FROM s WHERE (i-1)>0 ) + INSERT INTO t1 SELECT i, randomblob(i) FROM s; + WITH s(i) AS ( SELECT 100 UNION ALL SELECT i-1 FROM s WHERE (i-1)>0 ) + INSERT INTO t1 SELECT i, randomblob(i) FROM s; +} {wal2} + +do_test 1.1 { + expr [file size test.db-wal]>10000 +} {1} +faultsim_save_and_close + +do_faultsim_test 1 -prep { + faultsim_restore_and_reopen + execsql { + PRAGMA journal_size_limit = 10000; + SELECT count(*) FROM sqlite_master; + } +} -body { + execsql { + INSERT INTO t1 VALUES(1, 2); + } +} -test { + faultsim_test_result {0 {}} +} + +finish_test ADDED test/wal2lock.test Index: test/wal2lock.test ================================================================== --- /dev/null +++ test/wal2lock.test @@ -0,0 +1,106 @@ +# 2018 December 15 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# This file implements regression tests for SQLite library. The +# focus of this file is testing the operation of the library in +# "PRAGMA journal_mode=WAL2" mode.
+# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/lock_common.tcl +source $testdir/malloc_common.tcl +source $testdir/wal_common.tcl + +set testprefix wal2lock +ifcapable !wal {finish_test ; return } + +db close +testvfs tvfs +sqlite3 db test.db -vfs tvfs + +do_execsql_test 1.0 { + PRAGMA journal_mode = wal2; + CREATE TABLE y1(y, yy); + CREATE INDEX y1y ON y1(y); + CREATE INDEX y1yy ON y1(yy); + INSERT INTO y1 VALUES(1, 2), (3, 4), (5, 6); +} {wal2} + +tvfs script vfs_callback +tvfs filter xShmLock + +set ::lock [list] +proc vfs_callback {func file name lock} { + lappend ::lock $lock + return SQLITE_OK +} + +do_execsql_test 1.1.1 { + SELECT * FROM y1 +} {1 2 3 4 5 6} +do_test 1.1.2 { + set ::lock +} {{4 1 lock shared} {4 1 unlock shared}} + +set ::bFirst 1 +proc vfs_callback {func file name lock} { + if {$::bFirst} { + set ::bFirst 0 + return SQLITE_BUSY + } + return SQLITE_OK +} +do_execsql_test 1.2 { + SELECT * FROM y1 +} {1 2 3 4 5 6} + +set ::bFirst 1 +proc vfs_callback {func file name lock} { + if {$::bFirst} { + set ::bFirst 0 + return SQLITE_IOERR + } + return SQLITE_OK +} +do_catchsql_test 1.3 { + SELECT * FROM y1 +} {1 {disk I/O error}} + +puts "# Warning: This next test case causes SQLite to call xSleep(1) 100 times." +puts "# Normally this equates to a delay of roughly 10 seconds, but if SQLite" +puts "# is built on unix without HAVE_USLEEP defined, it may be much longer." 
+proc vfs_callback {func file name lock} { return SQLITE_BUSY } +do_catchsql_test 1.4 { + SELECT * FROM y1 +} {1 {locking protocol}} +proc vfs_callback {func file name lock} { return SQLITE_OK } + +sqlite3 db2 test.db -vfs tvfs +set ::bFirst 1 + +proc vfs_callback {func file name lock} { + if {$::bFirst} { + set ::bFirst 0 + db2 eval { INSERT INTO y1 VALUES(7, 8) } + } +} + +do_execsql_test 1.5.1 { + SELECT * FROM y1 +} {1 2 3 4 5 6 7 8} +do_execsql_test 1.5.2 { + SELECT * FROM y1 +} {1 2 3 4 5 6 7 8} + +db close +db2 close +tvfs delete +finish_test ADDED test/wal2recover.test Index: test/wal2recover.test ================================================================== --- /dev/null +++ test/wal2recover.test @@ -0,0 +1,265 @@ +# 2018 December 13 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# This file implements regression tests for SQLite library. The +# focus of this file is testing the operation of the library in +# "PRAGMA journal_mode=WAL2" mode. 
+# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/lock_common.tcl +source $testdir/malloc_common.tcl +source $testdir/wal_common.tcl + +set testprefix wal2recover +ifcapable !wal {finish_test ; return } + +proc db_copy {from to} { + forcecopy $from $to + forcecopy ${from}-wal ${to}-wal + forcecopy ${from}-wal2 ${to}-wal2 +} + +do_execsql_test 1.0 { + CREATE TABLE t1(a, b, c); + CREATE INDEX t1a ON t1(a); + CREATE INDEX t1b ON t1(b); + CREATE INDEX t1c ON t1(c); + PRAGMA journal_mode = wal2; + PRAGMA journal_size_limit = 15000; + PRAGMA wal_autocheckpoint = 0; +} {wal2 15000 0} + +do_test 1.1 { + for {set i 1} {$i <= 1000} {incr i} { + execsql { INSERT INTO t1 VALUES(random(), random(), random()) } + db_copy test.db test.db2 + sqlite3 db2 test.db + set res [execsql { + SELECT count(*) FROM t1; + PRAGMA integrity_check; + } db2] + db2 close + if {$res != [list $i ok]} { + error "failure on iteration $i" + } + } + set {} {} +} {} + +#-------------------------------------------------------------------------- +reset_db +do_execsql_test 2.0 { + CREATE TABLE t1(x UNIQUE); + CREATE TABLE t2(x UNIQUE); + PRAGMA journal_mode = wal2; + PRAGMA journal_size_limit = 10000; + PRAGMA wal_autocheckpoint = 0; + BEGIN; + INSERT INTO t1 VALUES(randomblob(4000)); + INSERT INTO t1 VALUES(randomblob(4000)); + INSERT INTO t1 VALUES(randomblob(4000)); + COMMIT; + BEGIN; + INSERT INTO t2 VALUES(randomblob(4000)); + INSERT INTO t2 VALUES(randomblob(4000)); + INSERT INTO t2 VALUES(randomblob(4000)); + COMMIT; +} {wal2 10000 0} +do_test 2.0.1 { + list [file size test.db] [file size test.db-wal] [file size test.db-wal2] +} {5120 28328 28328} + +# Test recovery with both wal files intact. 
+# +do_test 2.1 { + db_copy test.db test.db2 + sqlite3 db2 test.db2 + execsql { + SELECT count(*) FROM t1; + SELECT count(*) FROM t2; + PRAGMA integrity_check; + } db2 +} {3 3 ok} + +do_test 2.2 { + db2 close + db_copy test.db test.db2 + hexio_write test.db2-wal 16 12345678 + sqlite3 db2 test.db2 + execsql { + SELECT count(*) FROM t1; + SELECT count(*) FROM t2; + } db2 +} {0 3} + +do_test 2.3 { + db2 close + db_copy test.db test.db2 + hexio_write test.db2-wal2 16 12345678 + sqlite3 db2 test.db2 + execsql { + SELECT count(*) FROM t1; + SELECT count(*) FROM t2; + PRAGMA integrity_check; + } db2 +} {3 0 ok} + +do_test 2.4 { + db2 close + db_copy test.db test.db2 + forcecopy test.db-wal test.db2-wal2 + sqlite3 db2 test.db2 + execsql { + SELECT count(*) FROM t1; + SELECT count(*) FROM t2; + PRAGMA integrity_check; + } db2 +} {3 0 ok} + +do_test 2.5 { + db2 close + db_copy test.db test.db2 + forcecopy test.db-wal test.db2-wal2 + forcecopy test.db-wal2 test.db2-wal + sqlite3 db2 test.db2 + execsql { + SELECT count(*) FROM t1; + SELECT count(*) FROM t2; + PRAGMA integrity_check; + } db2 +} {3 3 ok} + +do_test 2.6 { + db2 close + db_copy test.db test.db2 + forcecopy test.db-wal test.db2-wal2 + close [open test.db-wal w] + sqlite3 db2 test.db2 + execsql { + SELECT count(*) FROM t1; + SELECT count(*) FROM t2; + PRAGMA integrity_check; + } db2 +} {3 0 ok} + +do_test 2.7 { + db2 close + db_copy test.db test.db2 + forcedelete test.db2-wal + sqlite3 db2 test.db2 + execsql { + SELECT count(*) FROM t1; + SELECT count(*) FROM t2; + PRAGMA integrity_check; + } db2 +} {0 0 ok} + +#------------------------------------------------------------------------- +# +reset_db +do_execsql_test 3.0 { + CREATE TABLE t1(a TEXT, b TEXT, c TEXT); + CREATE INDEX t1a ON t1(a); + CREATE INDEX t1b ON t1(b); + CREATE INDEX t1c ON t1(c); + PRAGMA journal_mode = wal2; + PRAGMA journal_size_limit = 10000; + PRAGMA wal_autocheckpoint = 0; + PRAGMA cache_size = 5; +} {wal2 10000 0} + +do_execsql_test 3.1 { + 
WITH s(i) AS ( SELECT 1 UNION ALL SELECT i+1 FROM s where i < 200) + INSERT INTO t1 SELECT i, i, i FROM s; + + INSERT INTO t1 VALUES(201, 201, 201); +} {} + +do_test 3.2 { + list [file size test.db] [file size test.db-wal] [file size test.db-wal2] +} {5120 15752 4224} + +do_test 3.3 { + forcecopy test.db test.db2 + forcecopy test.db-wal test.db2-wal + forcecopy test.db-wal2 test.db2-wal2 + sqlite3 db2 test.db2 + execsql { + PRAGMA journal_size_limit = 10000; + PRAGMA wal_autocheckpoint = 0; + PRAGMA cache_size = 5; + BEGIN; + WITH s(i) AS ( SELECT 1 UNION ALL SELECT i+1 FROM s where i < 200) + INSERT INTO t1 SELECT i, i, i FROM s; + } db2 + list [file size test.db2] [file size test.db2-wal] [file size test.db2-wal2] +} {5120 15752 23088} + +do_test 3.4 { + set fd [open test.db2-shm] + fconfigure $fd -encoding binary -translation binary + set data [read $fd] + close $fd + + set fd [open test.db-shm w] + fconfigure $fd -encoding binary -translation binary + puts -nonewline $fd $data + close $fd + + execsql { + WITH s(i) AS ( SELECT 1 UNION ALL SELECT i+1 FROM s where i < 10) + INSERT INTO t1 SELECT i, i, i FROM s; + SELECT count(*) FROM t1; + PRAGMA integrity_check; + } +} {211 ok} + +do_test 3.5 { + list [file size test.db] [file size test.db-wal] [file size test.db-wal2] +} {5120 15752 18896} + +#------------------------------------------------------------------------- +# +reset_db +do_execsql_test 4.0 { + PRAGMA journal_mode = wal2; + CREATE TABLE xyz(x, y, z); + INSERT INTO xyz VALUES('x', 'y', 'z'); +} {wal2} +db close +do_test 4.1 { + close [open test.db-wal w] + file mkdir test.db-wal2 + sqlite3 db test.db + catchsql { SELECT * FROM xyz } +} {1 {unable to open database file}} +db close +file delete test.db-wal2 + +do_test 4.2 { + sqlite3 db test.db + execsql { + INSERT INTO xyz VALUES('a', 'b', 'c'); + } + forcecopy test.db test.db2 + forcecopy test.db-wal test.db2-wal + forcedelete test.db2-wal2 + file mkdir test.db2-wal2 + sqlite3 db2 test.db2 + catchsql { 
SELECT * FROM xyz } db2 +} {1 {unable to open database file}} +db2 close +file delete test.db2-wal2 + + +finish_test + ADDED test/wal2recover2.test Index: test/wal2recover2.test ================================================================== --- /dev/null +++ test/wal2recover2.test @@ -0,0 +1,313 @@ +# 2018 December 13 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# This file implements regression tests for SQLite library. The +# focus of this file is testing the operation of the library in +# "PRAGMA journal_mode=WAL2" mode. +# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/lock_common.tcl +source $testdir/malloc_common.tcl +source $testdir/wal_common.tcl + +set testprefix wal2recover2 +ifcapable !wal {finish_test ; return } + +do_execsql_test 1.0 { + CREATE TABLE t1(x); + CREATE TABLE t2(x); + WITH s(i) AS ( VALUES(1) UNION ALL SELECT i+1 FROM s WHERE i<1500 ) + INSERT INTO t1 SELECT i FROM s; + WITH s(i) AS ( VALUES(1) UNION ALL SELECT i+1 FROM s WHERE i<1500 ) + INSERT INTO t2 SELECT i FROM s; + + PRAGMA journal_mode = wal2; + PRAGMA journal_size_limit = 10000; +} {wal2 10000} + +set ::L 1125750 +set ::M 1126500 +set ::H 1127250 + +do_execsql_test 1.1 { + UPDATE t1 SET x=x+1; + UPDATE t2 SET x=x+1 WHERE rowid<=750; + + SELECT sum(x) FROM t1; + SELECT sum(x) FROM t2; +} [list $H $M] + +do_test 1.2 { + list [file size test.db] [file size test.db-wal] [file size test.db-wal2] +} {31744 14704 7368} + +proc cksum {zIn data} { + if {[string length $zIn]==0} { + set s0 0 + set s1 0 + } else { + set s0 [hexio_get_int [string range $zIn 0 7]] + set s1 [hexio_get_int [string range $zIn 8 15]] + } + set n [expr [string length $data] / 8] + + 
for {set i 0} {$i < $n} {incr i 2} { + set x0 [hexio_get_int -l [string range $data [expr $i*8] [expr $i*8+7]]] + set x1 [hexio_get_int -l [string range $data [expr $i*8+8] [expr $i*8+8+7]]] + + set s0 [expr ($s0 + $x0 + $s1) & 0xFFFFFFFF] + set s1 [expr ($s1 + $x1 + $s0) & 0xFFFFFFFF] + } + + return "[hexio_render_int32 $s0][hexio_render_int32 $s1]" +} + +proc fix_wal_cksums {file} { + # Fix the checksum on the wal header. + set data [hexio_read $file 0 32] + set cksum [cksum {} [string range $data 0 47]] + set salt [hexio_read $file 16 8] + hexio_write $file 24 $cksum + + # Fix the checksums for all pages in the wal file. + set pgsz [hexio_get_int [hexio_read $file 8 4]] + set sz [file size $file] + for {set off 32} {$off < $sz} {incr off [expr $pgsz+24]} { + set e [hexio_read $file $off 8] + set cksum [cksum $cksum $e] + + set p [hexio_read $file [expr $off+24] $pgsz] + set cksum [cksum $cksum $p] + + hexio_write $file [expr $off+8] $salt + hexio_write $file [expr $off+16] $cksum + } +} + +proc wal_incr_hdrfield {file field} { + switch -- $field { + nCkpt { set offset 12 } + salt0 { set offset 16 } + salt1 { set offset 20 } + default { + error "unknown field $field - should be \"nCkpt\", \"salt0\" or \"salt1\"" + } + } + + # Increment the value in the wal header. + set v [hexio_get_int [hexio_read $file $offset 4]] + incr v + hexio_write $file $offset [hexio_render_int32 $v] + + # Fix various checksums + fix_wal_cksums $file +} + +proc wal_set_nckpt {file val} { + # Set the nCkpt field (wal header offset 12) to $val.
+ hexio_write $file 12 [hexio_render_int32 $val] + + # Fix various checksums + fix_wal_cksums $file +} + +proc wal_set_follow {file prevfile} { + set pgsz [hexio_get_int [hexio_read $prevfile 8 4]] + set sz [file size $prevfile] + set cksum [hexio_read $prevfile [expr $sz-$pgsz-8] 8] + + hexio_write $file 16 $cksum + fix_wal_cksums $file +} + +foreach {tn file field} { + 1 test.db2-wal salt0 + 2 test.db2-wal salt1 + 3 test.db2-wal nCkpt + 4 test.db2-wal2 salt0 + 5 test.db2-wal2 salt1 + 6 test.db2-wal2 nCkpt +} { + do_test 1.3.$tn { + forcecopy test.db test.db2 + forcecopy test.db-wal test.db2-wal + forcecopy test.db-wal2 test.db2-wal2 + wal_incr_hdrfield $file $field + sqlite3 db2 test.db2 + execsql { + SELECT sum(x) FROM t1; + SELECT sum(x) FROM t2; + } db2 + } [list $H $L] + db2 close +} + +do_test 1.4 { + forcecopy test.db test.db2 + forcecopy test.db-wal2 test.db2-wal + forcedelete test.db2-wal2 + sqlite3 db2 test.db2 + execsql { + SELECT sum(x) FROM t1; + SELECT sum(x) FROM t2; + } db2 +} [list $L $M] + +do_test 1.5 { + forcecopy test.db test.db2 + forcecopy test.db-wal2 test.db2-wal + forcecopy test.db-wal test.db2-wal2 + sqlite3 db2 test.db2 + execsql { + SELECT sum(x) FROM t1; + SELECT sum(x) FROM t2; + } db2 +} [list $H $M] + +foreach {tn file field} { + 1 test.db2-wal salt0 + 2 test.db2-wal salt1 + 3 test.db2-wal2 salt0 + 4 test.db2-wal2 salt1 +} { + do_test 1.6.$tn { + forcecopy test.db test.db2 + forcecopy test.db-wal2 test.db2-wal + forcecopy test.db-wal test.db2-wal2 + wal_incr_hdrfield $file $field + sqlite3 db2 test.db2 + execsql { + SELECT sum(x) FROM t1; + SELECT sum(x) FROM t2; + } db2 + } [list $H $L] + db2 close +} + +foreach {tn nCkpt1 nCkpt2 res} [list \ + 1 2 1 "$H $M" \ + 2 2 2 "$L $M" \ + 3 3 1 "$H $L" \ + 4 15 14 "$H $M" \ + 5 0 15 "$H $M" \ + 6 1 15 "$L $M" \ +] { + do_test 1.7.$tn { + forcecopy test.db test.db2 + forcecopy test.db-wal2 test.db2-wal + forcecopy test.db-wal test.db2-wal2 + + wal_set_nckpt test.db2-wal2 $nCkpt2 + 
wal_set_nckpt test.db2-wal $nCkpt1 + wal_set_follow test.db2-wal test.db2-wal2 + + + sqlite3 db2 test.db2 + execsql { + SELECT sum(x) FROM t1; + SELECT sum(x) FROM t2; + } db2 + } $res + db2 close +} + + +#------------------------------------------------------------------------- +reset_db +do_execsql_test 1.8.1 { + PRAGMA autovacuum = 0; + PRAGMA page_size = 4096; + CREATE TABLE t1(x); + CREATE TABLE t2(x); + WITH s(i) AS ( VALUES(1) UNION ALL SELECT i+1 FROM s WHERE i<1500 ) + INSERT INTO t1 SELECT i FROM s; + WITH s(i) AS ( VALUES(1) UNION ALL SELECT i+1 FROM s WHERE i<1500 ) + INSERT INTO t2 SELECT i FROM s; + + PRAGMA journal_mode = wal2; + PRAGMA journal_size_limit = 10000; + + WITH s(i) AS ( VALUES(1) UNION ALL SELECT i+1 FROM s WHERE i<1500 ) + INSERT INTO t2 SELECT i FROM s; +} {wal2 10000} + +do_test 1.8.2 { + list [file size test.db-wal] [file size test.db-wal2] +} {24752 0} + +do_execsql_test 1.8.3 { PRAGMA user_version = 123 } +do_test 1.8.4 { + list [file size test.db-wal] [file size test.db-wal2] +} {24752 4152} + +do_test 1.8.5 { + hexio_write test.db-wal2 [expr 56+16] 0400 + fix_wal_cksums test.db-wal2 +} {} + +do_test 1.8.6 { + forcecopy test.db test.db2 + forcecopy test.db-wal test.db2-wal + forcecopy test.db-wal2 test.db2-wal2 + sqlite3 db2 test.db2 + catchsql { SELECT * FROM sqlite_master } db2 +} {1 {database disk image is malformed}} +db2 close + +#------------------------------------------------------------------------- +reset_db +do_execsql_test 1.0 { + CREATE TABLE t1(a, b, c); + CREATE INDEX t1a ON t1(a); + CREATE INDEX t1b ON t1(b); + CREATE INDEX t1c ON t1(c); + PRAGMA journal_mode = wal2; + + INSERT INTO t1 VALUES(randomblob(50), randomblob(50), randomblob(50)); + INSERT INTO t1 VALUES(randomblob(50), randomblob(50), randomblob(50)); + INSERT INTO t1 VALUES(randomblob(50), randomblob(50), randomblob(50)); + PRAGMA journal_size_limit = 5000; + INSERT INTO t1 VALUES(randomblob(50), randomblob(50), randomblob(50)); + INSERT INTO t1 
VALUES(randomblob(50), randomblob(50), randomblob(50)); + INSERT INTO t1 VALUES(randomblob(50), randomblob(50), randomblob(50)); + INSERT INTO t1 VALUES(randomblob(50), randomblob(50), randomblob(50)); + INSERT INTO t1 VALUES(randomblob(50), randomblob(50), randomblob(50)); +} {wal2 5000} + +do_test 2.1 { + forcecopy test.db test.db2 + forcecopy test.db-wal2 test.db2-wal + forcecopy test.db-wal test.db2-wal2 + + hexio_write test.db2-wal 5000 1234567890 +} {5} + +do_test 2.2 { + sqlite3 db2 test.db2 + breakpoint + execsql { + SELECT count(*) FROM t1; + PRAGMA integrity_check + } db2 +} {4 ok} + +do_test 2.3 { + execsql { + INSERT INTO t1 VALUES(randomblob(50), randomblob(50), randomblob(50)); + SELECT count(*) FROM t1; + PRAGMA integrity_check + } db2 +} {5 ok} + + +finish_test + ADDED test/wal2rewrite.test Index: test/wal2rewrite.test ================================================================== --- /dev/null +++ test/wal2rewrite.test @@ -0,0 +1,92 @@ +# 2017 September 19 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# This file implements regression tests for SQLite library. The +# focus of this file is testing the operation of the library in +# "PRAGMA journal_mode=WAL2" mode. 
+# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/lock_common.tcl +source $testdir/malloc_common.tcl +source $testdir/wal_common.tcl + +set testprefix wal2rewrite +ifcapable !wal {finish_test ; return } + +proc filesize {filename} { + if {[file exists $filename]} { + return [file size $filename] + } + return 0 +} + +foreach {tn jrnlmode} { + 1 wal + 2 wal2 +} { + reset_db + execsql "PRAGMA journal_mode = $jrnlmode" + do_execsql_test $tn.1 { + PRAGMA journal_size_limit = 10000; + PRAGMA cache_size = 5; + PRAGMA wal_autocheckpoint = 10; + + CREATE TABLE t1(a INTEGER PRIMARY KEY, b INTEGER, c BLOB); + CREATE INDEX t1b ON t1(b); + CREATE INDEX t1c ON t1(c); + + WITH s(i) AS ( + SELECT 1 UNION SELECT i+1 FROM s WHERE i<10 + ) + INSERT INTO t1 SELECT i, i, randomblob(800) FROM s; + } {10000 10} + + for {set i 0} {$i < 4} {incr i} { + do_execsql_test $tn.$i.1 { + UPDATE t1 SET c=randomblob(800) WHERE (b%10)==5 AND ($i%2) + } + do_execsql_test $tn.$i.2 { + BEGIN; + UPDATE t1 SET b=b+10, c=randomblob(800); + UPDATE t1 SET b=b+10, c=randomblob(800); + UPDATE t1 SET b=b+10, c=randomblob(800); + UPDATE t1 SET b=b+10, c=randomblob(800); + UPDATE t1 SET b=b+10, c=randomblob(800); + UPDATE t1 SET b=b+10, c=randomblob(800); + UPDATE t1 SET b=b+10, c=randomblob(800); + UPDATE t1 SET b=b+10, c=randomblob(800); + UPDATE t1 SET b=b+10, c=randomblob(800); + UPDATE t1 SET b=b+10, c=randomblob(800); + } + execsql COMMIT + + do_test $tn.$i.3 { expr [filesize test.db-wal] < 100000 } 1 + do_test $tn.$i.4 { expr [filesize test.db-wal2] < 100000 } 1 + + set sum [db eval {SELECT sum(b), md5sum(c) FROM t1}] + + do_test $tn.$i.5 { + foreach f [glob -nocomplain test.db2*] {forcedelete $f} + foreach f [glob -nocomplain test.db*] { + forcecopy $f [string map {test.db test.db2} $f] + } + + sqlite3 db2 test.db2 + db2 eval {SELECT sum(b), md5sum(c) FROM t1} + } $sum + db2 close + } +} + + + +finish_test ADDED test/wal2rollback.test Index: test/wal2rollback.test 
================================================================== --- /dev/null +++ test/wal2rollback.test @@ -0,0 +1,62 @@ +# 2017 September 19 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# This file implements regression tests for SQLite library. The +# focus of this file is testing the operation of the library in +# "PRAGMA journal_mode=WAL2" mode. +# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/lock_common.tcl +source $testdir/malloc_common.tcl +source $testdir/wal_common.tcl + +set testprefix wal2rollback +ifcapable !wal {finish_test ; return } + +do_execsql_test 1.0 { + CREATE TABLE t1(a, b, c); + CREATE TABLE t2(a, b, c); + CREATE INDEX i1 ON t1(a); + CREATE INDEX i2 ON t1(b); + PRAGMA journal_mode = wal2; + PRAGMA cache_size = 5; + PRAGMA journal_size_limit = 10000; + WITH s(i) AS ( + SELECT 1 UNION ALL SELECT i+1 FROM s LIMIT 1000 + ) + INSERT INTO t1 SELECT i, i, randomblob(200) FROM s; +} {wal2 10000} + +do_test 1.1 { + expr [file size test.db-wal] > 10000 +} 1 + +do_test 1.2 { + execsql { + BEGIN; + UPDATE t1 SET b=b+1; + INSERT INTO t2 VALUES(1,2,3); + } + expr [file size test.db-wal2] > 10000 +} {1} + +breakpoint +do_execsql_test 1.3 { + ROLLBACK; + SELECT * FROM t2; + SELECT count(*) FROM t1 WHERE a=b; + PRAGMA integrity_check; +} {1000 ok} + + + +finish_test ADDED test/wal2savepoint.test Index: test/wal2savepoint.test ================================================================== --- /dev/null +++ test/wal2savepoint.test @@ -0,0 +1,73 @@ +# 2018 December 13 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. 
+# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# This file implements regression tests for SQLite library. The +# focus of this file is testing the operation of the library in +# "PRAGMA journal_mode=WAL2" mode. +# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/lock_common.tcl +source $testdir/malloc_common.tcl +source $testdir/wal_common.tcl + +set testprefix wal2savepoint +ifcapable !wal {finish_test ; return } + +do_execsql_test 1.0 { + CREATE TABLE t1(a, b, c); + CREATE INDEX t1a ON t1(a); + CREATE INDEX t1b ON t1(b); + CREATE INDEX t1c ON t1(c); + PRAGMA journal_mode = wal2; + PRAGMA journal_size_limit = 15000; + PRAGMA wal_autocheckpoint = 0; + PRAGMA cache_size = 5; +} {wal2 15000 0} + +do_execsql_test 1.1 { + WITH s(i) AS ( SELECT 1 UNION ALL SELECT i+1 FROM s where i < 200) + INSERT INTO t1 SELECT random(), random(), random() FROM s; +} {} + +do_test 1.2 { + list [file size test.db] [file size test.db-wal] [file size test.db-wal2] +} {5120 23088 0} + +do_execsql_test 1.3 { + BEGIN; + SAVEPOINT abc; + WITH s(i) AS ( SELECT 1 UNION ALL SELECT i+1 FROM s where i < 100) + INSERT INTO t1 SELECT random(), random(), random() FROM s; + ROLLBACK TO abc; + WITH s(i) AS ( SELECT 1 UNION ALL SELECT i+1 FROM s where i < 10) + INSERT INTO t1 SELECT random(), random(), random() FROM s; + COMMIT; + SELECT count(*) FROM t1; + PRAGMA integrity_check; +} {210 ok} + +do_execsql_test 1.4 { + BEGIN; + SAVEPOINT abc; + WITH s(i) AS ( SELECT 1 UNION ALL SELECT i+1 FROM s where i < 100) + INSERT INTO t1 SELECT random(), random(), random() FROM s; + ROLLBACK TO abc; + WITH s(i) AS ( SELECT 1 UNION ALL SELECT i+1 FROM s where i < 10) + INSERT INTO t1 SELECT random(), random(), random() FROM s; + COMMIT; + SELECT count(*) FROM t1; + PRAGMA integrity_check; +} {220 ok} + + +finish_test + ADDED 
test/wal2simple.test Index: test/wal2simple.test ================================================================== --- /dev/null +++ test/wal2simple.test @@ -0,0 +1,474 @@ +# 2017 September 19 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# This file implements regression tests for SQLite library. The +# focus of this file is testing the operation of the library in +# "PRAGMA journal_mode=WAL2" mode. +# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/lock_common.tcl +source $testdir/malloc_common.tcl +source $testdir/wal_common.tcl + +set testprefix wal2simple +ifcapable !wal {finish_test ; return } + +#------------------------------------------------------------------------- +# The following tests verify that a client can switch in and out of wal +# and wal2 mode. But that it is not possible to change directly from wal +# to wal2, or from wal2 to wal mode. 
+# +do_execsql_test 1.1.0 { + PRAGMA journal_mode = wal2 +} {wal2} +execsql { SELECT * FROM sqlite_master} +do_execsql_test 1.x { + PRAGMA journal_mode; + PRAGMA main.journal_mode; +} {wal2 wal2} +db close +do_test 1.1.1 { file size test.db } {1024} +do_test 1.1.2 { hexio_read test.db 18 2 } 0303 + +sqlite3 db test.db +do_execsql_test 1.2.0 { + SELECT * FROM sqlite_master; + PRAGMA journal_mode = delete; +} {delete} +db close +do_test 1.2.1 { file size test.db } {1024} +do_test 1.2.2 { hexio_read test.db 18 2 } 0101 + +sqlite3 db test.db +do_execsql_test 1.3.0 { + SELECT * FROM sqlite_master; + PRAGMA journal_mode = wal; +} {wal} +db close +do_test 1.3.1 { file size test.db } {1024} +do_test 1.3.2 { hexio_read test.db 18 2 } 0202 + +sqlite3 db test.db +do_catchsql_test 1.4.0 { + PRAGMA journal_mode = wal2; +} {1 {cannot change from wal to wal2 mode}} +do_execsql_test 1.4.1 { + PRAGMA journal_mode = wal; + PRAGMA journal_mode = delete; + PRAGMA journal_mode = wal2; + PRAGMA journal_mode = wal2; +} {wal delete wal2 wal2} +do_catchsql_test 1.4.2 { + PRAGMA journal_mode = wal; +} {1 {cannot change from wal2 to wal mode}} +db close +do_test 1.4.3 { hexio_read test.db 18 2 } 0303 + +#------------------------------------------------------------------------- +# Test that recovery in wal2 mode works. 
+# +forcedelete test.db test.db-wal test.db-wal2 +reset_db +do_execsql_test 2.0 { + CREATE TABLE t1(a INTEGER PRIMARY KEY, b); + PRAGMA journal_mode = wal2; + PRAGMA journal_size_limit = 5000; +} {wal2 5000} + +proc wal_hook {DB nm nFrame} { $DB eval { PRAGMA wal_checkpoint } } +db wal_hook {wal_hook db} + +for {set i 1} {$i <= 200} {incr i} { + execsql { INSERT INTO t1 VALUES(NULL, randomblob(100)) } + set res [db eval { SELECT sum(a), md5sum(b) FROM t1 }] + + do_test 2.1.$i { + foreach f [glob -nocomplain test.db2*] { forcedelete $f } + forcecopy test.db test.db2 + forcecopy test.db-wal test.db2-wal + forcecopy test.db-wal2 test.db2-wal2 + + sqlite3 db2 test.db2 + db2 eval { SELECT sum(a), md5sum(b) FROM t1 } + } $res + + db2 close +} + +#------------------------------------------------------------------------- + +reset_db +do_execsql_test 3.0 { + CREATE TABLE t1(x BLOB, y INTEGER PRIMARY KEY); + CREATE INDEX i1 ON t1(x); + PRAGMA cache_size = 5; + PRAGMA journal_mode = wal2; +} {wal2} + +do_test 3.1 { + execsql BEGIN + for {set i 1} {$i < 1000} {incr i} { + execsql { INSERT INTO t1 VALUES(randomblob(800), $i) } + } + execsql COMMIT +} {} + +do_execsql_test 3.2 { + PRAGMA integrity_check; +} {ok} + +#------------------------------------------------------------------------- +catch { db close } +foreach f [glob -nocomplain test.db*] { forcedelete $f } +reset_db +do_execsql_test 4.0 { + CREATE TABLE t1(x, y); + PRAGMA journal_mode = wal2; +} {wal2} + +do_execsql_test 4.1 { + SELECT * FROM t1; +} {} + +do_execsql_test 4.2 { + INSERT INTO t1 VALUES(1, 2); +} {} + +do_execsql_test 4.3 { + SELECT * FROM t1; +} {1 2} + +do_test 4.4 { + sqlite3 db2 test.db + execsql { SELECT * FROM t1 } db2 +} {1 2} + +do_test 4.5 { + lsort [glob test.db*] +} {test.db test.db-shm test.db-wal test.db-wal2} + +do_test 4.6 { + db close + db2 close + sqlite3 db test.db + execsql { SELECT * FROM t1 } +} {1 2} + +do_execsql_test 4.7 { + PRAGMA journal_size_limit = 4000; + INSERT INTO t1 
VALUES(3, 4); + INSERT INTO t1 VALUES(5, 6); + INSERT INTO t1 VALUES(7, 8); + INSERT INTO t1 VALUES(9, 10); + INSERT INTO t1 VALUES(11, 12); + INSERT INTO t1 VALUES(13, 14); + INSERT INTO t1 VALUES(15, 16); + INSERT INTO t1 VALUES(17, 18); + SELECT * FROM t1; +} {4000 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18} + +do_test 4.8 { + sqlite3 db2 test.db + execsql { SELECT * FROM t1 } db2 +} {1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18} + +do_test 4.9 { + db close + db2 close + lsort [glob test.db*] +} {test.db} + +#------------------------------------------------------------------------- +reset_db +do_execsql_test 5.0 { + CREATE TABLE t1(a INTEGER PRIMARY KEY, b, c); + CREATE INDEX i1 ON t1(b, c); + PRAGMA journal_mode = wal2; + PRAGMA journal_size_limit = 4000; +} {wal2 4000} + +proc wal_hook {DB nm nFrame} { + $DB eval { PRAGMA wal_checkpoint } +} +db wal_hook [list wal_hook db] + + +foreach js {4000 8000 12000} { + foreach NROW [list 100 200 300 400 500 600 1000] { + do_test 5.$js.$NROW.1 { + db eval "DELETE FROM t1" + db eval "PRAGMA journal_size_limit = $js" + set nTotal 0 + for {set i 0} {$i < $NROW} {incr i} { + db eval { INSERT INTO t1 VALUES($i, $i, randomblob(abs(random()%50))) } + incr nTotal $i + } + set {} {} + } {} + + do_test 5.$js.$NROW.2 { + sqlite3 db2 test.db + db2 eval { + PRAGMA integrity_check; + SELECT count(*), sum(b) FROM t1; + } + } [list ok $NROW $nTotal] + + db2 close + } +} + + +#------------------------------------------------------------------------- +reset_db +do_execsql_test 6.0 { + CREATE TABLE tx(x); + PRAGMA journal_mode = wal2; + PRAGMA journal_size_limit = 3500; +} {wal2 3500} + +do_test 6.1 { + for {set i 0} {$i < 10} {incr i} { + execsql "CREATE TABLE t$i (x);" + } +} {} + +do_test 6.2.1 { + foreach f [glob -nocomplain test.db2*] { forcedelete $f } + forcecopy test.db-wal2 test.db2-wal2 + sqlite3 db2 test.db2 + db2 eval { SELECT * FROM sqlite_master } +} {} +do_test 6.2.2 { + db2 eval { + PRAGMA journal_mode = wal2; + SELECT * 
FROM sqlite_master; + } +} {wal2} + +do_test 6.3.1 { + db2 close + foreach f [glob -nocomplain test.db2*] { forcedelete $f } + forcecopy test.db-wal2 test.db2-wal2 + forcecopy test.db test.db2 + sqlite3 db2 test.db2 + db2 eval { SELECT * FROM sqlite_master } +} {table tx tx 2 {CREATE TABLE tx(x)}} +do_test 6.3.2 { + db2 eval { + PRAGMA journal_mode = wal2; + SELECT * FROM sqlite_master; + } +} {wal2 table tx tx 2 {CREATE TABLE tx(x)}} + +do_test 6.4.1 { + db2 close + foreach f [glob -nocomplain test.db2*] { forcedelete $f } + forcecopy test.db-wal2 test.db2-wal2 + forcecopy test.db-wal test.db2-wal + sqlite3 db2 test.db2 + db2 eval { SELECT * FROM sqlite_master } +} {} +do_test 6.4.2 { + db2 eval { + PRAGMA journal_mode = wal2; + SELECT * FROM sqlite_master; + } +} {wal2} +db2 close + +#------------------------------------------------------------------------- +reset_db +sqlite3 db2 test.db +do_execsql_test 7.0 { + PRAGMA journal_size_limit = 10000; + PRAGMA journal_mode = wal2; + PRAGMA wal_autocheckpoint = 0; + BEGIN; + CREATE TABLE t1(a); + INSERT INTO t1 VALUES( randomblob(8000) ); + COMMIT; +} {10000 wal2 0} + +do_test 7.1 { + list [file size test.db-wal] [file size test.db-wal2] +} {9464 0} + +# Connection db2 is holding a PART1 lock. +# +# 7.2.2: Test that the PART1 does not prevent db from switching to the +# other wal file. +# +# 7.2.3: Test that the PART1 does prevent a checkpoint of test.db-wal. +# +# 7.2.4: Test that after the PART1 is released the checkpoint is possible. 
+# +do_test 7.2.1 { + execsql { + BEGIN; + SELECT count(*) FROM t1; + } db2 +} {1} +do_test 7.2.2 { + execsql { + INSERT INTO t1 VALUES( randomblob(800) ); + INSERT INTO t1 VALUES( randomblob(800) ); + } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {13656 3176 1024} +do_test 7.2.3 { + execsql { PRAGMA wal_checkpoint } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {13656 3176 1024} +do_test 7.2.4 { + execsql { END } db2 + execsql { PRAGMA wal_checkpoint } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {13656 3176 11264} + +# Connection db2 is holding a PART2_FULL1 lock. +# +# 7.3.2: Test that the lock does not prevent checkpointing. +# +# 7.3.3: Test that the lock does prevent the writer from overwriting +# test.db-wal. +# +# 7.3.4: Test that after the PART2_FULL1 is released the writer can +# switch wal files and overwrite test.db-wal +# +db close +db2 close +sqlite3 db test.db +sqlite3 db2 test.db +do_test 7.3.1 { + execsql { + PRAGMA wal_autocheckpoint = 0; + PRAGMA journal_size_limit = 10000; + INSERT INTO t1 VALUES(randomblob(10000)); + INSERT INTO t1 VALUES(randomblob(500)); + } + execsql { + BEGIN; + SELECT count(*) FROM t1; + } db2 + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {12608 3176 11264} +do_test 7.3.2 { + execsql { PRAGMA wal_checkpoint } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {12608 3176 21504} +do_test 7.3.3 { + execsql { + INSERT INTO t1 VALUES(randomblob(10000)); + INSERT INTO t1 VALUES(randomblob(500)); + } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {12608 18896 21504} +do_test 7.3.4 { + execsql END db2 + execsql { INSERT INTO t1 VALUES(randomblob(5000)); } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {12608 18896 21504} + +# Connection db2 is holding a PART2 lock. 
+# +# 7.4.2: Test that the lock does not prevent writer switching to test.db-wal. +# +# 7.4.3: Test that the lock does prevent checkpointing of test.db-wal2. +# +# 7.4.4: Test that after the PART2 is released test.db-wal2 can be +# checkpointed. +# +db close +db2 close +sqlite3 db test.db +sqlite3 db2 test.db +do_test 7.4.1 { + execsql { + PRAGMA wal_autocheckpoint = 0; + PRAGMA journal_size_limit = 10000; + INSERT INTO t1 VALUES(randomblob(10000)); + INSERT INTO t1 VALUES(randomblob(10000)); + PRAGMA wal_checkpoint; + } + execsql { + BEGIN; + SELECT count(*) FROM t1; + } db2 + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {12608 12608 44032} +do_test 7.4.2 { + execsql { + INSERT INTO t1 VALUES(randomblob(5000)); + } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {12608 12608 44032} +do_test 7.4.3 { + execsql { PRAGMA wal_checkpoint } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {12608 12608 44032} +do_test 7.4.4 { + execsql END db2 + execsql { PRAGMA wal_checkpoint } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {12608 12608 54272} + +# Connection db2 is holding a PART1_FULL2 lock. +# +# 7.5.2: Test that the lock does not prevent a checkpoint of test.db-wal2. +# +# 7.5.3: Test that the lock does prevent the writer from overwriting +# test.db-wal2. +# +# 7.5.4: Test that after the PART1_FULL2 lock is released, the writer +# can switch to test.db-wal2.
+# +db close +db2 close +sqlite3 db test.db +sqlite3 db2 test.db +do_test 7.5.1 { + execsql { + PRAGMA wal_autocheckpoint = 0; + PRAGMA journal_size_limit = 10000; + INSERT INTO t1 VALUES(randomblob(10000)); + INSERT INTO t1 VALUES(randomblob(10000)); + PRAGMA wal_checkpoint; + INSERT INTO t1 VALUES(randomblob(5000)); + } + execsql { + BEGIN; + SELECT count(*) FROM t1; + } db2 + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {12608 12608 64512} +do_test 7.5.2 { + execsql { PRAGMA wal_checkpoint } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {12608 12608 75776} +do_test 7.5.3.1 { + execsql { INSERT INTO t1 VALUES(randomblob(5000)) } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {14704 12608 75776} +do_test 7.5.3.2 { + execsql { INSERT INTO t1 VALUES(randomblob(5000)) } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {22040 12608 75776} +do_test 7.5.4 { + execsql END db2 + execsql { INSERT INTO t1 VALUES(randomblob(5000)) } + list [file size test.db-wal] [file size test.db-wal2] [file size test.db] +} {22040 12608 75776} + + +finish_test + ADDED test/wal2snapshot.test Index: test/wal2snapshot.test ================================================================== --- /dev/null +++ test/wal2snapshot.test @@ -0,0 +1,94 @@ +# 2018 December 5 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# This file implements regression tests for SQLite library. The +# focus of this file is testing the operation of the library in +# "PRAGMA journal_mode=WAL2" mode. 
+# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl + +set testprefix wal2snapshot +ifcapable !wal {finish_test ; return } +ifcapable !snapshot {finish_test; return} + +foreach {tn mode} {1 wal 2 wal2} { + reset_db + do_execsql_test $tn.1 "PRAGMA journal_mode = $mode" $mode + + do_execsql_test $tn.2 { + CREATE TABLE t1(a, b); + INSERT INTO t1 VALUES(1, 2); + INSERT INTO t1 VALUES(3, 4); + BEGIN; + } + + # Check that sqlite3_snapshot_get() is an error for a wal2 db. + # + if {$tn==1} { + do_test 1.3 { + set S [sqlite3_snapshot_get db main] + sqlite3_snapshot_free $S + } {} + } else { + do_test 2.3 { + list [catch { sqlite3_snapshot_get db main } msg] $msg + } {1 SQLITE_ERROR} + } + + # Check that sqlite3_snapshot_recover() is an error for a wal2 db. + # + do_execsql_test $tn.4 COMMIT + if {$tn==1} { + do_test 1.5 { + sqlite3_snapshot_recover db main + } {} + } else { + do_test 2.5 { + list [catch { sqlite3_snapshot_recover db main } msg] $msg + } {1 SQLITE_ERROR} + } + + # Check that sqlite3_snapshot_open() is an error for a wal2 db. 
+ # + if {$tn==1} { + do_test 1.6 { + execsql BEGIN + set SNAPSHOT [sqlite3_snapshot_get_blob db main] + sqlite3_snapshot_open_blob db main $SNAPSHOT + execsql COMMIT + } {} + } else { + + do_test 2.6.1 { + execsql BEGIN + set res [ + list [catch { sqlite3_snapshot_open_blob db main $SNAPSHOT } msg] $msg + ] + execsql COMMIT + set res + } {1 SQLITE_ERROR} + do_test 2.6.2 { + execsql BEGIN + execsql {SELECT * FROM sqlite_master} + set res [ + list [catch { sqlite3_snapshot_open_blob db main $SNAPSHOT } msg] $msg + ] + execsql COMMIT + set res + } {1 SQLITE_ERROR} + } +} + + +finish_test + + Index: test/walprotocol2.test ================================================================== --- test/walprotocol2.test +++ test/walprotocol2.test @@ -83,11 +83,11 @@ if {$lock=="0 1 lock exclusive"} { proc lock_callback {method filename handle lock} {} db2 eval { INSERT INTO x VALUES('x') } } } -db timeout 10 +db timeout 1100 do_catchsql_test 2.4 { BEGIN EXCLUSIVE; } {0 {}} do_execsql_test 2.5 { SELECT * FROM x; ADDED tool/tserver.c Index: tool/tserver.c ================================================================== --- /dev/null +++ tool/tserver.c @@ -0,0 +1,636 @@ +/* +** 2017 June 7 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +************************************************************************* +** +** Simple multi-threaded server used for informal testing of concurrency +** between connections in different threads. Listens for tcp/ip connections +** on port 9999 of the 127.0.0.1 interface only. To build: +** +** gcc -g $(TOP)/tool/tserver.c sqlite3.o -lpthread -o tserver +** +** To run using "x.db" as the db file: +** +** ./tserver x.db +** +** To connect, open a client socket on port 9999 and start sending commands. 
+** Commands are either SQL - which must be terminated by a semi-colon, or +** dot-commands, which must be terminated by a newline. If an SQL statement +** is seen, it is prepared and added to an internal list. +** +** Dot-commands are: +** +** .list Display all SQL statements in the list. +** .quit Disconnect. +** .run Run all SQL statements in the list. +** .repeats N Configure the number of repeats per ".run". +** .seconds N Configure the number of seconds to ".run" for. +** .mutex_commit Add a "COMMIT" protected by a g.commit_mutex +** to the current SQL. +** .stop Stop the tserver process - exit(0). +** .checkpoint N +** .integrity_check +** +** Example input: +** +** BEGIN; +** INSERT INTO t1 VALUES(randomblob(10), randomblob(100)); +** INSERT INTO t1 VALUES(randomblob(10), randomblob(100)); +** INSERT INTO t1 VALUES(randomblob(10), randomblob(100)); +** COMMIT; +** .repeats 100000 +** .run +** +*/ +#define TSERVER_PORTNUMBER 9999 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sqlite3.h" + +#define TSERVER_DEFAULT_CHECKPOINT_THRESHOLD 3900 + +/* Global variables */ +struct TserverGlobal { + char *zDatabaseName; /* Database used by this server */ + char *zVfs; + sqlite3_mutex *commit_mutex; + sqlite3 *db; /* Global db handle */ + + /* The following use native pthreads instead of a portable interface. This + ** is because a condition variable, as well as a mutex, is required. 
*/ + pthread_mutex_t ckpt_mutex; + pthread_cond_t ckpt_cond; + int nThreshold; /* Checkpoint when wal is this large */ + int bCkptRequired; /* True if wal checkpoint is required */ + int nRun; /* Number of clients in ".run" */ + int nWait; /* Number of clients waiting on ckpt_cond */ +}; + +static struct TserverGlobal g = {0}; + +typedef struct ClientSql ClientSql; +struct ClientSql { + sqlite3_stmt *pStmt; + int flags; +}; + +#define TSERVER_CLIENTSQL_MUTEX 0x0001 +#define TSERVER_CLIENTSQL_INTEGRITY 0x0002 + +typedef struct ClientCtx ClientCtx; +struct ClientCtx { + sqlite3 *db; /* Database handle for this client */ + int fd; /* Client fd */ + int nRepeat; /* Number of times to repeat SQL */ + int nSecond; /* Number of seconds to run for */ + ClientSql *aPrepare; /* Array of prepared statements */ + int nPrepare; /* Valid size of aPrepare[] */ + int nAlloc; /* Allocated size of aPrepare[] */ + + int nClientThreshold; /* Threshold for checkpointing */ + int bClientCkptRequired; /* True to do a checkpoint */ +}; + +static int is_eol(int i){ + return (i=='\n' || i=='\r'); +} +static int is_whitespace(int i){ + return (i==' ' || i=='\t' || is_eol(i)); +} + +/* +** Implementation of SQL scalar function usleep(). 
+*/ +static void usleepFunc( + sqlite3_context *context, + int argc, + sqlite3_value **argv +){ + int nUs; + sqlite3_vfs *pVfs = (sqlite3_vfs*)sqlite3_user_data(context); + assert( argc==1 ); + nUs = sqlite3_value_int64(argv[0]); + pVfs->xSleep(pVfs, nUs); +} + +static void trim_string(const char **pzStr, int *pnStr){ + const char *zStr = *pzStr; + int nStr = *pnStr; + + while( nStr>0 && is_whitespace(zStr[0]) ){ + zStr++; + nStr--; + } + while( nStr>0 && is_whitespace(zStr[nStr-1]) ){ + nStr--; + } + + *pzStr = zStr; + *pnStr = nStr; +} + +static int send_message(ClientCtx *p, const char *zFmt, ...){ + char *zMsg; + va_list ap; /* Vararg list */ + va_start(ap, zFmt); + int res = -1; + + zMsg = sqlite3_vmprintf(zFmt, ap); + if( zMsg ){ + res = write(p->fd, zMsg, strlen(zMsg)); + } + sqlite3_free(zMsg); + va_end(ap); + + return (res<0); +} + +static int handle_some_sql(ClientCtx *p, const char *zSql, int nSql){ + const char *zTail = zSql; + int nTail = nSql; + int rc = SQLITE_OK; + + while( rc==SQLITE_OK ){ + if( p->nPrepare>=p->nAlloc ){ + int nByte = (p->nPrepare+32) * sizeof(ClientSql); + ClientSql *aNew = sqlite3_realloc(p->aPrepare, nByte); + if( aNew ){ + memset(&aNew[p->nPrepare], 0, sizeof(ClientSql)*32); + p->aPrepare = aNew; + p->nAlloc = p->nPrepare+32; + }else{ + rc = SQLITE_NOMEM; + break; + } + } + rc = sqlite3_prepare_v2( + p->db, zTail, nTail, &p->aPrepare[p->nPrepare].pStmt, &zTail + ); + if( rc!=SQLITE_OK ){ + send_message(p, "error - %s (eec=%d)\n", sqlite3_errmsg(p->db), + sqlite3_extended_errcode(p->db) + ); + rc = 1; + break; + } + if( p->aPrepare[p->nPrepare].pStmt==0 ){ + break; + } + p->nPrepare++; + nTail = nSql - (zTail-zSql); + rc = send_message(p, "ok (%d SQL statements)\n", p->nPrepare); + } + + return rc; +} + +static sqlite3_int64 get_timer(void){ + struct timeval t; + gettimeofday(&t, 0); + return ((sqlite3_int64)t.tv_usec / 1000) + ((sqlite3_int64)t.tv_sec * 1000); +} + +static void clear_sql(ClientCtx *p){ + int j; + for(j=0; 
jnPrepare; j++){ + sqlite3_finalize(p->aPrepare[j].pStmt); + } + p->nPrepare = 0; +} + +/* +** The sqlite3_wal_hook() callback used by all client database connections. +*/ +static int clientWalHook(void *pArg, sqlite3 *db, const char *zDb, int nFrame){ + if( g.nThreshold>0 ){ + if( nFrame>=g.nThreshold ){ + g.bCkptRequired = 1; + } + }else{ + ClientCtx *pCtx = (ClientCtx*)pArg; + if( pCtx->nClientThreshold && nFrame>=pCtx->nClientThreshold ){ + pCtx->bClientCkptRequired = 1; + } + } + return SQLITE_OK; +} + +static int handle_run_command(ClientCtx *p){ + int i, j; + int nBusy = 0; + sqlite3_int64 t0 = get_timer(); + sqlite3_int64 t1 = t0; + int nT1 = 0; + int nTBusy1 = 0; + int rc = SQLITE_OK; + + pthread_mutex_lock(&g.ckpt_mutex); + g.nRun++; + pthread_mutex_unlock(&g.ckpt_mutex); + + + for(j=0; (p->nRepeat<=0 || jnRepeat) && rc==SQLITE_OK; j++){ + sqlite3_int64 t2; + + for(i=0; inPrepare && rc==SQLITE_OK; i++){ + sqlite3_stmt *pStmt = p->aPrepare[i].pStmt; + + /* If the MUTEX flag is set, grab g.commit_mutex before executing + ** the SQL statement (which is always "COMMIT" in this case). */ + if( p->aPrepare[i].flags & TSERVER_CLIENTSQL_MUTEX ){ + sqlite3_mutex_enter(g.commit_mutex); + } + + /* Execute the statement */ + if( p->aPrepare[i].flags & TSERVER_CLIENTSQL_INTEGRITY ){ + sqlite3_step(pStmt); + if( sqlite3_stricmp("ok", sqlite3_column_text(pStmt, 0)) ){ + send_message(p, "error - integrity_check failed: %s\n", + sqlite3_column_text(pStmt, 0) + ); + } + sqlite3_reset(pStmt); + } + while( sqlite3_step(pStmt)==SQLITE_ROW ); + rc = sqlite3_reset(pStmt); + + /* Relinquish the g.commit_mutex mutex if required. 
*/ + if( p->aPrepare[i].flags & TSERVER_CLIENTSQL_MUTEX ){ + sqlite3_mutex_leave(g.commit_mutex); + } + + if( (rc & 0xFF)==SQLITE_BUSY ){ + if( sqlite3_get_autocommit(p->db)==0 ){ + sqlite3_exec(p->db, "ROLLBACK", 0, 0, 0); + } + nBusy++; + rc = SQLITE_OK; + break; + } + else if( rc!=SQLITE_OK ){ + send_message(p, "error - %s (eec=%d)\n", sqlite3_errmsg(p->db), + sqlite3_extended_errcode(p->db) + ); + } + } + + t2 = get_timer(); + if( t2>=(t1+1000) ){ + int nMs = (t2 - t1); + int nDone = (j+1 - nBusy - nT1); + + rc = send_message( + p, "(%d done @ %d per second, %d busy)\n", + nDone, (1000*nDone + nMs/2) / nMs, nBusy - nTBusy1 + ); + t1 = t2; + nT1 = j+1 - nBusy; + nTBusy1 = nBusy; + if( p->nSecond>0 && (p->nSecond*1000)<=t1-t0 ) break; + } + + /* Global checkpoint handling. */ + if( g.nThreshold>0 ){ + pthread_mutex_lock(&g.ckpt_mutex); + if( rc==SQLITE_OK && g.bCkptRequired ){ + if( g.nWait==g.nRun-1 ){ + /* All other clients are already waiting on the condition variable. + ** Run the checkpoint, signal the condition and move on. */ + rc = sqlite3_wal_checkpoint(p->db, "main"); + g.bCkptRequired = 0; + pthread_cond_broadcast(&g.ckpt_cond); + }else{ + assert( g.nWaitbClientCkptRequired ){ + rc = sqlite3_wal_checkpoint(p->db, "main"); + if( rc==SQLITE_BUSY ) rc = SQLITE_OK; + assert( rc==SQLITE_OK ); + p->bClientCkptRequired = 0; + } + } + + if( rc==SQLITE_OK ){ + int nMs = (int)(get_timer() - t0); + send_message(p, "ok (%d/%d SQLITE_BUSY)\n", nBusy, j); + if( p->nRepeat<=0 ){ + send_message(p, "### ok %d busy %d ms %d\n", j-nBusy, nBusy, nMs); + } + } + clear_sql(p); + + pthread_mutex_lock(&g.ckpt_mutex); + g.nRun--; + pthread_mutex_unlock(&g.ckpt_mutex); + + return rc; +} + +static int handle_dot_command(ClientCtx *p, const char *zCmd, int nCmd){ + int n; + int rc = 0; + const char *z = &zCmd[1]; + const char *zArg; + int nArg; + + assert( zCmd[0]=='.' 
); + for(n=0; n<(nCmd-1); n++){ + if( is_whitespace(z[n]) ) break; + } + + zArg = &z[n]; + nArg = nCmd-n; + trim_string(&zArg, &nArg); + + if( n>=1 && n<=4 && 0==strncmp(z, "list", n) ){ + int i; + for(i=0; rc==0 && inPrepare; i++){ + const char *zSql = sqlite3_sql(p->aPrepare[i].pStmt); + int nSql = strlen(zSql); + trim_string(&zSql, &nSql); + rc = send_message(p, "%d: %.*s\n", i, nSql, zSql); + } + } + + else if( n>=1 && n<=4 && 0==strncmp(z, "quit", n) ){ + rc = -1; + } + + else if( n>=2 && n<=7 && 0==strncmp(z, "repeats", n) ){ + if( nArg ){ + p->nRepeat = strtol(zArg, 0, 0); + if( p->nRepeat>0 ) p->nSecond = 0; + } + rc = send_message(p, "ok (repeat=%d)\n", p->nRepeat); + } + + else if( n>=2 && n<=3 && 0==strncmp(z, "run", n) ){ + rc = handle_run_command(p); + } + + else if( n>=2 && n<=7 && 0==strncmp(z, "seconds", n) ){ + if( nArg ){ + p->nSecond = strtol(zArg, 0, 0); + if( p->nSecond>0 ) p->nRepeat = 0; + } + rc = send_message(p, "ok (seconds=%d)\n", p->nSecond); + } + + else if( n>=1 && n<=12 && 0==strncmp(z, "mutex_commit", n) ){ + rc = handle_some_sql(p, "COMMIT;", 7); + if( rc==SQLITE_OK ){ + p->aPrepare[p->nPrepare-1].flags |= TSERVER_CLIENTSQL_MUTEX; + } + } + + else if( n>=1 && n<=10 && 0==strncmp(z, "checkpoint", n) ){ + if( nArg ){ + p->nClientThreshold = strtol(zArg, 0, 0); + } + rc = send_message(p, "ok (checkpoint=%d)\n", p->nClientThreshold); + } + + else if( n>=2 && n<=4 && 0==strncmp(z, "stop", n) ){ + sqlite3_close(g.db); + exit(0); + } + + else if( n>=2 && n<=15 && 0==strncmp(z, "integrity_check", n) ){ + rc = handle_some_sql(p, "PRAGMA integrity_check;", 23); + if( rc==SQLITE_OK ){ + p->aPrepare[p->nPrepare-1].flags |= TSERVER_CLIENTSQL_INTEGRITY; + } + } + + else{ + send_message(p, + "unrecognized dot command: %.*s\n" + "should be \"list\", \"run\", \"repeats\", \"mutex_commit\", " + "\"checkpoint\", \"integrity_check\" or \"seconds\"\n", n, z + ); + rc = 1; + } + + return rc; +} + +static void *handle_client(void *pArg){ + char 
zCmd[32*1024]; /* Read buffer */ + int nCmd = 0; /* Valid bytes in zCmd[] */ + int res; /* Result of read() call */ + int rc = SQLITE_OK; + + ClientCtx ctx; + memset(&ctx, 0, sizeof(ClientCtx)); + + ctx.fd = (int)(intptr_t)pArg; + ctx.nRepeat = 1; + rc = sqlite3_open_v2(g.zDatabaseName, &ctx.db, + SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE, g.zVfs + ); + if( rc!=SQLITE_OK ){ + fprintf(stderr, "sqlite3_open(): %s\n", sqlite3_errmsg(ctx.db)); + return 0; + } + sqlite3_create_function( + ctx.db, "usleep", 1, SQLITE_UTF8, (void*)sqlite3_vfs_find(0), + usleepFunc, 0, 0 + ); + + /* Register the wal-hook with the new client connection */ + sqlite3_wal_hook(ctx.db, clientWalHook, (void*)&ctx); + + while( rc==SQLITE_OK ){ + int i; + int iStart; + int nConsume; + res = read(ctx.fd, &zCmd[nCmd], sizeof(zCmd)-nCmd-1); + if( res<=0 ) break; + nCmd += res; + if( nCmd>=sizeof(zCmd)-1 ){ + fprintf(stderr, "oversized (>32KiB) message\n"); + res = 0; + break; + } + zCmd[nCmd] = '\0'; + + do { + nConsume = 0; + + /* Gobble up any whitespace */ + iStart = 0; + while( is_whitespace(zCmd[iStart]) ) iStart++; + + if( zCmd[iStart]=='.' ){ + /* This is a dot-command. Search for end-of-line. */ + for(i=iStart; i0 ){ + nCmd = nCmd-nConsume; + if( nCmd>0 ){ + memmove(zCmd, &zCmd[nConsume], nCmd); + } + } + }while( rc==SQLITE_OK && nConsume>0 ); + } + + fprintf(stdout, "Client %d disconnects (rc=%d)\n", ctx.fd, rc); + fflush(stdout); + close(ctx.fd); + clear_sql(&ctx); + sqlite3_free(ctx.aPrepare); + sqlite3_close(ctx.db); + return 0; +} + +static void usage(const char *zExec){ + fprintf(stderr, "Usage: %s ?-vfs VFS? DATABASE\n", zExec); + exit(1); +} + +int main(int argc, char *argv[]) { + int sfd; + int rc; + int yes = 1; + struct sockaddr_in server; + int i; + + /* Ignore SIGPIPE. Otherwise the server exits if a client disconnects + ** abruptly. 
*/ + signal(SIGPIPE, SIG_IGN); + + g.nThreshold = TSERVER_DEFAULT_CHECKPOINT_THRESHOLD; + if( (argc%2) ) usage(argv[0]); + for(i=1; i<(argc-1); i+=2){ + int n = strlen(argv[i]); + if( n>=2 && 0==sqlite3_strnicmp("-walautocheckpoint", argv[i], n) ){ + g.nThreshold = strtol(argv[i+1], 0, 0); + }else + if( n>=2 && 0==sqlite3_strnicmp("-vfs", argv[i], n) ){ + g.zVfs = argv[i+1]; + } + } + g.zDatabaseName = argv[argc-1]; + + g.commit_mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_FAST); + pthread_mutex_init(&g.ckpt_mutex, 0); + pthread_cond_init(&g.ckpt_cond, 0); + + rc = sqlite3_open_v2(g.zDatabaseName, &g.db, + SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE, g.zVfs + ); + if( rc!=SQLITE_OK ){ + fprintf(stderr, "sqlite3_open(): %s\n", sqlite3_errmsg(g.db)); + return 1; + } + + rc = sqlite3_exec(g.db, "SELECT * FROM sqlite_master", 0, 0, 0); + if( rc!=SQLITE_OK ){ + fprintf(stderr, "sqlite3_exec(): %s\n", sqlite3_errmsg(g.db)); + return 1; + } + + sfd = socket(AF_INET, SOCK_STREAM, 0); + if( sfd<0 ){ + fprintf(stderr, "socket() failed\n"); + return 1; + } + + rc = setsockopt(sfd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)); + if( rc<0 ){ + perror("setsockopt"); + return 1; + } + + memset(&server, 0, sizeof(server)); + server.sin_family = AF_INET; + server.sin_addr.s_addr = inet_addr("127.0.0.1"); + server.sin_port = htons(TSERVER_PORTNUMBER); + + rc = bind(sfd, (struct sockaddr *)&server, sizeof(struct sockaddr)); + if( rc<0 ){ + fprintf(stderr, "bind() failed\n"); + return 1; + } + + rc = listen(sfd, 8); + if( rc<0 ){ + fprintf(stderr, "listen() failed\n"); + return 1; + } + + while( 1 ){ + pthread_t tid; + int cfd = accept(sfd, NULL, NULL); + if( cfd<0 ){ + perror("accept()"); + return 1; + } + + fprintf(stdout, "Client %d connects\n", cfd); + fflush(stdout); + rc = pthread_create(&tid, NULL, handle_client, (void*)(intptr_t)cfd); + if( rc!=0 ){ + perror("pthread_create()"); + return 1; + } + + pthread_detach(tid); + } + + return 0; +} ADDED tool/tserver_test.tcl Index: 
tool/tserver_test.tcl ================================================================== --- /dev/null +++ tool/tserver_test.tcl @@ -0,0 +1,297 @@ +#!/usr/bin/tclsh +# +# This script is used to run the performance test cases described in +# README-server-edition.html. +# + + +package require sqlite3 + +# Default values for command line switches: +set O(-database) "" +set O(-rows) [expr 5000000] +set O(-mode) wal2 +set O(-tserver) "./tserver" +set O(-seconds) 20 +set O(-writers) 1 +set O(-readers) 0 +set O(-integrity) 0 +set O(-verbose) 0 + + +proc error_out {err} { + puts stderr $err + exit -1 +} + +proc usage {} { + puts stderr "Usage: $::argv0 ?OPTIONS?" + puts stderr "" + puts stderr "Where OPTIONS are:" + puts stderr " -database (default: test.db)" + puts stderr " -mode (default: wal2)" + puts stderr " -rows (default: 5000000)" + puts stderr " -tserver (default: ./tserver)" + puts stderr " -seconds