Index: src/pager.c ================================================================== --- src/pager.c +++ src/pager.c @@ -614,10 +614,11 @@ u8 useJournal; /* Use a rollback journal on this file */ u8 noReadlock; /* Do not bother to obtain readlocks */ u8 noSync; /* Do not sync the journal if true */ u8 fullSync; /* Do extra syncs of the journal for robustness */ u8 ckptSyncFlags; /* SYNC_NORMAL or SYNC_FULL for checkpoint */ + u8 walSyncFlags; /* SYNC_NORMAL or SYNC_FULL for wal writes */ u8 syncFlags; /* SYNC_NORMAL or SYNC_FULL otherwise */ u8 tempFile; /* zFilename is a temporary file */ u8 readOnly; /* True for a read-only database */ u8 memDb; /* True to inhibit all file I/O */ @@ -784,11 +785,11 @@ return (pPager->pWal!=0); } #else # define pagerUseWal(x) 0 # define pagerRollbackWal(x) 0 -# define pagerWalFrames(v,w,x,y,z) 0 +# define pagerWalFrames(v,w,x,y) 0 # define pagerOpenWalIfPresent(z) SQLITE_OK # define pagerBeginReadTransaction(z) SQLITE_OK #endif #ifndef NDEBUG @@ -2953,12 +2954,11 @@ */ static int pagerWalFrames( Pager *pPager, /* Pager object */ PgHdr *pList, /* List of frames to log */ Pgno nTruncate, /* Database size after this commit */ - int isCommit, /* True if this is a commit */ - int syncFlags /* Flags to pass to OsSync() (or 0) */ + int isCommit /* True if this is a commit */ ){ int rc; /* Return code */ #if defined(SQLITE_DEBUG) || defined(SQLITE_CHECK_PAGES) PgHdr *p; /* For looping over pages */ #endif @@ -2985,11 +2985,11 @@ assert( pList ); } if( pList->pgno==1 ) pager_write_changecounter(pList); rc = sqlite3WalFrames(pPager->pWal, - pPager->pageSize, pList, nTruncate, isCommit, syncFlags + pPager->pageSize, pList, nTruncate, isCommit, pPager->walSyncFlags ); if( rc==SQLITE_OK && pPager->pBackup ){ PgHdr *p; for(p=pList; p; p=p->pDirty){ sqlite3BackupUpdate(pPager->pBackup, p->pgno, (u8 *)p->pData); @@ -3365,10 +3365,14 @@ pPager->ckptSyncFlags = SQLITE_SYNC_FULL; }else{ pPager->syncFlags = SQLITE_SYNC_NORMAL; pPager->ckptSyncFlags = SQLITE_SYNC_NORMAL; } + pPager->walSyncFlags = pPager->syncFlags; + if( pPager->fullSync ){ + pPager->walSyncFlags |= WAL_SYNC_TRANSACTIONS; + } } #endif /* ** The following global variable is incremented whenever the library @@ -4192,11 +4196,11 @@ /* Write a single frame for this page to the log. */ if( subjRequiresPage(pPg) ){ rc = subjournalPage(pPg); } if( rc==SQLITE_OK ){ - rc = pagerWalFrames(pPager, pPg, 0, 0, 0); + rc = pagerWalFrames(pPager, pPg, 0, 0); } }else{ /* Sync the journal file if required. */ if( pPg->flags&PGHDR_NEED_SYNC @@ -4531,13 +4535,21 @@ pPager->changeCountDone = pPager->tempFile; pPager->memDb = (u8)memDb; pPager->readOnly = (u8)readOnly; assert( useJournal || pPager->tempFile ); pPager->noSync = pPager->tempFile; - pPager->fullSync = pPager->noSync ?0:1; - pPager->syncFlags = pPager->noSync ? 0 : SQLITE_SYNC_NORMAL; - pPager->ckptSyncFlags = pPager->syncFlags; + if( pPager->noSync ){ + assert( pPager->fullSync==0 ); + assert( pPager->syncFlags==0 ); + assert( pPager->walSyncFlags==0 ); + assert( pPager->ckptSyncFlags==0 ); + }else{ + pPager->fullSync = 1; + pPager->syncFlags = SQLITE_SYNC_NORMAL; + pPager->walSyncFlags = SQLITE_SYNC_NORMAL | WAL_SYNC_TRANSACTIONS; + pPager->ckptSyncFlags = SQLITE_SYNC_NORMAL; + } /* pPager->pFirst = 0; */ /* pPager->pFirstSynced = 0; */ /* pPager->pLast = 0; */ pPager->nExtra = (u16)nExtra; pPager->journalSizeLimit = SQLITE_DEFAULT_JOURNAL_SIZE_LIMIT; @@ -5763,13 +5775,11 @@ pList = pPageOne; pList->pDirty = 0; } assert( rc==SQLITE_OK ); if( ALWAYS(pList) ){ - rc = pagerWalFrames(pPager, pList, pPager->dbSize, 1, - (pPager->fullSync ? pPager->syncFlags : 0) - ); + rc = pagerWalFrames(pPager, pList, pPager->dbSize, 1); } sqlite3PagerUnref(pPageOne); if( rc==SQLITE_OK ){ sqlite3PcacheCleanAll(pPager->pPCache); } Index: src/wal.c ================================================================== --- src/wal.c +++ src/wal.c @@ -412,18 +412,21 @@ sqlite3_file *pDbFd; /* File handle for the database file */ sqlite3_file *pWalFd; /* File handle for WAL file */ u32 iCallback; /* Value to pass to log callback (or 0) */ i64 mxWalSize; /* Truncate WAL to this size upon reset */ int nWiData; /* Size of array apWiData */ + int szFirstBlock; /* Size of first block written to WAL file */ volatile u32 **apWiData; /* Pointer to wal-index content in memory */ u32 szPage; /* Database page size */ i16 readLock; /* Which read lock is being held. -1 for none */ + u8 syncFlags; /* Flags to use to sync header writes */ u8 exclusiveMode; /* Non-zero if connection is in exclusive mode */ u8 writeLock; /* True if in a write transaction */ u8 ckptLock; /* True if holding a checkpoint lock */ u8 readOnly; /* WAL_RDWR, WAL_RDONLY, or WAL_SHM_RDONLY */ u8 truncateOnCommit; /* True to truncate WAL file on commit */ + u8 noSyncHeader; /* Avoid WAL header fsyncs if true */ WalIndexHdr hdr; /* Wal-index header for current transaction */ const char *zWalName; /* Name of WAL file */ u32 nCkpt; /* Checkpoint sequence counter in the wal-header */ #ifdef SQLITE_DEBUG u8 lockError; /* True if a locking error has occurred */ @@ -1294,10 +1297,12 @@ if( rc!=SQLITE_OK ){ walIndexClose(pRet, 0); sqlite3OsClose(pRet->pWalFd); sqlite3_free(pRet); }else{ + int iDC = sqlite3OsDeviceCharacteristics(pRet->pWalFd); + if( iDC & SQLITE_IOCAP_SEQUENTIAL ){ pRet->noSyncHeader = 1; } *ppWal = pRet; WALTRACE(("WAL%d: opened\n", pRet)); } return rc; } @@ -2614,10 +2619,48 @@ testcase( rc==SQLITE_PROTOCOL ); testcase( rc==SQLITE_OK ); } return rc; } + +/* +** Write iAmt bytes of content into the WAL file beginning at iOffset. +** +** When crossing the boundary between the first and second sectors of the +** file, first write all of the first sector content, then fsync(), then +** continue writing content for the second sector. This ensures that +** the WAL header is overwritten before the first commit mark. +*/ +static int walWriteToLog( + Wal *pWal, /* WAL to write to */ + void *pContent, /* Content to be written */ + int iAmt, /* Number of bytes to write */ + sqlite3_int64 iOffset /* Start writing at this offset */ +){ + int rc; + if( iOffset>=pWal->szFirstBlock + || iOffset+iAmtszFirstBlock + || pWal->syncFlags==0 + ){ + /* The common and fast case. Just write the data. */ + rc = sqlite3OsWrite(pWal->pWalFd, pContent, iAmt, iOffset); + }else{ + /* If this write will cross the first sector boundary, it has to + ** be split it two with a sync in between. */ + int iFirstAmt = pWal->szFirstBlock - iOffset; + assert( iFirstAmt>0 && iFirstAmtpWalFd, pContent, iFirstAmt, iOffset); + if( rc ) return rc; + assert( pWal->syncFlags & (SQLITE_SYNC_NORMAL|SQLITE_SYNC_FULL) ); + rc = sqlite3OsSync(pWal->pWalFd, pWal->syncFlags); + if( rc ) return rc; + pContent = (void*)(iFirstAmt + (char*)pContent); + rc = sqlite3OsWrite(pWal->pWalFd, pContent, + iAmt-iFirstAmt, iOffset+iFirstAmt); + } + return rc; +} /* ** Write a set of frames to the log. The caller must hold the write-lock ** on the log file (obtained using sqlite3WalBeginWriteTransaction()). */ @@ -2683,10 +2726,20 @@ if( rc!=SQLITE_OK ){ return rc; } } assert( (int)pWal->szPage==szPage ); + + /* Setup information needed to do the WAL header sync */ + if( pWal->noSyncHeader ){ + assert( pWal->szFirstBlock==0 ); + assert( pWal->syncFlags==0 ); + }else{ + pWal->szFirstBlock = sqlite3OsSectorSize(pWal->pWalFd); + if( szPage>pWal->szFirstBlock ) pWal->szFirstBlock = szPage; + pWal->syncFlags = sync_flags & SQLITE_SYNC_MASK; + } /* Write the log file. */ for(p=pList; p; p=p->pDirty){ u32 nDbsize; /* Db-size field for frame header */ i64 iOffset; /* Write offset in log file */ @@ -2701,29 +2754,28 @@ if( (pData = sqlite3PagerCodec(p))==0 ) return SQLITE_NOMEM; #else pData = p->pData; #endif walEncodeFrame(pWal, p->pgno, nDbsize, pData, aFrame); - rc = sqlite3OsWrite(pWal->pWalFd, aFrame, sizeof(aFrame), iOffset); + rc = walWriteToLog(pWal, aFrame, sizeof(aFrame), iOffset); if( rc!=SQLITE_OK ){ return rc; } /* Write the page data */ - rc = sqlite3OsWrite(pWal->pWalFd, pData, szPage, iOffset+sizeof(aFrame)); + rc = walWriteToLog(pWal, pData, szPage, iOffset+sizeof(aFrame)); if( rc!=SQLITE_OK ){ return rc; } pLast = p; } /* Sync the log file if the 'isSync' flag was specified. */ - if( sync_flags ){ + if( isCommit && (sync_flags & WAL_SYNC_TRANSACTIONS)!=0 ){ i64 iSegment = sqlite3OsSectorSize(pWal->pWalFd); i64 iOffset = walFrameOffset(iFrame+1, szPage); - assert( isCommit ); assert( iSegment>0 ); iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment); while( iOffsetpData; #endif walEncodeFrame(pWal, pLast->pgno, nTruncate, pData, aFrame); /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */ - rc = sqlite3OsWrite(pWal->pWalFd, aFrame, sizeof(aFrame), iOffset); + rc = walWriteToLog(pWal, aFrame, sizeof(aFrame), iOffset); if( rc!=SQLITE_OK ){ return rc; } iOffset += WAL_FRAME_HDRSIZE; - rc = sqlite3OsWrite(pWal->pWalFd, pData, szPage, iOffset); + rc = walWriteToLog(pWal, pData, szPage, iOffset); if( rc!=SQLITE_OK ){ return rc; } nLast++; iOffset += szPage; } - rc = sqlite3OsSync(pWal->pWalFd, sync_flags); + rc = sqlite3OsSync(pWal->pWalFd, sync_flags & SQLITE_SYNC_MASK); } if( isCommit && pWal->truncateOnCommit && pWal->mxWalSize>=0 ){ i64 sz = pWal->mxWalSize; if( walFrameOffset(iFrame+nLast+1, szPage)>pWal->mxWalSize ){ Index: src/wal.h ================================================================== --- src/wal.h +++ src/wal.h @@ -85,10 +85,16 @@ int sqlite3WalSavepointUndo(Wal *pWal, u32 *aWalData); /* Write a frame or frames to the log. */ int sqlite3WalFrames(Wal *pWal, int, PgHdr *, Pgno, int, int); +/* Additional values that can be added to the sync_flags argument of +** sqlite3WalFrames(): +*/ +#define WAL_SYNC_TRANSACTIONS 0x20 /* Sync at the end of each transaction */ +#define SQLITE_SYNC_MASK 0x13 /* Mask off the SQLITE_SYNC_* values */ + /* Copy pages from the log to the database file */ int sqlite3WalCheckpoint( Wal *pWal, /* Write-ahead log connection */ int eMode, /* One of PASSIVE, FULL and RESTART */ int (*xBusy)(void*), /* Function to call when busy */ Index: test/wal2.test ================================================================== --- test/wal2.test +++ test/wal2.test @@ -1174,13 +1174,13 @@ #------------------------------------------------------------------------- # Test that "PRAGMA checkpoint_fullsync" appears to be working. # foreach {tn sql reslist} { - 1 { } {8 0 3 0 5 0} - 2 { PRAGMA checkpoint_fullfsync = 1 } {8 4 3 2 5 2} - 3 { PRAGMA checkpoint_fullfsync = 0 } {8 0 3 0 5 0} + 1 { } {10 0 4 0 6 0} + 2 { PRAGMA checkpoint_fullfsync = 1 } {10 4 4 2 6 2} + 3 { PRAGMA checkpoint_fullfsync = 0 } {10 0 4 0 6 0} } { faultsim_delete_and_reopen execsql {PRAGMA auto_vacuum = 0} execsql $sql @@ -1190,16 +1190,16 @@ set sqlite_fullsync_count 0 do_execsql_test wal2-14.$tn.2 { PRAGMA wal_autocheckpoint = 10; CREATE TABLE t1(a, b); -- 2 wal syncs - INSERT INTO t1 VALUES(1, 2); -- 1 wal sync + INSERT INTO t1 VALUES(1, 2); -- 2 wal sync PRAGMA wal_checkpoint; -- 1 wal sync, 1 db sync BEGIN; INSERT INTO t1 VALUES(3, 4); INSERT INTO t1 VALUES(5, 6); - COMMIT; -- 1 wal sync + COMMIT; -- 2 wal sync PRAGMA wal_checkpoint; -- 1 wal sync, 1 db sync } {10 0 5 5 0 2 2} do_test wal2-14.$tn.3 { cond_incr_sync_count 1 @@ -1231,26 +1231,26 @@ # PRAGMA checkpoint_fullsync # PRAGMA fullfsync # PRAGMA synchronous # -foreach {tn settings commit_sync ckpt_sync} { - 1 {0 0 off} {0 0} {0 0} - 2 {0 0 normal} {0 0} {2 0} - 3 {0 0 full} {1 0} {2 0} - - 4 {0 1 off} {0 0} {0 0} - 5 {0 1 normal} {0 0} {0 2} - 6 {0 1 full} {0 1} {0 2} - - 7 {1 0 off} {0 0} {0 0} - 8 {1 0 normal} {0 0} {0 2} - 9 {1 0 full} {1 0} {0 2} - - 10 {1 1 off} {0 0} {0 0} - 11 {1 1 normal} {0 0} {0 2} - 12 {1 1 full} {0 1} {0 2} +foreach {tn settings restart_sync commit_sync ckpt_sync} { + 1 {0 0 off} {0 0} {0 0} {0 0} + 2 {0 0 normal} {1 0} {0 0} {2 0} + 3 {0 0 full} {2 0} {1 0} {2 0} + + 4 {0 1 off} {0 0} {0 0} {0 0} + 5 {0 1 normal} {0 1} {0 0} {0 2} + 6 {0 1 full} {0 2} {0 1} {0 2} + + 7 {1 0 off} {0 0} {0 0} {0 0} + 8 {1 0 normal} {1 0} {0 0} {0 2} + 9 {1 0 full} {2 0} {1 0} {0 2} + + 10 {1 1 off} {0 0} {0 0} {0 0} + 11 {1 1 normal} {0 1} {0 0} {0 2} + 12 {1 1 full} {0 2} {0 1} {0 2} } { forcedelete test.db testvfs tvfs -default 1 tvfs filter xSync @@ -1260,31 +1260,39 @@ } sqlite3 db test.db do_execsql_test 15.$tn.1 " CREATE TABLE t1(x); + PRAGMA wal_autocheckpoint = OFF; PRAGMA journal_mode = WAL; PRAGMA checkpoint_fullfsync = [lindex $settings 0]; PRAGMA fullfsync = [lindex $settings 1]; PRAGMA synchronous = [lindex $settings 2]; - " {wal} + " {0 wal} do_test 15.$tn.2 { set sync(normal) 0 set sync(full) 0 execsql { INSERT INTO t1 VALUES('abc') } list $::sync(normal) $::sync(full) - } $commit_sync + } $restart_sync do_test 15.$tn.3 { + set sync(normal) 0 + set sync(full) 0 + execsql { INSERT INTO t1 VALUES('abc') } + list $::sync(normal) $::sync(full) + } $commit_sync + + do_test 15.$tn.4 { set sync(normal) 0 set sync(full) 0 execsql { INSERT INTO t1 VALUES('def') } list $::sync(normal) $::sync(full) } $commit_sync - do_test 15.$tn.4 { + do_test 15.$tn.5 { set sync(normal) 0 set sync(full) 0 execsql { PRAGMA wal_checkpoint } list $::sync(normal) $::sync(full) } $ckpt_sync Index: test/wal3.test ================================================================== --- test/wal3.test +++ test/wal3.test @@ -215,10 +215,11 @@ T script sync_counter sqlite3 db test.db -vfs T execsql "PRAGMA synchronous = $syncmode" execsql { PRAGMA journal_mode = WAL } + execsql { CREATE TABLE filler(a,b,c); } set ::syncs [list] T filter xSync execsql { CREATE TABLE x(y);