Index: src/os_unix.c ================================================================== --- src/os_unix.c +++ src/os_unix.c @@ -246,10 +246,11 @@ #define UNIXFILE_PSOW 0x10 /* SQLITE_IOCAP_POWERSAFE_OVERWRITE */ #define UNIXFILE_DELETE 0x20 /* Delete on close */ #define UNIXFILE_URI 0x40 /* Filename might have query parameters */ #define UNIXFILE_NOLOCK 0x80 /* Do no file locking */ #define UNIXFILE_WARNED 0x0100 /* verifyDbFile() warnings issued */ +#define UNIXFILE_BLOCK 0x0200 /* Next SHM lock might block */ /* ** Include code that is common to all os_*.c files */ #include "os_common.h" @@ -4088,37 +4089,42 @@ ** ** Locks block if the mask is exactly UNIX_SHM_C and are non-blocking ** otherwise. */ static int unixShmSystemLock( - unixShmNode *pShmNode, /* Apply locks to this open shared-memory segment */ + unixFile *pFile, /* Open connection to the WAL file */ int lockType, /* F_UNLCK, F_RDLCK, or F_WRLCK */ int ofst, /* First byte of the locking range */ int n /* Number of bytes to lock */ ){ - struct flock f; /* The posix advisory locking structure */ - int rc = SQLITE_OK; /* Result code form fcntl() */ + unixShmNode *pShmNode; /* Apply locks to this open shared-memory segment */ + struct flock f; /* The posix advisory locking structure */ + int rc = SQLITE_OK; /* Result code form fcntl() */ /* Access to the unixShmNode object is serialized by the caller */ + pShmNode = pFile->pInode->pShmNode; assert( sqlite3_mutex_held(pShmNode->mutex) || pShmNode->nRef==0 ); /* Shared locks never span more than one byte */ assert( n==1 || lockType!=F_RDLCK ); /* Locks are within range */ assert( n>=1 && nh>=0 ){ + int lkType; /* Initialize the locking parameters */ memset(&f, 0, sizeof(f)); f.l_type = lockType; f.l_whence = SEEK_SET; f.l_start = ofst; f.l_len = n; - rc = osFcntl(pShmNode->h, F_SETLK, &f); + lkType = (pFile->ctrlFlags & UNIXFILE_BLOCK)!=0 ? F_SETLKW : F_SETLK; + rc = osFcntl(pShmNode->h, lkType, &f); rc = (rc!=(-1)) ? SQLITE_OK : SQLITE_BUSY; + pFile->ctrlFlags &= ~UNIXFILE_BLOCK; } /* Update the global lock state and do debug tracing */ #ifdef SQLITE_DEBUG { u16 mask; @@ -4324,17 +4330,17 @@ /* Check to see if another process is holding the dead-man switch. ** If not, truncate the file to zero length. */ rc = SQLITE_OK; - if( unixShmSystemLock(pShmNode, F_WRLCK, UNIX_SHM_DMS, 1)==SQLITE_OK ){ + if( unixShmSystemLock(pDbFd, F_WRLCK, UNIX_SHM_DMS, 1)==SQLITE_OK ){ if( robust_ftruncate(pShmNode->h, 0) ){ rc = unixLogError(SQLITE_IOERR_SHMOPEN, "ftruncate", zShmFilename); } } if( rc==SQLITE_OK ){ - rc = unixShmSystemLock(pShmNode, F_RDLCK, UNIX_SHM_DMS, 1); + rc = unixShmSystemLock(pDbFd, F_RDLCK, UNIX_SHM_DMS, 1); } if( rc ) goto shm_open_err; } } @@ -4562,11 +4568,11 @@ allMask |= pX->sharedMask; } /* Unlock the system-level locks */ if( (mask & allMask)==0 ){ - rc = unixShmSystemLock(pShmNode, F_UNLCK, ofst+UNIX_SHM_BASE, n); + rc = unixShmSystemLock(pDbFd, F_UNLCK, ofst+UNIX_SHM_BASE, n); }else{ rc = SQLITE_OK; } /* Undo the local locks */ @@ -4590,11 +4596,11 @@ } /* Get shared locks at the system level, if necessary */ if( rc==SQLITE_OK ){ if( (allShared & mask)==0 ){ - rc = unixShmSystemLock(pShmNode, F_RDLCK, ofst+UNIX_SHM_BASE, n); + rc = unixShmSystemLock(pDbFd, F_RDLCK, ofst+UNIX_SHM_BASE, n); }else{ rc = SQLITE_OK; } } @@ -4615,11 +4621,11 @@ /* Get the exclusive locks at the system level. Then if successful ** also mark the local connection as being locked. */ if( rc==SQLITE_OK ){ - rc = unixShmSystemLock(pShmNode, F_WRLCK, ofst+UNIX_SHM_BASE, n); + rc = unixShmSystemLock(pDbFd, F_WRLCK, ofst+UNIX_SHM_BASE, n); if( rc==SQLITE_OK ){ assert( (p->sharedMask & mask)==0 ); p->exclMask |= mask; } } @@ -7220,10 +7226,14 @@ ** This routine handles sqlite3_file_control() calls that are specific ** to proxy locking. */ static int proxyFileControl(sqlite3_file *id, int op, void *pArg){ switch( op ){ + case SQLITE_FCNTL_WAL_BLOCK: { + id->ctrlFlags |= UNIXFILE_BLOCK; + return SQLITE_OK; + } case SQLITE_FCNTL_GET_LOCKPROXYFILE: { unixFile *pFile = (unixFile*)id; if( pFile->pMethod == &proxyIoMethods ){ proxyLockingContext *pCtx = (proxyLockingContext*)pFile->lockingContext; proxyTakeConch(pFile); Index: src/sqlite.h.in ================================================================== --- src/sqlite.h.in +++ src/sqlite.h.in @@ -943,10 +943,17 @@ ** The [SQLITE_FCNTL_WIN32_SET_HANDLE] opcode is used for debugging. This ** opcode causes the xFileControl method to swap the file handle with the one ** pointed to by the pArg argument. This capability is used during testing ** and only needs to be supported when SQLITE_TEST is defined. ** +**
  • [[SQLITE_FCNTL_WAL_BLOCK]] +** The [SQLITE_FCNTL_WAL_BLOCK] is a single to the VFS layer that it might +** be advantageous to block on the next WAL lock if the lock is not immediately +** available. The WAL subsystem issues this ioctl() during some rare +** circumstances in order to fix a problem with priority inversion. +** Applications should not use this file-control. +** ** */ #define SQLITE_FCNTL_LOCKSTATE 1 #define SQLITE_FCNTL_GET_LOCKPROXYFILE 2 #define SQLITE_FCNTL_SET_LOCKPROXYFILE 3 @@ -967,10 +974,11 @@ #define SQLITE_FCNTL_TRACE 19 #define SQLITE_FCNTL_HAS_MOVED 20 #define SQLITE_FCNTL_SYNC 21 #define SQLITE_FCNTL_COMMIT_PHASETWO 22 #define SQLITE_FCNTL_WIN32_SET_HANDLE 23 +#define SQLITE_FCNTL_WAL_BLOCK 24 /* deprecated names */ #define SQLITE_GET_LOCKPROXYFILE SQLITE_FCNTL_GET_LOCKPROXYFILE #define SQLITE_SET_LOCKPROXYFILE SQLITE_FCNTL_SET_LOCKPROXYFILE #define SQLITE_LAST_ERRNO SQLITE_FCNTL_LAST_ERRNO Index: src/test_vfs.c ================================================================== --- src/test_vfs.c +++ src/test_vfs.c @@ -964,21 +964,20 @@ } static void tvfsShmBarrier(sqlite3_file *pFile){ TestvfsFd *pFd = tvfsGetFd(pFile); Testvfs *p = (Testvfs *)(pFd->pVfs->pAppData); + + if( p->pScript && p->mask&TESTVFS_SHMBARRIER_MASK ){ + const char *z = pFd->pShm ? pFd->pShm->zFile : ""; + tvfsExecTcl(p, "xShmBarrier", Tcl_NewStringObj(z, -1), pFd->pShmId, 0, 0); + } if( p->isFullshm ){ sqlite3OsShmBarrier(pFd->pReal); return; } - - if( p->pScript && p->mask&TESTVFS_SHMBARRIER_MASK ){ - tvfsExecTcl(p, "xShmBarrier", - Tcl_NewStringObj(pFd->pShm->zFile, -1), pFd->pShmId, 0, 0 - ); - } } static int tvfsShmUnmap( sqlite3_file *pFile, int deleteFlag @@ -1530,15 +1529,15 @@ sqlite3_vfs_register(pVfs, isDefault); return TCL_OK; bad_args: - Tcl_WrongNumArgs(interp, 1, objv, "VFSNAME ?-noshm BOOL? ?-default BOOL? ?-mxpathname INT? ?-szosfile INT? ?-iversion INT?"); + Tcl_WrongNumArgs(interp, 1, objv, "VFSNAME ?-noshm BOOL? ?-fullshm BOOL? ?-default BOOL? ?-mxpathname INT? ?-szosfile INT? ?-iversion INT?"); return TCL_ERROR; } int Sqlitetestvfs_Init(Tcl_Interp *interp){ Tcl_CreateObjCommand(interp, "testvfs", testvfs_cmd, 0, 0); return TCL_OK; } #endif Index: src/wal.c ================================================================== --- src/wal.c +++ src/wal.c @@ -786,13 +786,14 @@ if( pWal->exclusiveMode ) return; (void)sqlite3OsShmLock(pWal->pDbFd, lockIdx, 1, SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED); WALTRACE(("WAL%p: release SHARED-%s\n", pWal, walLockName(lockIdx))); } -static int walLockExclusive(Wal *pWal, int lockIdx, int n){ +static int walLockExclusive(Wal *pWal, int lockIdx, int n, int fBlock){ int rc; if( pWal->exclusiveMode ) return SQLITE_OK; + if( fBlock ) sqlite3OsFileControl(pWal->pDbFd, SQLITE_FCNTL_WAL_BLOCK, 0); rc = sqlite3OsShmLock(pWal->pDbFd, lockIdx, n, SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE); WALTRACE(("WAL%p: acquire EXCLUSIVE-%s cnt=%d %s\n", pWal, walLockName(lockIdx), n, rc ? "failed" : "ok")); VVA_ONLY( pWal->lockError = (u8)(rc!=SQLITE_OK && rc!=SQLITE_BUSY); ) @@ -1074,11 +1075,11 @@ assert( WAL_ALL_BUT_WRITE==WAL_WRITE_LOCK+1 ); assert( WAL_CKPT_LOCK==WAL_ALL_BUT_WRITE ); assert( pWal->writeLock ); iLock = WAL_ALL_BUT_WRITE + pWal->ckptLock; nLock = SQLITE_SHM_NLOCK - iLock; - rc = walLockExclusive(pWal, iLock, nLock); + rc = walLockExclusive(pWal, iLock, nLock, 0); if( rc ){ return rc; } WALTRACE(("WAL%p: recovery begin...\n", pWal)); @@ -1608,11 +1609,11 @@ int lockIdx, /* Offset of first byte to lock */ int n /* Number of bytes to lock */ ){ int rc; do { - rc = walLockExclusive(pWal, lockIdx, n); + rc = walLockExclusive(pWal, lockIdx, n, 0); }while( xBusy && rc==SQLITE_BUSY && xBusy(pBusyArg) ); return rc; } /* @@ -2041,11 +2042,11 @@ if( pWal->readOnly & WAL_SHM_RDONLY ){ if( SQLITE_OK==(rc = walLockShared(pWal, WAL_WRITE_LOCK)) ){ walUnlockShared(pWal, WAL_WRITE_LOCK); rc = SQLITE_READONLY_RECOVERY; } - }else if( SQLITE_OK==(rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1)) ){ + }else if( SQLITE_OK==(rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1, 1)) ){ pWal->writeLock = 1; if( SQLITE_OK==(rc = walIndexPage(pWal, 0, &page0)) ){ badHdr = walIndexTryHdr(pWal, pChanged); if( badHdr ){ /* If the wal-index header is still malformed even while holding @@ -2247,11 +2248,11 @@ { if( (pWal->readOnly & WAL_SHM_RDONLY)==0 && (mxReadMarkhdr.mxFrame || mxI==0) ){ for(i=1; iaReadMark[i] = pWal->hdr.mxFrame; mxI = i; walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1); break; @@ -2503,11 +2504,11 @@ } /* Only one writer allowed at a time. Get the write lock. Return ** SQLITE_BUSY if unable. */ - rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1); + rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1, 0); if( rc ){ return rc; } pWal->writeLock = 1; @@ -2648,11 +2649,11 @@ volatile WalCkptInfo *pInfo = walCkptInfo(pWal); assert( pInfo->nBackfill==pWal->hdr.mxFrame ); if( pInfo->nBackfill>0 ){ u32 salt1; sqlite3_randomness(4, &salt1); - rc = walLockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1); + rc = walLockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1, 0); if( rc==SQLITE_OK ){ /* If all readers are using WAL_READ_LOCK(0) (in other words if no ** readers are currently using the WAL), then the transactions ** frames will overwrite the start of the existing log. Update the ** wal-index header to reflect this. @@ -2973,11 +2974,11 @@ if( pWal->readOnly ) return SQLITE_READONLY; WALTRACE(("WAL%p: checkpoint begins\n", pWal)); /* IMPLEMENTATION-OF: R-62028-47212 All calls obtain an exclusive ** "checkpoint" lock on the database file. */ - rc = walLockExclusive(pWal, WAL_CKPT_LOCK, 1); + rc = walLockExclusive(pWal, WAL_CKPT_LOCK, 1, 0); if( rc ){ /* EVIDENCE-OF: R-10421-19736 If any other process is running a ** checkpoint operation at the same time, the lock cannot be obtained and ** SQLITE_BUSY is returned. ** EVIDENCE-OF: R-53820-33897 Even if there is a busy-handler configured, Index: test/lock_common.tcl ================================================================== --- test/lock_common.tcl +++ test/lock_common.tcl @@ -84,25 +84,55 @@ } # Execute a command in a child testfixture process, connected by two-way # channel $chan. Return the result of the command, or an error message. # -proc testfixture {chan cmd} { - puts $chan $cmd - puts $chan OVER - set r "" - while { 1 } { - set line [gets $chan] - if { $line == "OVER" } { - set res [lindex $r 1] - if { [lindex $r 0] } { error $res } - return $res - } - if {[eof $chan]} { - return "ERROR: Child process hung up" - } - append r $line +proc testfixture {chan cmd args} { + + if {[llength $args] == 0} { + fconfigure $chan -blocking 1 + puts $chan $cmd + puts $chan OVER + + set r "" + while { 1 } { + set line [gets $chan] + if { $line == "OVER" } { + set res [lindex $r 1] + if { [lindex $r 0] } { error $res } + return $res + } + if {[eof $chan]} { + return "ERROR: Child process hung up" + } + append r $line + } + return $r + } else { + set ::tfnb($chan) "" + fconfigure $chan -blocking 0 -buffering none + puts $chan $cmd + puts $chan OVER + fileevent $chan readable [list testfixture_script_cb $chan [lindex $args 0]] + return "" + } +} + +proc testfixture_script_cb {chan script} { + if {[eof $chan]} { + append ::tfnb($chan) "ERROR: Child process hung up" + set line "OVER" + } else { + set line [gets $chan] + } + + if { $line == "OVER" } { + uplevel #0 $script [list [lindex $::tfnb($chan) 1]] + unset ::tfnb($chan) + fileevent $chan readable "" + } else { + append ::tfnb($chan) $line } } proc testfixture_nb_cb {varname chan} { if {[eof $chan]} { ADDED test/walblock.test Index: test/walblock.test ================================================================== --- /dev/null +++ test/walblock.test @@ -0,0 +1,117 @@ +# 2015 Mar 17 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/lock_common.tcl +source $testdir/wal_common.tcl + +ifcapable !wal {finish_test ; return } +if {$::tcl_platform(platform)!="unix"} { finish_test ; return } +set testprefix walblock + +catch { db close } +testvfs tvfs -fullshm 1 +foreach f [glob test.db*] { forcedelete $f } + +sqlite3 db test.db -vfs tvfs +do_execsql_test 1.1.0 { + CREATE TABLE t1(x, y); + INSERT INTO t1 VALUES(1, 2); + INSERT INTO t1 VALUES(3, 4); + INSERT INTO t1 VALUES(5, 6); + PRAGMA journal_mode = wal; + INSERT INTO t1 VALUES(7, 8); +} {wal} + +do_test 1.1.1 { + lsort [glob test.db*] +} {test.db test.db-shm test.db-wal} + +do_test 1.1.2 { + set C [launch_testfixture] + testfixture $C { + sqlite3 db test.db + db eval { SELECT * FROM t1 } + } +} {1 2 3 4 5 6 7 8} + +do_test 1.1.3 { + set ::out [list] + testfixture $C { + db eval { SELECT * FROM t1 } + } [list set ::out] + set ::out +} {} + +do_test 1.1.4 { + vwait ::out + set ::out +} {1 2 3 4 5 6 7 8} + +# +# Test that if a read client cannot read the wal-index header because a +# write client is in the middle of updating it, the reader blocks until +# the writer finishes. +# +# 1. Open a write transaction using client [db] in this process. +# +# 2. Attempt to commit the write transaction. Intercept the xShmBarrier() +# call made by the writer between updating the two copies of the +# wal-index header. +# +# 3. Within the xShmBarrier() callback, make an asynchronous request to +# the other process to read from the database. It should block, as it +# cannot get read the wal-index header. +# +# 4. Still in xShmBarrier(), wait for 5 seconds. Check that the other +# process has not answered the request. +# +# 5: Finish committing the transaction. Then wait for 0.5 seconds more. +# Ensure that the second process has by this stage read the database +# and that the snapshot it read included the transaction committed in +# step (4). +# +do_execsql_test 1.2.1 { + BEGIN; + INSERT INTO t1 VALUES(9, 10); +} {} + +tvfs script barrier_callback +tvfs filter xShmBarrier +proc barrier_callback {method args} { + set ::out "" + testfixture $::C { db eval { SELECT * FROM t1 } } {set ::out} + + do_test "1.2.2.(blocking 5 seconds)" { + set ::continue 0 + after 5000 {set ::continue 1} + vwait ::continue + set ::out + } {} +} + +execsql COMMIT + +do_test "1.2.3.(blocking 0.5 seconds)" { + set ::continue 0 + after 500 {set ::continue 1} + vwait ::continue + set ::out +} {1 2 3 4 5 6 7 8 9 10} + + +finish_test + + + +