ADDED doc/F2FS.txt Index: doc/F2FS.txt ================================================================== --- /dev/null +++ doc/F2FS.txt @@ -0,0 +1,87 @@ + +SQLite's OS layer contains the following definitions used in F2FS related +calls: + +#define F2FS_IOCTL_MAGIC 0xf5 +#define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1) +#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2) +#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) +#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) +#define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, u32) +#define F2FS_FEATURE_ATOMIC_WRITE 0x0004 + +After opening a database file on Linux (including Android), SQLite determines +whether or not a file supports F2FS atomic commits as follows: + + u32 flags = 0; + rc = ioctl(fd, F2FS_IOC_GET_FEATURES, &flags); + if( rc==0 && (flags & F2FS_FEATURE_ATOMIC_WRITE) ){ + /* File supports F2FS atomic commits */ + }else{ + /* File does NOT support F2FS atomic commits */ + } + +where "fd" is the file-descriptor open on the database file. + +Usually, when writing to a database file that supports atomic commits, SQLite +accumulates the entire transaction in heap memory, deferring all writes to the +db file until the transaction is committed. + +When it is time to commit a transaction on a file that supports atomic +commits, SQLite does: + + /* Take an F_WRLCK lock on the database file. This prevents any other + ** SQLite clients from reading or writing the file until the lock + ** is released. 
*/ + rc = fcntl(fd, F_SETLK, ...); + if( rc!=0 ) goto failed; + + rc = ioctl(fd, F2FS_IOC_START_ATOMIC_WRITE); + if( rc!=0 ) goto fallback_to_legacy_journal_commit; + + foreach (dirty page){ + rc = write(fd, ...dirty page...); + if( rc!=0 ){ + ioctl(fd, F2FS_IOC_ABORT_VOLATILE_WRITE); + goto fallback_to_legacy_journal_commit; + } + } + + rc = ioctl(fd, F2FS_IOC_COMMIT_ATOMIC_WRITE); + if( rc!=0 ){ + ioctl(fd, F2FS_IOC_ABORT_VOLATILE_WRITE); + goto fallback_to_legacy_journal_commit; + } + + /* If we get here, the transaction has been successfully + ** committed to persistent storage. The following call + ** relinquishes the F_WRLCK lock. */ + fcntl(fd, F_SETLK, ...); + +Assumptions: + +1. After either of the F2FS_IOC_ABORT_VOLATILE_WRITE calls returns, + the database file is in the state that it was in before + F2FS_IOC_START_ATOMIC_WRITE was invoked. Even if the ioctl() + fails - we're ignoring the return code. + + This is true regardless of the type of error that occurred in + ioctl() or write(). + +2. If the system fails before the F2FS_IOC_COMMIT_ATOMIC_WRITE is + completed, then following a reboot the database file is in the + state that it was in before F2FS_IOC_START_ATOMIC_WRITE was invoked. + Or, if the write was committed right before the system failed, in a + state indicating that all write() calls were successfully committed + to persistent storage before the failure occurred. + +3. If the process crashes before the F2FS_IOC_COMMIT_ATOMIC_WRITE is + completed then the file is automatically restored to the state that + it was in before F2FS_IOC_START_ATOMIC_WRITE was called. This occurs + before the POSIX advisory lock is automatically dropped - there is + no chance that another client will be able to read the file in a + half-committed state before the rollback operation occurs. 
+ + + + Index: src/pager.c ================================================================== --- src/pager.c +++ src/pager.c @@ -6380,13 +6380,14 @@ /* If this is an in-memory db, or no pages have been written to, or this ** function has already been called, it is mostly a no-op. However, any ** backup in progress needs to be restarted. */ sqlite3BackupRestart(pPager->pBackup); }else{ + PgHdr *pList; if( pagerUseWal(pPager) ){ - PgHdr *pList = sqlite3PcacheDirtyList(pPager->pPCache); PgHdr *pPageOne = 0; + pList = sqlite3PcacheDirtyList(pPager->pPCache); if( pList==0 ){ /* Must have at least one page for the WAL commit flag. ** Ticket [2d1a5c67dfc2363e44f29d9bbd57f] 2011-05-18 */ rc = sqlite3PagerGet(pPager, 1, &pPageOne, 0); pList = pPageOne; @@ -6405,11 +6406,11 @@ ** should be used. No rollback journal is created if batch-atomic-write ** is enabled. */ sqlite3_file *fd = pPager->fd; #ifdef SQLITE_ENABLE_BATCH_ATOMIC_WRITE - const int bBatch = zMaster==0 /* An SQLITE_IOCAP_BATCH_ATOMIC commit */ + int bBatch = zMaster==0 /* An SQLITE_IOCAP_BATCH_ATOMIC commit */ && (sqlite3OsDeviceCharacteristics(fd) & SQLITE_IOCAP_BATCH_ATOMIC) && !pPager->noSync && sqlite3JournalIsInMemory(pPager->jfd); #else # define bBatch 0 @@ -6462,19 +6463,20 @@ if( rc==SQLITE_OK ){ rc = pager_incr_changecounter(pPager, 0); } } } -#else +#else /* SQLITE_ENABLE_ATOMIC_WRITE */ #ifdef SQLITE_ENABLE_BATCH_ATOMIC_WRITE if( zMaster ){ rc = sqlite3JournalCreate(pPager->jfd); if( rc!=SQLITE_OK ) goto commit_phase_one_exit; + assert( bBatch==0 ); } #endif rc = pager_incr_changecounter(pPager, 0); -#endif +#endif /* !SQLITE_ENABLE_ATOMIC_WRITE */ if( rc!=SQLITE_OK ) goto commit_phase_one_exit; /* Write the master journal name into the journal file. If a master ** journal file name has already been written to the journal file, ** or if zMaster is NULL (no master journal), then this call is a no-op. @@ -6494,27 +6496,39 @@ ** xSync() call will be changed to a no-op by the OS anyhow. 
*/ rc = syncJournal(pPager, 0); if( rc!=SQLITE_OK ) goto commit_phase_one_exit; + pList = sqlite3PcacheDirtyList(pPager->pPCache); +#ifdef SQLITE_ENABLE_BATCH_ATOMIC_WRITE if( bBatch ){ - /* The pager is now in DBMOD state. But regardless of what happens - ** next, attempting to play the journal back into the database would - ** be unsafe. Close it now to make sure that does not happen. */ - sqlite3OsClose(pPager->jfd); rc = sqlite3OsFileControl(fd, SQLITE_FCNTL_BEGIN_ATOMIC_WRITE, 0); - if( rc!=SQLITE_OK ) goto commit_phase_one_exit; - } - rc = pager_write_pagelist(pPager,sqlite3PcacheDirtyList(pPager->pPCache)); - if( bBatch ){ if( rc==SQLITE_OK ){ - rc = sqlite3OsFileControl(fd, SQLITE_FCNTL_COMMIT_ATOMIC_WRITE, 0); + rc = pager_write_pagelist(pPager, pList); + if( rc==SQLITE_OK ){ + rc = sqlite3OsFileControl(fd, SQLITE_FCNTL_COMMIT_ATOMIC_WRITE, 0); + } + if( rc!=SQLITE_OK ){ + sqlite3OsFileControlHint(fd, SQLITE_FCNTL_ROLLBACK_ATOMIC_WRITE, 0); + } } - if( rc!=SQLITE_OK ){ - sqlite3OsFileControlHint(fd, SQLITE_FCNTL_ROLLBACK_ATOMIC_WRITE, 0); + + if( (rc&0xFF)==SQLITE_IOERR && rc!=SQLITE_IOERR_NOMEM ){ + rc = sqlite3JournalCreate(pPager->jfd); + if( rc!=SQLITE_OK ){ + sqlite3OsClose(pPager->jfd); + } + bBatch = 0; + }else{ + sqlite3OsClose(pPager->jfd); } } +#endif /* SQLITE_ENABLE_BATCH_ATOMIC_WRITE */ + + if( bBatch==0 && rc==SQLITE_OK ){ + rc = pager_write_pagelist(pPager, pList); + } if( rc!=SQLITE_OK ){ assert( rc!=SQLITE_IOERR_BLOCKED ); goto commit_phase_one_exit; } Index: src/test_vfs.c ================================================================== --- src/test_vfs.c +++ src/test_vfs.c @@ -131,12 +131,13 @@ #define TESTVFS_FULLPATHNAME_MASK 0x00008000 #define TESTVFS_READ_MASK 0x00010000 #define TESTVFS_UNLOCK_MASK 0x00020000 #define TESTVFS_LOCK_MASK 0x00040000 #define TESTVFS_CKLOCK_MASK 0x00080000 +#define TESTVFS_FCNTL_MASK 0x00100000 -#define TESTVFS_ALL_MASK 0x000FFFFF +#define TESTVFS_ALL_MASK 0x001FFFFF #define TESTVFS_MAX_PAGES 1024 /* @@ 
-515,11 +516,12 @@ /* ** File control method. For custom operations on an tvfs-file. */ static int tvfsFileControl(sqlite3_file *pFile, int op, void *pArg){ - TestvfsFd *p = tvfsGetFd(pFile); + TestvfsFd *pFd = tvfsGetFd(pFile); + Testvfs *p = (Testvfs *)pFd->pVfs->pAppData; if( op==SQLITE_FCNTL_PRAGMA ){ char **argv = (char**)pArg; if( sqlite3_stricmp(argv[1],"error")==0 ){ int rc = SQLITE_ERROR; if( argv[2] ){ @@ -533,15 +535,38 @@ if( z[0] ) argv[0] = sqlite3_mprintf("%s", z); } return rc; } if( sqlite3_stricmp(argv[1], "filename")==0 ){ - argv[0] = sqlite3_mprintf("%s", p->zFilename); + argv[0] = sqlite3_mprintf("%s", pFd->zFilename); return SQLITE_OK; } } - return sqlite3OsFileControl(p->pReal, op, pArg); + if( p->pScript && (p->mask&TESTVFS_FCNTL_MASK) ){ + struct Fcntl { + int iFnctl; + const char *zFnctl; + } aF[] = { + { SQLITE_FCNTL_BEGIN_ATOMIC_WRITE, "BEGIN_ATOMIC_WRITE" }, + { SQLITE_FCNTL_COMMIT_ATOMIC_WRITE, "COMMIT_ATOMIC_WRITE" }, + }; + int i; + for(i=0; izFilename, -1), + Tcl_NewStringObj(aF[i].zFnctl, -1), + 0, 0 + ); + tvfsResultCode(p, &rc); + if( rc ) return rc; + } + } + return sqlite3OsFileControl(pFd->pReal, op, pArg); } /* ** Return the sector-size in bytes for an tvfs-file. */ @@ -1158,10 +1183,11 @@ { "xAccess", TESTVFS_ACCESS_MASK }, { "xFullPathname", TESTVFS_FULLPATHNAME_MASK }, { "xUnlock", TESTVFS_UNLOCK_MASK }, { "xLock", TESTVFS_LOCK_MASK }, { "xCheckReservedLock", TESTVFS_CKLOCK_MASK }, + { "xFileControl", TESTVFS_FCNTL_MASK }, }; Tcl_Obj **apElem = 0; int nElem = 0; int mask = 0; if( objc!=3 ){ ADDED test/atomic2.test Index: test/atomic2.test ================================================================== --- /dev/null +++ test/atomic2.test @@ -0,0 +1,95 @@ +# 2018-07-15 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. 
+# May you share freely, never taking more than you give. +# +#*********************************************************************** +# This file implements regression tests for SQLite library. The +# focus of this file is testing that if an IO error is encountered +# as part of an atomic F2FS commit, an attempt is made to commit the +# transaction using a legacy journal commit. +# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +source $testdir/malloc_common.tcl +set ::testprefix atomic2 + +db close +if {[atomic_batch_write test.db]==0} { + puts "No f2fs atomic-batch-write support. Skipping tests..." + finish_test + return +} + +reset_db + +do_execsql_test 1.0 { + CREATE TABLE t1(x, y); + CREATE INDEX i1x ON t1(x); + CREATE INDEX i2x ON t1(y); + + WITH s(i) AS ( SELECT 1 UNION ALL SELECT i+1 FROM s WHERE i<100 ) + INSERT INTO t1 SELECT randomblob(400), randomblob(400) FROM s; +} + +set setup [list \ + -injectstart at_injectstart \ + -injectstop at_injectstop \ +] + +set ::at_fail 0 +set ::at_nfail 0 + +proc at_injectstart {iFail} { + set ::at_fail $iFail + set ::at_nfail 0 +} +proc at_injectstop {} { + set ::at_fail 0 + return $::at_nfail +} + +proc at_vfs_callback {method file z args} { + if {$::at_fail>0} { + incr ::at_fail -1 + if {$::at_fail==0} { + incr ::at_nfail + return SQLITE_IOERR + } elseif {$method=="xFileControl" && $z=="COMMIT_ATOMIC_WRITE"} { + set ::at_fail 0 + } + } + return SQLITE_OK +} + +testvfs tvfs -default 1 +tvfs script at_vfs_callback +tvfs filter {xFileControl xWrite} + +faultsim_save_and_close + +do_one_faultsim_test 2.0 {*}$setup -prep { + faultsim_restore_and_reopen +} -body { + execsql { + WITH s(i) AS ( SELECT 1 UNION ALL SELECT i+1 FROM s WHERE i<100 ) + INSERT INTO t1 SELECT randomblob(400), randomblob(400) FROM s; + } +} -test { + faultsim_test_result {0 {}} + + set res [execsql {SELECT count(*) FROM t1; PRAGMA integrity_check}] + if {$res!="200 ok"} { + error "expected {200 ok}, got $res" + } +} + +db close 
+tvfs delete + +finish_test