SQLite

Changes On Branch more-aggressive-wal-recovery
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Changes In Branch more-aggressive-wal-recovery Excluding Merge-Ins

This is equivalent to a diff from 73fecc688a to 3549faefea

2020-07-28
17:29
If a writer crashes in WAL mode and leave the SHM file in an inconsistent state, subsequent transactions are now able to recover the SHM file even if there are active read transactions. (check-in: ee8a108058 user: drh tags: trunk)
2020-07-25
20:16
When trying to read the index header, accept either of the two headers that have a valid checksum after 15 attempts to get exactly matching headers fail. (Closed-Leaf check-in: 3549faefea user: drh tags: more-aggressive-wal-recovery)
20:16
Allow a wal mode recovery to proceed even if there are readers. (check-in: 74374aebf9 user: dan tags: unlocked-recovery)
2020-07-24
13:49
Merge recent changes from trunk. (check-in: 22e8e6901a user: drh tags: larger-databases)
11:01
Remove a surplus space from a comment (check-in: 73fecc688a user: drh tags: trunk)
09:17
Fix other potentiall pointer aliasing problems associated with subclassing of the sqlite3_file object for various VFS implementations. (check-in: 270ac1a0f2 user: drh tags: trunk)

Changes to src/wal.c.
2132
2133
2134
2135
2136
2137
2138













2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157

2158
2159
2160
2161
2162
2163
2164
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169

2170
2171
2172
2173
2174
2175
2176
2177







+
+
+
+
+
+
+
+
+
+
+
+
+


















-
+







    }
    WALTRACE(("WAL%p: closed\n", pWal));
    sqlite3_free((void *)pWal->apWiData);
    sqlite3_free(pWal);
  }
  return rc;
}

/*
** Return true if the wal-index header seems valid based on the
** checksum.
*/
static int walIndexHdrValid(WalIndexHdr *p){
  u32 aCksum[2];
  if( p->isInit==0 ) return 0;
  walChecksumBytes(1, (u8*)p, sizeof(*p)-sizeof(p->aCksum), 0, aCksum);
  if( aCksum[0]!=p->aCksum[0] ) return 0;
  if( aCksum[1]!=p->aCksum[1] ) return 0;
  return 1;
}

/*
** Try to read the wal-index header.  Return 0 on success and 1 if
** there is a problem.
**
** The wal-index is in shared memory.  Another thread or process might
** be writing the header at the same time this procedure is trying to
** read it, which might result in inconsistency.  A dirty read is detected
** by verifying that both copies of the header are the same and also by
** a checksum on the header.
**
** If and only if the read is consistent and the header is different from
** pWal->hdr, then pWal->hdr is updated to the content of the new header
** and *pChanged is set to 1.
**
** If the checksum cannot be verified return non-zero. If the header
** is read successfully and the checksum verified, return zero.
*/
static SQLITE_NO_TSAN int walIndexTryHdr(Wal *pWal, int *pChanged){
static SQLITE_NO_TSAN int walIndexTryHdr(Wal *pWal, int *pChanged, int cnt){
  u32 aCksum[2];                  /* Checksum on the header content */
  WalIndexHdr h1, h2;             /* Two copies of the header content */
  WalIndexHdr volatile *aHdr;     /* Header in shared memory */

  /* The first page of the wal-index must be mapped at this point. */
  assert( pWal->nWiData>0 && pWal->apWiData[0] );

2175
2176
2177
2178
2179
2180
2181
2182

2183
2184
2185
2186
2187
2188




2189
2190
2191
2192
2193
2194

2195
2196
2197
2198
2199
2200
2201
2202
2188
2189
2190
2191
2192
2193
2194

2195
2196
2197
2198
2199


2200
2201
2202
2203


2204



2205

2206
2207
2208
2209
2210
2211
2212







-
+




-
-
+
+
+
+
-
-

-
-
-
+
-







  ** give false-positive warnings about these accesses because the tools do not
  ** account for the double-read and the memory barrier. The use of mutexes
  ** here would be problematic as the memory being accessed is potentially
  ** shared among multiple processes and not all mutex implementions work
  ** reliably in that environment.
  */
  aHdr = walIndexHdr(pWal);
  memcpy(&h1, (void *)&aHdr[0], sizeof(h1)); /* Possible TSAN false-positive */
  memcpy(&h1, (void *)&aHdr[0], sizeof(h1));
  walShmBarrier(pWal);
  memcpy(&h2, (void *)&aHdr[1], sizeof(h2));

  if( memcmp(&h1, &h2, sizeof(h1))!=0 ){
    return 1;   /* Dirty read */
  }  
    if( cnt<15 ) return 1;
    if( !walIndexHdrValid(&h1) ){
      memcpy(&h1, &h2, sizeof(h1));
    }
  if( h1.isInit==0 ){
    return 1;   /* Malformed header - probably all zeros */
  }
  walChecksumBytes(1, (u8*)&h1, sizeof(h1)-sizeof(h1.aCksum), 0, aCksum);
  if( aCksum[0]!=h1.aCksum[0] || aCksum[1]!=h1.aCksum[1] ){
    return 1;   /* Checksum does not match */
  if( !walIndexHdrValid(&h1) ) return 1;
  }

  if( memcmp(&pWal->hdr, &h1, sizeof(WalIndexHdr)) ){
    *pChanged = 1;
    memcpy(&pWal->hdr, &h1, sizeof(WalIndexHdr));
    pWal->szPage = (pWal->hdr.szPage&0xfe00) + ((pWal->hdr.szPage&0x0001)<<16);
    testcase( pWal->szPage<=32768 );
    testcase( pWal->szPage>=65536 );
2220
2221
2222
2223
2224
2225
2226
2227

2228
2229
2230
2231
2232
2233
2234
2230
2231
2232
2233
2234
2235
2236

2237
2238
2239
2240
2241
2242
2243
2244







-
+







** Set *pChanged to 1 if the wal-index header value in pWal->hdr is
** changed by this operation.  If pWal->hdr is unchanged, set *pChanged
** to 0.
**
** If the wal-index header is successfully read, return SQLITE_OK. 
** Otherwise an SQLite error code.
*/
static int walIndexReadHdr(Wal *pWal, int *pChanged){
static int walIndexReadHdr(Wal *pWal, int *pChanged, int cnt){
  int rc;                         /* Return code */
  int badHdr;                     /* True if a header read failed */
  volatile u32 *page0;            /* Chunk of wal-index containing header */

  /* Ensure that page 0 of the wal-index (the page that contains the 
  ** wal-index header) is mapped. Return early if an error occurs here.
  */
2260
2261
2262
2263
2264
2265
2266
2267

2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283

2284
2285
2286
2287
2288
2289
2290
2270
2271
2272
2273
2274
2275
2276

2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292

2293
2294
2295
2296
2297
2298
2299
2300







-
+















-
+







  assert( page0!=0 || pWal->writeLock==0 );

  /* If the first page of the wal-index has been mapped, try to read the
  ** wal-index header immediately, without holding any lock. This usually
  ** works, but may fail if the wal-index header is corrupt or currently 
  ** being modified by another thread or process.
  */
  badHdr = (page0 ? walIndexTryHdr(pWal, pChanged) : 1);
  badHdr = (page0 ? walIndexTryHdr(pWal, pChanged, 0) : 1);

  /* If the first attempt failed, it might have been due to a race
  ** with a writer.  So get a WRITE lock and try again.
  */
  if( badHdr ){
    if( pWal->bShmUnreliable==0 && (pWal->readOnly & WAL_SHM_RDONLY) ){
      if( SQLITE_OK==(rc = walLockShared(pWal, WAL_WRITE_LOCK)) ){
        walUnlockShared(pWal, WAL_WRITE_LOCK);
        rc = SQLITE_READONLY_RECOVERY;
      }
    }else{
      int bWriteLock = pWal->writeLock;
      if( bWriteLock || SQLITE_OK==(rc = walLockWriter(pWal)) ){
        pWal->writeLock = 1;
        if( SQLITE_OK==(rc = walIndexPage(pWal, 0, &page0)) ){
          badHdr = walIndexTryHdr(pWal, pChanged);
          badHdr = walIndexTryHdr(pWal, pChanged, cnt);
          if( badHdr ){
            /* If the wal-index header is still malformed even while holding
            ** a WRITE lock, it can only mean that the header is corrupted and
            ** needs to be reconstructed.  So run recovery to do exactly that.
            */
            rc = walIndexRecover(pWal);
            *pChanged = 1;
2582
2583
2584
2585
2586
2587
2588
2589

2590
2591
2592
2593
2594
2595
2596
2592
2593
2594
2595
2596
2597
2598

2599
2600
2601
2602
2603
2604
2605
2606







-
+







    if( cnt>=10 ) nDelay = (cnt-9)*(cnt-9)*39;
    sqlite3OsSleep(pWal->pVfs, nDelay);
  }

  if( !useWal ){
    assert( rc==SQLITE_OK );
    if( pWal->bShmUnreliable==0 ){
      rc = walIndexReadHdr(pWal, pChanged);
      rc = walIndexReadHdr(pWal, pChanged, cnt);
    }
    if( rc==SQLITE_BUSY ){
      /* If there is not a recovery running in another thread or process
      ** then convert BUSY errors to WAL_RETRY.  If recovery is known to
      ** be running, convert BUSY to BUSY_RECOVERY.  There is a race here
      ** which might cause WAL_RETRY to be returned even if BUSY_RECOVERY
      ** would be technically correct.  But the race is benign since with
3741
3742
3743
3744
3745
3746
3747
3748

3749
3750
3751
3752
3753
3754
3755
3751
3752
3753
3754
3755
3756
3757

3758
3759
3760
3761
3762
3763
3764
3765







-
+







    }
  }


  /* Read the wal-index header. */
  if( rc==SQLITE_OK ){
    walDisableBlocking(pWal);
    rc = walIndexReadHdr(pWal, &isChanged);
    rc = walIndexReadHdr(pWal, &isChanged, 0);
    (void)walEnableBlocking(pWal);
    if( isChanged && pWal->pDbFd->pMethods->iVersion>=3 ){
      sqlite3OsUnfetch(pWal->pDbFd, 0, 0);
    }
  }

  /* Copy data from the log to the database file. */