000001 /*
000002 ** 2004 April 6
000003 **
000004 ** The author disclaims copyright to this source code. In place of
000005 ** a legal notice, here is a blessing:
000006 **
000007 ** May you do good and not evil.
000008 ** May you find forgiveness for yourself and forgive others.
000009 ** May you share freely, never taking more than you give.
000010 **
000011 *************************************************************************
000012 ** This file implements an external (disk-based) database using BTrees.
000013 ** See the header comment on "btreeInt.h" for additional information.
000014 ** Including a description of file format and an overview of operation.
000015 */
000016 #include "btreeInt.h"
000017
000018 /*
000019 ** The header string that appears at the beginning of every
000020 ** SQLite database.
000021 */
000022 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
000023
000024 /*
000025 ** Set this global variable to 1 to enable tracing using the TRACE
000026 ** macro.
000027 */
000028 #if 0
000029 int sqlite3BtreeTrace=1; /* True to enable tracing */
000030 # define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);}
000031 #else
000032 # define TRACE(X)
000033 #endif
000034
000035 /*
000036 ** Extract a 2-byte big-endian integer from an array of unsigned bytes.
000037 ** But if the value is zero, make it 65536.
000038 **
000039 ** This routine is used to extract the "offset to cell content area" value
000040 ** from the header of a btree page. If the page size is 65536 and the page
000041 ** is empty, the offset should be 65536, but the 2-byte value stores zero.
000042 ** This routine makes the necessary adjustment to 65536.
000043 */
/* Adding 0xffff and masking to 16 bits is the same as subtracting one
** modulo 65536, so a stored zero becomes 0xffff and the final +1 yields
** 65536.  X is still evaluated exactly once. */
#define get2byteNotZero(X)  ((((int)get2byte(X) + 0xffff) & 0xffff) + 1)
000045
000046 /*
000047 ** Values passed as the 5th argument to allocateBtreePage()
000048 */
000049 #define BTALLOC_ANY 0 /* Allocate any page */
000050 #define BTALLOC_EXACT 1 /* Allocate exact page if possible */
000051 #define BTALLOC_LE 2 /* Allocate any page <= the parameter */
000052
000053 /*
000054 ** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not
000055 ** defined, or 0 if it is. For example:
000056 **
000057 ** bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);
000058 */
000059 #ifndef SQLITE_OMIT_AUTOVACUUM
000060 #define IfNotOmitAV(expr) (expr)
000061 #else
000062 #define IfNotOmitAV(expr) 0
000063 #endif
000064
000065 #ifndef SQLITE_OMIT_SHARED_CACHE
000066 /*
000067 ** A list of BtShared objects that are eligible for participation
000068 ** in shared cache. This variable has file scope during normal builds,
000069 ** but the test harness needs to access it so we make it global for
000070 ** test builds.
000071 **
000072 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MAIN.
000073 */
000074 #ifdef SQLITE_TEST
000075 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
000076 #else
000077 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
000078 #endif
000079 #endif /* SQLITE_OMIT_SHARED_CACHE */
000080
000081 #ifndef SQLITE_OMIT_SHARED_CACHE
000082 /*
000083 ** Enable or disable the shared pager and schema features.
000084 **
000085 ** This routine has no effect on existing database connections.
000086 ** The shared cache setting affects only future calls to
000087 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
000088 */
000089 int sqlite3_enable_shared_cache(int enable){
000090 sqlite3GlobalConfig.sharedCacheEnabled = enable;
000091 return SQLITE_OK;
000092 }
000093 #endif
000094
000095
000096
000097 #ifdef SQLITE_OMIT_SHARED_CACHE
000098 /*
000099 ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
000100 ** and clearAllSharedCacheTableLocks()
000101 ** manipulate entries in the BtShared.pLock linked list used to store
000102 ** shared-cache table level locks. If the library is compiled with the
000103 ** shared-cache feature disabled, then there is only ever one user
000104 ** of each BtShared structure and so this locking is not necessary.
000105 ** So define the lock related functions as no-ops.
000106 */
000107 #define querySharedCacheTableLock(a,b,c) SQLITE_OK
000108 #define setSharedCacheTableLock(a,b,c) SQLITE_OK
000109 #define clearAllSharedCacheTableLocks(a)
000110 #define downgradeAllSharedCacheTableLocks(a)
000111 #define hasSharedCacheTableLock(a,b,c,d) 1
000112 #define hasReadConflicts(a, b) 0
000113 #endif
000114
000115 #ifdef SQLITE_DEBUG
000116 /*
000117 ** Return and reset the seek counter for a Btree object.
000118 */
000119 sqlite3_uint64 sqlite3BtreeSeekCount(Btree *pBt){
000120 u64 n = pBt->nSeek;
000121 pBt->nSeek = 0;
000122 return n;
000123 }
000124 #endif
000125
000126 /*
000127 ** Implementation of the SQLITE_CORRUPT_PAGE() macro. Takes a single
000128 ** (MemPage*) as an argument. The (MemPage*) must not be NULL.
000129 **
000130 ** If SQLITE_DEBUG is not defined, then this macro is equivalent to
000131 ** SQLITE_CORRUPT_BKPT. Or, if SQLITE_DEBUG is set, then the log message
000132 ** normally produced as a side-effect of SQLITE_CORRUPT_BKPT is augmented
000133 ** with the page number and filename associated with the (MemPage*).
000134 */
000135 #ifdef SQLITE_DEBUG
000136 int corruptPageError(int lineno, MemPage *p){
000137 char *zMsg;
000138 sqlite3BeginBenignMalloc();
000139 zMsg = sqlite3_mprintf("database corruption page %u of %s",
000140 p->pgno, sqlite3PagerFilename(p->pBt->pPager, 0)
000141 );
000142 sqlite3EndBenignMalloc();
000143 if( zMsg ){
000144 sqlite3ReportError(SQLITE_CORRUPT, lineno, zMsg);
000145 }
000146 sqlite3_free(zMsg);
000147 return SQLITE_CORRUPT_BKPT;
000148 }
000149 # define SQLITE_CORRUPT_PAGE(pMemPage) corruptPageError(__LINE__, pMemPage)
000150 #else
000151 # define SQLITE_CORRUPT_PAGE(pMemPage) SQLITE_CORRUPT_PGNO(pMemPage->pgno)
000152 #endif
000153
000154 /* Default value for SHARED_LOCK_TRACE macro if shared-cache is disabled
000155 ** or if the lock tracking is disabled. This is always the value for
000156 ** release builds.
000157 */
000158 #define SHARED_LOCK_TRACE(X,MSG,TAB,TYPE) /*no-op*/
000159
000160 #ifndef SQLITE_OMIT_SHARED_CACHE
000161
000162 #if 0
000163 /* ^---- Change to 1 and recompile to enable shared-lock tracing
000164 ** for debugging purposes.
000165 **
000166 ** Print all shared-cache locks on a BtShared. Debugging use only.
000167 */
000168 static void sharedLockTrace(
000169 BtShared *pBt,
000170 const char *zMsg,
000171 int iRoot,
000172 int eLockType
000173 ){
000174 BtLock *pLock;
000175 if( iRoot>0 ){
000176 printf("%s-%p %u%s:", zMsg, pBt, iRoot, eLockType==READ_LOCK?"R":"W");
000177 }else{
000178 printf("%s-%p:", zMsg, pBt);
000179 }
000180 for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
000181 printf(" %p/%u%s", pLock->pBtree, pLock->iTable,
000182 pLock->eLock==READ_LOCK ? "R" : "W");
000183 while( pLock->pNext && pLock->pBtree==pLock->pNext->pBtree ){
000184 pLock = pLock->pNext;
000185 printf(",%u%s", pLock->iTable, pLock->eLock==READ_LOCK ? "R" : "W");
000186 }
000187 }
000188 printf("\n");
000189 fflush(stdout);
000190 }
000191 #undef SHARED_LOCK_TRACE
000192 #define SHARED_LOCK_TRACE(X,MSG,TAB,TYPE) sharedLockTrace(X,MSG,TAB,TYPE)
000193 #endif /* Shared-lock tracing */
000194
000195 #ifdef SQLITE_DEBUG
000196 /*
000197 **** This function is only used as part of an assert() statement. ***
000198 **
000199 ** Check to see if pBtree holds the required locks to read or write to the
000200 ** table with root page iRoot. Return 1 if it does and 0 if not.
000201 **
000202 ** For example, when writing to a table with root-page iRoot via
000203 ** Btree connection pBtree:
000204 **
000205 ** assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
000206 **
000207 ** When writing to an index that resides in a sharable database, the
000208 ** caller should have first obtained a lock specifying the root page of
000209 ** the corresponding table. This makes things a bit more complicated,
000210 ** as this module treats each table as a separate structure. To determine
000211 ** the table corresponding to the index being written, this
000212 ** function has to search through the database schema.
000213 **
000214 ** Instead of a lock on the table/index rooted at page iRoot, the caller may
000215 ** hold a write-lock on the schema table (root page 1). This is also
000216 ** acceptable.
000217 */
000218 static int hasSharedCacheTableLock(
000219 Btree *pBtree, /* Handle that must hold lock */
000220 Pgno iRoot, /* Root page of b-tree */
000221 int isIndex, /* True if iRoot is the root of an index b-tree */
000222 int eLockType /* Required lock type (READ_LOCK or WRITE_LOCK) */
000223 ){
000224 Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
000225 Pgno iTab = 0;
000226 BtLock *pLock;
000227
000228 /* If this database is not shareable, or if the client is reading
000229 ** and has the read-uncommitted flag set, then no lock is required.
000230 ** Return true immediately.
000231 */
000232 if( (pBtree->sharable==0)
000233 || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommit))
000234 ){
000235 return 1;
000236 }
000237
000238 /* If the client is reading or writing an index and the schema is
000239 ** not loaded, then it is too difficult to actually check to see if
000240 ** the correct locks are held. So do not bother - just return true.
000241 ** This case does not come up very often anyhow.
000242 */
000243 if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){
000244 return 1;
000245 }
000246
000247 /* Figure out the root-page that the lock should be held on. For table
000248 ** b-trees, this is just the root page of the b-tree being read or
000249 ** written. For index b-trees, it is the root page of the associated
000250 ** table. */
000251 if( isIndex ){
000252 HashElem *p;
000253 int bSeen = 0;
000254 for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
000255 Index *pIdx = (Index *)sqliteHashData(p);
000256 if( pIdx->tnum==iRoot ){
000257 if( bSeen ){
000258 /* Two or more indexes share the same root page. There must
000259 ** be imposter tables. So just return true. The assert is not
000260 ** useful in that case. */
000261 return 1;
000262 }
000263 iTab = pIdx->pTable->tnum;
000264 bSeen = 1;
000265 }
000266 }
000267 }else{
000268 iTab = iRoot;
000269 }
000270
000271 SHARED_LOCK_TRACE(pBtree->pBt,"hasLock",iRoot,eLockType);
000272
000273 /* Search for the required lock. Either a write-lock on root-page iTab, a
000274 ** write-lock on the schema table, or (if the client is reading) a
000275 ** read-lock on iTab will suffice. Return 1 if any of these are found. */
000276 for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
000277 if( pLock->pBtree==pBtree
000278 && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
000279 && pLock->eLock>=eLockType
000280 ){
000281 return 1;
000282 }
000283 }
000284
000285 /* Failed to find the required lock. */
000286 return 0;
000287 }
000288 #endif /* SQLITE_DEBUG */
000289
000290 #ifdef SQLITE_DEBUG
000291 /*
000292 **** This function may be used as part of assert() statements only. ****
000293 **
000294 ** Return true if it would be illegal for pBtree to write into the
000295 ** table or index rooted at iRoot because other shared connections are
000296 ** simultaneously reading that same table or index.
000297 **
000298 ** It is illegal for pBtree to write if some other Btree object that
000299 ** shares the same BtShared object is currently reading or writing
000300 ** the iRoot table. Except, if the other Btree object has the
000301 ** read-uncommitted flag set, then it is OK for the other object to
000302 ** have a read cursor.
000303 **
000304 ** For example, before writing to any part of the table or index
000305 ** rooted at page iRoot, one should call:
000306 **
000307 ** assert( !hasReadConflicts(pBtree, iRoot) );
000308 */
000309 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
000310 BtCursor *p;
000311 for(p=pBtree->pBt->pCursor; p; p=p->pNext){
000312 if( p->pgnoRoot==iRoot
000313 && p->pBtree!=pBtree
000314 && 0==(p->pBtree->db->flags & SQLITE_ReadUncommit)
000315 ){
000316 return 1;
000317 }
000318 }
000319 return 0;
000320 }
000321 #endif /* #ifdef SQLITE_DEBUG */
000322
000323 /*
000324 ** Query to see if Btree handle p may obtain a lock of type eLock
000325 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
000326 ** SQLITE_OK if the lock may be obtained (by calling
000327 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
000328 */
000329 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
000330 BtShared *pBt = p->pBt;
000331 BtLock *pIter;
000332
000333 assert( sqlite3BtreeHoldsMutex(p) );
000334 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
000335 assert( p->db!=0 );
000336 assert( !(p->db->flags&SQLITE_ReadUncommit)||eLock==WRITE_LOCK||iTab==1 );
000337
000338 /* If requesting a write-lock, then the Btree must have an open write
000339 ** transaction on this file. And, obviously, for this to be so there
000340 ** must be an open write transaction on the file itself.
000341 */
000342 assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
000343 assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
000344
000345 /* This routine is a no-op if the shared-cache is not enabled */
000346 if( !p->sharable ){
000347 return SQLITE_OK;
000348 }
000349
000350 /* If some other connection is holding an exclusive lock, the
000351 ** requested lock may not be obtained.
000352 */
000353 if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){
000354 sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
000355 return SQLITE_LOCKED_SHAREDCACHE;
000356 }
000357
000358 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
000359 /* The condition (pIter->eLock!=eLock) in the following if(...)
000360 ** statement is a simplification of:
000361 **
000362 ** (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
000363 **
000364 ** since we know that if eLock==WRITE_LOCK, then no other connection
000365 ** may hold a WRITE_LOCK on any table in this file (since there can
000366 ** only be a single writer).
000367 */
000368 assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
000369 assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
000370 if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
000371 sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
000372 if( eLock==WRITE_LOCK ){
000373 assert( p==pBt->pWriter );
000374 pBt->btsFlags |= BTS_PENDING;
000375 }
000376 return SQLITE_LOCKED_SHAREDCACHE;
000377 }
000378 }
000379 return SQLITE_OK;
000380 }
000381 #endif /* !SQLITE_OMIT_SHARED_CACHE */
000382
000383 #ifndef SQLITE_OMIT_SHARED_CACHE
000384 /*
000385 ** Add a lock on the table with root-page iTable to the shared-btree used
000386 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
000387 ** WRITE_LOCK.
000388 **
000389 ** This function assumes the following:
000390 **
000391 ** (a) The specified Btree object p is connected to a sharable
000392 ** database (one with the BtShared.sharable flag set), and
000393 **
000394 ** (b) No other Btree objects hold a lock that conflicts
000395 ** with the requested lock (i.e. querySharedCacheTableLock() has
000396 ** already been called and returned SQLITE_OK).
000397 **
000398 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM
000399 ** is returned if a malloc attempt fails.
000400 */
000401 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
000402 BtShared *pBt = p->pBt;
000403 BtLock *pLock = 0;
000404 BtLock *pIter;
000405
000406 SHARED_LOCK_TRACE(pBt,"setLock", iTable, eLock);
000407
000408 assert( sqlite3BtreeHoldsMutex(p) );
000409 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
000410 assert( p->db!=0 );
000411
000412 /* A connection with the read-uncommitted flag set will never try to
000413 ** obtain a read-lock using this function. The only read-lock obtained
000414 ** by a connection in read-uncommitted mode is on the sqlite_schema
000415 ** table, and that lock is obtained in BtreeBeginTrans(). */
000416 assert( 0==(p->db->flags&SQLITE_ReadUncommit) || eLock==WRITE_LOCK );
000417
000418 /* This function should only be called on a sharable b-tree after it
000419 ** has been determined that no other b-tree holds a conflicting lock. */
000420 assert( p->sharable );
000421 assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
000422
000423 /* First search the list for an existing lock on this table. */
000424 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
000425 if( pIter->iTable==iTable && pIter->pBtree==p ){
000426 pLock = pIter;
000427 break;
000428 }
000429 }
000430
000431 /* If the above search did not find a BtLock struct associating Btree p
000432 ** with table iTable, allocate one and link it into the list.
000433 */
000434 if( !pLock ){
000435 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
000436 if( !pLock ){
000437 return SQLITE_NOMEM_BKPT;
000438 }
000439 pLock->iTable = iTable;
000440 pLock->pBtree = p;
000441 pLock->pNext = pBt->pLock;
000442 pBt->pLock = pLock;
000443 }
000444
000445 /* Set the BtLock.eLock variable to the maximum of the current lock
000446 ** and the requested lock. This means if a write-lock was already held
000447 ** and a read-lock requested, we don't incorrectly downgrade the lock.
000448 */
000449 assert( WRITE_LOCK>READ_LOCK );
000450 if( eLock>pLock->eLock ){
000451 pLock->eLock = eLock;
000452 }
000453
000454 return SQLITE_OK;
000455 }
000456 #endif /* !SQLITE_OMIT_SHARED_CACHE */
000457
000458 #ifndef SQLITE_OMIT_SHARED_CACHE
000459 /*
000460 ** Release all the table locks (locks obtained via calls to
000461 ** the setSharedCacheTableLock() procedure) held by Btree object p.
000462 **
000463 ** This function assumes that Btree p has an open read or write
000464 ** transaction. If it does not, then the BTS_PENDING flag
000465 ** may be incorrectly cleared.
000466 */
000467 static void clearAllSharedCacheTableLocks(Btree *p){
000468 BtShared *pBt = p->pBt;
000469 BtLock **ppIter = &pBt->pLock;
000470
000471 assert( sqlite3BtreeHoldsMutex(p) );
000472 assert( p->sharable || 0==*ppIter );
000473 assert( p->inTrans>0 );
000474
000475 SHARED_LOCK_TRACE(pBt, "clearAllLocks", 0, 0);
000476
000477 while( *ppIter ){
000478 BtLock *pLock = *ppIter;
000479 assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree );
000480 assert( pLock->pBtree->inTrans>=pLock->eLock );
000481 if( pLock->pBtree==p ){
000482 *ppIter = pLock->pNext;
000483 assert( pLock->iTable!=1 || pLock==&p->lock );
000484 if( pLock->iTable!=1 ){
000485 sqlite3_free(pLock);
000486 }
000487 }else{
000488 ppIter = &pLock->pNext;
000489 }
000490 }
000491
000492 assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter );
000493 if( pBt->pWriter==p ){
000494 pBt->pWriter = 0;
000495 pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
000496 }else if( pBt->nTransaction==2 ){
000497 /* This function is called when Btree p is concluding its
000498 ** transaction. If there currently exists a writer, and p is not
000499 ** that writer, then the number of locks held by connections other
000500 ** than the writer must be about to drop to zero. In this case
000501 ** set the BTS_PENDING flag to 0.
000502 **
000503 ** If there is not currently a writer, then BTS_PENDING must
000504 ** be zero already. So this next line is harmless in that case.
000505 */
000506 pBt->btsFlags &= ~BTS_PENDING;
000507 }
000508 }
000509
000510 /*
000511 ** This function changes all write-locks held by Btree p into read-locks.
000512 */
000513 static void downgradeAllSharedCacheTableLocks(Btree *p){
000514 BtShared *pBt = p->pBt;
000515
000516 SHARED_LOCK_TRACE(pBt, "downgradeLocks", 0, 0);
000517
000518 if( pBt->pWriter==p ){
000519 BtLock *pLock;
000520 pBt->pWriter = 0;
000521 pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
000522 for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
000523 assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
000524 pLock->eLock = READ_LOCK;
000525 }
000526 }
000527 }
000528
000529 #endif /* SQLITE_OMIT_SHARED_CACHE */
000530
000531 static void releasePage(MemPage *pPage); /* Forward reference */
000532 static void releasePageOne(MemPage *pPage); /* Forward reference */
000533 static void releasePageNotNull(MemPage *pPage); /* Forward reference */
000534
000535 /*
000536 ***** This routine is used inside of assert() only ****
000537 **
000538 ** Verify that the cursor holds the mutex on its BtShared
000539 */
000540 #ifdef SQLITE_DEBUG
000541 static int cursorHoldsMutex(BtCursor *p){
000542 return sqlite3_mutex_held(p->pBt->mutex);
000543 }
000544
000545 /* Verify that the cursor and the BtShared agree about what is the current
000546 ** database connection. This is important in shared-cache mode. If the database
000547 ** connection pointers get out-of-sync, it is possible for routines like
000548 ** btreeInitPage() to reference a stale connection pointer that references
000549 ** a connection that has already closed. This routine is used inside assert()
000550 ** statements only and for the purpose of double-checking that the btree code
000551 ** does keep the database connection pointers up-to-date.
000552 */
000553 static int cursorOwnsBtShared(BtCursor *p){
000554 assert( cursorHoldsMutex(p) );
000555 return (p->pBtree->db==p->pBt->db);
000556 }
000557 #endif
000558
/*
** Invalidate the overflow page-list cache of the single cursor pCur by
** clearing its BTCF_ValidOvfl flag.
**
** The argument is parenthesized in the expansion so that any
** pointer-valued expression may be passed, not only a plain identifier.
*/
#define invalidateOverflowCache(pCur) ((pCur)->curFlags &= ~BTCF_ValidOvfl)
000564
000565 /*
000566 ** Invalidate the overflow page-list cache for all cursors opened
000567 ** on the shared btree structure pBt.
000568 */
000569 static void invalidateAllOverflowCache(BtShared *pBt){
000570 BtCursor *p;
000571 assert( sqlite3_mutex_held(pBt->mutex) );
000572 for(p=pBt->pCursor; p; p=p->pNext){
000573 invalidateOverflowCache(p);
000574 }
000575 }
000576
000577 #ifndef SQLITE_OMIT_INCRBLOB
000578 /*
000579 ** This function is called before modifying the contents of a table
000580 ** to invalidate any incrblob cursors that are open on the
000581 ** row or one of the rows being modified.
000582 **
000583 ** If argument isClearTable is true, then the entire contents of the
000584 ** table is about to be deleted. In this case invalidate all incrblob
000585 ** cursors open on any row within the table with root-page pgnoRoot.
000586 **
000587 ** Otherwise, if argument isClearTable is false, then the row with
000588 ** rowid iRow is being replaced or deleted. In this case invalidate
000589 ** only those incrblob cursors open on that specific row.
000590 */
000591 static void invalidateIncrblobCursors(
000592 Btree *pBtree, /* The database file to check */
000593 Pgno pgnoRoot, /* The table that might be changing */
000594 i64 iRow, /* The rowid that might be changing */
000595 int isClearTable /* True if all rows are being deleted */
000596 ){
000597 BtCursor *p;
000598 assert( pBtree->hasIncrblobCur );
000599 assert( sqlite3BtreeHoldsMutex(pBtree) );
000600 pBtree->hasIncrblobCur = 0;
000601 for(p=pBtree->pBt->pCursor; p; p=p->pNext){
000602 if( (p->curFlags & BTCF_Incrblob)!=0 ){
000603 pBtree->hasIncrblobCur = 1;
000604 if( p->pgnoRoot==pgnoRoot && (isClearTable || p->info.nKey==iRow) ){
000605 p->eState = CURSOR_INVALID;
000606 }
000607 }
000608 }
000609 }
000610
000611 #else
000612 /* Stub function when INCRBLOB is omitted */
000613 #define invalidateIncrblobCursors(w,x,y,z)
000614 #endif /* SQLITE_OMIT_INCRBLOB */
000615
000616 /*
000617 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called
000618 ** when a page that previously contained data becomes a free-list leaf
000619 ** page.
000620 **
000621 ** The BtShared.pHasContent bitvec exists to work around an obscure
000622 ** bug caused by the interaction of two useful IO optimizations surrounding
000623 ** free-list leaf pages:
000624 **
000625 ** 1) When all data is deleted from a page and the page becomes
000626 ** a free-list leaf page, the page is not written to the database
000627 ** (as free-list leaf pages contain no meaningful data). Sometimes
000628 ** such a page is not even journalled (as it will not be modified,
000629 ** why bother journalling it?).
000630 **
000631 ** 2) When a free-list leaf page is reused, its content is not read
000632 ** from the database or written to the journal file (why should it
000633 ** be, if it is not at all meaningful?).
000634 **
000635 ** By themselves, these optimizations work fine and provide a handy
000636 ** performance boost to bulk delete or insert operations. However, if
000637 ** a page is moved to the free-list and then reused within the same
000638 ** transaction, a problem comes up. If the page is not journalled when
000639 ** it is moved to the free-list and it is also not journalled when it
000640 ** is extracted from the free-list and reused, then the original data
000641 ** may be lost. In the event of a rollback, it may not be possible
000642 ** to restore the database to its original configuration.
000643 **
000644 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is
000645 ** moved to become a free-list leaf page, the corresponding bit is
000646 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
000647 ** optimization 2 above is omitted if the corresponding bit is already
000648 ** set in BtShared.pHasContent. The contents of the bitvec are cleared
000649 ** at the end of every transaction.
000650 */
000651 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
000652 int rc = SQLITE_OK;
000653 if( !pBt->pHasContent ){
000654 assert( pgno<=pBt->nPage );
000655 pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
000656 if( !pBt->pHasContent ){
000657 rc = SQLITE_NOMEM_BKPT;
000658 }
000659 }
000660 if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
000661 rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
000662 }
000663 return rc;
000664 }
000665
000666 /*
000667 ** Query the BtShared.pHasContent vector.
000668 **
000669 ** This function is called when a free-list leaf page is removed from the
000670 ** free-list for reuse. It returns false if it is safe to retrieve the
000671 ** page from the pager layer with the 'no-content' flag set. True otherwise.
000672 */
000673 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
000674 Bitvec *p = pBt->pHasContent;
000675 return p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTestNotNull(p, pgno));
000676 }
000677
000678 /*
000679 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
000680 ** invoked at the conclusion of each write-transaction.
000681 */
000682 static void btreeClearHasContent(BtShared *pBt){
000683 sqlite3BitvecDestroy(pBt->pHasContent);
000684 pBt->pHasContent = 0;
000685 }
000686
000687 /*
000688 ** Release all of the apPage[] pages for a cursor.
000689 */
000690 static void btreeReleaseAllCursorPages(BtCursor *pCur){
000691 int i;
000692 if( pCur->iPage>=0 ){
000693 for(i=0; i<pCur->iPage; i++){
000694 releasePageNotNull(pCur->apPage[i]);
000695 }
000696 releasePageNotNull(pCur->pPage);
000697 pCur->iPage = -1;
000698 }
000699 }
000700
000701 /*
000702 ** The cursor passed as the only argument must point to a valid entry
000703 ** when this function is called (i.e. have eState==CURSOR_VALID). This
000704 ** function saves the current cursor key in variables pCur->nKey and
000705 ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error
000706 ** code otherwise.
000707 **
000708 ** If the cursor is open on an intkey table, then the integer key
000709 ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to
000710 ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is
000711 ** set to point to a malloced buffer pCur->nKey bytes in size containing
000712 ** the key.
000713 */
000714 static int saveCursorKey(BtCursor *pCur){
000715 int rc = SQLITE_OK;
000716 assert( CURSOR_VALID==pCur->eState );
000717 assert( 0==pCur->pKey );
000718 assert( cursorHoldsMutex(pCur) );
000719
000720 if( pCur->curIntKey ){
000721 /* Only the rowid is required for a table btree */
000722 pCur->nKey = sqlite3BtreeIntegerKey(pCur);
000723 }else{
000724 /* For an index btree, save the complete key content. It is possible
000725 ** that the current key is corrupt. In that case, it is possible that
000726 ** the sqlite3VdbeRecordUnpack() function may overread the buffer by
000727 ** up to the size of 1 varint plus 1 8-byte value when the cursor
000728 ** position is restored. Hence the 17 bytes of padding allocated
000729 ** below. */
000730 void *pKey;
000731 pCur->nKey = sqlite3BtreePayloadSize(pCur);
000732 pKey = sqlite3Malloc( pCur->nKey + 9 + 8 );
000733 if( pKey ){
000734 rc = sqlite3BtreePayload(pCur, 0, (int)pCur->nKey, pKey);
000735 if( rc==SQLITE_OK ){
000736 memset(((u8*)pKey)+pCur->nKey, 0, 9+8);
000737 pCur->pKey = pKey;
000738 }else{
000739 sqlite3_free(pKey);
000740 }
000741 }else{
000742 rc = SQLITE_NOMEM_BKPT;
000743 }
000744 }
000745 assert( !pCur->curIntKey || !pCur->pKey );
000746 return rc;
000747 }
000748
000749 /*
000750 ** Save the current cursor position in the variables BtCursor.nKey
000751 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
000752 **
000753 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
000754 ** prior to calling this routine.
000755 */
000756 static int saveCursorPosition(BtCursor *pCur){
000757 int rc;
000758
000759 assert( CURSOR_VALID==pCur->eState || CURSOR_SKIPNEXT==pCur->eState );
000760 assert( 0==pCur->pKey );
000761 assert( cursorHoldsMutex(pCur) );
000762
000763 if( pCur->curFlags & BTCF_Pinned ){
000764 return SQLITE_CONSTRAINT_PINNED;
000765 }
000766 if( pCur->eState==CURSOR_SKIPNEXT ){
000767 pCur->eState = CURSOR_VALID;
000768 }else{
000769 pCur->skipNext = 0;
000770 }
000771
000772 rc = saveCursorKey(pCur);
000773 if( rc==SQLITE_OK ){
000774 btreeReleaseAllCursorPages(pCur);
000775 pCur->eState = CURSOR_REQUIRESEEK;
000776 }
000777
000778 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl|BTCF_AtLast);
000779 return rc;
000780 }
000781
000782 /* Forward reference */
000783 static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*);
000784
000785 /*
000786 ** Save the positions of all cursors (except pExcept) that are open on
000787 ** the table with root-page iRoot. "Saving the cursor position" means that
000788 ** the location in the btree is remembered in such a way that it can be
000789 ** moved back to the same spot after the btree has been modified. This
000790 ** routine is called just before cursor pExcept is used to modify the
000791 ** table, for example in BtreeDelete() or BtreeInsert().
000792 **
000793 ** If there are two or more cursors on the same btree, then all such
000794 ** cursors should have their BTCF_Multiple flag set. The btreeCursor()
000795 ** routine enforces that rule. This routine only needs to be called in
000796 ** the uncommon case when pExcept has the BTCF_Multiple flag set.
000797 **
000798 ** If pExcept!=NULL and if no other cursors are found on the same root-page,
000799 ** then the BTCF_Multiple flag on pExcept is cleared, to avoid another
000800 ** pointless call to this routine.
000801 **
000802 ** Implementation note: This routine merely checks to see if any cursors
000803 ** need to be saved. It calls out to saveCursorsOnList() in the (unusual)
000804 ** event that cursors are in need of being saved.
000805 */
000806 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
000807 BtCursor *p;
000808 assert( sqlite3_mutex_held(pBt->mutex) );
000809 assert( pExcept==0 || pExcept->pBt==pBt );
000810 for(p=pBt->pCursor; p; p=p->pNext){
000811 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break;
000812 }
000813 if( p ) return saveCursorsOnList(p, iRoot, pExcept);
000814 if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple;
000815 return SQLITE_OK;
000816 }
000817
000818 /* This helper routine to saveAllCursors does the actual work of saving
000819 ** the cursors if and when a cursor is found that actually requires saving.
000820 ** The common case is that no cursors need to be saved, so this routine is
000821 ** broken out from its caller to avoid unnecessary stack pointer movement.
000822 */
000823 static int SQLITE_NOINLINE saveCursorsOnList(
000824 BtCursor *p, /* The first cursor that needs saving */
000825 Pgno iRoot, /* Only save cursor with this iRoot. Save all if zero */
000826 BtCursor *pExcept /* Do not save this cursor */
000827 ){
000828 do{
000829 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){
000830 if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
000831 int rc = saveCursorPosition(p);
000832 if( SQLITE_OK!=rc ){
000833 return rc;
000834 }
000835 }else{
000836 testcase( p->iPage>=0 );
000837 btreeReleaseAllCursorPages(p);
000838 }
000839 }
000840 p = p->pNext;
000841 }while( p );
000842 return SQLITE_OK;
000843 }
000844
000845 /*
000846 ** Clear the current cursor position.
000847 */
000848 void sqlite3BtreeClearCursor(BtCursor *pCur){
000849 assert( cursorHoldsMutex(pCur) );
000850 sqlite3_free(pCur->pKey);
000851 pCur->pKey = 0;
000852 pCur->eState = CURSOR_INVALID;
000853 }
000854
000855 /*
000856 ** In this version of BtreeMoveto, pKey is a packed index record
000857 ** such as is generated by the OP_MakeRecord opcode. Unpack the
000858 ** record and then call sqlite3BtreeIndexMoveto() to do the work.
000859 */
000860 static int btreeMoveto(
000861 BtCursor *pCur, /* Cursor open on the btree to be searched */
000862 const void *pKey, /* Packed key if the btree is an index */
000863 i64 nKey, /* Integer key for tables. Size of pKey for indices */
000864 int bias, /* Bias search to the high end */
000865 int *pRes /* Write search results here */
000866 ){
000867 int rc; /* Status code */
000868 UnpackedRecord *pIdxKey; /* Unpacked index key */
000869
000870 if( pKey ){
000871 KeyInfo *pKeyInfo = pCur->pKeyInfo;
000872 assert( nKey==(i64)(int)nKey );
000873 pIdxKey = sqlite3VdbeAllocUnpackedRecord(pKeyInfo);
000874 if( pIdxKey==0 ) return SQLITE_NOMEM_BKPT;
000875 sqlite3VdbeRecordUnpack(pKeyInfo, (int)nKey, pKey, pIdxKey);
000876 if( pIdxKey->nField==0 || pIdxKey->nField>pKeyInfo->nAllField ){
000877 rc = SQLITE_CORRUPT_BKPT;
000878 }else{
000879 rc = sqlite3BtreeIndexMoveto(pCur, pIdxKey, pRes);
000880 }
000881 sqlite3DbFree(pCur->pKeyInfo->db, pIdxKey);
000882 }else{
000883 pIdxKey = 0;
000884 rc = sqlite3BtreeTableMoveto(pCur, nKey, bias, pRes);
000885 }
000886 return rc;
000887 }
000888
000889 /*
000890 ** Restore the cursor to the position it was in (or as close to as possible)
000891 ** when saveCursorPosition() was called. Note that this call deletes the
000892 ** saved position info stored by saveCursorPosition(), so there can be
000893 ** at most one effective restoreCursorPosition() call after each
000894 ** saveCursorPosition().
000895 */
000896 static int btreeRestoreCursorPosition(BtCursor *pCur){
000897 int rc;
000898 int skipNext = 0;
000899 assert( cursorOwnsBtShared(pCur) );
000900 assert( pCur->eState>=CURSOR_REQUIRESEEK );
000901 if( pCur->eState==CURSOR_FAULT ){
000902 return pCur->skipNext;
000903 }
000904 pCur->eState = CURSOR_INVALID;
000905 if( sqlite3FaultSim(410) ){
000906 rc = SQLITE_IOERR;
000907 }else{
000908 rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext);
000909 }
000910 if( rc==SQLITE_OK ){
000911 sqlite3_free(pCur->pKey);
000912 pCur->pKey = 0;
000913 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
000914 if( skipNext ) pCur->skipNext = skipNext;
000915 if( pCur->skipNext && pCur->eState==CURSOR_VALID ){
000916 pCur->eState = CURSOR_SKIPNEXT;
000917 }
000918 }
000919 return rc;
000920 }
000921
/*
** Restore cursor p only if it needs it: when p->eState is
** CURSOR_REQUIRESEEK or greater, btreeRestoreCursorPosition() is called
** and its result is the value of the expression; otherwise the value is
** SQLITE_OK.  The argument is parenthesized so that any pointer-valued
** expression may be passed.  Note that p is evaluated more than once.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)
000926
000927 /*
000928 ** Determine whether or not a cursor has moved from the position where
000929 ** it was last placed, or has been invalidated for any other reason.
000930 ** Cursors can move when the row they are pointing at is deleted out
000931 ** from under them, for example. Cursor might also move if a btree
000932 ** is rebalanced.
000933 **
000934 ** Calling this routine with a NULL cursor pointer returns false.
000935 **
000936 ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor
000937 ** back to where it ought to be if this routine returns true.
000938 */
000939 int sqlite3BtreeCursorHasMoved(BtCursor *pCur){
000940 assert( EIGHT_BYTE_ALIGNMENT(pCur)
000941 || pCur==sqlite3BtreeFakeValidCursor() );
000942 assert( offsetof(BtCursor, eState)==0 );
000943 assert( sizeof(pCur->eState)==1 );
000944 return CURSOR_VALID != *(u8*)pCur;
000945 }
000946
000947 /*
000948 ** Return a pointer to a fake BtCursor object that will always answer
000949 ** false to the sqlite3BtreeCursorHasMoved() routine above. The fake
000950 ** cursor returned must not be used with any other Btree interface.
000951 */
000952 BtCursor *sqlite3BtreeFakeValidCursor(void){
000953 static u8 fakeCursor = CURSOR_VALID;
000954 assert( offsetof(BtCursor, eState)==0 );
000955 return (BtCursor*)&fakeCursor;
000956 }
000957
000958 /*
000959 ** This routine restores a cursor back to its original position after it
000960 ** has been moved by some outside activity (such as a btree rebalance or
000961 ** a row having been deleted out from under the cursor).
000962 **
000963 ** On success, the *pDifferentRow parameter is false if the cursor is left
000964 ** pointing at exactly the same row. *pDifferntRow is the row the cursor
000965 ** was pointing to has been deleted, forcing the cursor to point to some
000966 ** nearby row.
000967 **
000968 ** This routine should only be called for a cursor that just returned
000969 ** TRUE from sqlite3BtreeCursorHasMoved().
000970 */
000971 int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){
000972 int rc;
000973
000974 assert( pCur!=0 );
000975 assert( pCur->eState!=CURSOR_VALID );
000976 rc = restoreCursorPosition(pCur);
000977 if( rc ){
000978 *pDifferentRow = 1;
000979 return rc;
000980 }
000981 if( pCur->eState!=CURSOR_VALID ){
000982 *pDifferentRow = 1;
000983 }else{
000984 *pDifferentRow = 0;
000985 }
000986 return SQLITE_OK;
000987 }
000988
#ifdef SQLITE_ENABLE_CURSOR_HINTS
/*
** Provide hints to the cursor.  The particular hint given (and the type
** and number of the varargs parameters) is determined by the eHintType
** parameter.  See the definitions of the BTREE_HINT_* macros for details.
**
** This implementation takes no action on the hint.  In SQLITE_DEBUG
** builds only, a BTREE_HINT_RANGE hint has its two variadic arguments
** (an Expr* followed by a Mem*) extracted and the expression is walked
** with sqlite3CursorRangeHintExprCheck as the callback.
*/
void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){
  /* Used only by system that substitute their own storage engine */
#ifdef SQLITE_DEBUG
  if( ALWAYS(eHintType==BTREE_HINT_RANGE) ){
    va_list args;          /* Variadic argument cursor */
    Expr *pRangeExpr;      /* First variadic argument */
    Mem *aMem;             /* Second variadic argument */
    Walker walker;         /* Expression tree walker */

    va_start(args, eHintType);
    pRangeExpr = va_arg(args, Expr*);
    aMem = va_arg(args, Mem*);
    va_end(args);

    memset(&walker, 0, sizeof(walker));
    walker.xExprCallback = sqlite3CursorRangeHintExprCheck;
    walker.u.aMem = aMem;
    assert( pRangeExpr!=0 );
    assert( walker.u.aMem!=0 );
    sqlite3WalkExpr(&walker, pRangeExpr);
  }
#endif /* SQLITE_DEBUG */
}
#endif /* SQLITE_ENABLE_CURSOR_HINTS */
001015
001016
001017 /*
001018 ** Provide flag hints to the cursor.
001019 */
001020 void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){
001021 assert( x==BTREE_SEEK_EQ || x==BTREE_BULKLOAD || x==0 );
001022 pCur->hints = x;
001023 }
001024
001025
001026 #ifndef SQLITE_OMIT_AUTOVACUUM
001027 /*
001028 ** Given a page number of a regular database page, return the page
001029 ** number for the pointer-map page that contains the entry for the
001030 ** input page number.
001031 **
001032 ** Return 0 (not a valid page) for pgno==1 since there is
001033 ** no pointer map associated with page 1. The integrity_check logic
001034 ** requires that ptrmapPageno(*,1)!=1.
001035 */
001036 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
001037 int nPagesPerMapPage;
001038 Pgno iPtrMap, ret;
001039 assert( sqlite3_mutex_held(pBt->mutex) );
001040 if( pgno<2 ) return 0;
001041 nPagesPerMapPage = (pBt->usableSize/5)+1;
001042 iPtrMap = (pgno-2)/nPagesPerMapPage;
001043 ret = (iPtrMap*nPagesPerMapPage) + 2;
001044 if( ret==PENDING_BYTE_PAGE(pBt) ){
001045 ret++;
001046 }
001047 return ret;
001048 }
001049
001050 /*
001051 ** Write an entry into the pointer map.
001052 **
001053 ** This routine updates the pointer map entry for page number 'key'
001054 ** so that it maps to type 'eType' and parent page number 'pgno'.
001055 **
001056 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
001057 ** a no-op. If an error occurs, the appropriate error code is written
001058 ** into *pRC.
001059 */
001060 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
001061 DbPage *pDbPage; /* The pointer map page */
001062 u8 *pPtrmap; /* The pointer map data */
001063 Pgno iPtrmap; /* The pointer map page number */
001064 int offset; /* Offset in pointer map page */
001065 int rc; /* Return code from subfunctions */
001066
001067 if( *pRC ) return;
001068
001069 assert( sqlite3_mutex_held(pBt->mutex) );
001070 /* The super-journal page number must never be used as a pointer map page */
001071 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
001072
001073 assert( pBt->autoVacuum );
001074 if( key==0 ){
001075 *pRC = SQLITE_CORRUPT_BKPT;
001076 return;
001077 }
001078 iPtrmap = PTRMAP_PAGENO(pBt, key);
001079 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
001080 if( rc!=SQLITE_OK ){
001081 *pRC = rc;
001082 return;
001083 }
001084 if( ((char*)sqlite3PagerGetExtra(pDbPage))[0]!=0 ){
001085 /* The first byte of the extra data is the MemPage.isInit byte.
001086 ** If that byte is set, it means this page is also being used
001087 ** as a btree page. */
001088 *pRC = SQLITE_CORRUPT_BKPT;
001089 goto ptrmap_exit;
001090 }
001091 offset = PTRMAP_PTROFFSET(iPtrmap, key);
001092 if( offset<0 ){
001093 *pRC = SQLITE_CORRUPT_BKPT;
001094 goto ptrmap_exit;
001095 }
001096 assert( offset <= (int)pBt->usableSize-5 );
001097 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
001098
001099 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
001100 TRACE(("PTRMAP_UPDATE: %u->(%u,%u)\n", key, eType, parent));
001101 *pRC= rc = sqlite3PagerWrite(pDbPage);
001102 if( rc==SQLITE_OK ){
001103 pPtrmap[offset] = eType;
001104 put4byte(&pPtrmap[offset+1], parent);
001105 }
001106 }
001107
001108 ptrmap_exit:
001109 sqlite3PagerUnref(pDbPage);
001110 }
001111
001112 /*
001113 ** Read an entry from the pointer map.
001114 **
001115 ** This routine retrieves the pointer map entry for page 'key', writing
001116 ** the type and parent page number to *pEType and *pPgno respectively.
001117 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
001118 */
001119 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
001120 DbPage *pDbPage; /* The pointer map page */
001121 int iPtrmap; /* Pointer map page index */
001122 u8 *pPtrmap; /* Pointer map page data */
001123 int offset; /* Offset of entry in pointer map */
001124 int rc;
001125
001126 assert( sqlite3_mutex_held(pBt->mutex) );
001127
001128 iPtrmap = PTRMAP_PAGENO(pBt, key);
001129 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
001130 if( rc!=0 ){
001131 return rc;
001132 }
001133 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
001134
001135 offset = PTRMAP_PTROFFSET(iPtrmap, key);
001136 if( offset<0 ){
001137 sqlite3PagerUnref(pDbPage);
001138 return SQLITE_CORRUPT_BKPT;
001139 }
001140 assert( offset <= (int)pBt->usableSize-5 );
001141 assert( pEType!=0 );
001142 *pEType = pPtrmap[offset];
001143 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
001144
001145 sqlite3PagerUnref(pDbPage);
001146 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_PGNO(iPtrmap);
001147 return SQLITE_OK;
001148 }
001149
001150 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
/* Stand-ins used when SQLITE_OMIT_AUTOVACUUM is defined and the real
** pointer-map routines are therefore not compiled:  the two "put"
** operations expand to nothing and ptrmapGet() expands to SQLITE_OK
** without evaluating any of its arguments. */
#define ptrmapPut(w,x,y,z,rc)
#define ptrmapGet(w,x,y,z) SQLITE_OK
#define ptrmapPutOvflPtr(x, y, z, rc)
001154 #endif
001155
/*
** Given a btree page and a cell index (0 means the first cell on
** the page, 1 means the second cell, and so forth) return a pointer
** to the cell content.
**
** The cell's offset is read from the I-th 2-byte entry of the
** P->aCellIdx[] array, masked with P->maskPage, and added to P->aData.
**
** findCellPastPtr() does the same except it skips past the initial
** 4-byte child pointer found on interior pages, if there is one.  It
** does so by using P->aDataOfst as the base instead of P->aData.
**
** This routine works only for pages that do not contain overflow cells.
** Both macros evaluate P more than once.
*/
#define findCell(P,I) \
  ((P)->aData + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
#define findCellPastPtr(P,I) \
  ((P)->aDataOfst + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
001170
001171
001172 /*
001173 ** This is common tail processing for btreeParseCellPtr() and
001174 ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely
001175 ** on a single B-tree page. Make necessary adjustments to the CellInfo
001176 ** structure.
001177 */
001178 static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow(
001179 MemPage *pPage, /* Page containing the cell */
001180 u8 *pCell, /* Pointer to the cell text. */
001181 CellInfo *pInfo /* Fill in this structure */
001182 ){
001183 /* If the payload will not fit completely on the local page, we have
001184 ** to decide how much to store locally and how much to spill onto
001185 ** overflow pages. The strategy is to minimize the amount of unused
001186 ** space on overflow pages while keeping the amount of local storage
001187 ** in between minLocal and maxLocal.
001188 **
001189 ** Warning: changing the way overflow payload is distributed in any
001190 ** way will result in an incompatible file format.
001191 */
001192 int minLocal; /* Minimum amount of payload held locally */
001193 int maxLocal; /* Maximum amount of payload held locally */
001194 int surplus; /* Overflow payload available for local storage */
001195
001196 minLocal = pPage->minLocal;
001197 maxLocal = pPage->maxLocal;
001198 surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4);
001199 testcase( surplus==maxLocal );
001200 testcase( surplus==maxLocal+1 );
001201 if( surplus <= maxLocal ){
001202 pInfo->nLocal = (u16)surplus;
001203 }else{
001204 pInfo->nLocal = (u16)minLocal;
001205 }
001206 pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4;
001207 }
001208
001209 /*
001210 ** Given a record with nPayload bytes of payload stored within btree
001211 ** page pPage, return the number of bytes of payload stored locally.
001212 */
001213 static int btreePayloadToLocal(MemPage *pPage, i64 nPayload){
001214 int maxLocal; /* Maximum amount of payload held locally */
001215 maxLocal = pPage->maxLocal;
001216 if( nPayload<=maxLocal ){
001217 return nPayload;
001218 }else{
001219 int minLocal; /* Minimum amount of payload held locally */
001220 int surplus; /* Overflow payload available for local storage */
001221 minLocal = pPage->minLocal;
001222 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize-4);
001223 return ( surplus <= maxLocal ) ? surplus : minLocal;
001224 }
001225 }
001226
001227 /*
001228 ** The following routines are implementations of the MemPage.xParseCell()
001229 ** method.
001230 **
001231 ** Parse a cell content block and fill in the CellInfo structure.
001232 **
001233 ** btreeParseCellPtr() => table btree leaf nodes
001234 ** btreeParseCellNoPayload() => table btree internal nodes
001235 ** btreeParseCellPtrIndex() => index btree nodes
001236 **
001237 ** There is also a wrapper function btreeParseCell() that works for
001238 ** all MemPage types and that references the cell by index rather than
001239 ** by pointer.
001240 */
001241 static void btreeParseCellPtrNoPayload(
001242 MemPage *pPage, /* Page containing the cell */
001243 u8 *pCell, /* Pointer to the cell text. */
001244 CellInfo *pInfo /* Fill in this structure */
001245 ){
001246 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001247 assert( pPage->leaf==0 );
001248 assert( pPage->childPtrSize==4 );
001249 #ifndef SQLITE_DEBUG
001250 UNUSED_PARAMETER(pPage);
001251 #endif
001252 pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey);
001253 pInfo->nPayload = 0;
001254 pInfo->nLocal = 0;
001255 pInfo->pPayload = 0;
001256 return;
001257 }
/*
** xParseCell implementation for pages that have the intKeyLeaf flag set
** and no child pointer (both conditions are asserted below).
**
** The cell begins with two varints: the payload size followed by the
** integer key.  Both are decoded inline, then pInfo is filled in:
** nKey, nPayload and pPayload directly, and nSize/nLocal either here
** (when the payload is no larger than pPage->maxLocal) or by
** btreeParseCellAdjustSizeForOverflow() otherwise.
*/
static void btreeParseCellPtr(
  MemPage *pPage,         /* Page containing the cell */
  u8 *pCell,              /* Pointer to the cell text. */
  CellInfo *pInfo         /* Fill in this structure */
){
  u8 *pIter;              /* For scanning through pCell */
  u32 nPayload;           /* Number of bytes of cell payload */
  u64 iKey;               /* Extracted Key value */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( pPage->leaf==0 || pPage->leaf==1 );
  assert( pPage->intKeyLeaf );
  assert( pPage->childPtrSize==0 );
  pIter = pCell;

  /* The next block of code is equivalent to:
  **
  **     pIter += getVarint32(pIter, nPayload);
  **
  ** The code is inlined to avoid a function call.
  */
  nPayload = *pIter;
  if( nPayload>=0x80 ){
    u8 *pEnd = &pIter[8];   /* Stop after at most 8 continuation bytes */
    nPayload &= 0x7f;
    do{
      nPayload = (nPayload<<7) | (*++pIter & 0x7f);
    }while( (*pIter)>=0x80 && pIter<pEnd );
  }
  pIter++;

  /* The next block of code is equivalent to:
  **
  **     pIter += getVarint(pIter, (u64*)&pInfo->nKey);
  **
  ** The code is inlined and the loop is unrolled for performance.
  ** This routine is a high-runner.
  **
  ** Each continuation byte is XORed into iKey with its high bit still
  ** set; the XOR constants on later steps (and in the else branches)
  ** cancel those stray high bits.
  */
  iKey = *pIter;
  if( iKey>=0x80 ){
    u8 x;
    iKey = (iKey<<7) ^ (x = *++pIter);
    if( x>=0x80 ){
      iKey = (iKey<<7) ^ (x = *++pIter);
      if( x>=0x80 ){
        iKey = (iKey<<7) ^ 0x10204000 ^ (x = *++pIter);
        if( x>=0x80 ){
          iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter);
          if( x>=0x80 ){
            iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter);
            if( x>=0x80 ){
              iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter);
              if( x>=0x80 ){
                iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter);
                if( x>=0x80 ){
                  /* Ninth byte: shifted in by 8 bits rather than 7 */
                  iKey = (iKey<<8) ^ 0x8000 ^ (*++pIter);
                }
              }
            }
          }
        }
      }else{
        iKey ^= 0x204000;
      }
    }else{
      iKey ^= 0x4000;
    }
  }
  pIter++;

  /* Store the 64 decoded bits as a signed key without changing them */
  pInfo->nKey = *(i64*)&iKey;
  pInfo->nPayload = nPayload;
  pInfo->pPayload = pIter;
  testcase( nPayload==pPage->maxLocal );
  testcase( nPayload==(u32)pPage->maxLocal+1 );
  if( nPayload<=pPage->maxLocal ){
    /* This is the (easy) common case where the entire payload fits
    ** on the local page.  No overflow is required.
    */
    pInfo->nSize = nPayload + (u16)(pIter - pCell);
    /* The reported cell size is never less than 4 bytes */
    if( pInfo->nSize<4 ) pInfo->nSize = 4;
    pInfo->nLocal = (u16)nPayload;
  }else{
    btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
  }
}
001344 static void btreeParseCellPtrIndex(
001345 MemPage *pPage, /* Page containing the cell */
001346 u8 *pCell, /* Pointer to the cell text. */
001347 CellInfo *pInfo /* Fill in this structure */
001348 ){
001349 u8 *pIter; /* For scanning through pCell */
001350 u32 nPayload; /* Number of bytes of cell payload */
001351
001352 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001353 assert( pPage->leaf==0 || pPage->leaf==1 );
001354 assert( pPage->intKeyLeaf==0 );
001355 pIter = pCell + pPage->childPtrSize;
001356 nPayload = *pIter;
001357 if( nPayload>=0x80 ){
001358 u8 *pEnd = &pIter[8];
001359 nPayload &= 0x7f;
001360 do{
001361 nPayload = (nPayload<<7) | (*++pIter & 0x7f);
001362 }while( *(pIter)>=0x80 && pIter<pEnd );
001363 }
001364 pIter++;
001365 pInfo->nKey = nPayload;
001366 pInfo->nPayload = nPayload;
001367 pInfo->pPayload = pIter;
001368 testcase( nPayload==pPage->maxLocal );
001369 testcase( nPayload==(u32)pPage->maxLocal+1 );
001370 if( nPayload<=pPage->maxLocal ){
001371 /* This is the (easy) common case where the entire payload fits
001372 ** on the local page. No overflow is required.
001373 */
001374 pInfo->nSize = nPayload + (u16)(pIter - pCell);
001375 if( pInfo->nSize<4 ) pInfo->nSize = 4;
001376 pInfo->nLocal = (u16)nPayload;
001377 }else{
001378 btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
001379 }
001380 }
001381 static void btreeParseCell(
001382 MemPage *pPage, /* Page containing the cell */
001383 int iCell, /* The cell index. First cell is 0 */
001384 CellInfo *pInfo /* Fill in this structure */
001385 ){
001386 pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo);
001387 }
001388
001389 /*
001390 ** The following routines are implementations of the MemPage.xCellSize
001391 ** method.
001392 **
001393 ** Compute the total number of bytes that a Cell needs in the cell
001394 ** data area of the btree-page. The return number includes the cell
001395 ** data header and the local payload, but not any overflow page or
001396 ** the space used by the cell pointer.
001397 **
001398 ** cellSizePtrNoPayload() => table internal nodes
001399 ** cellSizePtrTableLeaf() => table leaf nodes
001400 ** cellSizePtr() => index internal nodes
001401 ** cellSizeIdxLeaf() => index leaf nodes
001402 */
001403 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
001404 u8 *pIter = pCell + 4; /* For looping over bytes of pCell */
001405 u8 *pEnd; /* End mark for a varint */
001406 u32 nSize; /* Size value to return */
001407
001408 #ifdef SQLITE_DEBUG
001409 /* The value returned by this function should always be the same as
001410 ** the (CellInfo.nSize) value found by doing a full parse of the
001411 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
001412 ** this function verifies that this invariant is not violated. */
001413 CellInfo debuginfo;
001414 pPage->xParseCell(pPage, pCell, &debuginfo);
001415 #endif
001416
001417 assert( pPage->childPtrSize==4 );
001418 nSize = *pIter;
001419 if( nSize>=0x80 ){
001420 pEnd = &pIter[8];
001421 nSize &= 0x7f;
001422 do{
001423 nSize = (nSize<<7) | (*++pIter & 0x7f);
001424 }while( *(pIter)>=0x80 && pIter<pEnd );
001425 }
001426 pIter++;
001427 testcase( nSize==pPage->maxLocal );
001428 testcase( nSize==(u32)pPage->maxLocal+1 );
001429 if( nSize<=pPage->maxLocal ){
001430 nSize += (u32)(pIter - pCell);
001431 assert( nSize>4 );
001432 }else{
001433 int minLocal = pPage->minLocal;
001434 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
001435 testcase( nSize==pPage->maxLocal );
001436 testcase( nSize==(u32)pPage->maxLocal+1 );
001437 if( nSize>pPage->maxLocal ){
001438 nSize = minLocal;
001439 }
001440 nSize += 4 + (u16)(pIter - pCell);
001441 }
001442 assert( nSize==debuginfo.nSize || CORRUPT_DB );
001443 return (u16)nSize;
001444 }
001445 static u16 cellSizePtrIdxLeaf(MemPage *pPage, u8 *pCell){
001446 u8 *pIter = pCell; /* For looping over bytes of pCell */
001447 u8 *pEnd; /* End mark for a varint */
001448 u32 nSize; /* Size value to return */
001449
001450 #ifdef SQLITE_DEBUG
001451 /* The value returned by this function should always be the same as
001452 ** the (CellInfo.nSize) value found by doing a full parse of the
001453 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
001454 ** this function verifies that this invariant is not violated. */
001455 CellInfo debuginfo;
001456 pPage->xParseCell(pPage, pCell, &debuginfo);
001457 #endif
001458
001459 assert( pPage->childPtrSize==0 );
001460 nSize = *pIter;
001461 if( nSize>=0x80 ){
001462 pEnd = &pIter[8];
001463 nSize &= 0x7f;
001464 do{
001465 nSize = (nSize<<7) | (*++pIter & 0x7f);
001466 }while( *(pIter)>=0x80 && pIter<pEnd );
001467 }
001468 pIter++;
001469 testcase( nSize==pPage->maxLocal );
001470 testcase( nSize==(u32)pPage->maxLocal+1 );
001471 if( nSize<=pPage->maxLocal ){
001472 nSize += (u32)(pIter - pCell);
001473 if( nSize<4 ) nSize = 4;
001474 }else{
001475 int minLocal = pPage->minLocal;
001476 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
001477 testcase( nSize==pPage->maxLocal );
001478 testcase( nSize==(u32)pPage->maxLocal+1 );
001479 if( nSize>pPage->maxLocal ){
001480 nSize = minLocal;
001481 }
001482 nSize += 4 + (u16)(pIter - pCell);
001483 }
001484 assert( nSize==debuginfo.nSize || CORRUPT_DB );
001485 return (u16)nSize;
001486 }
001487 static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){
001488 u8 *pIter = pCell + 4; /* For looping over bytes of pCell */
001489 u8 *pEnd; /* End mark for a varint */
001490
001491 #ifdef SQLITE_DEBUG
001492 /* The value returned by this function should always be the same as
001493 ** the (CellInfo.nSize) value found by doing a full parse of the
001494 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
001495 ** this function verifies that this invariant is not violated. */
001496 CellInfo debuginfo;
001497 pPage->xParseCell(pPage, pCell, &debuginfo);
001498 #else
001499 UNUSED_PARAMETER(pPage);
001500 #endif
001501
001502 assert( pPage->childPtrSize==4 );
001503 pEnd = pIter + 9;
001504 while( (*pIter++)&0x80 && pIter<pEnd );
001505 assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB );
001506 return (u16)(pIter - pCell);
001507 }
001508 static u16 cellSizePtrTableLeaf(MemPage *pPage, u8 *pCell){
001509 u8 *pIter = pCell; /* For looping over bytes of pCell */
001510 u8 *pEnd; /* End mark for a varint */
001511 u32 nSize; /* Size value to return */
001512
001513 #ifdef SQLITE_DEBUG
001514 /* The value returned by this function should always be the same as
001515 ** the (CellInfo.nSize) value found by doing a full parse of the
001516 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
001517 ** this function verifies that this invariant is not violated. */
001518 CellInfo debuginfo;
001519 pPage->xParseCell(pPage, pCell, &debuginfo);
001520 #endif
001521
001522 nSize = *pIter;
001523 if( nSize>=0x80 ){
001524 pEnd = &pIter[8];
001525 nSize &= 0x7f;
001526 do{
001527 nSize = (nSize<<7) | (*++pIter & 0x7f);
001528 }while( *(pIter)>=0x80 && pIter<pEnd );
001529 }
001530 pIter++;
001531 /* pIter now points at the 64-bit integer key value, a variable length
001532 ** integer. The following block moves pIter to point at the first byte
001533 ** past the end of the key value. */
001534 if( (*pIter++)&0x80
001535 && (*pIter++)&0x80
001536 && (*pIter++)&0x80
001537 && (*pIter++)&0x80
001538 && (*pIter++)&0x80
001539 && (*pIter++)&0x80
001540 && (*pIter++)&0x80
001541 && (*pIter++)&0x80 ){ pIter++; }
001542 testcase( nSize==pPage->maxLocal );
001543 testcase( nSize==(u32)pPage->maxLocal+1 );
001544 if( nSize<=pPage->maxLocal ){
001545 nSize += (u32)(pIter - pCell);
001546 if( nSize<4 ) nSize = 4;
001547 }else{
001548 int minLocal = pPage->minLocal;
001549 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
001550 testcase( nSize==pPage->maxLocal );
001551 testcase( nSize==(u32)pPage->maxLocal+1 );
001552 if( nSize>pPage->maxLocal ){
001553 nSize = minLocal;
001554 }
001555 nSize += 4 + (u16)(pIter - pCell);
001556 }
001557 assert( nSize==debuginfo.nSize || CORRUPT_DB );
001558 return (u16)nSize;
001559 }
001560
001561
#ifdef SQLITE_DEBUG
/* Index-based variation on the xCellSize methods, used inside of assert()
** statements only:  locate the iCell-th cell of pPage with findCell() and
** return its size as computed by pPage->xCellSize. */
static u16 cellSize(MemPage *pPage, int iCell){
  u8 *pCell = findCell(pPage, iCell);
  return pPage->xCellSize(pPage, pCell);
}
#endif
001569
001570 #ifndef SQLITE_OMIT_AUTOVACUUM
001571 /*
001572 ** The cell pCell is currently part of page pSrc but will ultimately be part
001573 ** of pPage. (pSrc and pPage are often the same.) If pCell contains a
001574 ** pointer to an overflow page, insert an entry into the pointer-map for
001575 ** the overflow page that will be valid after pCell has been moved to pPage.
001576 */
001577 static void ptrmapPutOvflPtr(MemPage *pPage, MemPage *pSrc, u8 *pCell,int *pRC){
001578 CellInfo info;
001579 if( *pRC ) return;
001580 assert( pCell!=0 );
001581 pPage->xParseCell(pPage, pCell, &info);
001582 if( info.nLocal<info.nPayload ){
001583 Pgno ovfl;
001584 if( SQLITE_OVERFLOW(pSrc->aDataEnd, pCell, pCell+info.nLocal) ){
001585 testcase( pSrc!=pPage );
001586 *pRC = SQLITE_CORRUPT_BKPT;
001587 return;
001588 }
001589 ovfl = get4byte(&pCell[info.nSize-4]);
001590 ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
001591 }
001592 }
001593 #endif
001594
001595
/*
** Defragment the page given. This routine reorganizes cells within the
** page so that there are no free-blocks on the free-block list.
**
** Parameter nMaxFrag is the maximum amount of fragmented space that may be
** present in the page after this routine returns.
**
** Two strategies are used.  If the page's fragment count (data[hdr+7]) is
** no greater than nMaxFrag and the free-block list has at most two
** entries, the cell content is slid upward with memmove() and the cell
** pointers are adjusted.  Otherwise the page image is copied to the
** pager's temp space and every cell is re-packed from the end of the
** usable area downward, which also resets the fragment count to zero.
**
** Returns SQLITE_OK on success, or SQLITE_CORRUPT if any offset or size
** read from the page is inconsistent, including the case where the
** resulting free space does not match pPage->nFree.
**
** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a
** b-tree page so that there are no freeblocks or fragment bytes, all
** unused bytes are contained in the unallocated space region, and all
** cells are packed tightly at the end of the page.
*/
static int defragmentPage(MemPage *pPage, int nMaxFrag){
  int i;                     /* Loop counter */
  int pc;                    /* Address of the i-th cell */
  int hdr;                   /* Offset to the page header */
  int size;                  /* Size of a cell */
  int usableSize;            /* Number of usable bytes on a page */
  int cellOffset;            /* Offset to the cell pointer array */
  int cbrk;                  /* Offset to the cell content area */
  int nCell;                 /* Number of cells on the page */
  unsigned char *data;       /* The page data */
  unsigned char *temp;       /* Temp area for cell content */
  unsigned char *src;        /* Source of content */
  int iCellFirst;            /* First allowable cell index */
  int iCellLast;             /* Last possible cell index */
  int iCellStart;            /* First cell offset in input */

  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( pPage->pBt!=0 );
  assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
  assert( pPage->nOverflow==0 );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  data = pPage->aData;
  hdr = pPage->hdrOffset;
  cellOffset = pPage->cellOffset;
  nCell = pPage->nCell;
  assert( nCell==get2byte(&data[hdr+3]) || CORRUPT_DB );
  /* First byte after the cell pointer array */
  iCellFirst = cellOffset + 2*nCell;
  usableSize = pPage->pBt->usableSize;

  /* This block handles pages with two or fewer free blocks and nMaxFrag
  ** or fewer fragmented bytes. In this case it is faster to move the
  ** two (or one) blocks of cells using memmove() and add the required
  ** offsets to each pointer in the cell-pointer array than it is to
  ** reconstruct the entire page.  */
  if( (int)data[hdr+7]<=nMaxFrag ){
    int iFree = get2byte(&data[hdr+1]);   /* Offset of first free block */
    /* The 4 bytes read from each free block below (next-block offset and
    ** size) must lie within the usable area. */
    if( iFree>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage);
    if( iFree ){
      int iFree2 = get2byte(&data[iFree]);   /* Offset of second free block */
      if( iFree2>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage);
      /* Proceed only if there is no second free block, or the second
      ** free block's own next-block offset is zero (no third block). */
      if( 0==iFree2 || (data[iFree2]==0 && data[iFree2+1]==0) ){
        u8 *pEnd = &data[cellOffset + nCell*2];   /* End of cell pointers */
        u8 *pAddr;                                /* Current cell pointer */
        int sz2 = 0;                              /* Size of second block */
        int sz = get2byte(&data[iFree+2]);        /* Size of first block */
        int top = get2byte(&data[hdr+5]);         /* Start of cell content */
        if( top>=iFree ){
          return SQLITE_CORRUPT_PAGE(pPage);
        }
        if( iFree2 ){
          /* The two free blocks must not overlap, and the second must
          ** end within the usable area. */
          if( iFree+sz>iFree2 ) return SQLITE_CORRUPT_PAGE(pPage);
          sz2 = get2byte(&data[iFree2+2]);
          if( iFree2+sz2 > usableSize ) return SQLITE_CORRUPT_PAGE(pPage);
          /* Slide the content lying between the two free blocks upward
          ** by sz2 bytes, merging the two gaps into one of size sz+sz2. */
          memmove(&data[iFree+sz+sz2], &data[iFree+sz], iFree2-(iFree+sz));
          sz += sz2;
        }else if( iFree+sz>usableSize ){
          return SQLITE_CORRUPT_PAGE(pPage);
        }

        /* Slide the content lying below the (merged) gap upward by sz
        ** bytes; cbrk becomes the new start of the cell content area. */
        cbrk = top+sz;
        assert( cbrk+(iFree-top) <= usableSize );
        memmove(&data[cbrk], &data[top], iFree-top);
        /* Fix up cell pointers: cells that were below the first free
        ** block moved by sz, cells between the two blocks moved by sz2. */
        for(pAddr=&data[cellOffset]; pAddr<pEnd; pAddr+=2){
          pc = get2byte(pAddr);
          if( pc<iFree ){ put2byte(pAddr, pc+sz); }
          else if( pc<iFree2 ){ put2byte(pAddr, pc+sz2); }
        }
        goto defragment_out;
      }
    }
  }

  /* General case: rebuild the cell content area from a copy of the page,
  ** packing cells downward from the end of the usable area. */
  cbrk = usableSize;
  iCellLast = usableSize - 4;
  iCellStart = get2byte(&data[hdr+5]);
  if( nCell>0 ){
    temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
    memcpy(temp, data, usableSize);
    src = temp;
    for(i=0; i<nCell; i++){
      u8 *pAddr;     /* The i-th cell pointer */
      pAddr = &data[cellOffset + i*2];
      pc = get2byte(pAddr);
      testcase( pc==iCellFirst );
      testcase( pc==iCellLast );
      /* These conditions have already been verified in btreeInitPage()
      ** if PRAGMA cell_size_check=ON.
      */
      if( pc>iCellLast ){
        return SQLITE_CORRUPT_PAGE(pPage);
      }
      assert( pc>=0 && pc<=iCellLast );
      /* Cell sizes are measured on the copy, which is not modified */
      size = pPage->xCellSize(pPage, &src[pc]);
      cbrk -= size;
      if( cbrk<iCellStart || pc+size>usableSize ){
        return SQLITE_CORRUPT_PAGE(pPage);
      }
      assert( cbrk+size<=usableSize && cbrk>=iCellStart );
      testcase( cbrk+size==usableSize );
      testcase( pc+size==usableSize );
      put2byte(pAddr, cbrk);
      memcpy(&data[cbrk], &src[pc], size);
    }
  }
  data[hdr+7] = 0;

 defragment_out:
  assert( pPage->nFree>=0 );
  /* Fragment bytes plus the gap between the cell pointer array and the
  ** cell content must account for exactly pPage->nFree bytes. */
  if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){
    return SQLITE_CORRUPT_PAGE(pPage);
  }
  assert( cbrk>=iCellFirst );
  /* Record the new content start, clear the first-freeblock offset, and
  ** zero the unallocated gap. */
  put2byte(&data[hdr+5], cbrk);
  data[hdr+1] = 0;
  data[hdr+2] = 0;
  memset(&data[iCellFirst], 0, cbrk-iCellFirst);
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  return SQLITE_OK;
}
001727
001728 /*
001729 ** Search the free-list on page pPg for space to store a cell nByte bytes in
001730 ** size. If one can be found, return a pointer to the space and remove it
001731 ** from the free-list.
001732 **
001733 ** If no suitable space can be found on the free-list, return NULL.
001734 **
001735 ** This function may detect corruption within pPg. If corruption is
001736 ** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned.
001737 **
001738 ** Slots on the free list that are between 1 and 3 bytes larger than nByte
001739 ** will be ignored if adding the extra space to the fragmentation count
001740 ** causes the fragmentation count to exceed 60.
001741 */
001742 static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){
001743 const int hdr = pPg->hdrOffset; /* Offset to page header */
001744 u8 * const aData = pPg->aData; /* Page data */
001745 int iAddr = hdr + 1; /* Address of ptr to pc */
001746 u8 *pTmp = &aData[iAddr]; /* Temporary ptr into aData[] */
001747 int pc = get2byte(pTmp); /* Address of a free slot */
001748 int x; /* Excess size of the slot */
001749 int maxPC = pPg->pBt->usableSize - nByte; /* Max address for a usable slot */
001750 int size; /* Size of the free slot */
001751
001752 assert( pc>0 );
001753 while( pc<=maxPC ){
001754 /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each
001755 ** freeblock form a big-endian integer which is the size of the freeblock
001756 ** in bytes, including the 4-byte header. */
001757 pTmp = &aData[pc+2];
001758 size = get2byte(pTmp);
001759 if( (x = size - nByte)>=0 ){
001760 testcase( x==4 );
001761 testcase( x==3 );
001762 if( x<4 ){
001763 /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total
001764 ** number of bytes in fragments may not exceed 60. */
001765 if( aData[hdr+7]>57 ) return 0;
001766
001767 /* Remove the slot from the free-list. Update the number of
001768 ** fragmented bytes within the page. */
001769 memcpy(&aData[iAddr], &aData[pc], 2);
001770 aData[hdr+7] += (u8)x;
001771 return &aData[pc];
001772 }else if( x+pc > maxPC ){
001773 /* This slot extends off the end of the usable part of the page */
001774 *pRc = SQLITE_CORRUPT_PAGE(pPg);
001775 return 0;
001776 }else{
001777 /* The slot remains on the free-list. Reduce its size to account
001778 ** for the portion used by the new allocation. */
001779 put2byte(&aData[pc+2], x);
001780 }
001781 return &aData[pc + x];
001782 }
001783 iAddr = pc;
001784 pTmp = &aData[pc];
001785 pc = get2byte(pTmp);
001786 if( pc<=iAddr ){
001787 if( pc ){
001788 /* The next slot in the chain comes before the current slot */
001789 *pRc = SQLITE_CORRUPT_PAGE(pPg);
001790 }
001791 return 0;
001792 }
001793 }
001794 if( pc>maxPC+nByte-4 ){
001795 /* The free slot chain extends off the end of the page */
001796 *pRc = SQLITE_CORRUPT_PAGE(pPg);
001797 }
001798 return 0;
001799 }
001800
001801 /*
001802 ** Allocate nByte bytes of space from within the B-Tree page passed
001803 ** as the first argument. Write into *pIdx the index into pPage->aData[]
001804 ** of the first byte of allocated space. Return either SQLITE_OK or
001805 ** an error code (usually SQLITE_CORRUPT).
001806 **
001807 ** The caller guarantees that there is sufficient space to make the
001808 ** allocation. This routine might need to defragment in order to bring
001809 ** all the space together, however. This routine will avoid using
001810 ** the first two bytes past the cell pointer area since presumably this
001811 ** allocation is being made in order to insert a new cell, so we will
001812 ** also end up needing a new cell pointer.
001813 */
001814 static SQLITE_INLINE int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
001815 const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */
001816 u8 * const data = pPage->aData; /* Local cache of pPage->aData */
001817 int top; /* First byte of cell content area */
001818 int rc = SQLITE_OK; /* Integer return code */
001819 u8 *pTmp; /* Temp ptr into data[] */
001820 int gap; /* First byte of gap between cell pointers and cell content */
001821
001822 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
001823 assert( pPage->pBt );
001824 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001825 assert( nByte>=0 ); /* Minimum cell size is 4 */
001826 assert( pPage->nFree>=nByte );
001827 assert( pPage->nOverflow==0 );
001828 assert( nByte < (int)(pPage->pBt->usableSize-8) );
001829
001830 assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
001831 gap = pPage->cellOffset + 2*pPage->nCell;
001832 assert( gap<=65536 );
001833 /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size
001834 ** and the reserved space is zero (the usual value for reserved space)
001835 ** then the cell content offset of an empty page wants to be 65536.
001836 ** However, that integer is too large to be stored in a 2-byte unsigned
001837 ** integer, so a value of 0 is used in its place. */
001838 pTmp = &data[hdr+5];
001839 top = get2byte(pTmp);
001840 if( gap>top ){
001841 if( top==0 && pPage->pBt->usableSize==65536 ){
001842 top = 65536;
001843 }else{
001844 return SQLITE_CORRUPT_PAGE(pPage);
001845 }
001846 }else if( top>(int)pPage->pBt->usableSize ){
001847 return SQLITE_CORRUPT_PAGE(pPage);
001848 }
001849
001850 /* If there is enough space between gap and top for one more cell pointer,
001851 ** and if the freelist is not empty, then search the
001852 ** freelist looking for a slot big enough to satisfy the request.
001853 */
001854 testcase( gap+2==top );
001855 testcase( gap+1==top );
001856 testcase( gap==top );
001857 if( (data[hdr+2] || data[hdr+1]) && gap+2<=top ){
001858 u8 *pSpace = pageFindSlot(pPage, nByte, &rc);
001859 if( pSpace ){
001860 int g2;
001861 assert( pSpace+nByte<=data+pPage->pBt->usableSize );
001862 *pIdx = g2 = (int)(pSpace-data);
001863 if( g2<=gap ){
001864 return SQLITE_CORRUPT_PAGE(pPage);
001865 }else{
001866 return SQLITE_OK;
001867 }
001868 }else if( rc ){
001869 return rc;
001870 }
001871 }
001872
001873 /* The request could not be fulfilled using a freelist slot. Check
001874 ** to see if defragmentation is necessary.
001875 */
001876 testcase( gap+2+nByte==top );
001877 if( gap+2+nByte>top ){
001878 assert( pPage->nCell>0 || CORRUPT_DB );
001879 assert( pPage->nFree>=0 );
001880 rc = defragmentPage(pPage, MIN(4, pPage->nFree - (2+nByte)));
001881 if( rc ) return rc;
001882 top = get2byteNotZero(&data[hdr+5]);
001883 assert( gap+2+nByte<=top );
001884 }
001885
001886
001887 /* Allocate memory from the gap in between the cell pointer array
001888 ** and the cell content area. The btreeComputeFreeSpace() call has already
001889 ** validated the freelist. Given that the freelist is valid, there
001890 ** is no way that the allocation can extend off the end of the page.
001891 ** The assert() below verifies the previous sentence.
001892 */
001893 top -= nByte;
001894 put2byte(&data[hdr+5], top);
001895 assert( top+nByte <= (int)pPage->pBt->usableSize );
001896 *pIdx = top;
001897 return SQLITE_OK;
001898 }
001899
001900 /*
001901 ** Return a section of the pPage->aData to the freelist.
001902 ** The first byte of the new free block is pPage->aData[iStart]
001903 ** and the size of the block is iSize bytes.
001904 **
001905 ** Adjacent freeblocks are coalesced.
001906 **
001907 ** Even though the freeblock list was checked by btreeComputeFreeSpace(),
001908 ** that routine will not detect overlap between cells or freeblocks. Nor
001909 ** does it detect cells or freeblocks that encroach into the reserved bytes
001910 ** at the end of the page. So do additional corruption checks inside this
001911 ** routine and return SQLITE_CORRUPT if any problems are found.
001912 */
001913 static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){
001914 u16 iPtr; /* Address of ptr to next freeblock */
001915 u16 iFreeBlk; /* Address of the next freeblock */
001916 u8 hdr; /* Page header size. 0 or 100 */
001917 u8 nFrag = 0; /* Reduction in fragmentation */
001918 u16 iOrigSize = iSize; /* Original value of iSize */
001919 u16 x; /* Offset to cell content area */
001920 u32 iEnd = iStart + iSize; /* First byte past the iStart buffer */
001921 unsigned char *data = pPage->aData; /* Page content */
001922 u8 *pTmp; /* Temporary ptr into data[] */
001923
001924 assert( pPage->pBt!=0 );
001925 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
001926 assert( CORRUPT_DB || iStart>=pPage->hdrOffset+6+pPage->childPtrSize );
001927 assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize );
001928 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001929 assert( iSize>=4 ); /* Minimum cell size is 4 */
001930 assert( CORRUPT_DB || iStart<=pPage->pBt->usableSize-4 );
001931
001932 /* The list of freeblocks must be in ascending order. Find the
001933 ** spot on the list where iStart should be inserted.
001934 */
001935 hdr = pPage->hdrOffset;
001936 iPtr = hdr + 1;
001937 if( data[iPtr+1]==0 && data[iPtr]==0 ){
001938 iFreeBlk = 0; /* Shortcut for the case when the freelist is empty */
001939 }else{
001940 while( (iFreeBlk = get2byte(&data[iPtr]))<iStart ){
001941 if( iFreeBlk<=iPtr ){
001942 if( iFreeBlk==0 ) break; /* TH3: corrupt082.100 */
001943 return SQLITE_CORRUPT_PAGE(pPage);
001944 }
001945 iPtr = iFreeBlk;
001946 }
001947 if( iFreeBlk>pPage->pBt->usableSize-4 ){ /* TH3: corrupt081.100 */
001948 return SQLITE_CORRUPT_PAGE(pPage);
001949 }
001950 assert( iFreeBlk>iPtr || iFreeBlk==0 || CORRUPT_DB );
001951
001952 /* At this point:
001953 ** iFreeBlk: First freeblock after iStart, or zero if none
001954 ** iPtr: The address of a pointer to iFreeBlk
001955 **
001956 ** Check to see if iFreeBlk should be coalesced onto the end of iStart.
001957 */
001958 if( iFreeBlk && iEnd+3>=iFreeBlk ){
001959 nFrag = iFreeBlk - iEnd;
001960 if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_PAGE(pPage);
001961 iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);
001962 if( iEnd > pPage->pBt->usableSize ){
001963 return SQLITE_CORRUPT_PAGE(pPage);
001964 }
001965 iSize = iEnd - iStart;
001966 iFreeBlk = get2byte(&data[iFreeBlk]);
001967 }
001968
001969 /* If iPtr is another freeblock (that is, if iPtr is not the freelist
001970 ** pointer in the page header) then check to see if iStart should be
001971 ** coalesced onto the end of iPtr.
001972 */
001973 if( iPtr>hdr+1 ){
001974 int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);
001975 if( iPtrEnd+3>=iStart ){
001976 if( iPtrEnd>iStart ) return SQLITE_CORRUPT_PAGE(pPage);
001977 nFrag += iStart - iPtrEnd;
001978 iSize = iEnd - iPtr;
001979 iStart = iPtr;
001980 }
001981 }
001982 if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_PAGE(pPage);
001983 data[hdr+7] -= nFrag;
001984 }
001985 pTmp = &data[hdr+5];
001986 x = get2byte(pTmp);
001987 if( pPage->pBt->btsFlags & BTS_FAST_SECURE ){
001988 /* Overwrite deleted information with zeros when the secure_delete
001989 ** option is enabled */
001990 memset(&data[iStart], 0, iSize);
001991 }
001992 if( iStart<=x ){
001993 /* The new freeblock is at the beginning of the cell content area,
001994 ** so just extend the cell content area rather than create another
001995 ** freelist entry */
001996 if( iStart<x ) return SQLITE_CORRUPT_PAGE(pPage);
001997 if( iPtr!=hdr+1 ) return SQLITE_CORRUPT_PAGE(pPage);
001998 put2byte(&data[hdr+1], iFreeBlk);
001999 put2byte(&data[hdr+5], iEnd);
002000 }else{
002001 /* Insert the new freeblock into the freelist */
002002 put2byte(&data[iPtr], iStart);
002003 put2byte(&data[iStart], iFreeBlk);
002004 put2byte(&data[iStart+2], iSize);
002005 }
002006 pPage->nFree += iOrigSize;
002007 return SQLITE_OK;
002008 }
002009
002010 /*
002011 ** Decode the flags byte (the first byte of the header) for a page
002012 ** and initialize fields of the MemPage structure accordingly.
002013 **
002014 ** Only the following combinations are supported. Anything different
002015 ** indicates a corrupt database file:
002016 **
002017 ** PTF_ZERODATA (0x02, 2)
002018 ** PTF_LEAFDATA | PTF_INTKEY (0x05, 5)
002019 ** PTF_ZERODATA | PTF_LEAF (0x0a, 10)
002020 ** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF (0x0d, 13)
002021 */
002022 static int decodeFlags(MemPage *pPage, int flagByte){
002023 BtShared *pBt; /* A copy of pPage->pBt */
002024
002025 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
002026 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
002027 pBt = pPage->pBt;
002028 pPage->max1bytePayload = pBt->max1bytePayload;
002029 if( flagByte>=(PTF_ZERODATA | PTF_LEAF) ){
002030 pPage->childPtrSize = 0;
002031 pPage->leaf = 1;
002032 if( flagByte==(PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF) ){
002033 pPage->intKeyLeaf = 1;
002034 pPage->xCellSize = cellSizePtrTableLeaf;
002035 pPage->xParseCell = btreeParseCellPtr;
002036 pPage->intKey = 1;
002037 pPage->maxLocal = pBt->maxLeaf;
002038 pPage->minLocal = pBt->minLeaf;
002039 }else if( flagByte==(PTF_ZERODATA | PTF_LEAF) ){
002040 pPage->intKey = 0;
002041 pPage->intKeyLeaf = 0;
002042 pPage->xCellSize = cellSizePtrIdxLeaf;
002043 pPage->xParseCell = btreeParseCellPtrIndex;
002044 pPage->maxLocal = pBt->maxLocal;
002045 pPage->minLocal = pBt->minLocal;
002046 }else{
002047 pPage->intKey = 0;
002048 pPage->intKeyLeaf = 0;
002049 pPage->xCellSize = cellSizePtrIdxLeaf;
002050 pPage->xParseCell = btreeParseCellPtrIndex;
002051 return SQLITE_CORRUPT_PAGE(pPage);
002052 }
002053 }else{
002054 pPage->childPtrSize = 4;
002055 pPage->leaf = 0;
002056 if( flagByte==(PTF_ZERODATA) ){
002057 pPage->intKey = 0;
002058 pPage->intKeyLeaf = 0;
002059 pPage->xCellSize = cellSizePtr;
002060 pPage->xParseCell = btreeParseCellPtrIndex;
002061 pPage->maxLocal = pBt->maxLocal;
002062 pPage->minLocal = pBt->minLocal;
002063 }else if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
002064 pPage->intKeyLeaf = 0;
002065 pPage->xCellSize = cellSizePtrNoPayload;
002066 pPage->xParseCell = btreeParseCellPtrNoPayload;
002067 pPage->intKey = 1;
002068 pPage->maxLocal = pBt->maxLeaf;
002069 pPage->minLocal = pBt->minLeaf;
002070 }else{
002071 pPage->intKey = 0;
002072 pPage->intKeyLeaf = 0;
002073 pPage->xCellSize = cellSizePtr;
002074 pPage->xParseCell = btreeParseCellPtrIndex;
002075 return SQLITE_CORRUPT_PAGE(pPage);
002076 }
002077 }
002078 return SQLITE_OK;
002079 }
002080
002081 /*
002082 ** Compute the amount of freespace on the page. In other words, fill
002083 ** in the pPage->nFree field.
002084 */
002085 static int btreeComputeFreeSpace(MemPage *pPage){
002086 int pc; /* Address of a freeblock within pPage->aData[] */
002087 u8 hdr; /* Offset to beginning of page header */
002088 u8 *data; /* Equal to pPage->aData */
002089 int usableSize; /* Amount of usable space on each page */
002090 int nFree; /* Number of unused bytes on the page */
002091 int top; /* First byte of the cell content area */
002092 int iCellFirst; /* First allowable cell or freeblock offset */
002093 int iCellLast; /* Last possible cell or freeblock offset */
002094
002095 assert( pPage->pBt!=0 );
002096 assert( pPage->pBt->db!=0 );
002097 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
002098 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
002099 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
002100 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
002101 assert( pPage->isInit==1 );
002102 assert( pPage->nFree<0 );
002103
002104 usableSize = pPage->pBt->usableSize;
002105 hdr = pPage->hdrOffset;
002106 data = pPage->aData;
002107 /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates
002108 ** the start of the cell content area. A zero value for this integer is
002109 ** interpreted as 65536. */
002110 top = get2byteNotZero(&data[hdr+5]);
002111 iCellFirst = hdr + 8 + pPage->childPtrSize + 2*pPage->nCell;
002112 iCellLast = usableSize - 4;
002113
002114 /* Compute the total free space on the page
002115 ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the
002116 ** start of the first freeblock on the page, or is zero if there are no
002117 ** freeblocks. */
002118 pc = get2byte(&data[hdr+1]);
002119 nFree = data[hdr+7] + top; /* Init nFree to non-freeblock free space */
002120 if( pc>0 ){
002121 u32 next, size;
002122 if( pc<top ){
002123 /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will
002124 ** always be at least one cell before the first freeblock.
002125 */
002126 return SQLITE_CORRUPT_PAGE(pPage);
002127 }
002128 while( 1 ){
002129 if( pc>iCellLast ){
002130 /* Freeblock off the end of the page */
002131 return SQLITE_CORRUPT_PAGE(pPage);
002132 }
002133 next = get2byte(&data[pc]);
002134 size = get2byte(&data[pc+2]);
002135 nFree = nFree + size;
002136 if( next<=pc+size+3 ) break;
002137 pc = next;
002138 }
002139 if( next>0 ){
002140 /* Freeblock not in ascending order */
002141 return SQLITE_CORRUPT_PAGE(pPage);
002142 }
002143 if( pc+size>(unsigned int)usableSize ){
002144 /* Last freeblock extends past page end */
002145 return SQLITE_CORRUPT_PAGE(pPage);
002146 }
002147 }
002148
002149 /* At this point, nFree contains the sum of the offset to the start
002150 ** of the cell-content area plus the number of free bytes within
002151 ** the cell-content area. If this is greater than the usable-size
002152 ** of the page, then the page must be corrupted. This check also
002153 ** serves to verify that the offset to the start of the cell-content
002154 ** area, according to the page header, lies within the page.
002155 */
002156 if( nFree>usableSize || nFree<iCellFirst ){
002157 return SQLITE_CORRUPT_PAGE(pPage);
002158 }
002159 pPage->nFree = (u16)(nFree - iCellFirst);
002160 return SQLITE_OK;
002161 }
002162
002163 /*
002164 ** Do additional sanity check after btreeInitPage() if
002165 ** PRAGMA cell_size_check=ON
002166 */
002167 static SQLITE_NOINLINE int btreeCellSizeCheck(MemPage *pPage){
002168 int iCellFirst; /* First allowable cell or freeblock offset */
002169 int iCellLast; /* Last possible cell or freeblock offset */
002170 int i; /* Index into the cell pointer array */
002171 int sz; /* Size of a cell */
002172 int pc; /* Address of a freeblock within pPage->aData[] */
002173 u8 *data; /* Equal to pPage->aData */
002174 int usableSize; /* Maximum usable space on the page */
002175 int cellOffset; /* Start of cell content area */
002176
002177 iCellFirst = pPage->cellOffset + 2*pPage->nCell;
002178 usableSize = pPage->pBt->usableSize;
002179 iCellLast = usableSize - 4;
002180 data = pPage->aData;
002181 cellOffset = pPage->cellOffset;
002182 if( !pPage->leaf ) iCellLast--;
002183 for(i=0; i<pPage->nCell; i++){
002184 pc = get2byteAligned(&data[cellOffset+i*2]);
002185 testcase( pc==iCellFirst );
002186 testcase( pc==iCellLast );
002187 if( pc<iCellFirst || pc>iCellLast ){
002188 return SQLITE_CORRUPT_PAGE(pPage);
002189 }
002190 sz = pPage->xCellSize(pPage, &data[pc]);
002191 testcase( pc+sz==usableSize );
002192 if( pc+sz>usableSize ){
002193 return SQLITE_CORRUPT_PAGE(pPage);
002194 }
002195 }
002196 return SQLITE_OK;
002197 }
002198
002199 /*
002200 ** Initialize the auxiliary information for a disk block.
002201 **
002202 ** Return SQLITE_OK on success. If we see that the page does
002203 ** not contain a well-formed database page, then return
002204 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
002205 ** guarantee that the page is well-formed. It only shows that
002206 ** we failed to detect any corruption.
002207 */
002208 static int btreeInitPage(MemPage *pPage){
002209 u8 *data; /* Equal to pPage->aData */
002210 BtShared *pBt; /* The main btree structure */
002211
002212 assert( pPage->pBt!=0 );
002213 assert( pPage->pBt->db!=0 );
002214 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
002215 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
002216 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
002217 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
002218 assert( pPage->isInit==0 );
002219
002220 pBt = pPage->pBt;
002221 data = pPage->aData + pPage->hdrOffset;
002222 /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating
002223 ** the b-tree page type. */
002224 if( decodeFlags(pPage, data[0]) ){
002225 return SQLITE_CORRUPT_PAGE(pPage);
002226 }
002227 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
002228 pPage->maskPage = (u16)(pBt->pageSize - 1);
002229 pPage->nOverflow = 0;
002230 pPage->cellOffset = pPage->hdrOffset + 8 + pPage->childPtrSize;
002231 pPage->aCellIdx = data + pPage->childPtrSize + 8;
002232 pPage->aDataEnd = pPage->aData + pBt->pageSize;
002233 pPage->aDataOfst = pPage->aData + pPage->childPtrSize;
002234 /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
002235 ** number of cells on the page. */
002236 pPage->nCell = get2byte(&data[3]);
002237 if( pPage->nCell>MX_CELL(pBt) ){
002238 /* To many cells for a single page. The page must be corrupt */
002239 return SQLITE_CORRUPT_PAGE(pPage);
002240 }
002241 testcase( pPage->nCell==MX_CELL(pBt) );
002242 /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only
002243 ** possible for a root page of a table that contains no rows) then the
002244 ** offset to the cell content area will equal the page size minus the
002245 ** bytes of reserved space. */
002246 assert( pPage->nCell>0
002247 || get2byteNotZero(&data[5])==(int)pBt->usableSize
002248 || CORRUPT_DB );
002249 pPage->nFree = -1; /* Indicate that this value is yet uncomputed */
002250 pPage->isInit = 1;
002251 if( pBt->db->flags & SQLITE_CellSizeCk ){
002252 return btreeCellSizeCheck(pPage);
002253 }
002254 return SQLITE_OK;
002255 }
002256
002257 /*
002258 ** Set up a raw page so that it looks like a database page holding
002259 ** no entries.
002260 */
002261 static void zeroPage(MemPage *pPage, int flags){
002262 unsigned char *data = pPage->aData;
002263 BtShared *pBt = pPage->pBt;
002264 u8 hdr = pPage->hdrOffset;
002265 u16 first;
002266
002267 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno || CORRUPT_DB );
002268 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
002269 assert( sqlite3PagerGetData(pPage->pDbPage) == data );
002270 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
002271 assert( sqlite3_mutex_held(pBt->mutex) );
002272 if( pBt->btsFlags & BTS_FAST_SECURE ){
002273 memset(&data[hdr], 0, pBt->usableSize - hdr);
002274 }
002275 data[hdr] = (char)flags;
002276 first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8);
002277 memset(&data[hdr+1], 0, 4);
002278 data[hdr+7] = 0;
002279 put2byte(&data[hdr+5], pBt->usableSize);
002280 pPage->nFree = (u16)(pBt->usableSize - first);
002281 decodeFlags(pPage, flags);
002282 pPage->cellOffset = first;
002283 pPage->aDataEnd = &data[pBt->pageSize];
002284 pPage->aCellIdx = &data[first];
002285 pPage->aDataOfst = &data[pPage->childPtrSize];
002286 pPage->nOverflow = 0;
002287 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
002288 pPage->maskPage = (u16)(pBt->pageSize - 1);
002289 pPage->nCell = 0;
002290 pPage->isInit = 1;
002291 }
002292
002293
002294 /*
002295 ** Convert a DbPage obtained from the pager into a MemPage used by
002296 ** the btree layer.
002297 */
002298 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
002299 MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
002300 if( pgno!=pPage->pgno ){
002301 pPage->aData = sqlite3PagerGetData(pDbPage);
002302 pPage->pDbPage = pDbPage;
002303 pPage->pBt = pBt;
002304 pPage->pgno = pgno;
002305 pPage->hdrOffset = pgno==1 ? 100 : 0;
002306 }
002307 assert( pPage->aData==sqlite3PagerGetData(pDbPage) );
002308 return pPage;
002309 }
002310
002311 /*
002312 ** Get a page from the pager. Initialize the MemPage.pBt and
002313 ** MemPage.aData elements if needed. See also: btreeGetUnusedPage().
002314 **
002315 ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care
002316 ** about the content of the page at this time. So do not go to the disk
002317 ** to fetch the content. Just fill in the content with zeros for now.
002318 ** If in the future we call sqlite3PagerWrite() on this page, that
002319 ** means we have started to be concerned about content and the disk
002320 ** read should occur at that point.
002321 */
002322 static int btreeGetPage(
002323 BtShared *pBt, /* The btree */
002324 Pgno pgno, /* Number of the page to fetch */
002325 MemPage **ppPage, /* Return the page in this parameter */
002326 int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
002327 ){
002328 int rc;
002329 DbPage *pDbPage;
002330
002331 assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY );
002332 assert( sqlite3_mutex_held(pBt->mutex) );
002333 rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);
002334 if( rc ) return rc;
002335 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
002336 return SQLITE_OK;
002337 }
002338
002339 /*
002340 ** Retrieve a page from the pager cache. If the requested page is not
002341 ** already in the pager cache return NULL. Initialize the MemPage.pBt and
002342 ** MemPage.aData elements if needed.
002343 */
002344 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
002345 DbPage *pDbPage;
002346 assert( sqlite3_mutex_held(pBt->mutex) );
002347 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
002348 if( pDbPage ){
002349 return btreePageFromDbPage(pDbPage, pgno, pBt);
002350 }
002351 return 0;
002352 }
002353
002354 /*
002355 ** Return the size of the database file in pages. The value returned is
002356 ** the page count held in BtShared.nPage; no error can be reported here.
002357 */
002358 static Pgno btreePagecount(BtShared *pBt){
002359 return pBt->nPage;
002360 }
002361 Pgno sqlite3BtreeLastPage(Btree *p){
002362 assert( sqlite3BtreeHoldsMutex(p) );
002363 return btreePagecount(p->pBt);
002364 }
002365
002366 /*
002367 ** Get a page from the pager and initialize it.
002368 */
002369 static int getAndInitPage(
002370 BtShared *pBt, /* The database file */
002371 Pgno pgno, /* Number of the page to get */
002372 MemPage **ppPage, /* Write the page pointer here */
002373 int bReadOnly /* True for a read-only page */
002374 ){
002375 int rc;
002376 DbPage *pDbPage;
002377 MemPage *pPage;
002378 assert( sqlite3_mutex_held(pBt->mutex) );
002379
002380 if( pgno>btreePagecount(pBt) ){
002381 *ppPage = 0;
002382 return SQLITE_CORRUPT_BKPT;
002383 }
002384 rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly);
002385 if( rc ){
002386 *ppPage = 0;
002387 return rc;
002388 }
002389 pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
002390 if( pPage->isInit==0 ){
002391 btreePageFromDbPage(pDbPage, pgno, pBt);
002392 rc = btreeInitPage(pPage);
002393 if( rc!=SQLITE_OK ){
002394 releasePage(pPage);
002395 *ppPage = 0;
002396 return rc;
002397 }
002398 }
002399 assert( pPage->pgno==pgno || CORRUPT_DB );
002400 assert( pPage->aData==sqlite3PagerGetData(pDbPage) );
002401 *ppPage = pPage;
002402 return SQLITE_OK;
002403 }
002404
002405 /*
002406 ** Release a MemPage. This should be called once for each prior
002407 ** call to btreeGetPage.
002408 **
002409 ** Page1 is a special case and must be released using releasePageOne().
002410 */
002411 static void releasePageNotNull(MemPage *pPage){
002412 assert( pPage->aData );
002413 assert( pPage->pBt );
002414 assert( pPage->pDbPage!=0 );
002415 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
002416 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
002417 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
002418 sqlite3PagerUnrefNotNull(pPage->pDbPage);
002419 }
002420 static void releasePage(MemPage *pPage){
002421 if( pPage ) releasePageNotNull(pPage);
002422 }
002423 static void releasePageOne(MemPage *pPage){
002424 assert( pPage!=0 );
002425 assert( pPage->aData );
002426 assert( pPage->pBt );
002427 assert( pPage->pDbPage!=0 );
002428 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
002429 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
002430 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
002431 sqlite3PagerUnrefPageOne(pPage->pDbPage);
002432 }
002433
002434 /*
002435 ** Get an unused page.
002436 **
002437 ** This works just like btreeGetPage() with the addition:
002438 **
002439 ** * If the page is already in use for some other purpose, immediately
002440 ** release it and return an SQLITE_CORRUPT error.
002441 ** * Make sure the isInit flag is clear
002442 */
002443 static int btreeGetUnusedPage(
002444 BtShared *pBt, /* The btree */
002445 Pgno pgno, /* Number of the page to fetch */
002446 MemPage **ppPage, /* Return the page in this parameter */
002447 int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
002448 ){
002449 int rc = btreeGetPage(pBt, pgno, ppPage, flags);
002450 if( rc==SQLITE_OK ){
002451 if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
002452 releasePage(*ppPage);
002453 *ppPage = 0;
002454 return SQLITE_CORRUPT_BKPT;
002455 }
002456 (*ppPage)->isInit = 0;
002457 }else{
002458 *ppPage = 0;
002459 }
002460 return rc;
002461 }
002462
002463
002464 /*
002465 ** During a rollback, when the pager reloads information into the cache
002466 ** so that the cache is restored to its original state at the start of
002467 ** the transaction, for each page restored this routine is called.
002468 **
002469 ** This routine needs to reset the extra data section at the end of the
002470 ** page to agree with the restored data.
002471 */
002472 static void pageReinit(DbPage *pData){
002473 MemPage *pPage;
002474 pPage = (MemPage *)sqlite3PagerGetExtra(pData);
002475 assert( sqlite3PagerPageRefcount(pData)>0 );
002476 if( pPage->isInit ){
002477 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
002478 pPage->isInit = 0;
002479 if( sqlite3PagerPageRefcount(pData)>1 ){
002480 /* pPage might not be a btree page; it might be an overflow page
002481 ** or ptrmap page or a free page. In those cases, the following
002482 ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
002483 ** But no harm is done by this. And it is very important that
002484 ** btreeInitPage() be called on every btree page so we make
002485 ** the call for every page that comes in for re-initializing. */
002486 btreeInitPage(pPage);
002487 }
002488 }
002489 }
002490
002491 /*
002492 ** Invoke the busy handler for a btree.
002493 */
002494 static int btreeInvokeBusyHandler(void *pArg){
002495 BtShared *pBt = (BtShared*)pArg;
002496 assert( pBt->db );
002497 assert( sqlite3_mutex_held(pBt->db->mutex) );
002498 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
002499 }
002500
002501 /*
002502 ** Open a database file.
002503 **
002504 ** zFilename is the name of the database file. If zFilename is NULL
002505 ** then an ephemeral database is created. The ephemeral database might
002506 ** be exclusively in memory, or it might use a disk-based memory cache.
002507 ** Either way, the ephemeral database will be automatically deleted
002508 ** when sqlite3BtreeClose() is called.
002509 **
002510 ** If zFilename is ":memory:" then an in-memory database is created
002511 ** that is automatically destroyed when it is closed.
002512 **
002513 ** The "flags" parameter is a bitmask that might contain bits like
002514 ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.
002515 **
002516 ** If the database is already opened in the same database connection
002517 ** and we are in shared cache mode, then the open will fail with an
002518 ** SQLITE_CONSTRAINT error. We cannot allow two or more BtShared
002519 ** objects in the same database connection since doing so will lead
002520 ** to problems with locking.
002521 */
002522 int sqlite3BtreeOpen(
002523 sqlite3_vfs *pVfs, /* VFS to use for this b-tree */
002524 const char *zFilename, /* Name of the file containing the BTree database */
002525 sqlite3 *db, /* Associated database handle */
002526 Btree **ppBtree, /* Pointer to new Btree object written here */
002527 int flags, /* Options */
002528 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */
002529 ){
002530 BtShared *pBt = 0; /* Shared part of btree structure */
002531 Btree *p; /* Handle to return */
002532 sqlite3_mutex *mutexOpen = 0; /* Prevents a race condition. Ticket #3537 */
002533 int rc = SQLITE_OK; /* Result code from this function */
002534 u8 nReserve; /* Byte of unused space on each page */
002535 unsigned char zDbHeader[100]; /* Database header content */
002536
002537 /* True if opening an ephemeral, temporary database */
002538 const int isTempDb = zFilename==0 || zFilename[0]==0;
002539
002540 /* Set the variable isMemdb to true for an in-memory database, or
002541 ** false for a file-based database.
002542 */
002543 #ifdef SQLITE_OMIT_MEMORYDB
002544 const int isMemdb = 0;
002545 #else
002546 const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
002547 || (isTempDb && sqlite3TempInMemory(db))
002548 || (vfsFlags & SQLITE_OPEN_MEMORY)!=0;
002549 #endif
002550
002551 assert( db!=0 );
002552 assert( pVfs!=0 );
002553 assert( sqlite3_mutex_held(db->mutex) );
002554 assert( (flags&0xff)==flags ); /* flags fit in 8 bits */
002555
002556 /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
002557 assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );
002558
002559 /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
002560 assert( (flags & BTREE_SINGLE)==0 || isTempDb );
002561
002562 if( isMemdb ){
002563 flags |= BTREE_MEMORY;
002564 }
002565 if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
002566 vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
002567 }
002568 p = sqlite3MallocZero(sizeof(Btree));
002569 if( !p ){
002570 return SQLITE_NOMEM_BKPT;
002571 }
002572 p->inTrans = TRANS_NONE;
002573 p->db = db;
002574 #ifndef SQLITE_OMIT_SHARED_CACHE
002575 p->lock.pBtree = p;
002576 p->lock.iTable = 1;
002577 #endif
002578
002579 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
002580 /*
002581 ** If this Btree is a candidate for shared cache, try to find an
002582 ** existing BtShared object that we can share with
002583 */
002584 if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){
002585 if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
002586 int nFilename = sqlite3Strlen30(zFilename)+1;
002587 int nFullPathname = pVfs->mxPathname+1;
002588 char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename));
002589 MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
002590
002591 p->sharable = 1;
002592 if( !zFullPathname ){
002593 sqlite3_free(p);
002594 return SQLITE_NOMEM_BKPT;
002595 }
002596 if( isMemdb ){
002597 memcpy(zFullPathname, zFilename, nFilename);
002598 }else{
002599 rc = sqlite3OsFullPathname(pVfs, zFilename,
002600 nFullPathname, zFullPathname);
002601 if( rc ){
002602 if( rc==SQLITE_OK_SYMLINK ){
002603 rc = SQLITE_OK;
002604 }else{
002605 sqlite3_free(zFullPathname);
002606 sqlite3_free(p);
002607 return rc;
002608 }
002609 }
002610 }
002611 #if SQLITE_THREADSAFE
002612 mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
002613 sqlite3_mutex_enter(mutexOpen);
002614 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN);
002615 sqlite3_mutex_enter(mutexShared);
002616 #endif
002617 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
002618 assert( pBt->nRef>0 );
002619 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))
002620 && sqlite3PagerVfs(pBt->pPager)==pVfs ){
002621 int iDb;
002622 for(iDb=db->nDb-1; iDb>=0; iDb--){
002623 Btree *pExisting = db->aDb[iDb].pBt;
002624 if( pExisting && pExisting->pBt==pBt ){
002625 sqlite3_mutex_leave(mutexShared);
002626 sqlite3_mutex_leave(mutexOpen);
002627 sqlite3_free(zFullPathname);
002628 sqlite3_free(p);
002629 return SQLITE_CONSTRAINT;
002630 }
002631 }
002632 p->pBt = pBt;
002633 pBt->nRef++;
002634 break;
002635 }
002636 }
002637 sqlite3_mutex_leave(mutexShared);
002638 sqlite3_free(zFullPathname);
002639 }
002640 #ifdef SQLITE_DEBUG
002641 else{
002642 /* In debug mode, we mark all persistent databases as sharable
002643 ** even when they are not. This exercises the locking code and
002644 ** gives more opportunity for asserts(sqlite3_mutex_held())
002645 ** statements to find locking problems.
002646 */
002647 p->sharable = 1;
002648 }
002649 #endif
002650 }
002651 #endif
002652 if( pBt==0 ){
002653 /*
002654 ** The following asserts make sure that structures used by the btree are
002655 ** the right size. This is to guard against size changes that result
002656 ** when compiling on a different architecture.
002657 */
002658 assert( sizeof(i64)==8 );
002659 assert( sizeof(u64)==8 );
002660 assert( sizeof(u32)==4 );
002661 assert( sizeof(u16)==2 );
002662 assert( sizeof(Pgno)==4 );
002663
002664 /* Suppress false-positive compiler warning from PVS-Studio */
002665 memset(&zDbHeader[16], 0, 8);
002666
002667 pBt = sqlite3MallocZero( sizeof(*pBt) );
002668 if( pBt==0 ){
002669 rc = SQLITE_NOMEM_BKPT;
002670 goto btree_open_out;
002671 }
002672 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
002673 sizeof(MemPage), flags, vfsFlags, pageReinit);
002674 if( rc==SQLITE_OK ){
002675 sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);
002676 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
002677 }
002678 if( rc!=SQLITE_OK ){
002679 goto btree_open_out;
002680 }
002681 pBt->openFlags = (u8)flags;
002682 pBt->db = db;
002683 sqlite3PagerSetBusyHandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
002684 p->pBt = pBt;
002685
002686 pBt->pCursor = 0;
002687 pBt->pPage1 = 0;
002688 if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;
002689 #if defined(SQLITE_SECURE_DELETE)
002690 pBt->btsFlags |= BTS_SECURE_DELETE;
002691 #elif defined(SQLITE_FAST_SECURE_DELETE)
002692 pBt->btsFlags |= BTS_OVERWRITE;
002693 #endif
002694 /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
002695 ** determined by the 2-byte integer located at an offset of 16 bytes from
002696 ** the beginning of the database file. */
002697 pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
002698 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
002699 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
002700 pBt->pageSize = 0;
002701 #ifndef SQLITE_OMIT_AUTOVACUUM
002702 /* If the magic name ":memory:" will create an in-memory database, then
002703 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
002704 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
002705 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
002706 ** regular file-name. In this case the auto-vacuum applies as per normal.
002707 */
002708 if( zFilename && !isMemdb ){
002709 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
002710 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
002711 }
002712 #endif
002713 nReserve = 0;
002714 }else{
002715 /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is
002716 ** determined by the one-byte unsigned integer found at an offset of 20
002717 ** into the database file header. */
002718 nReserve = zDbHeader[20];
002719 pBt->btsFlags |= BTS_PAGESIZE_FIXED;
002720 #ifndef SQLITE_OMIT_AUTOVACUUM
002721 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
002722 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
002723 #endif
002724 }
002725 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
002726 if( rc ) goto btree_open_out;
002727 pBt->usableSize = pBt->pageSize - nReserve;
002728 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */
002729
002730 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
002731 /* Add the new BtShared object to the linked list sharable BtShareds.
002732 */
002733 pBt->nRef = 1;
002734 if( p->sharable ){
002735 MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
002736 MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN);)
002737 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
002738 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
002739 if( pBt->mutex==0 ){
002740 rc = SQLITE_NOMEM_BKPT;
002741 goto btree_open_out;
002742 }
002743 }
002744 sqlite3_mutex_enter(mutexShared);
002745 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
002746 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
002747 sqlite3_mutex_leave(mutexShared);
002748 }
002749 #endif
002750 }
002751
002752 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
002753 /* If the new Btree uses a sharable pBtShared, then link the new
002754 ** Btree into the list of all sharable Btrees for the same connection.
002755 ** The list is kept in ascending order by pBt address.
002756 */
002757 if( p->sharable ){
002758 int i;
002759 Btree *pSib;
002760 for(i=0; i<db->nDb; i++){
002761 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
002762 while( pSib->pPrev ){ pSib = pSib->pPrev; }
002763 if( (uptr)p->pBt<(uptr)pSib->pBt ){
002764 p->pNext = pSib;
002765 p->pPrev = 0;
002766 pSib->pPrev = p;
002767 }else{
002768 while( pSib->pNext && (uptr)pSib->pNext->pBt<(uptr)p->pBt ){
002769 pSib = pSib->pNext;
002770 }
002771 p->pNext = pSib->pNext;
002772 p->pPrev = pSib;
002773 if( p->pNext ){
002774 p->pNext->pPrev = p;
002775 }
002776 pSib->pNext = p;
002777 }
002778 break;
002779 }
002780 }
002781 }
002782 #endif
002783 *ppBtree = p;
002784
002785 btree_open_out:
002786 if( rc!=SQLITE_OK ){
002787 if( pBt && pBt->pPager ){
002788 sqlite3PagerClose(pBt->pPager, 0);
002789 }
002790 sqlite3_free(pBt);
002791 sqlite3_free(p);
002792 *ppBtree = 0;
002793 }else{
002794 sqlite3_file *pFile;
002795
002796 /* If the B-Tree was successfully opened, set the pager-cache size to the
002797 ** default value. Except, when opening on an existing shared pager-cache,
002798 ** do not change the pager-cache size.
002799 */
002800 if( sqlite3BtreeSchema(p, 0, 0)==0 ){
002801 sqlite3BtreeSetCacheSize(p, SQLITE_DEFAULT_CACHE_SIZE);
002802 }
002803
002804 pFile = sqlite3PagerFile(pBt->pPager);
002805 if( pFile->pMethods ){
002806 sqlite3OsFileControlHint(pFile, SQLITE_FCNTL_PDB, (void*)&pBt->db);
002807 }
002808 }
002809 if( mutexOpen ){
002810 assert( sqlite3_mutex_held(mutexOpen) );
002811 sqlite3_mutex_leave(mutexOpen);
002812 }
002813 assert( rc!=SQLITE_OK || sqlite3BtreeConnectionCount(*ppBtree)>0 );
002814 return rc;
002815 }
002816
002817 /*
002818 ** Decrement the BtShared.nRef counter. When it reaches zero,
002819 ** remove the BtShared structure from the sharing list. Return
002820 ** true if the BtShared.nRef counter reaches zero and return
002821 ** false if it is still positive.
002822 */
002823 static int removeFromSharingList(BtShared *pBt){
002824 #ifndef SQLITE_OMIT_SHARED_CACHE
002825 MUTEX_LOGIC( sqlite3_mutex *pMainMtx; )
002826 BtShared *pList;
002827 int removed = 0;
002828
002829 assert( sqlite3_mutex_notheld(pBt->mutex) );
002830 MUTEX_LOGIC( pMainMtx = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN); )
002831 sqlite3_mutex_enter(pMainMtx);
002832 pBt->nRef--;
002833 if( pBt->nRef<=0 ){
002834 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
002835 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
002836 }else{
002837 pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
002838 while( ALWAYS(pList) && pList->pNext!=pBt ){
002839 pList=pList->pNext;
002840 }
002841 if( ALWAYS(pList) ){
002842 pList->pNext = pBt->pNext;
002843 }
002844 }
002845 if( SQLITE_THREADSAFE ){
002846 sqlite3_mutex_free(pBt->mutex);
002847 }
002848 removed = 1;
002849 }
002850 sqlite3_mutex_leave(pMainMtx);
002851 return removed;
002852 #else
002853 return 1;
002854 #endif
002855 }
002856
002857 /*
002858 ** Make sure pBt->pTmpSpace points to an allocation of
002859 ** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child
002860 ** pointer.
002861 */
002862 static SQLITE_NOINLINE int allocateTempSpace(BtShared *pBt){
002863 assert( pBt!=0 );
002864 assert( pBt->pTmpSpace==0 );
002865 /* This routine is called only by btreeCursor() when allocating the
002866 ** first write cursor for the BtShared object */
002867 assert( pBt->pCursor!=0 && (pBt->pCursor->curFlags & BTCF_WriteFlag)!=0 );
002868 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
002869 if( pBt->pTmpSpace==0 ){
002870 BtCursor *pCur = pBt->pCursor;
002871 pBt->pCursor = pCur->pNext; /* Unlink the cursor */
002872 memset(pCur, 0, sizeof(*pCur));
002873 return SQLITE_NOMEM_BKPT;
002874 }
002875
002876 /* One of the uses of pBt->pTmpSpace is to format cells before
002877 ** inserting them into a leaf page (function fillInCell()). If
002878 ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes
002879 ** by the various routines that manipulate binary cells. Which
002880 ** can mean that fillInCell() only initializes the first 2 or 3
002881 ** bytes of pTmpSpace, but that the first 4 bytes are copied from
002882 ** it into a database page. This is not actually a problem, but it
002883 ** does cause a valgrind error when the 1 or 2 bytes of uninitialized
002884 ** data is passed to system call write(). So to avoid this error,
002885 ** zero the first 4 bytes of temp space here.
002886 **
002887 ** Also: Provide four bytes of initialized space before the
002888 ** beginning of pTmpSpace as an area available to prepend the
002889 ** left-child pointer to the beginning of a cell.
002890 */
002891 memset(pBt->pTmpSpace, 0, 8);
002892 pBt->pTmpSpace += 4;
002893 return SQLITE_OK;
002894 }
002895
002896 /*
002897 ** Free the pBt->pTmpSpace allocation
002898 */
002899 static void freeTempSpace(BtShared *pBt){
002900 if( pBt->pTmpSpace ){
002901 pBt->pTmpSpace -= 4;
002902 sqlite3PageFree(pBt->pTmpSpace);
002903 pBt->pTmpSpace = 0;
002904 }
002905 }
002906
002907 /*
002908 ** Close an open database and invalidate all cursors.
002909 */
002910 int sqlite3BtreeClose(Btree *p){
002911 BtShared *pBt = p->pBt;
002912
002913 /* Close all cursors opened via this handle. */
002914 assert( sqlite3_mutex_held(p->db->mutex) );
002915 sqlite3BtreeEnter(p);
002916
002917 /* Verify that no other cursors have this Btree open */
002918 #ifdef SQLITE_DEBUG
002919 {
002920 BtCursor *pCur = pBt->pCursor;
002921 while( pCur ){
002922 BtCursor *pTmp = pCur;
002923 pCur = pCur->pNext;
002924 assert( pTmp->pBtree!=p );
002925
002926 }
002927 }
002928 #endif
002929
002930 /* Rollback any active transaction and free the handle structure.
002931 ** The call to sqlite3BtreeRollback() drops any table-locks held by
002932 ** this handle.
002933 */
002934 sqlite3BtreeRollback(p, SQLITE_OK, 0);
002935 sqlite3BtreeLeave(p);
002936
002937 /* If there are still other outstanding references to the shared-btree
002938 ** structure, return now. The remainder of this procedure cleans
002939 ** up the shared-btree.
002940 */
002941 assert( p->wantToLock==0 && p->locked==0 );
002942 if( !p->sharable || removeFromSharingList(pBt) ){
002943 /* The pBt is no longer on the sharing list, so we can access
002944 ** it without having to hold the mutex.
002945 **
002946 ** Clean out and delete the BtShared object.
002947 */
002948 assert( !pBt->pCursor );
002949 sqlite3PagerClose(pBt->pPager, p->db);
002950 if( pBt->xFreeSchema && pBt->pSchema ){
002951 pBt->xFreeSchema(pBt->pSchema);
002952 }
002953 sqlite3DbFree(0, pBt->pSchema);
002954 freeTempSpace(pBt);
002955 sqlite3_free(pBt);
002956 }
002957
002958 #ifndef SQLITE_OMIT_SHARED_CACHE
002959 assert( p->wantToLock==0 );
002960 assert( p->locked==0 );
002961 if( p->pPrev ) p->pPrev->pNext = p->pNext;
002962 if( p->pNext ) p->pNext->pPrev = p->pPrev;
002963 #endif
002964
002965 sqlite3_free(p);
002966 return SQLITE_OK;
002967 }
002968
002969 /*
002970 ** Change the "soft" limit on the number of pages in the cache.
002971 ** Unused and unmodified pages will be recycled when the number of
002972 ** pages in the cache exceeds this soft limit. But the size of the
002973 ** cache is allowed to grow larger than this limit if it contains
002974 ** dirty pages or pages still in active use.
002975 */
002976 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
002977 BtShared *pBt = p->pBt;
002978 assert( sqlite3_mutex_held(p->db->mutex) );
002979 sqlite3BtreeEnter(p);
002980 sqlite3PagerSetCachesize(pBt->pPager, mxPage);
002981 sqlite3BtreeLeave(p);
002982 return SQLITE_OK;
002983 }
002984
002985 /*
002986 ** Change the "spill" limit on the number of pages in the cache.
002987 ** If the number of pages exceeds this limit during a write transaction,
002988 ** the pager might attempt to "spill" pages to the journal early in
002989 ** order to free up memory.
002990 **
002991 ** The value returned is the current spill size. If zero is passed
002992 ** as an argument, no changes are made to the spill size setting, so
002993 ** using mxPage of 0 is a way to query the current spill size.
002994 */
002995 int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){
002996 BtShared *pBt = p->pBt;
002997 int res;
002998 assert( sqlite3_mutex_held(p->db->mutex) );
002999 sqlite3BtreeEnter(p);
003000 res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage);
003001 sqlite3BtreeLeave(p);
003002 return res;
003003 }
003004
003005 #if SQLITE_MAX_MMAP_SIZE>0
003006 /*
003007 ** Change the limit on the amount of the database file that may be
003008 ** memory mapped.
003009 */
003010 int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){
003011 BtShared *pBt = p->pBt;
003012 assert( sqlite3_mutex_held(p->db->mutex) );
003013 sqlite3BtreeEnter(p);
003014 sqlite3PagerSetMmapLimit(pBt->pPager, szMmap);
003015 sqlite3BtreeLeave(p);
003016 return SQLITE_OK;
003017 }
003018 #endif /* SQLITE_MAX_MMAP_SIZE>0 */
003019
003020 /*
003021 ** Change the way data is synced to disk in order to increase or decrease
003022 ** how well the database resists damage due to OS crashes and power
003023 ** failures. Level 1 is the same as asynchronous (no syncs() occur and
003024 ** there is a high probability of damage) Level 2 is the default. There
003025 ** is a very low but non-zero probability of damage. Level 3 reduces the
003026 ** probability of damage to near zero but with a write performance reduction.
003027 */
003028 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
003029 int sqlite3BtreeSetPagerFlags(
003030 Btree *p, /* The btree to set the safety level on */
003031 unsigned pgFlags /* Various PAGER_* flags */
003032 ){
003033 BtShared *pBt = p->pBt;
003034 assert( sqlite3_mutex_held(p->db->mutex) );
003035 sqlite3BtreeEnter(p);
003036 sqlite3PagerSetFlags(pBt->pPager, pgFlags);
003037 sqlite3BtreeLeave(p);
003038 return SQLITE_OK;
003039 }
003040 #endif
003041
003042 /*
003043 ** Change the default pages size and the number of reserved bytes per page.
003044 ** Or, if the page size has already been fixed, return SQLITE_READONLY
003045 ** without changing anything.
003046 **
003047 ** The page size must be a power of 2 between 512 and 65536. If the page
003048 ** size supplied does not meet this constraint then the page size is not
003049 ** changed.
003050 **
003051 ** Page sizes are constrained to be a power of two so that the region
003052 ** of the database file used for locking (beginning at PENDING_BYTE,
003053 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
003054 ** at the beginning of a page.
003055 **
003056 ** If parameter nReserve is less than zero, then the number of reserved
003057 ** bytes per page is left unchanged.
003058 **
003059 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size
003060 ** and autovacuum mode can no longer be changed.
003061 */
003062 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
003063 int rc = SQLITE_OK;
003064 int x;
003065 BtShared *pBt = p->pBt;
003066 assert( nReserve>=0 && nReserve<=255 );
003067 sqlite3BtreeEnter(p);
003068 pBt->nReserveWanted = nReserve;
003069 x = pBt->pageSize - pBt->usableSize;
003070 if( nReserve<x ) nReserve = x;
003071 if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){
003072 sqlite3BtreeLeave(p);
003073 return SQLITE_READONLY;
003074 }
003075 assert( nReserve>=0 && nReserve<=255 );
003076 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
003077 ((pageSize-1)&pageSize)==0 ){
003078 assert( (pageSize & 7)==0 );
003079 assert( !pBt->pCursor );
003080 if( nReserve>32 && pageSize==512 ) pageSize = 1024;
003081 pBt->pageSize = (u32)pageSize;
003082 freeTempSpace(pBt);
003083 }
003084 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
003085 pBt->usableSize = pBt->pageSize - (u16)nReserve;
003086 if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED;
003087 sqlite3BtreeLeave(p);
003088 return rc;
003089 }
003090
003091 /*
003092 ** Return the currently defined page size
003093 */
003094 int sqlite3BtreeGetPageSize(Btree *p){
003095 return p->pBt->pageSize;
003096 }
003097
003098 /*
003099 ** This function is similar to sqlite3BtreeGetReserve(), except that it
003100 ** may only be called if it is guaranteed that the b-tree mutex is already
003101 ** held.
003102 **
003103 ** This is useful in one special case in the backup API code where it is
003104 ** known that the shared b-tree mutex is held, but the mutex on the
003105 ** database handle that owns *p is not. In this case if sqlite3BtreeEnter()
003106 ** were to be called, it might collide with some other operation on the
003107 ** database handle that owns *p, causing undefined behavior.
003108 */
003109 int sqlite3BtreeGetReserveNoMutex(Btree *p){
003110 int n;
003111 assert( sqlite3_mutex_held(p->pBt->mutex) );
003112 n = p->pBt->pageSize - p->pBt->usableSize;
003113 return n;
003114 }
003115
003116 /*
003117 ** Return the number of bytes of space at the end of every page that
003118 ** are intentionally left unused. This is the "reserved" space that is
003119 ** sometimes used by extensions.
003120 **
003121 ** The value returned is the larger of the current reserve size and
003122 ** the latest reserve size requested by SQLITE_FILECTRL_RESERVE_BYTES.
003123 ** The amount of reserve can only grow - never shrink.
003124 */
003125 int sqlite3BtreeGetRequestedReserve(Btree *p){
003126 int n1, n2;
003127 sqlite3BtreeEnter(p);
003128 n1 = (int)p->pBt->nReserveWanted;
003129 n2 = sqlite3BtreeGetReserveNoMutex(p);
003130 sqlite3BtreeLeave(p);
003131 return n1>n2 ? n1 : n2;
003132 }
003133
003134
003135 /*
003136 ** Set the maximum page count for a database if mxPage is positive.
003137 ** No changes are made if mxPage is 0 or negative.
003138 ** Regardless of the value of mxPage, return the maximum page count.
003139 */
003140 Pgno sqlite3BtreeMaxPageCount(Btree *p, Pgno mxPage){
003141 Pgno n;
003142 sqlite3BtreeEnter(p);
003143 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
003144 sqlite3BtreeLeave(p);
003145 return n;
003146 }
003147
003148 /*
003149 ** Change the values for the BTS_SECURE_DELETE and BTS_OVERWRITE flags:
003150 **
003151 ** newFlag==0 Both BTS_SECURE_DELETE and BTS_OVERWRITE are cleared
003152 ** newFlag==1 BTS_SECURE_DELETE set and BTS_OVERWRITE is cleared
003153 ** newFlag==2 BTS_SECURE_DELETE cleared and BTS_OVERWRITE is set
003154 ** newFlag==(-1) No changes
003155 **
003156 ** This routine acts as a query if newFlag is less than zero
003157 **
003158 ** With BTS_OVERWRITE set, deleted content is overwritten by zeros, but
003159 ** freelist leaf pages are not written back to the database. Thus in-page
003160 ** deleted content is cleared, but freelist deleted content is not.
003161 **
003162 ** With BTS_SECURE_DELETE, operation is like BTS_OVERWRITE with the addition
003163 ** that freelist leaf pages are written back into the database, increasing
003164 ** the amount of disk I/O.
003165 */
003166 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
003167 int b;
003168 if( p==0 ) return 0;
003169 sqlite3BtreeEnter(p);
003170 assert( BTS_OVERWRITE==BTS_SECURE_DELETE*2 );
003171 assert( BTS_FAST_SECURE==(BTS_OVERWRITE|BTS_SECURE_DELETE) );
003172 if( newFlag>=0 ){
003173 p->pBt->btsFlags &= ~BTS_FAST_SECURE;
003174 p->pBt->btsFlags |= BTS_SECURE_DELETE*newFlag;
003175 }
003176 b = (p->pBt->btsFlags & BTS_FAST_SECURE)/BTS_SECURE_DELETE;
003177 sqlite3BtreeLeave(p);
003178 return b;
003179 }
003180
003181 /*
003182 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
003183 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
003184 ** is disabled. The default value for the auto-vacuum property is
003185 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
003186 */
003187 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
003188 #ifdef SQLITE_OMIT_AUTOVACUUM
003189 return SQLITE_READONLY;
003190 #else
003191 BtShared *pBt = p->pBt;
003192 int rc = SQLITE_OK;
003193 u8 av = (u8)autoVacuum;
003194
003195 sqlite3BtreeEnter(p);
003196 if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){
003197 rc = SQLITE_READONLY;
003198 }else{
003199 pBt->autoVacuum = av ?1:0;
003200 pBt->incrVacuum = av==2 ?1:0;
003201 }
003202 sqlite3BtreeLeave(p);
003203 return rc;
003204 #endif
003205 }
003206
003207 /*
003208 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
003209 ** enabled 1 is returned. Otherwise 0.
003210 */
003211 int sqlite3BtreeGetAutoVacuum(Btree *p){
003212 #ifdef SQLITE_OMIT_AUTOVACUUM
003213 return BTREE_AUTOVACUUM_NONE;
003214 #else
003215 int rc;
003216 sqlite3BtreeEnter(p);
003217 rc = (
003218 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
003219 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
003220 BTREE_AUTOVACUUM_INCR
003221 );
003222 sqlite3BtreeLeave(p);
003223 return rc;
003224 #endif
003225 }
003226
/*
** Apply safety_level as the default safety-level of the database that
** uses pBt, provided that the user has not chosen a level with "PRAGMA
** synchronous" (Db.bSyncSet is clear) and the level is not already equal
** to safety_level.  The entry at db->aDb[1] is never changed.
**
** When the two default levels are identical, or WAL is compiled out,
** this is a no-op macro.
*/
#if SQLITE_DEFAULT_SYNCHRONOUS!=SQLITE_DEFAULT_WAL_SYNCHRONOUS \
    && !defined(SQLITE_OMIT_WAL)
static void setDefaultSyncFlag(BtShared *pBt, u8 safety_level){
  sqlite3 *db = pBt->db;
  Db *pDb;

  if( db==0 ) return;
  pDb = db->aDb;
  if( pDb==0 ) return;

  /* Find the aDb[] entry whose Btree sits on pBt.  The loop has no
  ** bound and relies on such an entry existing. */
  while( pDb->pBt==0 || pDb->pBt->pBt!=pBt ){
    pDb++;
  }

  if( pDb->bSyncSet ) return;                      /* Set by the user */
  if( pDb->safety_level==safety_level ) return;    /* Nothing to change */
  if( pDb==&db->aDb[1] ) return;                   /* aDb[1] is exempt */

  pDb->safety_level = safety_level;
  sqlite3PagerSetFlags(pBt->pPager,
      pDb->safety_level | (db->flags & PAGER_FLAGS_MASK));
}
#else
# define setDefaultSyncFlag(pBt,safety_level)
#endif
003253
003254 /* Forward declaration */
003255 static int newDatabase(BtShared*);
003256
003257
003258 /*
003259 ** Get a reference to pPage1 of the database file. This will
003260 ** also acquire a readlock on that file.
003261 **
003262 ** SQLITE_OK is returned on success. If the file is not a
003263 ** well-formed database file, then SQLITE_CORRUPT is returned.
003264 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
003265 ** is returned if we run out of memory.
003266 */
003267 static int lockBtree(BtShared *pBt){
003268 int rc; /* Result code from subfunctions */
003269 MemPage *pPage1; /* Page 1 of the database file */
003270 u32 nPage; /* Number of pages in the database */
003271 u32 nPageFile = 0; /* Number of pages in the database file */
003272
003273 assert( sqlite3_mutex_held(pBt->mutex) );
003274 assert( pBt->pPage1==0 );
003275 rc = sqlite3PagerSharedLock(pBt->pPager);
003276 if( rc!=SQLITE_OK ) return rc;
003277 rc = btreeGetPage(pBt, 1, &pPage1, 0);
003278 if( rc!=SQLITE_OK ) return rc;
003279
003280 /* Do some checking to help insure the file we opened really is
003281 ** a valid database file.
003282 */
003283 nPage = get4byte(28+(u8*)pPage1->aData);
003284 sqlite3PagerPagecount(pBt->pPager, (int*)&nPageFile);
003285 if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
003286 nPage = nPageFile;
003287 }
003288 if( (pBt->db->flags & SQLITE_ResetDatabase)!=0 ){
003289 nPage = 0;
003290 }
003291 if( nPage>0 ){
003292 u32 pageSize;
003293 u32 usableSize;
003294 u8 *page1 = pPage1->aData;
003295 rc = SQLITE_NOTADB;
003296 /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins
003297 ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d
003298 ** 61 74 20 33 00. */
003299 if( memcmp(page1, zMagicHeader, 16)!=0 ){
003300 goto page1_init_failed;
003301 }
003302
003303 #ifdef SQLITE_OMIT_WAL
003304 if( page1[18]>1 ){
003305 pBt->btsFlags |= BTS_READ_ONLY;
003306 }
003307 if( page1[19]>1 ){
003308 goto page1_init_failed;
003309 }
003310 #else
003311 if( page1[18]>2 ){
003312 pBt->btsFlags |= BTS_READ_ONLY;
003313 }
003314 if( page1[19]>2 ){
003315 goto page1_init_failed;
003316 }
003317
003318 /* If the read version is set to 2, this database should be accessed
003319 ** in WAL mode. If the log is not already open, open it now. Then
003320 ** return SQLITE_OK and return without populating BtShared.pPage1.
003321 ** The caller detects this and calls this function again. This is
003322 ** required as the version of page 1 currently in the page1 buffer
003323 ** may not be the latest version - there may be a newer one in the log
003324 ** file.
003325 */
003326 if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){
003327 int isOpen = 0;
003328 rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
003329 if( rc!=SQLITE_OK ){
003330 goto page1_init_failed;
003331 }else{
003332 setDefaultSyncFlag(pBt, SQLITE_DEFAULT_WAL_SYNCHRONOUS+1);
003333 if( isOpen==0 ){
003334 releasePageOne(pPage1);
003335 return SQLITE_OK;
003336 }
003337 }
003338 rc = SQLITE_NOTADB;
003339 }else{
003340 setDefaultSyncFlag(pBt, SQLITE_DEFAULT_SYNCHRONOUS+1);
003341 }
003342 #endif
003343
003344 /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload
003345 ** fractions and the leaf payload fraction values must be 64, 32, and 32.
003346 **
003347 ** The original design allowed these amounts to vary, but as of
003348 ** version 3.6.0, we require them to be fixed.
003349 */
003350 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
003351 goto page1_init_failed;
003352 }
003353 /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
003354 ** determined by the 2-byte integer located at an offset of 16 bytes from
003355 ** the beginning of the database file. */
003356 pageSize = (page1[16]<<8) | (page1[17]<<16);
003357 /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two
003358 ** between 512 and 65536 inclusive. */
003359 if( ((pageSize-1)&pageSize)!=0
003360 || pageSize>SQLITE_MAX_PAGE_SIZE
003361 || pageSize<=256
003362 ){
003363 goto page1_init_failed;
003364 }
003365 assert( (pageSize & 7)==0 );
003366 /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte
003367 ** integer at offset 20 is the number of bytes of space at the end of
003368 ** each page to reserve for extensions.
003369 **
003370 ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is
003371 ** determined by the one-byte unsigned integer found at an offset of 20
003372 ** into the database file header. */
003373 usableSize = pageSize - page1[20];
003374 if( (u32)pageSize!=pBt->pageSize ){
003375 /* After reading the first page of the database assuming a page size
003376 ** of BtShared.pageSize, we have discovered that the page-size is
003377 ** actually pageSize. Unlock the database, leave pBt->pPage1 at
003378 ** zero and return SQLITE_OK. The caller will call this function
003379 ** again with the correct page-size.
003380 */
003381 releasePageOne(pPage1);
003382 pBt->usableSize = usableSize;
003383 pBt->pageSize = pageSize;
003384 pBt->btsFlags |= BTS_PAGESIZE_FIXED;
003385 freeTempSpace(pBt);
003386 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
003387 pageSize-usableSize);
003388 return rc;
003389 }
003390 if( nPage>nPageFile ){
003391 if( sqlite3WritableSchema(pBt->db)==0 ){
003392 rc = SQLITE_CORRUPT_BKPT;
003393 goto page1_init_failed;
003394 }else{
003395 nPage = nPageFile;
003396 }
003397 }
003398 /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to
003399 ** be less than 480. In other words, if the page size is 512, then the
003400 ** reserved space size cannot exceed 32. */
003401 if( usableSize<480 ){
003402 goto page1_init_failed;
003403 }
003404 pBt->btsFlags |= BTS_PAGESIZE_FIXED;
003405 pBt->pageSize = pageSize;
003406 pBt->usableSize = usableSize;
003407 #ifndef SQLITE_OMIT_AUTOVACUUM
003408 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
003409 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
003410 #endif
003411 }
003412
003413 /* maxLocal is the maximum amount of payload to store locally for
003414 ** a cell. Make sure it is small enough so that at least minFanout
003415 ** cells can will fit on one page. We assume a 10-byte page header.
003416 ** Besides the payload, the cell must store:
003417 ** 2-byte pointer to the cell
003418 ** 4-byte child pointer
003419 ** 9-byte nKey value
003420 ** 4-byte nData value
003421 ** 4-byte overflow page pointer
003422 ** So a cell consists of a 2-byte pointer, a header which is as much as
003423 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
003424 ** page pointer.
003425 */
003426 pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
003427 pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
003428 pBt->maxLeaf = (u16)(pBt->usableSize - 35);
003429 pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
003430 if( pBt->maxLocal>127 ){
003431 pBt->max1bytePayload = 127;
003432 }else{
003433 pBt->max1bytePayload = (u8)pBt->maxLocal;
003434 }
003435 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
003436 pBt->pPage1 = pPage1;
003437 pBt->nPage = nPage;
003438 return SQLITE_OK;
003439
003440 page1_init_failed:
003441 releasePageOne(pPage1);
003442 pBt->pPage1 = 0;
003443 return rc;
003444 }
003445
003446 #ifndef NDEBUG
003447 /*
003448 ** Return the number of cursors open on pBt. This is for use
003449 ** in assert() expressions, so it is only compiled if NDEBUG is not
003450 ** defined.
003451 **
003452 ** Only write cursors are counted if wrOnly is true. If wrOnly is
003453 ** false then all cursors are counted.
003454 **
003455 ** For the purposes of this routine, a cursor is any cursor that
003456 ** is capable of reading or writing to the database. Cursors that
003457 ** have been tripped into the CURSOR_FAULT state are not counted.
003458 */
003459 static int countValidCursors(BtShared *pBt, int wrOnly){
003460 BtCursor *pCur;
003461 int r = 0;
003462 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
003463 if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0)
003464 && pCur->eState!=CURSOR_FAULT ) r++;
003465 }
003466 return r;
003467 }
003468 #endif
003469
003470 /*
003471 ** If there are no outstanding cursors and we are not in the middle
003472 ** of a transaction but there is a read lock on the database, then
003473 ** this routine unrefs the first page of the database file which
003474 ** has the effect of releasing the read lock.
003475 **
003476 ** If there is a transaction in progress, this routine is a no-op.
003477 */
003478 static void unlockBtreeIfUnused(BtShared *pBt){
003479 assert( sqlite3_mutex_held(pBt->mutex) );
003480 assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE );
003481 if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
003482 MemPage *pPage1 = pBt->pPage1;
003483 assert( pPage1->aData );
003484 assert( sqlite3PagerRefcount(pBt->pPager)==1 );
003485 pBt->pPage1 = 0;
003486 releasePageOne(pPage1);
003487 }
003488 }
003489
003490 /*
003491 ** If pBt points to an empty file then convert that empty file
003492 ** into a new empty database by initializing the first page of
003493 ** the database.
003494 */
003495 static int newDatabase(BtShared *pBt){
003496 MemPage *pP1;
003497 unsigned char *data;
003498 int rc;
003499
003500 assert( sqlite3_mutex_held(pBt->mutex) );
003501 if( pBt->nPage>0 ){
003502 return SQLITE_OK;
003503 }
003504 pP1 = pBt->pPage1;
003505 assert( pP1!=0 );
003506 data = pP1->aData;
003507 rc = sqlite3PagerWrite(pP1->pDbPage);
003508 if( rc ) return rc;
003509 memcpy(data, zMagicHeader, sizeof(zMagicHeader));
003510 assert( sizeof(zMagicHeader)==16 );
003511 data[16] = (u8)((pBt->pageSize>>8)&0xff);
003512 data[17] = (u8)((pBt->pageSize>>16)&0xff);
003513 data[18] = 1;
003514 data[19] = 1;
003515 assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
003516 data[20] = (u8)(pBt->pageSize - pBt->usableSize);
003517 data[21] = 64;
003518 data[22] = 32;
003519 data[23] = 32;
003520 memset(&data[24], 0, 100-24);
003521 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
003522 pBt->btsFlags |= BTS_PAGESIZE_FIXED;
003523 #ifndef SQLITE_OMIT_AUTOVACUUM
003524 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
003525 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
003526 put4byte(&data[36 + 4*4], pBt->autoVacuum);
003527 put4byte(&data[36 + 7*4], pBt->incrVacuum);
003528 #endif
003529 pBt->nPage = 1;
003530 data[31] = 1;
003531 return SQLITE_OK;
003532 }
003533
003534 /*
003535 ** Initialize the first page of the database file (creating a database
003536 ** consisting of a single page and no schema objects). Return SQLITE_OK
003537 ** if successful, or an SQLite error code otherwise.
003538 */
003539 int sqlite3BtreeNewDb(Btree *p){
003540 int rc;
003541 sqlite3BtreeEnter(p);
003542 p->pBt->nPage = 0;
003543 rc = newDatabase(p->pBt);
003544 sqlite3BtreeLeave(p);
003545 return rc;
003546 }
003547
/*
** Attempt to start a new transaction. A write-transaction
** is started if the second argument is nonzero, otherwise a read-
** transaction.  If the second argument is 2 or more an exclusive
** transaction is started, meaning that no other process is allowed
** to access the database.  A preexisting transaction may not be
** upgraded to exclusive by calling this routine a second time - the
** exclusivity flag only works for a new transaction.
**
** A write-transaction must be started before attempting any
** changes to the database.  None of the following routines
** will work unless a transaction is started first:
**
**      sqlite3BtreeCreateTable()
**      sqlite3BtreeCreateIndex()
**      sqlite3BtreeClearTable()
**      sqlite3BtreeDropTable()
**      sqlite3BtreeInsert()
**      sqlite3BtreeDelete()
**      sqlite3BtreeUpdateMeta()
**
** If an initial attempt to acquire the lock fails because of lock contention
** and the database was previously unlocked, then invoke the busy handler
** if there is one.  But if there was previously a read-lock, do not
** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
** returned when there is already a read-lock in order to avoid a deadlock.
**
** Suppose there are two processes A and B.  A has a read lock and B has
** a reserved lock.  B tries to promote to exclusive but is blocked because
** of A's read lock.  A tries to promote to reserved but is blocked by B.
** One or the other of the two processes must give way or there can be
** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
** when A already has a read lock, we encourage A to give up and let B
** proceed.
**
** If pSchemaVersion is not NULL and the routine succeeds, the 4-byte
** value at offset 40 of page 1 is written to *pSchemaVersion.
**
** Returns SQLITE_OK on success.  Among the error codes set directly by
** this routine are SQLITE_READONLY (write requested on a read-only
** database), SQLITE_LOCKED_SHAREDCACHE (blocked by another shared-cache
** connection) and SQLITE_BUSY; errors from the pager and from lockBtree()
** are passed through.
*/
static SQLITE_NOINLINE int btreeBeginTrans(
  Btree *p,                 /* The btree in which to start the transaction */
  int wrflag,               /* True to start a write transaction */
  int *pSchemaVersion       /* Put schema version number here, if not NULL */
){
  BtShared *pBt = p->pBt;
  Pager *pPager = pBt->pPager;
  int rc = SQLITE_OK;

  sqlite3BtreeEnter(p);
  btreeIntegrity(p);

  /* If the btree is already in a write-transaction, or it
  ** is already in a read-transaction and a read-transaction
  ** is requested, this is a no-op.
  */
  if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
    goto trans_begun;
  }
  assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 );

  /* When the connection has SQLITE_ResetDatabase set and the pager itself
  ** is writable, clear the BTS_READ_ONLY flag on the shared btree. */
  if( (p->db->flags & SQLITE_ResetDatabase)
   && sqlite3PagerIsreadonly(pPager)==0
  ){
    pBt->btsFlags &= ~BTS_READ_ONLY;
  }

  /* Write transactions are not possible on a read-only database */
  if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){
    rc = SQLITE_READONLY;
    goto trans_begun;
  }

#ifndef SQLITE_OMIT_SHARED_CACHE
  {
    sqlite3 *pBlock = 0;
    /* If another database handle has already opened a write transaction
    ** on this shared-btree structure and a second write transaction is
    ** requested, return SQLITE_LOCKED.
    */
    if( (wrflag && pBt->inTransaction==TRANS_WRITE)
     || (pBt->btsFlags & BTS_PENDING)!=0
    ){
      pBlock = pBt->pWriter->db;
    }else if( wrflag>1 ){
      /* An exclusive transaction is blocked by a lock held by any other
      ** Btree handle on this shared btree. */
      BtLock *pIter;
      for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
        if( pIter->pBtree!=p ){
          pBlock = pIter->pBtree->db;
          break;
        }
      }
    }
    if( pBlock ){
      sqlite3ConnectionBlocked(p->db, pBlock);
      rc = SQLITE_LOCKED_SHAREDCACHE;
      goto trans_begun;
    }
  }
#endif

  /* Any read-only or read-write transaction implies a read-lock on
  ** page 1. So if some other shared-cache client already has a write-lock
  ** on page 1, the transaction cannot be opened. */
  rc = querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK);
  if( SQLITE_OK!=rc ) goto trans_begun;

  /* Record whether the database had zero pages when this transaction
  ** began. */
  pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;
  if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY;
  do {
    sqlite3PagerWalDb(pPager, p->db);

#ifdef SQLITE_ENABLE_SETLK_TIMEOUT
    /* If transitioning from no transaction directly to a write transaction,
    ** block for the WRITER lock first if possible. */
    if( pBt->pPage1==0 && wrflag ){
      assert( pBt->inTransaction==TRANS_NONE );
      rc = sqlite3PagerWalWriteLock(pPager, 1);
      if( rc!=SQLITE_BUSY && rc!=SQLITE_OK ) break;
    }
#endif

    /* Call lockBtree() until either pBt->pPage1 is populated or
    ** lockBtree() returns something other than SQLITE_OK. lockBtree()
    ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
    ** reading page 1 it discovers that the page-size of the database
    ** file is not pBt->pageSize. In this case lockBtree() will update
    ** pBt->pageSize to the page-size of the file on disk.
    */
    while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );

    if( rc==SQLITE_OK && wrflag ){
      /* Check BTS_READ_ONLY again here, after page 1 has been read. */
      if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){
        rc = SQLITE_READONLY;
      }else{
        rc = sqlite3PagerBegin(pPager, wrflag>1, sqlite3TempInMemory(p->db));
        if( rc==SQLITE_OK ){
          rc = newDatabase(pBt);
        }else if( rc==SQLITE_BUSY_SNAPSHOT && pBt->inTransaction==TRANS_NONE ){
          /* if there was no transaction opened when this function was
          ** called and SQLITE_BUSY_SNAPSHOT is returned, change the error
          ** code to SQLITE_BUSY. */
          rc = SQLITE_BUSY;
        }
      }
    }

    /* On any failure, undo the locking done by this iteration before
    ** possibly retrying. */
    if( rc!=SQLITE_OK ){
      (void)sqlite3PagerWalWriteLock(pPager, 0);
      unlockBtreeIfUnused(pBt);
    }
  /* Retry only while the primary result code is SQLITE_BUSY, no transaction
  ** is open on the shared btree, and the busy handler returns non-zero. */
  }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
          btreeInvokeBusyHandler(pBt) );
  sqlite3PagerWalDb(pPager, 0);
#ifdef SQLITE_ENABLE_SETLK_TIMEOUT
  if( rc==SQLITE_BUSY_TIMEOUT ) rc = SQLITE_BUSY;
#endif

  if( rc==SQLITE_OK ){
    if( p->inTrans==TRANS_NONE ){
      pBt->nTransaction++;
#ifndef SQLITE_OMIT_SHARED_CACHE
      if( p->sharable ){
        /* Link this handle's page-1 read lock at the head of pBt->pLock */
        assert( p->lock.pBtree==p && p->lock.iTable==1 );
        p->lock.eLock = READ_LOCK;
        p->lock.pNext = pBt->pLock;
        pBt->pLock = &p->lock;
      }
#endif
    }
    p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
    /* The shared state only ever moves to a higher transaction level here */
    if( p->inTrans>pBt->inTransaction ){
      pBt->inTransaction = p->inTrans;
    }
    if( wrflag ){
      MemPage *pPage1 = pBt->pPage1;
#ifndef SQLITE_OMIT_SHARED_CACHE
      assert( !pBt->pWriter );
      pBt->pWriter = p;
      pBt->btsFlags &= ~BTS_EXCLUSIVE;
      if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE;
#endif

      /* If the db-size header field is incorrect (as it may be if an old
      ** client has been writing the database file), update it now. Doing
      ** this sooner rather than later means the database can safely
      ** re-read the database size from page 1 if a savepoint or transaction
      ** rollback occurs within the transaction.
      */
      if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
        rc = sqlite3PagerWrite(pPage1->pDbPage);
        if( rc==SQLITE_OK ){
          put4byte(&pPage1->aData[28], pBt->nPage);
        }
      }
    }
  }

trans_begun:
  if( rc==SQLITE_OK ){
    if( pSchemaVersion ){
      *pSchemaVersion = get4byte(&pBt->pPage1->aData[40]);
    }
    if( wrflag ){
      /* This call makes sure that the pager has the correct number of
      ** open savepoints. If the second parameter is greater than 0 and
      ** the sub-journal is not already open, then it will be opened here.
      */
      rc = sqlite3PagerOpenSavepoint(pPager, p->db->nSavepoint);
    }
  }

  btreeIntegrity(p);
  sqlite3BtreeLeave(p);
  return rc;
}
003759 int sqlite3BtreeBeginTrans(Btree *p, int wrflag, int *pSchemaVersion){
003760 BtShared *pBt;
003761 if( p->sharable
003762 || p->inTrans==TRANS_NONE
003763 || (p->inTrans==TRANS_READ && wrflag!=0)
003764 ){
003765 return btreeBeginTrans(p,wrflag,pSchemaVersion);
003766 }
003767 pBt = p->pBt;
003768 if( pSchemaVersion ){
003769 *pSchemaVersion = get4byte(&pBt->pPage1->aData[40]);
003770 }
003771 if( wrflag ){
003772 /* This call makes sure that the pager has the correct number of
003773 ** open savepoints. If the second parameter is greater than 0 and
003774 ** the sub-journal is not already open, then it will be opened here.
003775 */
003776 return sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
003777 }else{
003778 return SQLITE_OK;
003779 }
003780 }
003781
003782 #ifndef SQLITE_OMIT_AUTOVACUUM
003783
003784 /*
003785 ** Set the pointer-map entries for all children of page pPage. Also, if
003786 ** pPage contains cells that point to overflow pages, set the pointer
003787 ** map entries for the overflow pages as well.
003788 */
003789 static int setChildPtrmaps(MemPage *pPage){
003790 int i; /* Counter variable */
003791 int nCell; /* Number of cells in page pPage */
003792 int rc; /* Return code */
003793 BtShared *pBt = pPage->pBt;
003794 Pgno pgno = pPage->pgno;
003795
003796 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
003797 rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage);
003798 if( rc!=SQLITE_OK ) return rc;
003799 nCell = pPage->nCell;
003800
003801 for(i=0; i<nCell; i++){
003802 u8 *pCell = findCell(pPage, i);
003803
003804 ptrmapPutOvflPtr(pPage, pPage, pCell, &rc);
003805
003806 if( !pPage->leaf ){
003807 Pgno childPgno = get4byte(pCell);
003808 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
003809 }
003810 }
003811
003812 if( !pPage->leaf ){
003813 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
003814 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
003815 }
003816
003817 return rc;
003818 }
003819
003820 /*
003821 ** Somewhere on pPage is a pointer to page iFrom. Modify this pointer so
003822 ** that it points to iTo. Parameter eType describes the type of pointer to
003823 ** be modified, as follows:
003824 **
003825 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child
003826 ** page of pPage.
003827 **
003828 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
003829 ** page pointed to by one of the cells on pPage.
003830 **
003831 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
003832 ** overflow page in the list.
003833 */
003834 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
003835 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
003836 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
003837 if( eType==PTRMAP_OVERFLOW2 ){
003838 /* The pointer is always the first 4 bytes of the page in this case. */
003839 if( get4byte(pPage->aData)!=iFrom ){
003840 return SQLITE_CORRUPT_PAGE(pPage);
003841 }
003842 put4byte(pPage->aData, iTo);
003843 }else{
003844 int i;
003845 int nCell;
003846 int rc;
003847
003848 rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage);
003849 if( rc ) return rc;
003850 nCell = pPage->nCell;
003851
003852 for(i=0; i<nCell; i++){
003853 u8 *pCell = findCell(pPage, i);
003854 if( eType==PTRMAP_OVERFLOW1 ){
003855 CellInfo info;
003856 pPage->xParseCell(pPage, pCell, &info);
003857 if( info.nLocal<info.nPayload ){
003858 if( pCell+info.nSize > pPage->aData+pPage->pBt->usableSize ){
003859 return SQLITE_CORRUPT_PAGE(pPage);
003860 }
003861 if( iFrom==get4byte(pCell+info.nSize-4) ){
003862 put4byte(pCell+info.nSize-4, iTo);
003863 break;
003864 }
003865 }
003866 }else{
003867 if( pCell+4 > pPage->aData+pPage->pBt->usableSize ){
003868 return SQLITE_CORRUPT_PAGE(pPage);
003869 }
003870 if( get4byte(pCell)==iFrom ){
003871 put4byte(pCell, iTo);
003872 break;
003873 }
003874 }
003875 }
003876
003877 if( i==nCell ){
003878 if( eType!=PTRMAP_BTREE ||
003879 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
003880 return SQLITE_CORRUPT_PAGE(pPage);
003881 }
003882 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
003883 }
003884 }
003885 return SQLITE_OK;
003886 }
003887
003888
003889 /*
003890 ** Move the open database page pDbPage to location iFreePage in the
003891 ** database. The pDbPage reference remains valid.
003892 **
003893 ** The isCommit flag indicates that there is no need to remember that
003894 ** the journal needs to be sync()ed before database page pDbPage->pgno
003895 ** can be written to. The caller has already promised not to write to that
003896 ** page.
003897 */
003898 static int relocatePage(
003899 BtShared *pBt, /* Btree */
003900 MemPage *pDbPage, /* Open page to move */
003901 u8 eType, /* Pointer map 'type' entry for pDbPage */
003902 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */
003903 Pgno iFreePage, /* The location to move pDbPage to */
003904 int isCommit /* isCommit flag passed to sqlite3PagerMovepage */
003905 ){
003906 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */
003907 Pgno iDbPage = pDbPage->pgno;
003908 Pager *pPager = pBt->pPager;
003909 int rc;
003910
003911 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
003912 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
003913 assert( sqlite3_mutex_held(pBt->mutex) );
003914 assert( pDbPage->pBt==pBt );
003915 if( iDbPage<3 ) return SQLITE_CORRUPT_BKPT;
003916
003917 /* Move page iDbPage from its current location to page number iFreePage */
003918 TRACE(("AUTOVACUUM: Moving %u to free page %u (ptr page %u type %u)\n",
003919 iDbPage, iFreePage, iPtrPage, eType));
003920 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
003921 if( rc!=SQLITE_OK ){
003922 return rc;
003923 }
003924 pDbPage->pgno = iFreePage;
003925
003926 /* If pDbPage was a btree-page, then it may have child pages and/or cells
003927 ** that point to overflow pages. The pointer map entries for all these
003928 ** pages need to be changed.
003929 **
003930 ** If pDbPage is an overflow page, then the first 4 bytes may store a
003931 ** pointer to a subsequent overflow page. If this is the case, then
003932 ** the pointer map needs to be updated for the subsequent overflow page.
003933 */
003934 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
003935 rc = setChildPtrmaps(pDbPage);
003936 if( rc!=SQLITE_OK ){
003937 return rc;
003938 }
003939 }else{
003940 Pgno nextOvfl = get4byte(pDbPage->aData);
003941 if( nextOvfl!=0 ){
003942 ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
003943 if( rc!=SQLITE_OK ){
003944 return rc;
003945 }
003946 }
003947 }
003948
003949 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
003950 ** that it points at iFreePage. Also fix the pointer map entry for
003951 ** iPtrPage.
003952 */
003953 if( eType!=PTRMAP_ROOTPAGE ){
003954 rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
003955 if( rc!=SQLITE_OK ){
003956 return rc;
003957 }
003958 rc = sqlite3PagerWrite(pPtrPage->pDbPage);
003959 if( rc!=SQLITE_OK ){
003960 releasePage(pPtrPage);
003961 return rc;
003962 }
003963 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
003964 releasePage(pPtrPage);
003965 if( rc==SQLITE_OK ){
003966 ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
003967 }
003968 }
003969 return rc;
003970 }
003971
/* Forward declaration required by incrVacuumStep(). */
static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);

/*
** Perform a single step of an incremental-vacuum. If successful, return
** SQLITE_OK. If there is no work to do (and therefore no point in
** calling this function again), return SQLITE_DONE. Or, if an error
** occurs, return some other error code.
**
** More specifically, this function attempts to re-organize the database so
** that the last page of the file currently in use is no longer in use.
**
** Parameter nFin is the number of pages that this database would contain
** were this function called until it returns SQLITE_DONE.
**
** Parameter iLastPg is the page number that this step tries to vacate.
** It must be greater than nFin.  If iLastPg is a pointer-map page or the
** pending-byte page, no page is moved by this call.
**
** If the bCommit parameter is non-zero, this function assumes that the
** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE
** or an error. bCommit is passed true for an auto-vacuum-on-commit
** operation, or false for an incremental vacuum.
**
** When bCommit is zero, a successful call also sets pBt->bDoTruncate and
** lowers pBt->nPage to the new last page.  When bCommit is non-zero,
** pBt->nPage is left for the caller to update.
**
** SQLITE_CORRUPT is returned if the pointer-map says iLastPg is a root
** page, or if the allocator hands back a page number beyond the current
** page count.
*/
static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){
  Pgno nFreeList;           /* Number of pages still on the free-list */
  int rc;

  assert( sqlite3_mutex_held(pBt->mutex) );
  assert( iLastPg>nFin );

  if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
    u8 eType;               /* Pointer-map type of iLastPg */
    Pgno iPtrPage;          /* Pointer-map parent page of iLastPg */

    /* The free-list page count is the 4-byte field at offset 36 of page 1.
    ** With no free pages there is nowhere to move iLastPg to. */
    nFreeList = get4byte(&pBt->pPage1->aData[36]);
    if( nFreeList==0 ){
      return SQLITE_DONE;
    }

    rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
    if( rc!=SQLITE_OK ){
      return rc;
    }
    if( eType==PTRMAP_ROOTPAGE ){
      return SQLITE_CORRUPT_BKPT;
    }

    if( eType==PTRMAP_FREEPAGE ){
      if( bCommit==0 ){
        /* Remove the page from the files free-list. This is not required
        ** if bCommit is non-zero. In that case, the free-list will be
        ** truncated to zero after this function returns, so it doesn't
        ** matter if it still contains some garbage entries.
        */
        Pgno iFreePg;
        MemPage *pFreePg;
        rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);
        if( rc!=SQLITE_OK ){
          return rc;
        }
        assert( iFreePg==iLastPg );
        releasePage(pFreePg);
      }
    } else {
      Pgno iFreePg;             /* Index of free page to move pLastPg to */
      MemPage *pLastPg;
      u8 eMode = BTALLOC_ANY;   /* Mode parameter for allocateBtreePage() */
      Pgno iNear = 0;           /* nearby parameter for allocateBtreePage() */

      rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
      if( rc!=SQLITE_OK ){
        return rc;
      }

      /* If bCommit is zero, this loop runs exactly once and page pLastPg
      ** is swapped with the first free page pulled off the free list.
      ** In that case the allocation uses BTALLOC_LE with nFin as the
      ** limit.
      **
      ** On the other hand, if bCommit is greater than zero, then keep
      ** looping until a free-page located within the first nFin pages
      ** of the file is found.
      */
      if( bCommit==0 ){
        eMode = BTALLOC_LE;
        iNear = nFin;
      }
      do {
        MemPage *pFreePg;
        Pgno dbSize = btreePagecount(pBt);
        rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);
        if( rc!=SQLITE_OK ){
          releasePage(pLastPg);
          return rc;
        }
        /* Only the page number is needed; drop the page reference */
        releasePage(pFreePg);
        if( iFreePg>dbSize ){
          /* A page beyond the page count sampled before the allocation
          ** is treated as corruption. */
          releasePage(pLastPg);
          return SQLITE_CORRUPT_BKPT;
        }
      }while( bCommit && iFreePg>nFin );
      assert( iFreePg<iLastPg );

      rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);
      releasePage(pLastPg);
      if( rc!=SQLITE_OK ){
        return rc;
      }
    }
  }

  if( bCommit==0 ){
    /* Step back to the new last page, skipping over the pending-byte page
    ** and any pointer-map pages, then record the reduced size. */
    do {
      iLastPg--;
    }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) );
    pBt->bDoTruncate = 1;
    pBt->nPage = iLastPg;
  }
  return SQLITE_OK;
}
004087
004088 /*
004089 ** The database opened by the first argument is an auto-vacuum database
004090 ** nOrig pages in size containing nFree free pages. Return the expected
004091 ** size of the database in pages following an auto-vacuum operation.
004092 */
004093 static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){
004094 int nEntry; /* Number of entries on one ptrmap page */
004095 Pgno nPtrmap; /* Number of PtrMap pages to be freed */
004096 Pgno nFin; /* Return value */
004097
004098 nEntry = pBt->usableSize/5;
004099 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
004100 nFin = nOrig - nFree - nPtrmap;
004101 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
004102 nFin--;
004103 }
004104 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
004105 nFin--;
004106 }
004107
004108 return nFin;
004109 }
004110
004111 /*
004112 ** A write-transaction must be opened before calling this function.
004113 ** It performs a single unit of work towards an incremental vacuum.
004114 **
004115 ** If the incremental vacuum is finished after this function has run,
004116 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
004117 ** SQLITE_OK is returned. Otherwise an SQLite error code.
004118 */
004119 int sqlite3BtreeIncrVacuum(Btree *p){
004120 int rc;
004121 BtShared *pBt = p->pBt;
004122
004123 sqlite3BtreeEnter(p);
004124 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
004125 if( !pBt->autoVacuum ){
004126 rc = SQLITE_DONE;
004127 }else{
004128 Pgno nOrig = btreePagecount(pBt);
004129 Pgno nFree = get4byte(&pBt->pPage1->aData[36]);
004130 Pgno nFin = finalDbSize(pBt, nOrig, nFree);
004131
004132 if( nOrig<nFin || nFree>=nOrig ){
004133 rc = SQLITE_CORRUPT_BKPT;
004134 }else if( nFree>0 ){
004135 rc = saveAllCursors(pBt, 0, 0);
004136 if( rc==SQLITE_OK ){
004137 invalidateAllOverflowCache(pBt);
004138 rc = incrVacuumStep(pBt, nFin, nOrig, 0);
004139 }
004140 if( rc==SQLITE_OK ){
004141 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
004142 put4byte(&pBt->pPage1->aData[28], pBt->nPage);
004143 }
004144 }else{
004145 rc = SQLITE_DONE;
004146 }
004147 }
004148 sqlite3BtreeLeave(p);
004149 return rc;
004150 }
004151
/*
** This routine is called prior to sqlite3PagerCommit when a transaction
** is committed for an auto-vacuum database.
**
** If the database is in incremental-vacuum mode (pBt->incrVacuum set),
** only the overflow-page caches are invalidated.  Otherwise the routine
** decides how many free pages to remove (all of them, or the number
** chosen by the db->xAutovacPages callback if one is registered), moves
** pages towards the front of the file with incrVacuumStep(), and then
** updates the page-1 header and pBt->nPage to the reduced size.
**
** Returns SQLITE_OK on success.  Returns SQLITE_CORRUPT if the current
** last page is a pointer-map page or the pending-byte page, or if the
** computed final size exceeds the current size.  On any other failure
** the pager is rolled back and the error code is returned.
*/
static int autoVacuumCommit(Btree *p){
  int rc = SQLITE_OK;
  Pager *pPager;
  BtShared *pBt;
  sqlite3 *db;
  VVA_ONLY( int nRef );

  assert( p!=0 );
  pBt = p->pBt;
  pPager = pBt->pPager;
  /* Pager reference count on entry, checked by the assert() at the end */
  VVA_ONLY( nRef = sqlite3PagerRefcount(pPager); )

  assert( sqlite3_mutex_held(pBt->mutex) );
  invalidateAllOverflowCache(pBt);
  assert(pBt->autoVacuum);
  if( !pBt->incrVacuum ){
    Pgno nFin;         /* Number of pages in database after autovacuuming */
    Pgno nFree;        /* Number of pages on the freelist initially */
    Pgno nVac;         /* Number of pages to vacuum */
    Pgno iFree;        /* The next page to be freed */
    Pgno nOrig;        /* Database size before freeing */

    nOrig = btreePagecount(pBt);
    if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
      /* It is not possible to create a database for which the final page
      ** is either a pointer-map page or the pending-byte page. If one
      ** is encountered, this indicates corruption.
      */
      return SQLITE_CORRUPT_BKPT;
    }

    nFree = get4byte(&pBt->pPage1->aData[36]);
    db = p->db;
    if( db->xAutovacPages ){
      /* A callback is registered: find the index of this btree within
      ** db->aDb[] so its schema name can be passed to the callback, then
      ** let the callback choose how many pages to vacuum. */
      int iDb;
      for(iDb=0; ALWAYS(iDb<db->nDb); iDb++){
        if( db->aDb[iDb].pBt==p ) break;
      }
      nVac = db->xAutovacPages(
        db->pAutovacPagesArg,
        db->aDb[iDb].zDbSName,
        nOrig,
        nFree,
        pBt->pageSize
      );
      /* Never vacuum more pages than are on the free-list */
      if( nVac>nFree ){
        nVac = nFree;
      }
      /* The callback asked for nothing to be vacuumed */
      if( nVac==0 ){
        return SQLITE_OK;
      }
    }else{
      nVac = nFree;
    }
    nFin = finalDbSize(pBt, nOrig, nVac);
    if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
    if( nFin<nOrig ){
      rc = saveAllCursors(pBt, 0, 0);
    }
    /* Vacate pages from the end of the file down to nFin.  The bCommit
    ** argument is true only when the whole free-list is being removed. */
    for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
      rc = incrVacuumStep(pBt, nFin, iFree, nVac==nFree);
    }
    if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
      rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
      if( nVac==nFree ){
        /* Entire free-list removed: zero the 4-byte header fields at
        ** offsets 32 and 36 (offset 36 is the free-list page count). */
        put4byte(&pBt->pPage1->aData[32], 0);
        put4byte(&pBt->pPage1->aData[36], 0);
      }
      /* Record the new database size and request truncation */
      put4byte(&pBt->pPage1->aData[28], nFin);
      pBt->bDoTruncate = 1;
      pBt->nPage = nFin;
    }
    if( rc!=SQLITE_OK ){
      sqlite3PagerRollback(pPager);
    }
  }

  /* This routine must not leave extra page references outstanding */
  assert( nRef>=sqlite3PagerRefcount(pPager) );
  return rc;
}
004236
004237 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
004238 # define setChildPtrmaps(x) SQLITE_OK
004239 #endif
004240
004241 /*
004242 ** This routine does the first phase of a two-phase commit. This routine
004243 ** causes a rollback journal to be created (if it does not already exist)
004244 ** and populated with enough information so that if a power loss occurs
004245 ** the database can be restored to its original state by playing back
004246 ** the journal. Then the contents of the journal are flushed out to
004247 ** the disk. After the journal is safely on oxide, the changes to the
004248 ** database are written into the database file and flushed to oxide.
004249 ** At the end of this call, the rollback journal still exists on the
004250 ** disk and we are still holding all locks, so the transaction has not
004251 ** committed. See sqlite3BtreeCommitPhaseTwo() for the second phase of the
004252 ** commit process.
004253 **
004254 ** This call is a no-op if no write-transaction is currently active on pBt.
004255 **
004256 ** Otherwise, sync the database file for the btree pBt. zSuperJrnl points to
004257 ** the name of a super-journal file that should be written into the
004258 ** individual journal file, or is NULL, indicating no super-journal file
004259 ** (single database transaction).
004260 **
004261 ** When this is called, the super-journal should already have been
004262 ** created, populated with this journal pointer and synced to disk.
004263 **
004264 ** Once this routine has returned, the only thing required to commit
004265 ** the write-transaction for this database file is to delete the journal.
004266 */
004267 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zSuperJrnl){
004268 int rc = SQLITE_OK;
004269 if( p->inTrans==TRANS_WRITE ){
004270 BtShared *pBt = p->pBt;
004271 sqlite3BtreeEnter(p);
004272 #ifndef SQLITE_OMIT_AUTOVACUUM
004273 if( pBt->autoVacuum ){
004274 rc = autoVacuumCommit(p);
004275 if( rc!=SQLITE_OK ){
004276 sqlite3BtreeLeave(p);
004277 return rc;
004278 }
004279 }
004280 if( pBt->bDoTruncate ){
004281 sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage);
004282 }
004283 #endif
004284 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zSuperJrnl, 0);
004285 sqlite3BtreeLeave(p);
004286 }
004287 return rc;
004288 }
004289
004290 /*
004291 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
004292 ** at the conclusion of a transaction.
004293 */
004294 static void btreeEndTransaction(Btree *p){
004295 BtShared *pBt = p->pBt;
004296 sqlite3 *db = p->db;
004297 assert( sqlite3BtreeHoldsMutex(p) );
004298
004299 #ifndef SQLITE_OMIT_AUTOVACUUM
004300 pBt->bDoTruncate = 0;
004301 #endif
004302 if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){
004303 /* If there are other active statements that belong to this database
004304 ** handle, downgrade to a read-only transaction. The other statements
004305 ** may still be reading from the database. */
004306 downgradeAllSharedCacheTableLocks(p);
004307 p->inTrans = TRANS_READ;
004308 }else{
004309 /* If the handle had any kind of transaction open, decrement the
004310 ** transaction count of the shared btree. If the transaction count
004311 ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
004312 ** call below will unlock the pager. */
004313 if( p->inTrans!=TRANS_NONE ){
004314 clearAllSharedCacheTableLocks(p);
004315 pBt->nTransaction--;
004316 if( 0==pBt->nTransaction ){
004317 pBt->inTransaction = TRANS_NONE;
004318 }
004319 }
004320
004321 /* Set the current transaction state to TRANS_NONE and unlock the
004322 ** pager if this call closed the only read or write transaction. */
004323 p->inTrans = TRANS_NONE;
004324 unlockBtreeIfUnused(pBt);
004325 }
004326
004327 btreeIntegrity(p);
004328 }
004329
004330 /*
004331 ** Commit the transaction currently in progress.
004332 **
004333 ** This routine implements the second phase of a 2-phase commit. The
004334 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
004335 ** be invoked prior to calling this routine. The sqlite3BtreeCommitPhaseOne()
004336 ** routine did all the work of writing information out to disk and flushing the
004337 ** contents so that they are written onto the disk platter. All this
004338 ** routine has to do is delete or truncate or zero the header in the
004339 ** rollback journal (which causes the transaction to commit) and
004340 ** drop locks.
004341 **
004342 ** Normally, if an error occurs while the pager layer is attempting to
004343 ** finalize the underlying journal file, this function returns an error and
004344 ** the upper layer will attempt a rollback. However, if the second argument
004345 ** is non-zero then this b-tree transaction is part of a multi-file
004346 ** transaction. In this case, the transaction has already been committed
004347 ** (by deleting a super-journal file) and the caller will ignore this
004348 ** function's return code. So, even if an error occurs in the pager layer,
004349 ** reset the b-tree object's internal state to indicate that the write
004350 ** transaction has been closed. This is quite safe, as the pager will have
004351 ** transitioned to the error state.
004352 **
004353 ** This will release the write lock on the database file. If there
004354 ** are no active cursors, it also releases the read lock.
004355 */
004356 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
004357
004358 if( p->inTrans==TRANS_NONE ) return SQLITE_OK;
004359 sqlite3BtreeEnter(p);
004360 btreeIntegrity(p);
004361
004362 /* If the handle has a write-transaction open, commit the shared-btrees
004363 ** transaction and set the shared state to TRANS_READ.
004364 */
004365 if( p->inTrans==TRANS_WRITE ){
004366 int rc;
004367 BtShared *pBt = p->pBt;
004368 assert( pBt->inTransaction==TRANS_WRITE );
004369 assert( pBt->nTransaction>0 );
004370 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
004371 if( rc!=SQLITE_OK && bCleanup==0 ){
004372 sqlite3BtreeLeave(p);
004373 return rc;
004374 }
004375 p->iBDataVersion--; /* Compensate for pPager->iDataVersion++; */
004376 pBt->inTransaction = TRANS_READ;
004377 btreeClearHasContent(pBt);
004378 }
004379
004380 btreeEndTransaction(p);
004381 sqlite3BtreeLeave(p);
004382 return SQLITE_OK;
004383 }
004384
004385 /*
004386 ** Do both phases of a commit.
004387 */
004388 int sqlite3BtreeCommit(Btree *p){
004389 int rc;
004390 sqlite3BtreeEnter(p);
004391 rc = sqlite3BtreeCommitPhaseOne(p, 0);
004392 if( rc==SQLITE_OK ){
004393 rc = sqlite3BtreeCommitPhaseTwo(p, 0);
004394 }
004395 sqlite3BtreeLeave(p);
004396 return rc;
004397 }
004398
004399 /*
004400 ** This routine sets the state to CURSOR_FAULT and the error
004401 ** code to errCode for every cursor on any BtShared that pBtree
004402 ** references. Or if the writeOnly flag is set to 1, then only
004403 ** trip write cursors and leave read cursors unchanged.
004404 **
004405 ** Every cursor is a candidate to be tripped, including cursors
004406 ** that belong to other database connections that happen to be
004407 ** sharing the cache with pBtree.
004408 **
004409 ** This routine gets called when a rollback occurs. If the writeOnly
004410 ** flag is true, then only write-cursors need be tripped - read-only
004411 ** cursors save their current positions so that they may continue
004412 ** following the rollback. Or, if writeOnly is false, all cursors are
004413 ** tripped. In general, writeOnly is false if the transaction being
004414 ** rolled back modified the database schema. In this case b-tree root
004415 ** pages may be moved or deleted from the database altogether, making
004416 ** it unsafe for read cursors to continue.
004417 **
004418 ** If the writeOnly flag is true and an error is encountered while
004419 ** saving the current position of a read-only cursor, all cursors,
004420 ** including all read-cursors are tripped.
004421 **
004422 ** SQLITE_OK is returned if successful, or if an error occurs while
004423 ** saving a cursor position, an SQLite error code.
004424 */
004425 int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){
004426 BtCursor *p;
004427 int rc = SQLITE_OK;
004428
004429 assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 );
004430 if( pBtree ){
004431 sqlite3BtreeEnter(pBtree);
004432 for(p=pBtree->pBt->pCursor; p; p=p->pNext){
004433 if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){
004434 if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
004435 rc = saveCursorPosition(p);
004436 if( rc!=SQLITE_OK ){
004437 (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0);
004438 break;
004439 }
004440 }
004441 }else{
004442 sqlite3BtreeClearCursor(p);
004443 p->eState = CURSOR_FAULT;
004444 p->skipNext = errCode;
004445 }
004446 btreeReleaseAllCursorPages(p);
004447 }
004448 sqlite3BtreeLeave(pBtree);
004449 }
004450 return rc;
004451 }
004452
004453 /*
004454 ** Set the pBt->nPage field correctly, according to the current
004455 ** state of the database. Assume pBt->pPage1 is valid.
004456 */
004457 static void btreeSetNPage(BtShared *pBt, MemPage *pPage1){
004458 int nPage = get4byte(&pPage1->aData[28]);
004459 testcase( nPage==0 );
004460 if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
004461 testcase( pBt->nPage!=(u32)nPage );
004462 pBt->nPage = nPage;
004463 }
004464
004465 /*
004466 ** Rollback the transaction in progress.
004467 **
004468 ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped).
004469 ** Only write cursors are tripped if writeOnly is true but all cursors are
004470 ** tripped if writeOnly is false. Any attempt to use
004471 ** a tripped cursor will result in an error.
004472 **
004473 ** This will release the write lock on the database file. If there
004474 ** are no active cursors, it also releases the read lock.
004475 */
004476 int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){
004477 int rc;
004478 BtShared *pBt = p->pBt;
004479 MemPage *pPage1;
004480
004481 assert( writeOnly==1 || writeOnly==0 );
004482 assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK );
004483 sqlite3BtreeEnter(p);
004484 if( tripCode==SQLITE_OK ){
004485 rc = tripCode = saveAllCursors(pBt, 0, 0);
004486 if( rc ) writeOnly = 0;
004487 }else{
004488 rc = SQLITE_OK;
004489 }
004490 if( tripCode ){
004491 int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly);
004492 assert( rc==SQLITE_OK || (writeOnly==0 && rc2==SQLITE_OK) );
004493 if( rc2!=SQLITE_OK ) rc = rc2;
004494 }
004495 btreeIntegrity(p);
004496
004497 if( p->inTrans==TRANS_WRITE ){
004498 int rc2;
004499
004500 assert( TRANS_WRITE==pBt->inTransaction );
004501 rc2 = sqlite3PagerRollback(pBt->pPager);
004502 if( rc2!=SQLITE_OK ){
004503 rc = rc2;
004504 }
004505
004506 /* The rollback may have destroyed the pPage1->aData value. So
004507 ** call btreeGetPage() on page 1 again to make
004508 ** sure pPage1->aData is set correctly. */
004509 if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
004510 btreeSetNPage(pBt, pPage1);
004511 releasePageOne(pPage1);
004512 }
004513 assert( countValidCursors(pBt, 1)==0 );
004514 pBt->inTransaction = TRANS_READ;
004515 btreeClearHasContent(pBt);
004516 }
004517
004518 btreeEndTransaction(p);
004519 sqlite3BtreeLeave(p);
004520 return rc;
004521 }
004522
004523 /*
004524 ** Start a statement subtransaction. The subtransaction can be rolled
004525 ** back independently of the main transaction. You must start a transaction
004526 ** before starting a subtransaction. The subtransaction is ended automatically
004527 ** if the main transaction commits or rolls back.
004528 **
004529 ** Statement subtransactions are used around individual SQL statements
004530 ** that are contained within a BEGIN...COMMIT block. If a constraint
004531 ** error occurs within the statement, the effect of that one statement
004532 ** can be rolled back without having to rollback the entire transaction.
004533 **
004534 ** A statement sub-transaction is implemented as an anonymous savepoint. The
004535 ** value passed as the second parameter is the total number of savepoints,
004536 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
004537 ** are no active savepoints and no other statement-transactions open,
004538 ** iStatement is 1. This anonymous savepoint can be released or rolled back
004539 ** using the sqlite3BtreeSavepoint() function.
004540 */
004541 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
004542 int rc;
004543 BtShared *pBt = p->pBt;
004544 sqlite3BtreeEnter(p);
004545 assert( p->inTrans==TRANS_WRITE );
004546 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
004547 assert( iStatement>0 );
004548 assert( iStatement>p->db->nSavepoint );
004549 assert( pBt->inTransaction==TRANS_WRITE );
004550 /* At the pager level, a statement transaction is a savepoint with
004551 ** an index greater than all savepoints created explicitly using
004552 ** SQL statements. It is illegal to open, release or rollback any
004553 ** such savepoints while the statement transaction savepoint is active.
004554 */
004555 rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
004556 sqlite3BtreeLeave(p);
004557 return rc;
004558 }
004559
004560 /*
004561 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
004562 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
004563 ** savepoint identified by parameter iSavepoint, depending on the value
004564 ** of op.
004565 **
004566 ** Normally, iSavepoint is greater than or equal to zero. However, if op is
004567 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the
004568 ** contents of the entire transaction are rolled back. This is different
004569 ** from a normal transaction rollback, as no locks are released and the
004570 ** transaction remains open.
004571 */
004572 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
004573 int rc = SQLITE_OK;
004574 if( p && p->inTrans==TRANS_WRITE ){
004575 BtShared *pBt = p->pBt;
004576 assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
004577 assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
004578 sqlite3BtreeEnter(p);
004579 if( op==SAVEPOINT_ROLLBACK ){
004580 rc = saveAllCursors(pBt, 0, 0);
004581 }
004582 if( rc==SQLITE_OK ){
004583 rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
004584 }
004585 if( rc==SQLITE_OK ){
004586 if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){
004587 pBt->nPage = 0;
004588 }
004589 rc = newDatabase(pBt);
004590 btreeSetNPage(pBt, pBt->pPage1);
004591
004592 /* pBt->nPage might be zero if the database was corrupt when
004593 ** the transaction was started. Otherwise, it must be at least 1. */
004594 assert( CORRUPT_DB || pBt->nPage>0 );
004595 }
004596 sqlite3BtreeLeave(p);
004597 }
004598 return rc;
004599 }
004600
004601 /*
004602 ** Create a new cursor for the BTree whose root is on the page
004603 ** iTable. If a read-only cursor is requested, it is assumed that
004604 ** the caller already has at least a read-only transaction open
004605 ** on the database. If a write-cursor is requested, then
004606 ** the caller is assumed to have an open write transaction.
004607 **
004608 ** If the BTREE_WRCSR bit of wrFlag is clear, then the cursor can only
004609 ** be used for reading. If the BTREE_WRCSR bit is set, then the cursor
004610 ** can be used for reading or for writing if other conditions for writing
004611 ** are also met. These are the conditions that must be met in order
004612 ** for writing to be allowed:
004613 **
004614 ** 1: The cursor must have been opened with wrFlag containing BTREE_WRCSR
004615 **
004616 ** 2: Other database connections that share the same pager cache
004617 ** but which are not in the READ_UNCOMMITTED state may not have
004618 ** cursors open with wrFlag==0 on the same table. Otherwise
004619 ** the changes made by this write cursor would be visible to
004620 ** the read cursors in the other database connection.
004621 **
004622 ** 3: The database must be writable (not on read-only media)
004623 **
004624 ** 4: There must be an active transaction.
004625 **
004626 ** The BTREE_FORDELETE bit of wrFlag may optionally be set if BTREE_WRCSR
004627 ** is set. If FORDELETE is set, that is a hint to the implementation that
004628 ** this cursor will only be used to seek to and delete entries of an index
004629 ** as part of a larger DELETE statement. The FORDELETE hint is not used by
004630 ** this implementation. But in a hypothetical alternative storage engine
004631 ** in which index entries are automatically deleted when corresponding table
004632 ** rows are deleted, the FORDELETE flag is a hint that all SEEK and DELETE
004633 ** operations on this cursor can be no-ops and all READ operations can
004634 ** return a null row (2-bytes: 0x01 0x00).
004635 **
004636 ** No checking is done to make sure that page iTable really is the
004637 ** root page of a b-tree. If it is not, then the cursor acquired
004638 ** will not work correctly.
004639 **
004640 ** It is assumed that the sqlite3BtreeCursorZero() has been called
004641 ** on pCur to initialize the memory space prior to invoking this routine.
004642 */
004643 static int btreeCursor(
004644 Btree *p, /* The btree */
004645 Pgno iTable, /* Root page of table to open */
004646 int wrFlag, /* 1 to write. 0 read-only */
004647 struct KeyInfo *pKeyInfo, /* First arg to comparison function */
004648 BtCursor *pCur /* Space for new cursor */
004649 ){
004650 BtShared *pBt = p->pBt; /* Shared b-tree handle */
004651 BtCursor *pX; /* Looping over other all cursors */
004652
004653 assert( sqlite3BtreeHoldsMutex(p) );
004654 assert( wrFlag==0
004655 || wrFlag==BTREE_WRCSR
004656 || wrFlag==(BTREE_WRCSR|BTREE_FORDELETE)
004657 );
004658
004659 /* The following assert statements verify that if this is a sharable
004660 ** b-tree database, the connection is holding the required table locks,
004661 ** and that no other connection has any open cursor that conflicts with
004662 ** this lock. The iTable<1 term disables the check for corrupt schemas. */
004663 assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1))
004664 || iTable<1 );
004665 assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
004666
004667 /* Assert that the caller has opened the required transaction. */
004668 assert( p->inTrans>TRANS_NONE );
004669 assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
004670 assert( pBt->pPage1 && pBt->pPage1->aData );
004671 assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 );
004672
004673 if( iTable<=1 ){
004674 if( iTable<1 ){
004675 return SQLITE_CORRUPT_BKPT;
004676 }else if( btreePagecount(pBt)==0 ){
004677 assert( wrFlag==0 );
004678 iTable = 0;
004679 }
004680 }
004681
004682 /* Now that no other errors can occur, finish filling in the BtCursor
004683 ** variables and link the cursor into the BtShared list. */
004684 pCur->pgnoRoot = iTable;
004685 pCur->iPage = -1;
004686 pCur->pKeyInfo = pKeyInfo;
004687 pCur->pBtree = p;
004688 pCur->pBt = pBt;
004689 pCur->curFlags = 0;
004690 /* If there are two or more cursors on the same btree, then all such
004691 ** cursors *must* have the BTCF_Multiple flag set. */
004692 for(pX=pBt->pCursor; pX; pX=pX->pNext){
004693 if( pX->pgnoRoot==iTable ){
004694 pX->curFlags |= BTCF_Multiple;
004695 pCur->curFlags = BTCF_Multiple;
004696 }
004697 }
004698 pCur->eState = CURSOR_INVALID;
004699 pCur->pNext = pBt->pCursor;
004700 pBt->pCursor = pCur;
004701 if( wrFlag ){
004702 pCur->curFlags |= BTCF_WriteFlag;
004703 pCur->curPagerFlags = 0;
004704 if( pBt->pTmpSpace==0 ) return allocateTempSpace(pBt);
004705 }else{
004706 pCur->curPagerFlags = PAGER_GET_READONLY;
004707 }
004708 return SQLITE_OK;
004709 }
004710 static int btreeCursorWithLock(
004711 Btree *p, /* The btree */
004712 Pgno iTable, /* Root page of table to open */
004713 int wrFlag, /* 1 to write. 0 read-only */
004714 struct KeyInfo *pKeyInfo, /* First arg to comparison function */
004715 BtCursor *pCur /* Space for new cursor */
004716 ){
004717 int rc;
004718 sqlite3BtreeEnter(p);
004719 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
004720 sqlite3BtreeLeave(p);
004721 return rc;
004722 }
004723 int sqlite3BtreeCursor(
004724 Btree *p, /* The btree */
004725 Pgno iTable, /* Root page of table to open */
004726 int wrFlag, /* 1 to write. 0 read-only */
004727 struct KeyInfo *pKeyInfo, /* First arg to xCompare() */
004728 BtCursor *pCur /* Write new cursor here */
004729 ){
004730 if( p->sharable ){
004731 return btreeCursorWithLock(p, iTable, wrFlag, pKeyInfo, pCur);
004732 }else{
004733 return btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
004734 }
004735 }
004736
004737 /*
004738 ** Return the size of a BtCursor object in bytes.
004739 **
004740 ** This interface is needed so that users of cursors can preallocate
004741 ** sufficient storage to hold a cursor. The BtCursor object is opaque
004742 ** to users so they cannot do the sizeof() themselves - they must call
004743 ** this routine.
004744 */
004745 int sqlite3BtreeCursorSize(void){
004746 return ROUND8(sizeof(BtCursor));
004747 }
004748
#ifdef SQLITE_DEBUG
/*
** Return true if and only if the Btree object will be automatically
** closed when the BtCursor closes.  This is used within assert()
** statements only.
**
** The answer is true only when all four conditions tested below hold.
*/
int sqlite3BtreeClosesWithCursor(
  Btree *pBtree,       /* the btree object */
  BtCursor *pCur       /* Corresponding cursor */
){
  BtShared *pShared = pBtree->pBt;
  return (pShared->openFlags & BTREE_SINGLE)!=0  /* opened BTREE_SINGLE */
      && pShared->pCursor==pCur                  /* pCur heads the list */
      && pCur->pNext==0                          /* and nothing follows it */
      && pCur->pBtree==pBtree;                   /* pCur belongs to pBtree */
}
#endif
004767
004768 /*
004769 ** Initialize memory that will be converted into a BtCursor object.
004770 **
004771 ** The simple approach here would be to memset() the entire object
004772 ** to zero. But it turns out that the apPage[] and aiIdx[] arrays
004773 ** do not need to be zeroed and they are large, so we can save a lot
004774 ** of run-time by skipping the initialization of those elements.
004775 */
004776 void sqlite3BtreeCursorZero(BtCursor *p){
004777 memset(p, 0, offsetof(BtCursor, BTCURSOR_FIRST_UNINIT));
004778 }
004779
004780 /*
004781 ** Close a cursor. The read lock on the database file is released
004782 ** when the last cursor is closed.
004783 */
004784 int sqlite3BtreeCloseCursor(BtCursor *pCur){
004785 Btree *pBtree = pCur->pBtree;
004786 if( pBtree ){
004787 BtShared *pBt = pCur->pBt;
004788 sqlite3BtreeEnter(pBtree);
004789 assert( pBt->pCursor!=0 );
004790 if( pBt->pCursor==pCur ){
004791 pBt->pCursor = pCur->pNext;
004792 }else{
004793 BtCursor *pPrev = pBt->pCursor;
004794 do{
004795 if( pPrev->pNext==pCur ){
004796 pPrev->pNext = pCur->pNext;
004797 break;
004798 }
004799 pPrev = pPrev->pNext;
004800 }while( ALWAYS(pPrev) );
004801 }
004802 btreeReleaseAllCursorPages(pCur);
004803 unlockBtreeIfUnused(pBt);
004804 sqlite3_free(pCur->aOverflow);
004805 sqlite3_free(pCur->pKey);
004806 if( (pBt->openFlags & BTREE_SINGLE) && pBt->pCursor==0 ){
004807 /* Since the BtShared is not sharable, there is no need to
004808 ** worry about the missing sqlite3BtreeLeave() call here. */
004809 assert( pBtree->sharable==0 );
004810 sqlite3BtreeClose(pBtree);
004811 }else{
004812 sqlite3BtreeLeave(pBtree);
004813 }
004814 pCur->pBtree = 0;
004815 }
004816 return SQLITE_OK;
004817 }
004818
004819 /*
004820 ** Make sure the BtCursor* given in the argument has a valid
004821 ** BtCursor.info structure. If it is not already valid, call
004822 ** btreeParseCell() to fill it in.
004823 **
004824 ** BtCursor.info is a cache of the information in the current cell.
004825 ** Using this cache reduces the number of calls to btreeParseCell().
004826 */
004827 #ifndef NDEBUG
004828 static int cellInfoEqual(CellInfo *a, CellInfo *b){
004829 if( a->nKey!=b->nKey ) return 0;
004830 if( a->pPayload!=b->pPayload ) return 0;
004831 if( a->nPayload!=b->nPayload ) return 0;
004832 if( a->nLocal!=b->nLocal ) return 0;
004833 if( a->nSize!=b->nSize ) return 0;
004834 return 1;
004835 }
004836 static void assertCellInfo(BtCursor *pCur){
004837 CellInfo info;
004838 memset(&info, 0, sizeof(info));
004839 btreeParseCell(pCur->pPage, pCur->ix, &info);
004840 assert( CORRUPT_DB || cellInfoEqual(&info, &pCur->info) );
004841 }
004842 #else
004843 #define assertCellInfo(x)
004844 #endif
004845 static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){
004846 if( pCur->info.nSize==0 ){
004847 pCur->curFlags |= BTCF_ValidNKey;
004848 btreeParseCell(pCur->pPage,pCur->ix,&pCur->info);
004849 }else{
004850 assertCellInfo(pCur);
004851 }
004852 }
004853
004854 #ifndef NDEBUG /* The next routine used only within assert() statements */
004855 /*
004856 ** Return true if the given BtCursor is valid. A valid cursor is one
004857 ** that is currently pointing to a row in a (non-empty) table.
004858 ** This is a verification routine is used only within assert() statements.
004859 */
004860 int sqlite3BtreeCursorIsValid(BtCursor *pCur){
004861 return pCur && pCur->eState==CURSOR_VALID;
004862 }
004863 #endif /* NDEBUG */
004864 int sqlite3BtreeCursorIsValidNN(BtCursor *pCur){
004865 assert( pCur!=0 );
004866 return pCur->eState==CURSOR_VALID;
004867 }
004868
004869 /*
004870 ** Return the value of the integer key or "rowid" for a table btree.
004871 ** This routine is only valid for a cursor that is pointing into an
004872 ** ordinary table btree. If the cursor points to an index btree or
004873 ** is invalid, the result of this routine is undefined.
004874 */
004875 i64 sqlite3BtreeIntegerKey(BtCursor *pCur){
004876 assert( cursorHoldsMutex(pCur) );
004877 assert( pCur->eState==CURSOR_VALID );
004878 assert( pCur->curIntKey );
004879 getCellInfo(pCur);
004880 return pCur->info.nKey;
004881 }
004882
004883 /*
004884 ** Pin or unpin a cursor.
004885 */
004886 void sqlite3BtreeCursorPin(BtCursor *pCur){
004887 assert( (pCur->curFlags & BTCF_Pinned)==0 );
004888 pCur->curFlags |= BTCF_Pinned;
004889 }
004890 void sqlite3BtreeCursorUnpin(BtCursor *pCur){
004891 assert( (pCur->curFlags & BTCF_Pinned)!=0 );
004892 pCur->curFlags &= ~BTCF_Pinned;
004893 }
004894
004895 /*
004896 ** Return the offset into the database file for the start of the
004897 ** payload to which the cursor is pointing.
004898 */
004899 i64 sqlite3BtreeOffset(BtCursor *pCur){
004900 assert( cursorHoldsMutex(pCur) );
004901 assert( pCur->eState==CURSOR_VALID );
004902 getCellInfo(pCur);
004903 return (i64)pCur->pBt->pageSize*((i64)pCur->pPage->pgno - 1) +
004904 (i64)(pCur->info.pPayload - pCur->pPage->aData);
004905 }
004906
004907 /*
004908 ** Return the number of bytes of payload for the entry that pCur is
004909 ** currently pointing to. For table btrees, this will be the amount
004910 ** of data. For index btrees, this will be the size of the key.
004911 **
004912 ** The caller must guarantee that the cursor is pointing to a non-NULL
004913 ** valid entry. In other words, the calling procedure must guarantee
004914 ** that the cursor has Cursor.eState==CURSOR_VALID.
004915 */
004916 u32 sqlite3BtreePayloadSize(BtCursor *pCur){
004917 assert( cursorHoldsMutex(pCur) );
004918 assert( pCur->eState==CURSOR_VALID );
004919 getCellInfo(pCur);
004920 return pCur->info.nPayload;
004921 }
004922
004923 /*
004924 ** Return an upper bound on the size of any record for the table
004925 ** that the cursor is pointing into.
004926 **
004927 ** This is an optimization. Everything will still work if this
004928 ** routine always returns 2147483647 (which is the largest record
004929 ** that SQLite can handle) or more. But returning a smaller value might
004930 ** prevent large memory allocations when trying to interpret a
004931 ** corrupt database.
004932 **
004933 ** The current implementation merely returns the size of the underlying
004934 ** database file.
004935 */
004936 sqlite3_int64 sqlite3BtreeMaxRecordSize(BtCursor *pCur){
004937 assert( cursorHoldsMutex(pCur) );
004938 assert( pCur->eState==CURSOR_VALID );
004939 return pCur->pBt->pageSize * (sqlite3_int64)pCur->pBt->nPage;
004940 }
004941
004942 /*
004943 ** Given the page number of an overflow page in the database (parameter
004944 ** ovfl), this function finds the page number of the next page in the
004945 ** linked list of overflow pages. If possible, it uses the auto-vacuum
004946 ** pointer-map data instead of reading the content of page ovfl to do so.
004947 **
004948 ** If an error occurs an SQLite error code is returned. Otherwise:
004949 **
004950 ** The page number of the next overflow page in the linked list is
004951 ** written to *pPgnoNext. If page ovfl is the last page in its linked
004952 ** list, *pPgnoNext is set to zero.
004953 **
004954 ** If ppPage is not NULL, and a reference to the MemPage object corresponding
004955 ** to page number ovfl was obtained, then *ppPage is set to point to that
004956 ** reference. It is the responsibility of the caller to call releasePage()
004957 ** on *ppPage to free the reference. If no reference was obtained (because
004958 ** the pointer-map was used to obtain the value for *pPgnoNext), then
004959 ** *ppPage is set to zero.
004960 */
004961 static int getOverflowPage(
004962 BtShared *pBt, /* The database file */
004963 Pgno ovfl, /* Current overflow page number */
004964 MemPage **ppPage, /* OUT: MemPage handle (may be NULL) */
004965 Pgno *pPgnoNext /* OUT: Next overflow page number */
004966 ){
004967 Pgno next = 0;
004968 MemPage *pPage = 0;
004969 int rc = SQLITE_OK;
004970
004971 assert( sqlite3_mutex_held(pBt->mutex) );
004972 assert(pPgnoNext);
004973
004974 #ifndef SQLITE_OMIT_AUTOVACUUM
004975 /* Try to find the next page in the overflow list using the
004976 ** autovacuum pointer-map pages. Guess that the next page in
004977 ** the overflow list is page number (ovfl+1). If that guess turns
004978 ** out to be wrong, fall back to loading the data of page
004979 ** number ovfl to determine the next page number.
004980 */
004981 if( pBt->autoVacuum ){
004982 Pgno pgno;
004983 Pgno iGuess = ovfl+1;
004984 u8 eType;
004985
004986 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
004987 iGuess++;
004988 }
004989
004990 if( iGuess<=btreePagecount(pBt) ){
004991 rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
004992 if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
004993 next = iGuess;
004994 rc = SQLITE_DONE;
004995 }
004996 }
004997 }
004998 #endif
004999
005000 assert( next==0 || rc==SQLITE_DONE );
005001 if( rc==SQLITE_OK ){
005002 rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0);
005003 assert( rc==SQLITE_OK || pPage==0 );
005004 if( rc==SQLITE_OK ){
005005 next = get4byte(pPage->aData);
005006 }
005007 }
005008
005009 *pPgnoNext = next;
005010 if( ppPage ){
005011 *ppPage = pPage;
005012 }else{
005013 releasePage(pPage);
005014 }
005015 return (rc==SQLITE_DONE ? SQLITE_OK : rc);
005016 }
005017
005018 /*
005019 ** Copy data from a buffer to a page, or from a page to a buffer.
005020 **
005021 ** pPayload is a pointer to data stored on database page pDbPage.
005022 ** If argument eOp is false, then nByte bytes of data are copied
005023 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
005024 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
005025 ** of data are copied from the buffer pBuf to pPayload.
005026 **
005027 ** SQLITE_OK is returned on success, otherwise an error code.
005028 */
005029 static int copyPayload(
005030 void *pPayload, /* Pointer to page data */
005031 void *pBuf, /* Pointer to buffer */
005032 int nByte, /* Number of bytes to copy */
005033 int eOp, /* 0 -> copy from page, 1 -> copy to page */
005034 DbPage *pDbPage /* Page containing pPayload */
005035 ){
005036 if( eOp ){
005037 /* Copy data from buffer to page (a write operation) */
005038 int rc = sqlite3PagerWrite(pDbPage);
005039 if( rc!=SQLITE_OK ){
005040 return rc;
005041 }
005042 memcpy(pPayload, pBuf, nByte);
005043 }else{
005044 /* Copy data from page to buffer (a read operation) */
005045 memcpy(pBuf, pPayload, nByte);
005046 }
005047 return SQLITE_OK;
005048 }
005049
005050 /*
005051 ** This function is used to read or overwrite payload information
005052 ** for the entry that the pCur cursor is pointing to. The eOp
005053 ** argument is interpreted as follows:
005054 **
005055 ** 0: The operation is a read. Populate the overflow cache.
005056 ** 1: The operation is a write. Populate the overflow cache.
005057 **
005058 ** A total of "amt" bytes are read or written beginning at "offset".
005059 ** Data is copied into (read) or out of (write) the buffer pBuf.
005060 **
005061 ** The content being read or written might appear on the main page
005062 ** or be scattered out on multiple overflow pages.
005063 **
005064 ** If the current cursor entry uses one or more overflow pages
005065 ** this function may allocate space for and lazily populate
005066 ** the overflow page-list cache array (BtCursor.aOverflow).
005067 ** Subsequent calls use this cache to make seeking to the supplied offset
005068 ** more efficient.
005069 **
005070 ** Once an overflow page-list cache has been allocated, it must be
005071 ** invalidated if some other cursor writes to the same table, or if
005072 ** the cursor is moved to a different row. Additionally, in auto-vacuum
005073 ** mode, the following events may invalidate an overflow page-list cache.
005074 **
005075 ** * An incremental vacuum,
005076 ** * A commit in auto_vacuum="full" mode,
005077 ** * Creating a table (may require moving an overflow page).
**
** Return values:
**
** SQLITE_OK is returned once all "amt" bytes have been transferred.
** SQLITE_CORRUPT_PAGE() is returned if the cursor's cell index is not
** less than the number of cells on the page, if the local payload would
** extend past the usable area of the page, or if the overflow chain ends
** while bytes remain to be transferred. SQLITE_CORRUPT_BKPT is returned
** if an overflow page number exceeds pBt->nPage. SQLITE_NOMEM_BKPT is
** returned if the overflow page-list cache cannot be allocated. Any
** error code from copyPayload(), getOverflowPage(), sqlite3PagerGet()
** or sqlite3OsRead() is returned unchanged.
005078 */
005079 static int accessPayload(
005080 BtCursor *pCur, /* Cursor pointing to entry to read or write */
005081 u32 offset, /* Begin reading or writing this far into payload */
005082 u32 amt, /* Transfer this many bytes */
005083 unsigned char *pBuf, /* Read: destination buffer. Write: source buffer */
005084 int eOp /* zero to read. non-zero to write. */
005085 ){
005086 unsigned char *aPayload;
005087 int rc = SQLITE_OK;
005088 int iIdx = 0; /* Index into pCur->aOverflow[] for page nextPage */
005089 MemPage *pPage = pCur->pPage; /* Btree page of current entry */
005090 BtShared *pBt = pCur->pBt; /* Btree this cursor belongs to */
005091 #ifdef SQLITE_DIRECT_OVERFLOW_READ
005092 unsigned char * const pBufStart = pBuf; /* Start of original out buffer */
005093 #endif
005094
005095 assert( pPage );
005096 assert( eOp==0 || eOp==1 );
005097 assert( pCur->eState==CURSOR_VALID );
005098 if( pCur->ix>=pPage->nCell ){ /* Cell index is past the last cell */
005099 return SQLITE_CORRUPT_PAGE(pPage);
005100 }
005101 assert( cursorHoldsMutex(pCur) );
005102
005103 getCellInfo(pCur);
005104 aPayload = pCur->info.pPayload;
005105 assert( offset+amt <= pCur->info.nPayload );
005106
005107 assert( aPayload > pPage->aData );
005108 if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){
005109 /* Trying to read or write past the end of the data is an error. The
005110 ** conditional above is really:
005111 ** &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
005112 ** but is recast into its current form to avoid integer overflow problems
005113 */
005114 return SQLITE_CORRUPT_PAGE(pPage);
005115 }
005116
005117 /* Check if data must be read/written to/from the btree page itself. */
005118 if( offset<pCur->info.nLocal ){
005119 int a = amt; /* Bytes to transfer within the local payload */
005120 if( a+offset>pCur->info.nLocal ){
005121 a = pCur->info.nLocal - offset;
005122 }
005123 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
005124 offset = 0; /* Remaining bytes begin at the start of overflow content */
005125 pBuf += a;
005126 amt -= a;
005127 }else{
005128 offset -= pCur->info.nLocal; /* Make offset relative to overflow content */
005129 }
005130
005131
005132 if( rc==SQLITE_OK && amt>0 ){
005133 const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */
005134 Pgno nextPage;
005135
/* The 4 bytes that follow the local payload are read as the page number
** of the first overflow page. */
005136 nextPage = get4byte(&aPayload[pCur->info.nLocal]);
005137
005138 /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.
005139 **
005140 ** The aOverflow[] array is sized at one entry for each overflow page
005141 ** in the overflow chain. The page number of the first overflow page is
005142 ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array
005143 ** means "not yet known" (the cache is lazily populated).
005144 */
005145 if( (pCur->curFlags & BTCF_ValidOvfl)==0 ){
/* nOvfl is the non-local payload size divided by ovflSize, rounded up. */
005146 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
005147 if( pCur->aOverflow==0
005148 || nOvfl*(int)sizeof(Pgno) > sqlite3MallocSize(pCur->aOverflow)
005149 ){
005150 Pgno *aNew;
005151 if( sqlite3FaultSim(413) ){
005152 aNew = 0;
005153 }else{
/* Room for 2*nOvfl entries is requested; only nOvfl are cleared below. */
005154 aNew = (Pgno*)sqlite3Realloc(pCur->aOverflow, nOvfl*2*sizeof(Pgno));
005155 }
005156 if( aNew==0 ){
005157 return SQLITE_NOMEM_BKPT;
005158 }else{
005159 pCur->aOverflow = aNew;
005160 }
005161 }
005162 memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));
005163 pCur->curFlags |= BTCF_ValidOvfl;
005164 }else{
005165 /* Sanity check the validity of the overflow page cache */
005166 assert( pCur->aOverflow[0]==nextPage
005167 || pCur->aOverflow[0]==0
005168 || CORRUPT_DB );
005169 assert( pCur->aOverflow[0]!=0 || pCur->aOverflow[offset/ovflSize]==0 );
005170
005171 /* If the overflow page-list cache has been allocated and the
005172 ** entry for the first required overflow page is valid, skip
005173 ** directly to it.
005174 */
005175 if( pCur->aOverflow[offset/ovflSize] ){
005176 iIdx = (offset/ovflSize);
005177 nextPage = pCur->aOverflow[iIdx];
005178 offset = (offset%ovflSize); /* Offset within that overflow page */
005179 }
005180 }
005181
005182 assert( rc==SQLITE_OK && amt>0 );
005183 while( nextPage ){
005184 /* If required, populate the overflow page-list cache. */
005185 if( nextPage > pBt->nPage ) return SQLITE_CORRUPT_BKPT;
005186 assert( pCur->aOverflow[iIdx]==0
005187 || pCur->aOverflow[iIdx]==nextPage
005188 || CORRUPT_DB );
005189 pCur->aOverflow[iIdx] = nextPage;
005190
005191 if( offset>=ovflSize ){
005192 /* The only reason to read this page is to obtain the page
005193 ** number for the next page in the overflow chain. The page
005194 ** data is not required. So first try to lookup the overflow
005195 ** page-list cache, if any, then fall back to the getOverflowPage()
005196 ** function.
005197 */
005198 assert( pCur->curFlags & BTCF_ValidOvfl );
005199 assert( pCur->pBtree->db==pBt->db );
005200 if( pCur->aOverflow[iIdx+1] ){
005201 nextPage = pCur->aOverflow[iIdx+1];
005202 }else{
005203 rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
005204 }
005205 offset -= ovflSize; /* One whole overflow page has been skipped */
005206 }else{
005207 /* Need to read this page properly. It contains some of the
005208 ** range of data that is being read (eOp==0) or written (eOp!=0).
005209 */
005210 int a = amt; /* Bytes to transfer on this overflow page */
005211 if( a + offset > ovflSize ){
005212 a = ovflSize - offset;
005213 }
005214
005215 #ifdef SQLITE_DIRECT_OVERFLOW_READ
005216 /* If all the following are true:
005217 **
005218 ** 1) this is a read operation, and
005219 ** 2) data is required from the start of this overflow page, and
005220 ** 3) there are no dirty pages in the page-cache
005221 ** 4) the database is file-backed, and
005222 ** 5) the page is not in the WAL file
005223 ** 6) at least 4 bytes have already been read into the output buffer
005224 **
005225 ** then data can be read directly from the database file into the
005226 ** output buffer, bypassing the page-cache altogether. This speeds
005227 ** up loading large records that span many overflow pages.
005228 */
005229 if( eOp==0 /* (1) */
005230 && offset==0 /* (2) */
005231 && sqlite3PagerDirectReadOk(pBt->pPager, nextPage) /* (3,4,5) */
005232 && &pBuf[-4]>=pBufStart /* (6) */
005233 ){
005234 sqlite3_file *fd = sqlite3PagerFile(pBt->pPager);
005235 u8 aSave[4];
005236 u8 *aWrite = &pBuf[-4];
005237 assert( aWrite>=pBufStart ); /* due to (6) */
/* The read begins 4 bytes before pBuf so that the first 4 bytes read (the
** next-page number) land on output bytes that were already produced. Those
** bytes are saved first and restored once the page number is extracted. */
005238 memcpy(aSave, aWrite, 4);
005239 rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
005240 nextPage = get4byte(aWrite);
005241 memcpy(aWrite, aSave, 4);
005242 }else
005243 #endif
005244
005245 {
005246 DbPage *pDbPage;
005247 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage,
005248 (eOp==0 ? PAGER_GET_READONLY : 0)
005249 );
005250 if( rc==SQLITE_OK ){
005251 aPayload = sqlite3PagerGetData(pDbPage);
005252 nextPage = get4byte(aPayload); /* Page starts with next-page number */
005253 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
005254 sqlite3PagerUnref(pDbPage);
005255 offset = 0;
005256 }
005257 }
005258 amt -= a;
005259 if( amt==0 ) return rc; /* Nothing left; rc is the last transfer result */
005260 pBuf += a;
005261 }
005262 if( rc ) break; /* Stop walking the chain on the first error */
005263 iIdx++;
005264 }
005265 }
005266
005267 if( rc==SQLITE_OK && amt>0 ){
005268 /* Overflow chain ends prematurely */
005269 return SQLITE_CORRUPT_PAGE(pPage);
005270 }
005271 return rc;
005272 }
005273
005274 /*
005275 ** Read part of the payload for the row at which that cursor pCur is currently
005276 ** pointing. "amt" bytes will be transferred into pBuf[]. The transfer
005277 ** begins at "offset".
005278 **
005279 ** pCur can be pointing to either a table or an index b-tree.
005280 ** If pointing to a table btree, then the content section is read. If
005281 ** pCur is pointing to an index b-tree then the key section is read.
005282 **
005283 ** For sqlite3BtreePayload(), the caller must ensure that pCur is pointing
005284 ** to a valid row in the table. For sqlite3BtreePayloadChecked(), the
005285 ** cursor might be invalid or might need to be restored before being read.
005286 **
005287 ** Return SQLITE_OK on success or an error code if anything goes
005288 ** wrong. An error is returned if "offset+amt" is larger than
005289 ** the available payload.
005290 */
005291 int sqlite3BtreePayload(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
005292 assert( cursorHoldsMutex(pCur) );
005293 assert( pCur->eState==CURSOR_VALID );
005294 assert( pCur->iPage>=0 && pCur->pPage );
005295 return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
005296 }
005297
005298 /*
005299 ** This variant of sqlite3BtreePayload() works even if the cursor is not
005300 ** in the CURSOR_VALID state. It is only used by the sqlite3_blob_read()
005301 ** interface.
005302 */
005303 #ifndef SQLITE_OMIT_INCRBLOB
005304 static SQLITE_NOINLINE int accessPayloadChecked(
005305 BtCursor *pCur,
005306 u32 offset,
005307 u32 amt,
005308 void *pBuf
005309 ){
005310 int rc;
005311 if ( pCur->eState==CURSOR_INVALID ){
005312 return SQLITE_ABORT;
005313 }
005314 assert( cursorOwnsBtShared(pCur) );
005315 rc = btreeRestoreCursorPosition(pCur);
005316 return rc ? rc : accessPayload(pCur, offset, amt, pBuf, 0);
005317 }
005318 int sqlite3BtreePayloadChecked(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
005319 if( pCur->eState==CURSOR_VALID ){
005320 assert( cursorOwnsBtShared(pCur) );
005321 return accessPayload(pCur, offset, amt, pBuf, 0);
005322 }else{
005323 return accessPayloadChecked(pCur, offset, amt, pBuf);
005324 }
005325 }
005326 #endif /* SQLITE_OMIT_INCRBLOB */
005327
005328 /*
005329 ** Return a pointer to payload information from the entry that the
005330 ** pCur cursor is pointing to. The pointer is to the beginning of
005331 ** the key if index btrees (pPage->intKey==0) and is the data for
005332 ** table btrees (pPage->intKey==1). The number of bytes of available
005333 ** key/data is written into *pAmt. If *pAmt==0, then the value
005334 ** returned will not be a valid pointer.
005335 **
005336 ** This routine is an optimization. It is common for the entire key
005337 ** and data to fit on the local page and for there to be no overflow
005338 ** pages. When that is so, this routine can be used to access the
005339 ** key and data without making a copy. If the key and/or data spills
005340 ** onto overflow pages, then accessPayload() must be used to reassemble
005341 ** the key/data and copy it into a preallocated buffer.
005342 **
005343 ** The pointer returned by this routine looks directly into the cached
005344 ** page of the database. The data might change or move the next time
005345 ** any btree routine is called.
005346 */
005347 static const void *fetchPayload(
005348 BtCursor *pCur, /* Cursor pointing to entry to read from */
005349 u32 *pAmt /* Write the number of available bytes here */
005350 ){
005351 int amt;
005352 assert( pCur!=0 && pCur->iPage>=0 && pCur->pPage);
005353 assert( pCur->eState==CURSOR_VALID );
005354 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
005355 assert( cursorOwnsBtShared(pCur) );
005356 assert( pCur->ix<pCur->pPage->nCell || CORRUPT_DB );
005357 assert( pCur->info.nSize>0 );
005358 assert( pCur->info.pPayload>pCur->pPage->aData || CORRUPT_DB );
005359 assert( pCur->info.pPayload<pCur->pPage->aDataEnd ||CORRUPT_DB);
005360 amt = pCur->info.nLocal;
005361 if( amt>(int)(pCur->pPage->aDataEnd - pCur->info.pPayload) ){
005362 /* There is too little space on the page for the expected amount
005363 ** of local content. Database must be corrupt. */
005364 assert( CORRUPT_DB );
005365 amt = MAX(0, (int)(pCur->pPage->aDataEnd - pCur->info.pPayload));
005366 }
005367 *pAmt = (u32)amt;
005368 return (void*)pCur->info.pPayload;
005369 }
005370
005371
005372 /*
005373 ** For the entry that cursor pCur is point to, return as
005374 ** many bytes of the key or data as are available on the local
005375 ** b-tree page. Write the number of available bytes into *pAmt.
005376 **
005377 ** The pointer returned is ephemeral. The key/data may move
005378 ** or be destroyed on the next call to any Btree routine,
005379 ** including calls from other threads against the same cache.
005380 ** Hence, a mutex on the BtShared should be held prior to calling
005381 ** this routine.
005382 **
005383 ** These routines is used to get quick access to key and data
005384 ** in the common case where no overflow pages are used.
005385 */
005386 const void *sqlite3BtreePayloadFetch(BtCursor *pCur, u32 *pAmt){
005387 return fetchPayload(pCur, pAmt);
005388 }
005389
005390
005391 /*
005392 ** Move the cursor down to a new child page. The newPgno argument is the
005393 ** page number of the child page to move to.
005394 **
005395 ** This function returns SQLITE_CORRUPT if the page-header flags field of
005396 ** the new child page does not match the flags field of the parent (i.e.
005397 ** if an intkey page appears to be the parent of a non-intkey page, or
005398 ** vice-versa).
005399 */
005400 static int moveToChild(BtCursor *pCur, u32 newPgno){
005401 int rc;
005402 assert( cursorOwnsBtShared(pCur) );
005403 assert( pCur->eState==CURSOR_VALID );
005404 assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
005405 assert( pCur->iPage>=0 );
005406 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
005407 return SQLITE_CORRUPT_BKPT;
005408 }
005409 pCur->info.nSize = 0;
005410 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
005411 pCur->aiIdx[pCur->iPage] = pCur->ix;
005412 pCur->apPage[pCur->iPage] = pCur->pPage;
005413 pCur->ix = 0;
005414 pCur->iPage++;
005415 rc = getAndInitPage(pCur->pBt, newPgno, &pCur->pPage, pCur->curPagerFlags);
005416 assert( pCur->pPage!=0 || rc!=SQLITE_OK );
005417 if( rc==SQLITE_OK
005418 && (pCur->pPage->nCell<1 || pCur->pPage->intKey!=pCur->curIntKey)
005419 ){
005420 releasePage(pCur->pPage);
005421 rc = SQLITE_CORRUPT_PGNO(newPgno);
005422 }
005423 if( rc ){
005424 pCur->pPage = pCur->apPage[--pCur->iPage];
005425 }
005426 return rc;
005427 }
005428
#ifdef SQLITE_DEBUG
/*
** Debugging aid.  pParent is an internal (non-leaf) tree page.  Assert
** that page number iChild is the left-child of the iIdx'th cell of
** pParent or, when iIdx equals the number of cells on pParent, that
** iChild is the right-child of the page.
**
** Nothing is checked when CORRUPT_DB is true, because these conditions
** might not hold in a corrupt database.
*/
static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
  if( CORRUPT_DB ) return;
  assert( iIdx<=pParent->nCell );
  if( iIdx!=pParent->nCell ){
    /* iChild must be the left-child pointer of cell iIdx */
    assert( get4byte(findCell(pParent, iIdx))==iChild );
  }else{
    /* iChild must be the right-child pointer in the page header */
    assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
  }
}
#else
# define assertParentIndex(x,y,z)
#endif
005450
005451 /*
005452 ** Move the cursor up to the parent page.
005453 **
005454 ** pCur->idx is set to the cell index that contains the pointer
005455 ** to the page we are coming from. If we are coming from the
005456 ** right-most child page then pCur->idx is set to one more than
005457 ** the largest cell index.
005458 */
005459 static void moveToParent(BtCursor *pCur){
005460 MemPage *pLeaf;
005461 assert( cursorOwnsBtShared(pCur) );
005462 assert( pCur->eState==CURSOR_VALID );
005463 assert( pCur->iPage>0 );
005464 assert( pCur->pPage );
005465 assertParentIndex(
005466 pCur->apPage[pCur->iPage-1],
005467 pCur->aiIdx[pCur->iPage-1],
005468 pCur->pPage->pgno
005469 );
005470 testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );
005471 pCur->info.nSize = 0;
005472 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
005473 pCur->ix = pCur->aiIdx[pCur->iPage-1];
005474 pLeaf = pCur->pPage;
005475 pCur->pPage = pCur->apPage[--pCur->iPage];
005476 releasePageNotNull(pLeaf);
005477 }
005478
005479 /*
005480 ** Move the cursor to point to the root page of its b-tree structure.
005481 **
005482 ** If the table has a virtual root page, then the cursor is moved to point
005483 ** to the virtual root page instead of the actual root page. A table has a
005484 ** virtual root page when the actual root page contains no cells and a
005485 ** single child page. This can only happen with the table rooted at page 1.
005486 **
005487 ** If the b-tree structure is empty, the cursor state is set to
005488 ** CURSOR_INVALID and this routine returns SQLITE_EMPTY. Otherwise,
005489 ** the cursor is set to point to the first cell located on the root
005490 ** (or virtual root) page and the cursor state is set to CURSOR_VALID.
005491 **
005492 ** If this function returns successfully, it may be assumed that the
005493 ** page-header flags indicate that the [virtual] root-page is the expected
005494 ** kind of b-tree page (i.e. if when opening the cursor the caller did not
005495 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
005496 ** indicating a table b-tree, or if the caller did specify a KeyInfo
005497 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index
005498 ** b-tree).
005499 */
005500 static int moveToRoot(BtCursor *pCur){
005501 MemPage *pRoot;
005502 int rc = SQLITE_OK;
005503
005504 assert( cursorOwnsBtShared(pCur) );
005505 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
005506 assert( CURSOR_VALID < CURSOR_REQUIRESEEK );
005507 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK );
005508 assert( pCur->eState < CURSOR_REQUIRESEEK || pCur->iPage<0 );
005509 assert( pCur->pgnoRoot>0 || pCur->iPage<0 );
005510
005511 if( pCur->iPage>=0 ){
005512 if( pCur->iPage ){
005513 releasePageNotNull(pCur->pPage);
005514 while( --pCur->iPage ){
005515 releasePageNotNull(pCur->apPage[pCur->iPage]);
005516 }
005517 pRoot = pCur->pPage = pCur->apPage[0];
005518 goto skip_init;
005519 }
005520 }else if( pCur->pgnoRoot==0 ){
005521 pCur->eState = CURSOR_INVALID;
005522 return SQLITE_EMPTY;
005523 }else{
005524 assert( pCur->iPage==(-1) );
005525 if( pCur->eState>=CURSOR_REQUIRESEEK ){
005526 if( pCur->eState==CURSOR_FAULT ){
005527 assert( pCur->skipNext!=SQLITE_OK );
005528 return pCur->skipNext;
005529 }
005530 sqlite3BtreeClearCursor(pCur);
005531 }
005532 rc = getAndInitPage(pCur->pBt, pCur->pgnoRoot, &pCur->pPage,
005533 pCur->curPagerFlags);
005534 if( rc!=SQLITE_OK ){
005535 pCur->eState = CURSOR_INVALID;
005536 return rc;
005537 }
005538 pCur->iPage = 0;
005539 pCur->curIntKey = pCur->pPage->intKey;
005540 }
005541 pRoot = pCur->pPage;
005542 assert( pRoot->pgno==pCur->pgnoRoot || CORRUPT_DB );
005543
005544 /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
005545 ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
005546 ** NULL, the caller expects a table b-tree. If this is not the case,
005547 ** return an SQLITE_CORRUPT error.
005548 **
005549 ** Earlier versions of SQLite assumed that this test could not fail
005550 ** if the root page was already loaded when this function was called (i.e.
005551 ** if pCur->iPage>=0). But this is not so if the database is corrupted
005552 ** in such a way that page pRoot is linked into a second b-tree table
005553 ** (or the freelist). */
005554 assert( pRoot->intKey==1 || pRoot->intKey==0 );
005555 if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){
005556 return SQLITE_CORRUPT_PAGE(pCur->pPage);
005557 }
005558
005559 skip_init:
005560 pCur->ix = 0;
005561 pCur->info.nSize = 0;
005562 pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl);
005563
005564 if( pRoot->nCell>0 ){
005565 pCur->eState = CURSOR_VALID;
005566 }else if( !pRoot->leaf ){
005567 Pgno subpage;
005568 if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
005569 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
005570 pCur->eState = CURSOR_VALID;
005571 rc = moveToChild(pCur, subpage);
005572 }else{
005573 pCur->eState = CURSOR_INVALID;
005574 rc = SQLITE_EMPTY;
005575 }
005576 return rc;
005577 }
005578
005579 /*
005580 ** Move the cursor down to the left-most leaf entry beneath the
005581 ** entry to which it is currently pointing.
005582 **
005583 ** The left-most leaf is the one with the smallest key - the first
005584 ** in ascending order.
005585 */
005586 static int moveToLeftmost(BtCursor *pCur){
005587 Pgno pgno;
005588 int rc = SQLITE_OK;
005589 MemPage *pPage;
005590
005591 assert( cursorOwnsBtShared(pCur) );
005592 assert( pCur->eState==CURSOR_VALID );
005593 while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){
005594 assert( pCur->ix<pPage->nCell );
005595 pgno = get4byte(findCell(pPage, pCur->ix));
005596 rc = moveToChild(pCur, pgno);
005597 }
005598 return rc;
005599 }
005600
005601 /*
005602 ** Move the cursor down to the right-most leaf entry beneath the
005603 ** page to which it is currently pointing. Notice the difference
005604 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
005605 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
005606 ** finds the right-most entry beneath the *page*.
005607 **
005608 ** The right-most entry is the one with the largest key - the last
005609 ** key in ascending order.
005610 */
005611 static int moveToRightmost(BtCursor *pCur){
005612 Pgno pgno;
005613 int rc = SQLITE_OK;
005614 MemPage *pPage = 0;
005615
005616 assert( cursorOwnsBtShared(pCur) );
005617 assert( pCur->eState==CURSOR_VALID );
005618 while( !(pPage = pCur->pPage)->leaf ){
005619 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
005620 pCur->ix = pPage->nCell;
005621 rc = moveToChild(pCur, pgno);
005622 if( rc ) return rc;
005623 }
005624 pCur->ix = pPage->nCell-1;
005625 assert( pCur->info.nSize==0 );
005626 assert( (pCur->curFlags & BTCF_ValidNKey)==0 );
005627 return SQLITE_OK;
005628 }
005629
005630 /* Move the cursor to the first entry in the table. Return SQLITE_OK
005631 ** on success. Set *pRes to 0 if the cursor actually points to something
005632 ** or set *pRes to 1 if the table is empty.
005633 */
005634 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
005635 int rc;
005636
005637 assert( cursorOwnsBtShared(pCur) );
005638 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
005639 rc = moveToRoot(pCur);
005640 if( rc==SQLITE_OK ){
005641 assert( pCur->pPage->nCell>0 );
005642 *pRes = 0;
005643 rc = moveToLeftmost(pCur);
005644 }else if( rc==SQLITE_EMPTY ){
005645 assert( pCur->pgnoRoot==0 || (pCur->pPage!=0 && pCur->pPage->nCell==0) );
005646 *pRes = 1;
005647 rc = SQLITE_OK;
005648 }
005649 return rc;
005650 }
005651
#ifdef SQLITE_DEBUG
/* The cursor is CURSOR_VALID and has BTCF_AtLast set.  Verify that the
** flag is accurate for a consistent database: every ancestor page must
** have been left through the index equal to its cell count, and the
** cursor must be on the final cell of a leaf page.
**
** This routine is called from within assert() statements only.  It is
** an internal verification routine and does not appear in production
** builds.
*/
static int cursorIsAtLastEntry(BtCursor *pCur){
  int iLevel = 0;
  while( iLevel<pCur->iPage ){
    if( pCur->aiIdx[iLevel]!=pCur->apPage[iLevel]->nCell ) return 0;
    iLevel++;
  }
  if( pCur->pPage->leaf==0 ) return 0;
  return pCur->ix==pCur->pPage->nCell-1;
}
#endif
005668
005669 /* Move the cursor to the last entry in the table. Return SQLITE_OK
005670 ** on success. Set *pRes to 0 if the cursor actually points to something
005671 ** or set *pRes to 1 if the table is empty.
005672 */
005673 static SQLITE_NOINLINE int btreeLast(BtCursor *pCur, int *pRes){
005674 int rc = moveToRoot(pCur);
005675 if( rc==SQLITE_OK ){
005676 assert( pCur->eState==CURSOR_VALID );
005677 *pRes = 0;
005678 rc = moveToRightmost(pCur);
005679 if( rc==SQLITE_OK ){
005680 pCur->curFlags |= BTCF_AtLast;
005681 }else{
005682 pCur->curFlags &= ~BTCF_AtLast;
005683 }
005684 }else if( rc==SQLITE_EMPTY ){
005685 assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
005686 *pRes = 1;
005687 rc = SQLITE_OK;
005688 }
005689 return rc;
005690 }
005691 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
005692 assert( cursorOwnsBtShared(pCur) );
005693 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
005694
005695 /* If the cursor already points to the last entry, this is a no-op. */
005696 if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){
005697 assert( cursorIsAtLastEntry(pCur) || CORRUPT_DB );
005698 *pRes = 0;
005699 return SQLITE_OK;
005700 }
005701 return btreeLast(pCur, pRes);
005702 }
005703
005704 /* Move the cursor so that it points to an entry in a table (a.k.a. INTKEY)
005705 ** b-tree near the key intKey. Return a success code.
005706 **
005707 ** If an exact match is not found, then the cursor is always
005708 ** left pointing at a leaf page which would hold the entry if it
005709 ** were present. The cursor might point to an entry that comes
005710 ** before or after the key.
005711 **
005712 ** An integer is written into *pRes which is the result of
005713 ** comparing the key with the entry to which the cursor is
005714 ** pointing. The meaning of the integer written into
005715 ** *pRes is as follows:
005716 **
005717 ** *pRes<0 The cursor is left pointing at an entry that
005718 ** is smaller than intKey or if the table is empty
005719 ** and the cursor is therefore left pointing to nothing.
005720 **
005721 ** *pRes==0 The cursor is left pointing at an entry that
005722 ** exactly matches intKey.
005723 **
005724 ** *pRes>0 The cursor is left pointing at an entry that
005725 ** is larger than intKey.
**
** Return values:
**
** SQLITE_OK is returned when the search completes, including the case
** of an empty table (*pRes is then -1). SQLITE_CORRUPT_PAGE() is returned
** if a varint being skipped on an intKeyLeaf page runs to the end of the
** page. An error from sqlite3BtreeNext() other than SQLITE_DONE, or any
** error from moveToRoot() or moveToChild(), is returned unchanged.
005726 */
005727 int sqlite3BtreeTableMoveto(
005728 BtCursor *pCur, /* The cursor to be moved */
005729 i64 intKey, /* The table key */
005730 int biasRight, /* If true, bias the search to the high end */
005731 int *pRes /* Write search results here */
005732 ){
005733 int rc;
005734
005735 assert( cursorOwnsBtShared(pCur) );
005736 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
005737 assert( pRes );
005738 assert( pCur->pKeyInfo==0 );
005739 assert( pCur->eState!=CURSOR_VALID || pCur->curIntKey!=0 );
005740
005741 /* If the cursor is already positioned at the point we are trying
005742 ** to move to, then just return without doing any work */
005743 if( pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0 ){
005744 if( pCur->info.nKey==intKey ){
005745 *pRes = 0;
005746 return SQLITE_OK;
005747 }
005748 if( pCur->info.nKey<intKey ){
005749 if( (pCur->curFlags & BTCF_AtLast)!=0 ){
/* Already on the last entry and its key is smaller than intKey. */
005750 assert( cursorIsAtLastEntry(pCur) || CORRUPT_DB );
005751 *pRes = -1;
005752 return SQLITE_OK;
005753 }
005754 /* If the requested key is one more than the previous key, then
005755 ** try to get there using sqlite3BtreeNext() rather than a full
005756 ** binary search. This is an optimization only. The correct answer
005757 ** is still obtained without this case, only a little more slowly. */
005758 if( pCur->info.nKey+1==intKey ){
005759 *pRes = 0; /* Result reported if the next entry matches */
005760 rc = sqlite3BtreeNext(pCur, 0);
005761 if( rc==SQLITE_OK ){
005762 getCellInfo(pCur);
005763 if( pCur->info.nKey==intKey ){
005764 return SQLITE_OK;
005765 }
005766 }else if( rc!=SQLITE_DONE ){ /* SQLITE_DONE falls through to full search */
005767 return rc;
005768 }
005769 }
005770 }
005771 }
005772
005773 #ifdef SQLITE_DEBUG
005774 pCur->pBtree->nSeek++; /* Performance measurement during testing */
005775 #endif
005776
005777 rc = moveToRoot(pCur);
005778 if( rc ){
005779 if( rc==SQLITE_EMPTY ){
005780 assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
005781 *pRes = -1;
005782 return SQLITE_OK;
005783 }
005784 return rc;
005785 }
005786 assert( pCur->pPage );
005787 assert( pCur->pPage->isInit );
005788 assert( pCur->eState==CURSOR_VALID );
005789 assert( pCur->pPage->nCell > 0 );
005790 assert( pCur->iPage==0 || pCur->apPage[0]->intKey==pCur->curIntKey );
005791 assert( pCur->curIntKey );
005792
/* Each pass of the outer loop binary-searches one page and then either
** finishes (on a leaf page) or descends one level to a child page. */
005793 for(;;){
005794 int lwr, upr, idx, c; /* Search bounds, probe index, compare result */
005795 Pgno chldPg;
005796 MemPage *pPage = pCur->pPage;
005797 u8 *pCell; /* Pointer to current cell in pPage */
005798
005799 /* pPage->nCell must be greater than zero. If this is the root-page
005800 ** the cursor would have been INVALID above and this for(;;) loop
005801 ** not run. If this is not the root-page, then the moveToChild() routine
005802 ** would have already detected db corruption. Similarly, pPage must
005803 ** be the right kind (index or table) of b-tree page. Otherwise
005804 ** a moveToChild() or moveToRoot() call would have detected corruption. */
005805 assert( pPage->nCell>0 );
005806 assert( pPage->intKey );
005807 lwr = 0;
005808 upr = pPage->nCell-1;
005809 assert( biasRight==0 || biasRight==1 );
005810 idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */
005811 for(;;){
005812 i64 nCellKey;
005813 pCell = findCellPastPtr(pPage, idx);
005814 if( pPage->intKeyLeaf ){
/* Step over one varint (bytes with the 0x80 bit set, then one byte
** without it) so that pCell addresses the key varint.
** NOTE(review): the skipped field is presumably the payload-size
** varint of the cell - confirm against the file format. */
005815 while( 0x80 <= *(pCell++) ){
005816 if( pCell>=pPage->aDataEnd ){
005817 return SQLITE_CORRUPT_PAGE(pPage);
005818 }
005819 }
005820 }
005821 getVarint(pCell, (u64*)&nCellKey); /* Decode the key of cell idx */
005822 if( nCellKey<intKey ){
005823 lwr = idx+1;
005824 if( lwr>upr ){ c = -1; break; } /* Probed key is smaller than intKey */
005825 }else if( nCellKey>intKey ){
005826 upr = idx-1;
005827 if( lwr>upr ){ c = +1; break; } /* Probed key is larger than intKey */
005828 }else{
005829 assert( nCellKey==intKey );
005830 pCur->ix = (u16)idx;
005831 if( !pPage->leaf ){
005832 lwr = idx; /* Descend through the child pointer of the matching cell */
005833 goto moveto_table_next_layer;
005834 }else{
005835 pCur->curFlags |= BTCF_ValidNKey;
005836 pCur->info.nKey = nCellKey;
005837 pCur->info.nSize = 0;
005838 *pRes = 0;
005839 return SQLITE_OK;
005840 }
005841 }
005842 assert( lwr+upr>=0 );
005843 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2; */
005844 }
005845 assert( lwr==upr+1 || !pPage->leaf );
005846 assert( pPage->isInit );
005847 if( pPage->leaf ){
005848 assert( pCur->ix<pCur->pPage->nCell );
005849 pCur->ix = (u16)idx; /* Leave the cursor on the last cell probed */
005850 *pRes = c;
005851 rc = SQLITE_OK;
005852 goto moveto_table_finish;
005853 }
005854 moveto_table_next_layer:
/* Select the child to descend into: the pointer stored at header offset
** 8 when lwr is past the last cell, otherwise the pointer at the start
** of cell lwr. */
005855 if( lwr>=pPage->nCell ){
005856 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
005857 }else{
005858 chldPg = get4byte(findCell(pPage, lwr));
005859 }
005860 pCur->ix = (u16)lwr;
005861 rc = moveToChild(pCur, chldPg);
005862 if( rc ) break;
005863 }
005864 moveto_table_finish:
005865 pCur->info.nSize = 0;
005866 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
005867 return rc;
005868 }
005869
005870 /*
005871 ** Compare the "idx"-th cell on the page the cursor pCur is currently
005872 ** pointing to to pIdxKey using xRecordCompare. Return negative or
005873 ** zero if the cell is less than or equal pIdxKey. Return positive
005874 ** if unknown.
005875 **
005876 ** Return value negative: Cell at pCur[idx] less than pIdxKey
005877 **
005878 ** Return value is zero: Cell at pCur[idx] equals pIdxKey
005879 **
005880 ** Return value positive: Nothing is known about the relationship
005881 ** of the cell at pCur[idx] and pIdxKey.
005882 **
005883 ** This routine is part of an optimization. It is always safe to return
005884 ** a positive value as that will cause the optimization to be skipped.
005885 */
005886 static int indexCellCompare(
005887 BtCursor *pCur,
005888 int idx,
005889 UnpackedRecord *pIdxKey,
005890 RecordCompare xRecordCompare
005891 ){
005892 MemPage *pPage = pCur->pPage;
005893 int c;
005894 int nCell; /* Size of the pCell cell in bytes */
005895 u8 *pCell = findCellPastPtr(pPage, idx);
005896
005897 nCell = pCell[0];
005898 if( nCell<=pPage->max1bytePayload ){
005899 /* This branch runs if the record-size field of the cell is a
005900 ** single byte varint and the record fits entirely on the main
005901 ** b-tree page. */
005902 testcase( pCell+nCell+1==pPage->aDataEnd );
005903 c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
005904 }else if( !(pCell[1] & 0x80)
005905 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
005906 ){
005907 /* The record-size field is a 2 byte varint and the record
005908 ** fits entirely on the main b-tree page. */
005909 testcase( pCell+nCell+2==pPage->aDataEnd );
005910 c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
005911 }else{
005912 /* If the record extends into overflow pages, do not attempt
005913 ** the optimization. */
005914 c = 99;
005915 }
005916 return c;
005917 }
005918
005919 /*
005920 ** Return true (non-zero) if pCur is current pointing to the last
005921 ** page of a table.
005922 */
005923 static int cursorOnLastPage(BtCursor *pCur){
005924 int i;
005925 assert( pCur->eState==CURSOR_VALID );
005926 for(i=0; i<pCur->iPage; i++){
005927 MemPage *pPage = pCur->apPage[i];
005928 if( pCur->aiIdx[i]<pPage->nCell ) return 0;
005929 }
005930 return 1;
005931 }
005932
005933 /* Move the cursor so that it points to an entry in an index table
005934 ** near the key pIdxKey. Return SQLITE_OK on success or an error code on failure.
005935 **
005936 ** If an exact match is not found, then the cursor is always
005937 ** left pointing at a leaf page which would hold the entry if it
005938 ** were present. The cursor might point to an entry that comes
005939 ** before or after the key.
005940 **
005941 ** An integer is written into *pRes which is the result of
005942 ** comparing the key with the entry to which the cursor is
005943 ** pointing. The meaning of the integer written into
005944 ** *pRes is as follows:
005945 **
005946 ** *pRes<0 The cursor is left pointing at an entry that
005947 ** is smaller than pIdxKey, or the table is empty
005948 ** and the cursor is therefore left pointing to nothing.
005949 **
005950 ** *pRes==0 The cursor is left pointing at an entry that
005951 ** exactly matches pIdxKey.
005952 **
005953 ** *pRes>0 The cursor is left pointing at an entry that
005954 ** is larger than pIdxKey.
005955 **
005956 ** The pIdxKey->eqSeen field is set to 1 if there
005957 ** exists an entry in the table that exactly matches pIdxKey.
005958 */
005959 int sqlite3BtreeIndexMoveto(
005960 BtCursor *pCur, /* The cursor to be moved */
005961 UnpackedRecord *pIdxKey, /* Unpacked index key */
005962 int *pRes /* Write search results here */
005963 ){
005964 int rc; /* Return code */
005965 RecordCompare xRecordCompare; /* Comparison routine selected for pIdxKey */
005966
005967 assert( cursorOwnsBtShared(pCur) );
005968 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
005969 assert( pRes );
005970 assert( pCur->pKeyInfo!=0 );
005971
005972 #ifdef SQLITE_DEBUG
005973 pCur->pBtree->nSeek++; /* Performance measurement during testing */
005974 #endif
005975
005976 xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);
005977 pIdxKey->errCode = 0; /* Reset; inspected after the comparisons below */
005978 assert( pIdxKey->default_rc==1
005979 || pIdxKey->default_rc==0
005980 || pIdxKey->default_rc==-1
005981 );
005982
005983
005984 /* Check to see if we can skip a lot of work. Two cases:
005985 **
005986 ** (1) If the cursor is already pointing to the very last cell
005987 ** in the table and the pIdxKey search key is greater than or
005988 ** equal to that last cell, then no movement is required.
005989 **
005990 ** (2) If the cursor is on the last page of the table and the first
005991 ** cell on that last page is less than or equal to the pIdxKey
005992 ** search key, then we can start the search on the current page
005993 ** without needing to go back to root.
005994 */
005995 if( pCur->eState==CURSOR_VALID
005996 && pCur->pPage->leaf
005997 && cursorOnLastPage(pCur)
005998 ){
005999 int c; /* Result of comparing the last cell against pIdxKey */
006000 if( pCur->ix==pCur->pPage->nCell-1
006001 && (c = indexCellCompare(pCur, pCur->ix, pIdxKey, xRecordCompare))<=0
006002 && pIdxKey->errCode==SQLITE_OK
006003 ){
006004 *pRes = c; /* c<=0 here: the last cell is less than or equal to the key */
006005 return SQLITE_OK; /* Cursor already pointing at the correct spot */
006006 }
006007 if( pCur->iPage>0
006008 && indexCellCompare(pCur, 0, pIdxKey, xRecordCompare)<=0
006009 && pIdxKey->errCode==SQLITE_OK
006010 ){
006011 pCur->curFlags &= ~(BTCF_ValidOvfl|BTCF_AtLast);
006012 if( !pCur->pPage->isInit ){
006013 return SQLITE_CORRUPT_BKPT;
006014 }
006015 goto bypass_moveto_root; /* Start search on the current page */
006016 }
006017 pIdxKey->errCode = SQLITE_OK; /* Forget any error recorded by the shortcut comparisons above */
006018 }
006019
006020 rc = moveToRoot(pCur);
006021 if( rc ){
006022 if( rc==SQLITE_EMPTY ){ /* An empty tree is reported to the caller as success */
006023 assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
006024 *pRes = -1; /* Table is empty: report "smaller", as described in the header comment */
006025 return SQLITE_OK;
006026 }
006027 return rc;
006028 }
006029
006030 bypass_moveto_root:
006031 assert( pCur->pPage );
006032 assert( pCur->pPage->isInit );
006033 assert( pCur->eState==CURSOR_VALID );
006034 assert( pCur->pPage->nCell > 0 );
006035 assert( pCur->curIntKey==0 );
006036 assert( pIdxKey!=0 );
006037 for(;;){ /* One pass per level of the tree, from the starting page downward */
006038 int lwr, upr, idx, c; /* Search bounds, probe index, comparison result */
006039 Pgno chldPg; /* Page number of the child to descend into */
006040 MemPage *pPage = pCur->pPage; /* Page searched on this pass of the loop */
006041 u8 *pCell; /* Pointer to current cell in pPage */
006042
006043 /* pPage->nCell must be greater than zero. If this is the root-page
006044 ** the cursor would have been INVALID above and this for(;;) loop
006045 ** not run. If this is not the root-page, then the moveToChild() routine
006046 ** would have already detected db corruption. Similarly, pPage must
006047 ** be the right kind (index or table) of b-tree page. Otherwise
006048 ** a moveToChild() or moveToRoot() call would have detected corruption. */
006049 assert( pPage->nCell>0 );
006050 assert( pPage->intKey==0 );
006051 lwr = 0;
006052 upr = pPage->nCell-1;
006053 idx = upr>>1; /* idx = (lwr+upr)/2; */
006054 for(;;){ /* Binary search of the cells on pPage */
006055 int nCell; /* Size of the pCell cell in bytes */
006056 pCell = findCellPastPtr(pPage, idx);
006057
006058 /* The maximum supported page-size is 65536 bytes. This means that
006059 ** the maximum number of record bytes stored on an index B-Tree
006060 ** page is less than 16384 bytes and may be stored as a 2-byte
006061 ** varint. This information is used to attempt to avoid parsing
006062 ** the entire cell by checking for the cases where the record is
006063 ** stored entirely within the b-tree page by inspecting the first
006064 ** 2 bytes of the cell.
006065 */
006066 nCell = pCell[0];
006067 if( nCell<=pPage->max1bytePayload ){
006068 /* This branch runs if the record-size field of the cell is a
006069 ** single byte varint and the record fits entirely on the main
006070 ** b-tree page. */
006071 testcase( pCell+nCell+1==pPage->aDataEnd );
006072 c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
006073 }else if( !(pCell[1] & 0x80)
006074 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
006075 ){
006076 /* The record-size field is a 2 byte varint and the record
006077 ** fits entirely on the main b-tree page. */
006078 testcase( pCell+nCell+2==pPage->aDataEnd );
006079 c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
006080 }else{
006081 /* The record flows over onto one or more overflow pages. In
006082 ** this case the whole cell needs to be parsed, a buffer allocated
006083 ** and accessPayload() used to retrieve the record into the
006084 ** buffer before VdbeRecordCompare() can be called.
006085 **
006086 ** If the record is corrupt, the xRecordCompare routine may read
006087 ** up to two varints past the end of the buffer. An extra 18
006088 ** bytes of padding is allocated at the end of the buffer in
006089 ** case this happens. */
006090 void *pCellKey;
006091 u8 * const pCellBody = pCell - pPage->childPtrSize; /* Back up by childPtrSize to the address handed to xParseCell() */
006092 const int nOverrun = 18; /* Size of the overrun padding */
006093 pPage->xParseCell(pPage, pCellBody, &pCur->info);
006094 nCell = (int)pCur->info.nKey;
006095 testcase( nCell<0 ); /* True if key size is 2^32 or more */
006096 testcase( nCell==0 ); /* Invalid key size: 0x80 0x80 0x00 */
006097 testcase( nCell==1 ); /* Invalid key size: 0x80 0x80 0x01 */
006098 testcase( nCell==2 ); /* Minimum legal index key size */
006099 if( nCell<2 || nCell/pCur->pBt->usableSize>pCur->pBt->nPage ){ /* Key too small, or needs more pages than the database has */
006100 rc = SQLITE_CORRUPT_PAGE(pPage);
006101 goto moveto_index_finish;
006102 }
006103 pCellKey = sqlite3Malloc( nCell+nOverrun );
006104 if( pCellKey==0 ){
006105 rc = SQLITE_NOMEM_BKPT;
006106 goto moveto_index_finish;
006107 }
006108 pCur->ix = (u16)idx; /* presumably accessPayload() reads the cell at pCur->ix -- confirm */
006109 rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0);
006110 memset(((u8*)pCellKey)+nCell,0,nOverrun); /* Fix uninit warnings */
006111 pCur->curFlags &= ~BTCF_ValidOvfl;
006112 if( rc ){
006113 sqlite3_free(pCellKey);
006114 goto moveto_index_finish;
006115 }
006116 c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey);
006117 sqlite3_free(pCellKey);
006118 }
006119 assert(
006120 (pIdxKey->errCode!=SQLITE_CORRUPT || c==0)
006121 && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed)
006122 );
006123 if( c<0 ){ /* Cell sorts before the key: continue in the upper half */
006124 lwr = idx+1;
006125 }else if( c>0 ){ /* Cell sorts after the key: continue in the lower half */
006126 upr = idx-1;
006127 }else{ /* Exact match, or a comparison error (checked below) */
006128 assert( c==0 );
006129 *pRes = 0;
006130 rc = SQLITE_OK;
006131 pCur->ix = (u16)idx;
006132 if( pIdxKey->errCode ) rc = SQLITE_CORRUPT_BKPT; /* A comparator error is reported as corruption */
006133 goto moveto_index_finish;
006134 }
006135 if( lwr>upr ) break; /* Search range exhausted: no match on this page */
006136 assert( lwr+upr>=0 );
006137 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2 */
006138 }
006139 assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) );
006140 assert( pPage->isInit );
006141 if( pPage->leaf ){ /* Bottom of the tree: stop at the last probed cell */
006142 assert( pCur->ix<pCur->pPage->nCell || CORRUPT_DB );
006143 pCur->ix = (u16)idx;
006144 *pRes = c; /* Result of the final comparison on this page */
006145 rc = SQLITE_OK;
006146 goto moveto_index_finish;
006147 }
006148 if( lwr>=pPage->nCell ){ /* Key is beyond every cell: use the pointer at header offset 8 */
006149 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
006150 }else{ /* Otherwise use the 4-byte child pointer at the start of cell lwr */
006151 chldPg = get4byte(findCell(pPage, lwr));
006152 }
006153
006154 /* This block is similar to an in-lined version of:
006155 **
006156 ** pCur->ix = (u16)lwr;
006157 ** rc = moveToChild(pCur, chldPg);
006158 ** if( rc ) break;
006159 */
006160 pCur->info.nSize = 0;
006161 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
006162 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){ /* No room left in aiIdx[]/apPage[] for another level */
006163 return SQLITE_CORRUPT_BKPT;
006164 }
006165 pCur->aiIdx[pCur->iPage] = (u16)lwr; /* Record which slot of the parent was followed */
006166 pCur->apPage[pCur->iPage] = pCur->pPage; /* Record the parent page itself */
006167 pCur->ix = 0;
006168 pCur->iPage++;
006169 rc = getAndInitPage(pCur->pBt, chldPg, &pCur->pPage, pCur->curPagerFlags);
006170 if( rc==SQLITE_OK
006171 && (pCur->pPage->nCell<1 || pCur->pPage->intKey!=pCur->curIntKey) /* Child must be non-empty and of the cursor's key type */
006172 ){
006173 releasePage(pCur->pPage);
006174 rc = SQLITE_CORRUPT_PGNO(chldPg);
006175 }
006176 if( rc ){
006177 pCur->pPage = pCur->apPage[--pCur->iPage]; /* Back out of the failed descent: restore the parent page */
006178 break;
006179 }
006180 /*
006181 ***** End of in-lined moveToChild() call */
006182 }
006183 moveto_index_finish:
006184 pCur->info.nSize = 0;
006185 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
006186 return rc;
006187 }
006188
006189
006190 /*
006191 ** Return TRUE if the cursor is not pointing at an entry of the table.
006192 **
006193 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
006194 ** past the last entry in the table or sqlite3BtreePrev() moves past
006195 ** the first entry. TRUE is also returned if the table is empty.
006196 */
006197 int sqlite3BtreeEof(BtCursor *pCur){
006198 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
006199 ** have been deleted? This API will need to change to return an error code
006200 ** as well as the boolean result value.
006201 */
006202 return (CURSOR_VALID!=pCur->eState);
006203 }
006204
006205 /*
006206 ** Return an estimate for the number of rows in the table that pCur is
006207 ** pointing to. Return a negative number if no estimate is currently
006208 ** available.
006209 */
006210 i64 sqlite3BtreeRowCountEst(BtCursor *pCur){
006211 i64 n;
006212 u8 i;
006213
006214 assert( cursorOwnsBtShared(pCur) );
006215 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
006216
006217 /* Currently this interface is only called by the OP_IfSizeBetween
006218 ** opcode and the OP_Count opcode with P3=1. In either case,
006219 ** the cursor will always be valid unless the btree is empty. */
006220 if( pCur->eState!=CURSOR_VALID ) return 0;
006221 if( NEVER(pCur->pPage->leaf==0) ) return -1;
006222
006223 n = pCur->pPage->nCell;
006224 for(i=0; i<pCur->iPage; i++){
006225 n *= pCur->apPage[i]->nCell;
006226 }
006227 return n;
006228 }
006229
006230 /*
006231 ** Advance the cursor to the next entry in the database.
006232 ** Return value:
006233 **
006234 ** SQLITE_OK success
006235 ** SQLITE_DONE cursor is already pointing at the last element
006236 ** otherwise some kind of error occurred
006237 **
006238 ** The main entry point is sqlite3BtreeNext(). That routine is optimized
006239 ** for the common case of merely incrementing the cell counter BtCursor.aiIdx
006240 ** to the next cell on the current page. The (slower) btreeNext() helper
006241 ** routine is called when it is necessary to move to a different page or
006242 ** to restore the cursor.
006243 **
006244 ** If bit 0x01 of the F argument in sqlite3BtreeNext(C,F) is 1, then the
006245 ** cursor corresponds to an SQL index and this routine could have been
006246 ** skipped if the SQL index had been a unique index. The F argument
006247 ** is a hint to the implement. SQLite btree implementation does not use
006248 ** this hint, but COMDB2 does.
006249 */
006250 static SQLITE_NOINLINE int btreeNext(BtCursor *pCur){
006251 int rc;
006252 int idx;
006253 MemPage *pPage;
006254
006255 assert( cursorOwnsBtShared(pCur) );
006256 if( pCur->eState!=CURSOR_VALID ){
006257 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
006258 rc = restoreCursorPosition(pCur);
006259 if( rc!=SQLITE_OK ){
006260 return rc;
006261 }
006262 if( CURSOR_INVALID==pCur->eState ){
006263 return SQLITE_DONE;
006264 }
006265 if( pCur->eState==CURSOR_SKIPNEXT ){
006266 pCur->eState = CURSOR_VALID;
006267 if( pCur->skipNext>0 ) return SQLITE_OK;
006268 }
006269 }
006270
006271 pPage = pCur->pPage;
006272 idx = ++pCur->ix;
006273 if( sqlite3FaultSim(412) ) pPage->isInit = 0;
006274 if( !pPage->isInit ){
006275 return SQLITE_CORRUPT_BKPT;
006276 }
006277
006278 if( idx>=pPage->nCell ){
006279 if( !pPage->leaf ){
006280 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
006281 if( rc ) return rc;
006282 return moveToLeftmost(pCur);
006283 }
006284 do{
006285 if( pCur->iPage==0 ){
006286 pCur->eState = CURSOR_INVALID;
006287 return SQLITE_DONE;
006288 }
006289 moveToParent(pCur);
006290 pPage = pCur->pPage;
006291 }while( pCur->ix>=pPage->nCell );
006292 if( pPage->intKey ){
006293 return sqlite3BtreeNext(pCur, 0);
006294 }else{
006295 return SQLITE_OK;
006296 }
006297 }
006298 if( pPage->leaf ){
006299 return SQLITE_OK;
006300 }else{
006301 return moveToLeftmost(pCur);
006302 }
006303 }
006304 int sqlite3BtreeNext(BtCursor *pCur, int flags){
006305 MemPage *pPage;
006306 UNUSED_PARAMETER( flags ); /* Used in COMDB2 but not native SQLite */
006307 assert( cursorOwnsBtShared(pCur) );
006308 assert( flags==0 || flags==1 );
006309 pCur->info.nSize = 0;
006310 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
006311 if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur);
006312 pPage = pCur->pPage;
006313 if( (++pCur->ix)>=pPage->nCell ){
006314 pCur->ix--;
006315 return btreeNext(pCur);
006316 }
006317 if( pPage->leaf ){
006318 return SQLITE_OK;
006319 }else{
006320 return moveToLeftmost(pCur);
006321 }
006322 }
006323
006324 /*
006325 ** Step the cursor to the back to the previous entry in the database.
006326 ** Return values:
006327 **
006328 ** SQLITE_OK success
006329 ** SQLITE_DONE the cursor is already on the first element of the table
006330 ** otherwise some kind of error occurred
006331 **
006332 ** The main entry point is sqlite3BtreePrevious(). That routine is optimized
006333 ** for the common case of merely decrementing the cell counter BtCursor.aiIdx
006334 ** to the previous cell on the current page. The (slower) btreePrevious()
006335 ** helper routine is called when it is necessary to move to a different page
006336 ** or to restore the cursor.
006337 **
006338 ** If bit 0x01 of the F argument to sqlite3BtreePrevious(C,F) is 1, then
006339 ** the cursor corresponds to an SQL index and this routine could have been
006340 ** skipped if the SQL index had been a unique index. The F argument is a
006341 ** hint to the implement. The native SQLite btree implementation does not
006342 ** use this hint, but COMDB2 does.
006343 */
006344 static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur){
006345 int rc;
006346 MemPage *pPage;
006347
006348 assert( cursorOwnsBtShared(pCur) );
006349 assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 );
006350 assert( pCur->info.nSize==0 );
006351 if( pCur->eState!=CURSOR_VALID ){
006352 rc = restoreCursorPosition(pCur);
006353 if( rc!=SQLITE_OK ){
006354 return rc;
006355 }
006356 if( CURSOR_INVALID==pCur->eState ){
006357 return SQLITE_DONE;
006358 }
006359 if( CURSOR_SKIPNEXT==pCur->eState ){
006360 pCur->eState = CURSOR_VALID;
006361 if( pCur->skipNext<0 ) return SQLITE_OK;
006362 }
006363 }
006364
006365 pPage = pCur->pPage;
006366 if( sqlite3FaultSim(412) ) pPage->isInit = 0;
006367 if( !pPage->isInit ){
006368 return SQLITE_CORRUPT_BKPT;
006369 }
006370 if( !pPage->leaf ){
006371 int idx = pCur->ix;
006372 rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
006373 if( rc ) return rc;
006374 rc = moveToRightmost(pCur);
006375 }else{
006376 while( pCur->ix==0 ){
006377 if( pCur->iPage==0 ){
006378 pCur->eState = CURSOR_INVALID;
006379 return SQLITE_DONE;
006380 }
006381 moveToParent(pCur);
006382 }
006383 assert( pCur->info.nSize==0 );
006384 assert( (pCur->curFlags & (BTCF_ValidOvfl))==0 );
006385
006386 pCur->ix--;
006387 pPage = pCur->pPage;
006388 if( pPage->intKey && !pPage->leaf ){
006389 rc = sqlite3BtreePrevious(pCur, 0);
006390 }else{
006391 rc = SQLITE_OK;
006392 }
006393 }
006394 return rc;
006395 }
006396 int sqlite3BtreePrevious(BtCursor *pCur, int flags){
006397 assert( cursorOwnsBtShared(pCur) );
006398 assert( flags==0 || flags==1 );
006399 UNUSED_PARAMETER( flags ); /* Used in COMDB2 but not native SQLite */
006400 pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey);
006401 pCur->info.nSize = 0;
006402 if( pCur->eState!=CURSOR_VALID
006403 || pCur->ix==0
006404 || pCur->pPage->leaf==0
006405 ){
006406 return btreePrevious(pCur);
006407 }
006408 pCur->ix--;
006409 return SQLITE_OK;
006410 }
006411
006412 /*
006413 ** Allocate a new page from the database file.
006414 **
006415 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite()
006416 ** has already been called on the new page.) The new page has also
006417 ** been referenced and the calling routine is responsible for calling
006418 ** sqlite3PagerUnref() on the new page when it is done.
006419 **
006420 ** SQLITE_OK is returned on success. Any other return value indicates
006421 ** an error. *ppPage is set to NULL in the event of an error.
006422 **
006423 ** If the "nearby" parameter is not 0, then an effort is made to
006424 ** locate a page close to the page number "nearby". This can be used in an
006425 ** attempt to keep related pages close to each other in the database file,
006426 ** which in turn can make database access faster.
006427 **
006428 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists
006429 ** anywhere on the free-list, then it is guaranteed to be returned. If
006430 ** eMode is BTALLOC_LE then the page returned will be less than or equal
006431 ** to nearby if any such page exists. If eMode is BTALLOC_ANY then there
006432 ** are no restrictions on which page is returned.
006433 */
006434 static int allocateBtreePage(
006435 BtShared *pBt, /* The btree */
006436 MemPage **ppPage, /* Store pointer to the allocated page here */
006437 Pgno *pPgno, /* Store the page number here */
006438 Pgno nearby, /* Search for a page near this one */
006439 u8 eMode /* BTALLOC_EXACT, BTALLOC_LE, or BTALLOC_ANY */
006440 ){
006441 MemPage *pPage1; /* Page 1 of the database; holds the freelist head and count */
006442 int rc; /* Return code */
006443 u32 n; /* Number of pages on the freelist */
006444 u32 k; /* Number of leaves on the trunk of the freelist */
006445 MemPage *pTrunk = 0; /* Trunk page currently being examined */
006446 MemPage *pPrevTrunk = 0; /* Trunk page examined on the previous pass, if any */
006447 Pgno mxPage; /* Total size of the database file */
006448
006449 assert( sqlite3_mutex_held(pBt->mutex) );
006450 assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );
006451 pPage1 = pBt->pPage1;
006452 mxPage = btreePagecount(pBt);
006453 /* EVIDENCE-OF: R-21003-45125 The 4-byte big-endian integer at offset 36
006454 ** stores the total number of pages on the freelist. */
006455 n = get4byte(&pPage1->aData[36]);
006456 testcase( n==mxPage-1 );
006457 if( n>=mxPage ){ /* Freelist cannot hold as many pages as the whole file */
006458 return SQLITE_CORRUPT_BKPT;
006459 }
006460 if( n>0 ){
006461 /* There are pages on the freelist. Reuse one of those pages. */
006462 Pgno iTrunk;
006463 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
006464 u32 nSearch = 0; /* Count of the number of search attempts */
006465
006466 /* If eMode==BTALLOC_EXACT and a query of the pointer-map
006467 ** shows that the page 'nearby' is somewhere on the free-list, then
006468 ** the entire-list will be searched for that page.
006469 */
006470 #ifndef SQLITE_OMIT_AUTOVACUUM
006471 if( eMode==BTALLOC_EXACT ){
006472 if( nearby<=mxPage ){
006473 u8 eType;
006474 assert( nearby>0 );
006475 assert( pBt->autoVacuum );
006476 rc = ptrmapGet(pBt, nearby, &eType, 0);
006477 if( rc ) return rc;
006478 if( eType==PTRMAP_FREEPAGE ){
006479 searchList = 1;
006480 }
006481 }
006482 }else if( eMode==BTALLOC_LE ){
006483 searchList = 1;
006484 }
006485 #endif
006486
006487 /* Decrement the free-list count by 1. The loop below sets iTrunk to the
006488 ** first free-list trunk page on its first pass (pPrevTrunk is then NULL).
006489 */
006490 rc = sqlite3PagerWrite(pPage1->pDbPage);
006491 if( rc ) return rc;
006492 put4byte(&pPage1->aData[36], n-1);
006493
006494 /* The code within this loop is run only once if the 'searchList' variable
006495 ** is not true. Otherwise, it runs once for each trunk-page on the
006496 ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT)
006497 ** or until a page less than or equal to 'nearby' is located (eMode==BTALLOC_LE)
006498 */
006499 do {
006500 pPrevTrunk = pTrunk;
006501 if( pPrevTrunk ){
006502 /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page
006503 ** is the page number of the next freelist trunk page in the list or
006504 ** zero if this is the last freelist trunk page. */
006505 iTrunk = get4byte(&pPrevTrunk->aData[0]);
006506 }else{
006507 /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32
006508 ** stores the page number of the first page of the freelist, or zero if
006509 ** the freelist is empty. */
006510 iTrunk = get4byte(&pPage1->aData[32]);
006511 }
006512 testcase( iTrunk==mxPage );
006513 if( iTrunk>mxPage || nSearch++ > n ){ /* Trunk number out of range, or more trunks visited than free pages exist */
006514 rc = SQLITE_CORRUPT_PGNO(pPrevTrunk ? pPrevTrunk->pgno : 1);
006515 }else{
006516 rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0);
006517 }
006518 if( rc ){
006519 pTrunk = 0;
006520 goto end_allocate_page;
006521 }
006522 assert( pTrunk!=0 );
006523 assert( pTrunk->aData!=0 );
006524 /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page
006525 ** is the number of leaf page pointers to follow. */
006526 k = get4byte(&pTrunk->aData[4]);
006527 if( k==0 && !searchList ){
006528 /* The trunk has no leaves and the list is not being searched.
006529 ** So extract the trunk page itself and use it as the newly
006530 ** allocated page */
006531 assert( pPrevTrunk==0 );
006532 rc = sqlite3PagerWrite(pTrunk->pDbPage);
006533 if( rc ){
006534 goto end_allocate_page;
006535 }
006536 *pPgno = iTrunk;
006537 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); /* Freelist head now names the next trunk */
006538 *ppPage = pTrunk;
006539 pTrunk = 0; /* Ownership passed to *ppPage; do not release at end_allocate_page */
006540 TRACE(("ALLOCATE: %u trunk - %u free pages left\n", *pPgno, n-1));
006541 }else if( k>(u32)(pBt->usableSize/4 - 2) ){
006542 /* Value of k is out of range. Database corruption */
006543 rc = SQLITE_CORRUPT_PGNO(iTrunk);
006544 goto end_allocate_page;
006545 #ifndef SQLITE_OMIT_AUTOVACUUM
006546 }else if( searchList
006547 && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE))
006548 ){
006549 /* The list is being searched and this trunk page is the page
006550 ** to allocate, regardless of whether it has leaves.
006551 */
006552 *pPgno = iTrunk;
006553 *ppPage = pTrunk; /* NOTE(review): if a later step in this branch fails, pTrunk is released at end_allocate_page but *ppPage is not reset to NULL */
006554 searchList = 0;
006555 rc = sqlite3PagerWrite(pTrunk->pDbPage);
006556 if( rc ){
006557 goto end_allocate_page;
006558 }
006559 if( k==0 ){ /* Trunk has no leaves: unlink it from the trunk chain */
006560 if( !pPrevTrunk ){
006561 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
006562 }else{
006563 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
006564 if( rc!=SQLITE_OK ){
006565 goto end_allocate_page;
006566 }
006567 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
006568 }
006569 }else{
006570 /* The trunk page is required by the caller but it contains
006571 ** pointers to free-list leaves. The first leaf becomes a trunk
006572 ** page in this case.
006573 */
006574 MemPage *pNewTrunk;
006575 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
006576 if( iNewTrunk>mxPage ){
006577 rc = SQLITE_CORRUPT_PGNO(iTrunk);
006578 goto end_allocate_page;
006579 }
006580 testcase( iNewTrunk==mxPage );
006581 rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0);
006582 if( rc!=SQLITE_OK ){
006583 goto end_allocate_page;
006584 }
006585 rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
006586 if( rc!=SQLITE_OK ){
006587 releasePage(pNewTrunk);
006588 goto end_allocate_page;
006589 }
006590 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4); /* New trunk inherits the next-trunk pointer */
006591 put4byte(&pNewTrunk->aData[4], k-1);
006592 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4); /* Remaining k-1 leaf pointers move to the new trunk */
006593 releasePage(pNewTrunk);
006594 if( !pPrevTrunk ){
006595 assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
006596 put4byte(&pPage1->aData[32], iNewTrunk);
006597 }else{
006598 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
006599 if( rc ){
006600 goto end_allocate_page;
006601 }
006602 put4byte(&pPrevTrunk->aData[0], iNewTrunk);
006603 }
006604 }
006605 pTrunk = 0; /* Ownership passed to *ppPage; do not release at end_allocate_page */
006606 TRACE(("ALLOCATE: %u trunk - %u free pages left\n", *pPgno, n-1));
006607 #endif
006608 }else if( k>0 ){
006609 /* Extract a leaf from the trunk */
006610 u32 closest; /* Index of the chosen leaf pointer within the trunk */
006611 Pgno iPage;
006612 unsigned char *aData = pTrunk->aData;
006613 if( nearby>0 ){
006614 u32 i;
006615 closest = 0;
006616 if( eMode==BTALLOC_LE ){ /* Take the first leaf that is <= nearby, else leaf 0 */
006617 for(i=0; i<k; i++){
006618 iPage = get4byte(&aData[8+i*4]);
006619 if( iPage<=nearby ){
006620 closest = i;
006621 break;
006622 }
006623 }
006624 }else{ /* Take the leaf whose page number is nearest to nearby */
006625 int dist;
006626 dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);
006627 for(i=1; i<k; i++){
006628 int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);
006629 if( d2<dist ){
006630 closest = i;
006631 dist = d2;
006632 }
006633 }
006634 }
006635 }else{
006636 closest = 0;
006637 }
006638
006639 iPage = get4byte(&aData[8+closest*4]);
006640 testcase( iPage==mxPage );
006641 if( iPage>mxPage || iPage<2 ){
006642 rc = SQLITE_CORRUPT_PGNO(iTrunk);
006643 goto end_allocate_page;
006644 }
006645 testcase( iPage==mxPage ); /* NOTE(review): repeats the identical testcase() placed before the range check above */
006646 if( !searchList
006647 || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE))
006648 ){
006649 int noContent;
006650 *pPgno = iPage;
006651 TRACE(("ALLOCATE: %u was leaf %u of %u on trunk %u"
006652 ": %u more free pages\n",
006653 *pPgno, closest+1, k, pTrunk->pgno, n-1));
006654 rc = sqlite3PagerWrite(pTrunk->pDbPage);
006655 if( rc ) goto end_allocate_page;
006656 if( closest<k-1 ){
006657 memcpy(&aData[8+closest*4], &aData[4+k*4], 4); /* Fill the hole with the last leaf pointer */
006658 }
006659 put4byte(&aData[4], k-1);
006660 noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0;
006661 rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent);
006662 if( rc==SQLITE_OK ){
006663 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
006664 if( rc!=SQLITE_OK ){
006665 releasePage(*ppPage);
006666 *ppPage = 0;
006667 }
006668 }
006669 searchList = 0;
006670 }
006671 }
006672 releasePage(pPrevTrunk);
006673 pPrevTrunk = 0;
006674 }while( searchList );
006675 }else{
006676 /* There are no pages on the freelist, so append a new page to the
006677 ** database image.
006678 **
006679 ** Normally, new pages allocated by this block can be requested from the
006680 ** pager layer with the 'no-content' flag set. This prevents the pager
006681 ** from trying to read the pages content from disk. However, if the
006682 ** current transaction has already run one or more incremental-vacuum
006683 ** steps, then the page we are about to allocate may contain content
006684 ** that is required in the event of a rollback. In this case, do
006685 ** not set the no-content flag. This causes the pager to load and journal
006686 ** the current page content before overwriting it.
006687 **
006688 ** Note that the pager will not actually attempt to load or journal
006689 ** content for any page that really does lie past the end of the database
006690 ** file on disk. So the effects of disabling the no-content optimization
006691 ** here are confined to those pages that lie between the end of the
006692 ** database image and the end of the database file.
006693 */
006694 int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0;
006695
006696 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
006697 if( rc ) return rc;
006698 pBt->nPage++;
006699 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++; /* Never hand out the pending-byte page */
006700
006701 #ifndef SQLITE_OMIT_AUTOVACUUM
006702 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
006703 /* If the new page number (pBt->nPage) refers to a pointer-map page, allocate two new pages
006704 ** at the end of the file instead of one. The first allocated page
006705 ** becomes a new pointer-map page, the second is used by the caller.
006706 */
006707 MemPage *pPg = 0;
006708 TRACE(("ALLOCATE: %u from end of file (pointer-map page)\n", pBt->nPage));
006709 assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
006710 rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent);
006711 if( rc==SQLITE_OK ){
006712 rc = sqlite3PagerWrite(pPg->pDbPage);
006713 releasePage(pPg);
006714 }
006715 if( rc ) return rc;
006716 pBt->nPage++;
006717 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
006718 }
006719 #endif
006720 put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage); /* Store the new page count at offset 28 of page 1 */
006721 *pPgno = pBt->nPage;
006722
006723 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
006724 rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent);
006725 if( rc ) return rc;
006726 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
006727 if( rc!=SQLITE_OK ){
006728 releasePage(*ppPage);
006729 *ppPage = 0;
006730 }
006731 TRACE(("ALLOCATE: %u from end of file\n", *pPgno));
006732 }
006733
006734 assert( CORRUPT_DB || *pPgno!=PENDING_BYTE_PAGE(pBt) );
006735
006736 end_allocate_page:
006737 releasePage(pTrunk);
006738 releasePage(pPrevTrunk);
006739 assert( rc!=SQLITE_OK || sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 );
006740 assert( rc!=SQLITE_OK || (*ppPage)->isInit==0 );
006741 return rc;
006742 }
006743
006744 /*
006745 ** This function is used to add page iPage to the database file free-list.
006746 ** It is assumed that the page is not already a part of the free-list.
006747 **
006748 ** The value passed as the second argument to this function is optional.
006749 ** If the caller happens to have a pointer to the MemPage object
006750 ** corresponding to page iPage handy, it may pass it as the second value.
006751 ** Otherwise, it may pass NULL.
006752 **
006753 ** If a pointer to a MemPage object is passed as the second argument,
006754 ** its reference count is not altered by this function.
006755 */
006756 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
006757 MemPage *pTrunk = 0; /* Free-list trunk page */
006758 Pgno iTrunk = 0; /* Page number of free-list trunk page */
006759 MemPage *pPage1 = pBt->pPage1; /* Local reference to page 1 */
006760 MemPage *pPage; /* Page being freed. May be NULL. */
006761 int rc; /* Return Code */
006762 u32 nFree; /* Initial number of pages on free-list */
006763
006764 assert( sqlite3_mutex_held(pBt->mutex) );
006765 assert( CORRUPT_DB || iPage>1 );
006766 assert( !pMemPage || pMemPage->pgno==iPage );
006767
006768 if( iPage<2 || iPage>pBt->nPage ){
006769 return SQLITE_CORRUPT_BKPT;
006770 }
006771 if( pMemPage ){
006772 pPage = pMemPage;
006773 sqlite3PagerRef(pPage->pDbPage);
006774 }else{
006775 pPage = btreePageLookup(pBt, iPage);
006776 }
006777
006778 /* Increment the free page count on pPage1 */
006779 rc = sqlite3PagerWrite(pPage1->pDbPage);
006780 if( rc ) goto freepage_out;
006781 nFree = get4byte(&pPage1->aData[36]);
006782 put4byte(&pPage1->aData[36], nFree+1);
006783
006784 if( pBt->btsFlags & BTS_SECURE_DELETE ){
006785 /* If the secure_delete option is enabled, then
006786 ** always fully overwrite deleted information with zeros.
006787 */
006788 if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
006789 || ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
006790 ){
006791 goto freepage_out;
006792 }
006793 memset(pPage->aData, 0, pPage->pBt->pageSize);
006794 }
006795
006796 /* If the database supports auto-vacuum, write an entry in the pointer-map
006797 ** to indicate that the page is free.
006798 */
006799 if( ISAUTOVACUUM(pBt) ){
006800 ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
006801 if( rc ) goto freepage_out;
006802 }
006803
006804 /* Now manipulate the actual database free-list structure. There are two
006805 ** possibilities. If the free-list is currently empty, or if the first
006806 ** trunk page in the free-list is full, then this page will become a
006807 ** new free-list trunk page. Otherwise, it will become a leaf of the
006808 ** first trunk page in the current free-list. This block tests if it
006809 ** is possible to add the page as a new free-list leaf.
006810 */
006811 if( nFree!=0 ){
006812 u32 nLeaf; /* Initial number of leaf cells on trunk page */
006813
006814 iTrunk = get4byte(&pPage1->aData[32]);
006815 if( iTrunk>btreePagecount(pBt) ){
006816 rc = SQLITE_CORRUPT_BKPT;
006817 goto freepage_out;
006818 }
006819 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
006820 if( rc!=SQLITE_OK ){
006821 goto freepage_out;
006822 }
006823
006824 nLeaf = get4byte(&pTrunk->aData[4]);
006825 assert( pBt->usableSize>32 );
006826 if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
006827 rc = SQLITE_CORRUPT_BKPT;
006828 goto freepage_out;
006829 }
006830 if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
006831 /* In this case there is room on the trunk page to insert the page
006832 ** being freed as a new leaf.
006833 **
006834 ** Note that the trunk page is not really full until it contains
006835 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
006836 ** coded. But due to a coding error in versions of SQLite prior to
006837 ** 3.6.0, databases with freelist trunk pages holding more than
006838 ** usableSize/4 - 8 entries will be reported as corrupt. In order
006839 ** to maintain backwards compatibility with older versions of SQLite,
006840 ** we will continue to restrict the number of entries to usableSize/4 - 8
006841 ** for now. At some point in the future (once everyone has upgraded
006842 ** to 3.6.0 or later) we should consider fixing the conditional above
006843 ** to read "usableSize/4-2" instead of "usableSize/4-8".
006844 **
006845 ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still
006846 ** avoid using the last six entries in the freelist trunk page array in
006847 ** order that database files created by newer versions of SQLite can be
006848 ** read by older versions of SQLite.
006849 */
006850 rc = sqlite3PagerWrite(pTrunk->pDbPage);
006851 if( rc==SQLITE_OK ){
006852 put4byte(&pTrunk->aData[4], nLeaf+1);
006853 put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
006854 if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){
006855 sqlite3PagerDontWrite(pPage->pDbPage);
006856 }
006857 rc = btreeSetHasContent(pBt, iPage);
006858 }
006859 TRACE(("FREE-PAGE: %u leaf on trunk page %u\n",pPage->pgno,pTrunk->pgno));
006860 goto freepage_out;
006861 }
006862 }
006863
006864 /* If control flows to this point, then it was not possible to add the
006865 ** the page being freed as a leaf page of the first trunk in the free-list.
006866 ** Possibly because the free-list is empty, or possibly because the
006867 ** first trunk in the free-list is full. Either way, the page being freed
006868 ** will become the new first trunk page in the free-list.
006869 */
006870 if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
006871 goto freepage_out;
006872 }
006873 rc = sqlite3PagerWrite(pPage->pDbPage);
006874 if( rc!=SQLITE_OK ){
006875 goto freepage_out;
006876 }
006877 put4byte(pPage->aData, iTrunk);
006878 put4byte(&pPage->aData[4], 0);
006879 put4byte(&pPage1->aData[32], iPage);
006880 TRACE(("FREE-PAGE: %u new trunk page replacing %u\n", pPage->pgno, iTrunk));
006881
006882 freepage_out:
006883 if( pPage ){
006884 pPage->isInit = 0;
006885 }
006886 releasePage(pPage);
006887 releasePage(pTrunk);
006888 return rc;
006889 }
006890 static void freePage(MemPage *pPage, int *pRC){
006891 if( (*pRC)==SQLITE_OK ){
006892 *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
006893 }
006894 }
006895
006896 /*
006897 ** Free the overflow pages associated with the given Cell.
006898 */
006899 static SQLITE_NOINLINE int clearCellOverflow(
006900 MemPage *pPage, /* The page that contains the Cell */
006901 unsigned char *pCell, /* First byte of the Cell */
006902 CellInfo *pInfo /* Size information about the cell */
006903 ){
006904 BtShared *pBt;
006905 Pgno ovflPgno;
006906 int rc;
006907 int nOvfl;
006908 u32 ovflPageSize;
006909
006910 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
006911 assert( pInfo->nLocal!=pInfo->nPayload );
006912 testcase( pCell + pInfo->nSize == pPage->aDataEnd );
006913 testcase( pCell + (pInfo->nSize-1) == pPage->aDataEnd );
006914 if( pCell + pInfo->nSize > pPage->aDataEnd ){
006915 /* Cell extends past end of page */
006916 return SQLITE_CORRUPT_PAGE(pPage);
006917 }
006918 ovflPgno = get4byte(pCell + pInfo->nSize - 4);
006919 pBt = pPage->pBt;
006920 assert( pBt->usableSize > 4 );
006921 ovflPageSize = pBt->usableSize - 4;
006922 nOvfl = (pInfo->nPayload - pInfo->nLocal + ovflPageSize - 1)/ovflPageSize;
006923 assert( nOvfl>0 ||
006924 (CORRUPT_DB && (pInfo->nPayload + ovflPageSize)<ovflPageSize)
006925 );
006926 while( nOvfl-- ){
006927 Pgno iNext = 0;
006928 MemPage *pOvfl = 0;
006929 if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
006930 /* 0 is not a legal page number and page 1 cannot be an
006931 ** overflow page. Therefore if ovflPgno<2 or past the end of the
006932 ** file the database must be corrupt. */
006933 return SQLITE_CORRUPT_BKPT;
006934 }
006935 if( nOvfl ){
006936 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
006937 if( rc ) return rc;
006938 }
006939
006940 if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
006941 && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
006942 ){
006943 /* There is no reason any cursor should have an outstanding reference
006944 ** to an overflow page belonging to a cell that is being deleted/updated.
006945 ** So if there exists more than one reference to this page, then it
006946 ** must not really be an overflow page and the database must be corrupt.
006947 ** It is helpful to detect this before calling freePage2(), as
006948 ** freePage2() may zero the page contents if secure-delete mode is
006949 ** enabled. If this 'overflow' page happens to be a page that the
006950 ** caller is iterating through or using in some other way, this
006951 ** can be problematic.
006952 */
006953 rc = SQLITE_CORRUPT_BKPT;
006954 }else{
006955 rc = freePage2(pBt, pOvfl, ovflPgno);
006956 }
006957
006958 if( pOvfl ){
006959 sqlite3PagerUnref(pOvfl->pDbPage);
006960 }
006961 if( rc ) return rc;
006962 ovflPgno = iNext;
006963 }
006964 return SQLITE_OK;
006965 }
006966
/* Call xParseCell to compute the size of a cell.  If the cell contains
** overflow, then invoke clearCellOverflow() to clear out that overflow.
** Store the result code (SQLITE_OK or some error code) in rc.
**
** On return, sInfo holds the parsed description of the cell.
**
** Implemented as macro to force inlining for performance.
**
** NOTE: the expansion is a sequence of statements that is not wrapped in
** a do{...}while(0), and the arguments are not parenthesized.  Pass plain
** identifiers and do not use the macro as the unbraced body of an
** if/else/loop.
*/
#define BTREE_CLEAR_CELL(rc, pPage, pCell, sInfo)   \
  pPage->xParseCell(pPage, pCell, &sInfo);          \
  if( sInfo.nLocal!=sInfo.nPayload ){               \
    rc = clearCellOverflow(pPage, pCell, &sInfo);   \
  }else{                                            \
    rc = SQLITE_OK;                                 \
  }
006980
006981
006982 /*
006983 ** Create the byte sequence used to represent a cell on page pPage
006984 ** and write that byte sequence into pCell[]. Overflow pages are
006985 ** allocated and filled in as necessary. The calling procedure
006986 ** is responsible for making sure sufficient space has been allocated
006987 ** for pCell[].
006988 **
006989 ** Note that pCell does not necessary need to point to the pPage->aData
006990 ** area. pCell might point to some temporary storage. The cell will
006991 ** be constructed in this temporary area then copied into pPage->aData
006992 ** later.
006993 */
006994 static int fillInCell(
006995 MemPage *pPage, /* The page that contains the cell */
006996 unsigned char *pCell, /* Complete text of the cell */
006997 const BtreePayload *pX, /* Payload with which to construct the cell */
006998 int *pnSize /* Write cell size here */
006999 ){
007000 int nPayload;
007001 const u8 *pSrc;
007002 int nSrc, n, rc, mn;
007003 int spaceLeft;
007004 MemPage *pToRelease;
007005 unsigned char *pPrior;
007006 unsigned char *pPayload;
007007 BtShared *pBt;
007008 Pgno pgnoOvfl;
007009 int nHeader;
007010
007011 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
007012
007013 /* pPage is not necessarily writeable since pCell might be auxiliary
007014 ** buffer space that is separate from the pPage buffer area */
007015 assert( pCell<pPage->aData || pCell>=&pPage->aData[pPage->pBt->pageSize]
007016 || sqlite3PagerIswriteable(pPage->pDbPage) );
007017
007018 /* Fill in the header. */
007019 nHeader = pPage->childPtrSize;
007020 if( pPage->intKey ){
007021 nPayload = pX->nData + pX->nZero;
007022 pSrc = pX->pData;
007023 nSrc = pX->nData;
007024 assert( pPage->intKeyLeaf ); /* fillInCell() only called for leaves */
007025 nHeader += putVarint32(&pCell[nHeader], nPayload);
007026 nHeader += putVarint(&pCell[nHeader], *(u64*)&pX->nKey);
007027 }else{
007028 assert( pX->nKey<=0x7fffffff && pX->pKey!=0 );
007029 nSrc = nPayload = (int)pX->nKey;
007030 pSrc = pX->pKey;
007031 nHeader += putVarint32(&pCell[nHeader], nPayload);
007032 }
007033
007034 /* Fill in the payload */
007035 pPayload = &pCell[nHeader];
007036 if( nPayload<=pPage->maxLocal ){
007037 /* This is the common case where everything fits on the btree page
007038 ** and no overflow pages are required. */
007039 n = nHeader + nPayload;
007040 testcase( n==3 );
007041 testcase( n==4 );
007042 if( n<4 ){
007043 n = 4;
007044 pPayload[nPayload] = 0;
007045 }
007046 *pnSize = n;
007047 assert( nSrc<=nPayload );
007048 testcase( nSrc<nPayload );
007049 memcpy(pPayload, pSrc, nSrc);
007050 memset(pPayload+nSrc, 0, nPayload-nSrc);
007051 return SQLITE_OK;
007052 }
007053
007054 /* If we reach this point, it means that some of the content will need
007055 ** to spill onto overflow pages.
007056 */
007057 mn = pPage->minLocal;
007058 n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);
007059 testcase( n==pPage->maxLocal );
007060 testcase( n==pPage->maxLocal+1 );
007061 if( n > pPage->maxLocal ) n = mn;
007062 spaceLeft = n;
007063 *pnSize = n + nHeader + 4;
007064 pPrior = &pCell[nHeader+n];
007065 pToRelease = 0;
007066 pgnoOvfl = 0;
007067 pBt = pPage->pBt;
007068
007069 /* At this point variables should be set as follows:
007070 **
007071 ** nPayload Total payload size in bytes
007072 ** pPayload Begin writing payload here
007073 ** spaceLeft Space available at pPayload. If nPayload>spaceLeft,
007074 ** that means content must spill into overflow pages.
007075 ** *pnSize Size of the local cell (not counting overflow pages)
007076 ** pPrior Where to write the pgno of the first overflow page
007077 **
007078 ** Use a call to btreeParseCellPtr() to verify that the values above
007079 ** were computed correctly.
007080 */
007081 #ifdef SQLITE_DEBUG
007082 {
007083 CellInfo info;
007084 pPage->xParseCell(pPage, pCell, &info);
007085 assert( nHeader==(int)(info.pPayload - pCell) );
007086 assert( info.nKey==pX->nKey );
007087 assert( *pnSize == info.nSize );
007088 assert( spaceLeft == info.nLocal );
007089 }
007090 #endif
007091
007092 /* Write the payload into the local Cell and any extra into overflow pages */
007093 while( 1 ){
007094 n = nPayload;
007095 if( n>spaceLeft ) n = spaceLeft;
007096
007097 /* If pToRelease is not zero than pPayload points into the data area
007098 ** of pToRelease. Make sure pToRelease is still writeable. */
007099 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
007100
007101 /* If pPayload is part of the data area of pPage, then make sure pPage
007102 ** is still writeable */
007103 assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
007104 || sqlite3PagerIswriteable(pPage->pDbPage) );
007105
007106 if( nSrc>=n ){
007107 memcpy(pPayload, pSrc, n);
007108 }else if( nSrc>0 ){
007109 n = nSrc;
007110 memcpy(pPayload, pSrc, n);
007111 }else{
007112 memset(pPayload, 0, n);
007113 }
007114 nPayload -= n;
007115 if( nPayload<=0 ) break;
007116 pPayload += n;
007117 pSrc += n;
007118 nSrc -= n;
007119 spaceLeft -= n;
007120 if( spaceLeft==0 ){
007121 MemPage *pOvfl = 0;
007122 #ifndef SQLITE_OMIT_AUTOVACUUM
007123 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
007124 if( pBt->autoVacuum ){
007125 do{
007126 pgnoOvfl++;
007127 } while(
007128 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
007129 );
007130 }
007131 #endif
007132 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
007133 #ifndef SQLITE_OMIT_AUTOVACUUM
007134 /* If the database supports auto-vacuum, and the second or subsequent
007135 ** overflow page is being allocated, add an entry to the pointer-map
007136 ** for that page now.
007137 **
007138 ** If this is the first overflow page, then write a partial entry
007139 ** to the pointer-map. If we write nothing to this pointer-map slot,
007140 ** then the optimistic overflow chain processing in clearCell()
007141 ** may misinterpret the uninitialized values and delete the
007142 ** wrong pages from the database.
007143 */
007144 if( pBt->autoVacuum && rc==SQLITE_OK ){
007145 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
007146 ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
007147 if( rc ){
007148 releasePage(pOvfl);
007149 }
007150 }
007151 #endif
007152 if( rc ){
007153 releasePage(pToRelease);
007154 return rc;
007155 }
007156
007157 /* If pToRelease is not zero than pPrior points into the data area
007158 ** of pToRelease. Make sure pToRelease is still writeable. */
007159 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
007160
007161 /* If pPrior is part of the data area of pPage, then make sure pPage
007162 ** is still writeable */
007163 assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
007164 || sqlite3PagerIswriteable(pPage->pDbPage) );
007165
007166 put4byte(pPrior, pgnoOvfl);
007167 releasePage(pToRelease);
007168 pToRelease = pOvfl;
007169 pPrior = pOvfl->aData;
007170 put4byte(pPrior, 0);
007171 pPayload = &pOvfl->aData[4];
007172 spaceLeft = pBt->usableSize - 4;
007173 }
007174 }
007175 releasePage(pToRelease);
007176 return SQLITE_OK;
007177 }
007178
007179 /*
007180 ** Remove the i-th cell from pPage. This routine effects pPage only.
007181 ** The cell content is not freed or deallocated. It is assumed that
007182 ** the cell content has been copied someplace else. This routine just
007183 ** removes the reference to the cell from pPage.
007184 **
007185 ** "sz" must be the number of bytes in the cell.
007186 */
007187 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
007188 u32 pc; /* Offset to cell content of cell being deleted */
007189 u8 *data; /* pPage->aData */
007190 u8 *ptr; /* Used to move bytes around within data[] */
007191 int rc; /* The return code */
007192 int hdr; /* Beginning of the header. 0 most pages. 100 page 1 */
007193
007194 if( *pRC ) return;
007195 assert( idx>=0 );
007196 assert( idx<pPage->nCell );
007197 assert( CORRUPT_DB || sz==cellSize(pPage, idx) );
007198 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
007199 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
007200 assert( pPage->nFree>=0 );
007201 data = pPage->aData;
007202 ptr = &pPage->aCellIdx[2*idx];
007203 assert( pPage->pBt->usableSize > (u32)(ptr-data) );
007204 pc = get2byte(ptr);
007205 hdr = pPage->hdrOffset;
007206 testcase( pc==(u32)get2byte(&data[hdr+5]) );
007207 testcase( pc+sz==pPage->pBt->usableSize );
007208 if( pc+sz > pPage->pBt->usableSize ){
007209 *pRC = SQLITE_CORRUPT_BKPT;
007210 return;
007211 }
007212 rc = freeSpace(pPage, pc, sz);
007213 if( rc ){
007214 *pRC = rc;
007215 return;
007216 }
007217 pPage->nCell--;
007218 if( pPage->nCell==0 ){
007219 memset(&data[hdr+1], 0, 4);
007220 data[hdr+7] = 0;
007221 put2byte(&data[hdr+5], pPage->pBt->usableSize);
007222 pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset
007223 - pPage->childPtrSize - 8;
007224 }else{
007225 memmove(ptr, ptr+2, 2*(pPage->nCell - idx));
007226 put2byte(&data[hdr+3], pPage->nCell);
007227 pPage->nFree += 2;
007228 }
007229 }
007230
007231 /*
007232 ** Insert a new cell on pPage at cell index "i". pCell points to the
007233 ** content of the cell.
007234 **
007235 ** If the cell content will fit on the page, then put it there. If it
007236 ** will not fit, then make a copy of the cell content into pTemp if
007237 ** pTemp is not null. Regardless of pTemp, allocate a new entry
007238 ** in pPage->apOvfl[] and make it point to the cell content (either
007239 ** in pTemp or the original pCell) and also record its index.
007240 ** Allocating a new entry in pPage->aCell[] implies that
007241 ** pPage->nOverflow is incremented.
007242 **
007243 ** The insertCellFast() routine below works exactly the same as
007244 ** insertCell() except that it lacks the pTemp and iChild parameters
007245 ** which are assumed zero. Other than that, the two routines are the
007246 ** same.
007247 **
007248 ** Fixes or enhancements to this routine should be reflected in
007249 ** insertCellFast()!
007250 */
007251 static int insertCell(
007252 MemPage *pPage, /* Page into which we are copying */
007253 int i, /* New cell becomes the i-th cell of the page */
007254 u8 *pCell, /* Content of the new cell */
007255 int sz, /* Bytes of content in pCell */
007256 u8 *pTemp, /* Temp storage space for pCell, if needed */
007257 Pgno iChild /* If non-zero, replace first 4 bytes with this value */
007258 ){
007259 int idx = 0; /* Where to write new cell content in data[] */
007260 int j; /* Loop counter */
007261 u8 *data; /* The content of the whole page */
007262 u8 *pIns; /* The point in pPage->aCellIdx[] where no cell inserted */
007263
007264 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
007265 assert( MX_CELL(pPage->pBt)<=10921 );
007266 assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB );
007267 assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
007268 assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
007269 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
007270 assert( sz==pPage->xCellSize(pPage, pCell) || CORRUPT_DB );
007271 assert( pPage->nFree>=0 );
007272 assert( iChild>0 );
007273 if( pPage->nOverflow || sz+2>pPage->nFree ){
007274 if( pTemp ){
007275 memcpy(pTemp, pCell, sz);
007276 pCell = pTemp;
007277 }
007278 put4byte(pCell, iChild);
007279 j = pPage->nOverflow++;
007280 /* Comparison against ArraySize-1 since we hold back one extra slot
007281 ** as a contingency. In other words, never need more than 3 overflow
007282 ** slots but 4 are allocated, just to be safe. */
007283 assert( j < ArraySize(pPage->apOvfl)-1 );
007284 pPage->apOvfl[j] = pCell;
007285 pPage->aiOvfl[j] = (u16)i;
007286
007287 /* When multiple overflows occur, they are always sequential and in
007288 ** sorted order. This invariants arise because multiple overflows can
007289 ** only occur when inserting divider cells into the parent page during
007290 ** balancing, and the dividers are adjacent and sorted.
007291 */
007292 assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */
007293 assert( j==0 || i==pPage->aiOvfl[j-1]+1 ); /* Overflows are sequential */
007294 }else{
007295 int rc = sqlite3PagerWrite(pPage->pDbPage);
007296 if( NEVER(rc!=SQLITE_OK) ){
007297 return rc;
007298 }
007299 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
007300 data = pPage->aData;
007301 assert( &data[pPage->cellOffset]==pPage->aCellIdx );
007302 rc = allocateSpace(pPage, sz, &idx);
007303 if( rc ){ return rc; }
007304 /* The allocateSpace() routine guarantees the following properties
007305 ** if it returns successfully */
007306 assert( idx >= 0 );
007307 assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB );
007308 assert( idx+sz <= (int)pPage->pBt->usableSize );
007309 pPage->nFree -= (u16)(2 + sz);
007310 /* In a corrupt database where an entry in the cell index section of
007311 ** a btree page has a value of 3 or less, the pCell value might point
007312 ** as many as 4 bytes in front of the start of the aData buffer for
007313 ** the source page. Make sure this does not cause problems by not
007314 ** reading the first 4 bytes */
007315 memcpy(&data[idx+4], pCell+4, sz-4);
007316 put4byte(&data[idx], iChild);
007317 pIns = pPage->aCellIdx + i*2;
007318 memmove(pIns+2, pIns, 2*(pPage->nCell - i));
007319 put2byte(pIns, idx);
007320 pPage->nCell++;
007321 /* increment the cell count */
007322 if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;
007323 assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell || CORRUPT_DB );
007324 #ifndef SQLITE_OMIT_AUTOVACUUM
007325 if( pPage->pBt->autoVacuum ){
007326 int rc2 = SQLITE_OK;
007327 /* The cell may contain a pointer to an overflow page. If so, write
007328 ** the entry for the overflow page into the pointer map.
007329 */
007330 ptrmapPutOvflPtr(pPage, pPage, pCell, &rc2);
007331 if( rc2 ) return rc2;
007332 }
007333 #endif
007334 }
007335 return SQLITE_OK;
007336 }
007337
007338 /*
007339 ** This variant of insertCell() assumes that the pTemp and iChild
007340 ** parameters are both zero. Use this variant in sqlite3BtreeInsert()
007341 ** for performance improvement, and also so that this variant is only
007342 ** called from that one place, and is thus inlined, and thus runs must
007343 ** faster.
007344 **
007345 ** Fixes or enhancements to this routine should be reflected into
007346 ** the insertCell() routine.
007347 */
007348 static int insertCellFast(
007349 MemPage *pPage, /* Page into which we are copying */
007350 int i, /* New cell becomes the i-th cell of the page */
007351 u8 *pCell, /* Content of the new cell */
007352 int sz /* Bytes of content in pCell */
007353 ){
007354 int idx = 0; /* Where to write new cell content in data[] */
007355 int j; /* Loop counter */
007356 u8 *data; /* The content of the whole page */
007357 u8 *pIns; /* The point in pPage->aCellIdx[] where no cell inserted */
007358
007359 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
007360 assert( MX_CELL(pPage->pBt)<=10921 );
007361 assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB );
007362 assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
007363 assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
007364 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
007365 assert( sz==pPage->xCellSize(pPage, pCell) || CORRUPT_DB );
007366 assert( pPage->nFree>=0 );
007367 assert( pPage->nOverflow==0 );
007368 if( sz+2>pPage->nFree ){
007369 j = pPage->nOverflow++;
007370 /* Comparison against ArraySize-1 since we hold back one extra slot
007371 ** as a contingency. In other words, never need more than 3 overflow
007372 ** slots but 4 are allocated, just to be safe. */
007373 assert( j < ArraySize(pPage->apOvfl)-1 );
007374 pPage->apOvfl[j] = pCell;
007375 pPage->aiOvfl[j] = (u16)i;
007376
007377 /* When multiple overflows occur, they are always sequential and in
007378 ** sorted order. This invariants arise because multiple overflows can
007379 ** only occur when inserting divider cells into the parent page during
007380 ** balancing, and the dividers are adjacent and sorted.
007381 */
007382 assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */
007383 assert( j==0 || i==pPage->aiOvfl[j-1]+1 ); /* Overflows are sequential */
007384 }else{
007385 int rc = sqlite3PagerWrite(pPage->pDbPage);
007386 if( rc!=SQLITE_OK ){
007387 return rc;
007388 }
007389 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
007390 data = pPage->aData;
007391 assert( &data[pPage->cellOffset]==pPage->aCellIdx );
007392 rc = allocateSpace(pPage, sz, &idx);
007393 if( rc ){ return rc; }
007394 /* The allocateSpace() routine guarantees the following properties
007395 ** if it returns successfully */
007396 assert( idx >= 0 );
007397 assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB );
007398 assert( idx+sz <= (int)pPage->pBt->usableSize );
007399 pPage->nFree -= (u16)(2 + sz);
007400 memcpy(&data[idx], pCell, sz);
007401 pIns = pPage->aCellIdx + i*2;
007402 memmove(pIns+2, pIns, 2*(pPage->nCell - i));
007403 put2byte(pIns, idx);
007404 pPage->nCell++;
007405 /* increment the cell count */
007406 if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;
007407 assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell || CORRUPT_DB );
007408 #ifndef SQLITE_OMIT_AUTOVACUUM
007409 if( pPage->pBt->autoVacuum ){
007410 int rc2 = SQLITE_OK;
007411 /* The cell may contain a pointer to an overflow page. If so, write
007412 ** the entry for the overflow page into the pointer map.
007413 */
007414 ptrmapPutOvflPtr(pPage, pPage, pCell, &rc2);
007415 if( rc2 ) return rc2;
007416 }
007417 #endif
007418 }
007419 return SQLITE_OK;
007420 }
007421
/*
** The following parameters determine how many adjacent pages get involved
** in a balancing operation.  NN is the number of neighbors on either side
** of the page that participate in the balancing operation.  NB is the
** total number of pages that participate, including the target page and
** NN neighbors on either side.
**
** The minimum value of NN is 1 (of course).  Increasing NN above 1
** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
** in exchange for a larger degradation in INSERT and UPDATE performance.
** A value of 1 for NN appears to give the best results overall.
**
** (Later:) The description above makes it seem as if these values are
** tunable - as if you could change them and recompile and it would all work.
** But that is unlikely.  NB has been 3 since the inception of SQLite and
** we have never tested any other value.
*/
#define NN 1             /* Number of neighbors on either side of pPage */
#define NB 3             /* (NN*2+1): Total pages involved in the balance */
007441
007442 /*
007443 ** A CellArray object contains a cache of pointers and sizes for a
007444 ** consecutive sequence of cells that might be held on multiple pages.
007445 **
007446 ** The cells in this array are the divider cell or cells from the pParent
007447 ** page plus up to three child pages. There are a total of nCell cells.
007448 **
007449 ** pRef is a pointer to one of the pages that contributes cells. This is
007450 ** used to access information such as MemPage.intKey and MemPage.pBt->pageSize
007451 ** which should be common to all pages that contribute cells to this array.
007452 **
007453 ** apCell[] and szCell[] hold, respectively, pointers to the start of each
007454 ** cell and the size of each cell. Some of the apCell[] pointers might refer
** to overflow cells. In other words, some apCell[] pointers might not point
007456 ** to content area of the pages.
007457 **
007458 ** A szCell[] of zero means the size of that cell has not yet been computed.
007459 **
007460 ** The cells come from as many as four different pages:
007461 **
007462 ** -----------
007463 ** | Parent |
007464 ** -----------
007465 ** / | \
007466 ** / | \
007467 ** --------- --------- ---------
007468 ** |Child-1| |Child-2| |Child-3|
007469 ** --------- --------- ---------
007470 **
** The order of cells in the array for an index btree is:
007472 **
007473 ** 1. All cells from Child-1 in order
007474 ** 2. The first divider cell from Parent
007475 ** 3. All cells from Child-2 in order
007476 ** 4. The second divider cell from Parent
007477 ** 5. All cells from Child-3 in order
007478 **
007479 ** For a table-btree (with rowids) the items 2 and 4 are empty because
007480 ** content exists only in leaves and there are no divider cells.
007481 **
007482 ** For an index btree, the apEnd[] array holds pointer to the end of page
007483 ** for Child-1, the Parent, Child-2, the Parent (again), and Child-3,
007484 ** respectively. The ixNx[] array holds the number of cells contained in
007485 ** each of these 5 stages, and all stages to the left. Hence:
007486 **
007487 ** ixNx[0] = Number of cells in Child-1.
007488 ** ixNx[1] = Number of cells in Child-1 plus 1 for first divider.
007489 ** ixNx[2] = Number of cells in Child-1 and Child-2 + 1 for 1st divider.
007490 ** ixNx[3] = Number of cells in Child-1 and Child-2 + both divider cells
007491 ** ixNx[4] = Total number of cells.
007492 **
007493 ** For a table-btree, the concept is similar, except only apEnd[0]..apEnd[2]
007494 ** are used and they point to the leaf pages only, and the ixNx value are:
007495 **
007496 ** ixNx[0] = Number of cells in Child-1.
007497 ** ixNx[1] = Number of cells in Child-1 and Child-2.
007498 ** ixNx[2] = Total number of cells.
007499 **
** Sometimes when deleting, a child page can have zero cells. In those
** cases, ixNx[] entries with higher indexes, and the corresponding apEnd[]
** entries, shift down. The end result is that each ixNx[] entry should
** be larger than the previous one.
007504 */
/* Cache of cell pointers and cell sizes used while balancing pages. */
typedef struct CellArray CellArray;
struct CellArray {
  int nCell;              /* Number of cells in apCell[] */
  MemPage *pRef;          /* Reference page; its xCellSize() sizes the cells */
  u8 **apCell;            /* All cells being balanced */
  u16 *szCell;            /* Local size of all cells in apCell[]; 0 means
                          ** the size has not been computed yet */
  u8 *apEnd[NB*2];        /* MemPage.aDataEnd values */
  int ixNx[NB*2];         /* Index at which we move to the next apEnd[] */
};
007514
007515 /*
007516 ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been
007517 ** computed.
007518 */
007519 static void populateCellCache(CellArray *p, int idx, int N){
007520 MemPage *pRef = p->pRef;
007521 u16 *szCell = p->szCell;
007522 assert( idx>=0 && idx+N<=p->nCell );
007523 while( N>0 ){
007524 assert( p->apCell[idx]!=0 );
007525 if( szCell[idx]==0 ){
007526 szCell[idx] = pRef->xCellSize(pRef, p->apCell[idx]);
007527 }else{
007528 assert( CORRUPT_DB ||
007529 szCell[idx]==pRef->xCellSize(pRef, p->apCell[idx]) );
007530 }
007531 idx++;
007532 N--;
007533 }
007534 }
007535
007536 /*
007537 ** Return the size of the Nth element of the cell array
007538 */
007539 static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){
007540 assert( N>=0 && N<p->nCell );
007541 assert( p->szCell[N]==0 );
007542 p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]);
007543 return p->szCell[N];
007544 }
007545 static u16 cachedCellSize(CellArray *p, int N){
007546 assert( N>=0 && N<p->nCell );
007547 if( p->szCell[N] ) return p->szCell[N];
007548 return computeCellSize(p, N);
007549 }
007550
007551 /*
007552 ** Array apCell[] contains pointers to nCell b-tree page cells. The
007553 ** szCell[] array contains the size in bytes of each cell. This function
007554 ** replaces the current contents of page pPg with the contents of the cell
007555 ** array.
007556 **
007557 ** Some of the cells in apCell[] may currently be stored in pPg. This
007558 ** function works around problems caused by this by making a copy of any
007559 ** such cells before overwriting the page data.
007560 **
007561 ** The MemPage.nFree field is invalidated by this function. It is the
007562 ** responsibility of the caller to set it correctly.
007563 */
007564 static int rebuildPage(
007565 CellArray *pCArray, /* Content to be added to page pPg */
007566 int iFirst, /* First cell in pCArray to use */
007567 int nCell, /* Final number of cells on page */
007568 MemPage *pPg /* The page to be reconstructed */
007569 ){
007570 const int hdr = pPg->hdrOffset; /* Offset of header on pPg */
007571 u8 * const aData = pPg->aData; /* Pointer to data for pPg */
007572 const int usableSize = pPg->pBt->usableSize;
007573 u8 * const pEnd = &aData[usableSize];
007574 int i = iFirst; /* Which cell to copy from pCArray*/
007575 u32 j; /* Start of cell content area */
007576 int iEnd = i+nCell; /* Loop terminator */
007577 u8 *pCellptr = pPg->aCellIdx;
007578 u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
007579 u8 *pData;
007580 int k; /* Current slot in pCArray->apEnd[] */
007581 u8 *pSrcEnd; /* Current pCArray->apEnd[k] value */
007582
007583 assert( nCell>0 );
007584 assert( i<iEnd );
007585 j = get2byte(&aData[hdr+5]);
007586 if( j>(u32)usableSize ){ j = 0; }
007587 memcpy(&pTmp[j], &aData[j], usableSize - j);
007588
007589 assert( pCArray->ixNx[NB*2-1]>i );
007590 for(k=0; pCArray->ixNx[k]<=i; k++){}
007591 pSrcEnd = pCArray->apEnd[k];
007592
007593 pData = pEnd;
007594 while( 1/*exit by break*/ ){
007595 u8 *pCell = pCArray->apCell[i];
007596 u16 sz = pCArray->szCell[i];
007597 assert( sz>0 );
007598 if( SQLITE_WITHIN(pCell,aData+j,pEnd) ){
007599 if( ((uptr)(pCell+sz))>(uptr)pEnd ) return SQLITE_CORRUPT_BKPT;
007600 pCell = &pTmp[pCell - aData];
007601 }else if( (uptr)(pCell+sz)>(uptr)pSrcEnd
007602 && (uptr)(pCell)<(uptr)pSrcEnd
007603 ){
007604 return SQLITE_CORRUPT_BKPT;
007605 }
007606
007607 pData -= sz;
007608 put2byte(pCellptr, (pData - aData));
007609 pCellptr += 2;
007610 if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT;
007611 memmove(pData, pCell, sz);
007612 assert( sz==pPg->xCellSize(pPg, pCell) || CORRUPT_DB );
007613 i++;
007614 if( i>=iEnd ) break;
007615 if( pCArray->ixNx[k]<=i ){
007616 k++;
007617 pSrcEnd = pCArray->apEnd[k];
007618 }
007619 }
007620
007621 /* The pPg->nFree field is now set incorrectly. The caller will fix it. */
007622 pPg->nCell = nCell;
007623 pPg->nOverflow = 0;
007624
007625 put2byte(&aData[hdr+1], 0);
007626 put2byte(&aData[hdr+3], pPg->nCell);
007627 put2byte(&aData[hdr+5], pData - aData);
007628 aData[hdr+7] = 0x00;
007629 return SQLITE_OK;
007630 }
007631
007632 /*
007633 ** The pCArray objects contains pointers to b-tree cells and the cell sizes.
007634 ** This function attempts to add the cells stored in the array to page pPg.
007635 ** If it cannot (because the page needs to be defragmented before the cells
007636 ** will fit), non-zero is returned. Otherwise, if the cells are added
007637 ** successfully, zero is returned.
007638 **
007639 ** Argument pCellptr points to the first entry in the cell-pointer array
007640 ** (part of page pPg) to populate. After cell apCell[0] is written to the
007641 ** page body, a 16-bit offset is written to pCellptr. And so on, for each
007642 ** cell in the array. It is the responsibility of the caller to ensure
007643 ** that it is safe to overwrite this part of the cell-pointer array.
007644 **
007645 ** When this function is called, *ppData points to the start of the
007646 ** content area on page pPg. If the size of the content area is extended,
007647 ** *ppData is updated to point to the new start of the content area
007648 ** before returning.
007649 **
007650 ** Finally, argument pBegin points to the byte immediately following the
007651 ** end of the space required by this page for the cell-pointer area (for
007652 ** all cells - not just those inserted by the current call). If the content
007653 ** area must be extended to before this point in order to accommodate all
007654 ** cells in apCell[], then the cells do not fit and non-zero is returned.
007655 */
007656 static int pageInsertArray(
007657 MemPage *pPg, /* Page to add cells to */
007658 u8 *pBegin, /* End of cell-pointer array */
007659 u8 **ppData, /* IN/OUT: Page content-area pointer */
007660 u8 *pCellptr, /* Pointer to cell-pointer area */
007661 int iFirst, /* Index of first cell to add */
007662 int nCell, /* Number of cells to add to pPg */
007663 CellArray *pCArray /* Array of cells */
007664 ){
007665 int i = iFirst; /* Loop counter - cell index to insert */
007666 u8 *aData = pPg->aData; /* Complete page */
007667 u8 *pData = *ppData; /* Content area. A subset of aData[] */
007668 int iEnd = iFirst + nCell; /* End of loop. One past last cell to ins */
007669 int k; /* Current slot in pCArray->apEnd[] */
007670 u8 *pEnd; /* Maximum extent of cell data */
007671 assert( CORRUPT_DB || pPg->hdrOffset==0 ); /* Never called on page 1 */
007672 if( iEnd<=iFirst ) return 0;
007673 assert( pCArray->ixNx[NB*2-1]>i );
007674 for(k=0; pCArray->ixNx[k]<=i ; k++){}
007675 pEnd = pCArray->apEnd[k];
007676 while( 1 /*Exit by break*/ ){
007677 int sz, rc;
007678 u8 *pSlot;
007679 assert( pCArray->szCell[i]!=0 );
007680 sz = pCArray->szCell[i];
007681 if( (aData[1]==0 && aData[2]==0) || (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){
007682 if( (pData - pBegin)<sz ) return 1;
007683 pData -= sz;
007684 pSlot = pData;
007685 }
007686 /* pSlot and pCArray->apCell[i] will never overlap on a well-formed
007687 ** database. But they might for a corrupt database. Hence use memmove()
007688 ** since memcpy() sends SIGABORT with overlapping buffers on OpenBSD */
007689 assert( (pSlot+sz)<=pCArray->apCell[i]
007690 || pSlot>=(pCArray->apCell[i]+sz)
007691 || CORRUPT_DB );
007692 if( (uptr)(pCArray->apCell[i]+sz)>(uptr)pEnd
007693 && (uptr)(pCArray->apCell[i])<(uptr)pEnd
007694 ){
007695 assert( CORRUPT_DB );
007696 (void)SQLITE_CORRUPT_BKPT;
007697 return 1;
007698 }
007699 memmove(pSlot, pCArray->apCell[i], sz);
007700 put2byte(pCellptr, (pSlot - aData));
007701 pCellptr += 2;
007702 i++;
007703 if( i>=iEnd ) break;
007704 if( pCArray->ixNx[k]<=i ){
007705 k++;
007706 pEnd = pCArray->apEnd[k];
007707 }
007708 }
007709 *ppData = pData;
007710 return 0;
007711 }
007712
007713 /*
007714 ** The pCArray object contains pointers to b-tree cells and their sizes.
007715 **
007716 ** This function adds the space associated with each cell in the array
007717 ** that is currently stored within the body of pPg to the pPg free-list.
007718 ** The cell-pointers and other fields of the page are not updated.
007719 **
007720 ** This function returns the total number of cells added to the free-list.
007721 */
007722 static int pageFreeArray(
007723 MemPage *pPg, /* Page to edit */
007724 int iFirst, /* First cell to delete */
007725 int nCell, /* Cells to delete */
007726 CellArray *pCArray /* Array of cells */
007727 ){
007728 u8 * const aData = pPg->aData;
007729 u8 * const pEnd = &aData[pPg->pBt->usableSize];
007730 u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];
007731 int nRet = 0;
007732 int i, j;
007733 int iEnd = iFirst + nCell;
007734 int nFree = 0;
007735 int aOfst[10];
007736 int aAfter[10];
007737
007738 for(i=iFirst; i<iEnd; i++){
007739 u8 *pCell = pCArray->apCell[i];
007740 if( SQLITE_WITHIN(pCell, pStart, pEnd) ){
007741 int sz;
007742 int iAfter;
007743 int iOfst;
007744 /* No need to use cachedCellSize() here. The sizes of all cells that
007745 ** are to be freed have already been computing while deciding which
007746 ** cells need freeing */
007747 sz = pCArray->szCell[i]; assert( sz>0 );
007748 iOfst = (u16)(pCell - aData);
007749 iAfter = iOfst+sz;
007750 for(j=0; j<nFree; j++){
007751 if( aOfst[j]==iAfter ){
007752 aOfst[j] = iOfst;
007753 break;
007754 }else if( aAfter[j]==iOfst ){
007755 aAfter[j] = iAfter;
007756 break;
007757 }
007758 }
007759 if( j>=nFree ){
007760 if( nFree>=(int)(sizeof(aOfst)/sizeof(aOfst[0])) ){
007761 for(j=0; j<nFree; j++){
007762 freeSpace(pPg, aOfst[j], aAfter[j]-aOfst[j]);
007763 }
007764 nFree = 0;
007765 }
007766 aOfst[nFree] = iOfst;
007767 aAfter[nFree] = iAfter;
007768 if( &aData[iAfter]>pEnd ) return 0;
007769 nFree++;
007770 }
007771 nRet++;
007772 }
007773 }
007774 for(j=0; j<nFree; j++){
007775 freeSpace(pPg, aOfst[j], aAfter[j]-aOfst[j]);
007776 }
007777 return nRet;
007778 }
007779
007780 /*
007781 ** pCArray contains pointers to and sizes of all cells in the page being
007782 ** balanced. The current page, pPg, has pPg->nCell cells starting with
007783 ** pCArray->apCell[iOld]. After balancing, this page should hold nNew cells
007784 ** starting at apCell[iNew].
007785 **
007786 ** This routine makes the necessary adjustments to pPg so that it contains
007787 ** the correct cells after being balanced.
007788 **
007789 ** The pPg->nFree field is invalid when this function returns. It is the
007790 ** responsibility of the caller to set it correctly.
007791 */
007792 static int editPage(
007793 MemPage *pPg, /* Edit this page */
007794 int iOld, /* Index of first cell currently on page */
007795 int iNew, /* Index of new first cell on page */
007796 int nNew, /* Final number of cells on page */
007797 CellArray *pCArray /* Array of cells and sizes */
007798 ){
007799 u8 * const aData = pPg->aData;
007800 const int hdr = pPg->hdrOffset;
007801 u8 *pBegin = &pPg->aCellIdx[nNew * 2];
007802 int nCell = pPg->nCell; /* Cells stored on pPg */
007803 u8 *pData;
007804 u8 *pCellptr;
007805 int i;
007806 int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;
007807 int iNewEnd = iNew + nNew;
007808
007809 #ifdef SQLITE_DEBUG
007810 u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
007811 memcpy(pTmp, aData, pPg->pBt->usableSize);
007812 #endif
007813
007814 /* Remove cells from the start and end of the page */
007815 assert( nCell>=0 );
007816 if( iOld<iNew ){
007817 int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray);
007818 if( NEVER(nShift>nCell) ) return SQLITE_CORRUPT_BKPT;
007819 memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2);
007820 nCell -= nShift;
007821 }
007822 if( iNewEnd < iOldEnd ){
007823 int nTail = pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray);
007824 assert( nCell>=nTail );
007825 nCell -= nTail;
007826 }
007827
007828 pData = &aData[get2byte(&aData[hdr+5])];
007829 if( pData<pBegin ) goto editpage_fail;
007830 if( NEVER(pData>pPg->aDataEnd) ) goto editpage_fail;
007831
007832 /* Add cells to the start of the page */
007833 if( iNew<iOld ){
007834 int nAdd = MIN(nNew,iOld-iNew);
007835 assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB );
007836 assert( nAdd>=0 );
007837 pCellptr = pPg->aCellIdx;
007838 memmove(&pCellptr[nAdd*2], pCellptr, nCell*2);
007839 if( pageInsertArray(
007840 pPg, pBegin, &pData, pCellptr,
007841 iNew, nAdd, pCArray
007842 ) ) goto editpage_fail;
007843 nCell += nAdd;
007844 }
007845
007846 /* Add any overflow cells */
007847 for(i=0; i<pPg->nOverflow; i++){
007848 int iCell = (iOld + pPg->aiOvfl[i]) - iNew;
007849 if( iCell>=0 && iCell<nNew ){
007850 pCellptr = &pPg->aCellIdx[iCell * 2];
007851 if( nCell>iCell ){
007852 memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);
007853 }
007854 nCell++;
007855 cachedCellSize(pCArray, iCell+iNew);
007856 if( pageInsertArray(
007857 pPg, pBegin, &pData, pCellptr,
007858 iCell+iNew, 1, pCArray
007859 ) ) goto editpage_fail;
007860 }
007861 }
007862
007863 /* Append cells to the end of the page */
007864 assert( nCell>=0 );
007865 pCellptr = &pPg->aCellIdx[nCell*2];
007866 if( pageInsertArray(
007867 pPg, pBegin, &pData, pCellptr,
007868 iNew+nCell, nNew-nCell, pCArray
007869 ) ) goto editpage_fail;
007870
007871 pPg->nCell = nNew;
007872 pPg->nOverflow = 0;
007873
007874 put2byte(&aData[hdr+3], pPg->nCell);
007875 put2byte(&aData[hdr+5], pData - aData);
007876
007877 #ifdef SQLITE_DEBUG
007878 for(i=0; i<nNew && !CORRUPT_DB; i++){
007879 u8 *pCell = pCArray->apCell[i+iNew];
007880 int iOff = get2byteAligned(&pPg->aCellIdx[i*2]);
007881 if( SQLITE_WITHIN(pCell, aData, &aData[pPg->pBt->usableSize]) ){
007882 pCell = &pTmp[pCell - aData];
007883 }
007884 assert( 0==memcmp(pCell, &aData[iOff],
007885 pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) );
007886 }
007887 #endif
007888
007889 return SQLITE_OK;
007890 editpage_fail:
007891 /* Unable to edit this page. Rebuild it from scratch instead. */
007892 if( nNew<1 ) return SQLITE_CORRUPT_BKPT;
007893 populateCellCache(pCArray, iNew, nNew);
007894 return rebuildPage(pCArray, iNew, nNew, pPg);
007895 }
007896
007897
007898 #ifndef SQLITE_OMIT_QUICKBALANCE
007899 /*
007900 ** This version of balance() handles the common special case where
007901 ** a new entry is being inserted on the extreme right-end of the
007902 ** tree, in other words, when the new entry will become the largest
007903 ** entry in the tree.
007904 **
007905 ** Instead of trying to balance the 3 right-most leaf pages, just add
007906 ** a new page to the right-hand side and put the one new entry in
007907 ** that page. This leaves the right side of the tree somewhat
007908 ** unbalanced. But odds are that we will be inserting new entries
007909 ** at the end soon afterwards so the nearly empty page will quickly
007910 ** fill up. On average.
007911 **
007912 ** pPage is the leaf page which is the right-most page in the tree.
007913 ** pParent is its parent. pPage must have a single overflow entry
007914 ** which is also the right-most entry on the page.
007915 **
007916 ** The pSpace buffer is used to store a temporary copy of the divider
007917 ** cell that will be inserted into pParent. Such a cell consists of a 4
007918 ** byte page number followed by a variable length integer. In other
007919 ** words, at most 13 bytes. Hence the pSpace buffer must be at
007920 ** least 13 bytes in size.
007921 */
007922 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
007923 BtShared *const pBt = pPage->pBt; /* B-Tree Database */
007924 MemPage *pNew; /* Newly allocated page */
007925 int rc; /* Return Code */
007926 Pgno pgnoNew; /* Page number of pNew */
007927
007928 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
007929 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
007930 assert( pPage->nOverflow==1 );
007931
007932 if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT; /* dbfuzz001.test */
007933 assert( pPage->nFree>=0 );
007934 assert( pParent->nFree>=0 );
007935
007936 /* Allocate a new page. This page will become the right-sibling of
007937 ** pPage. Make the parent page writable, so that the new divider cell
007938 ** may be inserted. If both these operations are successful, proceed.
007939 */
007940 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
007941
007942 if( rc==SQLITE_OK ){
007943
007944 u8 *pOut = &pSpace[4];
007945 u8 *pCell = pPage->apOvfl[0];
007946 u16 szCell = pPage->xCellSize(pPage, pCell);
007947 u8 *pStop;
007948 CellArray b;
007949
007950 assert( sqlite3PagerIswriteable(pNew->pDbPage) );
007951 assert( CORRUPT_DB || pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
007952 zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
007953 b.nCell = 1;
007954 b.pRef = pPage;
007955 b.apCell = &pCell;
007956 b.szCell = &szCell;
007957 b.apEnd[0] = pPage->aDataEnd;
007958 b.ixNx[0] = 2;
007959 b.ixNx[NB*2-1] = 0x7fffffff;
007960 rc = rebuildPage(&b, 0, 1, pNew);
007961 if( NEVER(rc) ){
007962 releasePage(pNew);
007963 return rc;
007964 }
007965 pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;
007966
007967 /* If this is an auto-vacuum database, update the pointer map
007968 ** with entries for the new page, and any pointer from the
007969 ** cell on the page to an overflow page. If either of these
007970 ** operations fails, the return code is set, but the contents
007971 ** of the parent page are still manipulated by the code below.
007972 ** That is Ok, at this point the parent page is guaranteed to
007973 ** be marked as dirty. Returning an error code will cause a
007974 ** rollback, undoing any changes made to the parent page.
007975 */
007976 if( ISAUTOVACUUM(pBt) ){
007977 ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
007978 if( szCell>pNew->minLocal ){
007979 ptrmapPutOvflPtr(pNew, pNew, pCell, &rc);
007980 }
007981 }
007982
007983 /* Create a divider cell to insert into pParent. The divider cell
007984 ** consists of a 4-byte page number (the page number of pPage) and
007985 ** a variable length key value (which must be the same value as the
007986 ** largest key on pPage).
007987 **
007988 ** To find the largest key value on pPage, first find the right-most
007989 ** cell on pPage. The first two fields of this cell are the
007990 ** record-length (a variable length integer at most 32-bits in size)
007991 ** and the key value (a variable length integer, may have any value).
007992 ** The first of the while(...) loops below skips over the record-length
007993 ** field. The second while(...) loop copies the key value from the
007994 ** cell on pPage into the pSpace buffer.
007995 */
007996 pCell = findCell(pPage, pPage->nCell-1);
007997 pStop = &pCell[9];
007998 while( (*(pCell++)&0x80) && pCell<pStop );
007999 pStop = &pCell[9];
008000 while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
008001
008002 /* Insert the new divider cell into pParent. */
008003 if( rc==SQLITE_OK ){
008004 rc = insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
008005 0, pPage->pgno);
008006 }
008007
008008 /* Set the right-child pointer of pParent to point to the new page. */
008009 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
008010
008011 /* Release the reference to the new page. */
008012 releasePage(pNew);
008013 }
008014
008015 return rc;
008016 }
008017 #endif /* SQLITE_OMIT_QUICKBALANCE */
008018
008019 #if 0
008020 /*
008021 ** This function does not contribute anything to the operation of SQLite.
008022 ** it is sometimes activated temporarily while debugging code responsible
008023 ** for setting pointer-map entries.
008024 */
008025 static int ptrmapCheckPages(MemPage **apPage, int nPage){
008026 int i, j;
008027 for(i=0; i<nPage; i++){
008028 Pgno n;
008029 u8 e;
008030 MemPage *pPage = apPage[i];
008031 BtShared *pBt = pPage->pBt;
008032 assert( pPage->isInit );
008033
008034 for(j=0; j<pPage->nCell; j++){
008035 CellInfo info;
008036 u8 *z;
008037
008038 z = findCell(pPage, j);
008039 pPage->xParseCell(pPage, z, &info);
008040 if( info.nLocal<info.nPayload ){
008041 Pgno ovfl = get4byte(&z[info.nSize-4]);
008042 ptrmapGet(pBt, ovfl, &e, &n);
008043 assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 );
008044 }
008045 if( !pPage->leaf ){
008046 Pgno child = get4byte(z);
008047 ptrmapGet(pBt, child, &e, &n);
008048 assert( n==pPage->pgno && e==PTRMAP_BTREE );
008049 }
008050 }
008051 if( !pPage->leaf ){
008052 Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);
008053 ptrmapGet(pBt, child, &e, &n);
008054 assert( n==pPage->pgno && e==PTRMAP_BTREE );
008055 }
008056 }
008057 return 1;
008058 }
008059 #endif
008060
008061 /*
008062 ** This function is used to copy the contents of the b-tree node stored
008063 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
008064 ** the pointer-map entries for each child page are updated so that the
008065 ** parent page stored in the pointer map is page pTo. If pFrom contained
008066 ** any cells with overflow page pointers, then the corresponding pointer
008067 ** map entries are also updated so that the parent page is page pTo.
008068 **
008069 ** If pFrom is currently carrying any overflow cells (entries in the
008070 ** MemPage.apOvfl[] array), they are not copied to pTo.
008071 **
008072 ** Before returning, page pTo is reinitialized using btreeInitPage().
008073 **
008074 ** The performance of this function is not critical. It is only used by
008075 ** the balance_shallower() and balance_deeper() procedures, neither of
008076 ** which are called often under normal circumstances.
008077 */
008078 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
008079 if( (*pRC)==SQLITE_OK ){
008080 BtShared * const pBt = pFrom->pBt;
008081 u8 * const aFrom = pFrom->aData;
008082 u8 * const aTo = pTo->aData;
008083 int const iFromHdr = pFrom->hdrOffset;
008084 int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
008085 int rc;
008086 int iData;
008087
008088
008089 assert( pFrom->isInit );
008090 assert( pFrom->nFree>=iToHdr );
008091 assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );
008092
008093 /* Copy the b-tree node content from page pFrom to page pTo. */
008094 iData = get2byte(&aFrom[iFromHdr+5]);
008095 memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
008096 memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
008097
008098 /* Reinitialize page pTo so that the contents of the MemPage structure
008099 ** match the new data. The initialization of pTo can actually fail under
008100 ** fairly obscure circumstances, even though it is a copy of initialized
008101 ** page pFrom.
008102 */
008103 pTo->isInit = 0;
008104 rc = btreeInitPage(pTo);
008105 if( rc==SQLITE_OK ) rc = btreeComputeFreeSpace(pTo);
008106 if( rc!=SQLITE_OK ){
008107 *pRC = rc;
008108 return;
008109 }
008110
008111 /* If this is an auto-vacuum database, update the pointer-map entries
008112 ** for any b-tree or overflow pages that pTo now contains the pointers to.
008113 */
008114 if( ISAUTOVACUUM(pBt) ){
008115 *pRC = setChildPtrmaps(pTo);
008116 }
008117 }
008118 }
008119
008120 /*
008121 ** This routine redistributes cells on the iParentIdx'th child of pParent
008122 ** (hereafter "the page") and up to 2 siblings so that all pages have about the
008123 ** same amount of free space. Usually a single sibling on either side of the
008124 ** page is used in the balancing, though both siblings might come from one
008125 ** side if the page is the first or last child of its parent. If the page
008126 ** has fewer than 2 siblings (something which can only happen if the page
008127 ** is a root page or a child of a root page) then all available siblings
008128 ** participate in the balancing.
008129 **
008130 ** The number of siblings of the page might be increased or decreased by
008131 ** one or two in an effort to keep pages nearly full but not over full.
008132 **
008133 ** Note that when this routine is called, some of the cells on the page
008134 ** might not actually be stored in MemPage.aData[]. This can happen
008135 ** if the page is overfull. This routine ensures that all cells allocated
008136 ** to the page and its siblings fit into MemPage.aData[] before returning.
008137 **
008138 ** In the course of balancing the page and its siblings, cells may be
008139 ** inserted into or removed from the parent page (pParent). Doing so
008140 ** may cause the parent page to become overfull or underfull. If this
008141 ** happens, it is the responsibility of the caller to invoke the correct
008142 ** balancing routine to fix this problem (see the balance() routine).
008143 **
008144 ** If this routine fails for any reason, it might leave the database
008145 ** in a corrupted state. So if this routine fails, the database should
008146 ** be rolled back.
008147 **
008148 ** The third argument to this function, aOvflSpace, is a pointer to a
008149 ** buffer big enough to hold one page. If while inserting cells into the parent
008150 ** page (pParent) the parent page becomes overfull, this buffer is
008151 ** used to store the parent's overflow cells. Because this function inserts
008152 ** a maximum of four divider cells into the parent page, and the maximum
008153 ** size of a cell stored within an internal node is always less than 1/4
008154 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
008155 ** enough for all overflow cells.
008156 **
008157 ** If aOvflSpace is set to a null pointer, this function returns
008158 ** SQLITE_NOMEM.
008159 */
008160 static int balance_nonroot(
008161 MemPage *pParent, /* Parent page of siblings being balanced */
008162 int iParentIdx, /* Index of "the page" in pParent */
008163 u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */
008164 int isRoot, /* True if pParent is a root-page */
008165 int bBulk /* True if this call is part of a bulk load */
008166 ){
008167 BtShared *pBt; /* The whole database */
008168 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */
008169 int nNew = 0; /* Number of pages in apNew[] */
008170 int nOld; /* Number of pages in apOld[] */
008171 int i, j, k; /* Loop counters */
008172 int nxDiv; /* Next divider slot in pParent->aCell[] */
008173 int rc = SQLITE_OK; /* The return code */
008174 u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */
008175 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */
008176 int usableSpace; /* Bytes in pPage beyond the header */
008177 int pageFlags; /* Value of pPage->aData[0] */
008178 int iSpace1 = 0; /* First unused byte of aSpace1[] */
008179 int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */
008180 int szScratch; /* Size of scratch memory requested */
008181 MemPage *apOld[NB]; /* pPage and up to two siblings */
008182 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
008183 u8 *pRight; /* Location in parent of right-sibling pointer */
008184 u8 *apDiv[NB-1]; /* Divider cells in pParent */
008185 int cntNew[NB+2]; /* Index in b.paCell[] of cell after i-th page */
008186 int cntOld[NB+2]; /* Old index in b.apCell[] */
008187 int szNew[NB+2]; /* Combined size of cells placed on i-th page */
008188 u8 *aSpace1; /* Space for copies of dividers cells */
008189 Pgno pgno; /* Temp var to store a page number in */
008190 u8 abDone[NB+2]; /* True after i'th new page is populated */
008191 Pgno aPgno[NB+2]; /* Page numbers of new pages before shuffling */
008192 CellArray b; /* Parsed information on cells being balanced */
008193
008194 memset(abDone, 0, sizeof(abDone));
008195 assert( sizeof(b) - sizeof(b.ixNx) == offsetof(CellArray,ixNx) );
008196 memset(&b, 0, sizeof(b)-sizeof(b.ixNx[0]));
008197 b.ixNx[NB*2-1] = 0x7fffffff;
008198 pBt = pParent->pBt;
008199 assert( sqlite3_mutex_held(pBt->mutex) );
008200 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
008201
008202 /* At this point pParent may have at most one overflow cell. And if
008203 ** this overflow cell is present, it must be the cell with
008204 ** index iParentIdx. This scenario comes about when this function
008205 ** is called (indirectly) from sqlite3BtreeDelete().
008206 */
008207 assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
008208 assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );
008209
008210 if( !aOvflSpace ){
008211 return SQLITE_NOMEM_BKPT;
008212 }
008213 assert( pParent->nFree>=0 );
008214
008215 /* Find the sibling pages to balance. Also locate the cells in pParent
008216 ** that divide the siblings. An attempt is made to find NN siblings on
008217 ** either side of pPage. More siblings are taken from one side, however,
008218 ** if there are fewer than NN siblings on the other side. If pParent
008219 ** has NB or fewer children then all children of pParent are taken.
008220 **
008221 ** This loop also drops the divider cells from the parent page. This
008222 ** way, the remainder of the function does not have to deal with any
008223 ** overflow cells in the parent page, since if any existed they will
008224 ** have already been removed.
008225 */
008226 i = pParent->nOverflow + pParent->nCell;
008227 if( i<2 ){
008228 nxDiv = 0;
008229 }else{
008230 assert( bBulk==0 || bBulk==1 );
008231 if( iParentIdx==0 ){
008232 nxDiv = 0;
008233 }else if( iParentIdx==i ){
008234 nxDiv = i-2+bBulk;
008235 }else{
008236 nxDiv = iParentIdx-1;
008237 }
008238 i = 2-bBulk;
008239 }
008240 nOld = i+1;
008241 if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
008242 pRight = &pParent->aData[pParent->hdrOffset+8];
008243 }else{
008244 pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
008245 }
008246 pgno = get4byte(pRight);
008247 while( 1 ){
008248 if( rc==SQLITE_OK ){
008249 rc = getAndInitPage(pBt, pgno, &apOld[i], 0);
008250 }
008251 if( rc ){
008252 memset(apOld, 0, (i+1)*sizeof(MemPage*));
008253 goto balance_cleanup;
008254 }
008255 if( apOld[i]->nFree<0 ){
008256 rc = btreeComputeFreeSpace(apOld[i]);
008257 if( rc ){
008258 memset(apOld, 0, (i)*sizeof(MemPage*));
008259 goto balance_cleanup;
008260 }
008261 }
008262 nMaxCells += apOld[i]->nCell + ArraySize(pParent->apOvfl);
008263 if( (i--)==0 ) break;
008264
008265 if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){
008266 apDiv[i] = pParent->apOvfl[0];
008267 pgno = get4byte(apDiv[i]);
008268 szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
008269 pParent->nOverflow = 0;
008270 }else{
008271 apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
008272 pgno = get4byte(apDiv[i]);
008273 szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
008274
008275 /* Drop the cell from the parent page. apDiv[i] still points to
008276 ** the cell within the parent, even though it has been dropped.
008277 ** This is safe because dropping a cell only overwrites the first
008278 ** four bytes of it, and this function does not need the first
008279 ** four bytes of the divider cell. So the pointer is safe to use
008280 ** later on.
008281 **
008282 ** But not if we are in secure-delete mode. In secure-delete mode,
008283 ** the dropCell() routine will overwrite the entire cell with zeroes.
008284 ** In this case, temporarily copy the cell into the aOvflSpace[]
008285 ** buffer. It will be copied out again as soon as the aSpace[] buffer
008286 ** is allocated. */
008287 if( pBt->btsFlags & BTS_FAST_SECURE ){
008288 int iOff;
008289
008290 /* If the following if() condition is not true, the db is corrupted.
008291 ** The call to dropCell() below will detect this. */
008292 iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
008293 if( (iOff+szNew[i])<=(int)pBt->usableSize ){
008294 memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
008295 apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
008296 }
008297 }
008298 dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
008299 }
008300 }
008301
008302 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
008303 ** alignment */
008304 nMaxCells = (nMaxCells + 3)&~3;
008305
008306 /*
008307 ** Allocate space for memory structures
008308 */
008309 szScratch =
008310 nMaxCells*sizeof(u8*) /* b.apCell */
008311 + nMaxCells*sizeof(u16) /* b.szCell */
008312 + pBt->pageSize; /* aSpace1 */
008313
008314 assert( szScratch<=7*(int)pBt->pageSize );
008315 b.apCell = sqlite3StackAllocRaw(0, szScratch );
008316 if( b.apCell==0 ){
008317 rc = SQLITE_NOMEM_BKPT;
008318 goto balance_cleanup;
008319 }
008320 b.szCell = (u16*)&b.apCell[nMaxCells];
008321 aSpace1 = (u8*)&b.szCell[nMaxCells];
008322 assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
008323
008324 /*
008325 ** Load pointers to all cells on sibling pages and the divider cells
008326 ** into the local b.apCell[] array. Make copies of the divider cells
008327 ** into space obtained from aSpace1[]. The divider cells have already
008328 ** been removed from pParent.
008329 **
008330 ** If the siblings are on leaf pages, then the child pointers of the
008331 ** divider cells are stripped from the cells before they are copied
008332 ** into aSpace1[]. In this way, all cells in b.apCell[] are without
008333 ** child pointers. If siblings are not leaves, then all cell in
008334 ** b.apCell[] include child pointers. Either way, all cells in b.apCell[]
008335 ** are alike.
008336 **
008337 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.
008338 ** leafData: 1 if pPage holds key+data and pParent holds only keys.
008339 */
008340 b.pRef = apOld[0];
008341 leafCorrection = b.pRef->leaf*4;
008342 leafData = b.pRef->intKeyLeaf;
008343 for(i=0; i<nOld; i++){
008344 MemPage *pOld = apOld[i];
008345 int limit = pOld->nCell;
008346 u8 *aData = pOld->aData;
008347 u16 maskPage = pOld->maskPage;
008348 u8 *piCell = aData + pOld->cellOffset;
008349 u8 *piEnd;
008350 VVA_ONLY( int nCellAtStart = b.nCell; )
008351
008352 /* Verify that all sibling pages are of the same "type" (table-leaf,
008353 ** table-interior, index-leaf, or index-interior).
008354 */
008355 if( pOld->aData[0]!=apOld[0]->aData[0] ){
008356 rc = SQLITE_CORRUPT_PAGE(pOld);
008357 goto balance_cleanup;
008358 }
008359
008360 /* Load b.apCell[] with pointers to all cells in pOld. If pOld
008361 ** contains overflow cells, include them in the b.apCell[] array
008362 ** in the correct spot.
008363 **
008364 ** Note that when there are multiple overflow cells, it is always the
008365 ** case that they are sequential and adjacent. This invariant arises
008366 ** because multiple overflows can only occurs when inserting divider
008367 ** cells into a parent on a prior balance, and divider cells are always
008368 ** adjacent and are inserted in order. There is an assert() tagged
008369 ** with "NOTE 1" in the overflow cell insertion loop to prove this
008370 ** invariant.
008371 **
008372 ** This must be done in advance. Once the balance starts, the cell
008373 ** offset section of the btree page will be overwritten and we will no
008374 ** long be able to find the cells if a pointer to each cell is not saved
008375 ** first.
008376 */
008377 memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*(limit+pOld->nOverflow));
008378 if( pOld->nOverflow>0 ){
008379 if( NEVER(limit<pOld->aiOvfl[0]) ){
008380 rc = SQLITE_CORRUPT_PAGE(pOld);
008381 goto balance_cleanup;
008382 }
008383 limit = pOld->aiOvfl[0];
008384 for(j=0; j<limit; j++){
008385 b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
008386 piCell += 2;
008387 b.nCell++;
008388 }
008389 for(k=0; k<pOld->nOverflow; k++){
008390 assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */
008391 b.apCell[b.nCell] = pOld->apOvfl[k];
008392 b.nCell++;
008393 }
008394 }
008395 piEnd = aData + pOld->cellOffset + 2*pOld->nCell;
008396 while( piCell<piEnd ){
008397 assert( b.nCell<nMaxCells );
008398 b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
008399 piCell += 2;
008400 b.nCell++;
008401 }
008402 assert( (b.nCell-nCellAtStart)==(pOld->nCell+pOld->nOverflow) );
008403
008404 cntOld[i] = b.nCell;
008405 if( i<nOld-1 && !leafData){
008406 u16 sz = (u16)szNew[i];
008407 u8 *pTemp;
008408 assert( b.nCell<nMaxCells );
008409 b.szCell[b.nCell] = sz;
008410 pTemp = &aSpace1[iSpace1];
008411 iSpace1 += sz;
008412 assert( sz<=pBt->maxLocal+23 );
008413 assert( iSpace1 <= (int)pBt->pageSize );
008414 memcpy(pTemp, apDiv[i], sz);
008415 b.apCell[b.nCell] = pTemp+leafCorrection;
008416 assert( leafCorrection==0 || leafCorrection==4 );
008417 b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection;
008418 if( !pOld->leaf ){
008419 assert( leafCorrection==0 );
008420 assert( pOld->hdrOffset==0 || CORRUPT_DB );
008421 /* The right pointer of the child page pOld becomes the left
008422 ** pointer of the divider cell */
008423 memcpy(b.apCell[b.nCell], &pOld->aData[8], 4);
008424 }else{
008425 assert( leafCorrection==4 );
008426 while( b.szCell[b.nCell]<4 ){
008427 /* Do not allow any cells smaller than 4 bytes. If a smaller cell
008428 ** does exist, pad it with 0x00 bytes. */
008429 assert( b.szCell[b.nCell]==3 || CORRUPT_DB );
008430 assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB );
008431 aSpace1[iSpace1++] = 0x00;
008432 b.szCell[b.nCell]++;
008433 }
008434 }
008435 b.nCell++;
008436 }
008437 }
008438
008439 /*
008440 ** Figure out the number of pages needed to hold all b.nCell cells.
008441 ** Store this number in "k". Also compute szNew[] which is the total
008442 ** size of all cells on the i-th page and cntNew[] which is the index
008443 ** in b.apCell[] of the cell that divides page i from page i+1.
008444 ** cntNew[k] should equal b.nCell.
008445 **
008446 ** Values computed by this block:
008447 **
008448 ** k: The total number of sibling pages
008449 ** szNew[i]: Spaced used on the i-th sibling page.
008450 ** cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to
008451 ** the right of the i-th sibling page.
008452 ** usableSpace: Number of bytes of space available on each sibling.
008453 **
008454 */
008455 usableSpace = pBt->usableSize - 12 + leafCorrection;
008456 for(i=k=0; i<nOld; i++, k++){
008457 MemPage *p = apOld[i];
008458 b.apEnd[k] = p->aDataEnd;
008459 b.ixNx[k] = cntOld[i];
008460 if( k && b.ixNx[k]==b.ixNx[k-1] ){
008461 k--; /* Omit b.ixNx[] entry for child pages with no cells */
008462 }
008463 if( !leafData ){
008464 k++;
008465 b.apEnd[k] = pParent->aDataEnd;
008466 b.ixNx[k] = cntOld[i]+1;
008467 }
008468 assert( p->nFree>=0 );
008469 szNew[i] = usableSpace - p->nFree;
008470 for(j=0; j<p->nOverflow; j++){
008471 szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);
008472 }
008473 cntNew[i] = cntOld[i];
008474 }
008475 k = nOld;
008476 for(i=0; i<k; i++){
008477 int sz;
008478 while( szNew[i]>usableSpace ){
008479 if( i+1>=k ){
008480 k = i+2;
008481 if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
008482 szNew[k-1] = 0;
008483 cntNew[k-1] = b.nCell;
008484 }
008485 sz = 2 + cachedCellSize(&b, cntNew[i]-1);
008486 szNew[i] -= sz;
008487 if( !leafData ){
008488 if( cntNew[i]<b.nCell ){
008489 sz = 2 + cachedCellSize(&b, cntNew[i]);
008490 }else{
008491 sz = 0;
008492 }
008493 }
008494 szNew[i+1] += sz;
008495 cntNew[i]--;
008496 }
008497 while( cntNew[i]<b.nCell ){
008498 sz = 2 + cachedCellSize(&b, cntNew[i]);
008499 if( szNew[i]+sz>usableSpace ) break;
008500 szNew[i] += sz;
008501 cntNew[i]++;
008502 if( !leafData ){
008503 if( cntNew[i]<b.nCell ){
008504 sz = 2 + cachedCellSize(&b, cntNew[i]);
008505 }else{
008506 sz = 0;
008507 }
008508 }
008509 szNew[i+1] -= sz;
008510 }
008511 if( cntNew[i]>=b.nCell ){
008512 k = i+1;
008513 }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){
008514 rc = SQLITE_CORRUPT_BKPT;
008515 goto balance_cleanup;
008516 }
008517 }
008518
008519 /*
008520 ** The packing computed by the previous block is biased toward the siblings
008521 ** on the left side (siblings with smaller keys). The left siblings are
008522 ** always nearly full, while the right-most sibling might be nearly empty.
008523 ** The next block of code attempts to adjust the packing of siblings to
008524 ** get a better balance.
008525 **
008526 ** This adjustment is more than an optimization. The packing above might
008527 ** be so out of balance as to be illegal. For example, the right-most
008528 ** sibling might be completely empty. This adjustment is not optional.
008529 */
008530 for(i=k-1; i>0; i--){
008531 int szRight = szNew[i]; /* Size of sibling on the right */
008532 int szLeft = szNew[i-1]; /* Size of sibling on the left */
008533 int r; /* Index of right-most cell in left sibling */
008534 int d; /* Index of first cell to the left of right sibling */
008535
008536 r = cntNew[i-1] - 1;
008537 d = r + 1 - leafData;
008538 (void)cachedCellSize(&b, d);
008539 do{
008540 int szR, szD;
008541 assert( d<nMaxCells );
008542 assert( r<nMaxCells );
008543 szR = cachedCellSize(&b, r);
008544 szD = b.szCell[d];
008545 if( szRight!=0
008546 && (bBulk || szRight+szD+2 > szLeft-(szR+(i==k-1?0:2)))){
008547 break;
008548 }
008549 szRight += szD + 2;
008550 szLeft -= szR + 2;
008551 cntNew[i-1] = r;
008552 r--;
008553 d--;
008554 }while( r>=0 );
008555 szNew[i] = szRight;
008556 szNew[i-1] = szLeft;
008557 if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){
008558 rc = SQLITE_CORRUPT_BKPT;
008559 goto balance_cleanup;
008560 }
008561 }
008562
008563 /* Sanity check: For a non-corrupt database file one of the following
008564 ** must be true:
008565 ** (1) We found one or more cells (cntNew[0])>0), or
008566 ** (2) pPage is a virtual root page. A virtual root page is when
008567 ** the real root page is page 1 and we are the only child of
008568 ** that page.
008569 */
008570 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB);
008571 TRACE(("BALANCE: old: %u(nc=%u) %u(nc=%u) %u(nc=%u)\n",
008572 apOld[0]->pgno, apOld[0]->nCell,
008573 nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,
008574 nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0
008575 ));
008576
008577 /*
008578 ** Allocate k new pages. Reuse old pages where possible.
008579 */
008580 pageFlags = apOld[0]->aData[0];
008581 for(i=0; i<k; i++){
008582 MemPage *pNew;
008583 if( i<nOld ){
008584 pNew = apNew[i] = apOld[i];
008585 apOld[i] = 0;
008586 rc = sqlite3PagerWrite(pNew->pDbPage);
008587 nNew++;
008588 if( sqlite3PagerPageRefcount(pNew->pDbPage)!=1+(i==(iParentIdx-nxDiv))
008589 && rc==SQLITE_OK
008590 ){
008591 rc = SQLITE_CORRUPT_BKPT;
008592 }
008593 if( rc ) goto balance_cleanup;
008594 }else{
008595 assert( i>0 );
008596 rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
008597 if( rc ) goto balance_cleanup;
008598 zeroPage(pNew, pageFlags);
008599 apNew[i] = pNew;
008600 nNew++;
008601 cntOld[i] = b.nCell;
008602
008603 /* Set the pointer-map entry for the new sibling page. */
008604 if( ISAUTOVACUUM(pBt) ){
008605 ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
008606 if( rc!=SQLITE_OK ){
008607 goto balance_cleanup;
008608 }
008609 }
008610 }
008611 }
008612
008613 /*
008614 ** Reassign page numbers so that the new pages are in ascending order.
008615 ** This helps to keep entries in the disk file in order so that a scan
008616 ** of the table is closer to a linear scan through the file. That in turn
008617 ** helps the operating system to deliver pages from the disk more rapidly.
008618 **
008619 ** An O(N*N) sort algorithm is used, but since N is never more than NB+2
008620 ** (5), that is not a performance concern.
008621 **
008622 ** When NB==3, this one optimization makes the database about 25% faster
008623 ** for large insertions and deletions.
008624 */
008625 for(i=0; i<nNew; i++){
008626 aPgno[i] = apNew[i]->pgno;
008627 assert( apNew[i]->pDbPage->flags & PGHDR_WRITEABLE );
008628 assert( apNew[i]->pDbPage->flags & PGHDR_DIRTY );
008629 }
008630 for(i=0; i<nNew-1; i++){
008631 int iB = i;
008632 for(j=i+1; j<nNew; j++){
008633 if( apNew[j]->pgno < apNew[iB]->pgno ) iB = j;
008634 }
008635
008636 /* If apNew[i] has a page number that is bigger than any of the
008637 ** subsequence apNew[i] entries, then swap apNew[i] with the subsequent
008638 ** entry that has the smallest page number (which we know to be
008639 ** entry apNew[iB]).
008640 */
008641 if( iB!=i ){
008642 Pgno pgnoA = apNew[i]->pgno;
008643 Pgno pgnoB = apNew[iB]->pgno;
008644 Pgno pgnoTemp = (PENDING_BYTE/pBt->pageSize)+1;
008645 u16 fgA = apNew[i]->pDbPage->flags;
008646 u16 fgB = apNew[iB]->pDbPage->flags;
008647 sqlite3PagerRekey(apNew[i]->pDbPage, pgnoTemp, fgB);
008648 sqlite3PagerRekey(apNew[iB]->pDbPage, pgnoA, fgA);
008649 sqlite3PagerRekey(apNew[i]->pDbPage, pgnoB, fgB);
008650 apNew[i]->pgno = pgnoB;
008651 apNew[iB]->pgno = pgnoA;
008652 }
008653 }
008654
008655 TRACE(("BALANCE: new: %u(%u nc=%u) %u(%u nc=%u) %u(%u nc=%u) "
008656 "%u(%u nc=%u) %u(%u nc=%u)\n",
008657 apNew[0]->pgno, szNew[0], cntNew[0],
008658 nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
008659 nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,
008660 nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
008661 nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,
008662 nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
008663 nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,
008664 nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,
008665 nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0
008666 ));
008667
008668 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
008669 assert( nNew>=1 && nNew<=ArraySize(apNew) );
008670 assert( apNew[nNew-1]!=0 );
008671 put4byte(pRight, apNew[nNew-1]->pgno);
008672
008673 /* If the sibling pages are not leaves, ensure that the right-child pointer
008674 ** of the right-most new sibling page is set to the value that was
008675 ** originally in the same field of the right-most old sibling page. */
008676 if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){
008677 MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];
008678 memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);
008679 }
008680
008681 /* Make any required updates to pointer map entries associated with
008682 ** cells stored on sibling pages following the balance operation. Pointer
008683 ** map entries associated with divider cells are set by the insertCell()
008684 ** routine. The associated pointer map entries are:
008685 **
008686 ** a) if the cell contains a reference to an overflow chain, the
008687 ** entry associated with the first page in the overflow chain, and
008688 **
008689 ** b) if the sibling pages are not leaves, the child page associated
008690 ** with the cell.
008691 **
008692 ** If the sibling pages are not leaves, then the pointer map entry
008693 ** associated with the right-child of each sibling may also need to be
008694 ** updated. This happens below, after the sibling pages have been
008695 ** populated, not here.
008696 */
008697 if( ISAUTOVACUUM(pBt) ){
008698 MemPage *pOld;
008699 MemPage *pNew = pOld = apNew[0];
008700 int cntOldNext = pNew->nCell + pNew->nOverflow;
008701 int iNew = 0;
008702 int iOld = 0;
008703
008704 for(i=0; i<b.nCell; i++){
008705 u8 *pCell = b.apCell[i];
008706 while( i==cntOldNext ){
008707 iOld++;
008708 assert( iOld<nNew || iOld<nOld );
008709 assert( iOld>=0 && iOld<NB );
008710 pOld = iOld<nNew ? apNew[iOld] : apOld[iOld];
008711 cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;
008712 }
008713 if( i==cntNew[iNew] ){
008714 pNew = apNew[++iNew];
008715 if( !leafData ) continue;
008716 }
008717
008718 /* Cell pCell is destined for new sibling page pNew. Originally, it
008719 ** was either part of sibling page iOld (possibly an overflow cell),
008720 ** or else the divider cell to the left of sibling page iOld. So,
008721 ** if sibling page iOld had the same page number as pNew, and if
008722 ** pCell really was a part of sibling page iOld (not a divider or
008723 ** overflow cell), we can skip updating the pointer map entries. */
008724 if( iOld>=nNew
008725 || pNew->pgno!=aPgno[iOld]
008726 || !SQLITE_WITHIN(pCell,pOld->aData,pOld->aDataEnd)
008727 ){
008728 if( !leafCorrection ){
008729 ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
008730 }
008731 if( cachedCellSize(&b,i)>pNew->minLocal ){
008732 ptrmapPutOvflPtr(pNew, pOld, pCell, &rc);
008733 }
008734 if( rc ) goto balance_cleanup;
008735 }
008736 }
008737 }
008738
008739 /* Insert new divider cells into pParent. */
008740 for(i=0; i<nNew-1; i++){
008741 u8 *pCell;
008742 u8 *pTemp;
008743 int sz;
008744 u8 *pSrcEnd;
008745 MemPage *pNew = apNew[i];
008746 j = cntNew[i];
008747
008748 assert( j<nMaxCells );
008749 assert( b.apCell[j]!=0 );
008750 pCell = b.apCell[j];
008751 sz = b.szCell[j] + leafCorrection;
008752 pTemp = &aOvflSpace[iOvflSpace];
008753 if( !pNew->leaf ){
008754 memcpy(&pNew->aData[8], pCell, 4);
008755 }else if( leafData ){
008756 /* If the tree is a leaf-data tree, and the siblings are leaves,
008757 ** then there is no divider cell in b.apCell[]. Instead, the divider
008758 ** cell consists of the integer key for the right-most cell of
008759 ** the sibling-page assembled above only.
008760 */
008761 CellInfo info;
008762 j--;
008763 pNew->xParseCell(pNew, b.apCell[j], &info);
008764 pCell = pTemp;
008765 sz = 4 + putVarint(&pCell[4], info.nKey);
008766 pTemp = 0;
008767 }else{
008768 pCell -= 4;
008769 /* Obscure case for non-leaf-data trees: If the cell at pCell was
008770 ** previously stored on a leaf node, and its reported size was 4
008771 ** bytes, then it may actually be smaller than this
008772 ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
008773 ** any cell). But it is important to pass the correct size to
008774 ** insertCell(), so reparse the cell now.
008775 **
008776 ** This can only happen for b-trees used to evaluate "IN (SELECT ...)"
008777 ** and WITHOUT ROWID tables with exactly one column which is the
008778 ** primary key.
008779 */
008780 if( b.szCell[j]==4 ){
008781 assert(leafCorrection==4);
008782 sz = pParent->xCellSize(pParent, pCell);
008783 }
008784 }
008785 iOvflSpace += sz;
008786 assert( sz<=pBt->maxLocal+23 );
008787 assert( iOvflSpace <= (int)pBt->pageSize );
008788 assert( b.ixNx[NB*2-1]>j );
008789 for(k=0; b.ixNx[k]<=j; k++){}
008790 pSrcEnd = b.apEnd[k];
008791 if( SQLITE_OVERFLOW(pSrcEnd, pCell, pCell+sz) ){
008792 rc = SQLITE_CORRUPT_BKPT;
008793 goto balance_cleanup;
008794 }
008795 rc = insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno);
008796 if( rc!=SQLITE_OK ) goto balance_cleanup;
008797 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
008798 }
008799
008800 /* Now update the actual sibling pages. The order in which they are updated
008801 ** is important, as this code needs to avoid disrupting any page from which
008802 ** cells may still to be read. In practice, this means:
008803 **
008804 ** (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])
008805 ** then it is not safe to update page apNew[iPg] until after
008806 ** the left-hand sibling apNew[iPg-1] has been updated.
008807 **
008808 ** (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])
008809 ** then it is not safe to update page apNew[iPg] until after
008810 ** the right-hand sibling apNew[iPg+1] has been updated.
008811 **
008812 ** If neither of the above apply, the page is safe to update.
008813 **
008814 ** The iPg value in the following loop starts at nNew-1 goes down
008815 ** to 0, then back up to nNew-1 again, thus making two passes over
008816 ** the pages. On the initial downward pass, only condition (1) above
008817 ** needs to be tested because (2) will always be true from the previous
008818 ** step. On the upward pass, both conditions are always true, so the
008819 ** upwards pass simply processes pages that were missed on the downward
008820 ** pass.
008821 */
008822 for(i=1-nNew; i<nNew; i++){
008823 int iPg = i<0 ? -i : i;
008824 assert( iPg>=0 && iPg<nNew );
008825 assert( iPg>=1 || i>=0 );
008826 assert( iPg<ArraySize(cntOld) );
008827 if( abDone[iPg] ) continue; /* Skip pages already processed */
008828 if( i>=0 /* On the upwards pass, or... */
008829 || cntOld[iPg-1]>=cntNew[iPg-1] /* Condition (1) is true */
008830 ){
008831 int iNew;
008832 int iOld;
008833 int nNewCell;
008834
008835 /* Verify condition (1): If cells are moving left, update iPg
008836 ** only after iPg-1 has already been updated. */
008837 assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] );
008838
008839 /* Verify condition (2): If cells are moving right, update iPg
008840 ** only after iPg+1 has already been updated. */
008841 assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] );
008842
008843 if( iPg==0 ){
008844 iNew = iOld = 0;
008845 nNewCell = cntNew[0];
008846 }else{
008847 iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell;
008848 iNew = cntNew[iPg-1] + !leafData;
008849 nNewCell = cntNew[iPg] - iNew;
008850 }
008851
008852 rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b);
008853 if( rc ) goto balance_cleanup;
008854 abDone[iPg]++;
008855 apNew[iPg]->nFree = usableSpace-szNew[iPg];
008856 assert( apNew[iPg]->nOverflow==0 );
008857 assert( apNew[iPg]->nCell==nNewCell );
008858 }
008859 }
008860
008861 /* All pages have been processed exactly once */
008862 assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );
008863
008864 assert( nOld>0 );
008865 assert( nNew>0 );
008866
008867 if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
008868 /* The root page of the b-tree now contains no cells. The only sibling
008869 ** page is the right-child of the parent. Copy the contents of the
008870 ** child page into the parent, decreasing the overall height of the
008871 ** b-tree structure by one. This is described as the "balance-shallower"
008872 ** sub-algorithm in some documentation.
008873 **
008874 ** If this is an auto-vacuum database, the call to copyNodeContent()
008875 ** sets all pointer-map entries corresponding to database image pages
008876 ** for which the pointer is stored within the content being copied.
008877 **
008878 ** It is critical that the child page be defragmented before being
008879 ** copied into the parent, because if the parent is page 1 then it will
008880 ** by smaller than the child due to the database header, and so all the
008881 ** free space needs to be up front.
008882 */
008883 assert( nNew==1 || CORRUPT_DB );
008884 rc = defragmentPage(apNew[0], -1);
008885 testcase( rc!=SQLITE_OK );
008886 assert( apNew[0]->nFree ==
008887 (get2byteNotZero(&apNew[0]->aData[5]) - apNew[0]->cellOffset
008888 - apNew[0]->nCell*2)
008889 || rc!=SQLITE_OK
008890 );
008891 copyNodeContent(apNew[0], pParent, &rc);
008892 freePage(apNew[0], &rc);
008893 }else if( ISAUTOVACUUM(pBt) && !leafCorrection ){
008894 /* Fix the pointer map entries associated with the right-child of each
008895 ** sibling page. All other pointer map entries have already been taken
008896 ** care of. */
008897 for(i=0; i<nNew; i++){
008898 u32 key = get4byte(&apNew[i]->aData[8]);
008899 ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
008900 }
008901 }
008902
008903 assert( pParent->isInit );
008904 TRACE(("BALANCE: finished: old=%u new=%u cells=%u\n",
008905 nOld, nNew, b.nCell));
008906
008907 /* Free any old pages that were not reused as new pages.
008908 */
008909 for(i=nNew; i<nOld; i++){
008910 freePage(apOld[i], &rc);
008911 }
008912
008913 #if 0
008914 if( ISAUTOVACUUM(pBt) && rc==SQLITE_OK && apNew[0]->isInit ){
008915 /* The ptrmapCheckPages() contains assert() statements that verify that
008916 ** all pointer map pages are set correctly. This is helpful while
008917 ** debugging. This is usually disabled because a corrupt database may
008918 ** cause an assert() statement to fail. */
008919 ptrmapCheckPages(apNew, nNew);
008920 ptrmapCheckPages(&pParent, 1);
008921 }
008922 #endif
008923
008924 /*
008925 ** Cleanup before returning.
008926 */
008927 balance_cleanup:
008928 sqlite3StackFree(0, b.apCell);
008929 for(i=0; i<nOld; i++){
008930 releasePage(apOld[i]);
008931 }
008932 for(i=0; i<nNew; i++){
008933 releasePage(apNew[i]);
008934 }
008935
008936 return rc;
008937 }
008938
008939
008940 /*
008941 ** This function is called when the root page of a b-tree structure is
008942 ** overfull (has one or more overflow pages).
008943 **
008944 ** A new child page is allocated and the contents of the current root
008945 ** page, including overflow cells, are copied into the child. The root
008946 ** page is then overwritten to make it an empty page with the right-child
008947 ** pointer pointing to the new page.
008948 **
008949 ** Before returning, all pointer-map entries corresponding to pages
008950 ** that the new child-page now contains pointers to are updated. The
008951 ** entry corresponding to the new right-child pointer of the root
008952 ** page is also updated.
008953 **
008954 ** If successful, *ppChild is set to contain a reference to the child
008955 ** page and SQLITE_OK is returned. In this case the caller is required
008956 ** to call releasePage() on *ppChild exactly once. If an error occurs,
008957 ** an error code is returned and *ppChild is set to 0.
008958 */
008959 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
008960 int rc; /* Return value from subprocedures */
008961 MemPage *pChild = 0; /* Pointer to a new child page */
008962 Pgno pgnoChild = 0; /* Page number of the new child page */
008963 BtShared *pBt = pRoot->pBt; /* The BTree */
008964
008965 assert( pRoot->nOverflow>0 );
008966 assert( sqlite3_mutex_held(pBt->mutex) );
008967
008968 /* Make pRoot, the root page of the b-tree, writable. Allocate a new
008969 ** page that will become the new right-child of pPage. Copy the contents
008970 ** of the node stored on pRoot into the new child page.
008971 */
008972 rc = sqlite3PagerWrite(pRoot->pDbPage);
008973 if( rc==SQLITE_OK ){
008974 rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
008975 copyNodeContent(pRoot, pChild, &rc);
008976 if( ISAUTOVACUUM(pBt) ){
008977 ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);
008978 }
008979 }
008980 if( rc ){
008981 *ppChild = 0;
008982 releasePage(pChild);
008983 return rc;
008984 }
008985 assert( sqlite3PagerIswriteable(pChild->pDbPage) );
008986 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
008987 assert( pChild->nCell==pRoot->nCell || CORRUPT_DB );
008988
008989 TRACE(("BALANCE: copy root %u into %u\n", pRoot->pgno, pChild->pgno));
008990
008991 /* Copy the overflow cells from pRoot to pChild */
008992 memcpy(pChild->aiOvfl, pRoot->aiOvfl,
008993 pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));
008994 memcpy(pChild->apOvfl, pRoot->apOvfl,
008995 pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));
008996 pChild->nOverflow = pRoot->nOverflow;
008997
008998 /* Zero the contents of pRoot. Then install pChild as the right-child. */
008999 zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
009000 put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
009001
009002 *ppChild = pChild;
009003 return SQLITE_OK;
009004 }
009005
009006 /*
009007 ** Return SQLITE_CORRUPT if any cursor other than pCur is currently valid
009008 ** on the same B-tree as pCur.
009009 **
009010 ** This can occur if a database is corrupt with two or more SQL tables
009011 ** pointing to the same b-tree. If an insert occurs on one SQL table
009012 ** and causes a BEFORE TRIGGER to do a secondary insert on the other SQL
009013 ** table linked to the same b-tree. If the secondary insert causes a
009014 ** rebalance, that can change content out from under the cursor on the
009015 ** first SQL table, violating invariants on the first insert.
009016 */
009017 static int anotherValidCursor(BtCursor *pCur){
009018 BtCursor *pOther;
009019 for(pOther=pCur->pBt->pCursor; pOther; pOther=pOther->pNext){
009020 if( pOther!=pCur
009021 && pOther->eState==CURSOR_VALID
009022 && pOther->pPage==pCur->pPage
009023 ){
009024 return SQLITE_CORRUPT_PAGE(pCur->pPage);
009025 }
009026 }
009027 return SQLITE_OK;
009028 }
009029
009030 /*
009031 ** The page that pCur currently points to has just been modified in
009032 ** some way. This function figures out if this modification means the
009033 ** tree needs to be balanced, and if so calls the appropriate balancing
009034 ** routine. Balancing routines are:
009035 **
009036 ** balance_quick()
009037 ** balance_deeper()
009038 ** balance_nonroot()
009039 */
009040 static int balance(BtCursor *pCur){
009041 int rc = SQLITE_OK;
009042 u8 aBalanceQuickSpace[13];
009043 u8 *pFree = 0;
009044
009045 VVA_ONLY( int balance_quick_called = 0 );
009046 VVA_ONLY( int balance_deeper_called = 0 );
009047
009048 do {
009049 int iPage;
009050 MemPage *pPage = pCur->pPage;
009051
009052 if( NEVER(pPage->nFree<0) && btreeComputeFreeSpace(pPage) ) break;
009053 if( pPage->nOverflow==0 && pPage->nFree*3<=(int)pCur->pBt->usableSize*2 ){
009054 /* No rebalance required as long as:
009055 ** (1) There are no overflow cells
009056 ** (2) The amount of free space on the page is less than 2/3rds of
009057 ** the total usable space on the page. */
009058 break;
009059 }else if( (iPage = pCur->iPage)==0 ){
009060 if( pPage->nOverflow && (rc = anotherValidCursor(pCur))==SQLITE_OK ){
009061 /* The root page of the b-tree is overfull. In this case call the
009062 ** balance_deeper() function to create a new child for the root-page
009063 ** and copy the current contents of the root-page to it. The
009064 ** next iteration of the do-loop will balance the child page.
009065 */
009066 assert( balance_deeper_called==0 );
009067 VVA_ONLY( balance_deeper_called++ );
009068 rc = balance_deeper(pPage, &pCur->apPage[1]);
009069 if( rc==SQLITE_OK ){
009070 pCur->iPage = 1;
009071 pCur->ix = 0;
009072 pCur->aiIdx[0] = 0;
009073 pCur->apPage[0] = pPage;
009074 pCur->pPage = pCur->apPage[1];
009075 assert( pCur->pPage->nOverflow );
009076 }
009077 }else{
009078 break;
009079 }
009080 }else if( sqlite3PagerPageRefcount(pPage->pDbPage)>1 ){
009081 /* The page being written is not a root page, and there is currently
009082 ** more than one reference to it. This only happens if the page is one
009083 ** of its own ancestor pages. Corruption. */
009084 rc = SQLITE_CORRUPT_PAGE(pPage);
009085 }else{
009086 MemPage * const pParent = pCur->apPage[iPage-1];
009087 int const iIdx = pCur->aiIdx[iPage-1];
009088
009089 rc = sqlite3PagerWrite(pParent->pDbPage);
009090 if( rc==SQLITE_OK && pParent->nFree<0 ){
009091 rc = btreeComputeFreeSpace(pParent);
009092 }
009093 if( rc==SQLITE_OK ){
009094 #ifndef SQLITE_OMIT_QUICKBALANCE
009095 if( pPage->intKeyLeaf
009096 && pPage->nOverflow==1
009097 && pPage->aiOvfl[0]==pPage->nCell
009098 && pParent->pgno!=1
009099 && pParent->nCell==iIdx
009100 ){
009101 /* Call balance_quick() to create a new sibling of pPage on which
009102 ** to store the overflow cell. balance_quick() inserts a new cell
009103 ** into pParent, which may cause pParent overflow. If this
009104 ** happens, the next iteration of the do-loop will balance pParent
009105 ** use either balance_nonroot() or balance_deeper(). Until this
009106 ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
009107 ** buffer.
009108 **
009109 ** The purpose of the following assert() is to check that only a
009110 ** single call to balance_quick() is made for each call to this
009111 ** function. If this were not verified, a subtle bug involving reuse
009112 ** of the aBalanceQuickSpace[] might sneak in.
009113 */
009114 assert( balance_quick_called==0 );
009115 VVA_ONLY( balance_quick_called++ );
009116 rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
009117 }else
009118 #endif
009119 {
009120 /* In this case, call balance_nonroot() to redistribute cells
009121 ** between pPage and up to 2 of its sibling pages. This involves
009122 ** modifying the contents of pParent, which may cause pParent to
009123 ** become overfull or underfull. The next iteration of the do-loop
009124 ** will balance the parent page to correct this.
009125 **
009126 ** If the parent page becomes overfull, the overflow cell or cells
009127 ** are stored in the pSpace buffer allocated immediately below.
009128 ** A subsequent iteration of the do-loop will deal with this by
009129 ** calling balance_nonroot() (balance_deeper() may be called first,
009130 ** but it doesn't deal with overflow cells - just moves them to a
009131 ** different page). Once this subsequent call to balance_nonroot()
009132 ** has completed, it is safe to release the pSpace buffer used by
009133 ** the previous call, as the overflow cell data will have been
009134 ** copied either into the body of a database page or into the new
009135 ** pSpace buffer passed to the latter call to balance_nonroot().
009136 */
009137 u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
009138 rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1,
009139 pCur->hints&BTREE_BULKLOAD);
009140 if( pFree ){
009141 /* If pFree is not NULL, it points to the pSpace buffer used
009142 ** by a previous call to balance_nonroot(). Its contents are
009143 ** now stored either on real database pages or within the
009144 ** new pSpace buffer, so it may be safely freed here. */
009145 sqlite3PageFree(pFree);
009146 }
009147
009148 /* The pSpace buffer will be freed after the next call to
009149 ** balance_nonroot(), or just before this function returns, whichever
009150 ** comes first. */
009151 pFree = pSpace;
009152 }
009153 }
009154
009155 pPage->nOverflow = 0;
009156
009157 /* The next iteration of the do-loop balances the parent page. */
009158 releasePage(pPage);
009159 pCur->iPage--;
009160 assert( pCur->iPage>=0 );
009161 pCur->pPage = pCur->apPage[pCur->iPage];
009162 }
009163 }while( rc==SQLITE_OK );
009164
009165 if( pFree ){
009166 sqlite3PageFree(pFree);
009167 }
009168 return rc;
009169 }
009170
009171 /* Overwrite content from pX into pDest. Only do the write if the
009172 ** content is different from what is already there.
009173 */
009174 static int btreeOverwriteContent(
009175 MemPage *pPage, /* MemPage on which writing will occur */
009176 u8 *pDest, /* Pointer to the place to start writing */
009177 const BtreePayload *pX, /* Source of data to write */
009178 int iOffset, /* Offset of first byte to write */
009179 int iAmt /* Number of bytes to be written */
009180 ){
009181 int nData = pX->nData - iOffset;
009182 if( nData<=0 ){
009183 /* Overwriting with zeros */
009184 int i;
009185 for(i=0; i<iAmt && pDest[i]==0; i++){}
009186 if( i<iAmt ){
009187 int rc = sqlite3PagerWrite(pPage->pDbPage);
009188 if( rc ) return rc;
009189 memset(pDest + i, 0, iAmt - i);
009190 }
009191 }else{
009192 if( nData<iAmt ){
009193 /* Mixed read data and zeros at the end. Make a recursive call
009194 ** to write the zeros then fall through to write the real data */
009195 int rc = btreeOverwriteContent(pPage, pDest+nData, pX, iOffset+nData,
009196 iAmt-nData);
009197 if( rc ) return rc;
009198 iAmt = nData;
009199 }
009200 if( memcmp(pDest, ((u8*)pX->pData) + iOffset, iAmt)!=0 ){
009201 int rc = sqlite3PagerWrite(pPage->pDbPage);
009202 if( rc ) return rc;
009203 /* In a corrupt database, it is possible for the source and destination
009204 ** buffers to overlap. This is harmless since the database is already
009205 ** corrupt but it does cause valgrind and ASAN warnings. So use
009206 ** memmove(). */
009207 memmove(pDest, ((u8*)pX->pData) + iOffset, iAmt);
009208 }
009209 }
009210 return SQLITE_OK;
009211 }
009212
009213 /*
009214 ** Overwrite the cell that cursor pCur is pointing to with fresh content
009215 ** contained in pX. In this variant, pCur is pointing to an overflow
009216 ** cell.
009217 */
009218 static SQLITE_NOINLINE int btreeOverwriteOverflowCell(
009219 BtCursor *pCur, /* Cursor pointing to cell to overwrite */
009220 const BtreePayload *pX /* Content to write into the cell */
009221 ){
009222 int iOffset; /* Next byte of pX->pData to write */
009223 int nTotal = pX->nData + pX->nZero; /* Total bytes of to write */
009224 int rc; /* Return code */
009225 MemPage *pPage = pCur->pPage; /* Page being written */
009226 BtShared *pBt; /* Btree */
009227 Pgno ovflPgno; /* Next overflow page to write */
009228 u32 ovflPageSize; /* Size to write on overflow page */
009229
009230 assert( pCur->info.nLocal<nTotal ); /* pCur is an overflow cell */
009231
009232 /* Overwrite the local portion first */
009233 rc = btreeOverwriteContent(pPage, pCur->info.pPayload, pX,
009234 0, pCur->info.nLocal);
009235 if( rc ) return rc;
009236
009237 /* Now overwrite the overflow pages */
009238 iOffset = pCur->info.nLocal;
009239 assert( nTotal>=0 );
009240 assert( iOffset>=0 );
009241 ovflPgno = get4byte(pCur->info.pPayload + iOffset);
009242 pBt = pPage->pBt;
009243 ovflPageSize = pBt->usableSize - 4;
009244 do{
009245 rc = btreeGetPage(pBt, ovflPgno, &pPage, 0);
009246 if( rc ) return rc;
009247 if( sqlite3PagerPageRefcount(pPage->pDbPage)!=1 || pPage->isInit ){
009248 rc = SQLITE_CORRUPT_PAGE(pPage);
009249 }else{
009250 if( iOffset+ovflPageSize<(u32)nTotal ){
009251 ovflPgno = get4byte(pPage->aData);
009252 }else{
009253 ovflPageSize = nTotal - iOffset;
009254 }
009255 rc = btreeOverwriteContent(pPage, pPage->aData+4, pX,
009256 iOffset, ovflPageSize);
009257 }
009258 sqlite3PagerUnref(pPage->pDbPage);
009259 if( rc ) return rc;
009260 iOffset += ovflPageSize;
009261 }while( iOffset<nTotal );
009262 return SQLITE_OK;
009263 }
009264
009265 /*
009266 ** Overwrite the cell that cursor pCur is pointing to with fresh content
009267 ** contained in pX.
009268 */
009269 static int btreeOverwriteCell(BtCursor *pCur, const BtreePayload *pX){
009270 int nTotal = pX->nData + pX->nZero; /* Total bytes of to write */
009271 MemPage *pPage = pCur->pPage; /* Page being written */
009272
009273 if( pCur->info.pPayload + pCur->info.nLocal > pPage->aDataEnd
009274 || pCur->info.pPayload < pPage->aData + pPage->cellOffset
009275 ){
009276 return SQLITE_CORRUPT_PAGE(pPage);
009277 }
009278 if( pCur->info.nLocal==nTotal ){
009279 /* The entire cell is local */
009280 return btreeOverwriteContent(pPage, pCur->info.pPayload, pX,
009281 0, pCur->info.nLocal);
009282 }else{
009283 /* The cell contains overflow content */
009284 return btreeOverwriteOverflowCell(pCur, pX);
009285 }
009286 }
009287
009288
009289 /*
009290 ** Insert a new record into the BTree. The content of the new record
009291 ** is described by the pX object. The pCur cursor is used only to
009292 ** define what table the record should be inserted into, and is left
009293 ** pointing at a random location.
009294 **
009295 ** For a table btree (used for rowid tables), only the pX.nKey value of
009296 ** the key is used. The pX.pKey value must be NULL. The pX.nKey is the
009297 ** rowid or INTEGER PRIMARY KEY of the row. The pX.nData,pData,nZero fields
009298 ** hold the content of the row.
009299 **
009300 ** For an index btree (used for indexes and WITHOUT ROWID tables), the
009301 ** key is an arbitrary byte sequence stored in pX.pKey,nKey. The
009302 ** pX.pData,nData,nZero fields must be zero.
009303 **
009304 ** If the seekResult parameter is non-zero, then a successful call to
009305 ** sqlite3BtreeIndexMoveto() to seek cursor pCur to (pKey,nKey) has already
009306 ** been performed. In other words, if seekResult!=0 then the cursor
009307 ** is currently pointing to a cell that will be adjacent to the cell
009308 ** to be inserted. If seekResult<0 then pCur points to a cell that is
009309 ** smaller then (pKey,nKey). If seekResult>0 then pCur points to a cell
009310 ** that is larger than (pKey,nKey).
009311 **
009312 ** If seekResult==0, that means pCur is pointing at some unknown location.
009313 ** In that case, this routine must seek the cursor to the correct insertion
009314 ** point for (pKey,nKey) before doing the insertion. For index btrees,
009315 ** if pX->nMem is non-zero, then pX->aMem contains pointers to the unpacked
009316 ** key values and pX->aMem can be used instead of pX->pKey to avoid having
009317 ** to decode the key.
009318 */
009319 int sqlite3BtreeInsert(
009320 BtCursor *pCur, /* Insert data into the table of this cursor */
009321 const BtreePayload *pX, /* Content of the row to be inserted */
009322 int flags, /* True if this is likely an append */
009323 int seekResult /* Result of prior IndexMoveto() call */
009324 ){
009325 int rc;
009326 int loc = seekResult; /* -1: before desired location +1: after */
009327 int szNew = 0;
009328 int idx;
009329 MemPage *pPage;
009330 Btree *p = pCur->pBtree;
009331 unsigned char *oldCell;
009332 unsigned char *newCell = 0;
009333
009334 assert( (flags & (BTREE_SAVEPOSITION|BTREE_APPEND|BTREE_PREFORMAT))==flags );
009335 assert( (flags & BTREE_PREFORMAT)==0 || seekResult || pCur->pKeyInfo==0 );
009336
009337 /* Save the positions of any other cursors open on this table.
009338 **
009339 ** In some cases, the call to btreeMoveto() below is a no-op. For
009340 ** example, when inserting data into a table with auto-generated integer
009341 ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the
009342 ** integer key to use. It then calls this function to actually insert the
009343 ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
009344 ** that the cursor is already where it needs to be and returns without
009345 ** doing any work. To avoid thwarting these optimizations, it is important
009346 ** not to clear the cursor here.
009347 */
009348 if( pCur->curFlags & BTCF_Multiple ){
009349 rc = saveAllCursors(p->pBt, pCur->pgnoRoot, pCur);
009350 if( rc ) return rc;
009351 if( loc && pCur->iPage<0 ){
009352 /* This can only happen if the schema is corrupt such that there is more
009353 ** than one table or index with the same root page as used by the cursor.
009354 ** Which can only happen if the SQLITE_NoSchemaError flag was set when
009355 ** the schema was loaded. This cannot be asserted though, as a user might
009356 ** set the flag, load the schema, and then unset the flag. */
009357 return SQLITE_CORRUPT_PGNO(pCur->pgnoRoot);
009358 }
009359 }
009360
009361 /* Ensure that the cursor is not in the CURSOR_FAULT state and that it
009362 ** points to a valid cell.
009363 */
009364 if( pCur->eState>=CURSOR_REQUIRESEEK ){
009365 testcase( pCur->eState==CURSOR_REQUIRESEEK );
009366 testcase( pCur->eState==CURSOR_FAULT );
009367 rc = moveToRoot(pCur);
009368 if( rc && rc!=SQLITE_EMPTY ) return rc;
009369 }
009370
009371 assert( cursorOwnsBtShared(pCur) );
009372 assert( (pCur->curFlags & BTCF_WriteFlag)!=0
009373 && p->pBt->inTransaction==TRANS_WRITE
009374 && (p->pBt->btsFlags & BTS_READ_ONLY)==0 );
009375 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
009376
009377 /* Assert that the caller has been consistent. If this cursor was opened
009378 ** expecting an index b-tree, then the caller should be inserting blob
009379 ** keys with no associated data. If the cursor was opened expecting an
009380 ** intkey table, the caller should be inserting integer keys with a
009381 ** blob of associated data. */
009382 assert( (flags & BTREE_PREFORMAT) || (pX->pKey==0)==(pCur->pKeyInfo==0) );
009383
009384 if( pCur->pKeyInfo==0 ){
009385 assert( pX->pKey==0 );
009386 /* If this is an insert into a table b-tree, invalidate any incrblob
009387 ** cursors open on the row being replaced */
009388 if( p->hasIncrblobCur ){
009389 invalidateIncrblobCursors(p, pCur->pgnoRoot, pX->nKey, 0);
009390 }
009391
009392 /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing
009393 ** to a row with the same key as the new entry being inserted.
009394 */
009395 #ifdef SQLITE_DEBUG
009396 if( flags & BTREE_SAVEPOSITION ){
009397 assert( pCur->curFlags & BTCF_ValidNKey );
009398 assert( pX->nKey==pCur->info.nKey );
009399 assert( loc==0 );
009400 }
009401 #endif
009402
009403 /* On the other hand, BTREE_SAVEPOSITION==0 does not imply
009404 ** that the cursor is not pointing to a row to be overwritten.
009405 ** So do a complete check.
009406 */
009407 if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey ){
009408 /* The cursor is pointing to the entry that is to be
009409 ** overwritten */
009410 assert( pX->nData>=0 && pX->nZero>=0 );
009411 if( pCur->info.nSize!=0
009412 && pCur->info.nPayload==(u32)pX->nData+pX->nZero
009413 ){
009414 /* New entry is the same size as the old. Do an overwrite */
009415 return btreeOverwriteCell(pCur, pX);
009416 }
009417 assert( loc==0 );
009418 }else if( loc==0 ){
009419 /* The cursor is *not* pointing to the cell to be overwritten, nor
009420 ** to an adjacent cell. Move the cursor so that it is pointing either
009421 ** to the cell to be overwritten or an adjacent cell.
009422 */
009423 rc = sqlite3BtreeTableMoveto(pCur, pX->nKey,
009424 (flags & BTREE_APPEND)!=0, &loc);
009425 if( rc ) return rc;
009426 }
009427 }else{
009428 /* This is an index or a WITHOUT ROWID table */
009429
009430 /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing
009431 ** to a row with the same key as the new entry being inserted.
009432 */
009433 assert( (flags & BTREE_SAVEPOSITION)==0 || loc==0 );
009434
009435 /* If the cursor is not already pointing either to the cell to be
009436 ** overwritten, or if a new cell is being inserted, if the cursor is
009437 ** not pointing to an immediately adjacent cell, then move the cursor
009438 ** so that it does.
009439 */
009440 if( loc==0 && (flags & BTREE_SAVEPOSITION)==0 ){
009441 if( pX->nMem ){
009442 UnpackedRecord r;
009443 r.pKeyInfo = pCur->pKeyInfo;
009444 r.aMem = pX->aMem;
009445 r.nField = pX->nMem;
009446 r.default_rc = 0;
009447 r.eqSeen = 0;
009448 rc = sqlite3BtreeIndexMoveto(pCur, &r, &loc);
009449 }else{
009450 rc = btreeMoveto(pCur, pX->pKey, pX->nKey,
009451 (flags & BTREE_APPEND)!=0, &loc);
009452 }
009453 if( rc ) return rc;
009454 }
009455
009456 /* If the cursor is currently pointing to an entry to be overwritten
009457 ** and the new content is the same as as the old, then use the
009458 ** overwrite optimization.
009459 */
009460 if( loc==0 ){
009461 getCellInfo(pCur);
009462 if( pCur->info.nKey==pX->nKey ){
009463 BtreePayload x2;
009464 x2.pData = pX->pKey;
009465 x2.nData = pX->nKey;
009466 x2.nZero = 0;
009467 return btreeOverwriteCell(pCur, &x2);
009468 }
009469 }
009470 }
009471 assert( pCur->eState==CURSOR_VALID
009472 || (pCur->eState==CURSOR_INVALID && loc) || CORRUPT_DB );
009473
009474 pPage = pCur->pPage;
009475 assert( pPage->intKey || pX->nKey>=0 || (flags & BTREE_PREFORMAT) );
009476 assert( pPage->leaf || !pPage->intKey );
009477 if( pPage->nFree<0 ){
009478 if( NEVER(pCur->eState>CURSOR_INVALID) ){
009479 /* ^^^^^--- due to the moveToRoot() call above */
009480 rc = SQLITE_CORRUPT_PAGE(pPage);
009481 }else{
009482 rc = btreeComputeFreeSpace(pPage);
009483 }
009484 if( rc ) return rc;
009485 }
009486
009487 TRACE(("INSERT: table=%u nkey=%lld ndata=%u page=%u %s\n",
009488 pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno,
009489 loc==0 ? "overwrite" : "new entry"));
009490 assert( pPage->isInit || CORRUPT_DB );
009491 newCell = p->pBt->pTmpSpace;
009492 assert( newCell!=0 );
009493 assert( BTREE_PREFORMAT==OPFLAG_PREFORMAT );
009494 if( flags & BTREE_PREFORMAT ){
009495 rc = SQLITE_OK;
009496 szNew = p->pBt->nPreformatSize;
009497 if( szNew<4 ){
009498 szNew = 4;
009499 newCell[3] = 0;
009500 }
009501 if( ISAUTOVACUUM(p->pBt) && szNew>pPage->maxLocal ){
009502 CellInfo info;
009503 pPage->xParseCell(pPage, newCell, &info);
009504 if( info.nPayload!=info.nLocal ){
009505 Pgno ovfl = get4byte(&newCell[szNew-4]);
009506 ptrmapPut(p->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, &rc);
009507 if( NEVER(rc) ) goto end_insert;
009508 }
009509 }
009510 }else{
009511 rc = fillInCell(pPage, newCell, pX, &szNew);
009512 if( rc ) goto end_insert;
009513 }
009514 assert( szNew==pPage->xCellSize(pPage, newCell) );
009515 assert( szNew <= MX_CELL_SIZE(p->pBt) );
009516 idx = pCur->ix;
009517 pCur->info.nSize = 0;
009518 if( loc==0 ){
009519 CellInfo info;
009520 assert( idx>=0 );
009521 if( idx>=pPage->nCell ){
009522 return SQLITE_CORRUPT_PAGE(pPage);
009523 }
009524 rc = sqlite3PagerWrite(pPage->pDbPage);
009525 if( rc ){
009526 goto end_insert;
009527 }
009528 oldCell = findCell(pPage, idx);
009529 if( !pPage->leaf ){
009530 memcpy(newCell, oldCell, 4);
009531 }
009532 BTREE_CLEAR_CELL(rc, pPage, oldCell, info);
009533 testcase( pCur->curFlags & BTCF_ValidOvfl );
009534 invalidateOverflowCache(pCur);
009535 if( info.nSize==szNew && info.nLocal==info.nPayload
009536 && (!ISAUTOVACUUM(p->pBt) || szNew<pPage->minLocal)
009537 ){
009538 /* Overwrite the old cell with the new if they are the same size.
009539 ** We could also try to do this if the old cell is smaller, then add
009540 ** the leftover space to the free list. But experiments show that
009541 ** doing that is no faster then skipping this optimization and just
009542 ** calling dropCell() and insertCell().
009543 **
009544 ** This optimization cannot be used on an autovacuum database if the
009545 ** new entry uses overflow pages, as the insertCell() call below is
009546 ** necessary to add the PTRMAP_OVERFLOW1 pointer-map entry. */
009547 assert( rc==SQLITE_OK ); /* clearCell never fails when nLocal==nPayload */
009548 if( oldCell < pPage->aData+pPage->hdrOffset+10 ){
009549 return SQLITE_CORRUPT_PAGE(pPage);
009550 }
009551 if( oldCell+szNew > pPage->aDataEnd ){
009552 return SQLITE_CORRUPT_PAGE(pPage);
009553 }
009554 memcpy(oldCell, newCell, szNew);
009555 return SQLITE_OK;
009556 }
009557 dropCell(pPage, idx, info.nSize, &rc);
009558 if( rc ) goto end_insert;
009559 }else if( loc<0 && pPage->nCell>0 ){
009560 assert( pPage->leaf );
009561 idx = ++pCur->ix;
009562 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
009563 }else{
009564 assert( pPage->leaf );
009565 }
009566 rc = insertCellFast(pPage, idx, newCell, szNew);
009567 assert( pPage->nOverflow==0 || rc==SQLITE_OK );
009568 assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );
009569
009570 /* If no error has occurred and pPage has an overflow cell, call balance()
009571 ** to redistribute the cells within the tree. Since balance() may move
009572 ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey
009573 ** variables.
009574 **
009575 ** Previous versions of SQLite called moveToRoot() to move the cursor
009576 ** back to the root page as balance() used to invalidate the contents
009577 ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
009578 ** set the cursor state to "invalid". This makes common insert operations
009579 ** slightly faster.
009580 **
009581 ** There is a subtle but important optimization here too. When inserting
009582 ** multiple records into an intkey b-tree using a single cursor (as can
009583 ** happen while processing an "INSERT INTO ... SELECT" statement), it
009584 ** is advantageous to leave the cursor pointing to the last entry in
009585 ** the b-tree if possible. If the cursor is left pointing to the last
009586 ** entry in the table, and the next row inserted has an integer key
009587 ** larger than the largest existing key, it is possible to insert the
009588 ** row without seeking the cursor. This can be a big performance boost.
009589 */
009590 if( pPage->nOverflow ){
009591 assert( rc==SQLITE_OK );
009592 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
009593 rc = balance(pCur);
009594
009595 /* Must make sure nOverflow is reset to zero even if the balance()
009596 ** fails. Internal data structure corruption will result otherwise.
009597 ** Also, set the cursor state to invalid. This stops saveCursorPosition()
009598 ** from trying to save the current position of the cursor. */
009599 pCur->pPage->nOverflow = 0;
009600 pCur->eState = CURSOR_INVALID;
009601 if( (flags & BTREE_SAVEPOSITION) && rc==SQLITE_OK ){
009602 btreeReleaseAllCursorPages(pCur);
009603 if( pCur->pKeyInfo ){
009604 assert( pCur->pKey==0 );
009605 pCur->pKey = sqlite3Malloc( pX->nKey );
009606 if( pCur->pKey==0 ){
009607 rc = SQLITE_NOMEM;
009608 }else{
009609 memcpy(pCur->pKey, pX->pKey, pX->nKey);
009610 }
009611 }
009612 pCur->eState = CURSOR_REQUIRESEEK;
009613 pCur->nKey = pX->nKey;
009614 }
009615 }
009616 assert( pCur->iPage<0 || pCur->pPage->nOverflow==0 );
009617
009618 end_insert:
009619 return rc;
009620 }
009621
009622 /*
009623 ** This function is used as part of copying the current row from cursor
009624 ** pSrc into cursor pDest. If the cursors are open on intkey tables, then
009625 ** parameter iKey is used as the rowid value when the record is copied
009626 ** into pDest. Otherwise, the record is copied verbatim.
009627 **
009628 ** This function does not actually write the new value to cursor pDest.
009629 ** Instead, it creates and populates any required overflow pages and
009630 ** writes the data for the new cell into the BtShared.pTmpSpace buffer
009631 ** for the destination database. The size of the cell, in bytes, is left
009632 ** in BtShared.nPreformatSize. The caller completes the insertion by
009633 ** calling sqlite3BtreeInsert() with the BTREE_PREFORMAT flag specified.
009634 **
009635 ** SQLITE_OK is returned if successful, or an SQLite error code otherwise.
009636 */
009637 int sqlite3BtreeTransferRow(BtCursor *pDest, BtCursor *pSrc, i64 iKey){
009638 BtShared *pBt = pDest->pBt;
009639 u8 *aOut = pBt->pTmpSpace; /* Pointer to next output buffer */
009640 const u8 *aIn; /* Pointer to next input buffer */
009641 u32 nIn; /* Size of input buffer aIn[] */
009642 u32 nRem; /* Bytes of data still to copy */
009643
009644 getCellInfo(pSrc);
009645 if( pSrc->info.nPayload<0x80 ){
009646 *(aOut++) = pSrc->info.nPayload;
009647 }else{
009648 aOut += sqlite3PutVarint(aOut, pSrc->info.nPayload);
009649 }
009650 if( pDest->pKeyInfo==0 ) aOut += putVarint(aOut, iKey);
009651 nIn = pSrc->info.nLocal;
009652 aIn = pSrc->info.pPayload;
009653 if( aIn+nIn>pSrc->pPage->aDataEnd ){
009654 return SQLITE_CORRUPT_PAGE(pSrc->pPage);
009655 }
009656 nRem = pSrc->info.nPayload;
009657 if( nIn==nRem && nIn<pDest->pPage->maxLocal ){
009658 memcpy(aOut, aIn, nIn);
009659 pBt->nPreformatSize = nIn + (aOut - pBt->pTmpSpace);
009660 return SQLITE_OK;
009661 }else{
009662 int rc = SQLITE_OK;
009663 Pager *pSrcPager = pSrc->pBt->pPager;
009664 u8 *pPgnoOut = 0;
009665 Pgno ovflIn = 0;
009666 DbPage *pPageIn = 0;
009667 MemPage *pPageOut = 0;
009668 u32 nOut; /* Size of output buffer aOut[] */
009669
009670 nOut = btreePayloadToLocal(pDest->pPage, pSrc->info.nPayload);
009671 pBt->nPreformatSize = nOut + (aOut - pBt->pTmpSpace);
009672 if( nOut<pSrc->info.nPayload ){
009673 pPgnoOut = &aOut[nOut];
009674 pBt->nPreformatSize += 4;
009675 }
009676
009677 if( nRem>nIn ){
009678 if( aIn+nIn+4>pSrc->pPage->aDataEnd ){
009679 return SQLITE_CORRUPT_PAGE(pSrc->pPage);
009680 }
009681 ovflIn = get4byte(&pSrc->info.pPayload[nIn]);
009682 }
009683
009684 do {
009685 nRem -= nOut;
009686 do{
009687 assert( nOut>0 );
009688 if( nIn>0 ){
009689 int nCopy = MIN(nOut, nIn);
009690 memcpy(aOut, aIn, nCopy);
009691 nOut -= nCopy;
009692 nIn -= nCopy;
009693 aOut += nCopy;
009694 aIn += nCopy;
009695 }
009696 if( nOut>0 ){
009697 sqlite3PagerUnref(pPageIn);
009698 pPageIn = 0;
009699 rc = sqlite3PagerGet(pSrcPager, ovflIn, &pPageIn, PAGER_GET_READONLY);
009700 if( rc==SQLITE_OK ){
009701 aIn = (const u8*)sqlite3PagerGetData(pPageIn);
009702 ovflIn = get4byte(aIn);
009703 aIn += 4;
009704 nIn = pSrc->pBt->usableSize - 4;
009705 }
009706 }
009707 }while( rc==SQLITE_OK && nOut>0 );
009708
009709 if( rc==SQLITE_OK && nRem>0 && ALWAYS(pPgnoOut) ){
009710 Pgno pgnoNew;
009711 MemPage *pNew = 0;
009712 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
009713 put4byte(pPgnoOut, pgnoNew);
009714 if( ISAUTOVACUUM(pBt) && pPageOut ){
009715 ptrmapPut(pBt, pgnoNew, PTRMAP_OVERFLOW2, pPageOut->pgno, &rc);
009716 }
009717 releasePage(pPageOut);
009718 pPageOut = pNew;
009719 if( pPageOut ){
009720 pPgnoOut = pPageOut->aData;
009721 put4byte(pPgnoOut, 0);
009722 aOut = &pPgnoOut[4];
009723 nOut = MIN(pBt->usableSize - 4, nRem);
009724 }
009725 }
009726 }while( nRem>0 && rc==SQLITE_OK );
009727
009728 releasePage(pPageOut);
009729 sqlite3PagerUnref(pPageIn);
009730 return rc;
009731 }
009732 }
009733
009734 /*
009735 ** Delete the entry that the cursor is pointing to.
009736 **
009737 ** If the BTREE_SAVEPOSITION bit of the flags parameter is zero, then
009738 ** the cursor is left pointing at an arbitrary location after the delete.
009739 ** But if that bit is set, then the cursor is left in a state such that
009740 ** the next call to BtreeNext() or BtreePrev() moves it to the same row
009741 ** as it would have been on if the call to BtreeDelete() had been omitted.
009742 **
009743 ** The BTREE_AUXDELETE bit of flags indicates that is one of several deletes
009744 ** associated with a single table entry and its indexes. Only one of those
009745 ** deletes is considered the "primary" delete. The primary delete occurs
009746 ** on a cursor that is not a BTREE_FORDELETE cursor. All but one delete
009747 ** operation on non-FORDELETE cursors is tagged with the AUXDELETE flag.
009748 ** The BTREE_AUXDELETE bit is a hint that is not used by this implementation,
009749 ** but which might be used by alternative storage engines.
009750 */
009751 int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){
009752 Btree *p = pCur->pBtree;
009753 BtShared *pBt = p->pBt;
009754 int rc; /* Return code */
009755 MemPage *pPage; /* Page to delete cell from */
009756 unsigned char *pCell; /* Pointer to cell to delete */
009757 int iCellIdx; /* Index of cell to delete */
009758 int iCellDepth; /* Depth of node containing pCell */
009759 CellInfo info; /* Size of the cell being deleted */
009760 u8 bPreserve; /* Keep cursor valid. 2 for CURSOR_SKIPNEXT */
009761
009762 assert( cursorOwnsBtShared(pCur) );
009763 assert( pBt->inTransaction==TRANS_WRITE );
009764 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
009765 assert( pCur->curFlags & BTCF_WriteFlag );
009766 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
009767 assert( !hasReadConflicts(p, pCur->pgnoRoot) );
009768 assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 );
009769 if( pCur->eState!=CURSOR_VALID ){
009770 if( pCur->eState>=CURSOR_REQUIRESEEK ){
009771 rc = btreeRestoreCursorPosition(pCur);
009772 assert( rc!=SQLITE_OK || CORRUPT_DB || pCur->eState==CURSOR_VALID );
009773 if( rc || pCur->eState!=CURSOR_VALID ) return rc;
009774 }else{
009775 return SQLITE_CORRUPT_PGNO(pCur->pgnoRoot);
009776 }
009777 }
009778 assert( pCur->eState==CURSOR_VALID );
009779
009780 iCellDepth = pCur->iPage;
009781 iCellIdx = pCur->ix;
009782 pPage = pCur->pPage;
009783 if( pPage->nCell<=iCellIdx ){
009784 return SQLITE_CORRUPT_PAGE(pPage);
009785 }
009786 pCell = findCell(pPage, iCellIdx);
009787 if( pPage->nFree<0 && btreeComputeFreeSpace(pPage) ){
009788 return SQLITE_CORRUPT_PAGE(pPage);
009789 }
009790 if( pCell<&pPage->aCellIdx[pPage->nCell] ){
009791 return SQLITE_CORRUPT_PAGE(pPage);
009792 }
009793
009794 /* If the BTREE_SAVEPOSITION bit is on, then the cursor position must
009795 ** be preserved following this delete operation. If the current delete
009796 ** will cause a b-tree rebalance, then this is done by saving the cursor
009797 ** key and leaving the cursor in CURSOR_REQUIRESEEK state before
009798 ** returning.
009799 **
009800 ** If the current delete will not cause a rebalance, then the cursor
009801 ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately
009802 ** before or after the deleted entry.
009803 **
009804 ** The bPreserve value records which path is required:
009805 **
009806 ** bPreserve==0 Not necessary to save the cursor position
009807 ** bPreserve==1 Use CURSOR_REQUIRESEEK to save the cursor position
009808 ** bPreserve==2 Cursor won't move. Set CURSOR_SKIPNEXT.
009809 */
009810 bPreserve = (flags & BTREE_SAVEPOSITION)!=0;
009811 if( bPreserve ){
009812 if( !pPage->leaf
009813 || (pPage->nFree+pPage->xCellSize(pPage,pCell)+2) >
009814 (int)(pBt->usableSize*2/3)
009815 || pPage->nCell==1 /* See dbfuzz001.test for a test case */
009816 ){
009817 /* A b-tree rebalance will be required after deleting this entry.
009818 ** Save the cursor key. */
009819 rc = saveCursorKey(pCur);
009820 if( rc ) return rc;
009821 }else{
009822 bPreserve = 2;
009823 }
009824 }
009825
009826 /* If the page containing the entry to delete is not a leaf page, move
009827 ** the cursor to the largest entry in the tree that is smaller than
009828 ** the entry being deleted. This cell will replace the cell being deleted
009829 ** from the internal node. The 'previous' entry is used for this instead
009830 ** of the 'next' entry, as the previous entry is always a part of the
009831 ** sub-tree headed by the child page of the cell being deleted. This makes
009832 ** balancing the tree following the delete operation easier. */
009833 if( !pPage->leaf ){
009834 rc = sqlite3BtreePrevious(pCur, 0);
009835 assert( rc!=SQLITE_DONE );
009836 if( rc ) return rc;
009837 }
009838
009839 /* Save the positions of any other cursors open on this table before
009840 ** making any modifications. */
009841 if( pCur->curFlags & BTCF_Multiple ){
009842 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
009843 if( rc ) return rc;
009844 }
009845
009846 /* If this is a delete operation to remove a row from a table b-tree,
009847 ** invalidate any incrblob cursors open on the row being deleted. */
009848 if( pCur->pKeyInfo==0 && p->hasIncrblobCur ){
009849 invalidateIncrblobCursors(p, pCur->pgnoRoot, pCur->info.nKey, 0);
009850 }
009851
009852 /* Make the page containing the entry to be deleted writable. Then free any
009853 ** overflow pages associated with the entry and finally remove the cell
009854 ** itself from within the page. */
009855 rc = sqlite3PagerWrite(pPage->pDbPage);
009856 if( rc ) return rc;
009857 BTREE_CLEAR_CELL(rc, pPage, pCell, info);
009858 dropCell(pPage, iCellIdx, info.nSize, &rc);
009859 if( rc ) return rc;
009860
009861 /* If the cell deleted was not located on a leaf page, then the cursor
009862 ** is currently pointing to the largest entry in the sub-tree headed
009863 ** by the child-page of the cell that was just deleted from an internal
009864 ** node. The cell from the leaf node needs to be moved to the internal
009865 ** node to replace the deleted cell. */
009866 if( !pPage->leaf ){
009867 MemPage *pLeaf = pCur->pPage;
009868 int nCell;
009869 Pgno n;
009870 unsigned char *pTmp;
009871
009872 if( pLeaf->nFree<0 ){
009873 rc = btreeComputeFreeSpace(pLeaf);
009874 if( rc ) return rc;
009875 }
009876 if( iCellDepth<pCur->iPage-1 ){
009877 n = pCur->apPage[iCellDepth+1]->pgno;
009878 }else{
009879 n = pCur->pPage->pgno;
009880 }
009881 pCell = findCell(pLeaf, pLeaf->nCell-1);
009882 if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_PAGE(pLeaf);
009883 nCell = pLeaf->xCellSize(pLeaf, pCell);
009884 assert( MX_CELL_SIZE(pBt) >= nCell );
009885 pTmp = pBt->pTmpSpace;
009886 assert( pTmp!=0 );
009887 rc = sqlite3PagerWrite(pLeaf->pDbPage);
009888 if( rc==SQLITE_OK ){
009889 rc = insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n);
009890 }
009891 dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
009892 if( rc ) return rc;
009893 }
009894
009895 /* Balance the tree. If the entry deleted was located on a leaf page,
009896 ** then the cursor still points to that page. In this case the first
009897 ** call to balance() repairs the tree, and the if(...) condition is
009898 ** never true.
009899 **
009900 ** Otherwise, if the entry deleted was on an internal node page, then
009901 ** pCur is pointing to the leaf page from which a cell was removed to
009902 ** replace the cell deleted from the internal node. This is slightly
009903 ** tricky as the leaf node may be underfull, and the internal node may
009904 ** be either under or overfull. In this case run the balancing algorithm
009905 ** on the leaf node first. If the balance proceeds far enough up the
009906 ** tree that we can be sure that any problem in the internal node has
009907 ** been corrected, so be it. Otherwise, after balancing the leaf node,
009908 ** walk the cursor up the tree to the internal node and balance it as
009909 ** well. */
009910 assert( pCur->pPage->nOverflow==0 );
009911 assert( pCur->pPage->nFree>=0 );
009912 if( pCur->pPage->nFree*3<=(int)pCur->pBt->usableSize*2 ){
009913 /* Optimization: If the free space is less than 2/3rds of the page,
009914 ** then balance() will always be a no-op. No need to invoke it. */
009915 rc = SQLITE_OK;
009916 }else{
009917 rc = balance(pCur);
009918 }
009919 if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
009920 releasePageNotNull(pCur->pPage);
009921 pCur->iPage--;
009922 while( pCur->iPage>iCellDepth ){
009923 releasePage(pCur->apPage[pCur->iPage--]);
009924 }
009925 pCur->pPage = pCur->apPage[pCur->iPage];
009926 rc = balance(pCur);
009927 }
009928
009929 if( rc==SQLITE_OK ){
009930 if( bPreserve>1 ){
009931 assert( (pCur->iPage==iCellDepth || CORRUPT_DB) );
009932 assert( pPage==pCur->pPage || CORRUPT_DB );
009933 assert( (pPage->nCell>0 || CORRUPT_DB) && iCellIdx<=pPage->nCell );
009934 pCur->eState = CURSOR_SKIPNEXT;
009935 if( iCellIdx>=pPage->nCell ){
009936 pCur->skipNext = -1;
009937 pCur->ix = pPage->nCell-1;
009938 }else{
009939 pCur->skipNext = 1;
009940 }
009941 }else{
009942 rc = moveToRoot(pCur);
009943 if( bPreserve ){
009944 btreeReleaseAllCursorPages(pCur);
009945 pCur->eState = CURSOR_REQUIRESEEK;
009946 }
009947 if( rc==SQLITE_EMPTY ) rc = SQLITE_OK;
009948 }
009949 }
009950 return rc;
009951 }
009952
009953 /*
009954 ** Create a new BTree table. Write into *piTable the page
009955 ** number for the root page of the new table.
009956 **
009957 ** The type of type is determined by the flags parameter. Only the
009958 ** following values of flags are currently in use. Other values for
009959 ** flags might not work:
009960 **
009961 ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys
009962 ** BTREE_ZERODATA Used for SQL indices
009963 */
009964 static int btreeCreateTable(Btree *p, Pgno *piTable, int createTabFlags){
009965 BtShared *pBt = p->pBt;
009966 MemPage *pRoot;
009967 Pgno pgnoRoot;
009968 int rc;
009969 int ptfFlags; /* Page-type flags for the root page of new table */
009970
009971 assert( sqlite3BtreeHoldsMutex(p) );
009972 assert( pBt->inTransaction==TRANS_WRITE );
009973 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
009974
009975 #ifdef SQLITE_OMIT_AUTOVACUUM
009976 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
009977 if( rc ){
009978 return rc;
009979 }
009980 #else
009981 if( pBt->autoVacuum ){
009982 Pgno pgnoMove; /* Move a page here to make room for the root-page */
009983 MemPage *pPageMove; /* The page to move to. */
009984
009985 /* Creating a new table may probably require moving an existing database
009986 ** to make room for the new tables root page. In case this page turns
009987 ** out to be an overflow page, delete all overflow page-map caches
009988 ** held by open cursors.
009989 */
009990 invalidateAllOverflowCache(pBt);
009991
009992 /* Read the value of meta[3] from the database to determine where the
009993 ** root page of the new table should go. meta[3] is the largest root-page
009994 ** created so far, so the new root-page is (meta[3]+1).
009995 */
009996 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
009997 if( pgnoRoot>btreePagecount(pBt) ){
009998 return SQLITE_CORRUPT_PGNO(pgnoRoot);
009999 }
010000 pgnoRoot++;
010001
010002 /* The new root-page may not be allocated on a pointer-map page, or the
010003 ** PENDING_BYTE page.
010004 */
010005 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
010006 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
010007 pgnoRoot++;
010008 }
010009 assert( pgnoRoot>=3 );
010010
010011 /* Allocate a page. The page that currently resides at pgnoRoot will
010012 ** be moved to the allocated page (unless the allocated page happens
010013 ** to reside at pgnoRoot).
010014 */
010015 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT);
010016 if( rc!=SQLITE_OK ){
010017 return rc;
010018 }
010019
010020 if( pgnoMove!=pgnoRoot ){
010021 /* pgnoRoot is the page that will be used for the root-page of
010022 ** the new table (assuming an error did not occur). But we were
010023 ** allocated pgnoMove. If required (i.e. if it was not allocated
010024 ** by extending the file), the current page at position pgnoMove
010025 ** is already journaled.
010026 */
010027 u8 eType = 0;
010028 Pgno iPtrPage = 0;
010029
010030 /* Save the positions of any open cursors. This is required in
010031 ** case they are holding a reference to an xFetch reference
010032 ** corresponding to page pgnoRoot. */
010033 rc = saveAllCursors(pBt, 0, 0);
010034 releasePage(pPageMove);
010035 if( rc!=SQLITE_OK ){
010036 return rc;
010037 }
010038
010039 /* Move the page currently at pgnoRoot to pgnoMove. */
010040 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
010041 if( rc!=SQLITE_OK ){
010042 return rc;
010043 }
010044 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
010045 if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
010046 rc = SQLITE_CORRUPT_PGNO(pgnoRoot);
010047 }
010048 if( rc!=SQLITE_OK ){
010049 releasePage(pRoot);
010050 return rc;
010051 }
010052 assert( eType!=PTRMAP_ROOTPAGE );
010053 assert( eType!=PTRMAP_FREEPAGE );
010054 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
010055 releasePage(pRoot);
010056
010057 /* Obtain the page at pgnoRoot */
010058 if( rc!=SQLITE_OK ){
010059 return rc;
010060 }
010061 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
010062 if( rc!=SQLITE_OK ){
010063 return rc;
010064 }
010065 rc = sqlite3PagerWrite(pRoot->pDbPage);
010066 if( rc!=SQLITE_OK ){
010067 releasePage(pRoot);
010068 return rc;
010069 }
010070 }else{
010071 pRoot = pPageMove;
010072 }
010073
010074 /* Update the pointer-map and meta-data with the new root-page number. */
010075 ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
010076 if( rc ){
010077 releasePage(pRoot);
010078 return rc;
010079 }
010080
010081 /* When the new root page was allocated, page 1 was made writable in
010082 ** order either to increase the database filesize, or to decrement the
010083 ** freelist count. Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
010084 */
010085 assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
010086 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
010087 if( NEVER(rc) ){
010088 releasePage(pRoot);
010089 return rc;
010090 }
010091
010092 }else{
010093 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
010094 if( rc ) return rc;
010095 }
010096 #endif
010097 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
010098 if( createTabFlags & BTREE_INTKEY ){
010099 ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
010100 }else{
010101 ptfFlags = PTF_ZERODATA | PTF_LEAF;
010102 }
010103 zeroPage(pRoot, ptfFlags);
010104 sqlite3PagerUnref(pRoot->pDbPage);
010105 assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
010106 *piTable = pgnoRoot;
010107 return SQLITE_OK;
010108 }
010109 int sqlite3BtreeCreateTable(Btree *p, Pgno *piTable, int flags){
010110 int rc;
010111 sqlite3BtreeEnter(p);
010112 rc = btreeCreateTable(p, piTable, flags);
010113 sqlite3BtreeLeave(p);
010114 return rc;
010115 }
010116
010117 /*
010118 ** Erase the given database page and all its children. Return
010119 ** the page to the freelist.
010120 */
010121 static int clearDatabasePage(
010122 BtShared *pBt, /* The BTree that contains the table */
010123 Pgno pgno, /* Page number to clear */
010124 int freePageFlag, /* Deallocate page if true */
010125 i64 *pnChange /* Add number of Cells freed to this counter */
010126 ){
010127 MemPage *pPage;
010128 int rc;
010129 unsigned char *pCell;
010130 int i;
010131 int hdr;
010132 CellInfo info;
010133
010134 assert( sqlite3_mutex_held(pBt->mutex) );
010135 if( pgno>btreePagecount(pBt) ){
010136 return SQLITE_CORRUPT_PGNO(pgno);
010137 }
010138 rc = getAndInitPage(pBt, pgno, &pPage, 0);
010139 if( rc ) return rc;
010140 if( (pBt->openFlags & BTREE_SINGLE)==0
010141 && sqlite3PagerPageRefcount(pPage->pDbPage) != (1 + (pgno==1))
010142 ){
010143 rc = SQLITE_CORRUPT_PAGE(pPage);
010144 goto cleardatabasepage_out;
010145 }
010146 hdr = pPage->hdrOffset;
010147 for(i=0; i<pPage->nCell; i++){
010148 pCell = findCell(pPage, i);
010149 if( !pPage->leaf ){
010150 rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
010151 if( rc ) goto cleardatabasepage_out;
010152 }
010153 BTREE_CLEAR_CELL(rc, pPage, pCell, info);
010154 if( rc ) goto cleardatabasepage_out;
010155 }
010156 if( !pPage->leaf ){
010157 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange);
010158 if( rc ) goto cleardatabasepage_out;
010159 if( pPage->intKey ) pnChange = 0;
010160 }
010161 if( pnChange ){
010162 testcase( !pPage->intKey );
010163 *pnChange += pPage->nCell;
010164 }
010165 if( freePageFlag ){
010166 freePage(pPage, &rc);
010167 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
010168 zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF);
010169 }
010170
010171 cleardatabasepage_out:
010172 releasePage(pPage);
010173 return rc;
010174 }
010175
010176 /*
010177 ** Delete all information from a single table in the database. iTable is
010178 ** the page number of the root of the table. After this routine returns,
010179 ** the root page is empty, but still exists.
010180 **
010181 ** This routine will fail with SQLITE_LOCKED if there are any open
010182 ** read cursors on the table. Open write cursors are moved to the
010183 ** root of the table.
010184 **
010185 ** If pnChange is not NULL, then the integer value pointed to by pnChange
010186 ** is incremented by the number of entries in the table.
010187 */
010188 int sqlite3BtreeClearTable(Btree *p, int iTable, i64 *pnChange){
010189 int rc;
010190 BtShared *pBt = p->pBt;
010191 sqlite3BtreeEnter(p);
010192 assert( p->inTrans==TRANS_WRITE );
010193
010194 rc = saveAllCursors(pBt, (Pgno)iTable, 0);
010195
010196 if( SQLITE_OK==rc ){
010197 /* Invalidate all incrblob cursors open on table iTable (assuming iTable
010198 ** is the root of a table b-tree - if it is not, the following call is
010199 ** a no-op). */
010200 if( p->hasIncrblobCur ){
010201 invalidateIncrblobCursors(p, (Pgno)iTable, 0, 1);
010202 }
010203 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
010204 }
010205 sqlite3BtreeLeave(p);
010206 return rc;
010207 }
010208
010209 /*
010210 ** Delete all information from the single table that pCur is open on.
010211 **
010212 ** This routine only work for pCur on an ephemeral table.
010213 */
010214 int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){
010215 return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0);
010216 }
010217
010218 /*
010219 ** Erase all information in a table and add the root of the table to
010220 ** the freelist. Except, the root of the principle table (the one on
010221 ** page 1) is never added to the freelist.
010222 **
010223 ** This routine will fail with SQLITE_LOCKED if there are any open
010224 ** cursors on the table.
010225 **
010226 ** If AUTOVACUUM is enabled and the page at iTable is not the last
010227 ** root page in the database file, then the last root page
010228 ** in the database file is moved into the slot formerly occupied by
010229 ** iTable and that last slot formerly occupied by the last root page
010230 ** is added to the freelist instead of iTable. In this say, all
010231 ** root pages are kept at the beginning of the database file, which
010232 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the
010233 ** page number that used to be the last root page in the file before
010234 ** the move. If no page gets moved, *piMoved is set to 0.
010235 ** The last root page is recorded in meta[3] and the value of
010236 ** meta[3] is updated by this procedure.
010237 */
010238 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
010239 int rc;
010240 MemPage *pPage = 0;
010241 BtShared *pBt = p->pBt;
010242
010243 assert( sqlite3BtreeHoldsMutex(p) );
010244 assert( p->inTrans==TRANS_WRITE );
010245 assert( iTable>=2 );
010246 if( iTable>btreePagecount(pBt) ){
010247 return SQLITE_CORRUPT_PGNO(iTable);
010248 }
010249
010250 rc = sqlite3BtreeClearTable(p, iTable, 0);
010251 if( rc ) return rc;
010252 rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
010253 if( NEVER(rc) ){
010254 releasePage(pPage);
010255 return rc;
010256 }
010257
010258 *piMoved = 0;
010259
010260 #ifdef SQLITE_OMIT_AUTOVACUUM
010261 freePage(pPage, &rc);
010262 releasePage(pPage);
010263 #else
010264 if( pBt->autoVacuum ){
010265 Pgno maxRootPgno;
010266 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);
010267
010268 if( iTable==maxRootPgno ){
010269 /* If the table being dropped is the table with the largest root-page
010270 ** number in the database, put the root page on the free list.
010271 */
010272 freePage(pPage, &rc);
010273 releasePage(pPage);
010274 if( rc!=SQLITE_OK ){
010275 return rc;
010276 }
010277 }else{
010278 /* The table being dropped does not have the largest root-page
010279 ** number in the database. So move the page that does into the
010280 ** gap left by the deleted root-page.
010281 */
010282 MemPage *pMove;
010283 releasePage(pPage);
010284 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
010285 if( rc!=SQLITE_OK ){
010286 return rc;
010287 }
010288 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
010289 releasePage(pMove);
010290 if( rc!=SQLITE_OK ){
010291 return rc;
010292 }
010293 pMove = 0;
010294 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
010295 freePage(pMove, &rc);
010296 releasePage(pMove);
010297 if( rc!=SQLITE_OK ){
010298 return rc;
010299 }
010300 *piMoved = maxRootPgno;
010301 }
010302
010303 /* Set the new 'max-root-page' value in the database header. This
010304 ** is the old value less one, less one more if that happens to
010305 ** be a root-page number, less one again if that is the
010306 ** PENDING_BYTE_PAGE.
010307 */
010308 maxRootPgno--;
010309 while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
010310 || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
010311 maxRootPgno--;
010312 }
010313 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
010314
010315 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
010316 }else{
010317 freePage(pPage, &rc);
010318 releasePage(pPage);
010319 }
010320 #endif
010321 return rc;
010322 }
010323 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
010324 int rc;
010325 sqlite3BtreeEnter(p);
010326 rc = btreeDropTable(p, iTable, piMoved);
010327 sqlite3BtreeLeave(p);
010328 return rc;
010329 }
010330
010331
010332 /*
010333 ** This function may only be called if the b-tree connection already
010334 ** has a read or write transaction open on the database.
010335 **
010336 ** Read the meta-information out of a database file. Meta[0]
010337 ** is the number of free pages currently in the database. Meta[1]
010338 ** through meta[15] are available for use by higher layers. Meta[0]
010339 ** is read-only, the others are read/write.
010340 **
010341 ** The schema layer numbers meta values differently. At the schema
010342 ** layer (and the SetCookie and ReadCookie opcodes) the number of
010343 ** free pages is not visible. So Cookie[0] is the same as Meta[1].
010344 **
010345 ** This routine treats Meta[BTREE_DATA_VERSION] as a special case. Instead
010346 ** of reading the value out of the header, it instead loads the "DataVersion"
010347 ** from the pager. The BTREE_DATA_VERSION value is not actually stored in the
010348 ** database file. It is a number computed by the pager. But its access
010349 ** pattern is the same as header meta values, and so it is convenient to
010350 ** read it from this routine.
010351 */
010352 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
010353 BtShared *pBt = p->pBt;
010354
010355 sqlite3BtreeEnter(p);
010356 assert( p->inTrans>TRANS_NONE );
010357 assert( SQLITE_OK==querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK) );
010358 assert( pBt->pPage1 );
010359 assert( idx>=0 && idx<=15 );
010360
010361 if( idx==BTREE_DATA_VERSION ){
010362 *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iBDataVersion;
010363 }else{
010364 *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
010365 }
010366
010367 /* If auto-vacuum is disabled in this build and this is an auto-vacuum
010368 ** database, mark the database as read-only. */
010369 #ifdef SQLITE_OMIT_AUTOVACUUM
010370 if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){
010371 pBt->btsFlags |= BTS_READ_ONLY;
010372 }
010373 #endif
010374
010375 sqlite3BtreeLeave(p);
010376 }
010377
010378 /*
010379 ** Write meta-information back into the database. Meta[0] is
010380 ** read-only and may not be written.
010381 */
010382 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
010383 BtShared *pBt = p->pBt;
010384 unsigned char *pP1;
010385 int rc;
010386 assert( idx>=1 && idx<=15 );
010387 sqlite3BtreeEnter(p);
010388 assert( p->inTrans==TRANS_WRITE );
010389 assert( pBt->pPage1!=0 );
010390 pP1 = pBt->pPage1->aData;
010391 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
010392 if( rc==SQLITE_OK ){
010393 put4byte(&pP1[36 + idx*4], iMeta);
010394 #ifndef SQLITE_OMIT_AUTOVACUUM
010395 if( idx==BTREE_INCR_VACUUM ){
010396 assert( pBt->autoVacuum || iMeta==0 );
010397 assert( iMeta==0 || iMeta==1 );
010398 pBt->incrVacuum = (u8)iMeta;
010399 }
010400 #endif
010401 }
010402 sqlite3BtreeLeave(p);
010403 return rc;
010404 }
010405
010406 /*
010407 ** The first argument, pCur, is a cursor opened on some b-tree. Count the
010408 ** number of entries in the b-tree and write the result to *pnEntry.
010409 **
010410 ** SQLITE_OK is returned if the operation is successfully executed.
010411 ** Otherwise, if an error is encountered (i.e. an IO error or database
010412 ** corruption) an SQLite error code is returned.
010413 */
010414 int sqlite3BtreeCount(sqlite3 *db, BtCursor *pCur, i64 *pnEntry){
010415 i64 nEntry = 0; /* Value to return in *pnEntry */
010416 int rc; /* Return code */
010417
010418 rc = moveToRoot(pCur);
010419 if( rc==SQLITE_EMPTY ){
010420 *pnEntry = 0;
010421 return SQLITE_OK;
010422 }
010423
010424 /* Unless an error occurs, the following loop runs one iteration for each
010425 ** page in the B-Tree structure (not including overflow pages).
010426 */
010427 while( rc==SQLITE_OK && !AtomicLoad(&db->u1.isInterrupted) ){
010428 int iIdx; /* Index of child node in parent */
010429 MemPage *pPage; /* Current page of the b-tree */
010430
010431 /* If this is a leaf page or the tree is not an int-key tree, then
010432 ** this page contains countable entries. Increment the entry counter
010433 ** accordingly.
010434 */
010435 pPage = pCur->pPage;
010436 if( pPage->leaf || !pPage->intKey ){
010437 nEntry += pPage->nCell;
010438 }
010439
010440 /* pPage is a leaf node. This loop navigates the cursor so that it
010441 ** points to the first interior cell that it points to the parent of
010442 ** the next page in the tree that has not yet been visited. The
010443 ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
010444 ** of the page, or to the number of cells in the page if the next page
010445 ** to visit is the right-child of its parent.
010446 **
010447 ** If all pages in the tree have been visited, return SQLITE_OK to the
010448 ** caller.
010449 */
010450 if( pPage->leaf ){
010451 do {
010452 if( pCur->iPage==0 ){
010453 /* All pages of the b-tree have been visited. Return successfully. */
010454 *pnEntry = nEntry;
010455 return moveToRoot(pCur);
010456 }
010457 moveToParent(pCur);
010458 }while ( pCur->ix>=pCur->pPage->nCell );
010459
010460 pCur->ix++;
010461 pPage = pCur->pPage;
010462 }
010463
010464 /* Descend to the child node of the cell that the cursor currently
010465 ** points at. This is the right-child if (iIdx==pPage->nCell).
010466 */
010467 iIdx = pCur->ix;
010468 if( iIdx==pPage->nCell ){
010469 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
010470 }else{
010471 rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
010472 }
010473 }
010474
010475 /* An error has occurred. Return an error code. */
010476 return rc;
010477 }
010478
010479 /*
010480 ** Return the pager associated with a BTree. This routine is used for
010481 ** testing and debugging only.
010482 */
010483 Pager *sqlite3BtreePager(Btree *p){
010484 return p->pBt->pPager;
010485 }
010486
010487 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
010488 /*
010489 ** Record an OOM error during integrity_check
010490 */
010491 static void checkOom(IntegrityCk *pCheck){
010492 pCheck->rc = SQLITE_NOMEM;
010493 pCheck->mxErr = 0; /* Causes integrity_check processing to stop */
010494 if( pCheck->nErr==0 ) pCheck->nErr++;
010495 }
010496
010497 /*
010498 ** Invoke the progress handler, if appropriate. Also check for an
010499 ** interrupt.
010500 */
010501 static void checkProgress(IntegrityCk *pCheck){
010502 sqlite3 *db = pCheck->db;
010503 if( AtomicLoad(&db->u1.isInterrupted) ){
010504 pCheck->rc = SQLITE_INTERRUPT;
010505 pCheck->nErr++;
010506 pCheck->mxErr = 0;
010507 }
010508 #ifndef SQLITE_OMIT_PROGRESS_CALLBACK
010509 if( db->xProgress ){
010510 assert( db->nProgressOps>0 );
010511 pCheck->nStep++;
010512 if( (pCheck->nStep % db->nProgressOps)==0
010513 && db->xProgress(db->pProgressArg)
010514 ){
010515 pCheck->rc = SQLITE_INTERRUPT;
010516 pCheck->nErr++;
010517 pCheck->mxErr = 0;
010518 }
010519 }
010520 #endif
010521 }
010522
010523 /*
010524 ** Append a message to the error message string.
010525 */
010526 static void checkAppendMsg(
010527 IntegrityCk *pCheck,
010528 const char *zFormat,
010529 ...
010530 ){
010531 va_list ap;
010532 checkProgress(pCheck);
010533 if( !pCheck->mxErr ) return;
010534 pCheck->mxErr--;
010535 pCheck->nErr++;
010536 va_start(ap, zFormat);
010537 if( pCheck->errMsg.nChar ){
010538 sqlite3_str_append(&pCheck->errMsg, "\n", 1);
010539 }
010540 if( pCheck->zPfx ){
010541 sqlite3_str_appendf(&pCheck->errMsg, pCheck->zPfx,
010542 pCheck->v0, pCheck->v1, pCheck->v2);
010543 }
010544 sqlite3_str_vappendf(&pCheck->errMsg, zFormat, ap);
010545 va_end(ap);
010546 if( pCheck->errMsg.accError==SQLITE_NOMEM ){
010547 checkOom(pCheck);
010548 }
010549 }
010550 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
010551
010552 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
010553
010554 /*
010555 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that
010556 ** corresponds to page iPg is already set.
010557 */
010558 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){
010559 assert( pCheck->aPgRef!=0 );
010560 assert( iPg<=pCheck->nCkPage && sizeof(pCheck->aPgRef[0])==1 );
010561 return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));
010562 }
010563
010564 /*
010565 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.
010566 */
010567 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){
010568 assert( pCheck->aPgRef!=0 );
010569 assert( iPg<=pCheck->nCkPage && sizeof(pCheck->aPgRef[0])==1 );
010570 pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07));
010571 }
010572
010573
010574 /*
010575 ** Add 1 to the reference count for page iPage. If this is the second
010576 ** reference to the page, add an error message to pCheck->zErrMsg.
010577 ** Return 1 if there are 2 or more references to the page and 0 if
010578 ** if this is the first reference to the page.
010579 **
010580 ** Also check that the page number is in bounds.
010581 */
010582 static int checkRef(IntegrityCk *pCheck, Pgno iPage){
010583 if( iPage>pCheck->nCkPage || iPage==0 ){
010584 checkAppendMsg(pCheck, "invalid page number %u", iPage);
010585 return 1;
010586 }
010587 if( getPageReferenced(pCheck, iPage) ){
010588 checkAppendMsg(pCheck, "2nd reference to page %u", iPage);
010589 return 1;
010590 }
010591 setPageReferenced(pCheck, iPage);
010592 return 0;
010593 }
010594
010595 #ifndef SQLITE_OMIT_AUTOVACUUM
010596 /*
010597 ** Check that the entry in the pointer-map for page iChild maps to
010598 ** page iParent, pointer type ptrType. If not, append an error message
010599 ** to pCheck.
010600 */
010601 static void checkPtrmap(
010602 IntegrityCk *pCheck, /* Integrity check context */
010603 Pgno iChild, /* Child page number */
010604 u8 eType, /* Expected pointer map type */
010605 Pgno iParent /* Expected pointer map parent page number */
010606 ){
010607 int rc;
010608 u8 ePtrmapType;
010609 Pgno iPtrmapParent;
010610
010611 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
010612 if( rc!=SQLITE_OK ){
010613 if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) checkOom(pCheck);
010614 checkAppendMsg(pCheck, "Failed to read ptrmap key=%u", iChild);
010615 return;
010616 }
010617
010618 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
010619 checkAppendMsg(pCheck,
010620 "Bad ptr map entry key=%u expected=(%u,%u) got=(%u,%u)",
010621 iChild, eType, iParent, ePtrmapType, iPtrmapParent);
010622 }
010623 }
010624 #endif
010625
010626 /*
010627 ** Check the integrity of the freelist or of an overflow page list.
010628 ** Verify that the number of pages on the list is N.
010629 */
010630 static void checkList(
010631 IntegrityCk *pCheck, /* Integrity checking context */
010632 int isFreeList, /* True for a freelist. False for overflow page list */
010633 Pgno iPage, /* Page number for first page in the list */
010634 u32 N /* Expected number of pages in the list */
010635 ){
010636 int i;
010637 u32 expected = N;
010638 int nErrAtStart = pCheck->nErr;
010639 while( iPage!=0 && pCheck->mxErr ){
010640 DbPage *pOvflPage;
010641 unsigned char *pOvflData;
010642 if( checkRef(pCheck, iPage) ) break;
010643 N--;
010644 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){
010645 checkAppendMsg(pCheck, "failed to get page %u", iPage);
010646 break;
010647 }
010648 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
010649 if( isFreeList ){
010650 u32 n = (u32)get4byte(&pOvflData[4]);
010651 #ifndef SQLITE_OMIT_AUTOVACUUM
010652 if( pCheck->pBt->autoVacuum ){
010653 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0);
010654 }
010655 #endif
010656 if( n>pCheck->pBt->usableSize/4-2 ){
010657 checkAppendMsg(pCheck,
010658 "freelist leaf count too big on page %u", iPage);
010659 N--;
010660 }else{
010661 for(i=0; i<(int)n; i++){
010662 Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
010663 #ifndef SQLITE_OMIT_AUTOVACUUM
010664 if( pCheck->pBt->autoVacuum ){
010665 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0);
010666 }
010667 #endif
010668 checkRef(pCheck, iFreePage);
010669 }
010670 N -= n;
010671 }
010672 }
010673 #ifndef SQLITE_OMIT_AUTOVACUUM
010674 else{
010675 /* If this database supports auto-vacuum and iPage is not the last
010676 ** page in this overflow list, check that the pointer-map entry for
010677 ** the following page matches iPage.
010678 */
010679 if( pCheck->pBt->autoVacuum && N>0 ){
010680 i = get4byte(pOvflData);
010681 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage);
010682 }
010683 }
010684 #endif
010685 iPage = get4byte(pOvflData);
010686 sqlite3PagerUnref(pOvflPage);
010687 }
010688 if( N && nErrAtStart==pCheck->nErr ){
010689 checkAppendMsg(pCheck,
010690 "%s is %u but should be %u",
010691 isFreeList ? "size" : "overflow list length",
010692 expected-N, expected);
010693 }
010694 }
010695 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
010696
010697 /*
010698 ** An implementation of a min-heap.
010699 **
010700 ** aHeap[0] is the number of elements on the heap. aHeap[1] is the
010701 ** root element. The daughter nodes of aHeap[N] are aHeap[N*2]
010702 ** and aHeap[N*2+1].
010703 **
010704 ** The heap property is this: Every node is less than or equal to both
010705 ** of its daughter nodes. A consequence of the heap property is that the
010706 ** root node aHeap[1] is always the minimum value currently in the heap.
010707 **
010708 ** The btreeHeapInsert() routine inserts an unsigned 32-bit number onto
010709 ** the heap, preserving the heap property. The btreeHeapPull() routine
010710 ** removes the root element from the heap (the minimum value in the heap)
010711 ** and then moves other nodes around as necessary to preserve the heap
010712 ** property.
010713 **
010714 ** This heap is used for cell overlap and coverage testing. Each u32
010715 ** entry represents the span of a cell or freeblock on a btree page.
010716 ** The upper 16 bits are the index of the first byte of a range and the
010717 ** lower 16 bits are the index of the last byte of that range.
010718 */
010719 static void btreeHeapInsert(u32 *aHeap, u32 x){
010720 u32 j, i;
010721 assert( aHeap!=0 );
010722 i = ++aHeap[0];
010723 aHeap[i] = x;
010724 while( (j = i/2)>0 && aHeap[j]>aHeap[i] ){
010725 x = aHeap[j];
010726 aHeap[j] = aHeap[i];
010727 aHeap[i] = x;
010728 i = j;
010729 }
010730 }
010731 static int btreeHeapPull(u32 *aHeap, u32 *pOut){
010732 u32 j, i, x;
010733 if( (x = aHeap[0])==0 ) return 0;
010734 *pOut = aHeap[1];
010735 aHeap[1] = aHeap[x];
010736 aHeap[x] = 0xffffffff;
010737 aHeap[0]--;
010738 i = 1;
010739 while( (j = i*2)<=aHeap[0] ){
010740 if( aHeap[j]>aHeap[j+1] ) j++;
010741 if( aHeap[i]<aHeap[j] ) break;
010742 x = aHeap[i];
010743 aHeap[i] = aHeap[j];
010744 aHeap[j] = x;
010745 i = j;
010746 }
010747 return 1;
010748 }
010749
010750 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
010751 /*
010752 ** Do various sanity checks on a single page of a tree. Return
010753 ** the tree depth. Root pages return 0. Parents of root pages
010754 ** return 1, and so forth.
010755 **
010756 ** These checks are done:
010757 **
010758 ** 1. Make sure that cells and freeblocks do not overlap
010759 ** but combine to completely cover the page.
010760 ** 2. Make sure integer cell keys are in order.
010761 ** 3. Check the integrity of overflow pages.
010762 ** 4. Recursively call checkTreePage on all children.
010763 ** 5. Verify that the depth of all children is the same.
010764 */
010765 static int checkTreePage(
010766 IntegrityCk *pCheck, /* Context for the sanity check */
010767 Pgno iPage, /* Page number of the page to check */
010768 i64 *piMinKey, /* Write minimum integer primary key here */
010769 i64 maxKey /* Error if integer primary key greater than this */
010770 ){
010771 MemPage *pPage = 0; /* The page being analyzed */
010772 int i; /* Loop counter */
010773 int rc; /* Result code from subroutine call */
010774 int depth = -1, d2; /* Depth of a subtree */
010775 int pgno; /* Page number */
010776 int nFrag; /* Number of fragmented bytes on the page */
010777 int hdr; /* Offset to the page header */
010778 int cellStart; /* Offset to the start of the cell pointer array */
010779 int nCell; /* Number of cells */
010780 int doCoverageCheck = 1; /* True if cell coverage checking should be done */
010781 int keyCanBeEqual = 1; /* True if IPK can be equal to maxKey
010782 ** False if IPK must be strictly less than maxKey */
010783 u8 *data; /* Page content */
010784 u8 *pCell; /* Cell content */
010785 u8 *pCellIdx; /* Next element of the cell pointer array */
010786 BtShared *pBt; /* The BtShared object that owns pPage */
010787 u32 pc; /* Address of a cell */
010788 u32 usableSize; /* Usable size of the page */
010789 u32 contentOffset; /* Offset to the start of the cell content area */
010790 u32 *heap = 0; /* Min-heap used for checking cell coverage */
010791 u32 x, prev = 0; /* Next and previous entry on the min-heap */
010792 const char *saved_zPfx = pCheck->zPfx;
010793 int saved_v1 = pCheck->v1;
010794 int saved_v2 = pCheck->v2;
010795 u8 savedIsInit = 0;
010796
010797 /* Check that the page exists
010798 */
010799 checkProgress(pCheck);
010800 if( pCheck->mxErr==0 ) goto end_of_check;
010801 pBt = pCheck->pBt;
010802 usableSize = pBt->usableSize;
010803 if( iPage==0 ) return 0;
010804 if( checkRef(pCheck, iPage) ) return 0;
010805 pCheck->zPfx = "Tree %u page %u: ";
010806 pCheck->v1 = iPage;
010807 if( (rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0 ){
010808 checkAppendMsg(pCheck,
010809 "unable to get the page. error code=%d", rc);
010810 if( rc==SQLITE_IOERR_NOMEM ) pCheck->rc = SQLITE_NOMEM;
010811 goto end_of_check;
010812 }
010813
010814 /* Clear MemPage.isInit to make sure the corruption detection code in
010815 ** btreeInitPage() is executed. */
010816 savedIsInit = pPage->isInit;
010817 pPage->isInit = 0;
010818 if( (rc = btreeInitPage(pPage))!=0 ){
010819 assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */
010820 checkAppendMsg(pCheck,
010821 "btreeInitPage() returns error code %d", rc);
010822 goto end_of_check;
010823 }
010824 if( (rc = btreeComputeFreeSpace(pPage))!=0 ){
010825 assert( rc==SQLITE_CORRUPT );
010826 checkAppendMsg(pCheck, "free space corruption", rc);
010827 goto end_of_check;
010828 }
010829 data = pPage->aData;
010830 hdr = pPage->hdrOffset;
010831
010832 /* Set up for cell analysis */
010833 pCheck->zPfx = "Tree %u page %u cell %u: ";
010834 contentOffset = get2byteNotZero(&data[hdr+5]);
010835 assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */
010836
010837 /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
010838 ** number of cells on the page. */
010839 nCell = get2byte(&data[hdr+3]);
010840 assert( pPage->nCell==nCell );
010841 if( pPage->leaf || pPage->intKey==0 ){
010842 pCheck->nRow += nCell;
010843 }
010844
010845 /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page
010846 ** immediately follows the b-tree page header. */
010847 cellStart = hdr + 12 - 4*pPage->leaf;
010848 assert( pPage->aCellIdx==&data[cellStart] );
010849 pCellIdx = &data[cellStart + 2*(nCell-1)];
010850
010851 if( !pPage->leaf ){
010852 /* Analyze the right-child page of internal pages */
010853 pgno = get4byte(&data[hdr+8]);
010854 #ifndef SQLITE_OMIT_AUTOVACUUM
010855 if( pBt->autoVacuum ){
010856 pCheck->zPfx = "Tree %u page %u right child: ";
010857 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
010858 }
010859 #endif
010860 depth = checkTreePage(pCheck, pgno, &maxKey, maxKey);
010861 keyCanBeEqual = 0;
010862 }else{
010863 /* For leaf pages, the coverage check will occur in the same loop
010864 ** as the other cell checks, so initialize the heap. */
010865 heap = pCheck->heap;
010866 heap[0] = 0;
010867 }
010868
010869 /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte
010870 ** integer offsets to the cell contents. */
010871 for(i=nCell-1; i>=0 && pCheck->mxErr; i--){
010872 CellInfo info;
010873
010874 /* Check cell size */
010875 pCheck->v2 = i;
010876 assert( pCellIdx==&data[cellStart + i*2] );
010877 pc = get2byteAligned(pCellIdx);
010878 pCellIdx -= 2;
010879 if( pc<contentOffset || pc>usableSize-4 ){
010880 checkAppendMsg(pCheck, "Offset %u out of range %u..%u",
010881 pc, contentOffset, usableSize-4);
010882 doCoverageCheck = 0;
010883 continue;
010884 }
010885 pCell = &data[pc];
010886 pPage->xParseCell(pPage, pCell, &info);
010887 if( pc+info.nSize>usableSize ){
010888 checkAppendMsg(pCheck, "Extends off end of page");
010889 doCoverageCheck = 0;
010890 continue;
010891 }
010892
010893 /* Check for integer primary key out of range */
010894 if( pPage->intKey ){
010895 if( keyCanBeEqual ? (info.nKey > maxKey) : (info.nKey >= maxKey) ){
010896 checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey);
010897 }
010898 maxKey = info.nKey;
010899 keyCanBeEqual = 0; /* Only the first key on the page may ==maxKey */
010900 }
010901
010902 /* Check the content overflow list */
010903 if( info.nPayload>info.nLocal ){
010904 u32 nPage; /* Number of pages on the overflow chain */
010905 Pgno pgnoOvfl; /* First page of the overflow chain */
010906 assert( pc + info.nSize - 4 <= usableSize );
010907 nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4);
010908 pgnoOvfl = get4byte(&pCell[info.nSize - 4]);
010909 #ifndef SQLITE_OMIT_AUTOVACUUM
010910 if( pBt->autoVacuum ){
010911 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage);
010912 }
010913 #endif
010914 checkList(pCheck, 0, pgnoOvfl, nPage);
010915 }
010916
010917 if( !pPage->leaf ){
010918 /* Check sanity of left child page for internal pages */
010919 pgno = get4byte(pCell);
010920 #ifndef SQLITE_OMIT_AUTOVACUUM
010921 if( pBt->autoVacuum ){
010922 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
010923 }
010924 #endif
010925 d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey);
010926 keyCanBeEqual = 0;
010927 if( d2!=depth ){
010928 checkAppendMsg(pCheck, "Child page depth differs");
010929 depth = d2;
010930 }
010931 }else{
010932 /* Populate the coverage-checking heap for leaf pages */
010933 btreeHeapInsert(heap, (pc<<16)|(pc+info.nSize-1));
010934 }
010935 }
010936 *piMinKey = maxKey;
010937
010938 /* Check for complete coverage of the page
010939 */
010940 pCheck->zPfx = 0;
010941 if( doCoverageCheck && pCheck->mxErr>0 ){
010942 /* For leaf pages, the min-heap has already been initialized and the
010943 ** cells have already been inserted. But for internal pages, that has
010944 ** not yet been done, so do it now */
010945 if( !pPage->leaf ){
010946 heap = pCheck->heap;
010947 heap[0] = 0;
010948 for(i=nCell-1; i>=0; i--){
010949 u32 size;
010950 pc = get2byteAligned(&data[cellStart+i*2]);
010951 size = pPage->xCellSize(pPage, &data[pc]);
010952 btreeHeapInsert(heap, (pc<<16)|(pc+size-1));
010953 }
010954 }
010955 assert( heap!=0 );
010956 /* Add the freeblocks to the min-heap
010957 **
010958 ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header
010959 ** is the offset of the first freeblock, or zero if there are no
010960 ** freeblocks on the page.
010961 */
010962 i = get2byte(&data[hdr+1]);
010963 while( i>0 ){
010964 int size, j;
010965 assert( (u32)i<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */
010966 size = get2byte(&data[i+2]);
010967 assert( (u32)(i+size)<=usableSize ); /* due to btreeComputeFreeSpace() */
010968 btreeHeapInsert(heap, (((u32)i)<<16)|(i+size-1));
010969 /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a
010970 ** big-endian integer which is the offset in the b-tree page of the next
010971 ** freeblock in the chain, or zero if the freeblock is the last on the
010972 ** chain. */
010973 j = get2byte(&data[i]);
010974 /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
010975 ** increasing offset. */
010976 assert( j==0 || j>i+size ); /* Enforced by btreeComputeFreeSpace() */
010977 assert( (u32)j<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */
010978 i = j;
010979 }
010980 /* Analyze the min-heap looking for overlap between cells and/or
010981 ** freeblocks, and counting the number of untracked bytes in nFrag.
010982 **
010983 ** Each min-heap entry is of the form: (start_address<<16)|end_address.
010984 ** There is an implied first entry the covers the page header, the cell
010985 ** pointer index, and the gap between the cell pointer index and the start
010986 ** of cell content.
010987 **
010988 ** The loop below pulls entries from the min-heap in order and compares
010989 ** the start_address against the previous end_address. If there is an
010990 ** overlap, that means bytes are used multiple times. If there is a gap,
010991 ** that gap is added to the fragmentation count.
010992 */
010993 nFrag = 0;
010994 prev = contentOffset - 1; /* Implied first min-heap entry */
010995 while( btreeHeapPull(heap,&x) ){
010996 if( (prev&0xffff)>=(x>>16) ){
010997 checkAppendMsg(pCheck,
010998 "Multiple uses for byte %u of page %u", x>>16, iPage);
010999 break;
011000 }else{
011001 nFrag += (x>>16) - (prev&0xffff) - 1;
011002 prev = x;
011003 }
011004 }
011005 nFrag += usableSize - (prev&0xffff) - 1;
011006 /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments
011007 ** is stored in the fifth field of the b-tree page header.
011008 ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the
011009 ** number of fragmented free bytes within the cell content area.
011010 */
011011 if( heap[0]==0 && nFrag!=data[hdr+7] ){
011012 checkAppendMsg(pCheck,
011013 "Fragmentation of %u bytes reported as %u on page %u",
011014 nFrag, data[hdr+7], iPage);
011015 }
011016 }
011017
011018 end_of_check:
011019 if( !doCoverageCheck ) pPage->isInit = savedIsInit;
011020 releasePage(pPage);
011021 pCheck->zPfx = saved_zPfx;
011022 pCheck->v1 = saved_v1;
011023 pCheck->v2 = saved_v2;
011024 return depth+1;
011025 }
011026 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
011027
011028 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
011029 /*
011030 ** This routine does a complete check of the given BTree file. aRoot[] is
011031 ** an array of pages numbers were each page number is the root page of
011032 ** a table. nRoot is the number of entries in aRoot.
011033 **
011034 ** A read-only or read-write transaction must be opened before calling
011035 ** this function.
011036 **
011037 ** Write the number of error seen in *pnErr. Except for some memory
011038 ** allocation errors, an error message held in memory obtained from
011039 ** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is
011040 ** returned. If a memory allocation error occurs, NULL is returned.
011041 **
011042 ** If the first entry in aRoot[] is 0, that indicates that the list of
011043 ** root pages is incomplete. This is a "partial integrity-check". This
011044 ** happens when performing an integrity check on a single table. The
011045 ** zero is skipped, of course. But in addition, the freelist checks
011046 ** and the checks to make sure every page is referenced are also skipped,
011047 ** since obviously it is not possible to know which pages are covered by
011048 ** the unverified btrees. Except, if aRoot[1] is 1, then the freelist
011049 ** checks are still performed.
011050 */
011051 int sqlite3BtreeIntegrityCheck(
011052 sqlite3 *db, /* Database connection that is running the check */
011053 Btree *p, /* The btree to be checked */
011054 Pgno *aRoot, /* An array of root pages numbers for individual trees */
011055 Mem *aCnt, /* Memory cells to write counts for each tree to */
011056 int nRoot, /* Number of entries in aRoot[] */
011057 int mxErr, /* Stop reporting errors after this many */
011058 int *pnErr, /* OUT: Write number of errors seen to this variable */
011059 char **pzOut /* OUT: Write the error message string here */
011060 ){
011061 Pgno i;
011062 IntegrityCk sCheck;
011063 BtShared *pBt = p->pBt;
011064 u64 savedDbFlags = pBt->db->flags;
011065 char zErr[100];
011066 int bPartial = 0; /* True if not checking all btrees */
011067 int bCkFreelist = 1; /* True to scan the freelist */
011068 VVA_ONLY( int nRef );
011069
011070 assert( nRoot>0 );
011071 assert( aCnt!=0 );
011072
011073 /* aRoot[0]==0 means this is a partial check */
011074 if( aRoot[0]==0 ){
011075 assert( nRoot>1 );
011076 bPartial = 1;
011077 if( aRoot[1]!=1 ) bCkFreelist = 0;
011078 }
011079
011080 sqlite3BtreeEnter(p);
011081 assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
011082 VVA_ONLY( nRef = sqlite3PagerRefcount(pBt->pPager) );
011083 assert( nRef>=0 );
011084 memset(&sCheck, 0, sizeof(sCheck));
011085 sCheck.db = db;
011086 sCheck.pBt = pBt;
011087 sCheck.pPager = pBt->pPager;
011088 sCheck.nCkPage = btreePagecount(sCheck.pBt);
011089 sCheck.mxErr = mxErr;
011090 sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH);
011091 sCheck.errMsg.printfFlags = SQLITE_PRINTF_INTERNAL;
011092 if( sCheck.nCkPage==0 ){
011093 goto integrity_ck_cleanup;
011094 }
011095
011096 sCheck.aPgRef = sqlite3MallocZero((sCheck.nCkPage / 8)+ 1);
011097 if( !sCheck.aPgRef ){
011098 checkOom(&sCheck);
011099 goto integrity_ck_cleanup;
011100 }
011101 sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize );
011102 if( sCheck.heap==0 ){
011103 checkOom(&sCheck);
011104 goto integrity_ck_cleanup;
011105 }
011106
011107 i = PENDING_BYTE_PAGE(pBt);
011108 if( i<=sCheck.nCkPage ) setPageReferenced(&sCheck, i);
011109
011110 /* Check the integrity of the freelist
011111 */
011112 if( bCkFreelist ){
011113 sCheck.zPfx = "Freelist: ";
011114 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
011115 get4byte(&pBt->pPage1->aData[36]));
011116 sCheck.zPfx = 0;
011117 }
011118
011119 /* Check all the tables.
011120 */
011121 #ifndef SQLITE_OMIT_AUTOVACUUM
011122 if( !bPartial ){
011123 if( pBt->autoVacuum ){
011124 Pgno mx = 0;
011125 Pgno mxInHdr;
011126 for(i=0; (int)i<nRoot; i++) if( mx<aRoot[i] ) mx = aRoot[i];
011127 mxInHdr = get4byte(&pBt->pPage1->aData[52]);
011128 if( mx!=mxInHdr ){
011129 checkAppendMsg(&sCheck,
011130 "max rootpage (%u) disagrees with header (%u)",
011131 mx, mxInHdr
011132 );
011133 }
011134 }else if( get4byte(&pBt->pPage1->aData[64])!=0 ){
011135 checkAppendMsg(&sCheck,
011136 "incremental_vacuum enabled with a max rootpage of zero"
011137 );
011138 }
011139 }
011140 #endif
011141 testcase( pBt->db->flags & SQLITE_CellSizeCk );
011142 pBt->db->flags &= ~(u64)SQLITE_CellSizeCk;
011143 for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
011144 sCheck.nRow = 0;
011145 if( aRoot[i] ){
011146 i64 notUsed;
011147 #ifndef SQLITE_OMIT_AUTOVACUUM
011148 if( pBt->autoVacuum && aRoot[i]>1 && !bPartial ){
011149 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0);
011150 }
011151 #endif
011152 sCheck.v0 = aRoot[i];
011153 checkTreePage(&sCheck, aRoot[i], ¬Used, LARGEST_INT64);
011154 }
011155 sqlite3MemSetArrayInt64(aCnt, i, sCheck.nRow);
011156 }
011157 pBt->db->flags = savedDbFlags;
011158
011159 /* Make sure every page in the file is referenced
011160 */
011161 if( !bPartial ){
011162 for(i=1; i<=sCheck.nCkPage && sCheck.mxErr; i++){
011163 #ifdef SQLITE_OMIT_AUTOVACUUM
011164 if( getPageReferenced(&sCheck, i)==0 ){
011165 checkAppendMsg(&sCheck, "Page %u: never used", i);
011166 }
011167 #else
011168 /* If the database supports auto-vacuum, make sure no tables contain
011169 ** references to pointer-map pages.
011170 */
011171 if( getPageReferenced(&sCheck, i)==0 &&
011172 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
011173 checkAppendMsg(&sCheck, "Page %u: never used", i);
011174 }
011175 if( getPageReferenced(&sCheck, i)!=0 &&
011176 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
011177 checkAppendMsg(&sCheck, "Page %u: pointer map referenced", i);
011178 }
011179 #endif
011180 }
011181 }
011182
011183 /* Clean up and report errors.
011184 */
011185 integrity_ck_cleanup:
011186 sqlite3PageFree(sCheck.heap);
011187 sqlite3_free(sCheck.aPgRef);
011188 *pnErr = sCheck.nErr;
011189 if( sCheck.nErr==0 ){
011190 sqlite3_str_reset(&sCheck.errMsg);
011191 *pzOut = 0;
011192 }else{
011193 *pzOut = sqlite3StrAccumFinish(&sCheck.errMsg);
011194 }
011195 /* Make sure this analysis did not leave any unref() pages. */
011196 assert( nRef==sqlite3PagerRefcount(pBt->pPager) );
011197 sqlite3BtreeLeave(p);
011198 return sCheck.rc;
011199 }
011200 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
011201
011202 /*
011203 ** Return the full pathname of the underlying database file. Return
011204 ** an empty string if the database is in-memory or a TEMP database.
011205 **
011206 ** The pager filename is invariant as long as the pager is
011207 ** open so it is safe to access without the BtShared mutex.
011208 */
011209 const char *sqlite3BtreeGetFilename(Btree *p){
011210 assert( p->pBt->pPager!=0 );
011211 return sqlite3PagerFilename(p->pBt->pPager, 1);
011212 }
011213
011214 /*
011215 ** Return the pathname of the journal file for this database. The return
011216 ** value of this routine is the same regardless of whether the journal file
011217 ** has been created or not.
011218 **
011219 ** The pager journal filename is invariant as long as the pager is
011220 ** open so it is safe to access without the BtShared mutex.
011221 */
011222 const char *sqlite3BtreeGetJournalname(Btree *p){
011223 assert( p->pBt->pPager!=0 );
011224 return sqlite3PagerJournalname(p->pBt->pPager);
011225 }
011226
011227 /*
011228 ** Return one of SQLITE_TXN_NONE, SQLITE_TXN_READ, or SQLITE_TXN_WRITE
011229 ** to describe the current transaction state of Btree p.
011230 */
011231 int sqlite3BtreeTxnState(Btree *p){
011232 assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
011233 return p ? p->inTrans : 0;
011234 }
011235
011236 #ifndef SQLITE_OMIT_WAL
011237 /*
011238 ** Run a checkpoint on the Btree passed as the first argument.
011239 **
011240 ** Return SQLITE_LOCKED if this or any other connection has an open
011241 ** transaction on the shared-cache the argument Btree is connected to.
011242 **
011243 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
011244 */
011245 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){
011246 int rc = SQLITE_OK;
011247 if( p ){
011248 BtShared *pBt = p->pBt;
011249 sqlite3BtreeEnter(p);
011250 if( pBt->inTransaction!=TRANS_NONE ){
011251 rc = SQLITE_LOCKED;
011252 }else{
011253 rc = sqlite3PagerCheckpoint(pBt->pPager, p->db, eMode, pnLog, pnCkpt);
011254 }
011255 sqlite3BtreeLeave(p);
011256 }
011257 return rc;
011258 }
011259 #endif
011260
011261 /*
011262 ** Return true if there is currently a backup running on Btree p.
011263 */
011264 int sqlite3BtreeIsInBackup(Btree *p){
011265 assert( p );
011266 assert( sqlite3_mutex_held(p->db->mutex) );
011267 return p->nBackup!=0;
011268 }
011269
011270 /*
011271 ** This function returns a pointer to a blob of memory associated with
011272 ** a single shared-btree. The memory is used by client code for its own
011273 ** purposes (for example, to store a high-level schema associated with
011274 ** the shared-btree). The btree layer manages reference counting issues.
011275 **
011276 ** The first time this is called on a shared-btree, nBytes bytes of memory
011277 ** are allocated, zeroed, and returned to the caller. For each subsequent
011278 ** call the nBytes parameter is ignored and a pointer to the same blob
011279 ** of memory returned.
011280 **
011281 ** If the nBytes parameter is 0 and the blob of memory has not yet been
011282 ** allocated, a null pointer is returned. If the blob has already been
011283 ** allocated, it is returned as normal.
011284 **
011285 ** Just before the shared-btree is closed, the function passed as the
011286 ** xFree argument when the memory allocation was made is invoked on the
011287 ** blob of allocated memory. The xFree function should not call sqlite3_free()
011288 ** on the memory, the btree layer does that.
011289 */
011290 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
011291 BtShared *pBt = p->pBt;
011292 sqlite3BtreeEnter(p);
011293 if( !pBt->pSchema && nBytes ){
011294 pBt->pSchema = sqlite3DbMallocZero(0, nBytes);
011295 pBt->xFreeSchema = xFree;
011296 }
011297 sqlite3BtreeLeave(p);
011298 return pBt->pSchema;
011299 }
011300
011301 /*
011302 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
011303 ** btree as the argument handle holds an exclusive lock on the
011304 ** sqlite_schema table. Otherwise SQLITE_OK.
011305 */
011306 int sqlite3BtreeSchemaLocked(Btree *p){
011307 int rc;
011308 assert( sqlite3_mutex_held(p->db->mutex) );
011309 sqlite3BtreeEnter(p);
011310 rc = querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK);
011311 assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
011312 sqlite3BtreeLeave(p);
011313 return rc;
011314 }
011315
011316
011317 #ifndef SQLITE_OMIT_SHARED_CACHE
011318 /*
011319 ** Obtain a lock on the table whose root page is iTab. The
011320 ** lock is a write lock if isWritelock is true or a read lock
011321 ** if it is false.
011322 */
011323 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
011324 int rc = SQLITE_OK;
011325 assert( p->inTrans!=TRANS_NONE );
011326 if( p->sharable ){
011327 u8 lockType = READ_LOCK + isWriteLock;
011328 assert( READ_LOCK+1==WRITE_LOCK );
011329 assert( isWriteLock==0 || isWriteLock==1 );
011330
011331 sqlite3BtreeEnter(p);
011332 rc = querySharedCacheTableLock(p, iTab, lockType);
011333 if( rc==SQLITE_OK ){
011334 rc = setSharedCacheTableLock(p, iTab, lockType);
011335 }
011336 sqlite3BtreeLeave(p);
011337 }
011338 return rc;
011339 }
011340 #endif
011341
011342 #ifndef SQLITE_OMIT_INCRBLOB
011343 /*
011344 ** Argument pCsr must be a cursor opened for writing on an
011345 ** INTKEY table currently pointing at a valid table entry.
011346 ** This function modifies the data stored as part of that entry.
011347 **
011348 ** Only the data content may only be modified, it is not possible to
011349 ** change the length of the data stored. If this function is called with
011350 ** parameters that attempt to write past the end of the existing data,
011351 ** no modifications are made and SQLITE_CORRUPT is returned.
011352 */
011353 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
011354 int rc;
011355 assert( cursorOwnsBtShared(pCsr) );
011356 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
011357 assert( pCsr->curFlags & BTCF_Incrblob );
011358
011359 rc = restoreCursorPosition(pCsr);
011360 if( rc!=SQLITE_OK ){
011361 return rc;
011362 }
011363 assert( pCsr->eState!=CURSOR_REQUIRESEEK );
011364 if( pCsr->eState!=CURSOR_VALID ){
011365 return SQLITE_ABORT;
011366 }
011367
011368 /* Save the positions of all other cursors open on this table. This is
011369 ** required in case any of them are holding references to an xFetch
011370 ** version of the b-tree page modified by the accessPayload call below.
011371 **
011372 ** Note that pCsr must be open on a INTKEY table and saveCursorPosition()
011373 ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence
011374 ** saveAllCursors can only return SQLITE_OK.
011375 */
011376 VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr);
011377 assert( rc==SQLITE_OK );
011378
011379 /* Check some assumptions:
011380 ** (a) the cursor is open for writing,
011381 ** (b) there is a read/write transaction open,
011382 ** (c) the connection holds a write-lock on the table (if required),
011383 ** (d) there are no conflicting read-locks, and
011384 ** (e) the cursor points at a valid row of an intKey table.
011385 */
011386 if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){
011387 return SQLITE_READONLY;
011388 }
011389 assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0
011390 && pCsr->pBt->inTransaction==TRANS_WRITE );
011391 assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
011392 assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
011393 assert( pCsr->pPage->intKey );
011394
011395 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
011396 }
011397
011398 /*
011399 ** Mark this cursor as an incremental blob cursor.
011400 */
011401 void sqlite3BtreeIncrblobCursor(BtCursor *pCur){
011402 pCur->curFlags |= BTCF_Incrblob;
011403 pCur->pBtree->hasIncrblobCur = 1;
011404 }
011405 #endif
011406
011407 /*
011408 ** Set both the "read version" (single byte at byte offset 18) and
011409 ** "write version" (single byte at byte offset 19) fields in the database
011410 ** header to iVersion.
011411 */
011412 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
011413 BtShared *pBt = pBtree->pBt;
011414 int rc; /* Return code */
011415
011416 assert( iVersion==1 || iVersion==2 );
011417
011418 /* If setting the version fields to 1, do not automatically open the
011419 ** WAL connection, even if the version fields are currently set to 2.
011420 */
011421 pBt->btsFlags &= ~BTS_NO_WAL;
011422 if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL;
011423
011424 rc = sqlite3BtreeBeginTrans(pBtree, 0, 0);
011425 if( rc==SQLITE_OK ){
011426 u8 *aData = pBt->pPage1->aData;
011427 if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){
011428 rc = sqlite3BtreeBeginTrans(pBtree, 2, 0);
011429 if( rc==SQLITE_OK ){
011430 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
011431 if( rc==SQLITE_OK ){
011432 aData[18] = (u8)iVersion;
011433 aData[19] = (u8)iVersion;
011434 }
011435 }
011436 }
011437 }
011438
011439 pBt->btsFlags &= ~BTS_NO_WAL;
011440 return rc;
011441 }
011442
011443 /*
011444 ** Return true if the cursor has a hint specified. This routine is
011445 ** only used from within assert() statements
011446 */
011447 int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){
011448 return (pCsr->hints & mask)!=0;
011449 }
011450
011451 /*
011452 ** Return true if the given Btree is read-only.
011453 */
011454 int sqlite3BtreeIsReadonly(Btree *p){
011455 return (p->pBt->btsFlags & BTS_READ_ONLY)!=0;
011456 }
011457
011458 /*
011459 ** Return the size of the header added to each page by this module.
011460 */
011461 int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); }
011462
011463 /*
011464 ** If no transaction is active and the database is not a temp-db, clear
011465 ** the in-memory pager cache.
011466 */
011467 void sqlite3BtreeClearCache(Btree *p){
011468 BtShared *pBt = p->pBt;
011469 if( pBt->inTransaction==TRANS_NONE ){
011470 sqlite3PagerClearCache(pBt->pPager);
011471 }
011472 }
011473
011474 #if !defined(SQLITE_OMIT_SHARED_CACHE)
011475 /*
011476 ** Return true if the Btree passed as the only argument is sharable.
011477 */
011478 int sqlite3BtreeSharable(Btree *p){
011479 return p->sharable;
011480 }
011481
011482 /*
011483 ** Return the number of connections to the BtShared object accessed by
011484 ** the Btree handle passed as the only argument. For private caches
011485 ** this is always 1. For shared caches it may be 1 or greater.
011486 */
011487 int sqlite3BtreeConnectionCount(Btree *p){
011488 testcase( p->sharable );
011489 return p->pBt->nRef;
011490 }
011491 #endif