Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Changes In Branch btree-opt2 Excluding Merge-Ins
This is equivalent to a diff from 4df852ce26 to eed6a33145
2015-06-24
| ||
00:05 | Performance optimization on balance_nonroot() and related routines. 2.6% faster overall with a size increase of less than 750 bytes. (check-in: 25131e7062 user: drh tags: trunk) | |
2015-06-23
| ||
23:31 | Mark some branches as unreachable after the recent change that recognizes mismatch result set sizes on compound SELECT statements sooner. (check-in: c8d1f305b6 user: drh tags: trunk) | |
21:35 | Testability improvement. (Closed-Leaf check-in: eed6a33145 user: drh tags: btree-opt2) | |
18:24 | Multiple overflow cells are always adjacent and sequential. Exploit this invariant for a small size reduction and performance increase and add assert()s to prove the invariant. (check-in: f77f2f48f4 user: drh tags: btree-opt2) | |
15:06 | Merge latest trunk changes with this branch. Add tests for columnsize=0. (check-in: ef44c71a22 user: dan tags: fts5) | |
13:02 | Merge the compound SELECT operator fix from trunk. (check-in: a7be554f4b user: drh tags: btree-opt2) | |
12:19 | Test that the left and right sides of a compound SELECT operator have the same number of expressions in the expanded expression list before beginning to generate code. (check-in: 4df852ce26 user: dan tags: trunk) | |
2015-06-20
| ||
14:11 | Update the fuzztest data using the latest test vectors discovered by AFL. (check-in: b97f9cf73e user: drh tags: trunk) | |
Changes to src/btree.c.
︙ | ︙ | |||
952 953 954 955 956 957 958 | ** the page, 1 means the second cell, and so forth) return a pointer ** to the cell content. ** ** This routine works only for pages that do not contain overflow cells. */ #define findCell(P,I) \ ((P)->aData + ((P)->maskPage & get2byte(&(P)->aCellIdx[2*(I)]))) | < < < < < < < < < < < < < < < < < < < < < < | 952 953 954 955 956 957 958 959 960 961 962 963 964 965 | ** the page, 1 means the second cell, and so forth) return a pointer ** to the cell content. ** ** This routine works only for pages that do not contain overflow cells. */ #define findCell(P,I) \ ((P)->aData + ((P)->maskPage & get2byte(&(P)->aCellIdx[2*(I)]))) /* ** This is common tail processing for btreeParseCellPtr() and ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely ** on a single B-tree page. Make necessary adjustments to the CellInfo ** structure. */ |
︙ | ︙ | |||
1571 1572 1573 1574 1575 1576 1577 | ** ** Check to see if iFreeBlk should be coalesced onto the end of iStart. */ if( iFreeBlk && iEnd+3>=iFreeBlk ){ nFrag = iFreeBlk - iEnd; if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_BKPT; iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]); | | | 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 | ** ** Check to see if iFreeBlk should be coalesced onto the end of iStart. */ if( iFreeBlk && iEnd+3>=iFreeBlk ){ nFrag = iFreeBlk - iEnd; if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_BKPT; iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]); if( NEVER(iEnd > pPage->pBt->usableSize) ) return SQLITE_CORRUPT_BKPT; iSize = iEnd - iStart; iFreeBlk = get2byte(&data[iFreeBlk]); } /* If iPtr is another freeblock (that is, if iPtr is not the freelist ** pointer in the page header) then check to see if iStart should be ** coalesced onto the end of iPtr. |
︙ | ︙ | |||
6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 | if( iChild ){ put4byte(pCell, iChild); } j = pPage->nOverflow++; assert( j<(int)(sizeof(pPage->apOvfl)/sizeof(pPage->apOvfl[0])) ); pPage->apOvfl[j] = pCell; pPage->aiOvfl[j] = (u16)i; }else{ int rc = sqlite3PagerWrite(pPage->pDbPage); if( rc!=SQLITE_OK ){ *pRC = rc; return; } assert( sqlite3PagerIswriteable(pPage->pDbPage) ); | > > > > > > > > | 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 | if( iChild ){ put4byte(pCell, iChild); } j = pPage->nOverflow++; assert( j<(int)(sizeof(pPage->apOvfl)/sizeof(pPage->apOvfl[0])) ); pPage->apOvfl[j] = pCell; pPage->aiOvfl[j] = (u16)i; /* When multiple overflows occur, they are always sequential and in ** sorted order. This invariants arise because multiple overflows can ** only occur when inserting divider cells into the parent page during ** balancing, and the dividers are adjacent and sorted. */ assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */ assert( j==0 || i==pPage->aiOvfl[j-1]+1 ); /* Overflows are sequential */ }else{ int rc = sqlite3PagerWrite(pPage->pDbPage); if( rc!=SQLITE_OK ){ *pRC = rc; return; } assert( sqlite3PagerIswriteable(pPage->pDbPage) ); |
︙ | ︙ | |||
6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 | ** the entry for the overflow page into the pointer map. */ ptrmapPutOvflPtr(pPage, pCell, pRC); } #endif } } /* ** Array apCell[] contains pointers to nCell b-tree page cells. The ** szCell[] array contains the size in bytes of each cell. This function ** replaces the current contents of page pPg with the contents of the cell ** array. ** ** Some of the cells in apCell[] may currently be stored in pPg. This ** function works around problems caused by this by making a copy of any ** such cells before overwriting the page data. ** ** The MemPage.nFree field is invalidated by this function. It is the ** responsibility of the caller to set it correctly. */ | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 | ** the entry for the overflow page into the pointer map. */ ptrmapPutOvflPtr(pPage, pCell, pRC); } #endif } } /* ** A CellArray object contains a cache of pointers and sizes for a ** consecutive sequence of cells that might be held multiple pages. */ typedef struct CellArray CellArray; struct CellArray { int nCell; /* Number of cells in apCell[] */ MemPage *pRef; /* Reference page */ u8 **apCell; /* All cells begin balanced */ u16 *szCell; /* Local size of all cells in apCell[] */ }; /* ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been ** computed. */ static void populateCellCache(CellArray *p, int idx, int N){ assert( idx>=0 && idx+N<=p->nCell ); while( N>0 ){ assert( p->apCell[idx]!=0 ); if( p->szCell[idx]==0 ){ p->szCell[idx] = p->pRef->xCellSize(p->pRef, p->apCell[idx]); }else{ assert( CORRUPT_DB || p->szCell[idx]==p->pRef->xCellSize(p->pRef, p->apCell[idx]) ); } idx++; N--; } } /* ** Return the size of the Nth element of the cell array */ static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){ assert( N>=0 && N<p->nCell ); assert( p->szCell[N]==0 ); p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]); return p->szCell[N]; } static u16 cachedCellSize(CellArray *p, int N){ assert( N>=0 && N<p->nCell ); if( p->szCell[N] ) return p->szCell[N]; return computeCellSize(p, N); } /* ** Array apCell[] contains pointers to nCell b-tree page cells. The ** szCell[] array contains the size in bytes of each cell. This function ** replaces the current contents of page pPg with the contents of the cell ** array. ** ** Some of the cells in apCell[] may currently be stored in pPg. This ** function works around problems caused by this by making a copy of any ** such cells before overwriting the page data. ** ** The MemPage.nFree field is invalidated by this function. It is the ** responsibility of the caller to set it correctly. */ static int rebuildPage( MemPage *pPg, /* Edit this page */ int nCell, /* Final number of cells on page */ u8 **apCell, /* Array of cells */ u16 *szCell /* Array of cell sizes */ ){ const int hdr = pPg->hdrOffset; /* Offset of header on pPg */ u8 * const aData = pPg->aData; /* Pointer to data for pPg */ |
︙ | ︙ | |||
6316 6317 6318 6319 6320 6321 6322 | pData = pEnd; for(i=0; i<nCell; i++){ u8 *pCell = apCell[i]; if( pCell>aData && pCell<pEnd ){ pCell = &pTmp[pCell - aData]; } pData -= szCell[i]; | < > > | > | 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 | pData = pEnd; for(i=0; i<nCell; i++){ u8 *pCell = apCell[i]; if( pCell>aData && pCell<pEnd ){ pCell = &pTmp[pCell - aData]; } pData -= szCell[i]; put2byte(pCellptr, (pData - aData)); pCellptr += 2; if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT; memcpy(pData, pCell, szCell[i]); assert( szCell[i]==pPg->xCellSize(pPg, pCell) || CORRUPT_DB ); testcase( szCell[i]!=pPg->xCellSize(pPg,pCell) ); } /* The pPg->nFree field is now set incorrectly. The caller will fix it. */ pPg->nCell = nCell; pPg->nOverflow = 0; put2byte(&aData[hdr+1], 0); put2byte(&aData[hdr+3], pPg->nCell); put2byte(&aData[hdr+5], pData - aData); aData[hdr+7] = 0x00; return SQLITE_OK; } /* ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell ** contains the size in bytes of each such cell. This function attempts to ** add the cells stored in the array to page pPg. If it cannot (because ** the page needs to be defragmented before the cells will fit), non-zero |
︙ | ︙ | |||
6363 6364 6365 6366 6367 6368 6369 6370 | ** cells in apCell[], then the cells do not fit and non-zero is returned. */ static int pageInsertArray( MemPage *pPg, /* Page to add cells to */ u8 *pBegin, /* End of cell-pointer array */ u8 **ppData, /* IN/OUT: Page content -area pointer */ u8 *pCellptr, /* Pointer to cell-pointer area */ int nCell, /* Number of cells to add to pPg */ | > | < > | | < > | > | < > | | | > > > > | 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 | ** cells in apCell[], then the cells do not fit and non-zero is returned. */ static int pageInsertArray( MemPage *pPg, /* Page to add cells to */ u8 *pBegin, /* End of cell-pointer array */ u8 **ppData, /* IN/OUT: Page content -area pointer */ u8 *pCellptr, /* Pointer to cell-pointer area */ int iFirst, /* Index of first cell to add */ int nCell, /* Number of cells to add to pPg */ CellArray *pCArray /* Array of cells */ ){ int i; u8 *aData = pPg->aData; u8 *pData = *ppData; const int bFreelist = aData[1] || aData[2]; int iEnd = iFirst + nCell; assert( CORRUPT_DB || pPg->hdrOffset==0 ); /* Never called on page 1 */ for(i=iFirst; i<iEnd; i++){ int sz, rc; u8 *pSlot; sz = cachedCellSize(pCArray, i); if( bFreelist==0 || (pSlot = pageFindSlot(pPg, sz, &rc, 0))==0 ){ pData -= sz; if( pData<pBegin ) return 1; pSlot = pData; } memcpy(pSlot, pCArray->apCell[i], sz); put2byte(pCellptr, (pSlot - aData)); pCellptr += 2; } *ppData = pData; return 0; } /* ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell ** contains the size in bytes of each such cell. This function adds the ** space associated with each cell in the array that is currently stored ** within the body of pPg to the pPg free-list. The cell-pointers and other ** fields of the page are not updated. ** ** This function returns the total number of cells added to the free-list. */ static int pageFreeArray( MemPage *pPg, /* Page to edit */ int iFirst, /* First cell to delete */ int nCell, /* Cells to delete */ CellArray *pCArray /* Array of cells */ ){ u8 * const aData = pPg->aData; u8 * const pEnd = &aData[pPg->pBt->usableSize]; u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize]; int nRet = 0; int i; int iEnd = iFirst + nCell; u8 *pFree = 0; int szFree = 0; for(i=iFirst; i<iEnd; i++){ u8 *pCell = pCArray->apCell[i]; if( pCell>=pStart && pCell<pEnd ){ int sz; /* No need to use cachedCellSize() here. The sizes of all cells that ** are to be freed have already been computing while deciding which ** cells need freeing */ sz = pCArray->szCell[i]; assert( sz>0 ); if( pFree!=(pCell + sz) ){ if( pFree ){ assert( pFree>aData && (pFree - aData)<65536 ); freeSpace(pPg, (u16)(pFree - aData), szFree); } pFree = pCell; szFree = sz; |
︙ | ︙ | |||
6450 6451 6452 6453 6454 6455 6456 | ** ** This routine makes the necessary adjustments to pPg so that it contains ** the correct cells after being balanced. ** ** The pPg->nFree field is invalid when this function returns. It is the ** responsibility of the caller to set it correctly. */ | | | < | < < | < < | | | | | > | > | | 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 | ** ** This routine makes the necessary adjustments to pPg so that it contains ** the correct cells after being balanced. ** ** The pPg->nFree field is invalid when this function returns. It is the ** responsibility of the caller to set it correctly. */ static int editPage( MemPage *pPg, /* Edit this page */ int iOld, /* Index of first cell currently on page */ int iNew, /* Index of new first cell on page */ int nNew, /* Final number of cells on page */ CellArray *pCArray /* Array of cells and sizes */ ){ u8 * const aData = pPg->aData; const int hdr = pPg->hdrOffset; u8 *pBegin = &pPg->aCellIdx[nNew * 2]; int nCell = pPg->nCell; /* Cells stored on pPg */ u8 *pData; u8 *pCellptr; int i; int iOldEnd = iOld + pPg->nCell + pPg->nOverflow; int iNewEnd = iNew + nNew; #ifdef SQLITE_DEBUG u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager); memcpy(pTmp, aData, pPg->pBt->usableSize); #endif /* Remove cells from the start and end of the page */ if( iOld<iNew ){ int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray); memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2); nCell -= nShift; } if( iNewEnd < iOldEnd ){ nCell -= pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray); } pData = &aData[get2byteNotZero(&aData[hdr+5])]; if( pData<pBegin ) goto editpage_fail; /* Add cells to the start of the page */ if( iNew<iOld ){ int nAdd = MIN(nNew,iOld-iNew); assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB ); pCellptr = pPg->aCellIdx; memmove(&pCellptr[nAdd*2], pCellptr, nCell*2); if( pageInsertArray( pPg, pBegin, &pData, pCellptr, iNew, nAdd, pCArray ) ) goto editpage_fail; nCell += nAdd; } /* Add any overflow cells */ for(i=0; i<pPg->nOverflow; i++){ int iCell = (iOld + pPg->aiOvfl[i]) - iNew; if( iCell>=0 && iCell<nNew ){ pCellptr = &pPg->aCellIdx[iCell * 2]; memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2); nCell++; if( pageInsertArray( pPg, pBegin, &pData, pCellptr, iCell+iNew, 1, pCArray ) ) goto editpage_fail; } } /* Append cells to the end of the page */ pCellptr = &pPg->aCellIdx[nCell*2]; if( pageInsertArray( pPg, pBegin, &pData, pCellptr, iNew+nCell, nNew-nCell, pCArray ) ) goto editpage_fail; pPg->nCell = nNew; pPg->nOverflow = 0; put2byte(&aData[hdr+3], pPg->nCell); put2byte(&aData[hdr+5], pData - aData); #ifdef SQLITE_DEBUG for(i=0; i<nNew && !CORRUPT_DB; i++){ u8 *pCell = pCArray->apCell[i+iNew]; int iOff = get2byte(&pPg->aCellIdx[i*2]); if( pCell>=aData && pCell<&aData[pPg->pBt->usableSize] ){ pCell = &pTmp[pCell - aData]; } assert( 0==memcmp(pCell, &aData[iOff], pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) ); } #endif return SQLITE_OK; editpage_fail: /* Unable to edit this page. Rebuild it from scratch instead. */ populateCellCache(pCArray, iNew, nNew); return rebuildPage(pPg, nNew, &pCArray->apCell[iNew], &pCArray->szCell[iNew]); } /* ** The following parameters determine how many adjacent pages get involved ** in a balancing operation. NN is the number of neighbors on either side ** of the page that participate in the balancing operation. NB is the ** total number of pages that participate, including the target page and |
︙ | ︙ | |||
6616 6617 6618 6619 6620 6621 6622 | u8 *pCell = pPage->apOvfl[0]; u16 szCell = pPage->xCellSize(pPage, pCell); u8 *pStop; assert( sqlite3PagerIswriteable(pNew->pDbPage) ); assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) ); zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF); | | > | 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 | u8 *pCell = pPage->apOvfl[0]; u16 szCell = pPage->xCellSize(pPage, pCell); u8 *pStop; assert( sqlite3PagerIswriteable(pNew->pDbPage) ); assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) ); zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF); rc = rebuildPage(pNew, 1, &pCell, &szCell); if( NEVER(rc) ) return rc; pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell; /* If this is an auto-vacuum database, update the pointer map ** with entries for the new page, and any pointer from the ** cell on the page to an overflow page. If either of these ** operations fails, the return code is set, but the contents ** of the parent page are still manipulated by thh code below. |
︙ | ︙ | |||
6820 6821 6822 6823 6824 6825 6826 | MemPage *pParent, /* Parent page of siblings being balanced */ int iParentIdx, /* Index of "the page" in pParent */ u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */ int isRoot, /* True if pParent is a root-page */ int bBulk /* True if this call is part of a bulk load */ ){ BtShared *pBt; /* The whole database */ | < < | | < < > > > | 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 | MemPage *pParent, /* Parent page of siblings being balanced */ int iParentIdx, /* Index of "the page" in pParent */ u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */ int isRoot, /* True if pParent is a root-page */ int bBulk /* True if this call is part of a bulk load */ ){ BtShared *pBt; /* The whole database */ int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */ int nNew = 0; /* Number of pages in apNew[] */ int nOld; /* Number of pages in apOld[] */ int i, j, k; /* Loop counters */ int nxDiv; /* Next divider slot in pParent->aCell[] */ int rc = SQLITE_OK; /* The return code */ u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */ int leafData; /* True if pPage is a leaf of a LEAFDATA tree */ int usableSpace; /* Bytes in pPage beyond the header */ int pageFlags; /* Value of pPage->aData[0] */ int iSpace1 = 0; /* First unused byte of aSpace1[] */ int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */ int szScratch; /* Size of scratch memory requested */ MemPage *apOld[NB]; /* pPage and up to two siblings */ MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */ u8 *pRight; /* Location in parent of right-sibling pointer */ u8 *apDiv[NB-1]; /* Divider cells in pParent */ int cntNew[NB+2]; /* Index in b.paCell[] of cell after i-th page */ int cntOld[NB+2]; /* Old index in b.apCell[] */ int szNew[NB+2]; /* Combined size of cells placed on i-th page */ u8 *aSpace1; /* Space for copies of dividers cells */ Pgno pgno; /* Temp var to store a page number in */ u8 abDone[NB+2]; /* True after i'th new page is populated */ Pgno aPgno[NB+2]; /* Page numbers of new pages before shuffling */ Pgno aPgOrder[NB+2]; /* Copy of aPgno[] used for sorting pages */ u16 aPgFlags[NB+2]; /* flags field of new pages before shuffling */ CellArray b; /* Parsed information on cells being balanced */ memset(abDone, 0, sizeof(abDone)); b.nCell = 0; b.apCell = 0; pBt = pParent->pBt; assert( sqlite3_mutex_held(pBt->mutex) ); assert( sqlite3PagerIswriteable(pParent->pDbPage) ); #if 0 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno)); #endif |
︙ | ︙ | |||
6960 6961 6962 6963 6964 6965 6966 | ** alignment */ nMaxCells = (nMaxCells + 3)&~3; /* ** Allocate space for memory structures */ szScratch = | | | | | | | | | | > | | < > > > > > > | > > > > > > > > > > > > > > > > > > > > | > | > | | < | < > | < > | | | | | | | | | | | | | | | | | | | | | | > > > > > > > > > | | | > > > | | > > > | > > > | < < | | | | > > > > > | > > > > > > > > > > > > > > > > > > | | > | | < > > | | | | > | > > > | 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 | ** alignment */ nMaxCells = (nMaxCells + 3)&~3; /* ** Allocate space for memory structures */ szScratch = nMaxCells*sizeof(u8*) /* b.apCell */ + nMaxCells*sizeof(u16) /* b.szCell */ + pBt->pageSize; /* aSpace1 */ /* EVIDENCE-OF: R-28375-38319 SQLite will never request a scratch buffer ** that is more than 6 times the database page size. */ assert( szScratch<=6*(int)pBt->pageSize ); b.apCell = sqlite3ScratchMalloc( szScratch ); if( b.apCell==0 ){ rc = SQLITE_NOMEM; goto balance_cleanup; } b.szCell = (u16*)&b.apCell[nMaxCells]; aSpace1 = (u8*)&b.szCell[nMaxCells]; assert( EIGHT_BYTE_ALIGNMENT(aSpace1) ); /* ** Load pointers to all cells on sibling pages and the divider cells ** into the local b.apCell[] array. Make copies of the divider cells ** into space obtained from aSpace1[]. The divider cells have already ** been removed from pParent. ** ** If the siblings are on leaf pages, then the child pointers of the ** divider cells are stripped from the cells before they are copied ** into aSpace1[]. In this way, all cells in b.apCell[] are without ** child pointers. If siblings are not leaves, then all cell in ** b.apCell[] include child pointers. Either way, all cells in b.apCell[] ** are alike. ** ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf. ** leafData: 1 if pPage holds key+data and pParent holds only keys. */ b.pRef = apOld[0]; leafCorrection = b.pRef->leaf*4; leafData = b.pRef->intKeyLeaf; for(i=0; i<nOld; i++){ MemPage *pOld = apOld[i]; int limit = pOld->nCell; u8 *aData = pOld->aData; u16 maskPage = pOld->maskPage; u8 *piCell = aData + pOld->cellOffset; u8 *piEnd; /* Verify that all sibling pages are of the same "type" (table-leaf, ** table-interior, index-leaf, or index-interior). */ if( pOld->aData[0]!=apOld[0]->aData[0] ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; } /* Load b.apCell[] with pointers to all cells in pOld. If pOld ** constains overflow cells, include them in the b.apCell[] array ** in the correct spot. ** ** Note that when there are multiple overflow cells, it is always the ** case that they are sequential and adjacent. This invariant arises ** because multiple overflows can only occurs when inserting divider ** cells into a parent on a prior balance, and divider cells are always ** adjacent and are inserted in order. There is an assert() tagged ** with "NOTE 1" in the overflow cell insertion loop to prove this ** invariant. ** ** This must be done in advance. Once the balance starts, the cell ** offset section of the btree page will be overwritten and we will no ** long be able to find the cells if a pointer to each cell is not saved ** first. */ memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*limit); if( pOld->nOverflow>0 ){ memset(&b.szCell[b.nCell+limit], 0, sizeof(b.szCell[0])*pOld->nOverflow); limit = pOld->aiOvfl[0]; for(j=0; j<limit; j++){ b.apCell[b.nCell] = aData + (maskPage & get2byte(piCell)); piCell += 2; b.nCell++; } for(k=0; k<pOld->nOverflow; k++){ assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */ b.apCell[b.nCell] = pOld->apOvfl[k]; b.nCell++; } limit = pOld->nCell - pOld->aiOvfl[0]; } piEnd = aData + pOld->cellOffset + 2*pOld->nCell; while( piCell<piEnd ){ assert( b.nCell<nMaxCells ); b.apCell[b.nCell] = aData + (maskPage & get2byte(piCell)); piCell += 2; b.nCell++; } cntOld[i] = b.nCell; if( i<nOld-1 && !leafData){ u16 sz = (u16)szNew[i]; u8 *pTemp; assert( b.nCell<nMaxCells ); b.szCell[b.nCell] = sz; pTemp = &aSpace1[iSpace1]; iSpace1 += sz; assert( sz<=pBt->maxLocal+23 ); assert( iSpace1 <= (int)pBt->pageSize ); memcpy(pTemp, apDiv[i], sz); b.apCell[b.nCell] = pTemp+leafCorrection; assert( leafCorrection==0 || leafCorrection==4 ); b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection; if( !pOld->leaf ){ assert( leafCorrection==0 ); assert( pOld->hdrOffset==0 ); /* The right pointer of the child page pOld becomes the left ** pointer of the divider cell */ memcpy(b.apCell[b.nCell], &pOld->aData[8], 4); }else{ assert( leafCorrection==4 ); while( b.szCell[b.nCell]<4 ){ /* Do not allow any cells smaller than 4 bytes. If a smaller cell ** does exist, pad it with 0x00 bytes. */ assert( b.szCell[b.nCell]==3 || CORRUPT_DB ); assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB ); aSpace1[iSpace1++] = 0x00; b.szCell[b.nCell]++; } } b.nCell++; } } /* ** Figure out the number of pages needed to hold all b.nCell cells. ** Store this number in "k". Also compute szNew[] which is the total ** size of all cells on the i-th page and cntNew[] which is the index ** in b.apCell[] of the cell that divides page i from page i+1. ** cntNew[k] should equal b.nCell. ** ** Values computed by this block: ** ** k: The total number of sibling pages ** szNew[i]: Spaced used on the i-th sibling page. ** cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to ** the right of the i-th sibling page. ** usableSpace: Number of bytes of space available on each sibling. ** */ usableSpace = pBt->usableSize - 12 + leafCorrection; for(i=0; i<nOld; i++){ MemPage *p = apOld[i]; szNew[i] = usableSpace - p->nFree; if( szNew[i]<0 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; } for(j=0; j<p->nOverflow; j++){ szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]); } cntNew[i] = cntOld[i]; } k = nOld; for(i=0; i<k; i++){ int sz; while( szNew[i]>usableSpace ){ if( i+1>=k ){ k = i+2; if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; } szNew[k-1] = 0; cntNew[k-1] = b.nCell; } sz = 2 + cachedCellSize(&b, cntNew[i]-1); szNew[i] -= sz; if( !leafData ){ if( cntNew[i]<b.nCell ){ sz = 2 + cachedCellSize(&b, cntNew[i]); }else{ sz = 0; } } szNew[i+1] += sz; cntNew[i]--; } while( cntNew[i]<b.nCell ){ sz = 2 + cachedCellSize(&b, cntNew[i]); if( szNew[i]+sz>usableSpace ) break; szNew[i] += sz; cntNew[i]++; if( !leafData ){ if( cntNew[i]<b.nCell ){ sz = 2 + cachedCellSize(&b, cntNew[i]); }else{ sz = 0; } } szNew[i+1] -= sz; } if( cntNew[i]>=b.nCell ){ k = i+1; }else if( cntNew[i] - (i>0 ? cntNew[i-1] : 0) <= 0 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; } } /* ** The packing computed by the previous block is biased toward the siblings ** on the left side (siblings with smaller keys). The left siblings are ** always nearly full, while the right-most sibling might be nearly empty. ** The next block of code attempts to adjust the packing of siblings to ** get a better balance. ** ** This adjustment is more than an optimization. The packing above might ** be so out of balance as to be illegal. For example, the right-most ** sibling might be completely empty. This adjustment is not optional. */ for(i=k-1; i>0; i--){ int szRight = szNew[i]; /* Size of sibling on the right */ int szLeft = szNew[i-1]; /* Size of sibling on the left */ int r; /* Index of right-most cell in left sibling */ int d; /* Index of first cell to the left of right sibling */ r = cntNew[i-1] - 1; d = r + 1 - leafData; (void)cachedCellSize(&b, d); while(1){ assert( d<nMaxCells ); assert( r<nMaxCells ); (void)cachedCellSize(&b, r); if( szRight!=0 && (bBulk || szRight+b.szCell[d]+2 > szLeft-(b.szCell[r]+2)) ){ break; } szRight += b.szCell[d] + 2; szLeft -= b.szCell[r] + 2; cntNew[i-1] = r; if( cntNew[i-1] <= 0 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; } r--; d--; } szNew[i] = szRight; szNew[i-1] = szLeft; } /* Sanity check: For a non-corrupt database file one of the follwing ** must be true: |
︙ | ︙ | |||
7160 7161 7162 7163 7164 7165 7166 | }else{ assert( i>0 ); rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0); if( rc ) goto balance_cleanup; zeroPage(pNew, pageFlags); apNew[i] = pNew; nNew++; | | | 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 | }else{ assert( i>0 ); rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0); if( rc ) goto balance_cleanup; zeroPage(pNew, pageFlags); apNew[i] = pNew; nNew++; cntOld[i] = b.nCell; /* Set the pointer-map entry for the new sibling page. */ if( ISAUTOVACUUM ){ ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc); if( rc!=SQLITE_OK ){ goto balance_cleanup; } |
︙ | ︙ | |||
7265 7266 7267 7268 7269 7270 7271 | MemPage *pNew = apNew[0]; u8 *aOld = pNew->aData; int cntOldNext = pNew->nCell + pNew->nOverflow; int usableSize = pBt->usableSize; int iNew = 0; int iOld = 0; | | | | 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 | MemPage *pNew = apNew[0]; u8 *aOld = pNew->aData; int cntOldNext = pNew->nCell + pNew->nOverflow; int usableSize = pBt->usableSize; int iNew = 0; int iOld = 0; for(i=0; i<b.nCell; i++){ u8 *pCell = b.apCell[i]; if( i==cntOldNext ){ MemPage *pOld = (++iOld)<nNew ? apNew[iOld] : apOld[iOld]; cntOldNext += pOld->nCell + pOld->nOverflow + !leafData; aOld = pOld->aData; } if( i==cntNew[iNew] ){ pNew = apNew[++iNew]; |
︙ | ︙ | |||
7291 7292 7293 7294 7295 7296 7297 | || pNew->pgno!=aPgno[iOld] || pCell<aOld || pCell>=&aOld[usableSize] ){ if( !leafCorrection ){ ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc); } | | > > | | | | | | 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 | || pNew->pgno!=aPgno[iOld] || pCell<aOld || pCell>=&aOld[usableSize] ){ if( !leafCorrection ){ ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc); } if( cachedCellSize(&b,i)>pNew->minLocal ){ ptrmapPutOvflPtr(pNew, pCell, &rc); } if( rc ) goto balance_cleanup; } } } /* Insert new divider cells into pParent. */ for(i=0; i<nNew-1; i++){ u8 *pCell; u8 *pTemp; int sz; MemPage *pNew = apNew[i]; j = cntNew[i]; assert( j<nMaxCells ); assert( b.apCell[j]!=0 ); pCell = b.apCell[j]; sz = b.szCell[j] + leafCorrection; pTemp = &aOvflSpace[iOvflSpace]; if( !pNew->leaf ){ memcpy(&pNew->aData[8], pCell, 4); }else if( leafData ){ /* If the tree is a leaf-data tree, and the siblings are leaves, ** then there is no divider cell in b.apCell[]. Instead, the divider ** cell consists of the integer key for the right-most cell of ** the sibling-page assembled above only. */ CellInfo info; j--; pNew->xParseCell(pNew, b.apCell[j], &info); pCell = pTemp; sz = 4 + putVarint(&pCell[4], info.nKey); pTemp = 0; }else{ pCell -= 4; /* Obscure case for non-leaf-data trees: If the cell at pCell was ** previously stored on a leaf node, and its reported size was 4 ** bytes, then it may actually be smaller than this ** (see btreeParseCellPtr(), 4 bytes is the minimum size of ** any cell). But it is important to pass the correct size to ** insertCell(), so reparse the cell now. ** ** Note that this can never happen in an SQLite data file, as all ** cells are at least 4 bytes. It only happens in b-trees used ** to evaluate "IN (SELECT ...)" and similar clauses. */ if( b.szCell[j]==4 ){ assert(leafCorrection==4); sz = pParent->xCellSize(pParent, pCell); } } iOvflSpace += sz; assert( sz<=pBt->maxLocal+23 ); assert( iOvflSpace <= (int)pBt->pageSize ); |
︙ | ︙ | |||
7395 7396 7397 7398 7399 7400 7401 | ** only after iPg+1 has already been updated. */ assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] ); if( iPg==0 ){ iNew = iOld = 0; nNewCell = cntNew[0]; }else{ | | | > | 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 | ** only after iPg+1 has already been updated. */ assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] ); if( iPg==0 ){ iNew = iOld = 0; nNewCell = cntNew[0]; }else{ iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell; iNew = cntNew[iPg-1] + !leafData; nNewCell = cntNew[iPg] - iNew; } rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b); if( rc ) goto balance_cleanup; abDone[iPg]++; apNew[iPg]->nFree = usableSpace-szNew[iPg]; assert( apNew[iPg]->nOverflow==0 ); assert( apNew[iPg]->nCell==nNewCell ); } } |
︙ | ︙ | |||
7451 7452 7453 7454 7455 7456 7457 | u32 key = get4byte(&apNew[i]->aData[8]); ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc); } } assert( pParent->isInit ); TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n", | | | 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 | u32 key = get4byte(&apNew[i]->aData[8]); ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc); } } assert( pParent->isInit ); TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n", nOld, nNew, b.nCell)); /* Free any old pages that were not reused as new pages. */ for(i=nNew; i<nOld; i++){ freePage(apOld[i], &rc); } |
︙ | ︙ | |||
7474 7475 7476 7477 7478 7479 7480 | } #endif /* ** Cleanup before returning. */ balance_cleanup: | | | 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 | } #endif /* ** Cleanup before returning. */ balance_cleanup: sqlite3ScratchFree(b.apCell); for(i=0; i<nOld; i++){ releasePage(apOld[i]); } for(i=0; i<nNew; i++){ releasePage(apNew[i]); } |
︙ | ︙ | |||
7925 7926 7927 7928 7929 7930 7931 | if( !pPage->leaf ){ MemPage *pLeaf = pCur->apPage[pCur->iPage]; int nCell; Pgno n = pCur->apPage[iCellDepth+1]->pgno; unsigned char *pTmp; pCell = findCell(pLeaf, pLeaf->nCell-1); | | | 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 | if( !pPage->leaf ){ MemPage *pLeaf = pCur->apPage[pCur->iPage]; int nCell; Pgno n = pCur->apPage[iCellDepth+1]->pgno; unsigned char *pTmp; pCell = findCell(pLeaf, pLeaf->nCell-1); if( NEVER(pCell<&pLeaf->aData[4]) ) return SQLITE_CORRUPT_BKPT; nCell = pLeaf->xCellSize(pLeaf, pCell); assert( MX_CELL_SIZE(pBt) >= nCell ); pTmp = pBt->pTmpSpace; assert( pTmp!=0 ); rc = sqlite3PagerWrite(pLeaf->pDbPage); insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc); dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc); |
︙ | ︙ |