/ Check-in [1d43ee4000]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Defer computing the MemPage.nFree value of an in-memory btree page until it is actually needed, since for many pages it is never needed. This checkin works sufficiently to prove the concept, but still has issues with exception handling.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | deferred-free-space
Files: files | file ages | folders
SHA3-256: 1d43ee4000b71f5c6d49244dee96358c567f09ba3451b9d22895a796d3f61ad6
User & Date: drh 2019-02-09 21:06:40
Context
2019-02-09
22:33
Fix a page-cache reference leak in the btree balancer when there is a corrupt database. check-in: 9285899120 user: drh tags: deferred-free-space
21:06
Defer computing the MemPage.nFree value of an in-memory btree page until it is actually needed, since for many pages it is never needed. This checkin works sufficiently to prove the concept, but still has issues with exception handling. check-in: 1d43ee4000 user: drh tags: deferred-free-space
19:23
Change a few assert() statements in fts3 that might fail if the database is corrupt. check-in: db74a56af7 user: dan tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/btree.c.

1502
1503
1504
1505
1506
1507
1508

1509
1510
1511
1512
1513
1514
1515
....
1653
1654
1655
1656
1657
1658
1659

1660
1661
1662
1663
1664
1665
1666
....
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857

1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875

1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918

1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
....
1980
1981
1982
1983
1984
1985
1986




























































































1987
1988
1989
1990
1991
1992
1993
....
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157


2158
2159
2160
2161
2162
2163
2164
....
6562
6563
6564
6565
6566
6567
6568

6569
6570
6571
6572
6573
6574
6575
....
6632
6633
6634
6635
6636
6637
6638

6639
6640
6641
6642
6643
6644
6645
....
7183
7184
7185
7186
7187
7188
7189
7190
7191









7192
7193
7194
7195
7196
7197
7198
....
7462
7463
7464
7465
7466
7467
7468




7469
7470
7471
7472
7473
7474
7475
....
7497
7498
7499
7500
7501
7502
7503



7504
7505
7506
7507
7508
7509
7510
....
7700
7701
7702
7703
7704
7705
7706

7707
7708
7709
7710
7711
7712
7713
....
8243
8244
8245
8246
8247
8248
8249




8250
8251
8252
8253
8254
8255
8256
....
8617
8618
8619
8620
8621
8622
8623




8624
8625
8626
8627
8628
8629
8630
....
8767
8768
8769
8770
8771
8772
8773

8774
8775
8776
8777
8778
8779
8780
....
8837
8838
8839
8840
8841
8842
8843




8844
8845
8846
8847
8848
8849
8850
....
9727
9728
9729
9730
9731
9732
9733





9734
9735
9736
9737
9738
9739
9740
      src = temp;
    }
    memcpy(&data[cbrk], &src[pc], size);
  }
  data[hdr+7] = 0;

 defragment_out:

  if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){
    return SQLITE_CORRUPT_PAGE(pPage);
  }
  assert( cbrk>=iCellFirst );
  put2byte(&data[hdr+5], cbrk);
  data[hdr+1] = 0;
  data[hdr+2] = 0;
................................................................................

  /* The request could not be fulfilled using a freelist slot.  Check
  ** to see if defragmentation is necessary.
  */
  testcase( gap+2+nByte==top );
  if( gap+2+nByte>top ){
    assert( pPage->nCell>0 || CORRUPT_DB );

    rc = defragmentPage(pPage, MIN(4, pPage->nFree - (2+nByte)));
    if( rc ) return rc;
    top = get2byteNotZero(&data[hdr+5]);
    assert( gap+2+nByte<=top );
  }


................................................................................
    return SQLITE_CORRUPT_PAGE(pPage);
  }
  pPage->max1bytePayload = pBt->max1bytePayload;
  return SQLITE_OK;
}

/*
** Initialize the auxiliary information for a disk block.
**
** Return SQLITE_OK on success.  If we see that the page does
** not contain a well-formed database page, then return 
** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
** guarantee that the page is well-formed.  It only shows that
** we failed to detect any corruption.
*/
static int btreeInitPage(MemPage *pPage){

  int pc;            /* Address of a freeblock within pPage->aData[] */
  u8 hdr;            /* Offset to beginning of page header */
  u8 *data;          /* Equal to pPage->aData */
  BtShared *pBt;        /* The main btree structure */
  int usableSize;    /* Amount of usable space on each page */
  u16 cellOffset;    /* Offset from start of page to first cell pointer */
  int nFree;         /* Number of unused bytes on the page */
  int top;           /* First byte of the cell content area */
  int iCellFirst;    /* First allowable cell or freeblock offset */
  int iCellLast;     /* Last possible cell or freeblock offset */

  assert( pPage->pBt!=0 );
  assert( pPage->pBt->db!=0 );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
  assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
  assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
  assert( pPage->isInit==0 );


  pBt = pPage->pBt;
  hdr = pPage->hdrOffset;
  data = pPage->aData;
  /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating
  ** the b-tree page type. */
  if( decodeFlags(pPage, data[hdr]) ){
    return SQLITE_CORRUPT_PAGE(pPage);
  }
  assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
  pPage->maskPage = (u16)(pBt->pageSize - 1);
  pPage->nOverflow = 0;
  usableSize = pBt->usableSize;
  pPage->cellOffset = cellOffset = hdr + 8 + pPage->childPtrSize;
  pPage->aDataEnd = &data[usableSize];
  pPage->aCellIdx = &data[cellOffset];
  pPage->aDataOfst = &data[pPage->childPtrSize];
  /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates
  ** the start of the cell content area. A zero value for this integer is
  ** interpreted as 65536. */
  top = get2byteNotZero(&data[hdr+5]);
  /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
  ** number of cells on the page. */
  pPage->nCell = get2byte(&data[hdr+3]);
  if( pPage->nCell>MX_CELL(pBt) ){
    /* To many cells for a single page.  The page must be corrupt */
    return SQLITE_CORRUPT_PAGE(pPage);
  }
  testcase( pPage->nCell==MX_CELL(pBt) );
  /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only
  ** possible for a root page of a table that contains no rows) then the
  ** offset to the cell content area will equal the page size minus the
  ** bytes of reserved space. */
  assert( pPage->nCell>0 || top==usableSize || CORRUPT_DB );

  /* A malformed database page might cause us to read past the end
  ** of page when parsing a cell.  
  **
  ** The following block of code checks early to see if a cell extends
  ** past the end of a page boundary and causes SQLITE_CORRUPT to be 
  ** returned if it does.
  */
  iCellFirst = cellOffset + 2*pPage->nCell;

  iCellLast = usableSize - 4;
  if( pBt->db->flags & SQLITE_CellSizeCk ){
    int i;            /* Index into the cell pointer array */
    int sz;           /* Size of a cell */

    if( !pPage->leaf ) iCellLast--;
    for(i=0; i<pPage->nCell; i++){
      pc = get2byteAligned(&data[cellOffset+i*2]);
      testcase( pc==iCellFirst );
      testcase( pc==iCellLast );
      if( pc<iCellFirst || pc>iCellLast ){
        return SQLITE_CORRUPT_PAGE(pPage);
      }
      sz = pPage->xCellSize(pPage, &data[pc]);
      testcase( pc+sz==usableSize );
      if( pc+sz>usableSize ){
        return SQLITE_CORRUPT_PAGE(pPage);
      }
    }
    if( !pPage->leaf ) iCellLast++;
  }  

  /* Compute the total free space on the page
  ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the
  ** start of the first freeblock on the page, or is zero if there are no
  ** freeblocks. */
  pc = get2byte(&data[hdr+1]);
  nFree = data[hdr+7] + top;  /* Init nFree to non-freeblock free space */
................................................................................
  ** serves to verify that the offset to the start of the cell-content
  ** area, according to the page header, lies within the page.
  */
  if( nFree>usableSize ){
    return SQLITE_CORRUPT_PAGE(pPage);
  }
  pPage->nFree = (u16)(nFree - iCellFirst);




























































































  pPage->isInit = 1;
  return SQLITE_OK;
}

/*
** Set up a raw page so that it looks like a database page holding
** no entries.
................................................................................
  assert( sqlite3_mutex_held(pBt->mutex) );
  assert( pCur==0 || ppPage==&pCur->pPage );
  assert( pCur==0 || bReadOnly==pCur->curPagerFlags );
  assert( pCur==0 || pCur->iPage>0 );

  if( pgno>btreePagecount(pBt) ){
    rc = SQLITE_CORRUPT_BKPT;
    goto getAndInitPage_error;
  }
  rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly);
  if( rc ){
    goto getAndInitPage_error;
  }
  *ppPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
  if( (*ppPage)->isInit==0 ){
    btreePageFromDbPage(pDbPage, pgno, pBt);
    rc = btreeInitPage(*ppPage);
    if( rc!=SQLITE_OK ){
      releasePage(*ppPage);
      goto getAndInitPage_error;
    }
  }
  assert( (*ppPage)->pgno==pgno );
  assert( (*ppPage)->aData==sqlite3PagerGetData(pDbPage) );

  /* If obtaining a child page for a cursor, we must verify that the page is
  ** compatible with the root page. */
  if( pCur && ((*ppPage)->nCell<1 || (*ppPage)->intKey!=pCur->curIntKey) ){
    rc = SQLITE_CORRUPT_PGNO(pgno);
    releasePage(*ppPage);
    goto getAndInitPage_error;
  }
  return SQLITE_OK;

getAndInitPage_error:


  if( pCur ){
    pCur->iPage--;
    pCur->pPage = pCur->apPage[pCur->iPage];
  }
  testcase( pgno==0 );
  assert( pgno!=0 || rc==SQLITE_CORRUPT );
  return rc;
................................................................................
  int hdr;        /* Beginning of the header.  0 most pages.  100 page 1 */

  if( *pRC ) return;
  assert( idx>=0 && idx<pPage->nCell );
  assert( CORRUPT_DB || sz==cellSize(pPage, idx) );
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );

  data = pPage->aData;
  ptr = &pPage->aCellIdx[2*idx];
  pc = get2byte(ptr);
  hdr = pPage->hdrOffset;
  testcase( pc==get2byte(&data[hdr+5]) );
  testcase( pc+sz==pPage->pBt->usableSize );
  if( pc+sz > pPage->pBt->usableSize ){
................................................................................
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  /* The cell should normally be sized correctly.  However, when moving a
  ** malformed cell from a leaf page to an interior page, if the cell size
  ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size
  ** might be less than 8 (leaf-size + pointer) on the interior node.  Hence
  ** the term after the || in the following assert(). */
  assert( sz==pPage->xCellSize(pPage, pCell) || (sz==8 && iChild>0) );

  if( pPage->nOverflow || sz+2>pPage->nFree ){
    if( pTemp ){
      memcpy(pTemp, pCell, sz);
      pCell = pTemp;
    }
    if( iChild ){
      put4byte(pCell, iChild);
................................................................................
  MemPage *pNew;                       /* Newly allocated page */
  int rc;                              /* Return Code */
  Pgno pgnoNew;                        /* Page number of pNew */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  assert( pPage->nOverflow==1 );

  if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT;  /* dbfuzz001.test */










  /* Allocate a new page. This page will become the right-sibling of 
  ** pPage. Make the parent page writable, so that the new divider cell
  ** may be inserted. If both these operations are successful, proceed.
  */
  rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);

................................................................................
  */
  assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
  assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );

  if( !aOvflSpace ){
    return SQLITE_NOMEM_BKPT;
  }





  /* Find the sibling pages to balance. Also locate the cells in pParent 
  ** that divide the siblings. An attempt is made to find NN siblings on 
  ** either side of pPage. More siblings are taken from one side, however, 
  ** if there are fewer than NN siblings on the other side. If pParent
  ** has NB or fewer children then all children of pParent are taken.  
  **
................................................................................
    pRight = &pParent->aData[pParent->hdrOffset+8];
  }else{
    pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
  }
  pgno = get4byte(pRight);
  while( 1 ){
    rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0);



    if( rc ){
      memset(apOld, 0, (i+1)*sizeof(MemPage*));
      goto balance_cleanup;
    }
    nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
    if( (i--)==0 ) break;

................................................................................
    b.apEnd[k] = p->aDataEnd;
    b.ixNx[k] = cntOld[i];
    if( !leafData ){
      k++;
      b.apEnd[k] = pParent->aDataEnd;
      b.ixNx[k] = cntOld[i]+1;
    }

    szNew[i] = usableSpace - p->nFree;
    for(j=0; j<p->nOverflow; j++){
      szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);
    }
    cntNew[i] = cntOld[i];
  }
  k = nOld;
................................................................................
  VVA_ONLY( int balance_quick_called = 0 );
  VVA_ONLY( int balance_deeper_called = 0 );

  do {
    int iPage = pCur->iPage;
    MemPage *pPage = pCur->pPage;





    if( iPage==0 ){
      if( pPage->nOverflow ){
        /* The root page of the b-tree is overfull. In this case call the
        ** balance_deeper() function to create a new child for the root-page
        ** and copy the current contents of the root-page to it. The
        ** next iteration of the do-loop will balance the child page.
        */ 
................................................................................

  }
  assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) );

  pPage = pCur->pPage;
  assert( pPage->intKey || pX->nKey>=0 );
  assert( pPage->leaf || !pPage->intKey );





  TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
          pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno,
          loc==0 ? "overwrite" : "new entry"));
  assert( pPage->isInit );
  newCell = pBt->pTmpSpace;
  assert( newCell!=0 );
................................................................................
  assert( pCur->eState==CURSOR_VALID );
  assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 );

  iCellDepth = pCur->iPage;
  iCellIdx = pCur->ix;
  pPage = pCur->pPage;
  pCell = findCell(pPage, iCellIdx);


  /* If the bPreserve flag is set to true, then the cursor position must
  ** be preserved following this delete operation. If the current delete
  ** will cause a b-tree rebalance, then this is done by saving the cursor
  ** key and leaving the cursor in CURSOR_REQUIRESEEK state before 
  ** returning. 
  **
................................................................................
  ** node to replace the deleted cell.  */
  if( !pPage->leaf ){
    MemPage *pLeaf = pCur->pPage;
    int nCell;
    Pgno n;
    unsigned char *pTmp;





    if( iCellDepth<pCur->iPage-1 ){
      n = pCur->apPage[iCellDepth+1]->pgno;
    }else{
      n = pCur->pPage->pgno;
    }
    pCell = findCell(pLeaf, pLeaf->nCell-1);
    if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT;
................................................................................
  savedIsInit = pPage->isInit;
  pPage->isInit = 0;
  if( (rc = btreeInitPage(pPage))!=0 ){
    assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
    checkAppendMsg(pCheck,
                   "btreeInitPage() returns error code %d", rc);
    goto end_of_check;





  }
  data = pPage->aData;
  hdr = pPage->hdrOffset;

  /* Set up for cell analysis */
  pCheck->zPfx = "On tree page %d cell %d: ";
  contentOffset = get2byteNotZero(&data[hdr+5]);







>







 







>







 







|
|
<
<
<
<
<

<
>



<

<











|
>

|


<
<
<
<
<
<
<
<
<
<
<
<
<




<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
>

<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







|



|






<
|









<
|



|
>
>







 







>







 







>







 







|

>
>
>
>
>
>
>
>
>







 







>
>
>
>







 







>
>
>







 







>







 







>
>
>
>







 







>
>
>
>







 







>







 







>
>
>
>







 







>
>
>
>
>







1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
....
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
....
1844
1845
1846
1847
1848
1849
1850
1851
1852





1853

1854
1855
1856
1857

1858

1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875













1876
1877
1878
1879






















1880
1881




















1882
1883
1884
1885
1886
1887
1888
....
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
....
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174

2175
2176
2177
2178
2179
2180
2181
2182
2183
2184

2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
....
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
....
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
....
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
....
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
....
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
....
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
....
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
....
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
....
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
....
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
....
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
      src = temp;
    }
    memcpy(&data[cbrk], &src[pc], size);
  }
  data[hdr+7] = 0;

 defragment_out:
  assert( pPage->nFree>=0 );
  if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){
    return SQLITE_CORRUPT_PAGE(pPage);
  }
  assert( cbrk>=iCellFirst );
  put2byte(&data[hdr+5], cbrk);
  data[hdr+1] = 0;
  data[hdr+2] = 0;
................................................................................

  /* The request could not be fulfilled using a freelist slot.  Check
  ** to see if defragmentation is necessary.
  */
  testcase( gap+2+nByte==top );
  if( gap+2+nByte>top ){
    assert( pPage->nCell>0 || CORRUPT_DB );
    assert( pPage->nFree>=0 );
    rc = defragmentPage(pPage, MIN(4, pPage->nFree - (2+nByte)));
    if( rc ) return rc;
    top = get2byteNotZero(&data[hdr+5]);
    assert( gap+2+nByte<=top );
  }


................................................................................
    return SQLITE_CORRUPT_PAGE(pPage);
  }
  pPage->max1bytePayload = pBt->max1bytePayload;
  return SQLITE_OK;
}

/*
** Compute the amount of freespace on the page.  In other words, fill
** in the pPage->nFree field.





*/

static int btreeComputeFreeSpace(MemPage *pPage){
  int pc;            /* Address of a freeblock within pPage->aData[] */
  u8 hdr;            /* Offset to beginning of page header */
  u8 *data;          /* Equal to pPage->aData */

  int usableSize;    /* Amount of usable space on each page */

  int nFree;         /* Number of unused bytes on the page */
  int top;           /* First byte of the cell content area */
  int iCellFirst;    /* First allowable cell or freeblock offset */
  int iCellLast;     /* Last possible cell or freeblock offset */

  assert( pPage->pBt!=0 );
  assert( pPage->pBt->db!=0 );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
  assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
  assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
  assert( pPage->isInit==1 );
  assert( pPage->nFree<0 );

  usableSize = pPage->pBt->usableSize;
  hdr = pPage->hdrOffset;
  data = pPage->aData;













  /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates
  ** the start of the cell content area. A zero value for this integer is
  ** interpreted as 65536. */
  top = get2byteNotZero(&data[hdr+5]);






















  iCellFirst = hdr + 8 + pPage->childPtrSize + 2*pPage->nCell;
  iCellLast = usableSize - 4;





















  /* Compute the total free space on the page
  ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the
  ** start of the first freeblock on the page, or is zero if there are no
  ** freeblocks. */
  pc = get2byte(&data[hdr+1]);
  nFree = data[hdr+7] + top;  /* Init nFree to non-freeblock free space */
................................................................................
  ** serves to verify that the offset to the start of the cell-content
  ** area, according to the page header, lies within the page.
  */
  if( nFree>usableSize ){
    return SQLITE_CORRUPT_PAGE(pPage);
  }
  pPage->nFree = (u16)(nFree - iCellFirst);
  return SQLITE_OK;
}

/*
** Initialize the auxiliary information for a disk block.
**
** Return SQLITE_OK on success.  If we see that the page does
** not contain a well-formed database page, then return 
** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
** guarantee that the page is well-formed.  It only shows that
** we failed to detect any corruption.
*/
static int btreeInitPage(MemPage *pPage){
  int pc;            /* Address of a freeblock within pPage->aData[] */
  u8 hdr;            /* Offset to beginning of page header */
  u8 *data;          /* Equal to pPage->aData */
  BtShared *pBt;        /* The main btree structure */
  int usableSize;    /* Amount of usable space on each page */
  u16 cellOffset;    /* Offset from start of page to first cell pointer */
  int iCellFirst;    /* First allowable cell or freeblock offset */
  int iCellLast;     /* Last possible cell or freeblock offset */

  assert( pPage->pBt!=0 );
  assert( pPage->pBt->db!=0 );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
  assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
  assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
  assert( pPage->isInit==0 );

  pBt = pPage->pBt;
  hdr = pPage->hdrOffset;
  data = pPage->aData;
  /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating
  ** the b-tree page type. */
  if( decodeFlags(pPage, data[hdr]) ){
    return SQLITE_CORRUPT_PAGE(pPage);
  }
  assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
  pPage->maskPage = (u16)(pBt->pageSize - 1);
  pPage->nOverflow = 0;
  usableSize = pBt->usableSize;
  pPage->cellOffset = cellOffset = hdr + 8 + pPage->childPtrSize;
  pPage->aDataEnd = &data[usableSize];
  pPage->aCellIdx = &data[cellOffset];
  pPage->aDataOfst = &data[pPage->childPtrSize];
  /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
  ** number of cells on the page. */
  pPage->nCell = get2byte(&data[hdr+3]);
  if( pPage->nCell>MX_CELL(pBt) ){
    /* To many cells for a single page.  The page must be corrupt */
    return SQLITE_CORRUPT_PAGE(pPage);
  }
  testcase( pPage->nCell==MX_CELL(pBt) );
  /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only
  ** possible for a root page of a table that contains no rows) then the
  ** offset to the cell content area will equal the page size minus the
  ** bytes of reserved space. */
  assert( pPage->nCell>0
       || get2byteNotZero(&data[hdr+5])==usableSize
       || CORRUPT_DB );

  /* A malformed database page might cause us to read past the end
  ** of page when parsing a cell.  
  **
  ** The following block of code checks early to see if a cell extends
  ** past the end of a page boundary and causes SQLITE_CORRUPT to be 
  ** returned if it does.
  */
  iCellFirst = cellOffset + 2*pPage->nCell;
  iCellLast = usableSize - 4;
  if( pBt->db->flags & SQLITE_CellSizeCk ){
    int i;            /* Index into the cell pointer array */
    int sz;           /* Size of a cell */

    if( !pPage->leaf ) iCellLast--;
    for(i=0; i<pPage->nCell; i++){
      pc = get2byteAligned(&data[cellOffset+i*2]);
      testcase( pc==iCellFirst );
      testcase( pc==iCellLast );
      if( pc<iCellFirst || pc>iCellLast ){
        return SQLITE_CORRUPT_PAGE(pPage);
      }
      sz = pPage->xCellSize(pPage, &data[pc]);
      testcase( pc+sz==usableSize );
      if( pc+sz>usableSize ){
        return SQLITE_CORRUPT_PAGE(pPage);
      }
    }
    if( !pPage->leaf ) iCellLast++;
  }  
  pPage->nFree = -1;  /* Indicate that this value is yet uncomputed */
  pPage->isInit = 1;
  return SQLITE_OK;
}

/*
** Set up a raw page so that it looks like a database page holding
** no entries.
................................................................................
  assert( sqlite3_mutex_held(pBt->mutex) );
  assert( pCur==0 || ppPage==&pCur->pPage );
  assert( pCur==0 || bReadOnly==pCur->curPagerFlags );
  assert( pCur==0 || pCur->iPage>0 );

  if( pgno>btreePagecount(pBt) ){
    rc = SQLITE_CORRUPT_BKPT;
    goto getAndInitPage_error1;
  }
  rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly);
  if( rc ){
    goto getAndInitPage_error1;
  }
  *ppPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
  if( (*ppPage)->isInit==0 ){
    btreePageFromDbPage(pDbPage, pgno, pBt);
    rc = btreeInitPage(*ppPage);
    if( rc!=SQLITE_OK ){

      goto getAndInitPage_error2;
    }
  }
  assert( (*ppPage)->pgno==pgno );
  assert( (*ppPage)->aData==sqlite3PagerGetData(pDbPage) );

  /* If obtaining a child page for a cursor, we must verify that the page is
  ** compatible with the root page. */
  if( pCur && ((*ppPage)->nCell<1 || (*ppPage)->intKey!=pCur->curIntKey) ){
    rc = SQLITE_CORRUPT_PGNO(pgno);

    goto getAndInitPage_error2;
  }
  return SQLITE_OK;

getAndInitPage_error2:
  releasePage(*ppPage);
getAndInitPage_error1:
  if( pCur ){
    pCur->iPage--;
    pCur->pPage = pCur->apPage[pCur->iPage];
  }
  testcase( pgno==0 );
  assert( pgno!=0 || rc==SQLITE_CORRUPT );
  return rc;
................................................................................
  int hdr;        /* Beginning of the header.  0 most pages.  100 page 1 */

  if( *pRC ) return;
  assert( idx>=0 && idx<pPage->nCell );
  assert( CORRUPT_DB || sz==cellSize(pPage, idx) );
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( pPage->nFree>=0 );
  data = pPage->aData;
  ptr = &pPage->aCellIdx[2*idx];
  pc = get2byte(ptr);
  hdr = pPage->hdrOffset;
  testcase( pc==get2byte(&data[hdr+5]) );
  testcase( pc+sz==pPage->pBt->usableSize );
  if( pc+sz > pPage->pBt->usableSize ){
................................................................................
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  /* The cell should normally be sized correctly.  However, when moving a
  ** malformed cell from a leaf page to an interior page, if the cell size
  ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size
  ** might be less than 8 (leaf-size + pointer) on the interior node.  Hence
  ** the term after the || in the following assert(). */
  assert( sz==pPage->xCellSize(pPage, pCell) || (sz==8 && iChild>0) );
  assert( pPage->nFree>=0 );
  if( pPage->nOverflow || sz+2>pPage->nFree ){
    if( pTemp ){
      memcpy(pTemp, pCell, sz);
      pCell = pTemp;
    }
    if( iChild ){
      put4byte(pCell, iChild);
................................................................................
  MemPage *pNew;                       /* Newly allocated page */
  int rc;                              /* Return Code */
  Pgno pgnoNew;                        /* Page number of pNew */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  assert( pPage->nOverflow==1 );
  
  if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT;  /* dbfuzz001.test */
  if( pPage->nFree<0 ){
    rc = btreeComputeFreeSpace(pPage);
    if( rc ) return rc;
  }
  if( pParent->nFree<0 ){
    rc = btreeComputeFreeSpace(pParent);
    if( rc ) return rc;
  }


  /* Allocate a new page. This page will become the right-sibling of 
  ** pPage. Make the parent page writable, so that the new divider cell
  ** may be inserted. If both these operations are successful, proceed.
  */
  rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);

................................................................................
  */
  assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
  assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );

  if( !aOvflSpace ){
    return SQLITE_NOMEM_BKPT;
  }
  if( pParent->nFree<0 ){
    rc = btreeComputeFreeSpace(pParent);
    if( rc ) return rc;
  }

  /* Find the sibling pages to balance. Also locate the cells in pParent 
  ** that divide the siblings. An attempt is made to find NN siblings on 
  ** either side of pPage. More siblings are taken from one side, however, 
  ** if there are fewer than NN siblings on the other side. If pParent
  ** has NB or fewer children then all children of pParent are taken.  
  **
................................................................................
    pRight = &pParent->aData[pParent->hdrOffset+8];
  }else{
    pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
  }
  pgno = get4byte(pRight);
  while( 1 ){
    rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0);
    if( rc==0 && apOld[i]->nFree<0 ){
      rc = btreeComputeFreeSpace(apOld[i]);
    }
    if( rc ){
      memset(apOld, 0, (i+1)*sizeof(MemPage*));
      goto balance_cleanup;
    }
    nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
    if( (i--)==0 ) break;

................................................................................
    b.apEnd[k] = p->aDataEnd;
    b.ixNx[k] = cntOld[i];
    if( !leafData ){
      k++;
      b.apEnd[k] = pParent->aDataEnd;
      b.ixNx[k] = cntOld[i]+1;
    }
    assert( p->nFree>=0 );
    szNew[i] = usableSpace - p->nFree;
    for(j=0; j<p->nOverflow; j++){
      szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);
    }
    cntNew[i] = cntOld[i];
  }
  k = nOld;
................................................................................
  VVA_ONLY( int balance_quick_called = 0 );
  VVA_ONLY( int balance_deeper_called = 0 );

  do {
    int iPage = pCur->iPage;
    MemPage *pPage = pCur->pPage;

    if( pPage->nFree<0 ){
      rc = btreeComputeFreeSpace(pPage);
      if( rc ) break;
    }
    if( iPage==0 ){
      if( pPage->nOverflow ){
        /* The root page of the b-tree is overfull. In this case call the
        ** balance_deeper() function to create a new child for the root-page
        ** and copy the current contents of the root-page to it. The
        ** next iteration of the do-loop will balance the child page.
        */ 
................................................................................

  }
  assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) );

  pPage = pCur->pPage;
  assert( pPage->intKey || pX->nKey>=0 );
  assert( pPage->leaf || !pPage->intKey );
  if( pPage->nFree<0 ){
    rc = btreeComputeFreeSpace(pPage);
    if( rc ) return rc;
  }

  TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
          pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno,
          loc==0 ? "overwrite" : "new entry"));
  assert( pPage->isInit );
  newCell = pBt->pTmpSpace;
  assert( newCell!=0 );
................................................................................
  assert( pCur->eState==CURSOR_VALID );
  assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 );

  iCellDepth = pCur->iPage;
  iCellIdx = pCur->ix;
  pPage = pCur->pPage;
  pCell = findCell(pPage, iCellIdx);
  if( pPage->nFree<0 && btreeComputeFreeSpace(pPage) ) return SQLITE_CORRUPT;

  /* If the bPreserve flag is set to true, then the cursor position must
  ** be preserved following this delete operation. If the current delete
  ** will cause a b-tree rebalance, then this is done by saving the cursor
  ** key and leaving the cursor in CURSOR_REQUIRESEEK state before 
  ** returning. 
  **
................................................................................
  ** node to replace the deleted cell.  */
  if( !pPage->leaf ){
    MemPage *pLeaf = pCur->pPage;
    int nCell;
    Pgno n;
    unsigned char *pTmp;

    if( pLeaf->nFree<0 ){
      rc = btreeComputeFreeSpace(pLeaf);
      if( rc ) return rc;
    }
    if( iCellDepth<pCur->iPage-1 ){
      n = pCur->apPage[iCellDepth+1]->pgno;
    }else{
      n = pCur->pPage->pgno;
    }
    pCell = findCell(pLeaf, pLeaf->nCell-1);
    if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT;
................................................................................
  savedIsInit = pPage->isInit;
  pPage->isInit = 0;
  if( (rc = btreeInitPage(pPage))!=0 ){
    assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
    checkAppendMsg(pCheck,
                   "btreeInitPage() returns error code %d", rc);
    goto end_of_check;
  }
  if( (rc = btreeComputeFreeSpace(pPage))!=0 ){
    assert( rc==SQLITE_CORRUPT );
    checkAppendMsg(pCheck, "free space corruption", rc);
    goto end_of_check;
  }
  data = pPage->aData;
  hdr = pPage->hdrOffset;

  /* Set up for cell analysis */
  pCheck->zPfx = "On tree page %d cell %d: ";
  contentOffset = get2byteNotZero(&data[hdr+5]);

Changes to src/btreeInt.h.

282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
  u8 hdrOffset;        /* 100 for page 1.  0 otherwise */
  u8 childPtrSize;     /* 0 if leaf==1.  4 if leaf==0 */
  u8 max1bytePayload;  /* min(maxLocal,127) */
  u8 nOverflow;        /* Number of overflow cell bodies in aCell[] */
  u16 maxLocal;        /* Copy of BtShared.maxLocal or BtShared.maxLeaf */
  u16 minLocal;        /* Copy of BtShared.minLocal or BtShared.minLeaf */
  u16 cellOffset;      /* Index in aData of first cell pointer */
  u16 nFree;           /* Number of free bytes on the page */
  u16 nCell;           /* Number of cells on this page, local and ovfl */
  u16 maskPage;        /* Mask for page offset */
  u16 aiOvfl[4];       /* Insert the i-th overflow cell before the aiOvfl-th
                       ** non-overflow cell */
  u8 *apOvfl[4];       /* Pointers to the body of overflow cells */
  BtShared *pBt;       /* Pointer to BtShared that this page is part of */
  u8 *aData;           /* Pointer to disk image of the page data */







|







282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
  u8 hdrOffset;        /* 100 for page 1.  0 otherwise */
  u8 childPtrSize;     /* 0 if leaf==1.  4 if leaf==0 */
  u8 max1bytePayload;  /* min(maxLocal,127) */
  u8 nOverflow;        /* Number of overflow cell bodies in aCell[] */
  u16 maxLocal;        /* Copy of BtShared.maxLocal or BtShared.maxLeaf */
  u16 minLocal;        /* Copy of BtShared.minLocal or BtShared.minLeaf */
  u16 cellOffset;      /* Index in aData of first cell pointer */
  int nFree;           /* Number of free bytes on the page */
  u16 nCell;           /* Number of cells on this page, local and ovfl */
  u16 maskPage;        /* Mask for page offset */
  u16 aiOvfl[4];       /* Insert the i-th overflow cell before the aiOvfl-th
                       ** non-overflow cell */
  u8 *apOvfl[4];       /* Pointers to the body of overflow cells */
  BtShared *pBt;       /* Pointer to BtShared that this page is part of */
  u8 *aData;           /* Pointer to disk image of the page data */