/ Changes On Branch sorter-coalesce-writes
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Changes In Branch sorter-coalesce-writes Excluding Merge-Ins

This is equivalent to a diff from 541e9310a7 to 2e5741f774

2012-08-06
19:28
Merge the sorter-coalesce-writes branch into the trunk. This improves CREATE INDEX performance on some platforms. (check-in: e1e9cb08b0 user: dan tags: trunk)
19:12
Fix a crash that could follow an OOM condition. (Closed-Leaf check-in: 2e5741f774 user: dan tags: sorter-coalesce-writes)
18:50
When reusing pages as part of creating a new index, allocate the leaves from each free-list trunk page in ascending order, instead of trying to maximize localization for each individual allocation. This increases the chance that pages will be written to disk in ascending order by a large CREATE INDEX statement, improving overall performance. (check-in: d045f8b2d4 user: dan tags: sorter-coalesce-writes)
18:10
Update sorter-coalesce-writes branch with latest trunk changes. (check-in: 214f8cda17 user: dan tags: sorter-coalesce-writes)
10:51
Update description strings in the VSIX package. (check-in: 541e9310a7 user: mistachkin tags: trunk)
2012-08-03
10:00
Correct comment about WinRT compiler define. No changes to code. (check-in: 335e91e599 user: mistachkin tags: trunk)

Changes to src/analyze.c.

172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
      ** side-effect of the CREATE TABLE statement is to leave the rootpage 
      ** of the new table in register pParse->regRoot. This is important 
      ** because the OpenWrite opcode below will be needing it. */
      sqlite3NestedParse(pParse,
          "CREATE TABLE %Q.%s(%s)", pDb->zName, zTab, aTable[i].zCols
      );
      aRoot[i] = pParse->regRoot;
      aCreateTbl[i] = 1;
    }else{
      /* The table already exists. If zWhere is not NULL, delete all entries 
      ** associated with the table zWhere. If zWhere is NULL, delete the
      ** entire contents of the table. */
      aRoot[i] = pStat->tnum;
      sqlite3TableLock(pParse, iDb, aRoot[i], 1, zTab);
      if( zWhere ){







|







172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
      ** side-effect of the CREATE TABLE statement is to leave the rootpage 
      ** of the new table in register pParse->regRoot. This is important 
      ** because the OpenWrite opcode below will be needing it. */
      sqlite3NestedParse(pParse,
          "CREATE TABLE %Q.%s(%s)", pDb->zName, zTab, aTable[i].zCols
      );
      aRoot[i] = pParse->regRoot;
      aCreateTbl[i] = OPFLAG_P2ISREG;
    }else{
      /* The table already exists. If zWhere is not NULL, delete all entries 
      ** associated with the table zWhere. If zWhere is NULL, delete the
      ** entire contents of the table. */
      aRoot[i] = pStat->tnum;
      sqlite3TableLock(pParse, iDb, aRoot[i], 1, zTab);
      if( zWhere ){

Changes to src/btree.c.

5922
5923
5924
5925
5926
5927
5928
5929

5930
5931
5932
5933
5934
5935
5936
** If aOvflSpace is set to a null pointer, this function returns 
** SQLITE_NOMEM.
*/
static int balance_nonroot(
  MemPage *pParent,               /* Parent page of siblings being balanced */
  int iParentIdx,                 /* Index of "the page" in pParent */
  u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
  int isRoot                      /* True if pParent is a root-page */

){
  BtShared *pBt;               /* The whole database */
  int nCell = 0;               /* Number of cells in apCell[] */
  int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
  int nNew = 0;                /* Number of pages in apNew[] */
  int nOld;                    /* Number of pages in apOld[] */
  int i, j, k;                 /* Loop counters */







|
>







5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
** If aOvflSpace is set to a null pointer, this function returns 
** SQLITE_NOMEM.
*/
static int balance_nonroot(
  MemPage *pParent,               /* Parent page of siblings being balanced */
  int iParentIdx,                 /* Index of "the page" in pParent */
  u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
  int isRoot,                     /* True if pParent is a root-page */
  int bBulk                       /* True if this call is part of a bulk load */
){
  BtShared *pBt;               /* The whole database */
  int nCell = 0;               /* Number of cells in apCell[] */
  int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
  int nNew = 0;                /* Number of pages in apNew[] */
  int nOld;                    /* Number of pages in apOld[] */
  int i, j, k;                 /* Loop counters */
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
      pNew = apNew[i] = apOld[i];
      apOld[i] = 0;
      rc = sqlite3PagerWrite(pNew->pDbPage);
      nNew++;
      if( rc ) goto balance_cleanup;
    }else{
      assert( i>0 );
      rc = allocateBtreePage(pBt, &pNew, &pgno, pgno, 0);
      if( rc ) goto balance_cleanup;
      apNew[i] = pNew;
      nNew++;

      /* Set the pointer-map entry for the new sibling page. */
      if( ISAUTOVACUUM ){
        ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);







|







6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
      pNew = apNew[i] = apOld[i];
      apOld[i] = 0;
      rc = sqlite3PagerWrite(pNew->pDbPage);
      nNew++;
      if( rc ) goto balance_cleanup;
    }else{
      assert( i>0 );
      rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
      if( rc ) goto balance_cleanup;
      apNew[i] = pNew;
      nNew++;

      /* Set the pointer-map entry for the new sibling page. */
      if( ISAUTOVACUUM ){
        ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
          ** different page). Once this subsequent call to balance_nonroot() 
          ** has completed, it is safe to release the pSpace buffer used by
          ** the previous call, as the overflow cell data will have been 
          ** copied either into the body of a database page or into the new
          ** pSpace buffer passed to the latter call to balance_nonroot().
          */
          u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
          rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1);
          if( pFree ){
            /* If pFree is not NULL, it points to the pSpace buffer used 
            ** by a previous call to balance_nonroot(). Its contents are
            ** now stored either on real database pages or within the 
            ** new pSpace buffer, so it may be safely freed here. */
            sqlite3PageFree(pFree);
          }







|







6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
          ** different page). Once this subsequent call to balance_nonroot() 
          ** has completed, it is safe to release the pSpace buffer used by
          ** the previous call, as the overflow cell data will have been 
          ** copied either into the body of a database page or into the new
          ** pSpace buffer passed to the latter call to balance_nonroot().
          */
          u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
          rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1, pCur->hints);
          if( pFree ){
            /* If pFree is not NULL, it points to the pSpace buffer used 
            ** by a previous call to balance_nonroot(). Its contents are
            ** now stored either on real database pages or within the 
            ** new pSpace buffer, so it may be safely freed here. */
            sqlite3PageFree(pFree);
          }
8290
8291
8292
8293
8294
8295
8296










      }
    }
  }

  pBt->btsFlags &= ~BTS_NO_WAL;
  return rc;
}

















>
>
>
>
>
>
>
>
>
>
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
      }
    }
  }

  pBt->btsFlags &= ~BTS_NO_WAL;
  return rc;
}

/*
** set the mask of hint flags for cursor pCsr. Currently the only valid
** values are 0 and BTREE_BULKLOAD.
*/
void sqlite3BtreeCursorHints(BtCursor *pCsr, unsigned int mask){
  assert( mask==BTREE_BULKLOAD || mask==0 );
  pCsr->hints = mask;
}

Changes to src/btree.h.

131
132
133
134
135
136
137






138
139
140
141
142
143
144
#define BTREE_FILE_FORMAT         2
#define BTREE_DEFAULT_CACHE_SIZE  3
#define BTREE_LARGEST_ROOT_PAGE   4
#define BTREE_TEXT_ENCODING       5
#define BTREE_USER_VERSION        6
#define BTREE_INCR_VACUUM         7







int sqlite3BtreeCursor(
  Btree*,                              /* BTree containing table to open */
  int iTable,                          /* Index of root page */
  int wrFlag,                          /* 1 for writing.  0 for read-only */
  struct KeyInfo*,                     /* First argument to compare function */
  BtCursor *pCursor                    /* Space to write cursor structure */
);







>
>
>
>
>
>







131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#define BTREE_FILE_FORMAT         2
#define BTREE_DEFAULT_CACHE_SIZE  3
#define BTREE_LARGEST_ROOT_PAGE   4
#define BTREE_TEXT_ENCODING       5
#define BTREE_USER_VERSION        6
#define BTREE_INCR_VACUUM         7

/*
** Values that may be OR'd together to form the second argument of an
** sqlite3BtreeCursorHints() call.
*/
#define BTREE_BULKLOAD 0x00000001

int sqlite3BtreeCursor(
  Btree*,                              /* BTree containing table to open */
  int iTable,                          /* Index of root page */
  int wrFlag,                          /* 1 for writing.  0 for read-only */
  struct KeyInfo*,                     /* First argument to compare function */
  BtCursor *pCursor                    /* Space to write cursor structure */
);
174
175
176
177
178
179
180
181
182

183
184
185
186
187
188
189

char *sqlite3BtreeIntegrityCheck(Btree*, int *aRoot, int nRoot, int, int*);
struct Pager *sqlite3BtreePager(Btree*);

int sqlite3BtreePutData(BtCursor*, u32 offset, u32 amt, void*);
void sqlite3BtreeCacheOverflow(BtCursor *);
void sqlite3BtreeClearCursor(BtCursor *);

int sqlite3BtreeSetVersion(Btree *pBt, int iVersion);


#ifndef NDEBUG
int sqlite3BtreeCursorIsValid(BtCursor*);
#endif

#ifndef SQLITE_OMIT_BTREECOUNT
int sqlite3BtreeCount(BtCursor *, i64 *);







<

>







180
181
182
183
184
185
186

187
188
189
190
191
192
193
194
195

char *sqlite3BtreeIntegrityCheck(Btree*, int *aRoot, int nRoot, int, int*);
struct Pager *sqlite3BtreePager(Btree*);

int sqlite3BtreePutData(BtCursor*, u32 offset, u32 amt, void*);
void sqlite3BtreeCacheOverflow(BtCursor *);
void sqlite3BtreeClearCursor(BtCursor *);

int sqlite3BtreeSetVersion(Btree *pBt, int iVersion);
void sqlite3BtreeCursorHints(BtCursor *, unsigned int mask);

#ifndef NDEBUG
int sqlite3BtreeCursorIsValid(BtCursor*);
#endif

#ifndef SQLITE_OMIT_BTREECOUNT
int sqlite3BtreeCount(BtCursor *, i64 *);

Changes to src/btreeInt.h.

506
507
508
509
510
511
512

513
514
515
516
517
518
519
  u8 wrFlag;                /* True if writable */
  u8 atLast;                /* Cursor pointing to the last entry */
  u8 validNKey;             /* True if info.nKey is valid */
  u8 eState;                /* One of the CURSOR_XXX constants (see below) */
#ifndef SQLITE_OMIT_INCRBLOB
  u8 isIncrblobHandle;      /* True if this cursor is an incr. io handle */
#endif

  i16 iPage;                            /* Index of current page in apPage */
  u16 aiIdx[BTCURSOR_MAX_DEPTH];        /* Current index in apPage[i] */
  MemPage *apPage[BTCURSOR_MAX_DEPTH];  /* Pages from root to current page */
};

/*
** Potential values for BtCursor.eState.







>







506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
  u8 wrFlag;                /* True if writable */
  u8 atLast;                /* Cursor pointing to the last entry */
  u8 validNKey;             /* True if info.nKey is valid */
  u8 eState;                /* One of the CURSOR_XXX constants (see below) */
#ifndef SQLITE_OMIT_INCRBLOB
  u8 isIncrblobHandle;      /* True if this cursor is an incr. io handle */
#endif
  u8 hints;                             /* As configured by CursorSetHints() */
  i16 iPage;                            /* Index of current page in apPage */
  u16 aiIdx[BTCURSOR_MAX_DEPTH];        /* Current index in apPage[i] */
  MemPage *apPage[BTCURSOR_MAX_DEPTH];  /* Pages from root to current page */
};

/*
** Potential values for BtCursor.eState.

Changes to src/build.c.

1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
    */
    if( pSelect ){
      SelectDest dest;
      Table *pSelTab;

      assert(pParse->nTab==1);
      sqlite3VdbeAddOp3(v, OP_OpenWrite, 1, pParse->regRoot, iDb);
      sqlite3VdbeChangeP5(v, 1);
      pParse->nTab = 2;
      sqlite3SelectDestInit(&dest, SRT_Table, 1);
      sqlite3Select(pParse, pSelect, &dest);
      sqlite3VdbeAddOp1(v, OP_Close, 1);
      if( pParse->nErr==0 ){
        pSelTab = sqlite3ResultSetOfSelect(pParse, pSelect);
        if( pSelTab==0 ) return;







|







1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
    */
    if( pSelect ){
      SelectDest dest;
      Table *pSelTab;

      assert(pParse->nTab==1);
      sqlite3VdbeAddOp3(v, OP_OpenWrite, 1, pParse->regRoot, iDb);
      sqlite3VdbeChangeP5(v, OPFLAG_P2ISREG);
      pParse->nTab = 2;
      sqlite3SelectDestInit(&dest, SRT_Table, 1);
      sqlite3Select(pParse, pSelect, &dest);
      sqlite3VdbeAddOp1(v, OP_Close, 1);
      if( pParse->nErr==0 ){
        pSelTab = sqlite3ResultSetOfSelect(pParse, pSelect);
        if( pSelTab==0 ) return;
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
  }else{
    tnum = pIndex->tnum;
    sqlite3VdbeAddOp2(v, OP_Clear, tnum, iDb);
  }
  pKey = sqlite3IndexKeyinfo(pParse, pIndex);
  sqlite3VdbeAddOp4(v, OP_OpenWrite, iIdx, tnum, iDb, 
                    (char *)pKey, P4_KEYINFO_HANDOFF);
  if( memRootPage>=0 ){
    sqlite3VdbeChangeP5(v, 1);
  }

#ifndef SQLITE_OMIT_MERGE_SORT
  /* Open the sorter cursor if we are to use one. */
  iSorter = pParse->nTab++;
  sqlite3VdbeAddOp4(v, OP_SorterOpen, iSorter, 0, 0, (char*)pKey, P4_KEYINFO);
#else
  iSorter = iTab;







<
|
<







2393
2394
2395
2396
2397
2398
2399

2400

2401
2402
2403
2404
2405
2406
2407
  }else{
    tnum = pIndex->tnum;
    sqlite3VdbeAddOp2(v, OP_Clear, tnum, iDb);
  }
  pKey = sqlite3IndexKeyinfo(pParse, pIndex);
  sqlite3VdbeAddOp4(v, OP_OpenWrite, iIdx, tnum, iDb, 
                    (char *)pKey, P4_KEYINFO_HANDOFF);

  sqlite3VdbeChangeP5(v, OPFLAG_BULKCSR|((memRootPage>=0)?OPFLAG_P2ISREG:0));


#ifndef SQLITE_OMIT_MERGE_SORT
  /* Open the sorter cursor if we are to use one. */
  iSorter = pParse->nTab++;
  sqlite3VdbeAddOp4(v, OP_SorterOpen, iSorter, 0, 0, (char*)pKey, P4_KEYINFO);
#else
  iSorter = iTab;

Changes to src/sqliteInt.h.

2313
2314
2315
2316
2317
2318
2319


2320
2321
2322
2323
2324
2325
2326
#define OPFLAG_LASTROWID     0x02    /* Set to update db->lastRowid */
#define OPFLAG_ISUPDATE      0x04    /* This OP_Insert is an sql UPDATE */
#define OPFLAG_APPEND        0x08    /* This is likely to be an append */
#define OPFLAG_USESEEKRESULT 0x10    /* Try to avoid a seek in BtreeInsert() */
#define OPFLAG_CLEARCACHE    0x20    /* Clear pseudo-table cache in OP_Column */
#define OPFLAG_LENGTHARG     0x40    /* OP_Column only used for length() */
#define OPFLAG_TYPEOFARG     0x80    /* OP_Column only used for typeof() */



/*
 * Each trigger present in the database schema is stored as an instance of
 * struct Trigger. 
 *
 * Pointers to instances of struct Trigger are stored in two ways.
 * 1. In the "trigHash" hash table (part of the sqlite3* that represents the 







>
>







2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
#define OPFLAG_LASTROWID     0x02    /* Set to update db->lastRowid */
#define OPFLAG_ISUPDATE      0x04    /* This OP_Insert is an sql UPDATE */
#define OPFLAG_APPEND        0x08    /* This is likely to be an append */
#define OPFLAG_USESEEKRESULT 0x10    /* Try to avoid a seek in BtreeInsert() */
#define OPFLAG_CLEARCACHE    0x20    /* Clear pseudo-table cache in OP_Column */
#define OPFLAG_LENGTHARG     0x40    /* OP_Column only used for length() */
#define OPFLAG_TYPEOFARG     0x80    /* OP_Column only used for typeof() */
#define OPFLAG_BULKCSR       0x01    /* OP_Open** used to open bulk cursor */
#define OPFLAG_P2ISREG       0x02    /* P2 to OP_Open** is a register number */

/*
 * Each trigger present in the database schema is stored as an instance of
 * struct Trigger. 
 *
 * Pointers to instances of struct Trigger are stored in two ways.
 * 1. In the "trigHash" hash table (part of the sqlite3* that represents the 

Changes to src/test_vfs.c.

357
358
359
360
361
362
363
364

365
366
367
368
369
370
371
){
  int rc = SQLITE_OK;
  TestvfsFd *pFd = tvfsGetFd(pFile);
  Testvfs *p = (Testvfs *)pFd->pVfs->pAppData;

  if( p->pScript && p->mask&TESTVFS_WRITE_MASK ){
    tvfsExecTcl(p, "xWrite", 
        Tcl_NewStringObj(pFd->zFilename, -1), pFd->pShmId, 0

    );
    tvfsResultCode(p, &rc);
  }

  if( rc==SQLITE_OK && tvfsInjectFullerr(p) ){
    rc = SQLITE_FULL;
  }







|
>







357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
){
  int rc = SQLITE_OK;
  TestvfsFd *pFd = tvfsGetFd(pFile);
  Testvfs *p = (Testvfs *)pFd->pVfs->pAppData;

  if( p->pScript && p->mask&TESTVFS_WRITE_MASK ){
    tvfsExecTcl(p, "xWrite", 
        Tcl_NewStringObj(pFd->zFilename, -1), pFd->pShmId, 
        Tcl_NewWideIntObj(iOfst)
    );
    tvfsResultCode(p, &rc);
  }

  if( rc==SQLITE_OK && tvfsInjectFullerr(p) ){
    rc = SQLITE_FULL;
  }

Changes to src/vdbe.c.

3116
3117
3118
3119
3120
3121
3122



3123
3124
3125
3126
3127
3128
3129
  int p2;
  int iDb;
  int wrFlag;
  Btree *pX;
  VdbeCursor *pCur;
  Db *pDb;




  if( p->expired ){
    rc = SQLITE_ABORT;
    break;
  }

  nField = 0;
  pKeyInfo = 0;







>
>
>







3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
  int p2;
  int iDb;
  int wrFlag;
  Btree *pX;
  VdbeCursor *pCur;
  Db *pDb;

  assert( (pOp->p5&(OPFLAG_P2ISREG|OPFLAG_BULKCSR))==pOp->p5 );
  assert( pOp->opcode==OP_OpenWrite || pOp->p5==0 );

  if( p->expired ){
    rc = SQLITE_ABORT;
    break;
  }

  nField = 0;
  pKeyInfo = 0;
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
    assert( sqlite3SchemaMutexHeld(db, iDb, 0) );
    if( pDb->pSchema->file_format < p->minWriteFileFormat ){
      p->minWriteFileFormat = pDb->pSchema->file_format;
    }
  }else{
    wrFlag = 0;
  }
  if( pOp->p5 ){
    assert( p2>0 );
    assert( p2<=p->nMem );
    pIn2 = &aMem[p2];
    assert( memIsValid(pIn2) );
    assert( (pIn2->flags & MEM_Int)!=0 );
    sqlite3VdbeMemIntegerify(pIn2);
    p2 = (int)pIn2->u.i;







|







3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
    assert( sqlite3SchemaMutexHeld(db, iDb, 0) );
    if( pDb->pSchema->file_format < p->minWriteFileFormat ){
      p->minWriteFileFormat = pDb->pSchema->file_format;
    }
  }else{
    wrFlag = 0;
  }
  if( pOp->p5 & OPFLAG_P2ISREG ){
    assert( p2>0 );
    assert( p2<=p->nMem );
    pIn2 = &aMem[p2];
    assert( memIsValid(pIn2) );
    assert( (pIn2->flags & MEM_Int)!=0 );
    sqlite3VdbeMemIntegerify(pIn2);
    p2 = (int)pIn2->u.i;
3170
3171
3172
3173
3174
3175
3176


3177
3178
3179
3180
3181
3182
3183
  assert( pOp->p1>=0 );
  pCur = allocateCursor(p, pOp->p1, nField, iDb, 1);
  if( pCur==0 ) goto no_mem;
  pCur->nullRow = 1;
  pCur->isOrdered = 1;
  rc = sqlite3BtreeCursor(pX, p2, wrFlag, pKeyInfo, pCur->pCursor);
  pCur->pKeyInfo = pKeyInfo;



  /* Since it performs no memory allocation or IO, the only value that
  ** sqlite3BtreeCursor() may return is SQLITE_OK. */
  assert( rc==SQLITE_OK );

  /* Set the VdbeCursor.isTable and isIndex variables. Previous versions of
  ** SQLite used to check if the root-page flags were sane at this point







>
>







3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
  assert( pOp->p1>=0 );
  pCur = allocateCursor(p, pOp->p1, nField, iDb, 1);
  if( pCur==0 ) goto no_mem;
  pCur->nullRow = 1;
  pCur->isOrdered = 1;
  rc = sqlite3BtreeCursor(pX, p2, wrFlag, pKeyInfo, pCur->pCursor);
  pCur->pKeyInfo = pKeyInfo;
  assert( OPFLAG_BULKCSR==BTREE_BULKLOAD );
  sqlite3BtreeCursorHints(pCur->pCursor, (pOp->p5 & OPFLAG_BULKCSR));

  /* Since it performs no memory allocation or IO, the only value that
  ** sqlite3BtreeCursor() may return is SQLITE_OK. */
  assert( rc==SQLITE_OK );

  /* Set the VdbeCursor.isTable and isIndex variables. Previous versions of
  ** SQLite used to check if the root-page flags were sane at this point

Changes to src/vdbesort.c.

18
19
20
21
22
23
24

25
26
27
28
29
30
31
#include "sqliteInt.h"
#include "vdbeInt.h"

#ifndef SQLITE_OMIT_MERGE_SORT

typedef struct VdbeSorterIter VdbeSorterIter;
typedef struct SorterRecord SorterRecord;


/*
** NOTES ON DATA STRUCTURE USED FOR N-WAY MERGES:
**
** As keys are added to the sorter, they are written to disk in a series
** of sorted packed-memory-arrays (PMAs). The size of each PMA is roughly
** the same as the cache-size allowed for temporary databases. In order







>







18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
#include "sqliteInt.h"
#include "vdbeInt.h"

#ifndef SQLITE_OMIT_MERGE_SORT

typedef struct VdbeSorterIter VdbeSorterIter;
typedef struct SorterRecord SorterRecord;
typedef struct FileWriter FileWriter;

/*
** NOTES ON DATA STRUCTURE USED FOR N-WAY MERGES:
**
** As keys are added to the sorter, they are written to disk in a series
** of sorted packed-memory-arrays (PMAs). The size of each PMA is roughly
** the same as the cache-size allowed for temporary databases. In order
115
116
117
118
119
120
121
















122
123
124
125
126
127
128
  i64 iReadOff;                   /* Current read offset */
  i64 iEof;                       /* 1 byte past EOF for this iterator */
  int nAlloc;                     /* Bytes of space at aAlloc */
  int nKey;                       /* Number of bytes in key */
  sqlite3_file *pFile;            /* File iterator is reading from */
  u8 *aAlloc;                     /* Allocated space */
  u8 *aKey;                       /* Pointer to current key */
















};

/*
** A structure to store a single record. All in-memory records are connected
** together into a linked list headed at VdbeSorter.pRecord using the 
** SorterRecord.pNext pointer.
*/







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
  i64 iReadOff;                   /* Current read offset */
  i64 iEof;                       /* 1 byte past EOF for this iterator */
  int nAlloc;                     /* Bytes of space at aAlloc */
  int nKey;                       /* Number of bytes in key */
  sqlite3_file *pFile;            /* File iterator is reading from */
  u8 *aAlloc;                     /* Allocated space */
  u8 *aKey;                       /* Pointer to current key */
  u8 *aBuffer;                    /* Current read buffer */
  int nBuffer;                    /* Size of read buffer in bytes */
};

/*
** An instance of this structure is used to separate the stream of records
** being written to files by the merge-sort code into aligned, page-sized
** blocks.
*/
struct FileWriter {
  u8 *aBuffer;                    /* Pointer to write buffer */
  int nBuffer;                    /* Size of write buffer in bytes */
  int iBufStart;                  /* First byte of buffer to write */
  int iBufEnd;                    /* Last byte of buffer to write */
  i64 iWriteOff;                  /* Offset of start of buffer in file */
  sqlite3_file *pFile;            /* File to write to */
};

/*
** A structure to store a single record. All in-memory records are connected
** together into a linked list headed at VdbeSorter.pRecord using the 
** SorterRecord.pNext pointer.
*/
140
141
142
143
144
145
146

147
148


















































































































149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267



268
269
270

271
272
273
274



275
276
277

278







279



280




281

282


283
284
285
286
287
288
289

/*
** Free all memory belonging to the VdbeSorterIter object passed as the second
** argument. All structure fields are set to zero before returning.
*/
static void vdbeSorterIterZero(sqlite3 *db, VdbeSorterIter *pIter){
  sqlite3DbFree(db, pIter->aAlloc);

  memset(pIter, 0, sizeof(VdbeSorterIter));
}



















































































































/*
** Advance iterator pIter to the next key in its PMA. Return SQLITE_OK if
** no error occurs, or an SQLite error code if one does.
*/
static int vdbeSorterIterNext(
  sqlite3 *db,                    /* Database handle (for sqlite3DbMalloc() ) */
  VdbeSorterIter *pIter           /* Iterator to advance */
){
  int rc;                         /* Return Code */
  int nRead;                      /* Number of bytes read */
  int nRec = 0;                   /* Size of record in bytes */
  int iOff = 0;                   /* Size of serialized size varint in bytes */

  assert( pIter->iEof>=pIter->iReadOff );
  if( pIter->iEof-pIter->iReadOff>5 ){
    nRead = 5;
  }else{
    nRead = (int)(pIter->iEof - pIter->iReadOff);
  }
  if( nRead<=0 ){
    /* This is an EOF condition */
    vdbeSorterIterZero(db, pIter);
    return SQLITE_OK;
  }

  rc = sqlite3OsRead(pIter->pFile, pIter->aAlloc, nRead, pIter->iReadOff);
  if( rc==SQLITE_OK ){
    iOff = getVarint32(pIter->aAlloc, nRec);
    if( (iOff+nRec)>nRead ){
      int nRead2;                   /* Number of extra bytes to read */
      if( (iOff+nRec)>pIter->nAlloc ){
        int nNew = pIter->nAlloc*2;
        while( (iOff+nRec)>nNew ) nNew = nNew*2;
        pIter->aAlloc = sqlite3DbReallocOrFree(db, pIter->aAlloc, nNew);
        if( !pIter->aAlloc ) return SQLITE_NOMEM;
        pIter->nAlloc = nNew;
      }
  
      nRead2 = iOff + nRec - nRead;
      rc = sqlite3OsRead(
          pIter->pFile, &pIter->aAlloc[nRead], nRead2, pIter->iReadOff+nRead
      );
    }
  }

  assert( rc!=SQLITE_OK || nRec>0 );
  pIter->iReadOff += iOff+nRec;
  pIter->nKey = nRec;
  pIter->aKey = &pIter->aAlloc[iOff];
  return rc;
}

/*
** Write a single varint, value iVal, to file-descriptor pFile. Return
** SQLITE_OK if successful, or an SQLite error code if some error occurs.
**
** The value of *piOffset when this function is called is used as the byte
** offset in file pFile to write to. Before returning, *piOffset is 
** incremented by the number of bytes written.
*/
static int vdbeSorterWriteVarint(
  sqlite3_file *pFile,            /* File to write to */
  i64 iVal,                       /* Value to write as a varint */
  i64 *piOffset                   /* IN/OUT: Write offset in file pFile */
){
  u8 aVarint[9];                  /* Buffer large enough for a varint */
  int nVarint;                    /* Number of used bytes in varint */
  int rc;                         /* Result of write() call */

  nVarint = sqlite3PutVarint(aVarint, iVal);
  rc = sqlite3OsWrite(pFile, aVarint, nVarint, *piOffset);
  *piOffset += nVarint;

  return rc;
}

/*
** Read a single varint from file-descriptor pFile. Return SQLITE_OK if
** successful, or an SQLite error code if some error occurs.
**
** The value of *piOffset when this function is called is used as the
** byte offset in file pFile from whence to read the varint. If successful
** (i.e. if no IO error occurs), then *piOffset is set to the offset of
** the first byte past the end of the varint before returning. *piVal is
** set to the integer value read. If an error occurs, the final values of
** both *piOffset and *piVal are undefined.
*/
static int vdbeSorterReadVarint(
  sqlite3_file *pFile,            /* File to read from */
  i64 *piOffset,                  /* IN/OUT: Read offset in pFile */
  i64 *piVal                      /* OUT: Value read from file */
){
  u8 aVarint[9];                  /* Buffer large enough for a varint */
  i64 iOff = *piOffset;           /* Offset in file to read from */
  int rc;                         /* Return code */

  rc = sqlite3OsRead(pFile, aVarint, 9, iOff);
  if( rc==SQLITE_OK ){
    *piOffset += getVarint(aVarint, (u64 *)piVal);
  }

  return rc;
}

/*
** Initialize iterator pIter to scan through the PMA stored in file pFile
** starting at offset iStart and ending at offset iEof-1. This function 
** leaves the iterator pointing to the first key in the PMA (or EOF if the 
** PMA is empty).
*/
static int vdbeSorterIterInit(
  sqlite3 *db,                    /* Database handle */
  const VdbeSorter *pSorter,      /* Sorter object */
  i64 iStart,                     /* Start offset in pFile */
  VdbeSorterIter *pIter,          /* Iterator to populate */
  i64 *pnByte                     /* IN/OUT: Increment this value by PMA size */
){
  int rc;




  assert( pSorter->iWriteOff>iStart );
  assert( pIter->aAlloc==0 );

  pIter->pFile = pSorter->pTemp1;
  pIter->iReadOff = iStart;
  pIter->nAlloc = 128;
  pIter->aAlloc = (u8 *)sqlite3DbMallocRaw(db, pIter->nAlloc);



  if( !pIter->aAlloc ){
    rc = SQLITE_NOMEM;
  }else{

    i64 nByte;                         /* Total size of PMA in bytes */







    rc = vdbeSorterReadVarint(pSorter->pTemp1, &pIter->iReadOff, &nByte);



    *pnByte += nByte;




    pIter->iEof = pIter->iReadOff + nByte;

  }


  if( rc==SQLITE_OK ){
    rc = vdbeSorterIterNext(db, pIter);
  }
  return rc;
}









>


>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>










<
|
<

<
|
<
<
<
<
<





|

<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
<
<
<
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<


















|
>
>
>



>




>
>
>
|


>
|
>
>
>
>
>
>
>
|
>
>
>
|
>
>
>
>
|
>
|
>
>







157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290

291

292

293





294
295
296
297
298
299
300




















301



302















































303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368

/*
** Free all memory belonging to the VdbeSorterIter object passed as the second
** argument. All structure fields are set to zero before returning.
*/
static void vdbeSorterIterZero(sqlite3 *db, VdbeSorterIter *pIter){
  sqlite3DbFree(db, pIter->aAlloc);
  sqlite3DbFree(db, pIter->aBuffer);
  memset(pIter, 0, sizeof(VdbeSorterIter));
}

/*
** Read nByte bytes of data from the stream of data iterated by object p.
** If successful, set *ppOut to point to a buffer containing the data
** and return SQLITE_OK. Otherwise, if an error occurs, return an SQLite
** error code.
**
** The buffer indicated by *ppOut may only be considered valid until the
** next call to this function.
*/
static int vdbeSorterIterRead(
  sqlite3 *db,                    /* Database handle (for malloc) */
  VdbeSorterIter *p,              /* Iterator */
  int nByte,                      /* Bytes of data to read */
  u8 **ppOut                      /* OUT: Pointer to buffer containing data */
){
  int iBuf;                       /* Offset within buffer to read from */
  int nAvail;                     /* Bytes of data available in buffer */
  assert( p->aBuffer );

  /* If there is no more data to be read from the buffer, read the next 
  ** p->nBuffer bytes of data from the file into it. Or, if there are less
  ** than p->nBuffer bytes remaining in the PMA, read all remaining data.  */
  iBuf = p->iReadOff % p->nBuffer;
  if( iBuf==0 ){
    int nRead;                    /* Bytes to read from disk */
    int rc;                       /* sqlite3OsRead() return code */

    /* Determine how many bytes of data to read. */
    nRead = p->iEof - p->iReadOff;
    if( nRead>p->nBuffer ) nRead = p->nBuffer;
    assert( nRead>0 );

    /* Read data from the file. Return early if an error occurs. */
    rc = sqlite3OsRead(p->pFile, p->aBuffer, nRead, p->iReadOff);
    assert( rc!=SQLITE_IOERR_SHORT_READ );
    if( rc!=SQLITE_OK ) return rc;
  }
  nAvail = p->nBuffer - iBuf; 

  if( nByte<=nAvail ){
    /* The requested data is available in the in-memory buffer. In this
    ** case there is no need to make a copy of the data, just return a 
    ** pointer into the buffer to the caller.  */
    *ppOut = &p->aBuffer[iBuf];
    p->iReadOff += nByte;
  }else{
    /* The requested data is not all available in the in-memory buffer.
    ** In this case, allocate space at p->aAlloc[] to copy the requested
    ** range into. Then return a copy of pointer p->aAlloc to the caller.  */
    int nRem;                     /* Bytes remaining to copy */

    /* Extend the p->aAlloc[] allocation if required. */
    if( p->nAlloc<nByte ){
      int nNew = p->nAlloc*2;
      while( nByte>nNew ) nNew = nNew*2;
      p->aAlloc = sqlite3DbReallocOrFree(db, p->aAlloc, nNew);
      if( !p->aAlloc ) return SQLITE_NOMEM;
    }

    /* Copy as much data as is available in the buffer into the start of
    ** p->aAlloc[].  */
    memcpy(p->aAlloc, &p->aBuffer[iBuf], nAvail);
    p->iReadOff += nAvail;
    nRem = nByte - nAvail;

    /* The following loop copies up to p->nBuffer bytes per iteration into
    ** the p->aAlloc[] buffer.  */
    while( nRem>0 ){
      int rc;                     /* vdbeSorterIterRead() return code */
      int nCopy;                  /* Number of bytes to copy */
      u8 *aNext;                  /* Pointer to buffer to copy data from */

      nCopy = nRem;
      if( nRem>p->nBuffer ) nCopy = p->nBuffer;
      rc = vdbeSorterIterRead(db, p, nCopy, &aNext);
      if( rc!=SQLITE_OK ) return rc;
      assert( aNext!=p->aAlloc );
      memcpy(&p->aAlloc[nByte - nRem], aNext, nCopy);
      nRem -= nCopy;
    }

    *ppOut = p->aAlloc;
  }

  return SQLITE_OK;
}

/*
** Read a varint from the stream of data accessed by p. Set *pnOut to
** the value read.
*/
static int vdbeSorterIterVarint(sqlite3 *db, VdbeSorterIter *p, u64 *pnOut){
  int iBuf;

  iBuf = p->iReadOff % p->nBuffer;
  if( iBuf && (p->nBuffer-iBuf)>=9 ){
    p->iReadOff += sqlite3GetVarint(&p->aBuffer[iBuf], pnOut);
  }else{
    u8 aVarint[9];
    int i;
    for(i=0; i<sizeof(aVarint); i++){
      u8 *a;
      int rc = vdbeSorterIterRead(db, p, 1, &a);
      if( rc ) return rc;
      aVarint[i] = *a;
      if( (aVarint[i] & 0x80)==0 ) break;
    }
    sqlite3GetVarint(aVarint, pnOut);
  }

  return SQLITE_OK;
}


/*
** Advance iterator pIter to the next key in its PMA. Return SQLITE_OK if
** no error occurs, or an SQLite error code if one does.
*/
static int vdbeSorterIterNext(
  sqlite3 *db,                    /* Database handle (for sqlite3DbMalloc() ) */
  VdbeSorterIter *pIter           /* Iterator to advance */
){
  int rc;                         /* Return Code */

  u64 nRec = 0;                   /* Size of record in bytes */



  if( pIter->iReadOff>=pIter->iEof ){





    /* This is an EOF condition */
    vdbeSorterIterZero(db, pIter);
    return SQLITE_OK;
  }

  rc = vdbeSorterIterVarint(db, pIter, &nRec);
  if( rc==SQLITE_OK ){




















    pIter->nKey = (int)nRec;



    rc = vdbeSorterIterRead(db, pIter, nRec, &pIter->aKey);















































  }

  return rc;
}

/*
** Initialize iterator pIter to scan through the PMA stored in file pFile
** starting at offset iStart and ending at offset iEof-1. This function 
** leaves the iterator pointing to the first key in the PMA (or EOF if the 
** PMA is empty).
*/
static int vdbeSorterIterInit(
  sqlite3 *db,                    /* Database handle */
  const VdbeSorter *pSorter,      /* Sorter object */
  i64 iStart,                     /* Start offset in pFile */
  VdbeSorterIter *pIter,          /* Iterator to populate */
  i64 *pnByte                     /* IN/OUT: Increment this value by PMA size */
){
  int rc = SQLITE_OK;
  int nBuf;

  nBuf = sqlite3BtreeGetPageSize(db->aDb[0].pBt);

  assert( pSorter->iWriteOff>iStart );
  assert( pIter->aAlloc==0 );
  assert( pIter->aBuffer==0 );
  pIter->pFile = pSorter->pTemp1;
  pIter->iReadOff = iStart;
  pIter->nAlloc = 128;
  pIter->aAlloc = (u8 *)sqlite3DbMallocRaw(db, pIter->nAlloc);
  pIter->nBuffer = nBuf;
  pIter->aBuffer = (u8 *)sqlite3DbMallocRaw(db, nBuf);

  if( !pIter->aBuffer ){
    rc = SQLITE_NOMEM;
  }else{
    int iBuf;

    iBuf = iStart % nBuf;
    if( iBuf ){
      int nRead = nBuf - iBuf;
      if( (iStart + nRead) > pSorter->iWriteOff ){
        nRead = pSorter->iWriteOff - iStart;
      }
      rc = sqlite3OsRead(
          pSorter->pTemp1, &pIter->aBuffer[iBuf], nRead, iStart
      );
      assert( rc!=SQLITE_IOERR_SHORT_READ );
    }

    if( rc==SQLITE_OK ){
      u64 nByte;                       /* Size of PMA in bytes */
      pIter->iEof = pSorter->iWriteOff;
      rc = vdbeSorterIterVarint(db, pIter, &nByte);
      pIter->iEof = pIter->iReadOff + nByte;
      *pnByte += nByte;
    }
  }

  if( rc==SQLITE_OK ){
    rc = vdbeSorterIterNext(db, pIter);
  }
  return rc;
}


527
528
529
530
531
532
533






















































































534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549

550



551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567


568

569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593

594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
  }
  pSorter->pRecord = p;

  sqlite3_free(aSlot);
  return SQLITE_OK;
}
























































































/*
** Write the current contents of the in-memory linked-list to a PMA. Return
** SQLITE_OK if successful, or an SQLite error code otherwise.
**
** The format of a PMA is:
**
**     * A varint. This varint contains the total number of bytes of content
**       in the PMA (not including the varint itself).
**
**     * One or more records packed end-to-end in order of ascending keys. 
**       Each record consists of a varint followed by a blob of data (the 
**       key). The varint is the number of bytes in the blob of data.
*/
static int vdbeSorterListToPMA(sqlite3 *db, const VdbeCursor *pCsr){
  int rc = SQLITE_OK;             /* Return code */

  VdbeSorter *pSorter = pCsr->pSorter;




  if( pSorter->nInMemory==0 ){
    assert( pSorter->pRecord==0 );
    return rc;
  }

  rc = vdbeSorterSort(pCsr);

  /* If the first temporary PMA file has not been opened, open it now. */
  if( rc==SQLITE_OK && pSorter->pTemp1==0 ){
    rc = vdbeSorterOpenTempFile(db, &pSorter->pTemp1);
    assert( rc!=SQLITE_OK || pSorter->pTemp1 );
    assert( pSorter->iWriteOff==0 );
    assert( pSorter->nPMA==0 );
  }

  if( rc==SQLITE_OK ){


    i64 iOff = pSorter->iWriteOff;

    SorterRecord *p;
    SorterRecord *pNext = 0;
    static const char eightZeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };

    pSorter->nPMA++;
    rc = vdbeSorterWriteVarint(pSorter->pTemp1, pSorter->nInMemory, &iOff);
    for(p=pSorter->pRecord; rc==SQLITE_OK && p; p=pNext){
      pNext = p->pNext;
      rc = vdbeSorterWriteVarint(pSorter->pTemp1, p->nVal, &iOff);

      if( rc==SQLITE_OK ){
        rc = sqlite3OsWrite(pSorter->pTemp1, p->pVal, p->nVal, iOff);
        iOff += p->nVal;
      }

      sqlite3DbFree(db, p);
    }

    /* This assert verifies that unless an error has occurred, the size of 
    ** the PMA on disk is the same as the expected size stored in
    ** pSorter->nInMemory. */ 
    assert( rc!=SQLITE_OK || pSorter->nInMemory==(
          iOff-pSorter->iWriteOff-sqlite3VarintLen(pSorter->nInMemory)
    ));


    pSorter->iWriteOff = iOff;
    if( rc==SQLITE_OK ){
      /* Terminate each file with 8 extra bytes so that from any offset
      ** in the file we can always read 9 bytes without a SHORT_READ error */
      rc = sqlite3OsWrite(pSorter->pTemp1, eightZeros, 8, iOff);
    }
    pSorter->pRecord = p;
  }

  return rc;
}

/*
** Add a record to the sorter.
*/







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
















>

>
>
>

















>
>
|
>


|


|


<
|

|
<





<
<
|
<
<
<
|
>
|
|
<
<
<
<
<
<







606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748

749
750
751

752
753
754
755
756


757



758
759
760
761






762
763
764
765
766
767
768
  }
  pSorter->pRecord = p;

  sqlite3_free(aSlot);
  return SQLITE_OK;
}

/*
** Initialize a file-writer object.
*/
static int fileWriterInit(
  sqlite3 *db,                    /* Database (for malloc) */
  sqlite3_file *pFile,            /* File to write to */
  FileWriter *p,                  /* Object to populate */
  i64 iStart                      /* Offset of pFile to begin writing at */
){
  int nBuf = sqlite3BtreeGetPageSize(db->aDb[0].pBt);

  memset(p, 0, sizeof(FileWriter));
  p->aBuffer = (u8 *)sqlite3DbMallocRaw(db, nBuf);
  if( !p->aBuffer ) return SQLITE_NOMEM;

  p->iBufEnd = p->iBufStart = (iStart % nBuf);
  p->iWriteOff = iStart - p->iBufStart;
  p->nBuffer = nBuf;
  p->pFile = pFile;
  return SQLITE_OK;
}

/*
** Write nData bytes of data to the file-write object. Return SQLITE_OK
** if successful, or an SQLite error code if an error occurs.
*/
static int fileWriterWrite(FileWriter *p, u8 *pData, int nData){
  int nRem = nData;
  while( nRem>0 ){
    int nCopy = nRem;
    if( nCopy>(p->nBuffer - p->iBufEnd) ){
      nCopy = p->nBuffer - p->iBufEnd;
    }

    memcpy(&p->aBuffer[p->iBufEnd], &pData[nData-nRem], nCopy);
    p->iBufEnd += nCopy;
    if( p->iBufEnd==p->nBuffer ){
      int rc = sqlite3OsWrite(p->pFile, 
          &p->aBuffer[p->iBufStart], p->iBufEnd - p->iBufStart, 
          p->iWriteOff + p->iBufStart
      );
      if( rc!=SQLITE_OK ) return rc;
      p->iBufStart = p->iBufEnd = 0;
      p->iWriteOff += p->nBuffer;
    }
    assert( p->iBufEnd<p->nBuffer );

    nRem -= nCopy;
  }

  return SQLITE_OK;
}

/*
** Flush any buffered data to disk and clean up the file-writer object.
** The results of using the file-writer after this call are undefined.
** Return SQLITE_OK if flushing the buffered data succeeds or is not 
** required. Otherwise, return an SQLite error code.
**
** Before returning, set *piEof to the offset immediately following the
** last byte written to the file.
*/
static int fileWriterFinish(sqlite3 *db, FileWriter *p, i64 *piEof){
  int rc = SQLITE_OK;
  if( p->aBuffer && p->iBufEnd>p->iBufStart ){
    rc = sqlite3OsWrite(p->pFile, 
        &p->aBuffer[p->iBufStart], p->iBufEnd - p->iBufStart, 
        p->iWriteOff + p->iBufStart
    );
  }
  *piEof = (p->iWriteOff + p->iBufEnd);
  sqlite3DbFree(db, p->aBuffer);
  memset(p, 0, sizeof(FileWriter));
  return rc;
}

/*
** Write value iVal encoded as a varint to the file-write object. Return 
** SQLITE_OK if successful, or an SQLite error code if an error occurs.
*/
static int fileWriterWriteVarint(FileWriter *p, u64 iVal){
  int nByte; 
  u8 aByte[10];
  nByte = sqlite3PutVarint(aByte, iVal);
  return fileWriterWrite(p, aByte, nByte);
}

/*
** Write the current contents of the in-memory linked-list to a PMA. Return
** SQLITE_OK if successful, or an SQLite error code otherwise.
**
** The format of a PMA is:
**
**     * A varint. This varint contains the total number of bytes of content
**       in the PMA (not including the varint itself).
**
**     * One or more records packed end-to-end in order of ascending keys. 
**       Each record consists of a varint followed by a blob of data (the 
**       key). The varint is the number of bytes in the blob of data.
*/
static int vdbeSorterListToPMA(sqlite3 *db, const VdbeCursor *pCsr){
  int rc = SQLITE_OK;             /* Return code */
  int rc2;                        /* fileWriterFinish return code */
  VdbeSorter *pSorter = pCsr->pSorter;
  FileWriter writer;

  memset(&writer, 0, sizeof(FileWriter));

  if( pSorter->nInMemory==0 ){
    assert( pSorter->pRecord==0 );
    return rc;
  }

  rc = vdbeSorterSort(pCsr);

  /* If the first temporary PMA file has not been opened, open it now. */
  if( rc==SQLITE_OK && pSorter->pTemp1==0 ){
    rc = vdbeSorterOpenTempFile(db, &pSorter->pTemp1);
    assert( rc!=SQLITE_OK || pSorter->pTemp1 );
    assert( pSorter->iWriteOff==0 );
    assert( pSorter->nPMA==0 );
  }

  if( rc==SQLITE_OK ){
    rc = fileWriterInit(db, pSorter->pTemp1, &writer, pSorter->iWriteOff);
  }

  if( rc==SQLITE_OK ){
    SorterRecord *p;
    SorterRecord *pNext = 0;


    pSorter->nPMA++;
    rc = fileWriterWriteVarint(&writer, pSorter->nInMemory);
    for(p=pSorter->pRecord; rc==SQLITE_OK && p; p=pNext){
      pNext = p->pNext;

      rc = fileWriterWriteVarint(&writer, p->nVal);
      if( rc==SQLITE_OK ){
        rc = fileWriterWrite(&writer, p->pVal, p->nVal);

      }

      sqlite3DbFree(db, p);
    }



    pSorter->pRecord = p;



  }

  rc2 = fileWriterFinish(db, &writer, &pSorter->iWriteOff);
  if( rc==SQLITE_OK ) rc = rc2;







  return rc;
}

/*
** Add a record to the sorter.
*/
638
639
640
641
642
643
644





645
646

647
648
649
650
651
652
653
  **   * The total memory allocated for the in-memory list is greater 
  **     than (page-size * 10) and sqlite3HeapNearlyFull() returns true.
  */
  if( rc==SQLITE_OK && pSorter->mxPmaSize>0 && (
        (pSorter->nInMemory>pSorter->mxPmaSize)
     || (pSorter->nInMemory>pSorter->mnPmaSize && sqlite3HeapNearlyFull())
  )){





    rc = vdbeSorterListToPMA(db, pCsr);
    pSorter->nInMemory = 0;

  }

  return rc;
}

/*
** Helper function for sqlite3VdbeSorterRewind(). 







>
>
>
>
>


>







798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
  **   * The total memory allocated for the in-memory list is greater 
  **     than (page-size * 10) and sqlite3HeapNearlyFull() returns true.
  */
  if( rc==SQLITE_OK && pSorter->mxPmaSize>0 && (
        (pSorter->nInMemory>pSorter->mxPmaSize)
     || (pSorter->nInMemory>pSorter->mnPmaSize && sqlite3HeapNearlyFull())
  )){
#ifdef SQLITE_DEBUG
    i64 nExpect = pSorter->iWriteOff
                + sqlite3VarintLen(pSorter->nInMemory)
                + pSorter->nInMemory;
#endif
    rc = vdbeSorterListToPMA(db, pCsr);
    pSorter->nInMemory = 0;
    assert( rc!=SQLITE_OK || (nExpect==pSorter->iWriteOff) );
  }

  return rc;
}

/*
** Helper function for sqlite3VdbeSorterRewind(). 
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
  ** from the in-memory list.  */
  if( pSorter->nPMA==0 ){
    *pbEof = !pSorter->pRecord;
    assert( pSorter->aTree==0 );
    return vdbeSorterSort(pCsr);
  }

  /* Write the current b-tree to a PMA. Close the b-tree cursor. */
  rc = vdbeSorterListToPMA(db, pCsr);
  if( rc!=SQLITE_OK ) return rc;

  /* Allocate space for aIter[] and aTree[]. */
  nIter = pSorter->nPMA;
  if( nIter>SORTER_MAX_MERGE_COUNT ) nIter = SORTER_MAX_MERGE_COUNT;
  assert( nIter>0 );







|







866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
  ** from the in-memory list.  */
  if( pSorter->nPMA==0 ){
    *pbEof = !pSorter->pRecord;
    assert( pSorter->aTree==0 );
    return vdbeSorterSort(pCsr);
  }

  /* Write the current in-memory list to a PMA. */
  rc = vdbeSorterListToPMA(db, pCsr);
  if( rc!=SQLITE_OK ) return rc;

  /* Allocate space for aIter[] and aTree[]. */
  nIter = pSorter->nPMA;
  if( nIter>SORTER_MAX_MERGE_COUNT ) nIter = SORTER_MAX_MERGE_COUNT;
  assert( nIter>0 );
722
723
724
725
726
727
728


729


730
731
732
733
734
735
736
  do {
    int iNew;                     /* Index of new, merged, PMA */

    for(iNew=0; 
        rc==SQLITE_OK && iNew*SORTER_MAX_MERGE_COUNT<pSorter->nPMA; 
        iNew++
    ){


      i64 nWrite;                 /* Number of bytes in new PMA */



      /* If there are SORTER_MAX_MERGE_COUNT or less PMAs in file pTemp1,
      ** initialize an iterator for each of them and break out of the loop.
      ** These iterators will be incrementally merged as the VDBE layer calls
      ** sqlite3VdbeSorterNext().
      **
      ** Otherwise, if pTemp1 contains more than SORTER_MAX_MERGE_COUNT PMAs,







>
>

>
>







888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
  do {
    int iNew;                     /* Index of new, merged, PMA */

    for(iNew=0; 
        rc==SQLITE_OK && iNew*SORTER_MAX_MERGE_COUNT<pSorter->nPMA; 
        iNew++
    ){
      int rc2;                    /* Return code from fileWriterFinish() */
      FileWriter writer;          /* Object used to write to disk */
      i64 nWrite;                 /* Number of bytes in new PMA */

      memset(&writer, 0, sizeof(FileWriter));

      /* If there are SORTER_MAX_MERGE_COUNT or less PMAs in file pTemp1,
      ** initialize an iterator for each of them and break out of the loop.
      ** These iterators will be incrementally merged as the VDBE layer calls
      ** sqlite3VdbeSorterNext().
      **
      ** Otherwise, if pTemp1 contains more than SORTER_MAX_MERGE_COUNT PMAs,
746
747
748
749
750
751
752



753
754
755
756
757
758
759
760
761
762
763

764

765
766
767
768
769



770
771
772
773
774
775
776
      /* Open the second temp file, if it is not already open. */
      if( pTemp2==0 ){
        assert( iWrite2==0 );
        rc = vdbeSorterOpenTempFile(db, &pTemp2);
      }

      if( rc==SQLITE_OK ){



        rc = vdbeSorterWriteVarint(pTemp2, nWrite, &iWrite2);
      }

      if( rc==SQLITE_OK ){
        int bEof = 0;
        while( rc==SQLITE_OK && bEof==0 ){
          int nToWrite;
          VdbeSorterIter *pIter = &pSorter->aIter[ pSorter->aTree[1] ];
          assert( pIter->pFile );
          nToWrite = pIter->nKey + sqlite3VarintLen(pIter->nKey);
          rc = sqlite3OsWrite(pTemp2, pIter->aAlloc, nToWrite, iWrite2);

          iWrite2 += nToWrite;

          if( rc==SQLITE_OK ){
            rc = sqlite3VdbeSorterNext(db, pCsr, &bEof);
          }
        }
      }



    }

    if( pSorter->nPMA<=SORTER_MAX_MERGE_COUNT ){
      break;
    }else{
      sqlite3_file *pTmp = pSorter->pTemp1;
      pSorter->nPMA = iNew;







>
>
>
|





<


|
|
>
|
>





>
>
>







916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931

932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
      /* Open the second temp file, if it is not already open. */
      if( pTemp2==0 ){
        assert( iWrite2==0 );
        rc = vdbeSorterOpenTempFile(db, &pTemp2);
      }

      if( rc==SQLITE_OK ){
        rc = fileWriterInit(db, pTemp2, &writer, iWrite2);
      }
      if( rc==SQLITE_OK ){
        rc = fileWriterWriteVarint(&writer, nWrite);
      }

      if( rc==SQLITE_OK ){
        int bEof = 0;
        while( rc==SQLITE_OK && bEof==0 ){

          VdbeSorterIter *pIter = &pSorter->aIter[ pSorter->aTree[1] ];
          assert( pIter->pFile );

          rc = fileWriterWriteVarint(&writer, pIter->nKey);
          if( rc==SQLITE_OK ){
            rc = fileWriterWrite(&writer, pIter->aKey, pIter->nKey);
          }
          if( rc==SQLITE_OK ){
            rc = sqlite3VdbeSorterNext(db, pCsr, &bEof);
          }
        }
      }

      rc2 = fileWriterFinish(db, &writer, &iWrite2);
      if( rc==SQLITE_OK ) rc = rc2;
    }

    if( pSorter->nPMA<=SORTER_MAX_MERGE_COUNT ){
      break;
    }else{
      sqlite3_file *pTmp = pSorter->pTemp1;
      pSorter->nPMA = iNew;

Changes to test/index4.test.

12
13
14
15
16
17
18
























19
20
21
22
23
24
25
# focus of this file is testing the CREATE INDEX statement.
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl

set testprefix index4

























do_execsql_test 1.1 {
  BEGIN;
    CREATE TABLE t1(x);
    INSERT INTO t1 VALUES(randomblob(102));
    INSERT INTO t1 SELECT randomblob(102) FROM t1;     --     2
    INSERT INTO t1 SELECT randomblob(102) FROM t1;     --     4







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# focus of this file is testing the CREATE INDEX statement.
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl

set testprefix index4

#proc str {n} { string range [string repeat [format %.06d. $n] 20] 0 101 }
#db func str str
#do_execsql_test 1.1 {
#  BEGIN;
#    CREATE TABLE t1(x);
#    INSERT INTO t1 VALUES(str(1));
#    INSERT INTO t1 SELECT str(rowid + 1) FROM t1;        --     2
#    INSERT INTO t1 SELECT str(rowid + 2) FROM t1;        --     4
#    INSERT INTO t1 SELECT str(rowid + 4) FROM t1;        --     8
#    INSERT INTO t1 SELECT str(rowid + 8) FROM t1;        --    16
#    INSERT INTO t1 SELECT str(rowid + 16) FROM t1;       --    32
#    INSERT INTO t1 SELECT str(rowid + 32) FROM t1;       --    64
#    INSERT INTO t1 SELECT str(rowid + 64) FROM t1;       --   128
#    INSERT INTO t1 SELECT str(rowid + 128) FROM t1;      --   256
#    INSERT INTO t1 SELECT str(rowid + 256) FROM t1;      --   512
#    INSERT INTO t1 SELECT str(rowid + 512) FROM t1;      --  1024
#    INSERT INTO t1 SELECT str(rowid + 1024) FROM t1;     --  2048
#    INSERT INTO t1 SELECT str(rowid + 2048) FROM t1;     --  4096
#    INSERT INTO t1 SELECT str(rowid + 4096) FROM t1;     --  8192
#    INSERT INTO t1 SELECT str(rowid + 8192) FROM t1;     -- 16384
#    INSERT INTO t1 SELECT str(rowid + 16384) FROM t1;    -- 32768
#  COMMIT;
#}

do_execsql_test 1.1 {
  BEGIN;
    CREATE TABLE t1(x);
    INSERT INTO t1 VALUES(randomblob(102));
    INSERT INTO t1 SELECT randomblob(102) FROM t1;     --     2
    INSERT INTO t1 SELECT randomblob(102) FROM t1;     --     4

Added test/index5.test.























































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# 2012 August 6
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#


set testdir [file dirname $argv0]
source $testdir/tester.tcl
set ::testprefix index5

do_test 1.1 {
  execsql {
    PRAGMA page_size = 1024;
    CREATE TABLE t1(x);
    BEGIN;
  }
  for {set i 0} {$i < 100000} {incr i} {
    execsql { INSERT INTO t1 VALUES(randstr(100,100)) }
  }
  execsql COMMIT
  execsql { 
    CREATE INDEX i1 ON t1(x);
    DROP INDEX I1;
    PRAGMA main.page_size;
  }
} {1024}

db close
testvfs tvfs
tvfs filter xWrite
tvfs script write_cb
proc write_cb {xCall file handle iOfst} {
  if {[file tail $file]=="test.db"} {
    lappend ::write_list [expr $iOfst/1024]
  }
  puts "$xCall $file $args"
}

do_test 1.2 {
  sqlite3 db test.db -vfs tvfs
  set ::write_list [list]
  execsql { CREATE INDEX i1 ON t1(x) }
} {}

do_test 1.3 {
  set nForward 0
  set nBackward 0
  set nNoncont 0
  set iPrev [lindex $::write_list 0]
  for {set i 1} {$i < [llength $::write_list]} {incr i} {
    set iNext [lindex $::write_list $i]
    if {$iNext==($iPrev+1)} {
      incr nForward
    } elseif {$iNext==($iPrev-1)} { 
      incr nBackward 
    } else {
      incr nNoncont
    }
    set iPrev $iNext
  }

  expr {$nForward > $nBackward}
} {1}
db close
tvfs delete

finish_test