Index: src/vdbesort.c ================================================================== --- src/vdbesort.c +++ src/vdbesort.c @@ -89,12 +89,14 @@ u8 eWork; /* One of the SORTER_THREAD_* constants */ int nConsolidate; /* For THREAD_CONS, max final PMAs */ SorterRecord *pList; /* List of records for pThread to sort */ int nInMemory; /* Expected size of PMA based on pList */ u8 *aListMemory; /* Records memory (or NULL) */ + u32 iSeq; /* Sequence number for PMA */ int nPMA; /* Number of PMAs currently in pTemp1 */ + int bEmbeddedSeq; /* True if pTemp1 contains embedded seq. */ i64 iTemp1Off; /* Offset to write to in pTemp1 */ sqlite3_file *pTemp1; /* File to write PMAs to, or NULL */ }; @@ -185,10 +187,11 @@ SorterRecord *pRecord; /* Head of in-memory record list */ SorterMerger *pMerger; /* For final merge of PMAs (by caller) */ u8 *aMemory; /* Block of memory to alloc records from */ int iMemory; /* Offset of first free byte in aMemory */ int nMemory; /* Size of aMemory allocation in bytes */ + u32 iNextSeq; /* Sequence number for next PMA */ SorterThread aThread[SQLITE_MAX_SORTER_THREAD]; }; /* ** The following type is an iterator for a PMA. It caches the current key in @@ -195,10 +198,12 @@ ** variables nKey/aKey. If the iterator is at EOF, pFile==0. */ struct VdbeSorterIter { i64 iReadOff; /* Current read offset */ i64 iEof; /* 1 byte past EOF for this iterator */ + int bEmbeddedSeq; /* True if records have sequence values */ + int iSeq; /* Current sequence value */ int nAlloc; /* Bytes of space at aAlloc */ int nKey; /* Number of bytes in key */ sqlite3_file *pFile; /* File iterator is reading from */ u8 *aAlloc; /* Allocated space */ u8 *aKey; /* Pointer to current key */ @@ -415,10 +420,15 @@ vdbeSorterIterZero(pIter); return SQLITE_OK; } rc = vdbeSorterIterVarint(pIter, &nRec); + if( rc==SQLITE_OK && pIter->bEmbeddedSeq ){ + u64 iSeq; + rc = vdbeSorterIterVarint(pIter, &iSeq); + pIter->iSeq = (int)iSeq; + } if( rc==SQLITE_OK ){ pIter->nKey = (int)nRec; rc = vdbeSorterIterRead(pIter, (int)nRec, &pIter->aKey); } @@ -446,10 +456,11 @@ assert( pIter->aBuffer==0 ); pIter->pFile = pThread->pTemp1; pIter->iReadOff = iStart; pIter->nAlloc = 128; pIter->aAlloc = (u8*)sqlite3Malloc(pIter->nAlloc); + pIter->bEmbeddedSeq = pThread->bEmbeddedSeq; /* Try to xFetch() a mapping of the entire temp file. If this is possible, ** the PMA will be read via the mapping. Otherwise, use xRead(). */ rc = sqlite3OsFetch(pIter->pFile, 0, pThread->iTemp1Off, &pMap); @@ -468,23 +479,34 @@ if( (iStart + nRead) > pThread->iTemp1Off ){ nRead = (int)(pThread->iTemp1Off - iStart); } rc = sqlite3OsRead( pThread->pTemp1, &pIter->aBuffer[iBuf], nRead, iStart - ); + ); assert( rc!=SQLITE_IOERR_SHORT_READ ); } } } } if( rc==SQLITE_OK ){ u64 nByte; /* Size of PMA in bytes */ pIter->iEof = pThread->iTemp1Off; - rc = vdbeSorterIterVarint(pIter, &nByte); - pIter->iEof = pIter->iReadOff + nByte; - *pnByte += nByte; + if( pIter->bEmbeddedSeq==0 ){ + u64 iSeq, nElem; + rc = vdbeSorterIterVarint(pIter, &iSeq); + pIter->iSeq = (int)iSeq; + if( rc==SQLITE_OK ){ + rc = vdbeSorterIterVarint(pIter, &nElem); + *pnByte += (nElem * sqlite3VarintLen(iSeq)); + } + } + if( rc==SQLITE_OK ){ + rc = vdbeSorterIterVarint(pIter, &nByte); + pIter->iEof = pIter->iReadOff + nByte; + *pnByte += nByte; + } } if( rc==SQLITE_OK ){ rc = vdbeSorterIterNext(pIter); } @@ -749,10 +771,11 @@ vdbeSorterMergerReset(pSorter->pMerger); pSorter->pRecord = 0; pSorter->nInMemory = 0; pSorter->bUsePMA = 0; pSorter->iMemory = 0; + pSorter->iNextSeq = 0; } /* ** Free any cursor components allocated by sqlite3VdbeSorterXXX routines. */ @@ -822,15 +845,19 @@ /* ** Sort the linked list of records headed at pThread->pList. Return ** SQLITE_OK if successful, or an SQLite error code (i.e. SQLITE_NOMEM) if ** an error occurs. +** +** If the pnElem argument is not NULL and no error occurs, set *pnElem to +** the total number of elements in the list. */ -static int vdbeSorterSort(SorterThread *pThread){ +static int vdbeSorterSort(SorterThread *pThread, i64 *pnElem){ int i; SorterRecord **aSlot; SorterRecord *p; + i64 nElem = 0; aSlot = (SorterRecord **)sqlite3MallocZero(64 * sizeof(SorterRecord *)); if( !aSlot ){ return SQLITE_NOMEM; } @@ -854,18 +881,20 @@ vdbeSorterMerge(pThread, p, aSlot[i], &p); aSlot[i] = 0; } aSlot[i] = p; p = pNext; + nElem++; } p = 0; for(i=0; i<64; i++){ vdbeSorterMerge(pThread, p, aSlot[i], &p); } pThread->pList = p; + *pnElem = nElem; sqlite3_free(aSlot); return SQLITE_OK; } /* @@ -987,11 +1016,11 @@ ** ** * One or more records packed end-to-end in order of ascending keys. ** Each record consists of a varint followed by a blob of data (the ** key). The varint is the number of bytes in the blob of data. */ -static int vdbeSorterListToPMA(SorterThread *pThread){ +static int vdbeSorterListToPMA(SorterThread *pThread, i64 nElem){ int rc = SQLITE_OK; /* Return code */ FileWriter writer; /* Object used to write to the file */ memset(&writer, 0, sizeof(FileWriter)); assert( pThread->nInMemory>0 ); @@ -1000,25 +1029,28 @@ if( pThread->pTemp1==0 ){ rc = vdbeSorterOpenTempFile(pThread->pVfs, &pThread->pTemp1); assert( rc!=SQLITE_OK || pThread->pTemp1 ); assert( pThread->iTemp1Off==0 ); assert( pThread->nPMA==0 ); + assert( pThread->bEmbeddedSeq==0 ); } /* Try to get the file to memory map */ if( rc==SQLITE_OK ){ rc = vdbeSorterExtendFile( - pThread->pTemp1, pThread->iTemp1Off + pThread->nInMemory + 9 + pThread->pTemp1, pThread->iTemp1Off + 9 + 9 + 9 + pThread->nInMemory ); } if( rc==SQLITE_OK ){ SorterRecord *p; SorterRecord *pNext = 0; fileWriterInit(pThread->pTemp1, &writer, pThread->pgsz, pThread->iTemp1Off); pThread->nPMA++; + fileWriterWriteVarint(&writer, (u64)pThread->iSeq); + fileWriterWriteVarint(&writer, (u64)nElem); fileWriterWriteVarint(&writer, pThread->nInMemory); for(p=pThread->pList; p; p=pNext){ pNext = p->u.pNext; fileWriterWriteVarint(&writer, p->nVal); fileWriterWrite(&writer, SRVAL(p), p->nVal); @@ -1089,11 +1121,11 @@ ** ** If the two values were equal, then the value from the oldest ** PMA should be considered smaller. The VdbeSorter.aIter[] array ** is sorted from oldest to newest, so pIter1 contains older values ** than pIter2 iff (pIter1iSeq < pIter2->iSeq) ){ pMerger->aTree[i] = (int)(pIter1 - pMerger->aIter); pIter2 = &pMerger->aIter[ pMerger->aTree[i ^ 0x0001] ]; pKey2 = pIter2->aKey; }else{ if( pIter1->pFile ) pKey2 = 0; @@ -1186,10 +1218,11 @@ fileWriterWriteVarint(&writer, nOut); while( rc==SQLITE_OK && bEof==0 ){ VdbeSorterIter *pIter = &pMerger->aIter[ pMerger->aTree[1] ]; assert( pIter->pFile!=0 ); /* pIter is not at EOF */ fileWriterWriteVarint(&writer, pIter->nKey); + fileWriterWriteVarint(&writer, (u64)pIter->iSeq); fileWriterWrite(&writer, pIter->aKey, pIter->nKey); rc = vdbeSorterNext(pThread, pMerger, &bEof); } rc2 = fileWriterFinish(&writer, &iWriteOff); if( rc==SQLITE_OK ) rc = rc2; @@ -1198,23 +1231,29 @@ vdbeSorterMergerFree(pMerger); sqlite3OsCloseFree(pThread->pTemp1); pThread->pTemp1 = pTemp2; pThread->nPMA = (i / SORTER_MAX_MERGE_COUNT); pThread->iTemp1Off = iWriteOff; + pThread->bEmbeddedSeq = 1; + sqlite3OsUnfetch(pTemp2, 0, 0); } }else{ + i64 nElem; + /* Sort the pThread->pList list */ - rc = vdbeSorterSort(pThread); + rc = vdbeSorterSort(pThread, &nElem); /* If required, write the list out to a PMA. */ if( rc==SQLITE_OK && pThread->eWork==SORTER_THREAD_TO_PMA ){ #ifdef SQLITE_DEBUG i64 nExpect = pThread->nInMemory + sqlite3VarintLen(pThread->nInMemory) + + sqlite3VarintLen(pThread->iSeq) + + sqlite3VarintLen(nElem) + pThread->iTemp1Off; #endif - rc = vdbeSorterListToPMA(pThread); + rc = vdbeSorterListToPMA(pThread, nElem); assert( rc!=SQLITE_OK || (nExpect==pThread->iTemp1Off) ); } } thread_out: @@ -1266,10 +1305,11 @@ assert( pThread->pThread==0 && pThread->bDone==0 ); pThread->eWork = SORTER_THREAD_TO_PMA; pThread->pList = pSorter->pRecord; pThread->nInMemory = pSorter->nInMemory; + pThread->iSeq = pSorter->iNextSeq++; pSorter->nInMemory = 0; pSorter->pRecord = 0; if( pSorter->aMemory ){ u8 *aMem = pThread->aListMemory;