2007/12/4 Scott Hess <[EMAIL PROTECTED]>:
> This seems a little excessive, though.  I do see that there's an
> O(N^2) path in the prefix-searching (loadSegmentLeavesInt()'s call to
> docListUnion()).  I can reasonably make that O(logN), which might help
> a great deal, if you're hitting it.  Not really sure how to tell if
> you're hitting it, but I'll experiment at my end and see whether I can
> improve things there.
With the attached patch, the time to match against 't*' with the rfc dataset goes from 1m16s to 5s.  It passes the tests, but I won't guarantee that this is what I'll check in; I want to think on it.  But let me know if this doesn't help.

-scott
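For anyone following along, the core of the patch is the same discipline a binary counter uses: instead of folding every new doclist into one ever-growing accumulator, keep a small stack of partially merged buffers and, after the k'th doclist arrives, merge the top two buffers once per trailing zero bit of k.  At most about log2(N) buffers are ever pending, and each docid gets copied about log2(N) times instead of up to N times.  Below is a minimal standalone sketch of just that discipline, with a toy Buf of sorted ints standing in for a doclist; the Buf/Acc names and the rest of the scaffolding are illustrative only, not fts3 code.

/*
** Sketch of the "merge like a binary counter" discipline.  Not fts3
** code; error handling (e.g. malloc failure) is omitted for brevity.
*/
#include <stdio.h>
#include <stdlib.h>

typedef struct Buf { int *a; int n; } Buf;   /* sorted array of "docids" */

/* Union two sorted arrays into a fresh buffer, freeing both inputs. */
static Buf bufUnion(Buf x, Buf y){
  Buf r;
  int i = 0, j = 0, k = 0;
  r.a = malloc((size_t)(x.n + y.n) * sizeof(int));
  while( i<x.n || j<y.n ){
    if( j>=y.n || (i<x.n && x.a[i]<y.a[j]) ){
      r.a[k++] = x.a[i++];
    }else if( i>=x.n || y.a[j]<x.a[i] ){
      r.a[k++] = y.a[j++];
    }else{
      r.a[k++] = x.a[i++]; j++;              /* equal docid: keep one copy */
    }
  }
  r.n = k;
  free(x.a); free(y.a);
  return r;
}

typedef struct Acc {
  Buf *aBuf;      /* stack of pending partially-merged buffers */
  int nBuf;       /* buffers currently pending */
  int nAdded;     /* doclists accepted so far (the "counter") */
} Acc;

/* Accept one doclist, then merge the top two buffers once per trailing
** zero bit of the count, so only ~log2(N) buffers are ever pending.
*/
static void accAdd(Acc *p, Buf b){
  int n;
  p->aBuf = realloc(p->aBuf, (size_t)(p->nBuf+1)*sizeof(Buf));
  p->aBuf[p->nBuf++] = b;
  p->nAdded++;
  for( n=p->nAdded; (n%2)==0; n>>=1 ){
    p->aBuf[p->nBuf-2] = bufUnion(p->aBuf[p->nBuf-2], p->aBuf[p->nBuf-1]);
    p->nBuf--;
  }
}

/* Fold whatever is still pending into a single buffer. */
static Buf accFinish(Acc *p){
  Buf r = { NULL, 0 };
  while( p->nBuf>1 ){
    p->aBuf[p->nBuf-2] = bufUnion(p->aBuf[p->nBuf-2], p->aBuf[p->nBuf-1]);
    p->nBuf--;
  }
  if( p->nBuf==1 ) r = p->aBuf[0];
  free(p->aBuf);
  return r;
}

int main(void){
  Acc acc = { NULL, 0, 0 };
  Buf out;
  int data[3][2] = { {1,4}, {2,4}, {3,5} };
  int i;
  for( i=0; i<3; i++ ){
    Buf b;
    b.n = 2;
    b.a = malloc(2*sizeof(int));
    b.a[0] = data[i][0];
    b.a[1] = data[i][1];
    accAdd(&acc, b);
  }
  out = accFinish(&acc);
  for( i=0; i<out.n; i++ ) printf("%d ", out.a[i]);
  printf("\n");                              /* prints: 1 2 3 4 5 */
  free(out.a);
  return 0;
}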
Index: ext/fts3/fts3.c
===================================================================
RCS file: /sqlite/sqlite/ext/fts3/fts3.c,v
retrieving revision 1.12
diff -u -r1.12 fts3.c
--- ext/fts3/fts3.c	24 Nov 2007 00:41:52 -0000	1.12
+++ ext/fts3/fts3.c	5 Dec 2007 01:03:32 -0000
@@ -5555,6 +5555,21 @@
   return rc;
 }
 
+/* Call docListUnion() to merge *left and *right into *out, destroying
+** *left and *right.  Handles left or right in the same location as
+** out (in-place merge).
+*/
+static void docListUnionWithDestroy(DataBuffer *left, DataBuffer *right,
+                                    DataBuffer *out) {
+  DataBuffer result;
+  dataBufferInit(&result, left->nData+right->nData);
+  docListUnion(left->pData, left->nData, right->pData, right->nData,
+               &result);
+  dataBufferDestroy(left);
+  dataBufferDestroy(right);
+  *out = result;
+}
+
 /* Scan pReader for pTerm/nTerm, and merge the term's doclist over
 ** *out (any doclists with duplicate docids overwrite those in *out).
 ** Internal function for loadSegmentLeaf().
@@ -5562,6 +5577,15 @@
 static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader,
                                 const char *pTerm, int nTerm, int isPrefix,
                                 DataBuffer *out){
+  /* To keep merging O(logN), keep a list of merged buffers.  After
+  ** each doclist is added, buffers are merged for the number of 0
+  ** low-order bits in nDoclists.  So this many doclists are merged at
+  ** each point: 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, ... (like a
+  ** tree).
+  */
+  DataBuffer *pBuffers = NULL;
+  int nBuffers = 0, nMaxBuffers = 0, nDoclists = 0;
+
   assert( nTerm>0 );
 
   /* Process while the prefix matches. */
@@ -5575,24 +5599,54 @@
     int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix);
     if( c==0 ){
       const char *pData = leavesReaderData(pReader);
-      int nData = leavesReaderDataBytes(pReader);
-      if( out->nData==0 ){
-        dataBufferReplace(out, pData, nData);
-      }else{
-        DataBuffer result;
-        dataBufferInit(&result, out->nData+nData);
-        docListUnion(out->pData, out->nData, pData, nData, &result);
-        dataBufferDestroy(out);
-        *out = result;
-        /* TODO(shess) Rather than destroy out, we could retain it for
-        ** later reuse.
-        */
+      int n, nData = leavesReaderDataBytes(pReader);
+      if( nBuffers==nMaxBuffers ){
+        ++nMaxBuffers;
+        pBuffers = sqlite3_realloc(pBuffers, nMaxBuffers*sizeof(*pBuffers));
+      }
+      dataBufferInit(&(pBuffers[nBuffers]), nData);
+      dataBufferReplace(&(pBuffers[nBuffers]), pData, nData);
+      nBuffers++;
+      assert(nBuffers<=nMaxBuffers);
+
+      ++nDoclists;
+      for( n=nDoclists; (n%2)==0; n>>=1) {
+        assert(n);  /* Can't happen if nDoclists!=0. */
+        docListUnionWithDestroy(&(pBuffers[nBuffers-2]),
+                                &(pBuffers[nBuffers-1]),
+                                &(pBuffers[nBuffers-2]));
+        nBuffers--;
       }
     }
     if( c>0 ) break;      /* Past any possible matches. */
 
     rc = leavesReaderStep(v, pReader);
-    if( rc!=SQLITE_OK ) return rc;
+    if( rc!=SQLITE_OK ){
+      while( nBuffers>0 ){
+        nBuffers--;
+        dataBufferDestroy(&(pBuffers[nBuffers]));
+      }
+      sqlite3_free(pBuffers);
+      return rc;
+    }
+  }
+  while( nBuffers>1 ){
+    docListUnionWithDestroy(&(pBuffers[nBuffers-2]),
+                            &(pBuffers[nBuffers-1]),
+                            &(pBuffers[nBuffers-2]));
+    nBuffers--;
+  }
+  if( nBuffers>0 ){
+    assert(1==nBuffers);
+    if( out->nData==0 ){
+      dataBufferDestroy(out);
+      *out = pBuffers[0];
+    }else{
+      docListUnionWithDestroy(out, &(pBuffers[0]), out);
+    }
+    sqlite3_free(pBuffers);
+  } else {
+    assert(NULL==pBuffers);
   }
   return SQLITE_OK;
 }
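A rough back-of-the-envelope for why this helps: if a prefix matches N terms whose doclists average d bytes, unioning each new doclist into a single accumulator (the old code path) copies roughly d + 2d + ... + (N-1)d bytes, on the order of N^2*d/2, while the pairwise scheme in the patch copies each doclist only about log2(N) times, on the order of N*d*log2(N) bytes.  That gap is presumably what turns the 't*' query on the rfc data from over a minute into a few seconds.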