2007/12/4 Scott Hess <[EMAIL PROTECTED]>:
> This seems a little excessive, though. I do see that there's an
> O(N^2) path in the prefix-searching (loadSegmentLeavesInt()'s call to
> docListUnion()). I can reasonably make that O(logN), which might help
> a great deal, if you're hitting it. Not really sure how to tell if
> you're hitting it, but I'll experiment at my end and see whether I can
> improve things there.
With the attached patch, the time to match against 't*' with the rfc
dataset goes from 1m16s to 5s.
It passes the tests, but I'll not guarantee that this is what I'll
check in. I want to think on it. But let me know if this doesn't
help.
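
For anyone skimming the patch: the trick is the same one a binary counter uses. Rather than folding every new doclist into a single ever-growing accumulator (which re-copies that big buffer once per doclist, hence the O(N^2)), doclists go onto a small stack of buffers, and two buffers are merged whenever they cover the same number of doclists, so any one doclist's data only gets copied O(logN) times. Here is a minimal standalone sketch of that scheduling, with sorted int arrays standing in for doclists; Buf and mergeUnion() are made-up names for illustration, not fts3 APIs.

/* Standalone sketch of the binary-counter merge scheduling in the patch
** below.  Sorted int arrays stand in for doclists; Buf and mergeUnion()
** are invented for illustration and are not fts3 APIs.
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct Buf { int *a; int n; } Buf;

/* Union of two sorted arrays (duplicates collapsed), freeing both
** inputs.  Plays the role docListUnionWithDestroy() plays in the patch.
*/
static Buf mergeUnion(Buf left, Buf right){
  Buf out;
  int i = 0, j = 0, n = 0;
  out.a = malloc((left.n + right.n) * sizeof(int));
  while( i<left.n || j<right.n ){
    if( j>=right.n || (i<left.n && left.a[i]<right.a[j]) ){
      out.a[n++] = left.a[i++];
    }else{
      if( i<left.n && left.a[i]==right.a[j] ) i++;  /* collapse duplicate */
      out.a[n++] = right.a[j++];
    }
  }
  out.n = n;
  free(left.a);
  free(right.a);
  return out;
}

/* Accumulate nLists one-element "doclists".  After the k'th list is
** pushed, do one pairwise merge per low-order zero bit of k, so each
** element is copied O(logN) times instead of O(N) times.
*/
static Buf mergeAll(int nLists){
  Buf *stack = NULL;
  Buf result;
  int nStack = 0, k, n;
  for(k=1; k<=nLists; k++){
    Buf b;
    b.a = malloc(sizeof(int));
    b.a[0] = k;                      /* stand-in for the k'th doclist */
    b.n = 1;
    stack = realloc(stack, (nStack+1)*sizeof(Buf));
    stack[nStack++] = b;
    for(n=k; (n%2)==0; n>>=1){
      stack[nStack-2] = mergeUnion(stack[nStack-2], stack[nStack-1]);
      nStack--;
    }
  }
  while( nStack>1 ){                 /* fold any leftover buffers */
    stack[nStack-2] = mergeUnion(stack[nStack-2], stack[nStack-1]);
    nStack--;
  }
  assert( nStack==1 );
  result = stack[0];
  free(stack);
  return result;
}

int main(void){
  Buf r = mergeAll(1000);
  printf("merged %d entries\n", r.n);
  free(r.a);
  return 0;
}

The patch does the same thing with DataBuffer and docListUnion(), plus cleanup on the error path and the final fold into *out.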
-scott
Index: ext/fts3/fts3.c
===================================================================
RCS file: /sqlite/sqlite/ext/fts3/fts3.c,v
retrieving revision 1.12
diff -u -r1.12 fts3.c
--- ext/fts3/fts3.c 24 Nov 2007 00:41:52 -0000 1.12
+++ ext/fts3/fts3.c 5 Dec 2007 01:03:32 -0000
@@ -5555,6 +5555,21 @@
   return rc;
 }
 
+/* Call docListUnion() to merge *left and *right into *out, destroying
+** *left and *right.  Handles left or right in the same location as
+** out (in-place merge).
+*/
+static void docListUnionWithDestroy(DataBuffer *left, DataBuffer *right,
+                                    DataBuffer *out) {
+  DataBuffer result;
+  dataBufferInit(&result, left->nData+right->nData);
+  docListUnion(left->pData, left->nData, right->pData, right->nData,
+               &result);
+  dataBufferDestroy(left);
+  dataBufferDestroy(right);
+  *out = result;
+}
+
 /* Scan pReader for pTerm/nTerm, and merge the term's doclist over
 ** *out (any doclists with duplicate docids overwrite those in *out).
 ** Internal function for loadSegmentLeaf().
@@ -5562,6 +5577,15 @@
 static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader,
                                 const char *pTerm, int nTerm, int isPrefix,
                                 DataBuffer *out){
+  /* To keep the total merge work O(N*logN) rather than O(N^2), collect
+  ** doclists in a stack of buffers.  After each doclist is added, do
+  ** one pairwise merge per low-order 0 bit in nDoclists, so the number
+  ** of merges per step runs 0, 1, 0, 2, 0, 1, 0, 3, ... and the stack
+  ** holds one buffer per 1 bit of nDoclists (like a binary merge tree).
+  */
+  DataBuffer *pBuffers = NULL;
+  int nBuffers = 0, nMaxBuffers = 0, nDoclists = 0;
+
   assert( nTerm>0 );
 
   /* Process while the prefix matches. */
@@ -5575,24 +5599,54 @@
     int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix);
     if( c==0 ){
       const char *pData = leavesReaderData(pReader);
-      int nData = leavesReaderDataBytes(pReader);
-      if( out->nData==0 ){
-        dataBufferReplace(out, pData, nData);
-      }else{
-        DataBuffer result;
-        dataBufferInit(&result, out->nData+nData);
-        docListUnion(out->pData, out->nData, pData, nData, &result);
-        dataBufferDestroy(out);
-        *out = result;
-        /* TODO(shess) Rather than destroy out, we could retain it for
-        ** later reuse.
-        */
+      int n, nData = leavesReaderDataBytes(pReader);
+      if( nBuffers==nMaxBuffers ){
+        ++nMaxBuffers;
+        pBuffers = sqlite3_realloc(pBuffers, nMaxBuffers*sizeof(*pBuffers));
+      }
+      dataBufferInit(&(pBuffers[nBuffers]), nData);
+      dataBufferReplace(&(pBuffers[nBuffers]), pData, nData);
+      nBuffers++;
+      assert(nBuffers<=nMaxBuffers);
+
+      ++nDoclists;
+      for( n=nDoclists; (n%2)==0; n>>=1) {
+        assert(n);  /* Can't happen if nDoclists!=0. */
+        docListUnionWithDestroy(&(pBuffers[nBuffers-2]),
+                                &(pBuffers[nBuffers-1]),
+                                &(pBuffers[nBuffers-2]));
+        nBuffers--;
       }
     }
     if( c>0 ) break;      /* Past any possible matches. */
 
     rc = leavesReaderStep(v, pReader);
-    if( rc!=SQLITE_OK ) return rc;
+    if( rc!=SQLITE_OK ){
+      while( nBuffers>0 ){
+        nBuffers--;
+        dataBufferDestroy(&(pBuffers[nBuffers]));
+      }
+      sqlite3_free(pBuffers);
+      return rc;
+    }
+  }
+  while( nBuffers>1 ){
+    docListUnionWithDestroy(&(pBuffers[nBuffers-2]),
+                            &(pBuffers[nBuffers-1]),
+                            &(pBuffers[nBuffers-2]));
+    nBuffers--;
+  }
+  if( nBuffers>0 ){
+    assert(1==nBuffers);
+    if( out->nData==0 ){
+      dataBufferDestroy(out);
+      *out = pBuffers[0];
+    }else{
+      docListUnionWithDestroy(out, &(pBuffers[0]), out);
+    }
+    sqlite3_free(pBuffers);
+  } else {
+    assert(NULL==pBuffers);
   }
   return SQLITE_OK;
 }
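
To make the merge scheduling concrete, here's a throwaway loop (not fts3 code) that prints how many pairwise merges happen after each doclist and how many buffers remain on the stack: the merge count per step is the number of low-order zero bits of nDoclists, and the buffer count is its number of one bits.

#include <stdio.h>

int main(void){
  int nDoclists, nBuffers = 0;
  for(nDoclists=1; nDoclists<=16; nDoclists++){
    int n, nMerges = 0;
    nBuffers++;                      /* push the new doclist's buffer */
    for(n=nDoclists; (n%2)==0; n>>=1){
      nBuffers--;                    /* each pairwise merge pops one buffer */
      nMerges++;
    }
    printf("doclist %2d: %d merge(s), %d buffer(s) left\n",
           nDoclists, nMerges, nBuffers);
  }
  return 0;
}

Each doclist's data ends up being copied roughly log2(N) times in total instead of N times, which should be where most of the 1m16s-to-5s difference comes from.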