Update of /cvsroot/monetdb/pathfinder/modules/pftijah
In directory sc8-pr-cvs7.sourceforge.net:/tmp/cvs-serv27686

Modified Files:
        nexi.c pftijah_tokenize.l pftijah_util.mx serialize_pftijah.mx 
Log Message:
* repair BBP refcount bug for BAT

* reimplement the direct bat access methods in pftijah serialization for more
  speed (and clarity).

* Start optimizing the pftijah tokenizer. The flex functions are called once
  per handle_character() call. This leads to 2 malloc's per call. I tried to
  do without the malloc's but this caused a lot of strange results :-)
  I am now planning to craft the flexer by hand. The first small experiment
  shows there is a lot to gain there. (25% speedup in indexing time).



Index: serialize_pftijah.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/serialize_pftijah.mx,v
retrieving revision 1.41
retrieving revision 1.42
diff -u -d -r1.41 -r1.42
--- serialize_pftijah.mx        23 Feb 2007 15:11:07 -0000      1.41
+++ serialize_pftijah.mx        27 Feb 2007 15:43:37 -0000      1.42
@@ -31,8 +31,8 @@
 
 extern int handleTijahTerm(struct tijahContextStruct *ctx, char* term);
 
-extern int useFlexScanner(char* buf, int len, struct tijahContextStruct* 
tjCtx); /* FLEX */
-extern char* flexScanOneTerm(char* buf, int len);
+extern int useFlexScanner(char* buf, struct tijahContextStruct* tjCtx); /* 
FLEX */
+extern char* flexScanOneTerm(char* buf);
 
 extern char* normalizeTerm(struct tijahContextStruct *ctx, char* term );
 
@@ -70,15 +70,10 @@
 typedef struct dbat_struct {
        const char*     name;
        BAT*            bat;
-       int             oid_mark;
-       int             max_i;
-       int             max_sz;
-       bit             dflt;     /* fill with default value during extend */
-       int             dflt_int; /* the default int value */
-       chr             dflt_chr; /* the default chr value */
-       oid             dflt_oid; /* the default oid value */
-       /* */
-        union { /* cast to perform direct indexex insert in [void,any] BATs */
+       oid             raw_max;
+       oid             seqbase;
+       oid             seq_max;
+        union { /* cast to perform direct indexe insert in [void,any] BATs */
             void* voidCAST; /* the basecast */
             chr*  chrCAST;  /* cast for [void,chr] BAT */
             int*  intCAST;  /* cast for [void,int] BAT */
@@ -89,7 +84,6 @@
 int dbat_init(const char* name, dbat* dbat, BAT* b) {
        dbat->name = name;
        dbat->bat  = b;
-       dbat->dflt = FALSE;
        if ( dbat->bat->htype != TYPE_void ) {
            stream_printf(GDKerr,"ERROR: dbat_init(%s) non void 
BAT\n",dbat->name);
            return 0;
@@ -98,31 +92,25 @@
            stream_printf(GDKerr,"ERROR: dbat_init(%s) unknown 
ttype(%d)\n",dbat->name,dbat->bat->ttype);
            return 0;
        }
-        dbat->oid_mark = b->hseqbase;
-       dbat->max_i = dbat->max_sz = BATcount(dbat->bat);
+        dbat->seqbase = (oid)b->hseqbase;
+       dbat->raw_max = (oid)BATcount(dbat->bat);
+       dbat->seq_max = dbat->raw_max + dbat->seqbase;
        dbat->cast.voidCAST = (void*)BUNfirst(dbat->bat);
        /* */
        return 1;
 }
 
-int dbat_finalize(dbat* dbat) {
-        BAT* b = dbat->bat;
 
+int dbat_finalize(dbat* dbat, int topidx) {
         void* top;
+        BAT* b = dbat->bat;
         
-        int bottomTop = dbat->max_i;
-       if ( 0 ) stream_printf(GDKout,"dbat_finalize(size=%d)\n",dbat->max_i);
+       topidx -= (int)dbat->seqbase;
+        int bottomTop = topidx;
         switch( b->ttype ) {
          case TYPE_int :
                 top = &dbat->cast.intCAST[bottomTop];
                 break;
-         case TYPE_chr: {
-                b->batBuns->free = dbat->max_i; 
-                BATsetcount(b, dbat->max_i);
-                b->tsorted = 0;
-               b->batDirty = TRUE; /* VERY important this one */
-                return 1;
-                }
          case TYPE_oid:
                 top = &dbat->cast.oidCAST[bottomTop];
                 break;
@@ -137,7 +125,7 @@
        /* */
        dbat->name  = NULL;
        dbat->bat   = NULL;
-       dbat->max_i = dbat->max_sz = 0;
+       dbat->raw_max = dbat->seqbase = 0;
        /* */
        return 1;
 }
@@ -145,9 +133,14 @@
 #define MINCHUNK 8192
 #define MAXCHUNK 67108864
 
-int dbat_extend(dbat* dbat, int i_mark) {
-    /* CHECK THIS if ( i_mark ) i = i_mark - dbat->oid_mark; */
-    size_t newsize = 
MAX(MIN(MAX(MINCHUNK,dbat->max_sz*2),(size_t)(dbat->max_sz+MAXCHUNK)),(size_t)i_mark);
+int dbat_extend(dbat* dbat, oid min_i /*raw-index*/, size_t forced_size) {
+    size_t newsize;
+    
+    if ( forced_size ) {
+       newsize = forced_size;
+    } else {
+       newsize = 
MAX(MIN(MAX(MINCHUNK,dbat->raw_max*2),(size_t)(dbat->raw_max+MAXCHUNK)),(size_t)min_i);
+    }
 
     /* first check if the number of BUN's < INT_MAX. If this was the case
      * and the previous time INT_MAX was returned this means the BAT cannot
@@ -156,94 +149,34 @@
     if ( newsize > INT_MAX ) {
        newsize = INT_MAX;
 
-       if ( dbat->max_sz == INT_MAX ) {
+       if ( dbat->raw_max == INT_MAX ) {
                GDKerror("dbat_extend: BATextend[\"%s\"](size>INT_MAX) 
fails\n","incomplete");
                return -1;
        }
     }
-    if ( 0 ) { stream_printf(GDKout,"dbat_extend[%s](%d -> 
%d)\n",dbat->name,dbat->max_sz,newsize); }
-    dbat->max_sz= newsize;
+#if 0
+    stream_printf(GDKout,"dbat_extend[%s](%d -> 
%d)\n",dbat->name,dbat->raw_max,newsize);
+#endif
+    dbat->raw_max= newsize;
+    dbat->seq_max = dbat->raw_max + dbat->seqbase;
     if ( !(dbat->bat = BATextend(dbat->bat,newsize)) ) {
         GDKerror("dbat_extend: BATextend[\"%s\"](to %d) 
fails\n","incomplete",newsize);
         return -1;
     }
     dbat->cast.voidCAST = (void*)BUNfirst(dbat->bat);
-    /*
-     * now check if there's a default value handler used  
-     *
-     */
-    if ( dbat->dflt ) {
-        switch( dbat->bat->ttype ) {
-         case TYPE_int : {
-               int v   = dbat->dflt_int;
-               int *to = &dbat->cast.intCAST[dbat->max_sz];
-               for(register int *p = &dbat->cast.intCAST[dbat->max_i]; p<to;)
-                   *p++ = v;
-                break;
-               }
-         case TYPE_chr: {
-               chr v   = dbat->dflt_chr;
-               chr *to = &dbat->cast.chrCAST[dbat->max_sz];
-               for(register chr *p = &dbat->cast.chrCAST[dbat->max_i]; p<to;)
-                   *p++ = v;
-                break;
-                }
-         case TYPE_oid: {
-               oid v   = dbat->dflt_oid;
-               oid *to = &dbat->cast.oidCAST[dbat->max_sz];
-               for(register oid *p = &dbat->cast.oidCAST[dbat->max_i]; p<to;)
-                   *p++ = v;
-                break;
-               }
-         default:
-                GDKerror("dbat_extend: bad ttype\n");
-                return -1;
-        }
-    }
-    /* */
     return 1;
 }
 
 int dbat_sizeHint(dbat* dbat, int sizeHint_mark) {
-        int sizeHint = sizeHint_mark - dbat->oid_mark;
-       int estimate = dbat->max_i + sizeHint;
-
-       return dbat_extend(dbat, estimate);
-}
-
-INLINE static int dbat_set_oid(dbat* dbat, int pos_mark, oid v) {
-       register int pos;
+        int sizeHint = sizeHint_mark - dbat->seqbase;
+       int estimate = dbat->raw_max + sizeHint;
 
-       if ( (pos=pos_mark - dbat->oid_mark) < dbat->max_i ) {
-           dbat->cast.oidCAST[pos] = v;
-           return 1;
-       } else {
-           if ( pos >= dbat->max_sz ) {
-               if ( dbat_extend(dbat,pos) < 0 )
-                   return -1;
-           }
-           dbat->max_i = pos + 1;
-           dbat->cast.oidCAST[pos] = v;
-           return 1;
-       }
+       return dbat_extend(dbat, estimate, 0);
 }
 
-INLINE static int dbat_set_int(dbat* dbat, int pos_mark, int v) {
-       register int pos;
+#define dbat_set_oid(DBAT,I,V) (DBAT)->cast.oidCAST[I-(DBAT)->seqbase] = V
 
-       if ( (pos=pos_mark - dbat->oid_mark) < dbat->max_i ) {
-           dbat->cast.intCAST[pos] = v;
-           return 1;
-       } else {
-           if ( pos >= dbat->max_sz ) {
-               if ( dbat_extend(dbat,pos) < 0 )
-                   return -1;
-           }
-           dbat->max_i = pos + 1;
-           dbat->cast.intCAST[pos] = v;
-           return 1;
-       }
-}
+#define dbat_set_int(DBAT,I,V) (DBAT)->cast.intCAST[I-(DBAT)->seqbase] = V
 
 /************************************************
  *
@@ -315,9 +248,6 @@
 
 /************************************************
  *
- *
- * First the temporary shredder for Tijah by JF
- *
  */
 
 INLINE static oid
@@ -328,15 +258,15 @@
     BUN bun;
 
     HASHfnd_str(bun, tjctx->hm_globalTag, (str)t);
-    if ( bun )
-    /* if ( (bun = BUNfnd(tjctx->hm_globalTag,t)) ) OLD */
+    if ( bun ) {
         return *(oid*)BUNtail(tjctx->hm_globalTag,bun);
-    else {
-       if ( !BUNins(tjctx->b_globalTag, &tjctx->n_globalTag, (str)t, FALSE) ) {
+    } else {
+       if ( BUNins(tjctx->b_globalTag, &tjctx->n_globalTag, (str)t, FALSE) ) {
+           return tjctx->n_globalTag++;
+        } else {
            GDKerror("INSERT OF \"%s\" in globalTag fails.\n");
            return oid_nil;
-        } else
-           return tjctx->n_globalTag++;
+        }
     }
 #endif
 }
@@ -349,10 +279,7 @@
    BUN bun;
 
    HASHfnd_str(bun, tjctx->hm_globalTerm, (str)t);
-   if ( bun ) {
-       return *(oid*)BUNtail(tjctx->hm_globalTerm,bun);
-   } else
-       return oid_nil;
+   return ( bun ? *(oid*)BUNtail(tjctx->hm_globalTerm,bun) : oid_nil );
 }
 
 INLINE static oid
@@ -366,22 +293,35 @@
     if ( bun )
         return *(oid*)BUNtail(tjctx->hm_globalTerm,bun);
     else {
-       if ( !BUNins(tjctx->b_globalTerm, &tjctx->n_globalTerm, (str)t, FALSE)){
+       if ( BUNins(tjctx->b_globalTerm, &tjctx->n_globalTerm, (str)t, FALSE)){
+           return tjctx->n_globalTerm++;
+        } else { 
            GDKerror("INSERT OF \"%s\" in globalTerm fails.\n");
            return oid_nil;
-        } else 
-           return tjctx->n_globalTerm++;
+        }
     }
 #endif
 }
 
-#define tj_add2plane(TJCTX,O) \
-    ((dbat_set_oid(&(TJCTX)->dbat_collPre, (TJCTX)->tijahPre, O) < 0) \
-               ? oid_nil : ((oid)(TJCTX)->tijahPre++))
+INLINE oid tj_extend_plane(struct tijahContextStruct *tjctx) {
+    oid base = tjctx->tijahPre - tjctx->dbat_collPre.seqbase; 
 
-#define insertPreSize(TJCTX,POS,SIZE) \
-    dbat_set_int(&TJCTX->dbat_collSize,(int)POS,SIZE)
+    if ( base >= tjctx->dbat_collPre.raw_max ) {
+       if ( dbat_extend(&tjctx->dbat_collPre,base, 0) < 0 )
+           return oid_nil;
+       /* IMPORTANT: the size of the two bats is synchronized by the use
+        * of the forced size (last) parameter of dbat_extend
+        */
+       if ( 
dbat_extend(&tjctx->dbat_collSize,base,tjctx->dbat_collPre.raw_max) < 0 )
+           return oid_nil;
+    }
+    return tjctx->tijahPre++;
+}
 
+#define tj_newPre(TJCTX) \
+       (((TJCTX)->tijahPre < (TJCTX)->dbat_collPre.seq_max) \
+       ? \
+       ((oid)(TJCTX)->tijahPre++) : tj_extend_plane(TJCTX))
 
 int
 handleTijahTerm(struct tijahContextStruct *tjctx, char* term) {
@@ -397,13 +337,13 @@
            }
           }
           if ( (termOid = tj_termOid(tjctx, term)) == oid_nil )
-           return 0;
+           return -1;
        }
        if ( termOid ) { /* term is not a stopword */
-            if ( (tjPre = tj_add2plane(tjctx, termOid)) == oid_nil )
+            if ( (tjPre = tj_newPre(tjctx) ) == oid_nil )
                return 0;
-            if ( insertPreSize(tjctx,tjPre,0) < 0 )
-                       return -1;
+            dbat_set_oid(&tjctx->dbat_collPre, tjPre, termOid);
+            dbat_set_int(&tjctx->dbat_collSize,(int)tjPre,0);
 #ifdef TJ_TRACE
             if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:TERM: \"%s\", 
termoid=%d, tjPre=%d\n",tjctx->name,term,termOid,tjPre);
 #endif
@@ -416,40 +356,13 @@
      return 1;
 }
 
-/************
- *
- * The part where the Strings from Pathfinder are shredded into words
- * by Tijah. The USE_FLEX macro determines if the strings is shredded
- * by Hennings fancy flex scanner or Jan's simple strtok() scanner.
- */
-
-const char* obsoleteNexiChars = " \t\n\r,:;&[EMAIL PROTECTED]";
-
-int 
-useStrtokScanner(tjCtx* tjctx, char* s)
-{
-    char *t;
-    int  sz = 0;
-
-#ifdef TJ_TRACE
-    if (TJ_TRACE) stream_printf(GDKout,"C[%s]:CHARACTERS:\n",tjctx->name);
-#endif
-    if ( (t = strtok(s,obsoleteNexiChars)) ) do {
-       /* not the empty string here */
-        if ( handleTijahTerm(tjctx,t) < 0 )
-             return -1;
-       sz++;
-    } while ( (t=strtok(NULL,obsoleteNexiChars)) );
-    return 1;
-}
-
 /************************************************
  *
  * Now the real output handlers
  */
 
 
-#ifdef notused
+#if 0
 static int
 handle_sizeHint(XqueryCtx* ctx, int hinted_size) {
     tjCtx* tjctx = (tjCtx*)ctx->driverWs;
@@ -502,14 +415,12 @@
            return (str)str_nil;
 }
 
-#define GUESSFORCE FALSE
-
 /* 
  * Replace the value of a collection parameter int the collection parameter
  * bat
  */
 static int replaceCollParam(tjCtx* tjctx, str param, str val) {
-       return ( BUNreplace(tjctx->b_collParam,param,val,GUESSFORCE) != NULL );
+       return ( BUNreplace(tjctx->b_collParam,param,val,FALSE) != NULL );
 }
 
 static BAT*
@@ -894,10 +805,10 @@
     /* if ( DOEMIT(tjctx) ) { */
         if ( (termOid = tj_tagOid(tjctx, name)) == oid_nil )
            return 0;
-        if ( (tjPre = tj_add2plane(tjctx, termOid)) == oid_nil )
+        if ( (tjPre = tj_newPre(tjctx) ) == oid_nil )
            return 0;
+        dbat_set_oid(&tjctx->dbat_collPre, tjPre, termOid);
         if ( tj_pushTag(tjctx,tjPre) < 0 ) return 0;
-        if ( 0 ) stream_printf(GDKout,"C[%s]:startElement: \"%s\", termoid=%d, 
Tijah pre#=%d, Pathfinder pre#=%d\n",tjctx->name,name,termOid,tjPre,pre);
 #ifdef TJ_TRACE
         if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:startElement: \"%s\", 
termoid=%d, Tijah pre#=%d, Pathfinder 
pre#=%d\n",tjctx->name,name,termOid,tjPre,pre);
 #endif
@@ -913,8 +824,7 @@
     --tjctx->doc_height;
     oid start = tj_popTag(tjctx); /* oid of the first node of the element */
     int size  = tjctx->tijahPre - start - 1; /* the Tijah element size */
-    if ( insertPreSize(tjctx,start,size) < 0 )
-           return 0;
+    dbat_set_int(&tjctx->dbat_collSize,(int)start,size);
 #ifdef TJ_TRACE
     if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:empty_endElement: \"%s\"\n", 
tjctx->name,"");
 #endif
@@ -934,8 +844,7 @@
     /* if ( DOEMIT(tjctx) ) { */
         oid start = tj_popTag(tjctx); /* oid of the first node of the element 
*/
         int size  = tjctx->tijahPre - start - 1; /* the Tijah element size */
-       if ( insertPreSize(tjctx,start,size) < 0 )
-           return 0;
+       dbat_set_int(&tjctx->dbat_collSize,(int)start,size);
 #ifdef TJ_TRACE
         if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:endElement: \"%s\"\n", 
tjctx->name,name);
 #endif
@@ -944,8 +853,6 @@
     return 1;
 }
 
-#define USE_FLEX 1
-
 /**
  * Output generation handler. Handles equivalent of * SAX characters() event.
  */
@@ -954,28 +861,23 @@
     EMPTY_CHECK;
     tjCtx* tjctx = (tjCtx*)ctx->driverWs;
 
+    register char* p = (char*)ch;
+    while( *p && isspace(*p) ) p++;
+    if ( !*p )
+        return 1;
 #ifdef TJ_TRACE
-    if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:handle_characters(%s) 
start\n",tjctx->name, (char*)ch);
+    if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:handle_characters(%s) 
start\n",tjctx->name, p);
 #endif
 
     if ( DOEMIT(tjctx) ) {
-#ifdef USE_FLEX
-        return useFlexScanner((char*)ch,strlen((char*)ch),tjctx);
-#else
-        return useStrtokScanner(tjctx,(char*)ch);
-#endif
+        return useFlexScanner(p,tjctx);
     }
     return 1;
 }
 
 char* normalizeTerm(struct tijahContextStruct *tjctx, char* term ) {
        char *res;
-#ifdef USE_FLEX
-        res = flexScanOneTerm((char*)term,strlen((char*)term));
-#else
-       res = strtok(term,obsoleteNexiChars);
-#endif
-       /* INCOMPLETE, should make shure tijahContext is always avail. here */
+        res = flexScanOneTerm((char*)term);
         if ( res && tjctx && tjctx->stemCtx->stem) {
            if ( !(res = 
(char*)tjctx->stemCtx->stem(tjctx->stemCtx,(char*)res)) ) {
                /* must be a stopword */
@@ -986,14 +888,6 @@
 }
 
 int CMDtj_normalizeTerm(char** res, str term, str stemmer) {
-//Leave tokenization disabled for now
-//    char* tokenized;
-//#ifdef USE_FLEX
-//    tokenized = flexScanOneTerm(term,strlen(term));
-//#else
-//    tokenized = strtok(term,obsoleteNexiChars);
-//#endif
-
     tjStemCtx* stemCtx = getStemmingContext( stemmer );
 
     if ( stemCtx->stem ) {
@@ -1123,13 +1017,9 @@
 #ifdef TJ_TRACE
        if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:FINISH 
INDEXING\n",tjctx->name);
 #endif
-
-        /* feature not used anymore ????? */
-       if ( 0 /* ROEL CHANGE VIRTUAL ROOT SIZE HERE */ )
-               insertPreSize(tjctx,0,tjctx->tijahPre - 1);
-       if ( dbat_finalize(&tjctx->dbat_collPre) < 0 )
+       if ( dbat_finalize(&tjctx->dbat_collPre, tjctx->tijahPre) < 0 )
                return GDK_FAIL;
-       if ( dbat_finalize(&tjctx->dbat_collSize) < 0 )
+       if ( dbat_finalize(&tjctx->dbat_collSize, tjctx->tijahPre) < 0 )
                return GDK_FAIL;
 #ifdef TJ_TRACE
        if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:FINALIZED DIRECT 
BATS\n",tjctx->name);

Index: pftijah_tokenize.l
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah_tokenize.l,v
retrieving revision 1.12
retrieving revision 1.13
diff -u -d -r1.12 -r1.13
--- pftijah_tokenize.l  9 Jan 2007 15:44:39 -0000       1.12
+++ pftijah_tokenize.l  27 Feb 2007 15:43:37 -0000      1.13
@@ -115,7 +115,40 @@
 
 %%
 
-int useFlexScanner(char* buf, int len, struct tijahContextStruct* tjCtx) {
+int OPT0useFlexScanner(char* buf, struct tijahContextStruct* tjCtx) {
+  /* UPDATE: this delivers very strange testset results and should not be
+   * used I think.
+   */
+  /* This is an optimized version of the flex scanner which does not copy the
+   * input buffer. The only strange thing about this interface is that it
+   * requires 2 YY_END_OF_BUFFER_CHAR (eg. 0) at the end of the buffer. The
+   * size of the buffer is inclusive the 2 0's.
+   * The last zero is toggled with its original value to prevent corruption
+   * of memory management tables. This was for me the only way to prevent
+   * copying here.
+   */
+  int len = strlen(buf);
+  char remember = buf[len+1];
+  buf[len+1] = YY_END_OF_BUFFER_CHAR;
+  YY_BUFFER_STATE myBuf = yy_scan_buffer(buf, len+2);
+
+  if ( !myBuf ) {
+      stream_printf(GDKout,"# useFlexScanner: unable to get setup non-copy 
buffer.");
+      return 0;
+  }
+  while ( pftijah_tokenizelex() ) {
+      /* stream_printf(GDKout,"# scan(%s).\n",pftijah_tokenizetext); */
+      if ( !handleTijahTerm(tjCtx, pftijah_tokenizetext) )
+          return 0;
+  }
+  yy_delete_buffer(myBuf);
+  buf[len+1] = remember;
+  return 1;
+}
+
+int useFlexScanner(char* buf, struct tijahContextStruct* tjCtx) {
+  // the original
+  int len = strlen(buf);
   YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
   while (pftijah_tokenizelex()) {
       if ( !handleTijahTerm(tjCtx, pftijah_tokenizetext) )
@@ -125,6 +158,40 @@
   return 1;
 }
 
+int OPT2useFlexScanner(char* input, struct tijahContextStruct* tjctx)
+{
+    /* the fast function. This function is in the pftijah context with lots
+     * of small strings to tokenize many times faster as the flex and the 
+     * strtok() methods which seem to have a rather larger overhead
+     */
+    register char* s = input;
+    register char x;
+// #define EMIT x=*s; *s=0; stream_printf(GDKout,"#[%s]\n",base);if 
(!handleTijahTerm(tjctx,base)) return 0; *s=x
+#define EMIT x=*s; *s=0; if (!handleTijahTerm(tjctx,base)) return 0; *s=x
+
+    while ( 1 ) {
+      while ( isspace( *s ) ) s++;
+      if ( *s ) {
+         char* base = s;
+         if ( isalnum(*s) ) {
+             if ( isdigit(*s) ) {
+                 while ( isdigit(*++s) ) ;
+                 EMIT;
+             } else {
+                 if (isupper(*s)) *s=tolower(*s);
+                 while ( isalnum(*++s) ) if (isupper(*s)) *s=tolower(*s);
+                 EMIT;
+             }
+         } else {
+             // INCOMPLETE, ENTITIES HERE
+             // stream_printf(GDKout,"#[SKIPPING:%c]\n",*s);
+             s++;
+         }
+      } else 
+          return 1;
+    }
+}
+
 char* tijah_tokenize_string(char* buf, int len, char* outbuf) {
   int cnt = 0;
   YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
@@ -137,9 +204,10 @@
   return outbuf;
 }
 
-char* flexScanOneTerm(char* buf, int len) {
+char* flexScanOneTerm(char* buf) {
   char *res;
   char resBUFF[256];
+  int len = strlen(buf);
 
   YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
   if ( pftijah_tokenizelex() ) {

Index: nexi.c
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/nexi.c,v
retrieving revision 1.49
retrieving revision 1.50
diff -u -d -r1.49 -r1.50
--- nexi.c      23 Feb 2007 15:11:05 -0000      1.49
+++ nexi.c      27 Feb 2007 15:43:37 -0000      1.50
@@ -455,6 +455,7 @@
     /*
      * Now find out if the collection is fragmented or not.
      */
+    /* INCOMPLETE, ERROR HERE WITH REFCOUNTS IN HEAD */
     BAT* fb = 
pftu_lookup_bat(pftu_batname1("tj_%s_fragments",(char*)parserCtx->collection,0));
     if ( ! fb ) {
            stream_printf(GDKerr,"Error: cannot find fragments bat for 
collection \"%s\".\n",parserCtx->collection);
@@ -471,6 +472,8 @@
               parserCtx->ffPfx        = "";
               parserCtx->flastPfx     = ", str(1)";
     }
+    BBPunfix(BBPcacheid(fb));
+    fb = NULL;
     // Some special cases for NLLR, since NLLR only works with COARSE2 at the 
moment
     if ( txt_retr_model->model == MODEL_NLLR ) {
         // Switch to COARSE2 algebra for NLLR

Index: pftijah_util.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah_util.mx,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -d -r1.2 -r1.3
--- pftijah_util.mx     9 Jan 2007 17:15:23 -0000       1.2
+++ pftijah_util.mx     27 Feb 2007 15:43:37 -0000      1.3
@@ -73,6 +73,7 @@
     if ( b == bat_nil ) {
        return NULL;
     } else {
+        BBPfix(b);
        return BBPdescriptor(b);
     }
 }


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
Monetdb-pf-checkins mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins

Reply via email to