On 2007-02-27 16:43, Jan Flokstra wrote: > Update of /cvsroot/monetdb/pathfinder/modules/pftijah > In directory sc8-pr-cvs7.sourceforge.net:/tmp/cvs-serv27686 > > Modified Files: > nexi.c pftijah_tokenize.l pftijah_util.mx serialize_pftijah.mx > Log Message: > * repair BBP refcount bug for BAT
Is this a fix which also applies to the stable branch?
> * reimplement the direct bat access methods in pftijah serialization for more
> speed (and clarity).
>
> * Start optimizing the pftijah tokenizer. The flex functions are called
> once
> per handle_character() call. This leads to 2 malloc's per call. I tried to
> do without the malloc's but this led to a lot of strange results:-)
> I am now planning to craft the flexer by hand. The first small experiment
> shows there is a lot to gain there. (25% speedup in indexing time).
>
>
>
> Index: serialize_pftijah.mx
> ===================================================================
> RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/serialize_pftijah.mx,v
> retrieving revision 1.41
> retrieving revision 1.42
> diff -u -d -r1.41 -r1.42
> --- serialize_pftijah.mx 23 Feb 2007 15:11:07 -0000 1.41
> +++ serialize_pftijah.mx 27 Feb 2007 15:43:37 -0000 1.42
> @@ -31,8 +31,8 @@
>
> extern int handleTijahTerm(struct tijahContextStruct *ctx, char* term);
>
> -extern int useFlexScanner(char* buf, int len, struct tijahContextStruct*
> tjCtx); /* FLEX */
> -extern char* flexScanOneTerm(char* buf, int len);
> +extern int useFlexScanner(char* buf, struct tijahContextStruct* tjCtx); /*
> FLEX */
> +extern char* flexScanOneTerm(char* buf);
>
> extern char* normalizeTerm(struct tijahContextStruct *ctx, char* term );
>
> @@ -70,15 +70,10 @@
> typedef struct dbat_struct {
> const char* name;
> BAT* bat;
> - int oid_mark;
> - int max_i;
> - int max_sz;
> - bit dflt; /* fill with default value during extend */
> - int dflt_int; /* the default int value */
> - chr dflt_chr; /* the default chr value */
> - oid dflt_oid; /* the default oid value */
> - /* */
> - union { /* cast to perform direct indexex insert in [void,any] BATs
> */
> + oid raw_max;
> + oid seqbase;
> + oid seq_max;
> + union { /* cast to perform direct indexe insert in [void,any] BATs */
> void* voidCAST; /* the basecast */
> chr* chrCAST; /* cast for [void,chr] BAT */
> int* intCAST; /* cast for [void,int] BAT */
> @@ -89,7 +84,6 @@
> int dbat_init(const char* name, dbat* dbat, BAT* b) {
> dbat->name = name;
> dbat->bat = b;
> - dbat->dflt = FALSE;
> if ( dbat->bat->htype != TYPE_void ) {
> stream_printf(GDKerr,"ERROR: dbat_init(%s) non void
> BAT\n",dbat->name);
> return 0;
> @@ -98,31 +92,25 @@
> stream_printf(GDKerr,"ERROR: dbat_init(%s) unknown
> ttype(%d)\n",dbat->name,dbat->bat->ttype);
> return 0;
> }
> - dbat->oid_mark = b->hseqbase;
> - dbat->max_i = dbat->max_sz = BATcount(dbat->bat);
> + dbat->seqbase = (oid)b->hseqbase;
> + dbat->raw_max = (oid)BATcount(dbat->bat);
> + dbat->seq_max = dbat->raw_max + dbat->seqbase;
> dbat->cast.voidCAST = (void*)BUNfirst(dbat->bat);
> /* */
> return 1;
> }
>
> -int dbat_finalize(dbat* dbat) {
> - BAT* b = dbat->bat;
>
> +int dbat_finalize(dbat* dbat, int topidx) {
> void* top;
> + BAT* b = dbat->bat;
>
> - int bottomTop = dbat->max_i;
> - if ( 0 ) stream_printf(GDKout,"dbat_finalize(size=%d)\n",dbat->max_i);
> + topidx -= (int)dbat->seqbase;
> + int bottomTop = topidx;
> switch( b->ttype ) {
> case TYPE_int :
> top = &dbat->cast.intCAST[bottomTop];
> break;
> - case TYPE_chr: {
> - b->batBuns->free = dbat->max_i;
> - BATsetcount(b, dbat->max_i);
> - b->tsorted = 0;
> - b->batDirty = TRUE; /* VERY important this one */
> - return 1;
> - }
> case TYPE_oid:
> top = &dbat->cast.oidCAST[bottomTop];
> break;
> @@ -137,7 +125,7 @@
> /* */
> dbat->name = NULL;
> dbat->bat = NULL;
> - dbat->max_i = dbat->max_sz = 0;
> + dbat->raw_max = dbat->seqbase = 0;
> /* */
> return 1;
> }
> @@ -145,9 +133,14 @@
> #define MINCHUNK 8192
> #define MAXCHUNK 67108864
>
> -int dbat_extend(dbat* dbat, int i_mark) {
> - /* CHECK THIS if ( i_mark ) i = i_mark - dbat->oid_mark; */
> - size_t newsize =
> MAX(MIN(MAX(MINCHUNK,dbat->max_sz*2),(size_t)(dbat->max_sz+MAXCHUNK)),(size_t)i_mark);
> +int dbat_extend(dbat* dbat, oid min_i /*raw-index*/, size_t forced_size) {
> + size_t newsize;
> +
> + if ( forced_size ) {
> + newsize = forced_size;
> + } else {
> + newsize =
> MAX(MIN(MAX(MINCHUNK,dbat->raw_max*2),(size_t)(dbat->raw_max+MAXCHUNK)),(size_t)min_i);
> + }
>
> /* first check if the number of BUN's < INT_MAX. If this was the case
> * and the previous time INT_MAX was returned this means the BAT cannot
> @@ -156,94 +149,34 @@
> if ( newsize > INT_MAX ) {
> newsize = INT_MAX;
>
> - if ( dbat->max_sz == INT_MAX ) {
> + if ( dbat->raw_max == INT_MAX ) {
> GDKerror("dbat_extend: BATextend[\"%s\"](size>INT_MAX)
> fails\n","incomplete");
> return -1;
> }
> }
> - if ( 0 ) { stream_printf(GDKout,"dbat_extend[%s](%d ->
> %d)\n",dbat->name,dbat->max_sz,newsize); }
> - dbat->max_sz= newsize;
> +#if 0
> + stream_printf(GDKout,"dbat_extend[%s](%d ->
> %d)\n",dbat->name,dbat->raw_max,newsize);
> +#endif
> + dbat->raw_max= newsize;
> + dbat->seq_max = dbat->raw_max + dbat->seqbase;
> if ( !(dbat->bat = BATextend(dbat->bat,newsize)) ) {
> GDKerror("dbat_extend: BATextend[\"%s\"](to %d)
> fails\n","incomplete",newsize);
> return -1;
> }
> dbat->cast.voidCAST = (void*)BUNfirst(dbat->bat);
> - /*
> - * now check if there's a default value handler used
> - *
> - */
> - if ( dbat->dflt ) {
> - switch( dbat->bat->ttype ) {
> - case TYPE_int : {
> - int v = dbat->dflt_int;
> - int *to = &dbat->cast.intCAST[dbat->max_sz];
> - for(register int *p = &dbat->cast.intCAST[dbat->max_i]; p<to;)
> - *p++ = v;
> - break;
> - }
> - case TYPE_chr: {
> - chr v = dbat->dflt_chr;
> - chr *to = &dbat->cast.chrCAST[dbat->max_sz];
> - for(register chr *p = &dbat->cast.chrCAST[dbat->max_i]; p<to;)
> - *p++ = v;
> - break;
> - }
> - case TYPE_oid: {
> - oid v = dbat->dflt_oid;
> - oid *to = &dbat->cast.oidCAST[dbat->max_sz];
> - for(register oid *p = &dbat->cast.oidCAST[dbat->max_i]; p<to;)
> - *p++ = v;
> - break;
> - }
> - default:
> - GDKerror("dbat_extend: bad ttype\n");
> - return -1;
> - }
> - }
> - /* */
> return 1;
> }
>
> int dbat_sizeHint(dbat* dbat, int sizeHint_mark) {
> - int sizeHint = sizeHint_mark - dbat->oid_mark;
> - int estimate = dbat->max_i + sizeHint;
> -
> - return dbat_extend(dbat, estimate);
> -}
> -
> -INLINE static int dbat_set_oid(dbat* dbat, int pos_mark, oid v) {
> - register int pos;
> + int sizeHint = sizeHint_mark - dbat->seqbase;
> + int estimate = dbat->raw_max + sizeHint;
>
> - if ( (pos=pos_mark - dbat->oid_mark) < dbat->max_i ) {
> - dbat->cast.oidCAST[pos] = v;
> - return 1;
> - } else {
> - if ( pos >= dbat->max_sz ) {
> - if ( dbat_extend(dbat,pos) < 0 )
> - return -1;
> - }
> - dbat->max_i = pos + 1;
> - dbat->cast.oidCAST[pos] = v;
> - return 1;
> - }
> + return dbat_extend(dbat, estimate, 0);
> }
>
> -INLINE static int dbat_set_int(dbat* dbat, int pos_mark, int v) {
> - register int pos;
> +#define dbat_set_oid(DBAT,I,V) (DBAT)->cast.oidCAST[I-(DBAT)->seqbase] = V
>
> - if ( (pos=pos_mark - dbat->oid_mark) < dbat->max_i ) {
> - dbat->cast.intCAST[pos] = v;
> - return 1;
> - } else {
> - if ( pos >= dbat->max_sz ) {
> - if ( dbat_extend(dbat,pos) < 0 )
> - return -1;
> - }
> - dbat->max_i = pos + 1;
> - dbat->cast.intCAST[pos] = v;
> - return 1;
> - }
> -}
> +#define dbat_set_int(DBAT,I,V) (DBAT)->cast.intCAST[I-(DBAT)->seqbase] = V
>
> /************************************************
> *
> @@ -315,9 +248,6 @@
>
> /************************************************
> *
> - *
> - * First the temporary shredder for Tijah by JF
> - *
> */
>
> INLINE static oid
> @@ -328,15 +258,15 @@
> BUN bun;
>
> HASHfnd_str(bun, tjctx->hm_globalTag, (str)t);
> - if ( bun )
> - /* if ( (bun = BUNfnd(tjctx->hm_globalTag,t)) ) OLD */
> + if ( bun ) {
> return *(oid*)BUNtail(tjctx->hm_globalTag,bun);
> - else {
> - if ( !BUNins(tjctx->b_globalTag, &tjctx->n_globalTag, (str)t, FALSE) ) {
> + } else {
> + if ( BUNins(tjctx->b_globalTag, &tjctx->n_globalTag, (str)t, FALSE) ) {
> + return tjctx->n_globalTag++;
> + } else {
> GDKerror("INSERT OF \"%s\" in globalTag fails.\n");
> return oid_nil;
> - } else
> - return tjctx->n_globalTag++;
> + }
> }
> #endif
> }
> @@ -349,10 +279,7 @@
> BUN bun;
>
> HASHfnd_str(bun, tjctx->hm_globalTerm, (str)t);
> - if ( bun ) {
> - return *(oid*)BUNtail(tjctx->hm_globalTerm,bun);
> - } else
> - return oid_nil;
> + return ( bun ? *(oid*)BUNtail(tjctx->hm_globalTerm,bun) : oid_nil );
> }
>
> INLINE static oid
> @@ -366,22 +293,35 @@
> if ( bun )
> return *(oid*)BUNtail(tjctx->hm_globalTerm,bun);
> else {
> - if ( !BUNins(tjctx->b_globalTerm, &tjctx->n_globalTerm, (str)t, FALSE)){
> + if ( BUNins(tjctx->b_globalTerm, &tjctx->n_globalTerm, (str)t, FALSE)){
> + return tjctx->n_globalTerm++;
> + } else {
> GDKerror("INSERT OF \"%s\" in globalTerm fails.\n");
> return oid_nil;
> - } else
> - return tjctx->n_globalTerm++;
> + }
> }
> #endif
> }
>
> -#define tj_add2plane(TJCTX,O) \
> - ((dbat_set_oid(&(TJCTX)->dbat_collPre, (TJCTX)->tijahPre, O) < 0) \
> - ? oid_nil : ((oid)(TJCTX)->tijahPre++))
> +INLINE oid tj_extend_plane(struct tijahContextStruct *tjctx) {
> + oid base = tjctx->tijahPre - tjctx->dbat_collPre.seqbase;
>
> -#define insertPreSize(TJCTX,POS,SIZE) \
> - dbat_set_int(&TJCTX->dbat_collSize,(int)POS,SIZE)
> + if ( base >= tjctx->dbat_collPre.raw_max ) {
> + if ( dbat_extend(&tjctx->dbat_collPre,base, 0) < 0 )
> + return oid_nil;
> + /* IMPORTANT: the size of the two bats is synchronized by the use
> + * of the forced size (last) parameter of dbat_extend
> + */
> + if (
> dbat_extend(&tjctx->dbat_collSize,base,tjctx->dbat_collPre.raw_max) < 0 )
> + return oid_nil;
> + }
> + return tjctx->tijahPre++;
> +}
>
> +#define tj_newPre(TJCTX) \
> + (((TJCTX)->tijahPre < (TJCTX)->dbat_collPre.seq_max) \
> + ? \
> + ((oid)(TJCTX)->tijahPre++) : tj_extend_plane(TJCTX))
>
> int
> handleTijahTerm(struct tijahContextStruct *tjctx, char* term) {
> @@ -397,13 +337,13 @@
> }
> }
> if ( (termOid = tj_termOid(tjctx, term)) == oid_nil )
> - return 0;
> + return -1;
> }
> if ( termOid ) { /* term is not a stopword */
> - if ( (tjPre = tj_add2plane(tjctx, termOid)) == oid_nil )
> + if ( (tjPre = tj_newPre(tjctx) ) == oid_nil )
> return 0;
> - if ( insertPreSize(tjctx,tjPre,0) < 0 )
> - return -1;
> + dbat_set_oid(&tjctx->dbat_collPre, tjPre, termOid);
> + dbat_set_int(&tjctx->dbat_collSize,(int)tjPre,0);
> #ifdef TJ_TRACE
> if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:TERM: \"%s\",
> termoid=%d, tjPre=%d\n",tjctx->name,term,termOid,tjPre);
> #endif
> @@ -416,40 +356,13 @@
> return 1;
> }
>
> -/************
> - *
> - * The part where the Strings from Pathfinder are shredded into words
> - * by Tijah. The USE_FLEX macro determines if the strings is shredded
> - * by Hennings fancy flex scanner or Jan's simple strtok() scanner.
> - */
> -
> -const char* obsoleteNexiChars = " \t\n\r,:;&[EMAIL PROTECTED]";
> -
> -int
> -useStrtokScanner(tjCtx* tjctx, char* s)
> -{
> - char *t;
> - int sz = 0;
> -
> -#ifdef TJ_TRACE
> - if (TJ_TRACE) stream_printf(GDKout,"C[%s]:CHARACTERS:\n",tjctx->name);
> -#endif
> - if ( (t = strtok(s,obsoleteNexiChars)) ) do {
> - /* not the empty string here */
> - if ( handleTijahTerm(tjctx,t) < 0 )
> - return -1;
> - sz++;
> - } while ( (t=strtok(NULL,obsoleteNexiChars)) );
> - return 1;
> -}
> -
> /************************************************
> *
> * Now the real output handlers
> */
>
>
> -#ifdef notused
> +#if 0
> static int
> handle_sizeHint(XqueryCtx* ctx, int hinted_size) {
> tjCtx* tjctx = (tjCtx*)ctx->driverWs;
> @@ -502,14 +415,12 @@
> return (str)str_nil;
> }
>
> -#define GUESSFORCE FALSE
> -
> /*
> * Replace the value of a collection parameter int the collection parameter
> * bat
> */
> static int replaceCollParam(tjCtx* tjctx, str param, str val) {
> - return ( BUNreplace(tjctx->b_collParam,param,val,GUESSFORCE) != NULL );
> + return ( BUNreplace(tjctx->b_collParam,param,val,FALSE) != NULL );
> }
>
> static BAT*
> @@ -894,10 +805,10 @@
> /* if ( DOEMIT(tjctx) ) { */
> if ( (termOid = tj_tagOid(tjctx, name)) == oid_nil )
> return 0;
> - if ( (tjPre = tj_add2plane(tjctx, termOid)) == oid_nil )
> + if ( (tjPre = tj_newPre(tjctx) ) == oid_nil )
> return 0;
> + dbat_set_oid(&tjctx->dbat_collPre, tjPre, termOid);
> if ( tj_pushTag(tjctx,tjPre) < 0 ) return 0;
> - if ( 0 ) stream_printf(GDKout,"C[%s]:startElement: \"%s\",
> termoid=%d, Tijah pre#=%d, Pathfinder
> pre#=%d\n",tjctx->name,name,termOid,tjPre,pre);
> #ifdef TJ_TRACE
> if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:startElement: \"%s\",
> termoid=%d, Tijah pre#=%d, Pathfinder
> pre#=%d\n",tjctx->name,name,termOid,tjPre,pre);
> #endif
> @@ -913,8 +824,7 @@
> --tjctx->doc_height;
> oid start = tj_popTag(tjctx); /* oid of the first node of the element */
> int size = tjctx->tijahPre - start - 1; /* the Tijah element size */
> - if ( insertPreSize(tjctx,start,size) < 0 )
> - return 0;
> + dbat_set_int(&tjctx->dbat_collSize,(int)start,size);
> #ifdef TJ_TRACE
> if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:empty_endElement: \"%s\"\n",
> tjctx->name,"");
> #endif
> @@ -934,8 +844,7 @@
> /* if ( DOEMIT(tjctx) ) { */
> oid start = tj_popTag(tjctx); /* oid of the first node of the
> element */
> int size = tjctx->tijahPre - start - 1; /* the Tijah element size */
> - if ( insertPreSize(tjctx,start,size) < 0 )
> - return 0;
> + dbat_set_int(&tjctx->dbat_collSize,(int)start,size);
> #ifdef TJ_TRACE
> if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:endElement: \"%s\"\n",
> tjctx->name,name);
> #endif
> @@ -944,8 +853,6 @@
> return 1;
> }
>
> -#define USE_FLEX 1
> -
> /**
> * Output generation handler. Handles equivalent of * SAX characters() event.
> */
> @@ -954,28 +861,23 @@
> EMPTY_CHECK;
> tjCtx* tjctx = (tjCtx*)ctx->driverWs;
>
> + register char* p = (char*)ch;
> + while( *p && isspace(*p) ) p++;
> + if ( !*p )
> + return 1;
> #ifdef TJ_TRACE
> - if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:handle_characters(%s)
> start\n",tjctx->name, (char*)ch);
> + if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:handle_characters(%s)
> start\n",tjctx->name, p);
> #endif
>
> if ( DOEMIT(tjctx) ) {
> -#ifdef USE_FLEX
> - return useFlexScanner((char*)ch,strlen((char*)ch),tjctx);
> -#else
> - return useStrtokScanner(tjctx,(char*)ch);
> -#endif
> + return useFlexScanner(p,tjctx);
> }
> return 1;
> }
>
> char* normalizeTerm(struct tijahContextStruct *tjctx, char* term ) {
> char *res;
> -#ifdef USE_FLEX
> - res = flexScanOneTerm((char*)term,strlen((char*)term));
> -#else
> - res = strtok(term,obsoleteNexiChars);
> -#endif
> - /* INCOMPLETE, should make shure tijahContext is always avail. here */
> + res = flexScanOneTerm((char*)term);
> if ( res && tjctx && tjctx->stemCtx->stem) {
> if ( !(res =
> (char*)tjctx->stemCtx->stem(tjctx->stemCtx,(char*)res)) ) {
> /* must be a stopword */
> @@ -986,14 +888,6 @@
> }
>
> int CMDtj_normalizeTerm(char** res, str term, str stemmer) {
> -//Leave tokenization disabled for now
> -// char* tokenized;
> -//#ifdef USE_FLEX
> -// tokenized = flexScanOneTerm(term,strlen(term));
> -//#else
> -// tokenized = strtok(term,obsoleteNexiChars);
> -//#endif
> -
> tjStemCtx* stemCtx = getStemmingContext( stemmer );
>
> if ( stemCtx->stem ) {
> @@ -1123,13 +1017,9 @@
> #ifdef TJ_TRACE
> if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:FINISH
> INDEXING\n",tjctx->name);
> #endif
> -
> - /* feature not used anymore ????? */
> - if ( 0 /* ROEL CHANGE VIRTUAL ROOT SIZE HERE */ )
> - insertPreSize(tjctx,0,tjctx->tijahPre - 1);
> - if ( dbat_finalize(&tjctx->dbat_collPre) < 0 )
> + if ( dbat_finalize(&tjctx->dbat_collPre, tjctx->tijahPre) < 0 )
> return GDK_FAIL;
> - if ( dbat_finalize(&tjctx->dbat_collSize) < 0 )
> + if ( dbat_finalize(&tjctx->dbat_collSize, tjctx->tijahPre) < 0 )
> return GDK_FAIL;
> #ifdef TJ_TRACE
> if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:FINALIZED DIRECT
> BATS\n",tjctx->name);
>
> Index: pftijah_tokenize.l
> ===================================================================
> RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah_tokenize.l,v
> retrieving revision 1.12
> retrieving revision 1.13
> diff -u -d -r1.12 -r1.13
> --- pftijah_tokenize.l 9 Jan 2007 15:44:39 -0000 1.12
> +++ pftijah_tokenize.l 27 Feb 2007 15:43:37 -0000 1.13
> @@ -115,7 +115,40 @@
>
> %%
>
> -int useFlexScanner(char* buf, int len, struct tijahContextStruct* tjCtx) {
> +int OPT0useFlexScanner(char* buf, struct tijahContextStruct* tjCtx) {
> + /* UPDATE: this delivers very strange testset results and should not be
> + * used I think.
> + */
> + /* This is an optimized version of the flex scanner which does not copy the
> + * input buffer. The only strange thing about this interface is that it
> + * requires 2 YY_END_OF_BUFFER_CHAR (eg. 0) at the end of the buffer. The
> + * size of the buffer is inclusive the 2 0's.
> + * The last zero is toggled with its original value to prevent corruption
> + * of memory management tables. This was for me the only way to prevent
> + * copying here.
> + */
> + int len = strlen(buf);
> + char remember = buf[len+1];
> + buf[len+1] = YY_END_OF_BUFFER_CHAR;
> + YY_BUFFER_STATE myBuf = yy_scan_buffer(buf, len+2);
> +
> + if ( !myBuf ) {
> + stream_printf(GDKout,"# useFlexScanner: unable to get setup non-copy
> buffer.");
> + return 0;
> + }
> + while ( pftijah_tokenizelex() ) {
> + /* stream_printf(GDKout,"# scan(%s).\n",pftijah_tokenizetext); */
> + if ( !handleTijahTerm(tjCtx, pftijah_tokenizetext) )
> + return 0;
> + }
> + yy_delete_buffer(myBuf);
> + buf[len+1] = remember;
> + return 1;
> +}
> +
> +int useFlexScanner(char* buf, struct tijahContextStruct* tjCtx) {
> + // the original
> + int len = strlen(buf);
> YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
> while (pftijah_tokenizelex()) {
> if ( !handleTijahTerm(tjCtx, pftijah_tokenizetext) )
> @@ -125,6 +158,40 @@
> return 1;
> }
>
> +int OPT2useFlexScanner(char* input, struct tijahContextStruct* tjctx)
> +{
> + /* the fast function. This function is in the pftijah context with lots
> + * of small strings to tokenize many times faster as the flex and the
> + * strtok() methods which seem to have a rather larger overhead
> + */
> + register char* s = input;
> + register char x;
> +// #define EMIT x=*s; *s=0; stream_printf(GDKout,"#[%s]\n",base);if
> (!handleTijahTerm(tjctx,base)) return 0; *s=x
> +#define EMIT x=*s; *s=0; if (!handleTijahTerm(tjctx,base)) return 0; *s=x
> +
> + while ( 1 ) {
> + while ( isspace( *s ) ) s++;
> + if ( *s ) {
> + char* base = s;
> + if ( isalnum(*s) ) {
> + if ( isdigit(*s) ) {
> + while ( isdigit(*++s) ) ;
> + EMIT;
> + } else {
> + if (isupper(*s)) *s=tolower(*s);
> + while ( isalnum(*++s) ) if (isupper(*s)) *s=tolower(*s);
> + EMIT;
> + }
> + } else {
> + // INCOMPLETE, ENTITIES HERE
> + // stream_printf(GDKout,"#[SKIPPING:%c]\n",*s);
> + s++;
> + }
> + } else
> + return 1;
> + }
> +}
> +
> char* tijah_tokenize_string(char* buf, int len, char* outbuf) {
> int cnt = 0;
> YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
> @@ -137,9 +204,10 @@
> return outbuf;
> }
>
> -char* flexScanOneTerm(char* buf, int len) {
> +char* flexScanOneTerm(char* buf) {
> char *res;
> char resBUFF[256];
> + int len = strlen(buf);
>
> YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
> if ( pftijah_tokenizelex() ) {
>
> Index: nexi.c
> ===================================================================
> RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/nexi.c,v
> retrieving revision 1.49
> retrieving revision 1.50
> diff -u -d -r1.49 -r1.50
> --- nexi.c 23 Feb 2007 15:11:05 -0000 1.49
> +++ nexi.c 27 Feb 2007 15:43:37 -0000 1.50
> @@ -455,6 +455,7 @@
> /*
> * Now find out if the collection is fragmented or not.
> */
> + /* INCOMPLETE, ERROR HERE WITH REFCOUNTS IN HEAD */
> BAT* fb =
> pftu_lookup_bat(pftu_batname1("tj_%s_fragments",(char*)parserCtx->collection,0));
> if ( ! fb ) {
> stream_printf(GDKerr,"Error: cannot find fragments bat for
> collection \"%s\".\n",parserCtx->collection);
> @@ -471,6 +472,8 @@
> parserCtx->ffPfx = "";
> parserCtx->flastPfx = ", str(1)";
> }
> + BBPunfix(BBPcacheid(fb));
> + fb = NULL;
> // Some special cases for NLLR, since NLLR only works with COARSE2 at
> the moment
> if ( txt_retr_model->model == MODEL_NLLR ) {
> // Switch to COARSE2 algebra for NLLR
>
> Index: pftijah_util.mx
> ===================================================================
> RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah_util.mx,v
> retrieving revision 1.2
> retrieving revision 1.3
> diff -u -d -r1.2 -r1.3
> --- pftijah_util.mx 9 Jan 2007 17:15:23 -0000 1.2
> +++ pftijah_util.mx 27 Feb 2007 15:43:37 -0000 1.3
> @@ -73,6 +73,7 @@
> if ( b == bat_nil ) {
> return NULL;
> } else {
> + BBPfix(b);
> return BBPdescriptor(b);
> }
> }
>
>
> -------------------------------------------------------------------------
> Take Surveys. Earn Cash. Influence the Future of IT
> Join SourceForge.net's Techsay panel and you'll get the chance to share your
> opinions on IT & business topics through brief surveys-and earn cash
> http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
> _______________________________________________
> Monetdb-pf-checkins mailing list
> [email protected]
> https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins
--
Sjoerd Mullender
signature.asc
Description: OpenPGP digital signature
------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys-and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________ Monetdb-pf-checkins mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins
