Update of /cvsroot/monetdb/pathfinder/modules/pftijah
In directory sc8-pr-cvs7.sourceforge.net:/tmp/cvs-serv20056
Modified Files:
nexi.c nexi_generate_mil.c pftijah.mx pftijah_tokenize.l
serialize_pftijah.mx serialize_pftijah_options.mx
Log Message:
- slice pftijah results in milprint_summer. This is a preparation for the tijah-
countiall(query-id) function Arjen requested.
- implemented a very fast tokenizer. Because we shred a lot of separate strings
the startup overhead in flex is considerable. I implemented a very
rudimentary one in "C" using ctype.h functions. This speeds up indexing. To use
it you should set the icollection parameter "tokenizer" to "fast". The old
flex version is still the default.
- made a clean select_root(startNodes) implementation. No more side effects.
Index: serialize_pftijah.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/serialize_pftijah.mx,v
retrieving revision 1.42
retrieving revision 1.43
diff -u -d -r1.42 -r1.43
--- serialize_pftijah.mx 27 Feb 2007 15:43:37 -0000 1.42
+++ serialize_pftijah.mx 1 Mar 2007 11:26:31 -0000 1.43
@@ -31,7 +31,9 @@
extern int handleTijahTerm(struct tijahContextStruct *ctx, char* term);
-extern int useFlexScanner(char* buf, struct tijahContextStruct* tjCtx); /*
FLEX */
+extern int tokenize_flex(char* buf, struct tijahContextStruct* tjCtx);
+extern int tokenize_fast(char* buf, struct tijahContextStruct* tjCtx);
+
extern char* flexScanOneTerm(char* buf);
extern char* normalizeTerm(struct tijahContextStruct *ctx, char* term );
@@ -224,6 +226,7 @@
int tagStackPtr;
/* */
int preExpansion; /* estimation for #TijahPre/PfPre */
+ int tokenize_fast; /* boolean to indicate fast tokenizer */
char* stemmer; /* name of the stemmer used for this collection */
tjStemCtx* stemCtx;
char checkStopWords;
@@ -504,6 +507,11 @@
str str_name = readCollParam(res,"name");
res->name = (char*)str_name;
+ res->tokenize_fast = 0;
+ str str_tokenizer = readCollParam(res,"tokenizer");
+ if ( strcmp(str_tokenizer,"fast") == 0 )
+ res->tokenize_fast = 1;
+
/* */
res->checkStopWords = 1;
str str_stopw = readCollParam(res,"stopwords");
@@ -870,7 +878,10 @@
#endif
if ( DOEMIT(tjctx) ) {
- return useFlexScanner(p,tjctx);
+ if ( tjctx->tokenize_fast)
+ return tokenize_fast(p,tjctx);
+ else
+ return tokenize_flex(p,tjctx);
}
return 1;
}
Index: nexi.c
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/nexi.c,v
retrieving revision 1.50
retrieving revision 1.51
diff -u -d -r1.50 -r1.51
--- nexi.c 27 Feb 2007 15:43:37 -0000 1.50
+++ nexi.c 1 Mar 2007 11:26:30 -0000 1.51
@@ -226,8 +226,10 @@
txt_retr_model->next = NULL;
/** Compiler parameters **/
+#if 0
// The number of elements to return
int retNum = -1; // -1 = unlimited
+#endif
algebra_type = COARSE2;
preproc_type = PLAIN;
scale_on = FALSE;
@@ -273,11 +275,13 @@
if (TDEBUG(1)) stream_printf(GDKout,"# old_main: ignoring
fragmentation setting.\n");
} else if ( strcmp(optName,"background_collection") == 0 ) {
strcpy(background_collection, optVal);
- } else if ( strcmp(optName,"returnNumber") == 0 ||
- strcmp(optName,"retNum") == 0 ||
- strcmp(optName,"top") == 0 ) {
- retNum = atoi( optVal );
-
+#if 0
+ } else if ( strcmp(optName,"returnNumber") == 0 ) {
+ int xx = atoi( optVal );
+ if ( xx < 0 ) {
+ // incomplete should check if number is OK
+ }
+#endif
} else if ( strcmp(optName,"algebraType") == 0 ) {
if ( strcasecmp( optVal, "ASPECT" ) == 0 ) {
algebra_type = ASPECT;
@@ -448,6 +452,8 @@
txt_retr_model->prior_type = NO_PRIOR;
}
+ } else if (strcmp(optName, "returnNumber") == 0) {
+ // ignore, is handled by milprint_summer
} else {
stream_printf(GDKout,"TijahOptions: should handle:
%s=%s\n",optName,optVal);
}
@@ -483,7 +489,9 @@
// Prepend some variables to the MIL code.
MILPRINTF(MILOUT, "tj_setCollName(\"%s\");\n", parserCtx->collection);
+#if 0
MILPRINTF(MILOUT, "retNum := %d;\n", retNum);
+#endif
MILPRINTF(MILOUT, "var stemmer := bat(\"tj_\"+ collName
+\"_param\").find(\"stemmer\");\n");
if (strcmp(background_collection,""))
{ MILPRINTF(MILOUT, "tj_setBackgroundCollName(\"%s\");\n",
background_collection); }
Index: pftijah_tokenize.l
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah_tokenize.l,v
retrieving revision 1.13
retrieving revision 1.14
diff -u -d -r1.13 -r1.14
--- pftijah_tokenize.l 27 Feb 2007 15:43:37 -0000 1.13
+++ pftijah_tokenize.l 1 Mar 2007 11:26:31 -0000 1.14
@@ -115,7 +115,8 @@
%%
-int OPT0useFlexScanner(char* buf, struct tijahContextStruct* tjCtx) {
+#if 0
+int tokenize_flex(char* buf, struct tijahContextStruct* tjCtx) {
/* UPDATE: this delivers very strange testset results and should not be
* used I think.
*/
@@ -133,7 +134,7 @@
YY_BUFFER_STATE myBuf = yy_scan_buffer(buf, len+2);
if ( !myBuf ) {
- stream_printf(GDKout,"# useFlexScanner: unable to get setup non-copy
buffer.");
+ stream_printf(GDKout,"# tokenize_flex: unable to get setup non-copy
buffer.");
return 0;
}
while ( pftijah_tokenizelex() ) {
@@ -145,8 +146,9 @@
buf[len+1] = remember;
return 1;
}
+#endif
-int useFlexScanner(char* buf, struct tijahContextStruct* tjCtx) {
+int tokenize_flex(char* buf, struct tijahContextStruct* tjCtx) {
// the original
int len = strlen(buf);
YY_BUFFER_STATE myBuf = yy_scan_bytes(buf, len);
@@ -158,17 +160,68 @@
return 1;
}
-int OPT2useFlexScanner(char* input, struct tijahContextStruct* tjctx)
+/*
+ * Implementation of the fast tokenizer in "C"
+ *
+ */
+
+/* */
+/* #define TFDEBUG 1 */
+
+#ifdef TFDEBUG
+#define EMIT \
+ tmpc=*s; *s=0; \
+ if (TFDEBUG) stream_printf(GDKout,"#[%s]\n",base); \
+ if (!handleTijahTerm(tjctx,base)) \
+ return 0; \
+ *s=tmpc
+#else
+#define EMIT \
+ tmpc=*s; *s=0; \
+ if (!handleTijahTerm(tjctx,base)) \
+ return 0; \
+ *s=tmpc
+#endif
+
+#define TOLOWER(C) if (isupper(C)) C=tolower(C);
+/* */
+
+char* scan_ENTITY(char* s) {
+ if ( *s++ == '&' ) {
+ if ( *s == '#' ) {
+ // scan [&][#]{Digits}[;]
+ if (isdigit(*++s) ) {
+ s++;
+ while( isdigit(*s) ) s++;
+ if ( *s == ';' )
+ return ++s; // return one past the entity
+ }
+ } else if (isalpha(*s) ) {
+ // scan [&]{Letter}+[;]
+ s++;
+ while( isalpha(*s) ) s++;
+ if ( *s == ';' )
+ return ++s; // return one past the entity
+ }
+ }
+ return NULL;
+}
+
+
+int tokenize_fast(char* input, struct tijahContextStruct* tjctx)
{
/* the fast function. This function is in the pftijah context with lots
* of small strings to tokenize many times faster as the flex and the
- * strtok() methods which seem to have a rather larger overhead
+ * strtok() methods which seem to have a rather larger overhead.
+ * In the future I will implement contraction and other goodies but there's
+ * no time for that yet.
*/
register char* s = input;
- register char x;
-// #define EMIT x=*s; *s=0; stream_printf(GDKout,"#[%s]\n",base);if
(!handleTijahTerm(tjctx,base)) return 0; *s=x
-#define EMIT x=*s; *s=0; if (!handleTijahTerm(tjctx,base)) return 0; *s=x
+ register char tmpc, *tmpp;
+#ifdef TFDEBUG
+ if (TFDEBUG) stream_printf(GDKout,"#[TOKENIZE-FAST:%s\n",s);
+#endif
while ( 1 ) {
while ( isspace( *s ) ) s++;
if ( *s ) {
@@ -178,13 +231,24 @@
while ( isdigit(*++s) ) ;
EMIT;
} else {
- if (isupper(*s)) *s=tolower(*s);
- while ( isalnum(*++s) ) if (isupper(*s)) *s=tolower(*s);
+ TOLOWER(*s);
+ if ( isalnum(*++s) ) { // the length 2 check
+ TOLOWER(*s);
+ while ( isalnum(*++s) ) TOLOWER(*s);
+ EMIT;
+ }
+ }
+ } else if ( *s == '&' ) {
+ if ( (tmpp = scan_ENTITY(s)) ) {
+ s = tmpp; // emit entity till here
EMIT;
+ } else {
+ s++; // no entity, just skip
}
} else {
- // INCOMPLETE, ENTITIES HERE
- // stream_printf(GDKout,"#[SKIPPING:%c]\n",*s);
+#ifdef TFDEBUG
+ if (TFDEBUG) stream_printf(GDKout,"#[SKIPPING:%c]\n",*s);
+#endif
s++;
}
} else
Index: nexi_generate_mil.c
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/nexi_generate_mil.c,v
retrieving revision 1.25
retrieving revision 1.26
diff -u -d -r1.25 -r1.26
--- nexi_generate_mil.c 22 Feb 2007 11:34:42 -0000 1.25
+++ nexi_generate_mil.c 1 Mar 2007 11:26:30 -0000 1.26
@@ -300,7 +300,7 @@
if (p_com->left == NULL && p_com->right == NULL) {
if (!strcmp(p_com->argument,"\"Root\"")) {
- MILPRINTF(MILOUT, "R%d := select_root%s();\n",
com_num,parserCtx->ffPfx);
+ MILPRINTF(MILOUT, "R%d := select_root%s(startNodes);\n",
com_num,parserCtx->ffPfx);
}
else {
MILPRINTF(MILOUT, "R%d := select_node%s(%s,%s);\n", com_num,
parserCtx->ffPfx,p_com->argument, txt_retr_model->e_class);
@@ -1046,7 +1046,9 @@
MILPRINTF(MILOUT, "collect := nil;\n");
}
MILPRINTF(MILOUT, "R%d := R%d.tsort_rev();\n", com_num, com_num);
+#if 0
MILPRINTF(MILOUT, "if ( retNum >= 0 ) { R%d := R%d.slice(0, retNum - 1);
}\n", com_num, com_num);
+#endif
MILPRINTF(MILOUT, "R%d.persists(true).rename(\"nexi_result\");\n",
com_num);
MILPRINTF(MILOUT, "}\n");
Index: serialize_pftijah_options.mx
===================================================================
RCS file:
/cvsroot/monetdb/pathfinder/modules/pftijah/serialize_pftijah_options.mx,v
retrieving revision 1.12
retrieving revision 1.13
diff -u -d -r1.12 -r1.13
--- serialize_pftijah_options.mx 9 Feb 2007 23:41:11 -0000 1.12
+++ serialize_pftijah_options.mx 1 Mar 2007 11:26:31 -0000 1.13
@@ -115,10 +115,13 @@
pftijah_options_handle_attribute(XqueryCtx* ctx, str prefix, str loc, str
value) {
struct PTOS* ptoctx = (struct PTOS*)ctx->driverWs;
(void)prefix;
- if ( 1 && strcmp(loc,"debug") == 0 ) {
+ if ( strcmp(loc,"debug") == 0 ) {
int v = atoi(value);
SET_TDEBUG(v);
if (TDEBUG(1)) stream_printf(GDKout,"# pftijah_option_handler: setting
debug value to %d.\n",v);
+ } else if ( (strcmp(loc,"top")==0) || (strcmp(loc,"retNum")==0) ) {
+ // INCOMPLETE, check if it is a correct number
+ loc = (str)"returnNumber";
}
if ( !BUNins(ptoctx->bat, loc, value, FALSE) ) {
stream_printf(GDKout,"pft_opt_deb:
pftijah_options_handle_attribute(%s,%s) FAILS[BUNins].\n",loc,value);
Index: pftijah.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah.mx,v
retrieving revision 1.99
retrieving revision 1.100
diff -u -d -r1.99 -r1.100
--- pftijah.mx 27 Feb 2007 09:59:10 -0000 1.99
+++ pftijah.mx 1 Mar 2007 11:26:30 -0000 1.100
@@ -328,6 +328,7 @@
- str collName: the name of the collection
- BAT[str,str] param: initialization parameter for collection.\n\
possible values are:\n\
+ tokenizer = { flex, fast }\n\
stemmer = { nostemming, snowball-english, snowball-porter, snowball-dutch
}\n\
fragmentSize = [number] == the maximum size of a [pre|item] fragment\n\
tagFilter = [comma seperated list of tags to be indexed]\n\
@@ -385,13 +386,16 @@
#
# now read the param file
#
- var stemmer := "nostemming";
- var tagfilter := "";
+ var stemmer := "nostemming";
+ var tokenizer := "flex";
+ var tagfilter := "";
[EMAIL PROTECTED]() {
if ( verbose ) printf("#TJ:
tj_init_global():param[%s]=\"%s\"\n",$h,$t);
if ( $h = "stemmer" ) {
stemmer := $t;
+ } else if ( $h = "tokenizer" ) {
+ tokenizer := $t;
} else if ( $h = "pf_collection" ) {
bat("tj_" + collName + "_param").insert($h,$t);
} else if ( $h = "fragmentSize" ) {
@@ -408,6 +412,7 @@
bat("tj_" + collName + "_param").insert("_version","1.01");
bat("tj_" + collName + "_param").insert("name",collName);
bat("tj_" + collName + "_param").insert("height","0");
+ bat("tj_" + collName + "_param").insert("tokenizer",tokenizer);
bat("tj_" + collName + "_param").insert("stemmer",stemmer);
bat("tj_" + collName + "_param").insert("tagFilter",tagfilter);
bat("tj_" + collName + "_param").insert("preExpansion","4");
@@ -1069,13 +1074,13 @@
# Returns a bat [preorder rank,score].
# The score is initalized based on the scoreBase global variable.
##
-PROC select_root() : bat[oid,dbl] :=
+PROC select_root(bat[void,oid] par_startNodes) : bat[oid,dbl] :=
{
- if ( not( isnil( startNodes ) ) ) {
+ if ( not( isnil( par_startNodes ) ) ) {
# Start from a set of starting nodes if available.
# It is assumed that the startNodes are [any, pre]
- if ( count( startNodes ) > 0 ) {
- var root_reg := startNodes.reverse().sort().project(
dbl(scoreBase) );
+ if ( count( par_startNodes ) > 0 ) {
+ var root_reg := par_startNodes.reverse().sort().project(
dbl(scoreBase) );
return root_reg;
} else {
var root_reg := new(oid,dbl,1);
@@ -1094,11 +1099,11 @@
}
}
-PROC select_root_frag() : bat[oid,bat] :=
+PROC select_root_frag(bat[void,oid] par_startNodes) : bat[oid,bat] :=
{
var res := new(oid,bat);
- res.insert([EMAIL PROTECTED], select_root() );
+ res.insert([EMAIL PROTECTED], select_root(par_startNodes) );
return res;
}
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
Monetdb-pf-checkins mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins