Update of /cvsroot/monetdb/pathfinder/modules/pftijah
In directory 23jxhf1.ch3.sourceforge.com:/tmp/cvs-serv16016
Modified Files:
pftijah.mx serialize_pftijah.mx
Log Message:
- Add the INEX path generator to the serialiser.
For every indexed element node, an INEX-style path is stored in a BAT.
This functionality is guarded by #ifdef USE_INEX_PATH, which must be
enabled manually.
U serialize_pftijah.mx
Index: serialize_pftijah.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/serialize_pftijah.mx,v
retrieving revision 1.74
retrieving revision 1.75
diff -u -d -r1.74 -r1.75
--- serialize_pftijah.mx 15 May 2009 11:25:42 -0000 1.74
+++ serialize_pftijah.mx 22 Sep 2009 07:15:58 -0000 1.75
@@ -42,6 +42,8 @@
/* #define TJ_TRACE 1 */
+/* #define USE_INEX_PATH 1 */
+
/*******************************************
* serialize_pftijah.c : XML serialization
*
@@ -202,6 +204,162 @@
}
#endif
+#ifdef USE_INEX_PATH
+
+/*
+ * Start of the implementation of the INEX path generator. During indexing
+ * this structure will keep track of the used tags in the current path. The
+ * number of times a tagname is used as a child of the current parent is
+ * counted and printed in the current path. For every element node the
+ * path is stored in the tj_FTINDEX_path1 bat.
+ *
+ */
+
+
+/*
+ * The main stackframe bat is a union of two possibilities.
+ * The stack consists of a 'level' frame followed by a number of tags used
+ * at this level. A typical stack may look like:
+ *
+ * tag
+ * tag
+ * tag
+ * level
+ * tag
+ * tag
+ * level
+ * tag
+ * level[bottom]
+ */
+typedef union {
+ struct {
+ int ntags; /* number of tags at this level */
+ int subpath; /* index of subpath in printbuffer */
+ int prev_frame; /* previous level frame, -1 for bottom */
+ } level;
+ struct {
+ str strVal; /* string value of the tagname */
+ int cnt; /* number of times used at this level */
+ } tag;
+} IpgFrame;
+
+/* The main structure managing the Inex path stack and printbuffer */
+typedef struct InexPathGenBase {
+ BAT* b_pre_path; /* [oid,str] bat wher path is stored*/
+ int top; /* index of the top of the stack */
+ int max; /* maximum possible size of stack */
+ IpgFrame* stack; /* the physical stack */
+ char* pathbuff; /* the path printing buffer */
+} IpgBase;
+
+#define IPG_INIT_STACKSZ 512
+
+#define IPG_INIT_PATHSZ 8192
+
+
+static IpgBase* ipg_init(BAT* b) {
+
+ IpgBase* res;
+ if ( !(res = (IpgBase*)GDKmalloc( sizeof(IpgBase) )) )
+ return NULL;
+ res->b_pre_path = b;
+ res->top = 0;
+ res->max = IPG_INIT_STACKSZ;
+ res->stack = (IpgFrame*)GDKmalloc( res->max * sizeof(IpgFrame) );
+ if ( !res->stack )
+ return NULL;
+ res->pathbuff = (char*)GDKmalloc( IPG_INIT_PATHSZ );
+ if (0) stream_printf(GDKout,"-IPG[ipg_init() called.]\n");
+ return res;
+}
+
+/* function used for starting a new document */
+static int ipg_startdoc(IpgBase* base, str doc) {
+ if (0) stream_printf(GDKout,"-IPG[start: doc=%s]\n",doc);
+ base->top = 0;
+ base->stack[base->top].level.ntags = 0;
+ base->stack[base->top].level.prev_frame = -1;
+ sprintf(base->pathbuff,"fn:doc(\"%s\")",doc);
+ base->stack[base->top].level.subpath = strlen(base->pathbuff);
+ return 1;
+}
+
+/* allocate one new frame element on top of stack */
+static INLINE int ipg_frame_alloc(IpgBase* base) {
+ int newtop = base->top+base->stack[base->top].level.ntags + 1;
+
+ if ( newtop >= base->max ) {
+ if (1) stream_printf(GDKout,"-IPG[ipg_frame_alloc()
extending.]\n");
+ base->max = 2 * base->max;
+ base->stack = (IpgFrame*)GDKrealloc(base->stack,
+ base->max * sizeof(IpgFrame) );
+ if ( !base->stack )
+ return -1;
+ }
+ return newtop;
+}
+
+static int ipg_pushtag(IpgBase* base, str tag, oid o) {
+ IpgFrame* found = NULL;
+
+ for(int i=0; i< base->stack[base->top].level.ntags && !found; i++) {
+ IpgFrame* f = & base->stack[base->top+i+1];
+
+ if (strcmp(f->tag.strVal,tag) == 0 ) {
+ found = f;
+ }
+ }
+ if ( !found ) {
+ ipg_frame_alloc(base);
+ base->stack[base->top].level.ntags++;
+ found = &
base->stack[base->top+base->stack[base->top].level.ntags];
+ found->tag.strVal = (str)GDKstrdup(tag);
+ found->tag.cnt = 0;
+ }
+ found->tag.cnt++;
+
+ /* now push a new frame on stack */
+ int newtop = ipg_frame_alloc(base);
+ base->stack[newtop].level.ntags = 0;
+ base->stack[newtop].level.prev_frame = base->top;
+
+
sprintf(&base->pathbuff[base->stack[base->top].level.subpath],"/%s[%d]",tag,found->tag.cnt);
+ base->stack[newtop].level.subpath = strlen(base->pathbuff);
+
+ base->top = newtop;
+
+ if ( !BUNins(base->b_pre_path, &o,(str)base->pathbuff, 0)) {
+ GDKerror("error writing recursive tagbat");
+ return -1;
+ }
+
+ if (0) stream_printf(GDKout,"-IPG[tijah(" OIDFMT "@0) =
%s]\n",o,base->pathbuff);
+
+ return 1;
+}
+
+static int ipg_poptag(IpgBase* base) {
+ /* free tag strings used at the top level */
+ for(int i=0; i< base->stack[base->top].level.ntags; i++) {
+ GDKfree(base->stack[base->top+i+1].tag.strVal);
+ }
+
+ base->top = base->stack[base->top].level.prev_frame;
+
+ if (0) stream_printf(GDKout,"-IPG[pop: path=%s]\n",base->pathbuff);
+ return 1;
+}
+
+static int ipg_destroy(IpgBase* base) {
+ GDKfree(base->stack);
+ GDKfree(base->pathbuff);
+ GDKfree(base);
+ if (0) stream_printf(GDKout,"-IPG[ipg_destroy() called.]\n");
+ return 1;
+}
+
+#endif
+
/************************************************
*
* The Tijah-Pathfinder index creation context
@@ -240,6 +398,8 @@
/* INCOMPLETE< SHOULD BE FRAGMENTED */
BAT* b_collSize; /* Tijah PRE-size BAT */
dbat dbat_collSize; /* Direct access struct for b_collSize */
+ /* INCOMPLETE< SHOULD BE FRAGMENTED */
+ BAT* b_collPath; /* Tijah PRE-INEX-PATH BAT */
/* */
BAT* b_collPfPre; /* Tijah PRE-PF BAT for Pathinder link */
/* */
@@ -276,6 +436,9 @@
tjStemCtx* stemCtx;
char checkStopWords;
oid lastStopWord;
+#ifdef USE_INEX_PATH
+ IpgBase* ipg;
+#endif
} tjCtx;
#define DOEMIT(TJCTX) ((TJCTX)->emitting)
@@ -689,6 +852,7 @@
if ( !(res->b_docFirstPre = getBAT(tjCtx_BAT,"_doc_firstpre")) ) return
NULL;
if ( !(res->b_collPre = getBAT(tjCtx_BAT,"_tid")) ) return NULL;
if ( !(res->b_collSize = getBAT(tjCtx_BAT,"_size")) ) return NULL;
+ if ( !(res->b_collPath = getBAT(tjCtx_BAT,"_path")) ) return NULL;
if ( (res->tijahPre = getPreSize(res)) == oid_nil )
return NULL;;
/* check here for new fragmentation */
@@ -812,6 +976,10 @@
if ( ! (res->hm_globalTerm->H->hash && res->hm_globalTag->H->hash) )
stream_printf(GDKout,"# WARNING: MISSING HASH ON TAG/TERM
DICTIONARY\n");
+#ifdef USE_INEX_PATH
+ if ( !(res->ipg = ipg_init(res->b_collPath)) )
+ return NULL;
+#endif
return res;
}
@@ -880,6 +1048,10 @@
GDKfree(tjctx->tagswitch);
if ( tjctx->stemCtx->clear && !tjctx->stemCtx->clear(tjctx->stemCtx) )
return GDK_FAIL;
+#ifdef USE_INEX_PATH
+ if ( !ipg_destroy(tjctx->ipg) )
+ return GDK_FAIL;
+#endif
GDKfree(tjctx);
#ifdef TJ_TRACE
if ( TJ_TRACE ) stream_printf(GDKout,"C[{}]:FINISH:
add2collection_handler()\n");
@@ -1062,6 +1234,10 @@
if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:startElement: \"%s\",
termoid=%d, Tijah pre#=%d, Pathfinder
pre#=%d\n",tjctx->name,name,termOid,tjPre,pre);
#endif
if ( !BUNins(tjctx->b_collPfPre, &tjPre, &pre, FALSE) ) return 0;
+#ifdef USE_INEX_PATH
+ if ( !ipg_pushtag(tjctx->ipg, name, tjPre) )
+ return 0;
+#endif
}
return 1;
}
@@ -1169,6 +1345,10 @@
}
if (tjctx->dict_handler &&
!handle_dictionary(tjctx,tjctx->level_pre[tjctx->doc_height]))
return 0;
+#ifdef USE_INEX_PATH
+ if ( !ipg_poptag(tjctx->ipg) )
+ return 0;
+#endif
return 1;
}
@@ -1311,6 +1491,12 @@
#ifdef TJ_TRACE
if ( TJ_TRACE ) stream_printf(GDKout,"C[%s]:LOADED
CONTEXT\n",tjctx->name);
#endif
+
+#ifdef USE_INEX_PATH
+ if ( !ipg_startdoc(tjctx->ipg,docName) )
+ return GDK_FAIL;
+#endif
+
if ( tijahDocIndex(tjctx,docName) != oid_nil ) {
stream_printf(GDKerr,"add2collection_handler(CTX,%s) already in
collection\n",docName);
return GDK_FAIL;
@@ -1406,6 +1592,11 @@
return GDK_FAIL;
}
oid start_oid = tjctx->tijahPre;
+
+#ifdef USE_INEX_PATH
+ if ( !ipg_startdoc(tjctx->ipg,docName) )
+ return GDK_FAIL;
+#endif
if ( !BUNappend(tjctx->b_docName, docName, 0) ||
!BUNappend(tjctx->b_docFirstPre, &start_oid, 0) )
U pftijah.mx
Index: pftijah.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah.mx,v
retrieving revision 1.234
retrieving revision 1.235
diff -u -d -r1.234 -r1.235
--- pftijah.mx 17 Sep 2009 10:58:39 -0000 1.234
+++ pftijah.mx 22 Sep 2009 07:15:58 -0000 1.235
@@ -654,6 +654,7 @@
}
bat("tj_" + ftiName + "_tid1").delete();
bat("tj_" + ftiName + "_size1").delete();
+ bat("tj_" + ftiName + "_path1").delete();
bat("tj_" + ftiName + "_fragments").delete();
bat("tj_" + ftiName + "_fragments").append(1...@0);
bat("tj_" + ftiName + "_pfpre").delete();
@@ -668,6 +669,7 @@
new(void,oid).seqbase(1...@0).persists(true).access(BAT_APPEND).bbpname("tj_" +
ftiName + "_tid1");
new(void,int).seqbase(1...@0).persists(true).access(BAT_APPEND).bbpname("tj_" +
ftiName + "_size1");
+ new(oid,str).persists(true).access(BAT_APPEND).bbpname("tj_" +
ftiName + "_path1");
# bat contains the start oid of every tid/size frag. Head is postfix
# string to _tid/_size. Normally "", "2", "3"
new(void,oid).seqbase(1...@0).append(1...@0).persists(true).access(BAT_APPEND).bbpname("tj_"
+ ftiName + "_fragments");
@@ -982,6 +984,7 @@
{
tjCollBat.append("tj_" + ftiName + "_tid" + str(int($t)));
tjCollBat.append("tj_" + ftiName + "_size"+ str(int($t)));
+ tjCollBat.append("tj_" + ftiName + "_path"+ str(int($t)));
}
if (isnil(CATCH(bat("tj_" + ftiName + "_TermIndex").count_wrd()))) {
tjCollBat.append("tj_" + ftiName + "_Terms");
@@ -1025,6 +1028,7 @@
var fpfx := str(bat("tj_" + ftiName + "_fragments").count_wrd());
tjCollBat.insert("_tid", bat("tj_" + ftiName + "_tid"+fpfx));
tjCollBat.insert("_size", bat("tj_" + ftiName + "_size"+fpfx));
+ tjCollBat.insert("_path", bat("tj_" + ftiName + "_path"+fpfx));
tjCollBat.insert("_pfpre", bat("tj_" + ftiName + "_pfpre"));
tjCollBat.insert("_conceptdict", bat("tj_" + ftiName +
"_conceptdict"));
tjCollBat.insert("_concept_tid", bat("tj_" + ftiName +
"_concept_tid"));
@@ -1101,6 +1105,7 @@
bat("tj_" + ftiName + "_doc_firstpre").access(BAT_APPEND);
bat("tj_" + ftiName + "_tid1").access(BAT_APPEND);
bat("tj_" + ftiName + "_size1").access(BAT_APPEND);
+ bat("tj_" + ftiName + "_path1").access(BAT_APPEND);
bat("tj_" + ftiName + "_fragments").access(BAT_APPEND);
bat("tj_" + ftiName + "_pfpre").access(BAT_APPEND);
bat("tj_" + ftiName + "_concept_tid").access(BAT_APPEND);
@@ -1199,6 +1204,7 @@
bat("tj_" + ftiName + "_doc_firstpre").access(BAT_READ);
bat("tj_" + ftiName + "_tid1").access(BAT_READ);
bat("tj_" + ftiName + "_size1").access(BAT_READ);
+ bat("tj_" + ftiName + "_path1").access(BAT_READ);
bat("tj_" + ftiName + "_fragments").access(BAT_READ);
bat("tj_" + ftiName + "_pfpre").access(BAT_READ);
bat("tj_" + ftiName + "_concept_tid").access(BAT_READ);
@@ -1310,6 +1316,7 @@
bat(_tj_TagBat(ftiName)).print();
print(bat("tj_" + ftiName + "_tid1"));
print(bat("tj_" + ftiName + "_size1"));
+ print(bat("tj_" + ftiName + "_path1"));
print(bat("tj_" + ftiName + "_pfpre"));
print(bat("tj_" + ftiName + "_conceptdict"));
print(bat("tj_" + ftiName + "_concept_tid"));
@@ -1343,6 +1350,7 @@
sum :+= batdsksize(bat("tj_" + ftiName + "_TagIndex"));
sum :+= batdsksize(bat("tj_" + ftiName + "_TermIndex"));
sum :+= batdsksize(bat("tj_" + ftiName + "_size1"));
+ sum :+= batdsksize(bat("tj_" + ftiName + "_path1"));
print(sum);
}
@@ -1907,7 +1915,7 @@
var orcomb := "sum";
var returnall := TRUE;
- var tjPre_score :=
tj_containing_query_nest_pre_NLLR(iter_tjPre.reverse(), Q);
+ var tjPre_score :=
tj_containing_query_nest_pre_term_NLLR(iter_tjPre.reverse(), Q);
var iter_score := iter_tjPre.leftjoin(tjPre_score);
result_item_s.append(iter_score);
------------------------------------------------------------------------------
Come build with us! The BlackBerry® Developer Conference in SF, CA
is the only developer event you need to attend this year. Jumpstart your
developing skills, take BlackBerry mobile applications to market and stay
ahead of the curve. Join us from November 9-12, 2009. Register now!
http://p.sf.net/sfu/devconf
_______________________________________________
Monetdb-pf-checkins mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins