Update of /cvsroot/monetdb/pathfinder/modules/pftijah
In directory sc8-pr-cvs16.sourceforge.net:/tmp/cvs-serv17504

Modified Files:
        nexi.c pftijah.mx 
Log Message:
- added new version of treemergejoin for unnested ancestor sets
- option handling for timing output
- using (poor man's) binary search for inplace +,*,-,/ in score aggregation



Index: nexi.c
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/nexi.c,v
retrieving revision 1.65
retrieving revision 1.66
diff -u -d -r1.65 -r1.66
--- nexi.c      4 Jun 2007 21:45:08 -0000       1.65
+++ nexi.c      14 Jun 2007 14:25:00 -0000      1.66
@@ -272,6 +272,12 @@
                SET_TDEBUG(v);
                if (TDEBUG(1)) stream_printf(GDKout,"# old_main: setting debug 
value to %d.\n",v);
            }
+       } else if ( strcmp(optName,"timing") == 0 ) {
+            if ( strcasecmp(optVal,"TRUE") == 0 ) {
+                MILPRINTF(MILOUT, "timing := TRUE;\n" );
+            } else {
+                MILPRINTF(MILOUT, "timing := FALSE;\n" );
+            }
        } else if ( strcmp(optName,"milfile") == 0 ) {
            /* incomplete open file */
             parserCtx->milFILEname = optVal;

Index: pftijah.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah.mx,v
retrieving revision 1.142
retrieving revision 1.143
diff -u -d -r1.142 -r1.143
--- pftijah.mx  14 Jun 2007 08:32:36 -0000      1.142
+++ pftijah.mx  14 Jun 2007 14:25:00 -0000      1.143
@@ -1967,18 +1967,35 @@
 # returns the document likelihood dLH(t) of term t
 # in all given docs that include the term
 PROC _getTermDocLHs0(oid tid, BAT[void,oid] docs, str cName) : bat[oid,dbl] := 
{
+     var t_pos := 0;
+     var t_tmj := 0;
+     var t_hst := 0;
+     var t_dsz := 0;
+     var t_div := 0;
      var pre_size := bat("tj_" + cName + "_size1");
      # get term positions in the entire collection
+     t_pos := time();
      var termPREs := _getTermPositions(tid, cName);
+     t_pos := time() - t_pos;
      # get doc - term relation
+     t_tmj := time();
      var doc_termPRE := treemergejoin_sort(docs, pre_size, termPREs);
+     t_tmj := time() - t_tmj;
+     if (timing) printf("# tmj sizes: docs: %d, terms: %d, pre_size: %d, res: 
%d\n", docs.count(), termPREs.count(), pre_size.count(), doc_termPRE.count());
      termPREs := nil;
+     t_hst := time();
      var res := doc_termPRE.reverse().histogram().sort();
+     t_hst := time() - t_hst;
      doc_termPRE := nil;
+     t_dsz := time();
      var doc_size := pre_size.semijoin(res).sort();
+     t_dsz := time() - t_dsz;
+     t_div := time();
      res := [dbl](res).access(BAT_WRITE);
      res.left_div(doc_size);
      doc_size := nil; 
+     t_div := time() - t_div;
+     if (timing) printf("# dLHs timing: pos: %d, tmj:: %d, hst: %d, dsz: %d, 
div: %d\n", t_pos, t_tmj, t_hst, t_dsz, t_div);
      return res.access(BAT_READ);
 }
 
@@ -2053,11 +2070,11 @@
 }
 
 # parameters: score_method, aggregation_method, base_score
-@:p_containing_q(NLLR,add,0,+,0)@
-@:p_containing_q(LMs,mul,1,*,cLambda * term_cLH)@
-@:p_containing_q(LM,mul,1,*,0)@
+@:p_containing_q_all(NLLR,add,0,+,0)@
+@:p_containing_q_all(LMs,mul,1,*,cLambda * term_cLH)@
+@:p_containing_q_all(LM,mul,1,*,0)@
 
[EMAIL PROTECTED] p_containing_q
[EMAIL PROTECTED] p_containing_q_all
 PROC [EMAIL PROTECTED](bat[oid,dbl] left, bat[void,str] Qterms, str ind, 
BAT[oid,str] qenv) : bat[oid,dbl] :=
 {
     if ( trace ) tj_trace( "BEGIN p_containing_q" );
@@ -2126,16 +2143,16 @@
 }
 
 @mil
-#PROC collTermCount(str col, bat[oid,int] terms) : bat[oid,int] :=
-#{
-#     var tids := terms.mirror();
-#     var offsets1 := tids.leftfetchjoin(bat("tj_" + col + "_TermIndex"));
-#     tids := tids.[int]().[+](1).[oid]();
-#     var offsets2 := tids.leftfetchjoin(bat("tj_" + col + "_TermIndex"));
-#     var res := [-](offsets2.[int](),offsets1.[int]()).select(1,int(nil));
-#     return res;
-#}
-#
+PROC collTermCount(str col, bat[oid,int] terms) : bat[oid,int] :=
+{
+     var tids := terms.mirror();
+     var offsets1 := tids.leftfetchjoin(bat("tj_" + col + "_TermIndex"));
+     tids := tids.[int]().[+](1).[oid]();
+     var offsets2 := tids.leftfetchjoin(bat("tj_" + col + "_TermIndex"));
+     var res := [-](offsets2.[int](),offsets1.[int]()).select(1,int(nil));
+     return res;
+}
+
 #PROC score_NLLR_mil(bat[oid,oid] elem_tid, bat[void,int] pre_size, 
bat[oid,int] tid_cnt, bat[oid,int] tid_frq, bat[oid,oid] elems, dbl _lmbd, int 
q_cnt) : bat[oid,dbl] :=
 #{
 #     # compute collection terms frequencies 
@@ -2238,99 +2255,51 @@
      return res;
 }
 
-#PROC p_containing_q_NLLR2(bat[oid,dbl] left, bat[void,str] Qterms, flt lmbd, 
str ind,BAT[oid,str] qenv) : bat[oid,dbl] :=
-#{
-#    if ( trace ) tj_trace( "BEGIN p_containing_q_NLLR" );
-#    var t1 := time();
-#    var pre_size := bat("tj_" + qenv.find(QENV_FTINAME) + "_size" + ind);
-#    
-#     # get term ids and drop all terms with zero frq in col or background-col
-#     var terms := _terms2void_tid( Qterms, qenv ).histogram();
-#     var q_cnt := Qterms.count();
-#     var tid_frq := collTermCount(qenv.find(QENV_FTIBGNAME), terms);
-#     terms := terms.semijoin(tid_frq);
-#     if (terms.count() = 0) {return new(oid,dbl);}
-#    
-#     # compute constant factor in score computation
-#     var _lmbd := dbl((1.0 - lmbd) / lmbd);
-#     var collFrq := bat("tj_" + qenv.find(QENV_FTIBGNAME) + "_Terms").count();
-#     _lmbd :*= collFrq;
-#     
-#     # fetch term occurrences and sort them in preorder
-#     var pre_tid := indexfetchjoin(terms.hmark([EMAIL PROTECTED]), 
-#                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_TermIndex"), 
-#                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_Terms") ).reverse();
-#     if (pre_tid.count() = 0) {return new(oid,dbl);}
-#     var t1a := time();
-#     pre_tid := pre_tid.sort();
-#     
-#     # evaluate doc/term (anc/desc) relationship
-#     var t2 := time();
-#     var elem_tid := _containing_desc_tmj(left.sort().mark([EMAIL 
PROTECTED]), pre_tid, pre_size);
-#     var t3 := time();
-#     pre_tid := nil;
-#     if (elem_tid.count() = 0) {return new(oid,dbl);}
-#     
-#     var res := score_NLLR(elem_tid, pre_size, terms, tid_frq, 
elem_tid.kunique(), _lmbd, q_cnt);
-#     # Obey SCOREBASE setting: 
-#     if ( int(qenv.find(QENV_SCOREBASE)) = 0 ) {
-#        res := [+](left, res);
-#     } else {
-#        res := [*](left, res);
-#     }
-#     if ( trace ) tj_trace( "END   p_containing_q_NLLR" );
-#     var t4 := time();
-#
-#     if (timing) printf("# p_containing_q_NLLR(): total time: %d, term 
selection: %d, containmentjoin: %d, score computation: %d\n", t4 - t1, t1a - 
t1, t3 - t2, t4 - t3);
-#     return res;
-#}
-#
-#PROC p_containing_q_NLLR_mil(bat[oid,dbl] left, bat[void,str] Qterms, flt 
lmbd, str ind,BAT[oid,str] qenv) : bat[oid,dbl] :=
-#{
-#    if ( trace ) tj_trace( "BEGIN p_containing_q_NLLR" );
-#    var t1 := time();
-#    var pre_size := bat("tj_" + qenv.find(QENV_FTINAME) + "_size" + ind);
-#    
-#     # get term ids and drop all terms with zero frq in col or background-col
-#     var terms := _terms2void_tid( Qterms, qenv ).histogram();
-#     var q_cnt := Qterms.count();
-#     var tid_frq := collTermCount(qenv.find(QENV_FTIBGNAME), terms);
-#     terms := terms.semijoin(tid_frq);
-#     if (terms.count() = 0) {return new(oid,dbl);}
-#    
-#     # compute constant factor in score computation
-#     var _lmbd := dbl((1.0 - lmbd) / lmbd);
-#     var collFrq := bat("tj_" + qenv.find(QENV_FTIBGNAME) + "_Terms").count();
-#     _lmbd :*= collFrq;
-#     
-#     # fetch term occurrences and sort them in preorder
-#     var pre_tid := indexfetchjoin(terms.hmark([EMAIL PROTECTED]), 
-#                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_TermIndex"), 
-#                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_Terms") ).reverse();
-#     if (pre_tid.count() = 0) {return new(oid,dbl);}
-#     var t1a := time();
-#     pre_tid := pre_tid.sort();
-#     
-#     # evaluate doc/term (anc/desc) relationship
-#     var t2 := time();
-#     var elem_tid := _containing_desc_tmj(left.sort().mark([EMAIL 
PROTECTED]), pre_tid, pre_size);
-#     var t3 := time();
-#     pre_tid := nil;
-#     if (elem_tid.count() = 0) {return new(oid,dbl);}
-#     
-#     var res := score_NLLR_mil(elem_tid, pre_size, terms, tid_frq, 
elem_tid.kunique(), _lmbd, q_cnt);
-#     
-#     # Obey SCOREBASE setting: 
-#     if ( int(qenv.find(QENV_SCOREBASE)) = 0 ) {
-#        res := [+](left, res);
-#     } else {
-#        res := [*](left, res);
-#     }
-#     if ( trace ) tj_trace( "END   p_containing_q_NLLR" );
-#     var t4 := time();
-#     if (timing) printf("total time: %d, term selection: %d, containmentjoin: 
%d, score computation: %d\n", t4 - t1, t1a - t1, t3 - t2, t4 - t3);
-#     return res;
-#}
+PROC p_containing_q_NLLR2(bat[oid,dbl] left, bat[void,str] Qterms, str ind, 
BAT[oid,str] qenv) : bat[oid,dbl] :=
+{
+    if ( trace ) tj_trace( "BEGIN p_containing_q_NLLR" );
+    var cName := qenv.find(QENV_FTINAME);
+    var bg_cName := qenv.find(QENV_FTIBGNAME);
+    var lmbd := dbl(qenv.find(QENV_C_LAMBDA));
+    
+    var t1 := time();
+    var pre_size := bat("tj_" + cName + "_size" + ind);
+    
+     # get term ids and drop all terms with zero frq in col or background-col
+     var terms := _terms2void_tid( Qterms, bg_cName ).histogram();
+     var q_cnt := Qterms.count();
+     var tid_frq := collTermCount(bg_cName, terms);
+     terms := terms.semijoin(tid_frq);
+     if (terms.count() = 0) {return new(oid,dbl);}
+    
+     # compute constant factor in score computation
+     var _lmbd := dbl((1.0 - lmbd) / lmbd);
+     var collFrq := bat("tj_" + bg_cName + "_Terms").count();
+     _lmbd :*= collFrq;
+     
+     # fetch term occurrences and sort them in preorder
+     var pre_tid := indexfetchjoin(terms.hmark([EMAIL PROTECTED]), 
+                                   bat("tj_" + cName + "_TermIndex"), 
+                                   bat("tj_" + cName + "_Terms") ).reverse();
+     if (pre_tid.count() = 0) {return new(oid,dbl);}
+     var t1a := time();
+     pre_tid := pre_tid.sort();
+     
+     # evaluate doc/term (anc/desc) relationship
+     var t2 := time();
+     var elem_tid := _containing_desc_tmj(left.sort().mark([EMAIL PROTECTED]), 
pre_tid, pre_size);
+     var t3 := time();
+     pre_tid := nil;
+     if (elem_tid.count() = 0) {return new(oid,dbl);}
+     
+     var res := score_NLLR(elem_tid, pre_size, terms, tid_frq, 
elem_tid.kunique(), _lmbd, q_cnt);
+     res := [+](left, res);
+     if ( trace ) tj_trace( "END   p_containing_q_NLLR" );
+     var t4 := time();
+
+     if (timing) printf("# p_containing_q_NLLR(): total time: %d, term 
selection: %d, containmentjoin: %d, score computation: %d\n", t4 - t1, t1a - 
t1, t3 - t2, t4 - t3);
+     return res;
+}
 
 ##
 # Implementation of the Language Modeling retrieval model, with smoothing.
@@ -3535,32 +3504,24 @@
  *
  */
 
-
 #define FIND_OID(FOID,BBAT,BSIZE,BPTR,BTAIL) \
-        (void)try_bun; \
-        (void)try_maxbun; \
-        (void)try_oid; \
+        /* use peter's poor mans binary search here */ \
+        while (BPTR+1048576 < BTAIL && (*(oid*)BUNhead(BBAT,BPTR+1048576)) < 
FOID) \
+            BPTR += 1048576; \
+        while (BPTR+32768 < BTAIL && (*(oid*)BUNhead(BBAT,BPTR+32768)) < FOID) 
\
+            BPTR += 32768; \
+        while (BPTR+1024 < BTAIL && (*(oid*)BUNhead(BBAT,BPTR+1024)) < FOID) \
+            BPTR += 1024; \
+        while (BPTR+32 < BTAIL && (*(oid*)BUNhead(BBAT,BPTR+32)) < FOID) \
+            BPTR += 32; \
+        do { \
+                BPTR += BSIZE; \
+        } while ( (BPTR < BTAIL) && ((*(oid*)BUNhead(BBAT,BPTR))<FOID) );
+ 
+#define FIND_OID_SCAN(FOID,BBAT,BSIZE,BPTR,BTAIL) \
         do { \
                 BPTR += BSIZE; \
         } while ( (BPTR < BTAIL) && ((*(oid*)BUNhead(BBAT,BPTR))<FOID) );
-
-#define FIND_OID_BSEARCH(FOID,BBAT,BSIZE,BPTR,BTAIL) \
-        BPTR += BSIZE; \
-        try_maxbun = BTAIL; \
-        try_bun = BPTR + (((int)(try_maxbun-BPTR)/BSIZE)/2)*BSIZE; \
-        while ( (BPTR < try_maxbun) ) { \
-            try_oid = *(oid*)BUNhead(BBAT,try_bun); \
-            if ( try_oid == FOID ) { \
-                BPTR = try_bun; \
-                break; \
-            } else if ( try_oid < FOID ) { \
-                BPTR = try_bun; \
-                try_bun = BPTR + (((int)(try_maxbun-BPTR)/BSIZE)/2)*BSIZE; \
-            } else { \
-                try_maxbun = try_bun; \
-                try_bun = BPTR + (((int)(BPTR-try_bun)/BSIZE)/2)*BSIZE; \
-            } \
-        }
 
 #define INPLACE_OID_CALC_HEADER \
         if ( !bat_oid_sort_chck(l) || !bat_oid_sort_chck(r) ) \
@@ -3569,9 +3530,6 @@
         BUN lp = BUNfirst(l); BUN ll = BUNlast(l); int lbsz = BUNsize(l); \
         BUN rp = BUNfirst(r); BUN rl = BUNlast(r); int rbsz = BUNsize(r); \
         while ( (lp < ll) && (rp < rl) ) { \
-                BUN try_bun;    /* used by FIND_OID() define */ \
-                BUN try_maxbun; /* idem */ \
-                oid try_oid;    /* idem */ \
                 oid lv = *(oid*)BUNhead(l,lp); \
                 oid rv = *(oid*)BUNhead(r,rp); \
                 if ( lv == rv ) { \
@@ -3615,7 +3573,6 @@
         INPLACE_OID_CALC_FOOTER;
 }
 
-
 int CMDleft_mul_dbl(BAT** res, BAT*l, BAT*r) {
         INPLACE_OID_CALC_HEADER;
        *dres *= *(dbl*)BUNtail(r,rp);
@@ -4301,6 +4258,7 @@
        return GDK_SUCCEED;
 }
 @
+
 @= push
     si new_stack_item;
     new_stack_item.ctx = *A_cur_start;
@@ -4437,6 +4395,101 @@
      return(GDK_SUCCEED);
 }
 @
[EMAIL PROTECTED] write
+      /* output everything that is on the stack (the ancestors) */
+      /* It has started before Dstart and since it's a tree, */
+      /* it has to end after Dend. */
+      *(oid*)dst              = A_out;
+      *(oid*)(dst+SIZEOF_OID) = *D_cur_start;
+      dst += SRs;
+@
[EMAIL PROTECTED]
+int CMDtreemergejoin_sort_flat(BAT **result, BAT *Astart, BAT *pre_size, BAT 
*Dstart) {
+
+    /* ---------------------------- declarations 
------------------------------------ */
+    char *name = "TJ_treemergejoin_sort";
+    BAT *res = *result;
+    oid *D_cur_start, *D_last_start, 
+        *A_cur_start, *A_last_start;
+    oid A_out = 0;
+    int SAs;
+    int SDs;
+    int SRs;
+    int *size;
+    int free;  
+    BUN dst = NULL;
+    
+    /* ------------------------------- checks 
---------------------------------------- */
+    if (!(BATtordered(Astart)&1)) { 
+        GDKerror("%s: Ancestor pre BAT not sorted (on start).\n",name); 
+        return(GDK_FAIL);
+    }
+    if (!(BATtordered(Dstart)&1)) { 
+        GDKerror("%s: Descendant BAT not sorted (on start).\n",name); 
+        return(GDK_FAIL);
+    }
+
+    /* ----------------------------- initialize 
-------------------------------------- */
+    // the size of the bat may not be correct (could be more, could be less)
+    free = BATcount(Dstart);
+    res = BATnew(TYPE_oid, TYPE_oid, free); 
+    if (res == NULL) 
+    { 
+       GDKerror("%s: could not allocate result BAT.\n", name);
+       return(GDK_FAIL);
+    }
+    
+    /* todo: what if size is not a multiple of sizeof(oid)  (BUNsize(...) % 
sizeof(oid) != 0) */
+    SAs = BUNsize(Astart) / SIZEOF_OID;
+    SDs = BUNsize(Dstart) / SIZEOF_OID;
+    SRs = BUNsize(res);
+    size = ((int*) BUNtloc(pre_size, BUNfirst(pre_size))) - 
(int)pre_size->hseqbase;
+    D_cur_start = (oid *) BUNtail(Dstart, BUNfirst(Dstart));
+    D_last_start = (oid *) BUNtail(Dstart, BUNlast(Dstart));  
+    A_cur_start = (oid *) BUNtail(Astart, BUNfirst(Astart));
+    A_last_start = (oid *) BUNtail(Astart, BUNlast(Astart));
+    dst = BUNlast(res);
+
+    /* -------------------------------- main 
---------------------------------------- */
+    while(D_cur_start < D_last_start) {
+      A_out = 0;
+      
+      /* poor man's binary search / exploiting forward scan */
+      while (A_cur_start+1048576 < A_last_start && 
*(oid*)(A_cur_start+1048576) < *D_cur_start)
+          A_cur_start += 1048576; 
+      while (A_cur_start+32768 < A_last_start && *(oid*)(A_cur_start+32768) < 
*D_cur_start)
+          A_cur_start += 32768; 
+      while (A_cur_start+1024 < A_last_start && *(oid*)(A_cur_start+1024) < 
*D_cur_start)
+          A_cur_start += 1024; 
+      while (A_cur_start+32 < A_last_start && *(oid*)(A_cur_start+32) < 
*D_cur_start)
+          A_cur_start += 32; 
+      while (*A_cur_start < *D_cur_start && A_cur_start < A_last_start){       
        
+       if (*D_cur_start <= (*A_cur_start + size[*A_cur_start])) {
+         A_out = *A_cur_start;
+       }  
+       A_cur_start += SAs;
+      }
+      
+      if (A_out) {
+         @:write@
+      }
+    
+      D_cur_start += SDs;
+    }
+    
+    /* ----------------------------- tidy up 
-------------------------------------- */
+    res->batBuns->free = dst - res->batBuns->base;
+    BATsetcount(res, (res->batBuns->free+Bunbase(res)-BUNfirst(res))/SRs);
+    res->batDirty = TRUE;
+    res->tsorted = GDK_SORTED;
+    res->hsorted = FALSE;
+    BATset(res, TRUE);
+    
+    *result = res;    
+    /* it is possible there are still ancestor candidates left on the stack, 
but we are out of descendants, so they starve... */
+    return(GDK_SUCCEED);
+}
+@
 @c
 bat *
 pftijah_prelude(void)


-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
_______________________________________________
Monetdb-pf-checkins mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins

Reply via email to