Update of /cvsroot/monetdb/pathfinder/modules/pftijah
In directory sc8-pr-cvs16.sourceforge.net:/tmp/cvs-serv27025
Modified Files:
pftijah.mx
Log Message:
- using query environment variable "RECURSIVE_TAGS" to decide which
physical join operator to use:
treemergejoin_sort, or treemergejoin_sort_unnested
Index: pftijah.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah.mx,v
retrieving revision 1.144
retrieving revision 1.145
diff -u -d -r1.144 -r1.145
--- pftijah.mx 15 Jun 2007 07:00:42 -0000 1.144
+++ pftijah.mx 15 Jun 2007 09:01:41 -0000 1.145
@@ -149,6 +149,9 @@
.COMMAND treemergejoin_sort(BAT[void,oid],BAT[void,int],BAT[void,oid]) :
BAT[oid,oid] = CMDtreemergejoin_sort;
"Stack tree merge join descendant"
+.COMMAND
treemergejoin_sort_unnested(BAT[void,oid],BAT[void,int],BAT[void,oid]) :
BAT[oid,oid] = CMDtreemergejoin_sort_unnested;
+"Stack tree merge join descendant"
+
.PRELUDE = pftijah_prelude;
.EPILOGUE = pftijah_epilogue;
@@ -1401,81 +1404,6 @@
return anc_desc;
}
-
-##
-# Compute parent-child relation.
-#
-# Forwards to parent_child_llscj: see below.
-##
-#PROC parent_child( bat[oid,any] parent, bat[oid,any] child, BAT[oid,str]
qenv) : bat[oid,oid] :=
-#{
-# return parent_child_llscj( parent, child, qenv );
-#}
-#
-#
-##
-# Compute parent-child relation using the loop-lifted staircase join.
-#
-# The arguments parent and child must contain preorder indices in their heads.
-# The tail values are discarded.
-#
-# Returns a bat containing [parent,child] preorder index pairs
-##
-#PROC parent_child_llscj( bat[oid,any] parent, bat[oid,any] child,
BAT[oid,str] qenv) : bat[oid,oid] :=
-#{
-# # Items contains the context nodes for the descendant step:
-# # this is the right side argument to contained_by.
-# # The table must be [void,oid], so:
-# var items := parent.mark(0@0).reverse();
-#
-# # Suggestion from Thijs: make iters a [void,void], with the same length
as anc
-# var iters := parent.hmark(oid(0)).mark(oid(0));
-#
-# # Candidates: all element nodes
-# var candidates := child.sort().mark(0@0).reverse();
-#
-# # Load the pre-size table
-# var pre_size := load( "tj_" + qenv.find(QENV_FTINAME) + "_size1");
-#
-# # Check the order of the items:
-# items.chk_order();
-#
-# var void_chld := ll_child(iters, items, pre_size, candidates, collHeight,
false, false, min(iters), max(iters), false);
-#
-# # Map back the ancestors
-# var par_desc := parent.mark(oid(0)).join(void_chld);
-#
-# candidates := nil;
-# items := nil;
-# pre_size := nil;
-#
-# return par_chld;
-#}
-#
-#
-##
-# Converts a list of query terms to a list of term id->document position
mappings.
-#
-# Stemming on the query terms is performed using the same stemmer
-# that was used for the collection.
-#
-# Results are sorted on the document position (tail)
-##
-PROC Qterms_to_tid_pre(bat[void,str] Qterms, BAT[oid,str] qenv): bat[oid,oid]
:=
-{
- var ftiName := qenv.find(QENV_FTINAME);
- var stemmer := bat("tj_"+ ftiName +"_param").find("stemmer");
- var stemmed := [tj_normalizeTerm]( [toLower](Qterms), stemmer );
-
- var tids := bat(_tj_TermBat(ftiName)).join( stemmed.reverse()
).sort().hmark(oid(0));
-
- var result := indexfetchjoin( tids,
- bat("tj_" + ftiName + "_TermIndex"),
- bat("tj_" + ftiName + "_Terms") );
- return result.tsort();
-}
-
-
##
# Converts a list of query terms to a list of term ids
#
@@ -1500,7 +1428,7 @@
{
# The pfpre table only stores element nodes, so we can use it as a filter:
var pfpre := bat( "tj_" + qenv.find(QENV_FTINAME) + "_pfpre");
- var result := pfpre.mirror().join( ctx );
+ var result := ctx.semijoin(pfpre);
return result;
}
@@ -1818,6 +1746,7 @@
{
if ( trace ) tj_trace( "BEGIN contained_by_2" );
var pre_size := bat("tj_"+qenv.find(QENV_FTINAME)+"_size"+ind);
+
# Check for the virtual root
if ( not( pre_size.exist(0@0) ) ) {
# Exceptions for root in left or right argument. The case
@@ -1831,10 +1760,18 @@
return left;
}
- var anc_desc := anc_desc( right, left, pre_size);
+ #var anc_desc := anc_desc( right, left, pre_size);
+ var anc := right.sort().hmark(0@0);
+ var desc := left.sort().hmark(0@0);
+ var anc_desc;
+ if (qenv.find(QENV_RECURSIVE_TAGS) = "0") {
+ anc_desc := treemergejoin_sort_unnested(anc, pre_size, desc);
+ } else {
+ anc_desc := treemergejoin_sort(anc, pre_size, desc);
+ }
# Attach the scores to the resulting nodes again:
- var result := anc_desc.reverse().kunique().mirror().join(left);
+ var result := left.semijoin(anc_desc.reverse());
if ( trace ) tj_trace( "END contained_by_2" );
return result;
@@ -1901,10 +1838,18 @@
}
# TODO: fragmentation
var pre_size := load( "tj_" + qenv.find(QENV_FTINAME) + "_size1");
- var anc_desc := anc_desc( left, right, pre_size );
+ #var anc_desc := anc_desc( left, right, pre_size );
+ var anc := left.sort().hmark(0@0);
+ var desc := right.sort().hmark(0@0);
+ var anc_desc;
+ if (qenv.find(QENV_RECURSIVE_TAGS) = "0") {
+ anc_desc := treemergejoin_sort_unnested(anc, pre_size, desc);
+ } else {
+ anc_desc := treemergejoin_sort(anc, pre_size, desc);
+ }
# Attach the scores to the resulting nodes again:
- var result := anc_desc.mirror().kunique().join(left);
+ var result := left.semijoin(anc_desc);
if ( trace ) tj_trace( "END containing_2" );
return result;
@@ -1915,21 +1860,6 @@
# PROBABILISTIC CONTAINMENT
################################################################################
-PROC _containing_desc_scj(bat[oid,void] left, bat[oid,oid] pre_tid,
bat[void,int] pre_size) : bat[oid,oid] :=
-{
- if ( trace ) tj_trace( "BEGIN _containing_desc_scj" );
- var elems := left.hmark(0@0);
- var cands := pre_tid.hmark(0@0);
- var iter := elems.mirror();
- var elem_termpre := ll_descendant( iter, elems, pre_size, cands,
- false, false, min(iter), max(iter),
false, chr(nil));
- elem_termpre := elem_termpre.reverse().leftfetchjoin(elems).reverse();
- var elem_tid := elem_termpre.join(pre_tid);
- elem_termpre := nil;
- if ( trace ) tj_trace( "END _containing_desc" );
- return elem_tid;
-}
-
PROC _containing_desc_tmj(bat[oid,void] left, bat[oid,oid] pre_tid,
bat[void,int] pre_size) : bat[oid,oid] :=
{
if ( trace ) tj_trace( "BEGIN _containing_desc_tmj" );
@@ -1967,54 +1897,56 @@
# returns the document likelihood dLH(t) of term t
# in all given docs that include the term
-PROC _getTermDocLHs0(oid tid, BAT[void,oid] docs, str cName) : bat[oid,dbl] :=
{
+PROC _getTermDocLHs0(oid tid, BAT[void,oid] docs, str cName, bit nested) :
bat[oid,dbl] := {
var t_pos := 0;
var t_tmj := 0;
var t_hst := 0;
- var t_dsz := 0;
var t_div := 0;
var pre_size := bat("tj_" + cName + "_size1");
# get term positions in the entire collection
- t_pos := time();
+ t_pos :-= time();
var termPREs := _getTermPositions(tid, cName);
- t_pos := time() - t_pos;
+ t_pos :+= time();
# get doc - term relation
- t_tmj := time();
- var doc_termPRE := treemergejoin_sort(docs, pre_size, termPREs);
- t_tmj := time() - t_tmj;
- if (timing) printf("# tmj sizes: docs: %d, terms: %d, pre_size: %d, res:
%d\n", docs.count(), termPREs.count(), pre_size.count(), doc_termPRE.count());
+ t_tmj :-= time();
+ var doc_termPRE;
+ if (nested) {
+ doc_termPRE := treemergejoin_sort(docs, pre_size, termPREs);
+ } else {
+ doc_termPRE := treemergejoin_sort_unnested(docs, pre_size, termPREs);
+ }
+ t_tmj :+= time();
termPREs := nil;
- t_hst := time();
+ t_hst :-= time();
var res := doc_termPRE.reverse().histogram().sort();
- t_hst := time() - t_hst;
+ t_hst :+= time();
doc_termPRE := nil;
- t_dsz := time();
- var doc_size := pre_size.semijoin(res).sort();
- t_dsz := time() - t_dsz;
- t_div := time();
+ t_div :-= time();
res := [dbl](res).access(BAT_WRITE);
- res.left_div(doc_size);
- doc_size := nil;
- t_div := time() - t_div;
- if (timing) printf("# dLHs timing: pos: %d, tmj:: %d, hst: %d, dsz: %d,
div: %d\n", t_pos, t_tmj, t_hst, t_dsz, t_div);
+ res.left_div(pre_size);
+ t_div :+= time();
+ if (timing) printf("# dLHs timing: pos: %d, tmj:: %d, hst: %d, div:
%d\n", t_pos, t_tmj, t_hst, t_div);
return res.access(BAT_READ);
}
# returns the document likelihood dLH(t) of term t
# in all given docs (not only those including t)
-PROC _getTermDocLHs1(oid tid, BAT[void,oid] docs, str cName) : bat[oid,dbl] :=
{
+PROC _getTermDocLHs1(oid tid, BAT[void,oid] docs, str cName, bit nested) :
bat[oid,dbl] := {
var pre_size := bat("tj_" + cName + "_size1");
# get term positions in the entire collection
var termPREs := _getTermPositions(tid, cName);
# get doc - term relation
- var doc_termPRE := treemergejoin_sort(docs, pre_size, termPREs);
+ var doc_termPRE;
+ if (nested) {
+ doc_termPRE := treemergejoin_sort(docs, pre_size, termPREs);
+ } else {
+ doc_termPRE := treemergejoin_sort_unnested(docs, pre_size, termPREs);
+ }
termPREs := nil;
var res := doc_termPRE.reverse().histogram().sort();
doc_termPRE := nil;
- var doc_size := pre_size.semijoin(res).sort();
res := [dbl](res).access(BAT_WRITE);
- res.left_div(doc_size);
- doc_size := nil;
+ res.left_div(pre_size);
res.access(BAT_READ);
var extended := docs.reverse().project(dbl(0));
extended.access(BAT_WRITE);
@@ -2046,9 +1978,10 @@
PROC _score_LMs(int term_qCnt, int qSize, dbl term_cLH, BAT[oid,dbl]
term_dLHs, dbl cLambda) : bat[oid,dbl] := {
var tmp1 := dbl(term_qCnt) * cLambda * term_cLH;
var tmp2 := dbl(term_qCnt) * (dbl(1) - cLambda);
- var term_dScores := [*](term_dLHs, tmp2);
- term_dScores := [+](term_dScores, tmp1);
- return term_dScores;
+ var term_dScores := term_dLHs.tmark(0@0).access(BAT_WRITE);
+ term_dScores [:*=] tmp2;
+ term_dScores [:+=] tmp1;
+ return term_dLHs.mark(0@0).leftfetchjoin(term_dScores);
}
# ___ qCnt(t) / (1 - lambda) * dLH(t) \
@@ -2061,13 +1994,14 @@
# where cLH(t) = likelihood of term t in (background) collection c
#
PROC _score_NLLR(int term_qCnt, int qSize, dbl term_cLH, BAT[oid,dbl]
term_dLHs, dbl cLambda) : bat[oid,dbl] := {
- var tmp := (dbl(1) - cLambda) / (cLambda * term_cLH);
- var term_dScores := [*](term_dLHs, tmp);
- term_dScores := [+](term_dScores, dbl(1));
- term_dScores := [log](term_dScores);
+ var collFac := (dbl(1) - cLambda) / (cLambda * term_cLH);
+ var term_dScores := term_dLHs.tmark(0@0).access(BAT_WRITE);
+ term_dScores [:*=] collFac;
+ term_dScores [:+=] dbl(1);
+ term_dScores.left_log();
var term_qLH := dbl(term_qCnt) / dbl(qSize);
- term_dScores := [*](term_dScores, term_qLH);
- return term_dScores;
+ term_dScores [:*=] term_qLH;
+ return term_dLHs.mark(0@0).leftfetchjoin(term_dScores);
}
# parameters: score_method, aggregation_method, base_score
@@ -2097,6 +2031,8 @@
var t_qCnt := terms.histogram();
# init further variables
+ var nested := false;
+ if (qenv.find(QENV_RECURSIVE_TAGS) = "1") {nested := true;}
var scoreBase := dbl(@3);
var cSize := bat("tj_" + bg_cName + "_Terms").count();
var cLambda := dbl(qenv.find(QENV_C_LAMBDA));
@@ -2113,7 +2049,7 @@
# get document likelihoods of term
t_dLHs :-= time();
- var term_dLHs := [EMAIL PROTECTED]($h, res.hmark([EMAIL PROTECTED]),
cName);
+ var term_dLHs := [EMAIL PROTECTED]($h, res.hmark([EMAIL PROTECTED]),
cName, nested);
t_dLHs :+= time();
# BEGIN RETRIEVAL MODEL DEPENDENT CODE
@@ -2154,44 +2090,6 @@
return res;
}
-#PROC score_NLLR_mil(bat[oid,oid] elem_tid, bat[void,int] pre_size,
bat[oid,int] tid_cnt, bat[oid,int] tid_frq, bat[oid,oid] elems, dbl _lmbd, int
q_cnt) : bat[oid,dbl] :=
-#{
-# # compute collection terms frequencies
-# var _tid_frq := [/](_lmbd, tid_frq);
-# tid_frq := nil;
-#
-# # compute document sizes
-# var elem_size := pre_size.semijoin(elems);
-#
-# # compute scores in batloop over terms
-# var doc_prob := new(oid,dbl);
-# [EMAIL PROTECTED]()
-# {
-# var tmp := elem_tid.select($h);
-# var fac := dbl(tid_cnt.find($h)) / dbl(q_cnt);
-# tmp := tmp.reverse().histogram();
-# tmp := [dbl](tmp);
-# tmp := [/](tmp, elem_size);
-# tmp := [*](tmp, $t);
-# tmp := [+](tmp, 1);
-# tmp := [log](tmp);
-# tmp := [*](tmp, fac);
-# doc_prob.insert(tmp);
-# }
-#
-# var elements := left.mark(0@0);
-#
-# if ( not( returnAllElements ) )
-# elements := elements.semijoin(elem_tid);
-#
-# # aggregate doc scores
-# var res := {sum}(doc_prob.tmark(0@0),
doc_prob.hmark(0@0), elements);
-#
-# #res := res.[/](_tid_frq.count());
-#
-# return res;
-#}
-
PROC p_containing_q_NLLR_frag(bat[oid,bat] left, bat[void,str] Qterms, flt
lmbd,BAT[oid,str] qenv) : bat[oid,bat] :=
{
if ( trace ) tj_trace( "BEGIN p_containing_q_NLLR_frag" );
@@ -2302,405 +2200,6 @@
return res;
}
-##
-# Implementation of the Language Modeling retrieval model, with smoothing.
-#
-# This function is meant for term-at-a-time score computation (ASPECT
algebra), e.g.:
-#
-# var R1 := select_node("article");
-# var R2 := select_term("information");
-# var R3 := p_containing_t_LMs(R1, R2, 0.5, SIZE_TERM,qenv);
-#
-# R3 now contains all regions in R1, however with scores attached according
-# to the occurrence of the term "information" (R2)
-#
-#
-# This function implements the following formula:
-# tc(tm_i, doc) tc(tm_i,col)
-# S(doc|tm_i) = lambda * ------------- + (1 - lambda) * ------------
-# len(doc) len(col)
-#
-# tm_i : query term
-# doc : document or, in this case, region
-# tc(tm_i, doc): term count of query term tm_i in doc
-# len(doc) : size of doc (term or element size)
-##
-PROC p_containing_t_LMs(bat[oid,dbl] left, bat[oid,dbl] right, flt lmbd, int
size_type,BAT[oid,str] qenv) : bat[oid,dbl] :=
-{
- if ( trace ) tj_trace( "BEGIN p_containing_t_LMs_ASPECT" );
-
- var lambda;
- var m_lambda;
- var forgnd_prob := new(oid,dbl,ENTITY_NUM);
- var bckgnd_prob;
-
- var res_reg := new(oid,dbl,ENTITY_NUM);
-
- # Compute the foreground probability: how many regions from right does
left contain?
- # This count is divided by the region size by reg_freq
- #
- # This constitutes P(t|D), or tc(tm,doc)/len(doc)
- forgnd_prob := reg_freq(left,right,size_type,qenv);
-
- # Compute the background probability: how often does the query term occur
in the collection
- # In this case, we know that count(right) = tc(tm,col)
- #
- # This constitutes P(t|C), or tc(tm,col)/len(col)
- var col_len := bat("tj_" + qenv.find(QENV_FTIBGNAME) + "_Terms").count();
- bckgnd_prob := dbl(count(right))/dbl(col_len);
-
- #if (size_type = SIZE_ENTITY) {
- # # Compute right size / collection entity size
- #
- # var pre_pfpre := bat("tj_" + qenv.find(QENV_FTINAME) + "_pfpre");
- # bckgnd_prob := dbl(count(right))/dbl(count(pre_pfpre));
- #
- # pre_pfpre := nil;
- #} else if (size_type = SIZE_TERM) {
- # # Compute right size / collection term size
- #
- # var pre_tid := bat("tj_" + qenv.find(QENV_FTINAME) + "_tid");
- # var pre_pfpre := bat("tj_" + qenv.find(QENV_FTINAME) + "_pfpre");
- # var terms := pre_tid.mirror().kdiff( pre_pfpre.mirror() );
- #
- # bckgnd_prob := dbl(count(right))/dbl(count(terms));
- #
- # var pre_tid := nil;
- # var pre_pfpre := nil;
- # var terms := nil;
- #}
-
- if (bckgnd_prob = dbl(0)) {
- if (DEBUG) printf( "Minimum collection frequency has not been computed
yet!\n" );
- # Assign minimum collection frequency
-
- bckgnd_prob := min(col_freq(qenv));
- }
-
- # Precompute lambda values
- lambda := dbl(lmbd);
- m_lambda := dbl(1)-lambda;
-
- if (int(qenv.find(QENV_SCOREBASE)) = 0)
- res_reg := [+](left,
[+]([*](lambda,forgnd_prob),*(m_lambda,bckgnd_prob)));
- else if (int(qenv.find(QENV_SCOREBASE)) = 1)
- res_reg := [*](left,
[+]([*](lambda,forgnd_prob),*(m_lambda,bckgnd_prob)));
-
- if ( trace ) tj_trace( "END p_containing_t_LMs_ASPECT" );
- return res_reg;
-}
-
-##
-# Implementation of the Language Modeling retrieval model.
-#
-# This function is meant for set-based score computation (COARSE2 algebra),
e.g.:
-#
-# var R1 := select_node("article");
-# var terms := new(void,str).seqbase(oid(0));
-# var modifiers := new(void,int).seqbase(oid(0));
-# terms.append("information");
-# terms.append("retrieval");
-# var R2 := p_containing_t_LM(R1, terms, 1, SIZE_TERM);
-#
-# R2 now contains all regions in R1, however with scores attached according
-# to the occurrence of the terms "information" and "retrieval"
-#
-#
-# The score value for a single term and document is defined as follows:
-#
-# S(doc|tm) = FG(tm)
-#
-# where
-# tc(tm,doc)
-# FG(tm) = ------------ (foreground statistics)
-# len(doc)
-#
-# To compute score values for a set of terms at once, this function implements
-# presence weighting (thanks to Thijs for pointing this out), using the
following
-# formula:
-#
-# ___
-# S(doc|q) = | | FG(tm_i)
-# tm_i in q
-#
-# where:
-# q : query: set of query terms
-# tm_i : query term
-# doc : search document or, in this case, context region
-# tc(tm_i, doc): term count of query term tm_i in doc
-# len(doc) : size of doc (term or element size)
-##
-#PROC p_containing_q_LM2(bat[oid,dbl] ctx, bat[void,str] Qterms, BAT[oid,str]
qenv) : bat[oid,dbl]
-#{
-# if ( trace ) tj_trace( "BEGIN p_containing_t_LM_COARSE" );
-# # To follow the naming in the formula above, context regions are named
"documents".
-# # For each term we need:
-# # - foreground probability (first term). This depends on the context
region
-#
-# # Convert the query terms from [void,str] to [void,tid]
-# var bg_cName := qenv.find(QENV_FTIBGNAME);
-# var terms := _terms2void_tid( Qterms, bg_cName );
-#
-# ### Foreground probability:
-# # Find out the document positions of the terms for foreground probability
-# var tid_pre := indexfetchjoin(terms,
-# bat("tj_" + qenv.find(QENV_FTINAME) +
"_TermIndex"),
-# bat("tj_" + qenv.find(QENV_FTINAME) +
"_Terms") );
-#
-# if (tid_pre.count() = 0) { return new(oid,dbl); }
-# tid_pre := tid_pre.tsort();
-#
-# # TODO: fragmentation
-# var pre_size := bat("tj_" + qenv.find(QENV_FTINAME) + "_size1");
-#
-# # See which document contain the query terms we create a bat of [doc,
term-id]:
-# var elems := ctx.sort().mark(0@0);
-# var doc_tid := _containing_desc_tmj(elems, tid_pre.reverse(), pre_size);
-#
-# # len(doc): [doc, size]
-# var doc_len := [dbl](bat("tj_" + qenv.find(QENV_FTINAME) +
"_size1").semijoin(elems));
-#
-# ###
-#
-# # Now, we need to compute the probability for each document->term pair
-# var res := elems.project(dbl(1.0));
-#
-# # Iterate over all terms.
-# [EMAIL PROTECTED]() {
-# # Compute the first factor
-# # $t contains the term id
-# var occurrences := doc_tid.select($t).sort();
-#
-# # Count the occurrence of the term in all documents: tc(tm_i, doc),
-# # for all documents at once
-# var tc_tm_doc := [dbl](occurrences.reverse().histogram());
-# var foreground := [/](tc_tm_doc, doc_len);
-#
-# res := [*](res, foreground);
-# }
-#
-# if ( int(qenv.find(QENV_SCOREBASE)) = 0 ) {
-# # Add the scores to the context set (this should have scores 0, so
adding is OK)
-# res := [+](ctx, res);
-# } else {
-# # Add the scores to the context set (this should have scores 1, so
multiplying is OK)
-# res := [*](ctx, res);
-# }
-# if ( trace ) tj_trace( "END p_containing_t_LM_COARSE" );
-# return res;
-#}
-
-##
-# Implementation of the Language Modeling retrieval model.
-#
-# This function is meant for set-based score computation (COARSE2 algebra),
e.g.:
-#
-# var R1 := select_node("article");
-# var terms := new(void,str).seqbase(oid(0));
-# var modifiers := new(void,int).seqbase(oid(0));
-# terms.append("information");
-# terms.append("retrieval");
-# var R2 := p_containing_t_LMs(R1, terms, modifiers, 0.5, 1, SIZE_TERM);
-#
-# R2 now contains all regions in R1, however with scores attached according
-# to the occurrence of the terms "information" and "retrieval"
-#
-#
-# The score value for a single term and document is defined as follows:
-#
-# S(doc|tm) = (1 - lambda) * FG(tm) + lambda * BG(tm)
-#
-# where
-# tc(tm,doc)
-# FG(tm) = ------------ (foreground statistics)
-# len(doc)
-#
-# tc(tm,col)
-# BG(tm) = ------------ (background statistics)
-# len(col)
-#
-# To compute score values for a set of terms at once, this function implements
-# presence weighting (thanks to Thijs for pointing this out), using the
following
-# formula:
-#
-# / ___ (1 - lambda) * FG(tm_i) \ ___
-# S(doc|q) = | | | ---------------------- + 1 | * | | lambda *
BG(tm_i)
-# \ tm_i in q lambda * BG(tm_i) / tm_i in q
-#
-# where:
-# q : query: set of query terms
-# tm_i : query term
-# doc : search document or, in this case, context region
-# tc(tm_i, doc): term count of query term tm_i in doc
-# len(doc) : size of doc (term or element size)
-##
-#PROC p_containing_q_LMs2(bat[oid,dbl] ctx, bat[void,str] Qterms, bat
modifiers, flt lambda, int stemming, int size_type,BAT[oid,str] qenv) :
bat[oid,dbl]
-#{
-# if ( trace ) tj_trace( "BEGIN p_containing_t_LMs_COARSE" );
-# # To follow the naming in the formula above, context regions are named
"documents".
-# # For each term we need:
-# # - foreground probability (first term). This depends on the context
region
-# # - background probability (second term). This is the same for every
context region
-#
-# # Convert the query terms from [void,str] to [void,tid]
-# var bg_cName := qenv.find(QENV_FTIBGNAME);
-# var terms := _terms2void_tid( Qterms, bg_cName );
-#
-# ### Background probability:
-# # For each term: collection term frequency tc(tm_i, col):
-# # var col_term_frq := tid_pre.reverse().histogram();
-# # small fix to get LMs running again...
-# var col_term_frq := collTermCount(qenv.find(QENV_FTIBGNAME),
terms.reverse().project(int(0)));
-#
-# # Collection size len(col): int
-# var col_len := bat("tj_" + qenv.find(QENV_FTIBGNAME) + "_Terms").count();
-#
-# # Take only the terms that occur in the collection
-# terms := terms.reverse().semijoin(col_term_frq).hmark(0@0);
-#
-# if (terms.count() = 0) { return new(oid,dbl); }
-#
-# ###
-#
-# ### Foreground probability:
-# # Find out the document positions of the terms for foreground probability
-# var tid_pre := indexfetchjoin(terms,
-# bat("tj_" + qenv.find(QENV_FTINAME) +
"_TermIndex"),
-# bat("tj_" + qenv.find(QENV_FTINAME) +
"_Terms") );
-#
-# if (tid_pre.count() = 0) { return new(oid,dbl); }
-# tid_pre := tid_pre.tsort();
-#
-# # TODO: fragmentation
-# var pre_size := bat("tj_" + qenv.find(QENV_FTINAME) + "_size1");
-#
-# # See which document contain the query terms we create a bat of [doc,
term-id]:
-# var doc_tid := _containing_desc_tmj(ctx.sort().mark(0@0),
tid_pre.reverse(), pre_size);
-#
-# # len(doc): [doc, size]
-# var doc_len := [dbl](bat("tj_" + qenv.find(QENV_FTINAME) +
"_size1").semijoin(doc_tid));
-#
-# ###
-#
-# # Now, we need to compute the probability for each document->term pair
-# var doc_termscore := new(oid,dbl);
-# var prod_background := dbl(1);
-#
-# # Iterate over all terms.
-# [EMAIL PROTECTED]() {
-# # Compute the first factor
-# # $t contains the term id
-# var occurrences := doc_tid.select($t).sort();
-#
-# # Count the occurrence of the term in all documents: tc(tm_i, doc),
-# # for all documents at once
-# var tc_tm_doc := [dbl](occurrences.reverse().histogram());
-# var foreground := [/](tc_tm_doc, doc_len);
-#
-# # Compute the background probability: tc(tm_i,col)/len(col)
-# var tc_tm_col := col_term_frq.find($t);
-# var background := dbl(tc_tm_col) / dbl(col_len);
-#
-# # Compute the first factor. This generates a [doc, term score] table
for
-# # each combination of doc and term
-# var total := [+]( [/]( [*]((1.0 - lambda), foreground), lambda *
background), dbl(1) );
-# doc_termscore.insert( [dbl](total) );
-#
-# # Compute the second factor: product of background statistics over
all terms
-# prod_background := prod_background * background;
-# }
-#
-# # We now have a table that lists [doc, term score] pairs. These need to
be aggregated
-# # into [doc, score] pairs: (this is the first aggregate product in the
formula)
-#
-# var elements := ctx.mark(0@0);
-#
-# if ( not( returnAllElements ) )
-# elements := elements.semijoin(doc_tid);
-#
-# var res := {prod}(doc_termscore.tmark(0@0),
doc_termscore.hmark(0@0), elements );
-#
-# # Now compute the final scores: multiply by the background score
-# res := [*](res, lambda * prod_background);
-# ctx.print();
-# qenv.find(QENV_SCOREBASE).print();
-# if ( int(qenv.find(QENV_SCOREBASE)) = 0 ) {
-# # Add the scores to the context set (this should have scores 0, so
adding is OK)
-# res := [+](ctx, res);
-# } else {
-# # Add the scores to the context set (this should have scores 1, so
multiplying is OK)
-# res := [*](ctx, res);
-# }
-# if ( trace ) tj_trace( "END p_containing_t_LMs_COARSE" );
-# return res;
-#}
-#
-##
-# Returns the collection frequency table (should be precomputed, but is
calculated for now)
-##
-PROC col_freq(BAT[oid,str] qenv) : bat[oid,dbl] :=
-{
- var start_time := time();
- var allterms :=
bat(tj_TermBat(qenv.find(QENV_FTINAME))).sort().mark(0@0).reverse();
- var alltermpos := indexfetchjoin(allterms,
bat("tj_"+qenv.find(QENV_FTINAME)+"_TermIndex"),
bat("tj_"+qenv.find(QENV_FTINAME)+"_Terms"));
- var col_freqs := [dbl]({count}(alltermpos));
-
- if (DEBUG) printf( "col_freq: \t\t%d ms\n", time() - start_time );
- return col_freqs;
-}
-
-##
-# Returns the collection frequency of the term with the indicated tid
-##
-PROC col_freq(oid tid, BAT[oid,str] qenv) : dbl :=
-{
- var start_time := time();
- var alltermpos := indexfetchjoin(new(int,oid).insert(0,tid),
bat("tj_"+qenv.find(QENV_FTINAME)+"_TermIndex"),
bat("tj_"+qenv.find(QENV_FTINAME)+"_Terms"));
- var col_freqs := {count}(alltermpos);
- var result := dbl(col_freqs.fetch(0));
- if (DEBUG) printf( "col_freq: \t\t%d ms\n", time() - start_time );
- return result;
-}
-
-##
-# For each region in left, count the number of regions in right it contains.
-##
-PROC reg_freq(bat[oid,dbl] left, bat[oid,dbl] right, int size_type,
BAT[oid,str] qenv) : bat[oid,dbl] :=
-{
- if ( trace ) tj_trace( "START reg_freq");
-
- var reg_size;
-
- var ctx_tmp2 := new( oid, int, ENTITY_NUM );
-
- # TODO: fragmentation
- var pre_size := load( "tj_" + qenv.find(QENV_FTINAME) + "_size1");
- ctx_tmp2 := anc_desc( left, right, pre_size );
-
- # Containment count: for each region from left, count how many regions
from right it contains
- var num_terms := {count}(ctx_tmp2);
- var prob_tmp := [dbl](num_terms);
-
- # Region size: Determine the entity or term size of all regions from left
that have descendants in right
- if (size_type = SIZE_ENTITY)
- reg_size := size_entity(prob_tmp,qenv);
- else if (size_type = SIZE_TERM)
- reg_size := size_term(prob_tmp,qenv);
-
- # prob_tmp := [/](prob_tmp,prob_tmp.mirror().join(reg_size));
-
- # Divide the containment count (left) by the region size (left)
- prob_tmp := [/](prob_tmp,reg_size);
-
- # Set all to zero regions from left that do not contain regions from right
-
- var result := prob_tmp.kunion( left.project(dbl(0)) );
-
- if ( trace ) tj_trace( "END reg_freq " );
-
- return result;
-}
-
# Calculate the term size of the region: how many terms does it contain?
PROC size_term( bat[oid,any] region, BAT[oid,str] qenv ) : bat[oid,dbl] :=
@@ -2746,9 +2245,9 @@
}
# Entity sizes:
- var pre_size := load( "tj_" + qenv.find(QENV_FTINAME) + "_size1");
+ var pre_size := bat( "tj_" + qenv.find(QENV_FTINAME) + "_size1");
- var result := [dbl](region.mirror().join( pre_size ));
+ var result := [dbl](pre_size.semijoin(region));
if ( trace ) {
var trace_msg := sprintf("(returning %d regions)", count(result));
@@ -4405,7 +3904,7 @@
dst += SRs;
@
@c
-int CMDtreemergejoin_sort_flat(BAT **result, BAT *Astart, BAT *pre_size, BAT
*Dstart) {
+int CMDtreemergejoin_sort_unnested(BAT **result, BAT *Astart, BAT *pre_size,
BAT *Dstart) {
/* ---------------------------- declarations
------------------------------------ */
char *name = "TJ_treemergejoin_sort";
-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
_______________________________________________
Monetdb-pf-checkins mailing list
Monetdb-pf-checkins@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins