Update of /cvsroot/monetdb/pathfinder/modules/pftijah
In directory sc8-pr-cvs16.sourceforge.net:/tmp/cvs-serv11343

Modified Files:
      Tag: XQuery_0-18
        nexi.c nexi_generate_mil.c pftijah.mx 
Log Message:
- rewrite of the central score function (MIL) code
it is now easier to read and extend. Furthermore, most score function
params are now passed inside a query-environment BAT, which is a cleaner
solution than having a different parameter signature for each score
function.

- fixed small bug and corresponding test case for score combination in the
LMs retrieval model. (scores should be multiplied here, not added)

(I did not add any new functionality!)




Index: nexi_generate_mil.c
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/nexi_generate_mil.c,v
retrieving revision 1.33
retrieving revision 1.33.2.1
diff -u -d -r1.33 -r1.33.2.1
--- nexi_generate_mil.c 30 May 2007 14:08:09 -0000      1.33
+++ nexi_generate_mil.c 4 Jun 2007 17:42:06 -0000       1.33.2.1
@@ -708,19 +708,21 @@
 
         case MODEL_LM :
 
-          MILPRINTF(MILOUT, "R%d := R%d.p_containing_q_LM(terms, qenv);\n", 
com_num, com_nr_left);
+//          MILPRINTF(MILOUT, "R%d := R%d.p_containing_q_LM(terms, qenv);\n", 
com_num, com_nr_left);
+          MILPRINTF(MILOUT, "R%d := 
R%d.p_containing_q_LMs%s(terms%s,qenv);\n", com_num, com_nr_left, 
parserCtx->ffPfx, parserCtx->flastPfx);
 
           break;
 
         case MODEL_LMS :
          
-          MILPRINTF(MILOUT, "R%d := R%d.p_containing_q_LMs(terms, modifiers, 
%f, %d, %d,qenv);\n", com_num, com_nr_left, txt_retr_model->param1, 
txt_retr_model->stemming, txt_retr_model->size_type);
+//          MILPRINTF(MILOUT, "R%d := R%d.p_containing_q_LMs(terms, modifiers, 
%f, %d, %d,qenv);\n", com_num, com_nr_left, txt_retr_model->param1, 
txt_retr_model->stemming, txt_retr_model->size_type);
+          MILPRINTF(MILOUT, "R%d := 
R%d.p_containing_q_LMs%s(terms%s,qenv);\n", com_num, com_nr_left, 
parserCtx->ffPfx, parserCtx->flastPfx);
 
           break;
         
         case MODEL_NLLR :
             
-          MILPRINTF(MILOUT, "R%d := R%d.p_containing_q_NLLR%s(terms, 
%f%s,qenv);\n", com_num, com_nr_left, parserCtx->ffPfx, txt_retr_model->param1, 
parserCtx->flastPfx);
+          MILPRINTF(MILOUT, "R%d := 
R%d.p_containing_q_NLLR%s(terms%s,qenv);\n", com_num, com_nr_left, 
parserCtx->ffPfx, parserCtx->flastPfx);
         
           break;
         

Index: nexi.c
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/nexi.c,v
retrieving revision 1.64
retrieving revision 1.64.2.1
diff -u -d -r1.64 -r1.64.2.1
--- nexi.c      30 May 2007 14:08:08 -0000      1.64
+++ nexi.c      4 Jun 2007 17:42:05 -0000       1.64.2.1
@@ -234,9 +234,11 @@
     return_all          = FALSE;
     stem_stop_query     = FALSE;
     bool eq_init        = FALSE;
-    
+   
+    /* set query environment */
+    MILPRINTF(MILOUT, "var qenv := create_qenv();\n");
+
     /* startup of argument options */
-    /* INCOMPLETE, select_root() should distinguish between card(0) and nil */
     if ( use_startNodes ) {
         MILPRINTF(MILOUT, "var startNodes := new(void,oid);\n");
         MILPRINTF(MILOUT, "if ( view_bbp_name().reverse().exist(\"%s\") ) 
{\n", startNodes_name );
@@ -253,8 +255,10 @@
     MILPRINTF(MILOUT, "var tracefile := \"\";\n" );
     
 
-    char* qenv_prox_val = NULL;
-    char* qenv_fb_val   = NULL;
+    char* qenv_prox_val  = NULL;
+    char* qenv_fb_val    = NULL;
+    char* qenv_scorebase = "0"; //default setting
+    char* qenv_c_lambda = "0.8"; //default setting
 
     BUN p, q;
     BATloop(optbat, p, q) {
@@ -298,8 +302,10 @@
                 txt_retr_model->model = MODEL_BOOL;
             } else if ( strcasecmp(optVal,"LM") == 0 ) {
                 txt_retr_model->model = MODEL_LM;
+                qenv_scorebase = "1";
             } else if ( strcasecmp(optVal,"LMS") == 0 ) {
                 txt_retr_model->model = MODEL_LMS;
+                qenv_scorebase = "1";
             } else if ( strcasecmp(optVal,"TFIDF") == 0 ) {
                 txt_retr_model->model = MODEL_TFIDF;
             } else if ( strcasecmp(optVal,"OKAPI") == 0 ) {
@@ -371,6 +377,7 @@
         } else if ( strcmp(optName,"collection-lambda") == 0 || 
                     strcmp(optName,"ir-model-param1") == 0) {
             txt_retr_model->param1 = atof( optVal );
+           qenv_c_lambda = optVal;
 
         } else if ( strcmp(optName,"ir-model-param2") == 0 ) {
             txt_retr_model->param2 = atof( optVal );
@@ -433,12 +440,12 @@
             MILPRINTF(MILOUT, "trace     := TRUE;\n" );
             MILPRINTF(MILOUT, "tracefile := \"%s\";\n", optVal );
             
-        } else if (strcmp(optName, "scoreBase") == 0) {
+ /*       } else if (strcmp(optName, "scoreBase") == 0) {
             if (strcasecmp(optVal, "ONE") == 0) {
-                MILPRINTF(MILOUT, "qenv := tj_setScoreBase(\"1\",qenv);\n");
+                qenv_scorebase = "0";
             } else {
-                MILPRINTF(MILOUT, "qenv := tj_setScoreBase(\"0\",qenv);\n");
-            }
+                qenv_scorebase = "1";
+            } */
         } else if (strcmp(optName, "stem_stop_query") == 0) {
             if (strcasecmp(optVal, "TRUE") == 0) {
                 stem_stop_query = TRUE;
@@ -491,10 +498,11 @@
         algebra_type = COARSE2;
     }
         
-    
+    MILPRINTF(MILOUT, 
"modify_qenv(qenv,QENV_FTINAME,\"%s\");\n",parserCtx->collection);
+    MILPRINTF(MILOUT, 
"modify_qenv(qenv,QENV_FTIBGNAME,\"%s\");\n",parserCtx->collection);
+    MILPRINTF(MILOUT, 
"modify_qenv(qenv,QENV_SCOREBASE,\"%s\");\n",qenv_scorebase);
+    MILPRINTF(MILOUT, 
"modify_qenv(qenv,QENV_C_LAMBDA,\"%s\");\n",qenv_c_lambda);
     // Prepend some variables to the MIL code.
-    MILPRINTF(MILOUT, "var qenv := 
create_qenv(\"%s\",\"%s\",\"0\");\n",parserCtx->collection,parserCtx->collection);
-
     if ( qenv_prox_val ) { 
         MILPRINTF(MILOUT, 
"modify_qenv(qenv,QENV_TERM_PROXIMITY,\"%s\");\n",qenv_prox_val);
        free(qenv_prox_val);

Index: pftijah.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah.mx,v
retrieving revision 1.134
retrieving revision 1.134.2.1
diff -u -d -r1.134 -r1.134.2.1
--- pftijah.mx  1 Jun 2007 15:34:59 -0000       1.134
+++ pftijah.mx  4 Jun 2007 17:42:06 -0000       1.134.2.1
@@ -187,8 +187,8 @@
 
 const ENTITY_NUM := 10000;
 
-var collHeight := 10;
-var retNum := 100;
+#var collHeight := 10;
+#var retNum := 100;
 
 var trace := false;
 var timing := false;
@@ -215,16 +215,14 @@
 const QENV_FTINAME        := 0@0;
 const QENV_FTIBGNAME      := 1@0;
 const QENV_SCOREBASE      := 2@0;
-const QENV_TERM_PROXIMITY := 3@0;
-const QENV_FEEDBACK_DOCS  := 4@0;
+const QENV_C_LAMBDA       := 3@0;
+const QENV_TERM_PROXIMITY := 4@0;
+const QENV_FEEDBACK_DOCS  := 5@0;
 
 # create a query environment bat
-PROC create_qenv(str fti_name, str bg_fti_name, str scb) : BAT[oid,str]
+PROC create_qenv() : BAT[oid,str]
 {
     var res := new(oid,str);
-    res.insert(QENV_FTINAME, fti_name);
-    res.insert(QENV_FTIBGNAME, bg_fti_name);
-    res.insert(QENV_SCOREBASE, scb);
     return res;
 }
 
@@ -1125,13 +1123,6 @@
 }
 
 # INCOMPLETE: henning, what should I do about this.
-PROC tj_setScoreBase(int default, BAT[oid,str] qenv) : BAT[void,str] :=
-{
-       return 
create_qenv(qenv.find(QENV_FTINAME),qenv.find(QENV_FTIBGNAME),default);
-}
-
-
-# INCOMPLETE: henning, what should I do about this.
 var equivalences := new(str,str);
 PROC tj_initEquivalences() : void := 
 {
@@ -1277,12 +1268,12 @@
 # 
 # Forwards to parent_child_llscj: see below.
 ##
-PROC parent_child( bat[oid,any] parent, bat[oid,any] child, BAT[oid,str] qenv) 
: bat[oid,oid] :=
-{
-    return parent_child_llscj( parent, child, qenv );
-}
-
-
+#PROC parent_child( bat[oid,any] parent, bat[oid,any] child, BAT[oid,str] 
qenv) : bat[oid,oid] :=
+#{
+#    return parent_child_llscj( parent, child, qenv );
+#}
+#
+#
 ## 
 # Compute parent-child relation using the loop-lifted staircase join. 
 # 
@@ -1291,38 +1282,38 @@
 #
 # Returns a bat containing [parent,child] preorder index pairs
 ##
-PROC parent_child_llscj( bat[oid,any] parent, bat[oid,any] child, BAT[oid,str] 
qenv) : bat[oid,oid] :=
-{
-    # Items contains the context nodes for the descendant step: 
-    # this is the right side argument to contained_by.
-    # The table must be [void,oid], so:
-    var items := parent.mark([EMAIL PROTECTED]).reverse();
-    
-    # Suggestion from Thijs: make iters a [void,void], with the same length as 
anc
-    var iters := parent.hmark(oid(0)).mark(oid(0));
-    
-    # Candidates: all element nodes
-    var candidates := child.sort().mark([EMAIL PROTECTED]).reverse();
-    
-    # Load the pre-size table
-    var pre_size := load( "tj_" + qenv.find(QENV_FTINAME) + "_size1");
-    
-    # Check the order of the items:
-    items.chk_order();
-    
-    var void_chld := ll_child(iters, items, pre_size, candidates, collHeight, 
false, false, min(iters), max(iters), false);
-    
-    # Map back the ancestors
-    var par_desc  := parent.mark(oid(0)).join(void_chld);
-    
-    candidates := nil;
-    items := nil;
-    pre_size := nil;
-    
-    return par_chld;
-}
-
-
+#PROC parent_child_llscj( bat[oid,any] parent, bat[oid,any] child, 
BAT[oid,str] qenv) : bat[oid,oid] :=
+#{
+#    # Items contains the context nodes for the descendant step: 
+#    # this is the right side argument to contained_by.
+#    # The table must be [void,oid], so:
+#    var items := parent.mark([EMAIL PROTECTED]).reverse();
+#    
+#    # Suggestion from Thijs: make iters a [void,void], with the same length 
as anc
+#    var iters := parent.hmark(oid(0)).mark(oid(0));
+#    
+#    # Candidates: all element nodes
+#    var candidates := child.sort().mark([EMAIL PROTECTED]).reverse();
+#    
+#    # Load the pre-size table
+#    var pre_size := load( "tj_" + qenv.find(QENV_FTINAME) + "_size1");
+#    
+#    # Check the order of the items:
+#    items.chk_order();
+#    
+#    var void_chld := ll_child(iters, items, pre_size, candidates, collHeight, 
false, false, min(iters), max(iters), false);
+#    
+#    # Map back the ancestors
+#    var par_desc  := parent.mark(oid(0)).join(void_chld);
+#    
+#    candidates := nil;
+#    items := nil;
+#    pre_size := nil;
+#    
+#    return par_chld;
+#}
+#
+#
 ##
 # Converts a list of query terms to a list of term id->document position 
mappings.
 #
@@ -1352,12 +1343,13 @@
 # Stemming on the query terms is performed using the same stemmer
 # that was used for the collection.
 ##
-PROC _terms2void_tid( bat[void,str] Qterms, BAT[oid,str] qenv): bat[void,oid] 
:=
+PROC _terms2void_tid( bat[void,str] Qterms, str bg_cName): bat[void,oid] :=
 {
-    var stemmer := bat("tj_"+ qenv.find(QENV_FTINAME) 
+"_param").find("stemmer");
+    var stemmer := bat("tj_"+ bg_cName +"_param").find("stemmer");
     var stemmed := [tj_normalizeTerm]( [toLower](Qterms), stemmer );
-    
-    var tids := bat(_tj_TermBat(qenv.find(QENV_FTINAME))).join( 
stemmed.reverse() ).sort().hmark(oid(0));
+    var tids := bat(_tj_TermBat(bg_cName)).join( stemmed.reverse() 
).sort().hmark(oid(0));
+    var stopwords := tids.uselect([EMAIL PROTECTED]);
+    tids := tids.kdiff(stopwords);
     return tids;
 }
 
@@ -1835,8 +1827,8 @@
 }
 
 # returns the document likelihood dLH(t) of term t
-# in all given docs
-PROC _getTermDocLHs(oid tid, BAT[void,oid] docs, str cName) : bat[oid,dbl] := {
+# in all given docs that include the term
+PROC _getTermDocLHs0(oid tid, BAT[void,oid] docs, str cName) : bat[oid,dbl] := 
{
      var pre_size := bat("tj_" + cName + "_size1");
      # get term positions in the entire collection
      var termPREs := _getTermPositions(tid, cName);
@@ -1846,31 +1838,89 @@
      var res := doc_termPRE.reverse().histogram().sort();
      doc_termPRE := nil;
      var doc_size := pre_size.semijoin(res).sort();
-     
      res := [dbl](res).access(BAT_WRITE);
      res.left_div(doc_size);
      doc_size := nil; 
      return res.access(BAT_READ);
 }
 
-#               ___                 /  (1 - lambda) * dLH(t)       \  
-# NLLR(d|q) =   | |    qLH(t) * log |  ----------------------  + 1 |  
-#              t in q               \      lambda * cLH(t)         /  
+# returns the document likelihood dLH(t) of term t
+# in all given docs (not only those including t)
+PROC _getTermDocLHs1(oid tid, BAT[void,oid] docs, str cName) : bat[oid,dbl] := 
{
+     var pre_size := bat("tj_" + cName + "_size1");
+     # get term positions in the entire collection
+     var termPREs := _getTermPositions(tid, cName);
+     # get doc - term relation
+     var doc_termPRE := treemergejoin_sort(docs, pre_size, termPREs);
+     termPREs := nil;
+     var res := doc_termPRE.reverse().histogram().sort();
+     doc_termPRE := nil;
+     var doc_size := pre_size.semijoin(res).sort();
+     res := [dbl](res).access(BAT_WRITE);
+     res.left_div(doc_size);
+     doc_size := nil;
+     res.access(BAT_READ);
+     var extended := docs.reverse().project(dbl(0));
+     extended.access(BAT_WRITE);
+     extended.replace(res);
+     extended.access(BAT_READ);     
+     return extended.sort();
+}
+
+#             ___          
+# LM(d|q) =   | |    qCnt(t) * dLH(t)  
+#            t in q           
 #
-# where qLH(t) = likelihood of term t in query q
+# where qCnt(t) = count of term t in query q
+# where dLH(t) = likelihood of term t in doc d
+#
+PROC _score_LM(int term_qCnt, int qSize, dbl term_cLH, BAT[oid,dbl] term_dLHs, 
dbl cLambda) : bat[oid,dbl] := {
+    var term_dScores := [*](term_dLHs, term_qCnt);
+    return term_dScores;
+}
+
+#              ___          
+# LMs(d|q) =   | |    qCnt(t) ( (1-lambda) dLH(t) + lambda cLH(t) ) 
+#             t in q           
+#
+# where qCnt(t) = count of term t in query q
 # where dLH(t) = likelihood of term t in doc d
 # where cLH(t) = likelihood of term t in (background) collection c
 #
-PROC _score_NLLR(dbl term_qLH, dbl term_cLH, BAT[oid,dbl] term_dLHs, flt 
cLambda) : bat[oid,dbl] := {
+PROC _score_LMs(int term_qCnt, int qSize, dbl term_cLH, BAT[oid,dbl] 
term_dLHs, dbl cLambda) : bat[oid,dbl] := {
+    var tmp1 := dbl(term_qCnt) * cLambda * term_cLH;
+    var tmp2 := dbl(term_qCnt) * (dbl(1) - cLambda);
+    var term_dScores := [*](term_dLHs, tmp2);
+    term_dScores := [+](term_dScores, tmp1);
+    return term_dScores;
+}
+
+#               ___    qCnt(t)       /  (1 - lambda) * dLH(t)       \  
+# NLLR(d|q) =   | |    ------- * log |  ----------------------  + 1 |  
+#              t in q   qSize        \      lambda * cLH(t)         /  
+#
+# where qCnt(t) = count of term t in query q
+# where qSize = number of terms in query q
+# where dLH(t) = likelihood of term t in doc d
+# where cLH(t) = likelihood of term t in (background) collection c
+#
+PROC _score_NLLR(int term_qCnt, int qSize, dbl term_cLH, BAT[oid,dbl] 
term_dLHs, dbl cLambda) : bat[oid,dbl] := {
     var tmp := (dbl(1) - cLambda) / (cLambda * term_cLH);
     var term_dScores := [*](term_dLHs, tmp);
     term_dScores := [+](term_dScores, dbl(1));
     term_dScores := [log](term_dScores);
+    var term_qLH := dbl(term_qCnt) / dbl(qSize);
     term_dScores := [*](term_dScores, term_qLH);
     return term_dScores;
 }
 
-PROC p_containing_q_NLLR2(bat[oid,dbl] left, bat[void,str] Qterms, flt lmbd, 
str ind,BAT[oid,str] qenv) : bat[oid,dbl] :=
+# parameters: score_method, aggregation_method, base_score
+@:p_containing_q(NLLR,add,0,+,0)@
+@:p_containing_q(LMs,mul,1,*,cLambda * term_cLH)@
+@:p_containing_q(LM,mul,1,*,0)@
+
[EMAIL PROTECTED] p_containing_q
+PROC [EMAIL PROTECTED](bat[oid,dbl] left, bat[void,str] Qterms, str ind, 
BAT[oid,str] qenv) : bat[oid,dbl] :=
 {
     if ( trace ) tj_trace( "BEGIN p_containing_q" );
 
@@ -1882,21 +1932,23 @@
     var t_aggr := 0;
     t_total :-= time();
    
+    var cName := qenv.find(QENV_FTINAME);
+    var bg_cName := qenv.find(QENV_FTIBGNAME);
+    
     # get term ids and drop all terms with zero frq in background-col and 
calculate query LM
-    var terms := _terms2void_tid( Qterms, qenv );
+    var terms := _terms2void_tid( Qterms, bg_cName );
     var qSize := terms.count();
-    var qLM := [/]([dbl](terms.histogram()),dbl(qSize));
+    var t_qCnt := terms.histogram();
     
-    # init variables
-    var cName := qenv.find(QENV_FTINAME);
-    var bg_cName := qenv.find(QENV_FTIBGNAME);
+    # init further variables
+    var scoreBase := dbl(@3);
     var cSize := bat("tj_" + bg_cName + "_Terms").count();
-    var cLambda := lmbd; 
+    var cLambda := dbl(qenv.find(QENV_C_LAMBDA));
     var res := left.sort();
-    var dScores := 
res.project(dbl(qenv.find(QENV_SCOREBASE))).access(BAT_WRITE);
+    var dScores := res.project(scoreBase).access(BAT_WRITE);
     t_loop :-= time();
     # loop over query terms
-    qLM@batloop()
+    t_qCnt@batloop()
     {
        # get collection likelihood of term
        t_cLH :-= time();
@@ -1905,81 +1957,84 @@
 
        # get document likelihoods of term
        t_dLHs :-= time();
-       var term_dLHs := _getTermDocLHs($h, res.hmark([EMAIL PROTECTED]), 
cName);
+       var term_dLHs := [EMAIL PROTECTED]($h, res.hmark([EMAIL PROTECTED]), 
cName);
        t_dLHs :+= time();
        
        # BEGIN RETRIEVAL MODEL DEPENDENT CODE
        t_score :-= time();
-       var term_dScores := _score_NLLR($t, term_cLH, term_dLHs, cLambda);
+       var term_dScores := [EMAIL PROTECTED]($t, qSize, term_cLH, term_dLHs, 
cLambda);
+       # update base score of non-matching docs
+       scoreBase :@4= @5;  
        t_score :+= time();
-       # END RETRIEVAL MODEL DEPENDENT CODE
-
        # aggregate term scores
        t_aggr :-= time();
-       dScores := dScores.left_add(term_dScores);
+       dScores := [EMAIL PROTECTED](term_dScores);
        t_aggr :+= time();
+       # END RETRIEVAL MODEL DEPENDENT CODE
     } 
     t_loop :+= time();
     dScores.access(BAT_READ);
     # delete all docs from the result list that do not match any query term
-    if ( not( returnAllElements ) and int(qenv.find(QENV_SCOREBASE)) = 0) 
-        dScores := dScores.select(dbl(0), dbl(nil), false, true);
-    
+    if ( not( returnAllElements )) {
+        var unchanged := dScores.uselect(scoreBase);
+        dScores := dScores.kdiff(unchanged);
+    }
+   
     # combine new doc scores with prior ones 
-    res := [+](res, dScores);
-    
+    res := [EMAIL PROTECTED](res, dScores);
     t_total :+= time();
     if (timing) printf("# nllr timing: total: %d, loop: %d, cLH: %d, dLHs: %d, 
score: %d, aggr: %d\n", t_total, t_loop, t_cLH, t_dLHs, t_score, t_aggr);
     return res;
 }
 
-PROC collTermCount(str col, bat[oid,int] terms) : bat[oid,int] :=
-{
-     var tids := terms.mirror();
-     var offsets1 := tids.leftfetchjoin(bat("tj_" + col + "_TermIndex"));
-     tids := tids.[int]().[+](1).[oid]();
-     var offsets2 := tids.leftfetchjoin(bat("tj_" + col + "_TermIndex"));
-     var res := [-](offsets2.[int](),offsets1.[int]()).select(1,int(nil));
-     return res;
-}
-
-PROC score_NLLR_mil(bat[oid,oid] elem_tid, bat[void,int] pre_size, 
bat[oid,int] tid_cnt, bat[oid,int] tid_frq, bat[oid,oid] elems, dbl _lmbd, int 
q_cnt) : bat[oid,dbl] :=
-{
-     # compute collection terms frequencies 
-     var _tid_frq := [/](_lmbd, tid_frq);
-     tid_frq := nil;
-
-     # compute document sizes
-     var elem_size := pre_size.semijoin(elems);
-
-     # compute scores in batloop over terms
-     var doc_prob := new(oid,dbl);
-     [EMAIL PROTECTED]()
-     {
-         var tmp := elem_tid.select($h);
-         var fac := dbl(tid_cnt.find($h)) / dbl(q_cnt);
-         tmp := tmp.reverse().histogram();
-         tmp := [dbl](tmp);
-         tmp := [/](tmp, elem_size);
-         tmp := [*](tmp, $t);
-         tmp := [+](tmp, 1);
-         tmp := [log](tmp);
-         tmp := [*](tmp, fac);
-         doc_prob.insert(tmp);
-     }
-    
-    var elements := left.mark([EMAIL PROTECTED]);
-    
-    if ( not( returnAllElements ) ) 
-        elements := elements.semijoin(elem_tid);
-     
-     # aggregate doc scores
-     var res := {sum}(doc_prob.tmark([EMAIL PROTECTED]), doc_prob.hmark([EMAIL 
PROTECTED]), elements);
-    
-     #res := res.[/](_tid_frq.count());
-
-     return res;
-}
[EMAIL PROTECTED]
+#PROC collTermCount(str col, bat[oid,int] terms) : bat[oid,int] :=
+#{
+#     var tids := terms.mirror();
+#     var offsets1 := tids.leftfetchjoin(bat("tj_" + col + "_TermIndex"));
+#     tids := tids.[int]().[+](1).[oid]();
+#     var offsets2 := tids.leftfetchjoin(bat("tj_" + col + "_TermIndex"));
+#     var res := [-](offsets2.[int](),offsets1.[int]()).select(1,int(nil));
+#     return res;
+#}
+#
+#PROC score_NLLR_mil(bat[oid,oid] elem_tid, bat[void,int] pre_size, 
bat[oid,int] tid_cnt, bat[oid,int] tid_frq, bat[oid,oid] elems, dbl _lmbd, int 
q_cnt) : bat[oid,dbl] :=
+#{
+#     # compute collection terms frequencies 
+#     var _tid_frq := [/](_lmbd, tid_frq);
+#     tid_frq := nil;
+#
+#     # compute document sizes
+#     var elem_size := pre_size.semijoin(elems);
+#
+#     # compute scores in batloop over terms
+#     var doc_prob := new(oid,dbl);
+#     [EMAIL PROTECTED]()
+#     {
+#        var tmp := elem_tid.select($h);
+#        var fac := dbl(tid_cnt.find($h)) / dbl(q_cnt);
+#        tmp := tmp.reverse().histogram();
+#        tmp := [dbl](tmp);
+#        tmp := [/](tmp, elem_size);
+#        tmp := [*](tmp, $t);
+#        tmp := [+](tmp, 1);
+#        tmp := [log](tmp);
+#        tmp := [*](tmp, fac);
+#        doc_prob.insert(tmp);
+#     }
+#    
+#    var elements := left.mark([EMAIL PROTECTED]);
+#    
+#    if ( not( returnAllElements ) ) 
+#        elements := elements.semijoin(elem_tid);
+#     
+#     # aggregate doc scores
+#     var res := {sum}(doc_prob.tmark([EMAIL PROTECTED]), 
doc_prob.hmark([EMAIL PROTECTED]), elements);
+#    
+#     #res := res.[/](_tid_frq.count());
+#
+#     return res;
+#}
 
 PROC p_containing_q_NLLR_frag(bat[oid,bat] left, bat[void,str] Qterms, flt 
lmbd,BAT[oid,str] qenv) : bat[oid,bat] :=
 {
@@ -1987,7 +2042,8 @@
     var t1 := time();
 
      # get term ids and drop all terms with zero frq in col or background-col
-     var terms := _terms2void_tid( Qterms, qenv ).histogram();
+    var bg_cName := qenv.find(QENV_FTIBGNAME);
+    var terms := _terms2void_tid( Qterms, bg_cName );
      var q_cnt := Qterms.count();
      var tid_frq := collTermCount(qenv.find(QENV_FTIBGNAME), terms);
      var tid_cnt := terms.semijoin(tid_frq);
@@ -2044,99 +2100,99 @@
      return res;
 }
 
-PROC p_containing_q_NLLR(bat[oid,dbl] left, bat[void,str] Qterms, flt lmbd, 
str ind,BAT[oid,str] qenv) : bat[oid,dbl] :=
-{
-    if ( trace ) tj_trace( "BEGIN p_containing_q_NLLR" );
-    var t1 := time();
-    var pre_size := bat("tj_" + qenv.find(QENV_FTINAME) + "_size" + ind);
-    
-     # get term ids and drop all terms with zero frq in col or background-col
-     var terms := _terms2void_tid( Qterms, qenv ).histogram();
-     var q_cnt := Qterms.count();
-     var tid_frq := collTermCount(qenv.find(QENV_FTIBGNAME), terms);
-     terms := terms.semijoin(tid_frq);
-     if (terms.count() = 0) {return new(oid,dbl);}
-    
-     # compute constant factor in score computation
-     var _lmbd := dbl((1.0 - lmbd) / lmbd);
-     var collFrq := bat("tj_" + qenv.find(QENV_FTIBGNAME) + "_Terms").count();
-     _lmbd :*= collFrq;
-     
-     # fetch term occurrences and sort them in preorder
-     var pre_tid := indexfetchjoin(terms.hmark([EMAIL PROTECTED]), 
-                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_TermIndex"), 
-                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_Terms") ).reverse();
-     if (pre_tid.count() = 0) {return new(oid,dbl);}
-     var t1a := time();
-     pre_tid := pre_tid.sort();
-     
-     # evaluate doc/term (anc/desc) relationship
-     var t2 := time();
-     var elem_tid := _containing_desc_tmj(left.sort().mark([EMAIL PROTECTED]), 
pre_tid, pre_size);
-     var t3 := time();
-     pre_tid := nil;
-     if (elem_tid.count() = 0) {return new(oid,dbl);}
-     
-     var res := score_NLLR(elem_tid, pre_size, terms, tid_frq, 
elem_tid.kunique(), _lmbd, q_cnt);
-     # Obey SCOREBASE setting: 
-     if ( int(qenv.find(QENV_SCOREBASE)) = 0 ) {
-        res := [+](left, res);
-     } else {
-        res := [*](left, res);
-     }
-     if ( trace ) tj_trace( "END   p_containing_q_NLLR" );
-     var t4 := time();
-
-     if (timing) printf("# p_containing_q_NLLR(): total time: %d, term 
selection: %d, containmentjoin: %d, score computation: %d\n", t4 - t1, t1a - 
t1, t3 - t2, t4 - t3);
-     return res;
-}
-
-PROC p_containing_q_NLLR_mil(bat[oid,dbl] left, bat[void,str] Qterms, flt 
lmbd, str ind,BAT[oid,str] qenv) : bat[oid,dbl] :=
-{
-    if ( trace ) tj_trace( "BEGIN p_containing_q_NLLR" );
-    var t1 := time();
-    var pre_size := bat("tj_" + qenv.find(QENV_FTINAME) + "_size" + ind);
-    
-     # get term ids and drop all terms with zero frq in col or background-col
-     var terms := _terms2void_tid( Qterms, qenv ).histogram();
-     var q_cnt := Qterms.count();
-     var tid_frq := collTermCount(qenv.find(QENV_FTIBGNAME), terms);
-     terms := terms.semijoin(tid_frq);
-     if (terms.count() = 0) {return new(oid,dbl);}
-    
-     # compute constant factor in score computation
-     var _lmbd := dbl((1.0 - lmbd) / lmbd);
-     var collFrq := bat("tj_" + qenv.find(QENV_FTIBGNAME) + "_Terms").count();
-     _lmbd :*= collFrq;
-     
-     # fetch term occurrences and sort them in preorder
-     var pre_tid := indexfetchjoin(terms.hmark([EMAIL PROTECTED]), 
-                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_TermIndex"), 
-                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_Terms") ).reverse();
-     if (pre_tid.count() = 0) {return new(oid,dbl);}
-     var t1a := time();
-     pre_tid := pre_tid.sort();
-     
-     # evaluate doc/term (anc/desc) relationship
-     var t2 := time();
-     var elem_tid := _containing_desc_tmj(left.sort().mark([EMAIL PROTECTED]), 
pre_tid, pre_size);
-     var t3 := time();
-     pre_tid := nil;
-     if (elem_tid.count() = 0) {return new(oid,dbl);}
-     
-     var res := score_NLLR_mil(elem_tid, pre_size, terms, tid_frq, 
elem_tid.kunique(), _lmbd, q_cnt);
-     
-     # Obey SCOREBASE setting: 
-     if ( int(qenv.find(QENV_SCOREBASE)) = 0 ) {
-        res := [+](left, res);
-     } else {
-        res := [*](left, res);
-     }
-     if ( trace ) tj_trace( "END   p_containing_q_NLLR" );
-     var t4 := time();
-     if (timing) printf("total time: %d, term selection: %d, containmentjoin: 
%d, score computation: %d\n", t4 - t1, t1a - t1, t3 - t2, t4 - t3);
-     return res;
-}
+#PROC p_containing_q_NLLR2(bat[oid,dbl] left, bat[void,str] Qterms, flt lmbd, 
str ind,BAT[oid,str] qenv) : bat[oid,dbl] :=
+#{
+#    if ( trace ) tj_trace( "BEGIN p_containing_q_NLLR" );
+#    var t1 := time();
+#    var pre_size := bat("tj_" + qenv.find(QENV_FTINAME) + "_size" + ind);
+#    
+#     # get term ids and drop all terms with zero frq in col or background-col
+#     var terms := _terms2void_tid( Qterms, qenv ).histogram();
+#     var q_cnt := Qterms.count();
+#     var tid_frq := collTermCount(qenv.find(QENV_FTIBGNAME), terms);
+#     terms := terms.semijoin(tid_frq);
+#     if (terms.count() = 0) {return new(oid,dbl);}
+#    
+#     # compute constant factor in score computation
+#     var _lmbd := dbl((1.0 - lmbd) / lmbd);
+#     var collFrq := bat("tj_" + qenv.find(QENV_FTIBGNAME) + "_Terms").count();
+#     _lmbd :*= collFrq;
+#     
+#     # fetch term occurrences and sort them in preorder
+#     var pre_tid := indexfetchjoin(terms.hmark([EMAIL PROTECTED]), 
+#                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_TermIndex"), 
+#                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_Terms") ).reverse();
+#     if (pre_tid.count() = 0) {return new(oid,dbl);}
+#     var t1a := time();
+#     pre_tid := pre_tid.sort();
+#     
+#     # evaluate doc/term (anc/desc) relationship
+#     var t2 := time();
+#     var elem_tid := _containing_desc_tmj(left.sort().mark([EMAIL 
PROTECTED]), pre_tid, pre_size);
+#     var t3 := time();
+#     pre_tid := nil;
+#     if (elem_tid.count() = 0) {return new(oid,dbl);}
+#     
+#     var res := score_NLLR(elem_tid, pre_size, terms, tid_frq, 
elem_tid.kunique(), _lmbd, q_cnt);
+#     # Obey SCOREBASE setting: 
+#     if ( int(qenv.find(QENV_SCOREBASE)) = 0 ) {
+#        res := [+](left, res);
+#     } else {
+#        res := [*](left, res);
+#     }
+#     if ( trace ) tj_trace( "END   p_containing_q_NLLR" );
+#     var t4 := time();
+#
+#     if (timing) printf("# p_containing_q_NLLR(): total time: %d, term 
selection: %d, containmentjoin: %d, score computation: %d\n", t4 - t1, t1a - 
t1, t3 - t2, t4 - t3);
+#     return res;
+#}
+#
+#PROC p_containing_q_NLLR_mil(bat[oid,dbl] left, bat[void,str] Qterms, flt 
lmbd, str ind,BAT[oid,str] qenv) : bat[oid,dbl] :=
+#{
+#    if ( trace ) tj_trace( "BEGIN p_containing_q_NLLR" );
+#    var t1 := time();
+#    var pre_size := bat("tj_" + qenv.find(QENV_FTINAME) + "_size" + ind);
+#    
+#     # get term ids and drop all terms with zero frq in col or background-col
+#     var terms := _terms2void_tid( Qterms, qenv ).histogram();
+#     var q_cnt := Qterms.count();
+#     var tid_frq := collTermCount(qenv.find(QENV_FTIBGNAME), terms);
+#     terms := terms.semijoin(tid_frq);
+#     if (terms.count() = 0) {return new(oid,dbl);}
+#    
+#     # compute constant factor in score computation
+#     var _lmbd := dbl((1.0 - lmbd) / lmbd);
+#     var collFrq := bat("tj_" + qenv.find(QENV_FTIBGNAME) + "_Terms").count();
+#     _lmbd :*= collFrq;
+#     
+#     # fetch term occurrences and sort them in preorder
+#     var pre_tid := indexfetchjoin(terms.hmark([EMAIL PROTECTED]), 
+#                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_TermIndex"), 
+#                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_Terms") ).reverse();
+#     if (pre_tid.count() = 0) {return new(oid,dbl);}
+#     var t1a := time();
+#     pre_tid := pre_tid.sort();
+#     
+#     # evaluate doc/term (anc/desc) relationship
+#     var t2 := time();
+#     var elem_tid := _containing_desc_tmj(left.sort().mark([EMAIL 
PROTECTED]), pre_tid, pre_size);
+#     var t3 := time();
+#     pre_tid := nil;
+#     if (elem_tid.count() = 0) {return new(oid,dbl);}
+#     
+#     var res := score_NLLR_mil(elem_tid, pre_size, terms, tid_frq, 
elem_tid.kunique(), _lmbd, q_cnt);
+#     
+#     # Obey SCOREBASE setting: 
+#     if ( int(qenv.find(QENV_SCOREBASE)) = 0 ) {
+#        res := [+](left, res);
+#     } else {
+#        res := [*](left, res);
+#     }
+#     if ( trace ) tj_trace( "END   p_containing_q_NLLR" );
+#     var t4 := time();
+#     if (timing) printf("total time: %d, term selection: %d, containmentjoin: 
%d, score computation: %d\n", t4 - t1, t1a - t1, t3 - t2, t4 - t3);
+#     return res;
+#}
 
 ##
 # Implementation of the Language Modeling retrieval model, with smoothing.
@@ -2266,64 +2322,65 @@
 #  tc(tm_i, doc): term count of query term tm_i in doc
 #  len(doc)     : size of doc (term or element size)
 ##
-PROC p_containing_q_LM(bat[oid,dbl] ctx, bat[void,str] Qterms, BAT[oid,str] 
qenv) : bat[oid,dbl]
-{   
-    if ( trace ) tj_trace( "BEGIN p_containing_t_LM_COARSE" );
-    # To follow the naming in the formula above, context regions are named 
"documents".
-    # For each term we need:
-    #  - foreground probability (first term). This depends on the context 
region
-    
-    # Convert the query terms from [void,str] to [void,tid]
-    var terms := _terms2void_tid( Qterms, qenv );
-
-    ### Foreground probability: 
-    # Find out the document positions of the terms for foreground probability
-    var tid_pre := indexfetchjoin(terms, 
-                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_TermIndex"), 
-                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_Terms") );
-    
-    if (tid_pre.count() = 0) { return new(oid,dbl); }
-    tid_pre := tid_pre.tsort();
-   
-    # TODO: fragmentation
-    var pre_size := bat("tj_" + qenv.find(QENV_FTINAME) + "_size1");
-    
-    # See which document contain the query terms we create a bat of [doc, 
term-id]:
-    var elems := ctx.sort().mark(0@0);
-    var doc_tid := _containing_desc_tmj(elems, tid_pre.reverse(), pre_size);
-    
-    # len(doc): [doc, size]
-    var doc_len := [dbl](bat("tj_" + qenv.find(QENV_FTINAME) + 
"_size1").semijoin(elems));
-    
-    ###
-    
-    # Now, we need to compute the probability for each document->term pair
-    var res := elems.project(dbl(1.0));
-    
-    # Iterate over all terms.
-    terms@batloop() {
-        # Compute the first factor
-        # $t contains the term id 
-        var occurrences := doc_tid.select($t).sort();
-       
-        # Count the occurrence of the term in all documents: tc(tm_i, doc), 
-        # for all documents at once
-        var tc_tm_doc  := [dbl](occurrences.reverse().histogram());
-        var foreground := [/](tc_tm_doc, doc_len);
-
-        res := [*](res, foreground);
-    }
-    
-    if ( int(qenv.find(QENV_SCOREBASE)) = 0 ) {
-        # Add the scores to the context set (this should have scores 0, so 
adding is OK)
-        res := [+](ctx, res);
-    } else {
-        # Add the scores to the context set (this should have scores 1, so 
multiplying is OK)
-        res := [*](ctx, res);
-    }
-    if ( trace ) tj_trace( "END   p_containing_t_LMs_COARSE" );
-    return res;
-}
+#PROC p_containing_q_LM2(bat[oid,dbl] ctx, bat[void,str] Qterms, BAT[oid,str] 
qenv) : bat[oid,dbl]
+#{   
+#    if ( trace ) tj_trace( "BEGIN p_containing_t_LM_COARSE" );
+#    # To follow the naming in the formula above, context regions are named 
"documents".
+#    # For each term we need:
+#    #  - foreground probability (first term). This depends on the context 
region
+#    
+#    # Convert the query terms from [void,str] to [void,tid]
+#    var bg_cName := qenv.find(QENV_FTIBGNAME);
+#    var terms := _terms2void_tid( Qterms, bg_cName );
+#
+#    ### Foreground probability: 
+#    # Find out the document positions of the terms for foreground probability
+#    var tid_pre := indexfetchjoin(terms, 
+#                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_TermIndex"), 
+#                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_Terms") );
+#    
+#    if (tid_pre.count() = 0) { return new(oid,dbl); }
+#    tid_pre := tid_pre.tsort();
+#   
+#    # TODO: fragmentation
+#    var pre_size := bat("tj_" + qenv.find(QENV_FTINAME) + "_size1");
+#    
+#    # See which document contain the query terms we create a bat of [doc, 
term-id]:
+#    var elems := ctx.sort().mark(0@0);
+#    var doc_tid := _containing_desc_tmj(elems, tid_pre.reverse(), pre_size);
+#    
+#    # len(doc): [doc, size]
+#    var doc_len := [dbl](bat("tj_" + qenv.find(QENV_FTINAME) + 
"_size1").semijoin(elems));
+#    
+#    ###
+#    
+#    # Now, we need to compute the probability for each document->term pair
+#    var res := elems.project(dbl(1.0));
+#    
+#    # Iterate over all terms.
+#    terms@batloop() {
+#        # Compute the first factor
+#        # $t contains the term id 
+#        var occurrences := doc_tid.select($t).sort();
+#      
+#        # Count the occurrence of the term in all documents: tc(tm_i, doc), 
+#        # for all documents at once
+#        var tc_tm_doc  := [dbl](occurrences.reverse().histogram());
+#        var foreground := [/](tc_tm_doc, doc_len);
+#
+#        res := [*](res, foreground);
+#    }
+#    
+#    if ( int(qenv.find(QENV_SCOREBASE)) = 0 ) {
+#        # Add the scores to the context set (this should have scores 0, so 
adding is OK)
+#        res := [+](ctx, res);
+#    } else {
+#        # Add the scores to the context set (this should have scores 1, so 
multiplying is OK)
+#        res := [*](ctx, res);
+#    }
+#    if ( trace ) tj_trace( "END   p_containing_t_LM_COARSE" );
+#    return res;
+#}
 
 ##
 # Implementation of the Language Modeling retrieval model.
@@ -2369,105 +2426,107 @@
 #  tc(tm_i, doc): term count of query term tm_i in doc
 #  len(doc)     : size of doc (term or element size)
 ##
-PROC p_containing_q_LMs(bat[oid,dbl] ctx, bat[void,str] Qterms, bat modifiers, 
flt lambda, int stemming, int size_type,BAT[oid,str] qenv) : bat[oid,dbl]
-{   
-    if ( trace ) tj_trace( "BEGIN p_containing_t_LMs_COARSE" );
-    # To follow the naming in the formula above, context regions are named 
"documents".
-    # For each term we need:
-    #  - foreground probability (first term). This depends on the context 
region
-    #  - background probability (second term). This is the same for every 
context region
-    
-    # Convert the query terms from [void,str] to [void,tid]
-    var terms := _terms2void_tid( Qterms, qenv );
-
-    ### Background probability:
-    # For each term: collection term frequency tc(tm_i, col):
-    # var col_term_frq := tid_pre.reverse().histogram();
-    # small fix to get LMs running again...
-    var col_term_frq := collTermCount(qenv.find(QENV_FTIBGNAME), 
terms.reverse().project(int(0)));
-
-    # Collection size len(col): int
-    var col_len := bat("tj_" + qenv.find(QENV_FTIBGNAME) + "_Terms").count();
-
-    # Take only the terms that occur in the collection
-    terms := terms.reverse().semijoin(col_term_frq).hmark(0@0);
-
-    if (terms.count() = 0) { return new(oid,dbl); }
-    
-    ###
-    
-    ### Foreground probability: 
-    # Find out the document positions of the terms for foreground probability
-    var tid_pre := indexfetchjoin(terms, 
-                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_TermIndex"), 
-                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_Terms") );
-    
-    if (tid_pre.count() = 0) { return new(oid,dbl); }
-    tid_pre := tid_pre.tsort();
-    
-    # TODO: fragmentation
-    var pre_size := bat("tj_" + qenv.find(QENV_FTINAME) + "_size1");
-    
-    # See which document contain the query terms we create a bat of [doc, 
term-id]:
-    var doc_tid := _containing_desc_tmj(ctx.sort().mark(0@0), 
tid_pre.reverse(), pre_size);
-    
-    # len(doc): [doc, size]
-    var doc_len := [dbl](bat("tj_" + qenv.find(QENV_FTINAME) + 
"_size1").semijoin(doc_tid));
-    
-    ###
-    
-    # Now, we need to compute the probability for each document->term pair
-    var doc_termscore   := new(oid,dbl);
-    var prod_background := dbl(1);
-    
-    # Iterate over all terms.
-    terms@batloop() {
-        # Compute the first factor
-        # $t contains the term id 
-        var occurrences := doc_tid.select($t).sort();
-        
-        # Count the occurrence of the term in all documents: tc(tm_i, doc), 
-        # for all documents at once
-        var tc_tm_doc  := [dbl](occurrences.reverse().histogram());
-        var foreground := [/](tc_tm_doc, doc_len);
-
-        # Compute the background probability: tc(tm_i,col)/len(col)
-        var tc_tm_col  := col_term_frq.find($t); 
-        var background := dbl(tc_tm_col) / dbl(col_len);
-        
-        # Compute the first factor. This generates a [doc, term score] table 
for
-        # each combination of doc and term
-        var total := [+]( [/]( [*]((1.0 - lambda), foreground), lambda * 
background), dbl(1) );
-        doc_termscore.insert( [dbl](total) );
-        
-        # Compute the second factor: product of background statistics over all 
terms
-        prod_background := prod_background * background;
-    }
-    
-    # We now have a table that lists [doc, term score] pairs. These need to be 
aggregated
-    # into [doc, score] pairs: (this is the first aggregate product in the 
formula)
-    
-    var elements := ctx.mark(0@0);
-    
-    if ( not( returnAllElements ) ) 
-        elements := elements.semijoin(doc_tid);
-        
-    var res := {prod}(doc_termscore.tmark(0@0), doc_termscore.hmark(0@0), elements );
-    
-    # Now compute the final scores: multiply by the background score
-    res := [*](res, lambda * prod_background);
-
-    if ( int(qenv.find(QENV_SCOREBASE)) = 0 ) {
-        # Add the scores to the context set (this should have scores 0, so 
adding is OK)
-        res := [+](ctx, res);
-    } else {
-        # Add the scores to the context set (this should have scores 1, so 
multiplying is OK)
-        res := [*](ctx, res);
-    }
-    if ( trace ) tj_trace( "END   p_containing_t_LMs_COARSE" );
-    return res;
-}
-
+#PROC p_containing_q_LMs2(bat[oid,dbl] ctx, bat[void,str] Qterms, bat 
modifiers, flt lambda, int stemming, int size_type,BAT[oid,str] qenv) : 
bat[oid,dbl]
+#{   
+#    if ( trace ) tj_trace( "BEGIN p_containing_t_LMs_COARSE" );
+#    # To follow the naming in the formula above, context regions are named 
"documents".
+#    # For each term we need:
+#    #  - foreground probability (first term). This depends on the context 
region
+#    #  - background probability (second term). This is the same for every 
context region
+#    
+#    # Convert the query terms from [void,str] to [void,tid]
+#    var bg_cName := qenv.find(QENV_FTIBGNAME);
+#    var terms := _terms2void_tid( Qterms, bg_cName );
+#
+#    ### Background probability:
+#    # For each term: collection term frequency tc(tm_i, col):
+#    # var col_term_frq := tid_pre.reverse().histogram();
+#    # small fix to get LMs running again...
+#    var col_term_frq := collTermCount(qenv.find(QENV_FTIBGNAME), 
terms.reverse().project(int(0)));
+#
+#    # Collection size len(col): int
+#    var col_len := bat("tj_" + qenv.find(QENV_FTIBGNAME) + "_Terms").count();
+#
+#    # Take only the terms that occur in the collection
+#    terms := terms.reverse().semijoin(col_term_frq).hmark(0@0);
+#
+#    if (terms.count() = 0) { return new(oid,dbl); }
+#    
+#    ###
+#    
+#    ### Foreground probability: 
+#    # Find out the document positions of the terms for foreground probability
+#    var tid_pre := indexfetchjoin(terms, 
+#                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_TermIndex"), 
+#                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_Terms") );
+#    
+#    if (tid_pre.count() = 0) { return new(oid,dbl); }
+#    tid_pre := tid_pre.tsort();
+#    
+#    # TODO: fragmentation
+#    var pre_size := bat("tj_" + qenv.find(QENV_FTINAME) + "_size1");
+#    
+#    # See which document contain the query terms we create a bat of [doc, 
term-id]:
+#    var doc_tid := _containing_desc_tmj(ctx.sort().mark(0@0), 
tid_pre.reverse(), pre_size);
+#    
+#    # len(doc): [doc, size]
+#    var doc_len := [dbl](bat("tj_" + qenv.find(QENV_FTINAME) + 
"_size1").semijoin(doc_tid));
+#    
+#    ###
+#    
+#    # Now, we need to compute the probability for each document->term pair
+#    var doc_termscore   := new(oid,dbl);
+#    var prod_background := dbl(1);
+#    
+#    # Iterate over all terms.
+#    terms@batloop() {
+#        # Compute the first factor
+#        # $t contains the term id 
+#        var occurrences := doc_tid.select($t).sort();
+#        
+#        # Count the occurrence of the term in all documents: tc(tm_i, doc), 
+#        # for all documents at once
+#        var tc_tm_doc  := [dbl](occurrences.reverse().histogram());
+#        var foreground := [/](tc_tm_doc, doc_len);
+#
+#        # Compute the background probability: tc(tm_i,col)/len(col)
+#        var tc_tm_col  := col_term_frq.find($t); 
+#        var background := dbl(tc_tm_col) / dbl(col_len);
+#        
+#        # Compute the first factor. This generates a [doc, term score] table 
for
+#        # each combination of doc and term
+#        var total := [+]( [/]( [*]((1.0 - lambda), foreground), lambda * 
background), dbl(1) );
+#        doc_termscore.insert( [dbl](total) );
+#        
+#        # Compute the second factor: product of background statistics over 
all terms
+#        prod_background := prod_background * background;
+#    }
+#    
+#    # We now have a table that lists [doc, term score] pairs. These need to 
be aggregated
+#    # into [doc, score] pairs: (this is the first aggregate product in the 
formula)
+#    
+#    var elements := ctx.mark(0@0);
+#    
+#    if ( not( returnAllElements ) ) 
+#        elements := elements.semijoin(doc_tid);
+#        
+#    var res := {prod}(doc_termscore.tmark(0@0), doc_termscore.hmark(0@0), elements );
+#    
+#    # Now compute the final scores: multiply by the background score
+#    res := [*](res, lambda * prod_background);
+#    ctx.print();
+#    qenv.find(QENV_SCOREBASE).print();
+#    if ( int(qenv.find(QENV_SCOREBASE)) = 0 ) {
+#        # Add the scores to the context set (this should have scores 0, so 
adding is OK)
+#        res := [+](ctx, res);
+#    } else {
+#        # Add the scores to the context set (this should have scores 1, so 
multiplying is OK)
+#        res := [*](ctx, res);
+#    }
+#    if ( trace ) tj_trace( "END   p_containing_t_LMs_COARSE" );
+#    return res;
+#}
+#
 ##
 # Returns the collection frequency table (should be precomputed, but is 
calculated for now)
 ##


-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
_______________________________________________
Monetdb-pf-checkins mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins

Reply via email to