Update of /cvsroot/monetdb/pathfinder/modules/pftijah
In directory sc8-pr-cvs16.sourceforge.net:/tmp/cvs-serv22930

Modified Files:
        nexi.c nexi_generate_mil.c pftijah.mx 
Log Message:
corrected usage of the collection lambda: it was used in exactly the opposite
way from the document lambda in the LMS retrieval model.

furthermore, included an implementation of the old LM model, i.e. a language
model without smoothing. this model ensures that "all" terms have to be found
within a document.



Index: nexi_generate_mil.c
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/nexi_generate_mil.c,v
retrieving revision 1.32
retrieving revision 1.33
diff -u -d -r1.32 -r1.33
--- nexi_generate_mil.c 29 May 2007 15:09:02 -0000      1.32
+++ nexi_generate_mil.c 30 May 2007 14:08:09 -0000      1.33
@@ -708,7 +708,7 @@
 
         case MODEL_LM :
 
-          MILPRINTF(MILOUT, "R%d := R%d.p_containing_q_LM(terms, modifiers, 
%d, %d,qenv);\n", com_num, com_nr_left, txt_retr_model->stemming, 
txt_retr_model->size_type);
+          MILPRINTF(MILOUT, "R%d := R%d.p_containing_q_LM(terms, qenv);\n", 
com_num, com_nr_left);
 
           break;
 

Index: nexi.c
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/nexi.c,v
retrieving revision 1.63
retrieving revision 1.64
diff -u -d -r1.63 -r1.64
--- nexi.c      30 May 2007 14:01:52 -0000      1.63
+++ nexi.c      30 May 2007 14:08:08 -0000      1.64
@@ -368,14 +368,14 @@
                 txt_retr_model->down_prop = DOWN_WSUMA;
             }
             
-        } else if ( strcmp(optName,"txtmodel_collectionLambda") == 0 || 
-                    strcmp(optName,"txtmodel_param1") == 0) {
+        } else if ( strcmp(optName,"collection-lambda") == 0 || 
+                    strcmp(optName,"ir-model-param1") == 0) {
             txt_retr_model->param1 = atof( optVal );
 
-        } else if ( strcmp(optName,"txtmodel_param2") == 0 ) {
+        } else if ( strcmp(optName,"ir-model-param2") == 0 ) {
             txt_retr_model->param2 = atof( optVal );
             
-        } else if ( strcmp(optName,"txtmodel_param3") == 0 ) {
+        } else if ( strcmp(optName,"ir-model-param3") == 0 ) {
             txt_retr_model->param3 = atof( optVal );
         
         } else if ( strcmp(optName,"txtmodel_returnall") == 0 ) {

Index: pftijah.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah.mx,v
retrieving revision 1.128
retrieving revision 1.129
diff -u -d -r1.128 -r1.129
--- pftijah.mx  30 May 2007 14:01:53 -0000      1.128
+++ pftijah.mx  30 May 2007 14:08:09 -0000      1.129
@@ -2126,6 +2126,105 @@
 # var modifiers := new(void,int).seqbase(oid(0));
 # terms.append("information");
 # terms.append("retrieval");
+# var R2 := p_containing_t_LM(R1, terms, 1, SIZE_TERM);
+#
+# R2 now contains all regions in R1, however with scores attached according
+# to the occurrence of the terms "information" and "retrieval"
+#
+#
+# The score value for a single term and document is defined as follows:
+#                                                  
+#  S(doc|tm) = FG(tm) 
+#
+# where 
+#            tc(tm,doc)
+#  FG(tm) = ------------       (foreground statistics)
+#             len(doc)
+#
+# To compute score values for a set of terms at once, this function implements 
+# presence weighting (thanks to Thijs for pointing this out), using the 
following 
+# formula:
+#
+#                ___             
+# S(doc|q) =     | |     FG(tm_i)
+#             tm_i in q          
+#
+# where:                                                   
+#  q            : query: set of query terms
+#  tm_i         : query term
+#  doc          : search document or, in this case, context region
+#  tc(tm_i, doc): term count of query term tm_i in doc
+#  len(doc)     : size of doc (term or element size)
+##
+PROC p_containing_q_LM(bat[oid,dbl] ctx, bat[void,str] Qterms, BAT[oid,str] 
qenv) : bat[oid,dbl]
+{   
+    if ( trace ) tj_trace( "BEGIN p_containing_t_LM_COARSE" );
+    # To follow the naming in the formula above, context regions are named 
"documents".
+    # For each term we need:
+    #  - foreground probability (first term). This depends on the context 
region
+    
+    # Convert the query terms from [void,str] to [void,tid]
+    var terms := Qterms_to_void_tid( Qterms, qenv );
+
+    ### Foreground probability: 
+    # Find out the document positions of the terms for foreground probability
+    var tid_pre := indexfetchjoin(terms, 
+                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_TermIndex"), 
+                                   bat("tj_" + qenv.find(QENV_FTINAME) + 
"_Terms") );
+    
+    if (tid_pre.count() = 0) { return new(oid,dbl); }
+    tid_pre := tid_pre.tsort();
+   
+    # TODO: fragmentation
+    var pre_size := bat("tj_" + qenv.find(QENV_FTINAME) + "_size1");
+    
+    # See which document contain the query terms we create a bat of [doc, 
term-id]:
+    var elems := ctx.sort().mark([EMAIL PROTECTED]);
+    var doc_tid := _containing_desc3(elems, tid_pre.reverse(), pre_size,qenv);
+    
+    # len(doc): [doc, size]
+    var doc_len := [dbl](bat("tj_" + qenv.find(QENV_FTINAME) + 
"_size1").semijoin(elems));
+    
+    ###
+    
+    # Now, we need to compute the probability for each document->term pair
+    var res := elems.project(dbl(1.0));
+    
+    # Iterate over all terms.
+    [EMAIL PROTECTED]() {
+        # Compute the first factor
+        # $t contains the term id 
+        var occurrences := doc_tid.select($t).sort();
+       
+        # Count the occurrence of the term in all documents: tc(tm_i, doc), 
+        # for all documents at once
+        var tc_tm_doc  := [dbl](occurrences.reverse().histogram());
+        var foreground := [/](tc_tm_doc, doc_len);
+
+        res := [*](res, foreground);
+    }
+    
+    if ( int(qenv.find(QENV_SCOREBASE)) = 0 ) {
+        # Add the scores to the context set (this should have scores 0, so 
adding is OK)
+        res := [+](ctx, res);
+    } else {
+        # Add the scores to the context set (this should have scores 1, so 
multiplying is OK)
+        res := [*](ctx, res);
+    }
+    if ( trace ) tj_trace( "END   p_containing_t_LMs_COARSE" );
+    return res;
+}
+
+##
+# Implementation of the Language Modeling retrieval model.
+#
+# This function is meant for set-based score computation (COARSE2 algebra), 
e.g.:
+#
+# var R1 := select_node("article");
+# var terms := new(void,str).seqbase(oid(0));
+# var modifiers := new(void,int).seqbase(oid(0));
+# terms.append("information");
+# terms.append("retrieval");
 # var R2 := p_containing_t_LMs(R1, terms, modifiers, 0.5, 1, SIZE_TERM);
 #
 # R2 now contains all regions in R1, however with scores attached according
@@ -2134,7 +2233,7 @@
 #
 # The score value for a single term and document is defined as follows:
 #                                                  
-#  S(doc|tm) = lambda * FG(tm) + (1 - lambda) * BG(tm)
+#  S(doc|tm) = (1 - lambda) * FG(tm) + lambda * BG(tm)
 #
 # where 
 #            tc(tm,doc)
@@ -2149,9 +2248,9 @@
 # presence weighting (thanks to Thijs for pointing this out), using the 
following 
 # formula:
 #
-#            /    ___       lambda * FG(tm_i)         \        ___     
-# S(doc|q) = |    | |     ----------------------  + 1 |  *     | |    (1 - 
lambda) * BG(tm_i)
-#            \ tm_i in q  (1 - lambda) * BG(tm_i)     /     tm_i in q  
+#            /    ___     (1 - lambda) * FG(tm_i)     \        ___     
+# S(doc|q) = |    | |     ----------------------  + 1 |  *     | |    lambda * 
BG(tm_i)
+#            \ tm_i in q      lambda * BG(tm_i)       /     tm_i in q  
 #
 # where:                                                   
 #  q            : query: set of query terms
@@ -2228,7 +2327,7 @@
         
         # Compute the first factor. This generates a [doc, term score] table 
for
         # each combination of doc and term
-        var total := [+]( [/]( [*](lambda, foreground), (1.0 - lambda) * 
background), dbl(1) );
+        var total := [+]( [/]( [*]((1.0 - lambda), foreground), lambda * 
background), dbl(1) );
         doc_termscore.insert( [dbl](total) );
         
         # Compute the second factor: product of background statistics over all 
terms
@@ -2237,7 +2336,6 @@
     
     # We now have a table that lists [doc, term score] pairs. These need to be 
aggregated
     # into [doc, score] pairs: (this is the first aggregate product in the 
formula)
-    #var res := {prod}(doc_termscore, ctx.mark(oid(0)));
     
     var elements := ctx.mark([EMAIL PROTECTED]);
     
@@ -2247,7 +2345,7 @@
     var res := {prod}(doc_termscore.tmark([EMAIL PROTECTED]), 
doc_termscore.hmark([EMAIL PROTECTED]), elements );
     
     # Now compute the final scores: multiply by the background score
-    res := [*](res, (1.0 - lambda) * prod_background);
+    res := [*](res, lambda * prod_background);
 
     if ( int(qenv.find(QENV_SCOREBASE)) = 0 ) {
         # Add the scores to the context set (this should have scores 0, so 
adding is OK)


-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
_______________________________________________
Monetdb-pf-checkins mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins

Reply via email to