Update of /cvsroot/monetdb/pathfinder/modules/pftijah
In directory 23jxhf1.ch3.sourceforge.com:/tmp/cvs-serv3017

Modified Files:
        pftijah.mx 
Log Message:
introduce new text search features:
- phrase search
- mandatory terms 
- negated/disliked terms

Implementing the new features required:
- extending the NEXI grammar (parser)
- introducing a new normalization phase for the query text
- reflecting the changes during optimization
- adding runtime support to execute the new operations on the tijah-index




U pftijah.mx
Index: pftijah.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah.mx,v
retrieving revision 1.224
retrieving revision 1.225
diff -u -d -r1.224 -r1.225
--- pftijah.mx  8 Apr 2009 07:04:55 -0000       1.224
+++ pftijah.mx  16 Apr 2009 10:29:37 -0000      1.225
@@ -2368,7 +2368,7 @@
 @mil
 
 #####################################################################
-# Containing_query
+# Containing_query_term
 #####################################################################
 
 PROC tj_term2tid (bat[str,dbl] term_score) : bat[oid,dbl] :=
@@ -2402,7 +2402,7 @@
 @:getTermDocCnt_nid(nest)@
 @:getTermDocCnt_nid(unnest)@
 @= getTermDocCnt_nid
-PROC _gettermdocc...@1_nid(BAT[oid,oid] e_pre, BAT[void,int] e_size, 
BAT[void,oid] t_pre) : BAT[oid,int] := {
+PROC _gettermdocc...@1_nid(BAT[oid,oid] e_pre, BAT[void,int] e_size, 
BAT[oid,oid] t_pre) : BAT[oid,int] := {
     
     # get doc - term relation
     var e_tPre := treemergejo...@1_nid(e_pre, e_size, t_pre);
@@ -2412,7 +2412,7 @@
 @:getTermDocCnt_pre(nest)@
 @:getTermDocCnt_pre(unnest)@
 @= getTermDocCnt_pre
-PROC _gettermdocc...@1_pre(BAT[oid,any] e_pre, BAT[void,int] e_size, 
BAT[void,oid] t_pre) : BAT[oid,int] := {
+PROC _gettermdocc...@1_pre(BAT[oid,any] e_pre, BAT[void,int] e_size, 
BAT[oid,oid] t_pre) : BAT[oid,int] := {
     
     # get doc - term relation
     var e_tPre := treemergejo...@1_pre(e_pre, e_size, t_pre.reverse());
@@ -2516,12 +2516,12 @@
 }
 
 
-@:containing_query_LM(nest,nid,TagSize)@
-@:containing_query_LM(unnest,nid,TagSize)@
-@:containing_query_LM(nest,pre,size1)@
-@:containing_query_LM(unnest,pre,size1)@
-...@= containing_query_LM
-PROC tj_containing_que...@1_@2_LM (bat[oid,any] left, bat[oid,dbl] query) : 
bat[oid,dbl] :=
+@:containing_query_term_LM(nest,nid,TagSize)@
+@:containing_query_term_LM(unnest,nid,TagSize)@
+@:containing_query_term_LM(nest,pre,size1)@
+@:containing_query_term_LM(unnest,pre,size1)@
+...@= containing_query_term_LM
+PROC tj_containing_que...@1_@2_term_LM (bat[oid,any] left, bat[oid,dbl] query) 
: bat[oid,dbl] :=
 {
     var t_total := 0;
     t_total :-= time();
@@ -2569,12 +2569,12 @@
 @mil
 
 
-@:containing_query_LMs(nest,nid,TagSize)@
-@:containing_query_LMs(unnest,nid,TagSize)@
-@:containing_query_LMs(nest,pre,size1)@
-@:containing_query_LMs(unnest,pre,size1)@
-...@= containing_query_LMs
-PROC tj_containing_que...@1_@2_LMs (bat[oid,any] left, bat[oid,dbl] query) : 
bat[oid,dbl] :=
+@:containing_query_term_LMs(nest,nid,TagSize)@
+@:containing_query_term_LMs(unnest,nid,TagSize)@
+@:containing_query_term_LMs(nest,pre,size1)@
+@:containing_query_term_LMs(unnest,pre,size1)@
+...@= containing_query_term_LMs
+PROC tj_containing_que...@1_@2_term_LMs (bat[oid,any] left, bat[oid,dbl] 
query) : bat[oid,dbl] :=
 {
     var t_total := 0;
     t_total :-= time();
@@ -2625,12 +2625,12 @@
 @mil
 
 
-@:containing_query_NLLR(nest,nid,TagSize)@
-@:containing_query_NLLR(unnest,nid,TagSize)@
-@:containing_query_NLLR(nest,pre,size1)@
-@:containing_query_NLLR(unnest,pre,size1)@
-...@= containing_query_NLLR
-PROC tj_containing_que...@1_@2_NLLR (bat[oid,any] left, bat[oid,dbl] query) : 
bat[oid,dbl] :=
+@:containing_query_term_NLLR(nest,nid,TagSize)@
+@:containing_query_term_NLLR(unnest,nid,TagSize)@
+@:containing_query_term_NLLR(nest,pre,size1)@
+@:containing_query_term_NLLR(unnest,pre,size1)@
+...@= containing_query_term_NLLR
+PROC tj_containing_que...@1_@2_term_NLLR (bat[oid,any] left, bat[oid,dbl] 
query) : bat[oid,dbl] :=
 {
     var t_total := 0;
     t_total :-= time();
@@ -2678,12 +2678,12 @@
 @mil
 
 
-@:containing_query_OKAPI(nest,nid,TagSize)@
-@:containing_query_OKAPI(unnest,nid,TagSize)@
-@:containing_query_OKAPI(nest,pre,size1)@
-@:containing_query_OKAPI(unnest,pre,size1)@
-...@= containing_query_OKAPI
-PROC tj_containing_que...@1_@2_OKAPI (bat[oid,any] left, bat[oid,dbl] query) : 
bat[oid,dbl] :=
+@:containing_query_term_OKAPI(nest,nid,TagSize)@
+@:containing_query_term_OKAPI(unnest,nid,TagSize)@
+@:containing_query_term_OKAPI(nest,pre,size1)@
+@:containing_query_term_OKAPI(unnest,pre,size1)@
+...@= containing_query_term_OKAPI
+PROC tj_containing_que...@1_@2_term_OKAPI (bat[oid,any] left, bat[oid,dbl] 
query) : bat[oid,dbl] :=
 {
     var t_total := 0;
     t_total :-= time();
@@ -2730,8 +2730,108 @@
 }
 @mil
 
+@:getTermDoc_nid(nest)@
+@:getTermDoc_nid(unnest)@
+...@= getTermDoc_nid
+PROC _gettermd...@1_nid(BAT[oid,oid] e_pre, BAT[void,int] e_size, BAT[oid,oid] 
t_pre) : BAT[oid,int] := {
+    
+    # get doc - term relation
+    return treemergejo...@1_nid(e_pre, e_size, t_pre);
+}
+...@mil
+@:getTermDoc_pre(nest)@
+@:getTermDoc_pre(unnest)@
+...@= getTermDoc_pre
+PROC _gettermd...@1_pre(BAT[oid,any] e_pre, BAT[void,int] e_size, BAT[oid,oid] 
t_pre) : BAT[oid,int] := {
+    
+    # get doc - term relation
+    return treemergejo...@1_pre(e_pre, e_size, t_pre.reverse());
+}
+...@mil
+
+@:containing_query_term(nest,nid,plus,TagSize,semijoin)@
+@:containing_query_term(unnest,nid,plus,TagSize,semijoin)@
+@:containing_query_term(nest,pre,plus,size1,semijoin)@
+@:containing_query_term(unnest,pre,plus,size1,semijoin)@
+@:containing_query_term(nest,nid,min,TagSize,kdiff)@
+@:containing_query_term(unnest,nid,min,TagSize,kdiff)@
+@:containing_query_term(nest,pre,min,size1,kdiff)@
+@:containing_query_term(unnest,pre,min,size1,kdiff)@
+...@= containing_query_term
+PROC tj_containing_que...@1_@2_te...@3 (bat[oid,any] left, bat[oid,dbl] query) 
: bat[oid,dbl] :=
+{
+    var t_total := 0;
+    t_total :-= time();
+    
+    if ( count(left) = 0 ) return new(oid,dbl);
+    if ( count(query) = 0 ) return new(oid,dbl);
+    
+    var e_pre := left.chk_order();
+    var e_size := bat("tj_" + ftindex + "_...@4");
+    
+    # loop over query terms
+    qu...@batloop()
+    {
+        # get collection count of term
+        var t_pre := _getTermPositions($h);
+        e_pre := e_p...@5(_gettermd...@1_@2(e_pre, e_size, t_pre));
+    }
+    
+    var res := e_pre;
+    
+    t_total :+= time();
+    if (timing) printf("# te...@3 timing: total: %d\n", t_total);
+    return res;
+}
+...@mil
+
+#returns phrases (nid,pre) of first phrase term
+PROC _selectPhrase(bat[oid,dbl] query) : bat[oid,oid] :=
+{
+    # Select the term positions from the global term dictionary. 
+    var terms := query.reverse();
+    var t_pre := _getTermPositions(terms.fetch(0));
+    
+    var res := t_pre.reverse();
+    var j := terms.count();
+    var i := 1; 
+    while (i < j)
+    {
+        t_pre := _getTermPositions(terms.fetch(i));
+        res := res.semijoin(t_pre.[int]().[-](i).[oid]().reverse());   
+       i :+= 1;
+    }  
+    
+    return res.reverse().chk_order(); 
+}
+
+@:containing_query_phrase(nest,nid,TagSize)@
+@:containing_query_phrase(unnest,nid,TagSize)@
+@:containing_query_phrase(nest,pre,size1)@
+@:containing_query_phrase(unnest,pre,size1)@
+...@= containing_query_phrase
+PROC tj_containing_que...@1_@2_phrase (bat[oid,any] left, bat[oid,dbl] query) 
: bat[oid,dbl] :=
+{
+    var t_total := 0;
+    t_total :-= time();
+    
+    if ( count(left) = 0 ) return new(oid,dbl);
+    if ( count(query) = 0 ) return new(oid,dbl);
+    
+    var e_pre := left.chk_order();
+    var e_size := bat("tj_" + ftindex + "_...@3");
+    var t_pre := _selectPhrase(query);
+    var res := e_pre.semijoin(_gettermd...@1_@2(e_pre, e_size, t_pre));
+    
+    t_total :+= time();
+    if (timing) printf("# phrase timing: total: %d\n", t_total);
+    return res;
+}
+...@mil
+
+
 #####################################################################
-# Containing_conceptquery
+# Containing_query_entity
 #####################################################################
 
 PROC tj_ent2tid (bat[str,dbl] concept_score) : bat[oid,dbl] :=
@@ -2755,7 +2855,7 @@
 @:getConceptDocScr_nid(nest)@
 @:getConceptDocScr_nid(unnest)@
 @= getConceptDocScr_nid
-PROC _getconceptdocs...@1_nid(BAT[oid,oid] e_pre, BAT[void,int] e_size, 
BAT[void,oid] c_pre) : BAT[oid,int] := {
+PROC _getconceptdocs...@1_nid(BAT[oid,oid] e_pre, BAT[void,int] e_size, 
BAT[oid,oid] c_pre) : BAT[oid,int] := {
      
        # get doc - Concept relation
        var c_Scr := bat("tj_" + ftindex + "_ConceptScore");
@@ -2769,7 +2869,7 @@
 @:getConceptDocScr_pre(nest)@
 @:getConceptDocScr_pre(unnest)@
 @= getConceptDocScr_pre
-PROC _getconceptdocs...@1_pre(BAT[oid,any] e_pre, BAT[void,int] e_size, 
BAT[void,oid] c_pre) : BAT[oid,int] := {
+PROC _getconceptdocs...@1_pre(BAT[oid,any] e_pre, BAT[void,int] e_size, 
BAT[oid,oid] c_pre) : BAT[oid,int] := {
      
        # get doc - Concept relation
        var c_Scr := bat("tj_" + ftindex + "_ConceptScore");
@@ -2796,12 +2896,12 @@
     return e_cScores;
 }
 
-@:containing_conceptquery_LogSum(nest,nid,TagSize)@
-@:containing_conceptquery_LogSum(unnest,nid,TagSize)@
-@:containing_conceptquery_LogSum(nest,pre,size1)@
-@:containing_conceptquery_LogSum(unnest,pre,size1)@
-...@= containing_conceptquery_LogSum
-PROC tj_containing_conceptque...@1_@2_LogSum (bat[oid,any] left, bat[oid,dbl] 
query) : bat[oid,dbl] :=
+@:containing_query_entity_LogSum(nest,nid,TagSize)@
+@:containing_query_entity_LogSum(unnest,nid,TagSize)@
+@:containing_query_entity_LogSum(nest,pre,size1)@
+@:containing_query_entity_LogSum(unnest,pre,size1)@
+...@= containing_query_entity_LogSum
+PROC tj_containing_que...@1_@2_entity_LogSum (bat[oid,any] left, bat[oid,dbl] 
query) : bat[oid,dbl] :=
 {
     var t_total := 0;
     t_total :-= time();
@@ -2846,6 +2946,42 @@
 }
 @mil
 
+@:containing_query_entity(nest,nid,plus,TagSize,semijoin)@
+@:containing_query_entity(unnest,nid,plus,TagSize,semijoin)@
+@:containing_query_entity(nest,pre,plus,size1,semijoin)@
+@:containing_query_entity(unnest,pre,plus,size1,semijoin)@
+@:containing_query_entity(nest,nid,min,TagSize,kdiff)@
+@:containing_query_entity(unnest,nid,min,TagSize,kdiff)@
+@:containing_query_entity(nest,pre,min,size1,kdiff)@
+@:containing_query_entity(unnest,pre,min,size1,kdiff)@
+...@= containing_query_entity
+PROC tj_containing_que...@1_@2_enti...@3 (bat[oid,any] left, bat[oid,dbl] 
query) : bat[oid,dbl] :=
+{
+    var t_total := 0;
+    t_total :-= time();
+    
+    if ( count(left) = 0 ) return new(oid,dbl);
+    if ( count(query) = 0 ) return new(oid,dbl);
+    
+    var e_pre := left.chk_order();
+    var e_size := bat("tj_" + ftindex + "_...@4");
+    
+    # loop over query terms
+    qu...@batloop()
+    {
+        # get collection count of term
+        var t_pre := _getConceptPositions($h);
+        e_pre := e_p...@5(_gettermd...@1_@2(e_pre, e_size, t_pre));
+    }
+    
+    var res := e_pre;
+    
+    t_total :+= time();
+    if (timing) printf("# enti...@3 timing: total: %d\n", t_total);
+    return res;
+}
+...@mil
+
 
 #####################################################################
 # and/or combination


------------------------------------------------------------------------------
Stay on top of everything new and different, both inside and 
around Java (TM) technology - register by April 22, and save
$200 on the JavaOne (SM) conference, June 2-5, 2009, San Francisco.
300 plus technical and hands-on sessions. Register today. 
Use priority code J9JMT32. http://p.sf.net/sfu/p
_______________________________________________
Monetdb-pf-checkins mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins

Reply via email to