Update of /cvsroot/monetdb/pathfinder/modules/pftijah
In directory 23jxhf1.ch3.sourceforge.com:/tmp/cvs-serv3017
Modified Files:
pftijah.mx
Log Message:
introduce new text search features:
- phrase search
- mandatory terms
- negated/disliked terms
Implementing the new features required us to:
- extend the NEXI grammar (parser)
- introduce a new normalization phase for the query text
- reflect the changes during optimization
- add runtime support to execute the new operations on the tijah-index
U pftijah.mx
Index: pftijah.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah.mx,v
retrieving revision 1.224
retrieving revision 1.225
diff -u -d -r1.224 -r1.225
--- pftijah.mx 8 Apr 2009 07:04:55 -0000 1.224
+++ pftijah.mx 16 Apr 2009 10:29:37 -0000 1.225
@@ -2368,7 +2368,7 @@
@mil
#####################################################################
-# Containing_query
+# Containing_query_term
#####################################################################
PROC tj_term2tid (bat[str,dbl] term_score) : bat[oid,dbl] :=
@@ -2402,7 +2402,7 @@
@:getTermDocCnt_nid(nest)@
@:getTermDocCnt_nid(unnest)@
@= getTermDocCnt_nid
-PROC _gettermdocc...@1_nid(BAT[oid,oid] e_pre, BAT[void,int] e_size,
BAT[void,oid] t_pre) : BAT[oid,int] := {
+PROC _gettermdocc...@1_nid(BAT[oid,oid] e_pre, BAT[void,int] e_size,
BAT[oid,oid] t_pre) : BAT[oid,int] := {
# get doc - term relation
var e_tPre := treemergejo...@1_nid(e_pre, e_size, t_pre);
@@ -2412,7 +2412,7 @@
@:getTermDocCnt_pre(nest)@
@:getTermDocCnt_pre(unnest)@
@= getTermDocCnt_pre
-PROC _gettermdocc...@1_pre(BAT[oid,any] e_pre, BAT[void,int] e_size,
BAT[void,oid] t_pre) : BAT[oid,int] := {
+PROC _gettermdocc...@1_pre(BAT[oid,any] e_pre, BAT[void,int] e_size,
BAT[oid,oid] t_pre) : BAT[oid,int] := {
# get doc - term relation
var e_tPre := treemergejo...@1_pre(e_pre, e_size, t_pre.reverse());
@@ -2516,12 +2516,12 @@
}
-@:containing_query_LM(nest,nid,TagSize)@
-@:containing_query_LM(unnest,nid,TagSize)@
-@:containing_query_LM(nest,pre,size1)@
-@:containing_query_LM(unnest,pre,size1)@
-...@= containing_query_LM
-PROC tj_containing_que...@1_@2_LM (bat[oid,any] left, bat[oid,dbl] query) :
bat[oid,dbl] :=
+@:containing_query_term_LM(nest,nid,TagSize)@
+@:containing_query_term_LM(unnest,nid,TagSize)@
+@:containing_query_term_LM(nest,pre,size1)@
+@:containing_query_term_LM(unnest,pre,size1)@
+...@= containing_query_term_LM
+PROC tj_containing_que...@1_@2_term_LM (bat[oid,any] left, bat[oid,dbl] query)
: bat[oid,dbl] :=
{
var t_total := 0;
t_total :-= time();
@@ -2569,12 +2569,12 @@
@mil
-@:containing_query_LMs(nest,nid,TagSize)@
-@:containing_query_LMs(unnest,nid,TagSize)@
-@:containing_query_LMs(nest,pre,size1)@
-@:containing_query_LMs(unnest,pre,size1)@
-...@= containing_query_LMs
-PROC tj_containing_que...@1_@2_LMs (bat[oid,any] left, bat[oid,dbl] query) :
bat[oid,dbl] :=
+@:containing_query_term_LMs(nest,nid,TagSize)@
+@:containing_query_term_LMs(unnest,nid,TagSize)@
+@:containing_query_term_LMs(nest,pre,size1)@
+@:containing_query_term_LMs(unnest,pre,size1)@
+...@= containing_query_term_LMs
+PROC tj_containing_que...@1_@2_term_LMs (bat[oid,any] left, bat[oid,dbl]
query) : bat[oid,dbl] :=
{
var t_total := 0;
t_total :-= time();
@@ -2625,12 +2625,12 @@
@mil
-@:containing_query_NLLR(nest,nid,TagSize)@
-@:containing_query_NLLR(unnest,nid,TagSize)@
-@:containing_query_NLLR(nest,pre,size1)@
-@:containing_query_NLLR(unnest,pre,size1)@
-...@= containing_query_NLLR
-PROC tj_containing_que...@1_@2_NLLR (bat[oid,any] left, bat[oid,dbl] query) :
bat[oid,dbl] :=
+@:containing_query_term_NLLR(nest,nid,TagSize)@
+@:containing_query_term_NLLR(unnest,nid,TagSize)@
+@:containing_query_term_NLLR(nest,pre,size1)@
+@:containing_query_term_NLLR(unnest,pre,size1)@
+...@= containing_query_term_NLLR
+PROC tj_containing_que...@1_@2_term_NLLR (bat[oid,any] left, bat[oid,dbl]
query) : bat[oid,dbl] :=
{
var t_total := 0;
t_total :-= time();
@@ -2678,12 +2678,12 @@
@mil
-@:containing_query_OKAPI(nest,nid,TagSize)@
-@:containing_query_OKAPI(unnest,nid,TagSize)@
-@:containing_query_OKAPI(nest,pre,size1)@
-@:containing_query_OKAPI(unnest,pre,size1)@
-...@= containing_query_OKAPI
-PROC tj_containing_que...@1_@2_OKAPI (bat[oid,any] left, bat[oid,dbl] query) :
bat[oid,dbl] :=
+@:containing_query_term_OKAPI(nest,nid,TagSize)@
+@:containing_query_term_OKAPI(unnest,nid,TagSize)@
+@:containing_query_term_OKAPI(nest,pre,size1)@
+@:containing_query_term_OKAPI(unnest,pre,size1)@
+...@= containing_query_term_OKAPI
+PROC tj_containing_que...@1_@2_term_OKAPI (bat[oid,any] left, bat[oid,dbl]
query) : bat[oid,dbl] :=
{
var t_total := 0;
t_total :-= time();
@@ -2730,8 +2730,108 @@
}
@mil
+@:getTermDoc_nid(nest)@
+@:getTermDoc_nid(unnest)@
+@= getTermDoc_nid
+PROC _getTermDoc_@1_nid(BAT[oid,oid] e_pre, BAT[void,int] e_size, BAT[oid,oid] t_pre) : BAT[oid,int] := {
+ # Macro parameter @1 (nest/unnest) selects which treemergejoin variant is called.
+ # get doc - term relation
+ return treemergejoin_@1_nid(e_pre, e_size, t_pre);
+}
+@mil
+@:getTermDoc_pre(nest)@
+@:getTermDoc_pre(unnest)@
+@= getTermDoc_pre
+PROC _getTermDoc_@1_pre(BAT[oid,any] e_pre, BAT[void,int] e_size, BAT[oid,oid] t_pre) : BAT[oid,int] := {
+ # Macro parameter @1 (nest/unnest) selects the treemergejoin variant; t_pre is passed reversed.
+ # get doc - term relation
+ return treemergejoin_@1_pre(e_pre, e_size, t_pre.reverse());
+}
+@mil
+
+@:containing_query_term(nest,nid,plus,TagSize,semijoin)@
+@:containing_query_term(unnest,nid,plus,TagSize,semijoin)@
+@:containing_query_term(nest,pre,plus,size1,semijoin)@
+@:containing_query_term(unnest,pre,plus,size1,semijoin)@
+@:containing_query_term(nest,nid,min,TagSize,kdiff)@
+@:containing_query_term(unnest,nid,min,TagSize,kdiff)@
+@:containing_query_term(nest,pre,min,size1,kdiff)@
+@:containing_query_term(unnest,pre,min,size1,kdiff)@
+@= containing_query_term
+PROC tj_containing_query_@1_@2_term_@3 (bat[oid,any] left, bat[oid,dbl] query) : bat[oid,dbl] :=
+{
+ var t_total := 0;
+ t_total :-= time();
+
+ if ( count(left) = 0 ) return new(oid,dbl);
+ if ( count(query) = 0 ) return new(oid,dbl);
+ # Macro args: @1 nest/unnest, @2 nid/pre, @3 plus/min, @4 size BAT suffix, @5 semijoin (plus) or kdiff (min).
+ var e_pre := left.chk_order();
+ var e_size := bat("tj_" + ftindex + "_@4");
+
+ # loop over query terms
+ query@batloop()
+ {
+ # get the index positions of the term and narrow the candidates with @5
+ var t_pre := _getTermPositions($h);
+ e_pre := e_pre.@5(_getTermDoc_@1_@2(e_pre, e_size, t_pre));
+ }
+
+ var res := e_pre;
+
+ t_total :+= time();
+ if (timing) printf("# term_@3 timing: total: %d\n", t_total);
+ return res;
+}
+@mil
+
+# Returns the (nid,pre) entries of the first phrase term for which every later term of the phrase also matches.
+PROC _selectPhrase(bat[oid,dbl] query) : bat[oid,oid] :=
+{
+ # Select the term positions from the global term dictionary.
+ var terms := query.reverse();
+ var t_pre := _getTermPositions(terms.fetch(0));
+ # For each later term i: subtract i from its tail values (presumably positions) and keep only first-term entries that match.
+ var res := t_pre.reverse();
+ var j := terms.count();
+ var i := 1;
+ while (i < j)
+ {
+ t_pre := _getTermPositions(terms.fetch(i));
+ res := res.semijoin(t_pre.[int]().[-](i).[oid]().reverse());
+ i :+= 1;
+ }
+ # res was built on the reversed BAT; reverse it back before returning.
+ return res.reverse().chk_order();
+}
+
+@:containing_query_phrase(nest,nid,TagSize)@
+@:containing_query_phrase(unnest,nid,TagSize)@
+@:containing_query_phrase(nest,pre,size1)@
+@:containing_query_phrase(unnest,pre,size1)@
+@= containing_query_phrase
+PROC tj_containing_query_@1_@2_phrase (bat[oid,any] left, bat[oid,dbl] query) : bat[oid,dbl] :=
+{
+ var t_total := 0;
+ t_total :-= time();
+
+ if ( count(left) = 0 ) return new(oid,dbl);
+ if ( count(query) = 0 ) return new(oid,dbl);
+ # Macro args: @1 nest/unnest, @2 nid/pre, @3 size BAT suffix; left is semijoined with the phrase matches.
+ var e_pre := left.chk_order();
+ var e_size := bat("tj_" + ftindex + "_@3");
+ var t_pre := _selectPhrase(query);
+ var res := e_pre.semijoin(_getTermDoc_@1_@2(e_pre, e_size, t_pre));
+
+ t_total :+= time();
+ if (timing) printf("# phrase timing: total: %d\n", t_total);
+ return res;
+}
+@mil
+
+
#####################################################################
-# Containing_conceptquery
+# Containing_query_entity
#####################################################################
PROC tj_ent2tid (bat[str,dbl] concept_score) : bat[oid,dbl] :=
@@ -2755,7 +2855,7 @@
@:getConceptDocScr_nid(nest)@
@:getConceptDocScr_nid(unnest)@
@= getConceptDocScr_nid
-PROC _getconceptdocs...@1_nid(BAT[oid,oid] e_pre, BAT[void,int] e_size,
BAT[void,oid] c_pre) : BAT[oid,int] := {
+PROC _getconceptdocs...@1_nid(BAT[oid,oid] e_pre, BAT[void,int] e_size,
BAT[oid,oid] c_pre) : BAT[oid,int] := {
# get doc - Concept relation
var c_Scr := bat("tj_" + ftindex + "_ConceptScore");
@@ -2769,7 +2869,7 @@
@:getConceptDocScr_pre(nest)@
@:getConceptDocScr_pre(unnest)@
@= getConceptDocScr_pre
-PROC _getconceptdocs...@1_pre(BAT[oid,any] e_pre, BAT[void,int] e_size,
BAT[void,oid] c_pre) : BAT[oid,int] := {
+PROC _getconceptdocs...@1_pre(BAT[oid,any] e_pre, BAT[void,int] e_size,
BAT[oid,oid] c_pre) : BAT[oid,int] := {
# get doc - Concept relation
var c_Scr := bat("tj_" + ftindex + "_ConceptScore");
@@ -2796,12 +2896,12 @@
return e_cScores;
}
-@:containing_conceptquery_LogSum(nest,nid,TagSize)@
-@:containing_conceptquery_LogSum(unnest,nid,TagSize)@
-@:containing_conceptquery_LogSum(nest,pre,size1)@
-@:containing_conceptquery_LogSum(unnest,pre,size1)@
-...@= containing_conceptquery_LogSum
-PROC tj_containing_conceptque...@1_@2_LogSum (bat[oid,any] left, bat[oid,dbl]
query) : bat[oid,dbl] :=
+@:containing_query_entity_LogSum(nest,nid,TagSize)@
+@:containing_query_entity_LogSum(unnest,nid,TagSize)@
+@:containing_query_entity_LogSum(nest,pre,size1)@
+@:containing_query_entity_LogSum(unnest,pre,size1)@
+...@= containing_query_entity_LogSum
+PROC tj_containing_que...@1_@2_entity_LogSum (bat[oid,any] left, bat[oid,dbl]
query) : bat[oid,dbl] :=
{
var t_total := 0;
t_total :-= time();
@@ -2846,6 +2946,42 @@
}
@mil
+@:containing_query_entity(nest,nid,plus,TagSize,semijoin)@
+@:containing_query_entity(unnest,nid,plus,TagSize,semijoin)@
+@:containing_query_entity(nest,pre,plus,size1,semijoin)@
+@:containing_query_entity(unnest,pre,plus,size1,semijoin)@
+@:containing_query_entity(nest,nid,min,TagSize,kdiff)@
+@:containing_query_entity(unnest,nid,min,TagSize,kdiff)@
+@:containing_query_entity(nest,pre,min,size1,kdiff)@
+@:containing_query_entity(unnest,pre,min,size1,kdiff)@
+@= containing_query_entity
+PROC tj_containing_query_@1_@2_entity_@3 (bat[oid,any] left, bat[oid,dbl] query) : bat[oid,dbl] :=
+{
+ var t_total := 0;
+ t_total :-= time();
+
+ if ( count(left) = 0 ) return new(oid,dbl);
+ if ( count(query) = 0 ) return new(oid,dbl);
+ # Macro args: @1 nest/unnest, @2 nid/pre, @3 plus/min, @4 size BAT suffix, @5 semijoin (plus) or kdiff (min).
+ var e_pre := left.chk_order();
+ var e_size := bat("tj_" + ftindex + "_@4");
+
+ # loop over query entities (concepts)
+ query@batloop()
+ {
+ # get the index positions of the entity and narrow the candidates with @5
+ var t_pre := _getConceptPositions($h);
+ e_pre := e_pre.@5(_getTermDoc_@1_@2(e_pre, e_size, t_pre));
+ }
+
+ var res := e_pre;
+
+ t_total :+= time();
+ if (timing) printf("# entity_@3 timing: total: %d\n", t_total);
+ return res;
+}
+@mil
+
#####################################################################
# and/or combination
------------------------------------------------------------------------------
Stay on top of everything new and different, both inside and
around Java (TM) technology - register by April 22, and save
$200 on the JavaOne (SM) conference, June 2-5, 2009, San Francisco.
300 plus technical and hands-on sessions. Register today.
Use priority code J9JMT32. http://p.sf.net/sfu/p
_______________________________________________
Monetdb-pf-checkins mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins