Update of /cvsroot/monetdb/pathfinder/modules/pftijah
In directory
sfp-cvsdas-1.v30.ch3.sourceforge.com:/tmp/cvs-serv28540/modules/pftijah
Modified Files:
Tag: M5XQ
pftijah.mx
Log Message:
propagated changes of Friday Feb 12 2010
from the XQFT branch to the M5XQ branch
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2010/02/12 - sjoerd: modules/pftijah/pftijah.mx,1.246.2.9
propagated changes of Tuesday Feb 09 2010 - Friday Feb 12 2010
from the development trunk to the XQFT branch
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2010/02/09 - jflokstra: modules/pftijah/pftijah.mx,1.255
- implement handling of NEXI programs by the ftc operator. Default
behaviour is still the precompiled term query but when the term is prefixed by
a "%" it is interpreted as a NEXI query. So
ftc "%.[about(.,Churchill)]"
is equivalent to:
ftc "Churchill"
but the NEXI form is much slower because it is not precompiled.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2010/02/12 - sjoerd: modules/pftijah/pftijah.mx,1.256
propagated changes of Wednesday Feb 10 2010 - Friday Feb 12 2010
from the Feb2010 branch to the development trunk
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2010/02/10 - stmane: modules/pftijah/pftijah.mx,1.249.2.5
propagated changes of Wednesday Feb 10 2010
from the Nov2009 branch to the Feb2010 branch
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2010/02/10 - cornuz: modules/pftijah/pftijah.mx,1.238.2.9
- re-enable shred bit in tj_add2collection_frag (previously disabled by
mistake), to shred documents before indexing when needed
- initialize variable last_pre before the loop for indexing in chunks
- some minor fixes to ADDHELP text
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Index: pftijah.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah.mx,v
retrieving revision 1.226.2.25
retrieving revision 1.226.2.26
diff -u -d -r1.226.2.25 -r1.226.2.26
--- pftijah.mx 8 Feb 2010 17:12:47 -0000 1.226.2.25
+++ pftijah.mx 12 Feb 2010 13:34:04 -0000 1.226.2.26
@@ -114,7 +114,7 @@
BAT[oid,oid] item,
BAT[oid,int] kind,
BAT[oid,str] doc_loaded)
- : BAT[oid,oid] = CMDpf2tijah_node;
+ : BAT[void,oid] = CMDpf2tijah_node;
"Translate Pathfinder node sequence to tijah node sequence"
.COMMAND offsetindex( BAT[void,oid] offset_tid, wrd res_size)
@@ -999,13 +999,13 @@
ADDHELP("tj_add2collection", "flokstra & rode", "Jan 2007",
"PARAMETERS:\n\
-- str ftiName: the name of the collection.\n
-- str uri_loc: the location of the xml document.\n
-- str uri_name: the name of the xml document (optional).\n
+- str ftiName: the name of the collection.\n\
+- str uri_loc: the location of the xml document.\n\
+- str uri_name: the name of the xml document (optional).\n\
- bit shred: when true the doc is shredded when necessary.\n\
DESCRIPTION:\n\
Add a document to a pftijah collection. The document is indexed and if the \n\
-shred parameter is true it is also shredded in Pathfinder.
+shred parameter is true it is also shredded in Pathfinder. \n\
The index is automatically finalized at the end of the method.",
"pftijah");
PROC tj_add2collection(str ftiName, str uri_loc, str uri_name, bit shred) :
void
@@ -1650,14 +1650,14 @@
PROC tj_add2collection_frag(str ftiName, str uri, str filename, bit shred) :
void
{
var uris := new(str,str).insert(uri, filename);
- return tj_add2collection_frag(ftiName, uris, shred);
+ tj_add2collection_frag(ftiName, uris, shred);
}
ADDHELP("tj_add2collection_frag", "flokstra & rode", "Sept 2009",
"PARAMETERS:\n\
--` str ftiName: the name of the collection.\n
+- str ftiName: the name of the collection.\n\
- BAT[str,str]: the bat containing the [location,name] pairs of the xml
docs.\n\
-- bit shred: when true the doc is shredded when necessary (deprecated).\n\
+- bit shred: when true the doc is shredded when necessary.\n\
DESCRIPTION:\n\
Adds a documents to the index. If needed, the index is split into several
fragments.\n\
Each fragment is finalized after it is filled to its maximum capacity.",
@@ -1665,9 +1665,6 @@
PROC tj_add2collection_frag(str ftiName, BAT[str,str] uri, bit shred) : void
{
if ( verbose ) tj_verbose(HASH +"TJ tj_add2collection_frag(\"%s\")
called.\n",ftiName);
- if (shred)
- ERROR("tj_add2collection_frag: shred bit no longer supported");
-
var coll_lock := tj_get_collection_lock(ftiName);
lock_set(coll_lock);
var err := CATCH({
@@ -1678,6 +1675,37 @@
# get first free collection fragment (first fragment that is not yet
filled completely)
var collBat := _tj_get_collection_frag(ftiName, commitBats);
+ # shred documents if needed
+ if ( shred ) {
+ var pf_collection := _tj_get_parameter2(collBat,"pf_collection");
+ uri@batloop() {
+ var uri_loc := $h;
+ var uri_name := $t;
+ if ( isnil(uri_loc) ) {
+ ERROR("tj_add2collection_frag: should specify doc_uri (and
doc_name).");
+ }
+ if ( isnil(uri_name) ) {
+ uri_name := uri_loc;
+ } else if ( uri_name = "" ) {
+ uri_name := uri_loc;
+ }
+ if (not(bat("doc_name").reverse().exist(uri_name))) {
+ var s_start := usec();
+ if ( isnil(pf_collection) ) {
+ shred_doc(uri_loc,uri_name);
+ } else {
+ shred_doc(uri_loc,uri_name,pf_collection,0LL);
+ }
+ if ( timing ) {
+ ms := (usec() - s_start)/1000;
+ printf(HASH +"TJ tj_add2collection_frag(\"%s\"): shred time =
%lld.%03llds.\n",uri_name,/(ms,1000),%(ms,1000));
+ }
+ } else {
+ if ( verbose ) tj_verbose(HASH +"TJ tj_add2collection_frag
doc(\"%s\") already shredded.\n",uri_name);
+ }
+ }
+ }
+
# set access back to BAT_APPEND
_tj_set_forwardindex_access(collBat, BAT_APPEND);
@@ -1694,7 +1722,7 @@
var first_doc := 0;
var last_doc := uri.count()-1;
- var last_pre;
+ var last_pre := collBat.find("size").count_wrd() + 1;
while(first_doc <= last_doc) {
var uri_chunk := uri.slice(first_doc, first_doc+chunksize-1);
var ws_opt := ws_create(0);
@@ -3221,29 +3249,44 @@
var xdoc_firstpre := bat("tj_" + ftiName + "0_doc_firstpre");
var xpfpre := bat("tj_" + ftiName + "0_pfpre");
var doc_loaded :=
par_ws.fetch(CONT_COLL).join(bat("doc_collection").reverse()).join(bat("doc_name"));
+
+ var tjPre_score;
+
if ( verbose ) tj_verbose(HASH +" ALG_tj_ftfun_handler: compute
startnodes\n");
iter_tjPre :=
pf2tijah_node(true,xdoc_name,xdoc_firstpre,xpfpre,sn_item,[int](sn_kind),doc_loaded);
+ if ( not(ftc_term.startsWith("%")) ) {
+ if ( verbose ) tj_verbose(HASH +" ALG_tj_ftfun_handler: run
precompiled term search\n");
- # now tokenize the term string and add a default weight
- var T := tijah_tokenize2bat(ftc_term).reverse().project(dbl(1.0));
- var Q := tj_prepare_query(T);
+ # now tokenize the term string and add a default weight
+ var T := tijah_tokenize2bat(ftc_term).reverse().project(dbl(1.0));
+ var Q := tj_prepare_query(T);
- # tj_init_termHash(ftiName);
- # tj_init_tagHash(ftiName);
- var scorebase := dbl(0.000000);
- var c_lambda := dbl(0.800000);
- var okapi_k1 := dbl(1.200000);
- var okapi_b := dbl(0.750000);
- var downprop := "max";
- var upprop := "max";
- var andcomb := "prod";
- var orcomb := "sum";
- var returnall := TRUE;
+ # tj_init_termHash(ftiName);
+ # tj_init_tagHash(ftiName);
+ var scorebase := dbl(0.000000);
+ var c_lambda := dbl(0.800000);
+ var okapi_k1 := dbl(1.200000);
+ var okapi_b := dbl(0.750000);
+ var downprop := "max";
+ var upprop := "max";
+ var andcomb := "prod";
+ var orcomb := "sum";
+ var returnall := TRUE;
- var tjPre_score :=
tj_containing_query_nest_pre_term_NLLR(iter_tjPre.reverse(), Q);
+ tjPre_score :=
tj_containing_query_nest_pre_term_NLLR(iter_tjPre.reverse(), Q);
+ } else {
+ if ( verbose ) tj_verbose(HASH +" ALG_tj_ftfun_handler: run
compiled nexi query\n");
+ ftc_term := ftc_term.substring(2); # strip the "%"
+ # the next tmark() is a bit mysterious. The result of pf2tijah_node
+ # should be a [void,oid] but is an [oid,oid]
+ iter_tjPre := iter_tjPre.tmark(0@0);
+ var opt := new(str,str);
+ opt.insert("return-all","true");
+ opt.insert("_query",ftc_term);
+ tjPre_score :=
run_tijah_query("DFLT_FT_INDEX",opt,true,iter_tjPre);
+ }
var iter_score := iter_tjPre.leftjoin(tjPre_score);
-
result_item_s.append(iter_score);
});
lock_unset(tijah_lock);
------------------------------------------------------------------------------
SOLARIS 10 is the OS for Data Centers - provides features such as DTrace,
Predictive Self Healing and Award Winning ZFS. Get Solaris 10 NOW
http://p.sf.net/sfu/solaris-dev2dev
_______________________________________________
Monetdb-pf-checkins mailing list
monetdb-pf-checkins@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins