Update of /cvsroot/monetdb/pathfinder/modules/pftijah
In directory 23jxhf1.ch3.sourceforge.com:/tmp/cvs-serv2199
Modified Files:
pftijah.mx
Log Message:
- introduce fragmented indexing on XQuery level
- fixed bugs in the MIL level function for fragmented indexing
- adapted the test cases
What is still missing is querying on the fragmented index,
but the default for indexing is currently set to create only
a single index, which allows querying to work as before.
U pftijah.mx
Index: pftijah.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah.mx,v
retrieving revision 1.239
retrieving revision 1.240
diff -u -d -r1.239 -r1.240
--- pftijah.mx 8 Oct 2009 09:44:06 -0000 1.239
+++ pftijah.mx 14 Oct 2009 10:01:06 -0000 1.240
@@ -385,24 +385,6 @@
const dflt_bg_index := "DFLT_FT_INDEX";
const dflt_score_base := "0";
-PROC tj_get_ft_index(BAT[str,str] tj_options, bit chk_exists) : str
-{
- var res := dflt_ft_index;
- if ( tj_options.exist("ft-index") ) {
- res := tj_options.find("ft-index");
- }
- if ( chk_exists ) {
- if ( not(view_bbp_name().reverse().exist("tj_" + res + "_size")) ) {
- ERROR("tj_get_ft_index, ft-index \"%s\" does not exist. create full text index using tijah:create-ft-index()\n",res);
- }
- if ( not(view_bbp_name().reverse().exist("tj_" + res + "_TagSize")) ) {
- ERROR("tj_get_ft_index, ft-index \"%s\" does not exist. create full text index using tijah:create-ft-index()\n",res);
- }
- }
- if ( verbose ) printf(HASH +"TJ:tj_get_ft_index() = %s.\n",res);
- return res;
-}
-
ADDHELP("tj_init_global", "flokstra & rode", "Jan 2007",
"PARAMETERS:\n\
- optional BAT[str,str] param: initialization parameter for global pftijah.\n\
@@ -829,67 +811,6 @@
#
-# Start of ft-index / pf-collection dependency module
-#
-
-PROC modify_pfc_fti(str fti_name, BAT[void,str] v_pfc) : BAT[str,str]
-{
- var result;
-
- lock_set(tj_dep_lock);
- var err := CATCH({
- var glb_fti_pfc := bat("tj_pfc_fti_dep");
- var fti_dep := glb_fti_pfc.reverse().select(fti_name).reverse();
- if ( fti_dep.select("*").count_wrd() > wrd(0) ) {
- ERROR(HASH + " pfc_fti_dep: unable to extend ft-index when created
with *\n");
- }
- var new_fti_pfc :=
v_pfc.tunique().project(fti_name).reverse().sunique().tdiff(fti_dep);
- if ( verbose ) {
- printf(HASH +"TJ:modify_pfc_fti: ***** added dep ***\n");
- new_fti_pfc.print();
- }
- var sz := new_fti_pfc.count_wrd();
- if ( sz > wrd(0) ) {
- if ( sz = wrd(1) ) {
- glb_fti_pfc.insert(new_fti_pfc);
- if ( new_fti_pfc.uselect("*").count_wrd() > wrd(0) ) {
- bat("tj_pfc_fti_dep_star").insert(new_fti_pfc);
- }
- } else {
- if ( new_fti_pfc.uselect("*").count_wrd() > wrd(0) ) {
- ERROR(HASH + " pfc_fti_dep: when using * it must be the
only collection dependency.\n");
- }
- glb_fti_pfc.insert(new_fti_pfc);
- }
- }
- result := new_fti_pfc;
- });
- if ( verbose ) {
- printf("\n" + HASH + " LOG modify_pfc_fti(\"%s\") START, v_pfc =\n",
fti_name);
- v_pfc.print();
- printf(HASH +" Dependence BATs are [STAR|ALL]:\n");
- bat("tj_pfc_fti_dep_star").print();
- bat("tj_pfc_fti_dep").print();
- printf(HASH +" LOG modify_pfc_fti(\"%s\") END.\n", fti_name);
- }
- lock_unset(tj_dep_lock);
- if (not(isnil(err))) ERROR(err);
- #
- return result;
-}
-
-PROC delete_pfc_fti(str fti_name) :void
-{
- lock_set(tj_dep_lock);
- var err := CATCH({
- bat("tj_pfc_fti_dep").delete(fti_name);
- bat("tj_pfc_fti_dep_star").delete(fti_name);
- });
- lock_unset(tj_dep_lock);
- if (not(isnil(err))) ERROR(err);
-}
-
-#
# End of ft-index / pf-collection dependency module
#
@@ -1369,6 +1290,24 @@
# #
#####################################################################
+PROC tj_get_ft_index(BAT[str,str] tj_options, bit chk_exists) : str
+{
+ var res := dflt_ft_index;
+ if ( tj_options.exist("ft-index") ) {
+ res := tj_options.find("ft-index");
+ }
+ if ( chk_exists ) {
+ if ( not(view_bbp_name().reverse().exist("tj_" + res + "_param")) ) {
+ ERROR("tj_get_ft_index, ft-index \"%s\" does not exist. create full text index using tijah:create-ft-index()\n",res);
+ }
+ if ( not(view_bbp_name().reverse().exist("tj_" + res + "_termdict")) ) {
+ ERROR("tj_get_ft_index, ft-index \"%s\" does not exist. create full text index using tijah:create-ft-index()\n",res);
+ }
+ }
+ if ( verbose ) printf(HASH + " TJ:tj_get_ft_index() = %s.\n",res);
+ return res;
+}
+
# set a collection parameter
PROC _tj_set_parameter2(BAT[str,bat] collBat, str par, str val) : void
{
@@ -1487,7 +1426,7 @@
var tagfilter := "";
var whitelist := "";
var blacklist := "";
- var fragsize := "0";
+ var fragsize := str(INT_MAX / 2);
var delay_finalize := "0";
param@batloop() {
@@ -1697,6 +1636,15 @@
return commitBats;
}
+#
+# convenience function
+#
+PROC tj_add2collection_frag(str ftiName, str uri, str filename, bit shred) :
void
+{
+ var uris := new(str,str).insert(uri, filename);
+ return tj_add2collection_frag(ftiName, uris, shred);
+}
+
ADDHELP("tj_add2collection_frag", "flokstra & rode", "Sept 2009",
"PARAMETERS:\n\
-` str ftiName: the name of the collection.\n
@@ -1903,9 +1851,9 @@
i := nil;
tmp := nil;
tmpsize := nil;
- replaceBats.insert(collBat.find("_tagIndex").bbpname(), "tj_" + ftindex + "_TagIndex");
- replaceBats.insert(collBat.find("_tags").bbpname(), "tj_" + ftindex + "_Tags");
- replaceBats.insert(collBat.find("_tagSize").bbpname(), "tj_" + ftindex + "_TagSize");
+ replaceBats.insert("_tagIndex", "tj_" + ftindex + "_TagIndex");
+ replaceBats.insert("_tags", "tj_" + ftindex + "_Tags");
+ replaceBats.insert("_tagSize", "tj_" + ftindex + "_TagSize");
submitBats.append("tj_" + ftindex + "_TagIndex");
submitBats.append("tj_" + ftindex + "_Tags");
submitBats.append("tj_" + ftindex + "_TagSize");
@@ -1924,8 +1872,8 @@
collBat.replace("_terms", i.fetch(1).access(BAT_READ).mmap(1));
i := nil;
tmp := nil;
- replaceBats.insert(collBat.find("_termIndex").bbpname(), "tj_" + ftindex + "_TermIndex");
- replaceBats.insert(collBat.find("_terms").bbpname(), "tj_" + ftindex + "_Terms");
+ replaceBats.insert("_termIndex", "tj_" + ftindex + "_TermIndex");
+ replaceBats.insert("_terms", "tj_" + ftindex + "_Terms");
submitBats.append("tj_" + ftindex + "_TermIndex");
submitBats.append("tj_" + ftindex + "_Terms");
submitBats.append(tf.bbpname());
@@ -1946,6 +1894,9 @@
tags := nil;
tagsize.persists(true).access(BAT_READ).mmap(1).bbpname("tj_" +
ftindex + "_TagSize");
tagsize := nil;
+ collBat.insert("_tagIndex", bat("tj_" + ftindex +
"_TagIndex"));
+ collBat.insert("_tags", bat("tj_" + ftindex + "_Tags"));
+ collBat.insert("_tagSize", bat("tj_" + ftindex + "_TagSize"));
submitBats.append("tj_" + ftindex + "_TagIndex");
submitBats.append("tj_" + ftindex + "_Tags");
submitBats.append("tj_" + ftindex + "_TagSize");
@@ -1965,6 +1916,8 @@
terms.persists(true).access(BAT_READ).mmap(1).bbpname("tj_" +
ftindex + "_Terms");
termindex := nil;
terms := nil;
+ collBat.insert("_termIndex", bat("tj_" + ftindex +
"_TermIndex"));
+ collBat.insert("_terms", bat("tj_" + ftindex + "_Terms"));
submitBats.append("tj_" + ftindex + "_TermIndex");
submitBats.append("tj_" + ftindex + "_Terms");
submitBats.append(tf.bbpname());
@@ -2000,7 +1953,7 @@
var replaceBats := commitBats.find("replaceBats");
replaceBats@batloop() {
bat($t).persists(false).rename("del_" + $t);
- bat($h).persists(true).bbpname($t);
+ collBat.find($h).persists(true).bbpname($t);
}
var submitBats := commitBats.find("submitBats");
@@ -2011,6 +1964,7 @@
PROC tj_collection_delete(str ftiName) : void
{
+ delete_pfc_fti(ftiName);
var tj_bats := view_bbp_name().like("tj_" + ftiName).tmark();
[persists]([bat](tj_bats), const false);
subcommit(tj_bats);
@@ -2023,6 +1977,67 @@
subcommit(tj_bats);
}
+#
+# Start of ft-index / pf-collection dependency module
+#
+
+PROC modify_pfc_fti(str fti_name, BAT[void,str] v_pfc) : BAT[str,str]
+{
+ var result;
+
+ lock_set(tj_dep_lock);
+ var err := CATCH({
+ var glb_fti_pfc := bat("tj_pfc_fti_dep");
+ var fti_dep := glb_fti_pfc.reverse().select(fti_name).reverse();
+ if ( fti_dep.select("*").count_wrd() > wrd(0) ) {
+ ERROR(HASH + " pfc_fti_dep: unable to extend ft-index when created
with *\n");
+ }
+ var new_fti_pfc :=
v_pfc.tunique().project(fti_name).reverse().sunique().tdiff(fti_dep);
+ if ( verbose ) {
+ printf(HASH +"TJ:modify_pfc_fti: ***** added dep ***\n");
+ new_fti_pfc.print();
+ }
+ var sz := new_fti_pfc.count_wrd();
+ if ( sz > wrd(0) ) {
+ if ( sz = wrd(1) ) {
+ glb_fti_pfc.insert(new_fti_pfc);
+ if ( new_fti_pfc.uselect("*").count_wrd() > wrd(0) ) {
+ bat("tj_pfc_fti_dep_star").insert(new_fti_pfc);
+ }
+ } else {
+ if ( new_fti_pfc.uselect("*").count_wrd() > wrd(0) ) {
+ ERROR(HASH + " pfc_fti_dep: when using * it must be the
only collection dependency.\n");
+ }
+ glb_fti_pfc.insert(new_fti_pfc);
+ }
+ }
+ result := new_fti_pfc;
+ });
+ if ( verbose ) {
+ printf("\n" + HASH + " LOG modify_pfc_fti(\"%s\") START, v_pfc =\n",
fti_name);
+ v_pfc.print();
+ printf(HASH +" Dependence BATs are [STAR|ALL]:\n");
+ bat("tj_pfc_fti_dep_star").print();
+ bat("tj_pfc_fti_dep").print();
+ printf(HASH +" LOG modify_pfc_fti(\"%s\") END.\n", fti_name);
+ }
+ lock_unset(tj_dep_lock);
+ if (not(isnil(err))) ERROR(err);
+ #
+ return result;
+}
+
+PROC delete_pfc_fti(str fti_name) :void
+{
+ lock_set(tj_dep_lock);
+ var err := CATCH({
+ bat("tj_pfc_fti_dep").delete(fti_name);
+ bat("tj_pfc_fti_dep_star").delete(fti_name);
+ });
+ lock_unset(tj_dep_lock);
+ if (not(isnil(err))) ERROR(err);
+}
+
#####################################################################
@@ -2377,7 +2392,7 @@
}
# temporary algebra query handler
-PROC ALG_tj_query_handler(
+PROC ALG_tj_query_handler2(
bit par_storeScore,
BAT[oid,bat] pfop_sn,
BAT[oid,bat] pfop_query,
@@ -2425,8 +2440,8 @@
optbat := new(str,str,32);
}
if ( verbose ) printf(HASH +" ALG_tj_query_handler: handle
startNodes.\n");
- var ftindex := tj_get_ft_index(optbat,true);
- var tijah_lock := tj_get_collection_lock(ftindex);
+ var ftiName := tj_get_ft_index(optbat,true);
+ var tijah_lock := tj_get_collection_lock(ftiName);
lock_set(tijah_lock);
var err := CATCH({
var startNodes;
@@ -2439,9 +2454,9 @@
sn_item := sn_item.tmark(0@0);
sn_kind := sn_kind.tmark(0@0);
- var xdoc_name := bat("tj_" + ftindex + "_doc_name");
- var xdoc_firstpre := bat("tj_" + ftindex + "_doc_firstpre");
- var xpfpre := bat("tj_" + ftindex + "_pfpre");
+ var xdoc_name := bat("tj_" + ftiName + "0_doc_name");
+ var xdoc_firstpre := bat("tj_" + ftiName + "0_doc_firstpre");
+ var xpfpre := bat("tj_" + ftiName + "0_pfpre");
var doc_loaded :=
par_ws.fetch(CONT_COLL).join(bat("doc_collection").reverse()).join(bat("doc_name"));
if ( verbose ) printf(HASH +" ALG_tj_query_handler: compute
startnodes\n");
startNodes :=
pf2tijah_node(false,xdoc_name,xdoc_firstpre,xpfpre,sn_item,[int](sn_kind),doc_loaded);
@@ -2451,7 +2466,7 @@
optbat.access(BAT_WRITE);
optbat.insert("_query",pfop_query.fetch(1@0).fetch(int($h)));
if ( verbose ) printf(HASH +" ALG_tj_query_handler: run tijah
query.\n");
- var nexi_allscores :=
run_tijah_query(ftindex,optbat,has_sn,startNodes);
+ var nexi_allscores :=
run_tijah_query(ftiName,optbat,has_sn,startNodes);
var nexi_score;
if ( verbose ) printf(HASH +" ALG_tj_query_handler: handling
scores.\n");
if ( optbat.exist("returnNumber") ) {
@@ -2460,12 +2475,174 @@
} else {
nexi_score := nexi_allscores;
}
- var docpre := bat("tj_" + ftindex + "_doc_firstpre").[oid]();
- var pfpre := bat("tj_" + ftindex + "_pfpre");
+ var docpre := bat("tj_" + ftiName + "0_doc_firstpre").[oid]();
+ var pfpre := bat("tj_" + ftiName + "0_pfpre");
var item := nexi_score.hmark(0@0);
var frag := [find_lower](const docpre.reverse().mark(0@0), item);
item := item.join(pfpre).sort().tmark();
- var needed_docs := bat("tj_" + ftindex +
"_doc_name").semijoin(frag.tunique());
+ var needed_docs := bat("tj_" + ftiName +
"0_doc_name").semijoin(frag.tunique());
+ var loaded_docs := par_ws.fetch(OPEN_NAME).reverse();
+ var docs_to_load := kdiff(needed_docs.reverse(),loaded_docs).hmark(0@0);
+ ws_opendoc(par_ws, docs_to_load);
+ var doc_loaded := reverse(par_ws.fetch(OPEN_CONT)).leftfetchjoin(par_ws.fetch(OPEN_NAME));
+ # On the forced document loading size we keep using the old interface
+ # until ws.fetch(OPEN_CONT|OPEN_DOC) disappears. This interface is
+ # much cheaper for us and works also correct when the OPEN_CONT|DOC
+ # bats are not complete.
+ # var doc_loaded :=
par_ws.fetch(CONT_COLL).join(bat("doc_collection").reverse()).join(bat("doc_name"));
+ var fid_pffid := needed_docs.join(doc_loaded.reverse());
+ frag := frag.join(fid_pffid).sort().tmark();
+ if ( verbose ) printf(HASH +" ALG_tj_query_handler: handled new
frags/documents.\n");
+ if ( par_storeScore ) {
+ var tID := oid(par_scoreDB.fetch(0@0).count_wrd() + 8888);
+ par_scoreDB.fetch(4@0).insert(lng(tID),lng(nexi_allscores.count_wrd()));
+ par_scoreDB.fetch(0@0).append(item.project(tID));
+ par_scoreDB.fetch(1@0).append(frag);
+ par_scoreDB.fetch(2@0).append(item);
+ par_scoreDB.fetch(3@0).append(nexi_score.tmark());
+ result_id.append(lng(tID));
+ if ( verbose ) printf(HASH +" ALG_tj_query_handler: stored loop
score.\n");
+ } else {
+ result_iter.append(item.project($t));
+ result_pos.append(item.mark(1@0));
+ result_frag.append(frag);
+ result_item.append(item);
+ }
+ });
+ lock_unset(tijah_lock);
+ if ( verbose ) printf(HASH +" ALG_tj_query_handler: released lock.\n");
+ if (not(isnil(err))) ERROR(err);
+ if ( verbose ) printf(HASH +" ALG_tj_query_handler: stored loop nodes
in result.\n");
+ if ( verbose ) printf(HASH +" ALG_tj_query_handler: loop finish,
id=%d.\n",$t);
+ } # end batloop over queries
+ if ( verbose ) printf(HASH +" ALG_tj_query_handler: batloop finished.\n");
+ var iter;
+ var item;
+ var ipik;
+ var kind;
+ var pos;
+ if ( par_storeScore ) {
+ if ( verbose ) printf(HASH +" ALG_tj_query_handler: create int
return.\n");
+ item := result_id;
+ iter := par_loop.tmark(oid(0));
+ ipik := iter;
+ pos := oid(1);
+ kind := new(oid,oid);
+ } else {
+ if ( verbose ) printf(HASH +" ALG_tj_query_handler: create node
return.\n");
+ iter := result_iter;
+ pos := result_pos;
+ kind := result_frag;
+ item := result_item;
+ ipik := iter;
+ }
+ if ( verbose ) {
+ printf(HASH +" ALG_tj_query_handler: iter/item/kind/pos result
start\n");
+ iter.print();
+ item.print();
+ kind.print();
+ pos.print();
+ printf(HASH +" ALG_tj_query_handler: iter/item/kind/pos result
finish\n");
+ }
+ var res := ALG_tj_pfop(iter,item,kind,pos.materialize(ipik));
+ #
+ if ( verbose ) printf(HASH +" ALG_tj_query_handler: FINISH.\n");
+ return res;
+}
+
+# temporary algebra query handler
+PROC ALG_tj_query_handler(
+ bit par_storeScore,
+ BAT[oid,bat] pfop_sn,
+ BAT[oid,bat] pfop_query,
+ BAT[oid,bat] pfop_opt,
+ BAT[void,any] par_loop,
+ BAT[oid,bat] par_ws,
+ BAT[oid,bat] par_scoreDB
+ ) : BAT[void,bat]
+{
+ var result_id;
+ var result_iter;
+ var result_item;
+ var result_pos;
+ var result_frag;
+
+ if ( verbose ) printf(HASH +" ALG_tj_query_handler: START.\n");
+ if ( par_storeScore ) {
+ if ( verbose ) printf(HASH +" ALG_tj_query_handler: storeScore=TRUE.\n");
+ result_id := new(void,lng).seqbase(0@0);
+ } else {
+ if ( verbose ) printf(HASH +" ALG_tj_query_handler: storeScore=FALSE.\n");
+ result_iter := new(void,oid).seqbase(0@0);
+ result_item := new(void,oid).seqbase(0@0);
+ result_pos := new(void,oid).seqbase(0@0);
+ result_frag := new(void,oid).seqbase(0@0);
+ }
+
+ var has_sn := (pfop_sn.count_wrd() > wrd(0));
+ var has_options := (pfop_opt.count_wrd() > wrd(0));
+
+ par_loop@batloop() {
+ var optbat;
+ if ( verbose ) printf(HASH +" ALG_tj_query_handler: loop start, id=%d.\n",$t);
+ if ( has_options ) {
+ if ( verbose ) printf(HASH +" ALG_tj_query_handler: running option handler.\n");
+ var opt_iter := pfop_opt.fetch(0@0).select($t);
+ var opt_item := pfop_opt.fetch(1@0).semijoin(opt_iter);
+ var opt_kind := pfop_opt.fetch(2@0).semijoin(opt_iter);
+ opt_iter := opt_iter.tmark(0@0);
+ opt_item := opt_item.tmark(0@0);
+ opt_kind := opt_kind.tmark(0@0);
+ optbat := serialize_tijah_opt(par_ws,1,opt_iter,opt_iter,opt_item,set_kind(opt_kind,ELEM),new(void,lng),new(void,dbl),new(void,str));
+ if ( verbose ) optbat.print();
+ } else {
+ optbat := new(str,str,32);
+ }
+ if ( verbose ) printf(HASH +" ALG_tj_query_handler: handle
startNodes.\n");
+ var ftiName := tj_get_ft_index(optbat,true);
+ var tijah_lock := tj_get_collection_lock(ftiName);
+ lock_set(tijah_lock);
+ var err := CATCH({
+ var startNodes;
+ if ( has_sn ) {
+ var sn_iter := pfop_sn.fetch(0@0);
+ var sn_iteration := pfop_query.fetch(0@0).fetch(int($h));
+ sn_iter := sn_iter.select(sn_iteration);
+ var sn_item := pfop_sn.fetch(1@0).semijoin(sn_iter);
+ var sn_kind := pfop_sn.fetch(2@0).semijoin(sn_iter);
+ sn_item := sn_item.tmark(0@0);
+ sn_kind := sn_kind.tmark(0@0);
+
+ var xdoc_name := bat("tj_" + ftiName + "0_doc_name");
+ var xdoc_firstpre := bat("tj_" + ftiName + "0_doc_firstpre");
+ var xpfpre := bat("tj_" + ftiName + "0_pfpre");
+ var doc_loaded :=
par_ws.fetch(CONT_COLL).join(bat("doc_collection").reverse()).join(bat("doc_name"));
+ if ( verbose ) printf(HASH +" ALG_tj_query_handler: compute
startnodes\n");
+ startNodes :=
pf2tijah_node(false,xdoc_name,xdoc_firstpre,xpfpre,sn_item,[int](sn_kind),doc_loaded);
+ } else {
+ startNodes := new(void,oid);
+ }
+ optbat.access(BAT_WRITE);
+ optbat.insert("_query",pfop_query.fetch(1@0).fetch(int($h)));
+ if ( verbose ) printf(HASH +" ALG_tj_query_handler: run tijah
query.\n");
+ var nexi_allscores :=
run_tijah_query(ftiName,optbat,has_sn,startNodes);
+ var nexi_score;
+ if ( verbose ) printf(HASH +" ALG_tj_query_handler: handling
scores.\n");
+ if ( optbat.exist("returnNumber") ) {
+ var retNum := int(optbat.find("returnNumber"));
+ nexi_score := nexi_allscores.slice(0, retNum - 1);
+ } else {
+ nexi_score := nexi_allscores;
+ }
+
+ var docpre := bat("tj_" + ftiName + "0_doc_firstpre").[oid]();
+ var pfpre := bat("tj_" + ftiName + "0_pfpre");
+ var item := nexi_score.hmark(0@0);
+ var frag := [find_lower](const docpre.reverse().mark(0@0), item);
+ item := item.join(pfpre).sort().tmark();
+ var needed_docs := bat("tj_" + ftiName +
"0_doc_name").semijoin(frag.tunique());
+
+
var loaded_docs := par_ws.fetch(OPEN_NAME).reverse();
var docs_to_load := kdiff(needed_docs.reverse(),loaded_docs).hmark(0@0);
ws_opendoc(par_ws, docs_to_load);
@@ -2574,8 +2751,8 @@
var iter_tjPre;
var ftc_term := pfop_query.fetch(1@0).fetch(int($h));
- var ftindex := dflt_ft_index; # incomplete
- var tijah_lock := tj_get_collection_lock(ftindex);
+ var ftiName := dflt_ft_index; # incomplete
+ var tijah_lock := tj_get_collection_lock(ftiName);
lock_set(tijah_lock);
var err := CATCH({
var sn_iter := pfop_sn.fetch(0@0);
@@ -2584,9 +2761,9 @@
var sn_item := pfop_sn.fetch(1@0).semijoin(sn_iter);
var sn_kind := pfop_sn.fetch(2@0).semijoin(sn_iter);
- var xdoc_name := bat("tj_" + ftindex + "_doc_name");
- var xdoc_firstpre := bat("tj_" + ftindex + "_doc_firstpre");
- var xpfpre := bat("tj_" + ftindex + "_pfpre");
+ var xdoc_name := bat("tj_" + ftiName + "0_doc_name");
+ var xdoc_firstpre := bat("tj_" + ftiName + "0_doc_firstpre");
+ var xpfpre := bat("tj_" + ftiName + "0_pfpre");
var doc_loaded :=
par_ws.fetch(CONT_COLL).join(bat("doc_collection").reverse()).join(bat("doc_name"));
if ( verbose ) printf(HASH +" ALG_tj_ftfun_handler: compute
startnodes\n");
iter_tjPre :=
pf2tijah_node(true,xdoc_name,xdoc_firstpre,xpfpre,sn_item,[int](sn_kind),doc_loaded);
@@ -2595,8 +2772,8 @@
var Q := tijah_tokenize2bat(ftc_term).reverse().project(dbl(1.0));
Q := tj_term2tid(Q);
- # tj_init_termHash(ftindex);
- # tj_init_tagHash(ftindex);
+ # tj_init_termHash(ftiName);
+ # tj_init_tagHash(ftiName);
var scorebase := dbl(0.000000);
var c_lambda := dbl(0.800000);
var okapi_k1 := dbl(1.200000);
@@ -2704,7 +2881,7 @@
# stream_nil, ws_id(ws));
#}
-PROC ALG_tj_docmgmt_tape(BAT[str,bat] tape,
+PROC ALG_tj_docmgmt_tape2(BAT[str,bat] tape,
BAT[void,BAT] ws,
BAT[void,str] location,
BAT[void,str] docnames,
@@ -2779,6 +2956,97 @@
return true;
}
+PROC ALG_tj_docmgmt_tape(BAT[str,bat] tape,
+ BAT[void,BAT] ws,
+ BAT[void,str] location,
+ BAT[void,str] docnames,
+ BAT[void,str] colnames,
+ BAT[void,lng] percentages) : bit
+{
+ if ( verbose ) printf(HASH +" ALG_tj_docmgmt_tape: START.\n");
+
+ # analyse doctape, create new collection dependencies, and
+ # add documents to the doclist for each ft-index (fti_cluster)
+
+ var fti_cluster := new(str,bat);
+ tape@batloop() {
+ var op := $h;
+ var collbat := $t.fetch(0@0);
+ var optbat := $t.fetch(1@0);
+ var ftiName := tj_get_ft_index(optbat,(op!="create"));
+ var doclist;
+ if ( fti_cluster.exist(ftiName) ) {
+ doclist := fti_cluster.find(ftiName);
+ } else {
+ doclist := new(str,str);
+ fti_cluster.insert(ftiName, doclist);
+ }
+
+ if ( op = "create" ) {
+ if ( verbose ) printf(HASH +" ALG_tj_docmgmt_tape:
init_collection(%s).\n",ftiName);
+ tj_init_collection_base(ftiName, optbat);
+ modify_pfc_fti(ftiName, collbat);
+ if ( collbat.uselect("*").count_wrd() > wrd(0) ) {
+ var tmp := bat("doc_name");
+ doclist.insert(tmp.reverse().project(str(nil)).reverse());
+ } else {
+ var tmp :=
bat("doc_name").semijoin(bat("doc_collection").join(bat("collection_name").join(collbat.reverse())));
+ doclist.insert(tmp.reverse().project(str(nil)).reverse());
+ }
+ } else if ( op = "extend" ) {
+ if ( verbose ) printf(HASH +" ALG_tj_docmgmt_tape:
extend_collection(%s).\n",ftiName);
+ modify_pfc_fti(ftiName, collbat);
+ if ( collbat.uselect("*").count_wrd() > wrd(0) ) {
+ var tmp := bat("doc_name");
+ doclist.insert(tmp.reverse().project(str(nil)).reverse());
+ } else {
+ var tmp :=
bat("doc_name").semijoin(bat("doc_collection").join(bat("collection_name").join(collbat.reverse())));
+ doclist.insert(tmp.reverse().project(str(nil)).reverse());
+ }
+ } else if ( op = "remove" ) {
+ if ( verbose ) printf(HASH +" ALG_tj_docmgmt_tape:
tj_delete_collection(%s).\n",ftiName);
+ tj_collection_delete(ftiName);
+ } else {
+ ERROR("ALG_tj_docmgmt_tape: unknown op");
+ }
+ }
+
+ # commit dependency bats, since they might be changed
+ var submit_bats := new(void,str).seqbase(0@0);
+ submit_bats.append("tj_pfc_fti_dep");
+ submit_bats.append("tj_pfc_fti_dep_star");
+ subcommit(submit_bats);
+
+ # determine which documents added to pathfinder have to get indexed by
pf/tijah due to dependencies
+ # add those documents to the doclist for each ft-index (fti_cluster)
+ var pfc_name := docnames.reverse().leftfetchjoin(colnames);
+ var pfdep := bat("tj_pfc_fti_dep");
+ var pfdep_star := bat("tj_pfc_fti_dep_star");
+
+ var fti_dname := pfdep.join(pfc_name.reverse());
+ if ( pfdep_star.count_wrd() > wrd(0) ) {
+ fti_dname.insert(pfdep_star.cross(pfc_name.reverse()));
+ }
+
+ fti_dname@batloop() {
+ var cb;
+ if ( fti_cluster.exist($h) ) {
+ cb := fti_cluster.find($h);
+ } else {
+ cb := new(str,str);
+ fti_cluster.insert($h,cb);
+ }
+ cb.insert(str(nil),$t);
+ }
+ fti_cluster@batloop() {
+ if ( verbose ) { printf(HASH +"TJ:tj_play_doc_tape() doing ft-index \"%s\".\n",$h); $t.print(); }
+ tj_add2collection_frag($h,$t,false);
+ }
+
+ if ( verbose ) printf(HASH +" ALG_tj_docmgmt_tape: FINISH.\n");
+ return true;
+}
+
var tracefile_handle := nil;
PROC tj_trace( str s ) : void
{
@@ -3823,6 +4091,36 @@
#"TIJAH");
################################################################################
+# Pathfinder PRE output
+################################################################################
+
+##
+# Map all pre identifiers to their corresponding pathfinder pre IDs / documents
+# Returns a bat [void, bat]: bat1 (rank,pfpre), bat2 (rank,score), bat3 (rank,docname)
+##
+PROC tj_pre2pfpre(bat[oid,dbl] pre_score) : bat[void,bat]
+{
+ var t_total := 0 - time();
+
+ if (pre_score.count_wrd() = wrd(0)) return new(str,dbl);
+ var pres := pre_score.hmark(0@0);
+ var scores := pre_score.tmark(0@0);
+ var pfpres := pres.leftjoin(bat("tj_" + ftindex + "_pfpre")).tmark(0@0);
+ var docpre := bat("tj_" + ftindex + "_doc_firstpre");
+ var mark_did := [find_lower](const docpre.reverse(), pres);
+ var mark_docname := mark_did.leftjoin(bat("tj_" + ftindex + "_doc_name")).tmark(0@0);
+
+ var res := new(void,bat).seqbase(0@0);
+ res.append(pfpres);
+ res.append(scores);
+ res.append(mark_docname);
+
+ t_total :+= time();
+ if (timing) printf(HASH +" add inexpath timing: total: %d\n", t_total);
+ return res;
+}
+
+################################################################################
# INEX output
################################################################################
@@ -3856,8 +4154,8 @@
################################################################################
##
-# Map all pre identifiers to their stored inexpath expressions
-# Returns a bat [pre, any].
+# Merge the results of all index fragments.
+# Returns a bat [void, bat]: bat1 (rank,pfpre), bat2 (rank,score), bat3 (rank,docname)
##
PROC tj_merge_frag_results(bat[void,bat] res_frag, int topk) : bat[str,dbl]
{
@@ -3871,6 +4169,22 @@
return res.tsort_rev();
}
+##
+# Merge the results of all index fragments.
+# Returns a bat [pre, any].
+##
+PROC tj_merge_frag_results_inex(bat[void,bat] res_frag, int topk) :
bat[str,dbl]
+{
+ var res := new(str,dbl);
+ res_frag@batloop(){
+ res.insert($t);
+ }
+ if (topk > 0) {
+ return res.tsort_rev().slice(0, topk - 1);
+ }
+ return res.tsort_rev();
+}
+
#####################################################################
#
------------------------------------------------------------------------------
Come build with us! The BlackBerry(R) Developer Conference in SF, CA
is the only developer event you need to attend this year. Jumpstart your
developing skills, take BlackBerry mobile applications to market and stay
ahead of the curve. Join us from November 9 - 12, 2009. Register now!
http://p.sf.net/sfu/devconference
_______________________________________________
Monetdb-pf-checkins mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins