Update of /cvsroot/monetdb/pathfinder/modules/pftijah
In directory 23jxhf1.ch3.sourceforge.com:/tmp/cvs-serv14459/modules/pftijah

Modified Files:
      Tag: M5XQ
        pftijah.mx 
Log Message:
propagated changes of Wednesday Oct 14 2009
from the development trunk to the M5XQ branch

  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  2009/10/14 - hrode: modules/pftijah/pftijah.mx,1.240
  - introduce fragmented indexing on XQuery level
  - fixed bugs in MIL level function for fragmented indexing
  - adapted the test cases
  
  what is still missing is querying on the fragmented index,
  but the default for indexing is currently set to create only
  one single index, which allows querying to work as before
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


Index: pftijah.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah.mx,v
retrieving revision 1.226.2.12
retrieving revision 1.226.2.13
diff -u -d -r1.226.2.12 -r1.226.2.13
--- pftijah.mx  8 Oct 2009 11:29:30 -0000       1.226.2.12
+++ pftijah.mx  14 Oct 2009 12:06:43 -0000      1.226.2.13
@@ -385,24 +385,6 @@
 const dflt_bg_index   := "DFLT_FT_INDEX";
 const dflt_score_base := "0";
 
-PROC tj_get_ft_index(BAT[str,str] tj_options, bit chk_exists) : str 
-{
-    var res := dflt_ft_index;
-    if ( tj_options.exist("ft-index") ) {
-        res := tj_options.find("ft-index");
-    }
-    if ( chk_exists ) {
-      if ( not(view_bbp_name().reverse().exist("tj_" + res + "_size")) ) {
-        ERROR("tj_get_ft_index, ft-index \"%s\" does not exist. create full 
text index using tijah:create-ft-index()\n",res);
-      }
-      if ( not(view_bbp_name().reverse().exist("tj_" + res + "_TagSize")) ) {
-        ERROR("tj_get_ft_index, ft-index \"%s\" does not exist. create full 
text index using tijah:create-ft-index()\n",res);
-      }
-    }
-    if ( verbose ) printf(HASH +"TJ:tj_get_ft_index() = %s.\n",res);
-    return res;
-}
-
 ADDHELP("tj_init_global", "flokstra & rode", "Jan 2007",
 "PARAMETERS:\n\
 - optional BAT[str,str] param: initialization parameter for global pftijah.\n\
@@ -829,67 +811,6 @@
 
 
 #
-# Start of ft-index / pf-collection dependency module
-#
-
-PROC modify_pfc_fti(str fti_name, BAT[void,str]  v_pfc) : BAT[str,str] 
-{
-    var result;
-
-    lock_set(tj_dep_lock);
-    var err := CATCH({
-       var glb_fti_pfc     := bat("tj_pfc_fti_dep");
-       var fti_dep := glb_fti_pfc.reverse().select(fti_name).reverse();
-       if ( fti_dep.select("*").count_wrd() > wrd(0) ) {
-           ERROR(HASH + " pfc_fti_dep: unable to extend ft-index when created 
with *\n");
-       }
-       var new_fti_pfc := 
v_pfc.tunique().project(fti_name).reverse().sunique().tdiff(fti_dep);
-       if ( verbose ) {
-           printf(HASH +"TJ:modify_pfc_fti: ***** added dep ***\n");
-           new_fti_pfc.print();
-       }
-       var sz := new_fti_pfc.count_wrd();
-       if ( sz > wrd(0) ) {
-           if ( sz = wrd(1) ) {
-               glb_fti_pfc.insert(new_fti_pfc);
-               if ( new_fti_pfc.uselect("*").count_wrd() > wrd(0) ) {
-                   bat("tj_pfc_fti_dep_star").insert(new_fti_pfc);
-               }
-           } else {
-               if ( new_fti_pfc.uselect("*").count_wrd() > wrd(0) ) {
-                   ERROR(HASH + " pfc_fti_dep: when using * it must be the 
only collection dependency.\n");
-               }
-               glb_fti_pfc.insert(new_fti_pfc);
-           }
-       }
-       result := new_fti_pfc;
-    });
-    if ( verbose ) {
-       printf("\n" + HASH + " LOG modify_pfc_fti(\"%s\") START, v_pfc =\n", 
fti_name);
-       v_pfc.print();
-       printf(HASH +" Dependence BATs are [STAR|ALL]:\n");
-       bat("tj_pfc_fti_dep_star").print();
-       bat("tj_pfc_fti_dep").print();
-       printf(HASH +" LOG modify_pfc_fti(\"%s\") END.\n", fti_name);
-    }
-    lock_unset(tj_dep_lock);
-    if (not(isnil(err))) ERROR(err);
-    #
-    return result;
-}
-
-PROC delete_pfc_fti(str fti_name) :void 
-{
-    lock_set(tj_dep_lock);
-    var err := CATCH({
-       bat("tj_pfc_fti_dep").delete(fti_name);
-       bat("tj_pfc_fti_dep_star").delete(fti_name);
-    });
-    lock_unset(tj_dep_lock);
-    if (not(isnil(err))) ERROR(err);
-}
-
-#
 # End of ft-index / pf-collection dependency module
 #
 
@@ -1369,6 +1290,24 @@
 #                                                                   #
 #####################################################################
 
+PROC tj_get_ft_index(BAT[str,str] tj_options, bit chk_exists) : str 
+{
+    var res := dflt_ft_index;
+    if ( tj_options.exist("ft-index") ) {
+        res := tj_options.find("ft-index");
+    }
+    if ( chk_exists ) {
+      if ( not(view_bbp_name().reverse().exist("tj_" + res + "_param")) ) {
+        ERROR("tj_get_ft_index, ft-index \"%s\" does not exist. create full 
text index using tijah:create-ft-index()\n",res);
+      }
+      if ( not(view_bbp_name().reverse().exist("tj_" + res + "_termdict")) ) {
+        ERROR("tj_get_ft_index, ft-index \"%s\" does not exist. create full 
text index using tijah:create-ft-index()\n",res);
+      }
+    }
+    if ( verbose ) printf(HASH + " TJ:tj_get_ft_index() = %s.\n",res);
+    return res;
+}
+
 # set a collection parameter
 PROC _tj_set_parameter2(BAT[str,bat] collBat, str par, str val) : void
 {
@@ -1487,7 +1426,7 @@
       var tagfilter      := "";
       var whitelist      := "";
       var blacklist      := "";
-      var fragsize       := "0";
+      var fragsize       := str(INT_MAX / 2);
       var delay_finalize := "0";
   
       pa...@batloop() {
@@ -1697,6 +1636,15 @@
       return commitBats;
 }    
 
+#
+# convenience function
+#
+PROC tj_add2collection_frag(str ftiName, str uri, str filename, bit shred) : 
void
+{
+      var uris := new(str,str).insert(uri, filename);
+      return tj_add2collection_frag(ftiName, uris, shred);
+}
+
 ADDHELP("tj_add2collection_frag", "flokstra & rode", "Sept 2009",
 "PARAMETERS:\n\
 -` str ftiName: the name of the collection.\n
@@ -1903,9 +1851,9 @@
                i := nil;
                tmp := nil;
                tmpsize := nil;
-               replaceBats.insert(collBat.find("_tagIndex").bbpname(), "tj_" + 
ftindex + "_TagIndex");
-                replaceBats.insert(collBat.find("_tags").bbpname(), "tj_" + 
ftindex + "_Tags");
-                replaceBats.insert(collBat.find("_tagSize").bbpname(), "tj_" + 
ftindex + "_TagSize");
+               replaceBats.insert("_tagIndex", "tj_" + ftindex + "_TagIndex");
+                replaceBats.insert("_tags", "tj_" + ftindex + "_Tags");
+                replaceBats.insert("_tagSize", "tj_" + ftindex + "_TagSize");
                submitBats.append("tj_" + ftindex + "_TagIndex");
                submitBats.append("tj_" + ftindex + "_Tags");
                submitBats.append("tj_" + ftindex + "_TagSize");
@@ -1924,8 +1872,8 @@
                 collBat.replace("_terms", i.fetch(1).access(BAT_READ).mmap(1));
                i := nil;
                tmp := nil;
-               replaceBats.insert(collBat.find("_termIndex").bbpname(), "tj_" 
+ ftindex + "_TermIndex");
-                replaceBats.insert(collBat.find("_terms").bbpname(), "tj_" + 
ftindex + "_Terms");
+               replaceBats.insert("_termIndex", "tj_" + ftindex + 
"_TermIndex");
+                replaceBats.insert("_terms", "tj_" + ftindex + "_Terms");
                submitBats.append("tj_" + ftindex + "_TermIndex");
                submitBats.append("tj_" + ftindex + "_Terms");
                submitBats.append(tf.bbpname());
@@ -1946,6 +1894,9 @@
                tags := nil;
                tagsize.persists(true).access(BAT_READ).mmap(1).bbpname("tj_" + 
ftindex + "_TagSize");
                tagsize := nil;
+                collBat.insert("_tagIndex", bat("tj_" + ftindex + 
"_TagIndex"));
+                collBat.insert("_tags", bat("tj_" + ftindex + "_Tags"));
+                collBat.insert("_tagSize", bat("tj_" + ftindex + "_TagSize"));
                submitBats.append("tj_" + ftindex + "_TagIndex");
                submitBats.append("tj_" + ftindex + "_Tags");
                submitBats.append("tj_" + ftindex + "_TagSize");
@@ -1965,6 +1916,8 @@
                terms.persists(true).access(BAT_READ).mmap(1).bbpname("tj_" + 
ftindex + "_Terms");
                termindex := nil;
                terms := nil;
+                collBat.insert("_termIndex", bat("tj_" + ftindex + 
"_TermIndex"));
+                collBat.insert("_terms", bat("tj_" + ftindex + "_Terms"));
                submitBats.append("tj_" + ftindex + "_TermIndex");
                submitBats.append("tj_" + ftindex + "_Terms");
                submitBats.append(tf.bbpname());
@@ -2000,7 +1953,7 @@
       var replaceBats := commitBats.find("replaceBats");
       replaceb...@batloop() {
         bat($t).persists(false).rename("del_" + $t);
-        bat($h).persists(true).bbpname($t);
+        collBat.find($h).persists(true).bbpname($t);
       }
     
       var submitBats := commitBats.find("submitBats");
@@ -2011,6 +1964,7 @@
 
 PROC tj_collection_delete(str ftiName) : void
 {
+      delete_pfc_fti(ftiName);
       var tj_bats := view_bbp_name().like("tj_" + ftiName).tmark();
       [persists]([bat](tj_bats), const false);
       subcommit(tj_bats);
@@ -2023,6 +1977,67 @@
       subcommit(tj_bats);
 }
 
+#
+# Start of ft-index / pf-collection dependency module
+#
+
+PROC modify_pfc_fti(str fti_name, BAT[void,str]  v_pfc) : BAT[str,str] 
+{
+    var result;
+
+    lock_set(tj_dep_lock);
+    var err := CATCH({
+       var glb_fti_pfc     := bat("tj_pfc_fti_dep");
+       var fti_dep := glb_fti_pfc.reverse().select(fti_name).reverse();
+       if ( fti_dep.select("*").count_wrd() > wrd(0) ) {
+           ERROR(HASH + " pfc_fti_dep: unable to extend ft-index when created 
with *\n");
+       }
+       var new_fti_pfc := 
v_pfc.tunique().project(fti_name).reverse().sunique().tdiff(fti_dep);
+       if ( verbose ) {
+           printf(HASH +"TJ:modify_pfc_fti: ***** added dep ***\n");
+           new_fti_pfc.print();
+       }
+       var sz := new_fti_pfc.count_wrd();
+       if ( sz > wrd(0) ) {
+           if ( sz = wrd(1) ) {
+               glb_fti_pfc.insert(new_fti_pfc);
+               if ( new_fti_pfc.uselect("*").count_wrd() > wrd(0) ) {
+                   bat("tj_pfc_fti_dep_star").insert(new_fti_pfc);
+               }
+           } else {
+               if ( new_fti_pfc.uselect("*").count_wrd() > wrd(0) ) {
+                   ERROR(HASH + " pfc_fti_dep: when using * it must be the 
only collection dependency.\n");
+               }
+               glb_fti_pfc.insert(new_fti_pfc);
+           }
+       }
+       result := new_fti_pfc;
+    });
+    if ( verbose ) {
+       printf("\n" + HASH + " LOG modify_pfc_fti(\"%s\") START, v_pfc =\n", 
fti_name);
+       v_pfc.print();
+       printf(HASH +" Dependence BATs are [STAR|ALL]:\n");
+       bat("tj_pfc_fti_dep_star").print();
+       bat("tj_pfc_fti_dep").print();
+       printf(HASH +" LOG modify_pfc_fti(\"%s\") END.\n", fti_name);
+    }
+    lock_unset(tj_dep_lock);
+    if (not(isnil(err))) ERROR(err);
+    #
+    return result;
+}
+
+PROC delete_pfc_fti(str fti_name) :void 
+{
+    lock_set(tj_dep_lock);
+    var err := CATCH({
+       bat("tj_pfc_fti_dep").delete(fti_name);
+       bat("tj_pfc_fti_dep_star").delete(fti_name);
+    });
+    lock_unset(tj_dep_lock);
+    if (not(isnil(err))) ERROR(err);
+}
+
 
 
 #####################################################################
@@ -2377,7 +2392,7 @@
 }
 
 # temporary algebra query handler
-PROC ALG_tj_query_handler(
+PROC ALG_tj_query_handler2(
         bit par_storeScore,
         BAT[oid,bat] pfop_sn,
         BAT[oid,bat] pfop_query,
@@ -2425,8 +2440,8 @@
        optbat := new(str,str,32);
       }
       if ( verbose ) printf(HASH +" ALG_tj_query_handler: handle 
startNodes.\n");
-      var ftindex := tj_get_ft_index(optbat,true);
-      var tijah_lock := tj_get_collection_lock(ftindex);
+      var ftiName := tj_get_ft_index(optbat,true);
+      var tijah_lock := tj_get_collection_lock(ftiName);
       lock_set(tijah_lock);
       var err := CATCH({
         var startNodes;
@@ -2439,9 +2454,9 @@
          sn_item := sn_item.tmark(0...@0);
          sn_kind := sn_kind.tmark(0...@0);
 
-         var xdoc_name := bat("tj_" + ftindex + "_doc_name");
-         var xdoc_firstpre := bat("tj_" + ftindex + "_doc_firstpre");
-         var xpfpre := bat("tj_" + ftindex + "_pfpre");
+         var xdoc_name := bat("tj_" + ftiName + "0_doc_name");
+         var xdoc_firstpre := bat("tj_" + ftiName + "0_doc_firstpre");
+         var xpfpre := bat("tj_" + ftiName + "0_pfpre");
          var doc_loaded := 
par_ws.fetch(CONT_COLL).join(bat("doc_collection").reverse()).join(bat("doc_name"));
          if ( verbose ) printf(HASH +" ALG_tj_query_handler: compute 
startnodes\n");
          startNodes := 
pf2tijah_node(false,xdoc_name,xdoc_firstpre,xpfpre,sn_item,[int](sn_kind),doc_loaded);
@@ -2451,7 +2466,7 @@
         optbat.access(BAT_WRITE);
         optbat.insert("_query",pfop_query.fetch(1...@0).fetch(int($h)));
         if ( verbose ) printf(HASH +" ALG_tj_query_handler: run tijah 
query.\n");
-        var nexi_allscores := 
run_tijah_query(ftindex,optbat,has_sn,startNodes);
+        var nexi_allscores := 
run_tijah_query(ftiName,optbat,has_sn,startNodes);
         var nexi_score;
         if ( verbose ) printf(HASH +" ALG_tj_query_handler: handling 
scores.\n");
         if ( optbat.exist("returnNumber") ) {
@@ -2460,12 +2475,174 @@
         } else {
          nexi_score := nexi_allscores;
         }
-        var docpre := bat("tj_" + ftindex + "_doc_firstpre").[oid]();
-        var pfpre  :=  bat("tj_" + ftindex + "_pfpre");
+        var docpre := bat("tj_" + ftiName + "0_doc_firstpre").[oid]();
+        var pfpre  :=  bat("tj_" + ftiName + "0_pfpre");
         var item   := nexi_score.hmark(0...@0);
         var frag := [find_lower](const docpre.reverse().mark(0...@0), item);
         item := item.join(pfpre).sort().tmark();
-        var needed_docs := bat("tj_" + ftindex + 
"_doc_name").semijoin(frag.tunique());
+        var needed_docs := bat("tj_" + ftiName + 
"0_doc_name").semijoin(frag.tunique());
+        var loaded_docs := par_ws.fetch(OPEN_NAME).reverse();
+        var docs_to_load := 
kdiff(needed_docs.reverse(),loaded_docs).hmark(0...@0);
+        ws_opendoc(par_ws, docs_to_load);
+        var doc_loaded := 
reverse(par_ws.fetch(OPEN_CONT)).leftfetchjoin(par_ws.fetch(OPEN_NAME));
+       # On the forced document loading size we keep using the old interface
+       # until ws.fetch(OPEN_CONT|OPEN_DOC) disappears. This interface is
+       # much cheaper for us and works also correct when the OPEN_CONT|DOC
+       # bats are not complete.
+       # var doc_loaded := 
par_ws.fetch(CONT_COLL).join(bat("doc_collection").reverse()).join(bat("doc_name"));
+        var fid_pffid := needed_docs.join(doc_loaded.reverse());
+        frag := frag.join(fid_pffid).sort().tmark();
+        if ( verbose ) printf(HASH +" ALG_tj_query_handler: handled new 
frags/documents.\n");
+        if ( par_storeScore ) {
+         var tID := oid(par_scoreDB.fetch(0...@0).count_wrd() + 8888);
+         
par_scoreDB.fetch(4...@0).insert(lng(tID),lng(nexi_allscores.count_wrd()));
+         par_scoreDB.fetch(0...@0).append(item.project(tID));
+         par_scoreDB.fetch(1...@0).append(frag);
+         par_scoreDB.fetch(2...@0).append(item);
+         par_scoreDB.fetch(3...@0).append(nexi_score.tmark());
+         result_id.append(lng(tID));
+         if ( verbose ) printf(HASH +" ALG_tj_query_handler: stored loop 
score.\n");
+        } else {
+         result_iter.append(item.project($t));
+         result_pos.append(item.mark(1...@0));
+         result_frag.append(frag);
+         result_item.append(item);
+        }
+      });
+      lock_unset(tijah_lock);
+      if ( verbose ) printf(HASH +" ALG_tj_query_handler: released lock.\n");
+      if (not(isnil(err))) ERROR(err);
+       if ( verbose ) printf(HASH +" ALG_tj_query_handler: stored loop nodes 
in result.\n");
+      if ( verbose ) printf(HASH +" ALG_tj_query_handler: loop finish, 
id=%d.\n",$t);
+     } # end batloop over queries
+     if ( verbose ) printf(HASH +" ALG_tj_query_handler: batloop finished.\n");
+     var iter;
+     var item;
+     var ipik;
+     var kind;
+     var pos;
+     if ( par_storeScore ) {
+      if ( verbose ) printf(HASH +" ALG_tj_query_handler: create int 
return.\n");
+      item := result_id;
+      iter := par_loop.tmark(oid(0));
+      ipik := iter;
+      pos  := oid(1);
+      kind := new(oid,oid);
+     } else {
+      if ( verbose ) printf(HASH +" ALG_tj_query_handler: create node 
return.\n");
+      iter := result_iter;
+      pos  := result_pos;
+      kind := result_frag;
+      item := result_item;
+      ipik := iter;
+     }
+      if ( verbose ) {
+         printf(HASH +" ALG_tj_query_handler: iter/item/kind/pos result 
start\n");
+         iter.print();
+         item.print();
+         kind.print();
+         pos.print();
+         printf(HASH +" ALG_tj_query_handler: iter/item/kind/pos result 
finish\n");
+      }
+     var res := ALG_tj_pfop(iter,item,kind,pos.materialize(ipik));
+     #
+     if ( verbose ) printf(HASH +" ALG_tj_query_handler: FINISH.\n");
+     return res;
+}
+
+# temporary algebra query handler
+PROC ALG_tj_query_handler(
+        bit par_storeScore,
+        BAT[oid,bat] pfop_sn,
+        BAT[oid,bat] pfop_query,
+        BAT[oid,bat] pfop_opt,
+        BAT[void,any] par_loop,
+        BAT[oid,bat] par_ws,
+        BAT[oid,bat] par_scoreDB
+        ) : BAT[void,bat] 
+{
+     var result_id;
+     var result_iter;
+     var result_item;
+     var result_pos;
+     var result_frag;
+
+    if ( verbose ) printf(HASH +" ALG_tj_query_handler: START.\n");
+     if ( par_storeScore ) {
+      if ( verbose ) printf(HASH +" ALG_tj_query_handler: storeScore=TRUE.\n");
+      result_id   := new(void,lng).seqbase(0...@0);
+     } else {
+      if ( verbose ) printf(HASH +" ALG_tj_query_handler: 
storeScore=FALSE.\n");
+      result_iter := new(void,oid).seqbase(0...@0);
+      result_item := new(void,oid).seqbase(0...@0);
+      result_pos  := new(void,oid).seqbase(0...@0);
+      result_frag := new(void,oid).seqbase(0...@0);
+     }
+
+     var has_sn      := (pfop_sn.count_wrd() > wrd(0));
+     var has_options := (pfop_opt.count_wrd() > wrd(0));
+
+     par_l...@batloop() {
+      var optbat;
+      if ( verbose ) printf(HASH +" ALG_tj_query_handler: loop start, 
id=%d.\n",$t);
+      if ( has_options ) {
+       if ( verbose ) printf(HASH +" ALG_tj_query_handler: running option 
handler.\n");
+       var opt_iter := pfop_opt.fetch(0...@0).select($t);
+       var opt_item := pfop_opt.fetch(1...@0).semijoin(opt_iter);
+       var opt_kind := pfop_opt.fetch(2...@0).semijoin(opt_iter);
+       opt_iter := opt_iter.tmark(0...@0);
+       opt_item := opt_item.tmark(0...@0);
+       opt_kind := opt_kind.tmark(0...@0);
+       optbat := 
serialize_tijah_opt(par_ws,1,opt_iter,opt_iter,opt_item,set_kind(opt_kind,ELEM),new(void,lng),new(void,dbl),new(void,str));
+       if ( verbose ) optbat.print();
+      } else {
+       optbat := new(str,str,32);
+      }
+      if ( verbose ) printf(HASH +" ALG_tj_query_handler: handle 
startNodes.\n");
+      var ftiName := tj_get_ft_index(optbat,true);
+      var tijah_lock := tj_get_collection_lock(ftiName);
+      lock_set(tijah_lock);
+      var err := CATCH({
+        var startNodes;
+        if ( has_sn ) {
+         var sn_iter := pfop_sn.fetch(0...@0);
+         var sn_iteration := pfop_query.fetch(0...@0).fetch(int($h));
+         sn_iter := sn_iter.select(sn_iteration);
+         var sn_item := pfop_sn.fetch(1...@0).semijoin(sn_iter);
+         var sn_kind := pfop_sn.fetch(2...@0).semijoin(sn_iter);
+         sn_item := sn_item.tmark(0...@0);
+         sn_kind := sn_kind.tmark(0...@0);
+
+         var xdoc_name := bat("tj_" + ftiName + "0_doc_name");
+         var xdoc_firstpre := bat("tj_" + ftiName + "0_doc_firstpre");
+         var xpfpre := bat("tj_" + ftiName + "0_pfpre");
+         var doc_loaded := 
par_ws.fetch(CONT_COLL).join(bat("doc_collection").reverse()).join(bat("doc_name"));
+         if ( verbose ) printf(HASH +" ALG_tj_query_handler: compute 
startnodes\n");
+         startNodes := 
pf2tijah_node(false,xdoc_name,xdoc_firstpre,xpfpre,sn_item,[int](sn_kind),doc_loaded);
+        } else {
+         startNodes := new(void,oid);
+        }
+        optbat.access(BAT_WRITE);
+        optbat.insert("_query",pfop_query.fetch(1...@0).fetch(int($h)));
+        if ( verbose ) printf(HASH +" ALG_tj_query_handler: run tijah 
query.\n");
+        var nexi_allscores := 
run_tijah_query(ftiName,optbat,has_sn,startNodes);
+        var nexi_score;
+        if ( verbose ) printf(HASH +" ALG_tj_query_handler: handling 
scores.\n");
+        if ( optbat.exist("returnNumber") ) {
+         var retNum := int(optbat.find("returnNumber"));
+         nexi_score := nexi_allscores.slice(0, retNum - 1);
+        } else {
+         nexi_score := nexi_allscores;
+        }
+
+        var docpre := bat("tj_" + ftiName + "0_doc_firstpre").[oid]();
+        var pfpre  :=  bat("tj_" + ftiName + "0_pfpre");
+        var item   := nexi_score.hmark(0...@0);
+        var frag := [find_lower](const docpre.reverse().mark(0...@0), item);
+        item := item.join(pfpre).sort().tmark();
+        var needed_docs := bat("tj_" + ftiName + 
"0_doc_name").semijoin(frag.tunique());
+
+
         var loaded_docs := par_ws.fetch(OPEN_NAME).reverse();
         var docs_to_load := 
kdiff(needed_docs.reverse(),loaded_docs).hmark(0...@0);
         ws_opendoc(par_ws, docs_to_load);
@@ -2574,8 +2751,8 @@
         var iter_tjPre;
         var ftc_term := pfop_query.fetch(1...@0).fetch(int($h));
 
-        var ftindex := dflt_ft_index; # incomplete
-        var tijah_lock := tj_get_collection_lock(ftindex);
+        var ftiName := dflt_ft_index; # incomplete
+        var tijah_lock := tj_get_collection_lock(ftiName);
         lock_set(tijah_lock);
         var err := CATCH({
           var sn_iter := pfop_sn.fetch(0...@0);
@@ -2584,9 +2761,9 @@
           var sn_item := pfop_sn.fetch(1...@0).semijoin(sn_iter);
           var sn_kind := pfop_sn.fetch(2...@0).semijoin(sn_iter);
   
-          var xdoc_name := bat("tj_" + ftindex + "_doc_name");
-          var xdoc_firstpre := bat("tj_" + ftindex + "_doc_firstpre");
-          var xpfpre := bat("tj_" + ftindex + "_pfpre");
+          var xdoc_name := bat("tj_" + ftiName + "0_doc_name");
+          var xdoc_firstpre := bat("tj_" + ftiName + "0_doc_firstpre");
+          var xpfpre := bat("tj_" + ftiName + "0_pfpre");
           var doc_loaded := 
par_ws.fetch(CONT_COLL).join(bat("doc_collection").reverse()).join(bat("doc_name"));
           if ( verbose ) printf(HASH +" ALG_tj_ftfun_handler: compute 
startnodes\n");
           iter_tjPre := 
pf2tijah_node(true,xdoc_name,xdoc_firstpre,xpfpre,sn_item,[int](sn_kind),doc_loaded);
@@ -2595,8 +2772,8 @@
          var Q := tijah_tokenize2bat(ftc_term).reverse().project(dbl(1.0));
          Q := tj_term2tid(Q);
 
-         # tj_init_termHash(ftindex);
-         # tj_init_tagHash(ftindex);
+         # tj_init_termHash(ftiName);
+         # tj_init_tagHash(ftiName);
          var scorebase := dbl(0.000000);
          var c_lambda  := dbl(0.800000);
          var okapi_k1  := dbl(1.200000);
@@ -2704,7 +2881,7 @@
 #                   stream_nil, ws_id(ws));
 #}
 
-PROC ALG_tj_docmgmt_tape(BAT[str,bat] tape,
+PROC ALG_tj_docmgmt_tape2(BAT[str,bat] tape,
                         BAT[void,BAT] ws,
                         BAT[void,str] location,
                         BAT[void,str] docnames,
@@ -2779,6 +2956,97 @@
        return true;
 }
 
+PROC ALG_tj_docmgmt_tape(BAT[str,bat] tape,
+                        BAT[void,BAT] ws,
+                        BAT[void,str] location,
+                        BAT[void,str] docnames,
+                        BAT[void,str] colnames,
+                        BAT[void,lng] percentages) : bit 
+{
+        if ( verbose ) printf(HASH +" ALG_tj_docmgmt_tape: START.\n");
+
+       # analyse doctape, create new collection dependencies, and
+        # add documents to the doclist for each ft-index (fti_cluster) 
+
+        var fti_cluster := new(str,bat);
+        t...@batloop() {
+           var op       := $h;
+           var collbat  := $t.fetch(0...@0);
+           var optbat   := $t.fetch(1...@0);
+           var ftiName  := tj_get_ft_index(optbat,(op!="create"));
+            var doclist;       
+            if ( fti_cluster.exist(ftiName) ) {
+              doclist := fti_cluster.find(ftiName);
+            } else {
+              doclist := new(str,str);
+              fti_cluster.insert(ftiName, doclist);
+            }
+
+           if ( op = "create" ) {
+                if ( verbose ) printf(HASH +" ALG_tj_docmgmt_tape: 
init_collection(%s).\n",ftiName);
+               tj_init_collection_base(ftiName, optbat);
+                modify_pfc_fti(ftiName, collbat);
+                if ( collbat.uselect("*").count_wrd() > wrd(0) ) {
+                   var tmp := bat("doc_name");
+                   doclist.insert(tmp.reverse().project(str(nil)).reverse());
+                } else {
+                   var tmp := 
bat("doc_name").semijoin(bat("doc_collection").join(bat("collection_name").join(collbat.reverse())));
+                   doclist.insert(tmp.reverse().project(str(nil)).reverse());
+                }
+           } else if ( op = "extend" ) {
+                if ( verbose ) printf(HASH +" ALG_tj_docmgmt_tape: 
extend_collection(%s).\n",ftiName);
+                modify_pfc_fti(ftiName, collbat);
+                if ( collbat.uselect("*").count_wrd() > wrd(0) ) {
+                   var tmp := bat("doc_name");
+                   doclist.insert(tmp.reverse().project(str(nil)).reverse());
+                } else {
+                   var tmp := 
bat("doc_name").semijoin(bat("doc_collection").join(bat("collection_name").join(collbat.reverse())));
+                   doclist.insert(tmp.reverse().project(str(nil)).reverse());
+                }
+           } else if ( op = "remove" ) {
+                if ( verbose ) printf(HASH +" ALG_tj_docmgmt_tape: 
tj_delete_collection(%s).\n",ftiName);
+               tj_collection_delete(ftiName);
+           } else {
+               ERROR("ALG_tj_docmgmt_tape: unknown op");
+           }
+       }
+
+        # commit dependency bats, since they might be changed
+        var submit_bats := new(void,str).seqbase(0...@0);
+        submit_bats.append("tj_pfc_fti_dep");
+        submit_bats.append("tj_pfc_fti_dep_star");
+        subcommit(submit_bats);
+
+       # determine which documents added to pathfinder have to get indexed by 
pf/tijah due to dependencies
+        # add those documents to the doclist for each ft-index (fti_cluster) 
+        var pfc_name   := docnames.reverse().leftfetchjoin(colnames);
+        var pfdep      := bat("tj_pfc_fti_dep");
+        var pfdep_star := bat("tj_pfc_fti_dep_star");
+        
+        var fti_dname  := pfdep.join(pfc_name.reverse());
+        if ( pfdep_star.count_wrd() > wrd(0) ) {
+            fti_dname.insert(pfdep_star.cross(pfc_name.reverse()));
+        }
+
+        fti_dn...@batloop() {
+            var cb;
+            if ( fti_cluster.exist($h) ) {
+              cb := fti_cluster.find($h);
+            } else {
+              cb := new(str,str);
+              fti_cluster.insert($h,cb);
+            }
+            cb.insert(str(nil),$t);
+        }
+        fti_clus...@batloop() {
+            if ( verbose ) { printf(HASH +"TJ:tj_play_doc_tape() doing 
ft-index \"%s\".\n",$h); $t.print(); }
+            tj_add2collection_frag($h,$t,false);
+        }
+
+        if ( verbose ) printf(HASH +" ALG_tj_docmgmt_tape: FINISH.\n");
+       return true;
+}
+
 var tracefile_handle := nil;
 PROC tj_trace( str s ) : void  
 {
@@ -3823,6 +4091,36 @@
 #"TIJAH");
 
 
################################################################################
+# Pathfinder PRE output 
+################################################################################
+
+##
+# Map all pre identifiers to their corresponding pathfinder pre IDs / documents
+# Returns a bat [void, bat]: bat1 (rank,pfpre), bat2 (rank,score), bat3 
(rank,docname)
+##
+PROC tj_pre2pfpre(bat[oid,dbl] pre_score) : bat[void,bat] 
+{
+    var t_total := 0 - time();
+
+    if (pre_score.count_wrd() = wrd(0)) return new(str,dbl);
+    var pres := pre_score.hmark(0...@0);
+    var scores := pre_score.tmark(0...@0);
+    var pfpres := pres.leftjoin(bat("tj_" + ftindex + "_pfpre")).tmark(0...@0);
+    var docpre := bat("tj_" + ftindex + "_doc_firstpre");
+    var mark_did := [find_lower](const docpre.reverse(), pres);
+    var mark_docname := mark_did.leftjoin(bat("tj_" + ftindex + 
"_doc_name")).tmark(0...@0);
+
+    var res := new(void,bat).seqbase(0...@0);
+    res.append(pfpres);
+    res.append(scores);
+    res.append(mark_docname);
+
+    t_total :+= time();
+    if (timing) printf(HASH +" add inexpath timing: total: %d\n", t_total);
+    return res;
+}
+
+################################################################################
 # INEX output 
 
################################################################################
 
@@ -3856,8 +4154,8 @@
 
################################################################################
 
 ##
-# Map all pre identifiers to their stored inexpath expressions
-# Returns a bat [pre, any].
+# Merge the results of all index fragments.
+# Returns a bat [void, bat]: bat1 (rank,pfpre), bat2 (rank,score), bat3 
(rank,docname)
 ##
 PROC tj_merge_frag_results(bat[void,bat] res_frag, int topk) : bat[str,dbl] 
 {
@@ -3871,6 +4169,22 @@
     return res.tsort_rev();
 }
 
+##
+# Merge the results of all index fragments.
+# Returns a bat [pre, any].
+##
+PROC tj_merge_frag_results_inex(bat[void,bat] res_frag, int topk) : 
bat[str,dbl] 
+{
+    var res := new(str,dbl);
+    res_f...@batloop(){
+        res.insert($t);
+    }
+    if (topk > 0) {
+        return res.tsort_rev().slice(0, topk - 1);
+    }
+    return res.tsort_rev();
+}
+
 
 #####################################################################
 #


------------------------------------------------------------------------------
Come build with us! The BlackBerry(R) Developer Conference in SF, CA
is the only developer event you need to attend this year. Jumpstart your
developing skills, take BlackBerry mobile applications to market and stay 
ahead of the curve. Join us from November 9 - 12, 2009. Register now!
http://p.sf.net/sfu/devconference
_______________________________________________
Monetdb-pf-checkins mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins

Reply via email to