Update of /cvsroot/monetdb/pathfinder/modules/pftijah
In directory sc8-pr-cvs16.sourceforge.net:/tmp/cvs-serv32237

Modified Files:
        pftijah.mx 
Log Message:
- Create a new tj_rebuild_collection(str ftiName) function. It completely
  rebuilds the existing ft-index according to the previously specified
  pf-collection/ft-index dependencies. This function can be used as a first,
  crude way of handling document deletions and updates.



Index: pftijah.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah.mx,v
retrieving revision 1.140
retrieving revision 1.141
diff -u -d -r1.140 -r1.141
--- pftijah.mx  13 Jun 2007 16:59:21 -0000      1.140
+++ pftijah.mx  14 Jun 2007 07:56:54 -0000      1.141
@@ -314,25 +314,6 @@
 
 # var GQENV := create_qenv(dflt_ft_index,dflt_bg_index,dflt_score_base);
 
-PROC run_tijah_query(str ftiName, BAT[str,str] opt, bit use_startnodes, 
BAT[void,oid] nodes, str q) : BAT[oid,dbl] :=
-{
-       if ( verbose ) printf("#TJ:run_tijah_query(\"%s\",..) 
called.\n",ftiName);
-       var parambat := bat("tj_" + ftiName + "_param");
-       var delfin   := lng(parambat.find("delay_finalize"));
-        if ( delfin > lng(0) ) {
-           if ( verbose ) printf("#TJ:run_tijah_query(\"%s\",..) checking 
delayed finalize.\n",ftiName);
-            var finlast  := lng(parambat.find("_last_finalizedPre"));
-            var prelast  := lng(parambat.find("_last_tijahPre"));
-           if ( not(prelast=finlast) ) {
-               if ( verbose ) printf("#TJ:run_tijah_query(\"%s\",..) 
performing delayed finalize (%d != %d).\n",ftiName,int(finlast),int(prelast));
-                var collBat := _tj_collection(ftiName);
-                _tj_finalize_collection(ftiName, collBat, TRUE);
-                _tj_commit(collBat); 
-           }
-       }
-       return _run_tijah_query(opt,use_startnodes,nodes,q);
-}
-
 PROC tj_get_ft_index(BAT[str,str] tj_options, bit chk_exists) : str :=
 {
     var res := dflt_ft_index;
@@ -506,6 +487,22 @@
 "pftijah");
 PROC tj_init_collection(str ftiName, BAT[str,str] param, BAT[void,str] v_pfc) 
: void :=
 {
+    _tj_init_collection(ftiName,param,v_pfc,FALSE);
+}
+
+PROC tj_init_collection(str ftiName, BAT[str,str] param) : void :=
+{
+    _tj_init_collection(ftiName,param,new(void,str),FALSE);
+}
+
+PROC tj_rebuild_collection(str ftiName) : void  
+{
+    # WARNING: not thoroughly tested yet
+    _tj_init_collection(ftiName,new(str,str),new(void,str),TRUE);
+}
+
+PROC _tj_init_collection(str ftiName, BAT[str,str] param, BAT[void,str] v_pfc, 
bit rebuild) : void :=
+{
     var coll_lock;
 
     #
@@ -519,17 +516,24 @@
        tj_init_global(new(str,str),false); # just in case 
       }
       if (bat("tj_collName").reverse().exist(ftiName)) {
-       ERROR("tj_init_collection, pftijah collection already exists: 
%s\n",ftiName);
-      }
-      #
-      var coll_oid;
-      if (bat("tj_collName").count() = 0) { 
-        coll_oid := [EMAIL PROTECTED];
+       if ( not(rebuild) )
+         ERROR("tj_init_collection, pftijah collection already exists: 
%s\n",ftiName);
       } else {
-        coll_oid := oid(int(bat("tj_collName").reverse().max()) + 1);
+        if (rebuild) {
+         ERROR("tj_rebuild_collection, pftijah collection \"%s\" does not 
exists\n",ftiName);
+       }
       }
-      bat("tj_collName").insert(coll_oid, ftiName);
       #
+      if ( not(rebuild) ) {
+          var coll_oid;
+          if (bat("tj_collName").count() = 0) { 
+            coll_oid := [EMAIL PROTECTED];
+          } else {
+            coll_oid := oid(int(bat("tj_collName").reverse().max()) + 1);
+          }
+          bat("tj_collName").insert(coll_oid, ftiName);
+          #
+      }
       coll_lock := tj_get_collection_lock(ftiName);
     });
     lock_unset(tj_adm_lock);
@@ -537,34 +541,71 @@
     #
     # now the collection stuff 
     #
+    var extra_del_bat;
     lock_set(coll_lock);
     var err := CATCH({
       if ( not(GLOBAL_TTBAT) ) {
-          new(oid,str).persists(true).bbpname(_tj_TermBat(ftiName));
-          new(oid,str).persists(true).bbpname(_tj_TagBat(ftiName));
+         if ( rebuild ) {
+             # INCOMPLETE, not throwing them away is much faster!!!
+              bat(_tj_TermBat(ftiName)).delete();
+              bat(_tj_TagBat(ftiName)).delete();
+         } else {
+              new(oid,str).persists(true).bbpname(_tj_TermBat(ftiName));
+              new(oid,str).persists(true).bbpname(_tj_TagBat(ftiName));
+         }
+      }
+      extra_del_bat := new(void,str).seqbase([EMAIL PROTECTED]);
+      if ( rebuild ) {
+          bat("tj_" + ftiName + "_doc_name").delete();
+          bat("tj_" + ftiName + "_doc_firstpre").delete();
+         # do not delete the param bat
+         var frag_offset := int(2); # we delete all buns in existing 1 frag
+         var frag_last := bat("tj_" + ftiName + "_fragments").count();
+         while (frag_offset < frag_last)
+         {
+               var bn := "tj_" + ftiName + "_tid" + str(frag_offset);
+               bat(bn).persists(FALSE);
+                extra_del_bat.append(bn);
+               bn := "tj_" + ftiName + "_size" + str(frag_offset);
+               bat(bn).persists(FALSE);
+                extra_del_bat.append(bn);
+               frag_offset :+= 1;
+         }
+          bat("tj_" + ftiName + "_tid1").delete();
+          bat("tj_" + ftiName + "_size1").delete();
+          bat("tj_" + ftiName + "_fragments").delete();
+          bat("tj_" + ftiName + "_fragments").append([EMAIL PROTECTED]);
+          bat("tj_" + ftiName + "_pfpre").delete();
+      } else {
+          new(void,str).seqbase([EMAIL 
PROTECTED]).persists(true).bbpname("tj_" + ftiName + "_doc_name");
+          new(void,oid).seqbase([EMAIL 
PROTECTED]).persists(true).bbpname("tj_" + ftiName + "_doc_firstpre");
+          new(str,str).persists(true).bbpname("tj_" + ftiName + "_param");
+          
+          new(void,oid).seqbase([EMAIL 
PROTECTED]).persists(true).bbpname("tj_" + ftiName + "_tid1");
+          new(void,int).seqbase([EMAIL 
PROTECTED]).persists(true).bbpname("tj_" + ftiName + "_size1");
+          # bat contains the start oid of every tid/size frag. Head is postfix
+          # string to _tid/_size. Normally "", "2", "3"
+          new(void,oid).seqbase([EMAIL 
PROTECTED]).persists(true).bbpname("tj_" + ftiName + "_fragments");
+          bat("tj_" + ftiName + "_fragments").append([EMAIL PROTECTED]);
+          new(oid,oid).persists(true).bbpname("tj_" + ftiName + "_pfpre");
       }
-      new(void,str).seqbase([EMAIL PROTECTED]).persists(true).bbpname("tj_" + 
ftiName + "_doc_name");
-      new(void,oid).seqbase([EMAIL PROTECTED]).persists(true).bbpname("tj_" + 
ftiName + "_doc_firstpre");
-      new(str,str).persists(true).bbpname("tj_" + ftiName + "_param");
-      
-      new(void,oid).seqbase([EMAIL PROTECTED]).persists(true).bbpname("tj_" + 
ftiName + "_tid1");
-      new(void,int).seqbase([EMAIL PROTECTED]).persists(true).bbpname("tj_" + 
ftiName + "_size1");
-      # bat contains the start oid of every tid/size frag. Head is postfix
-      # string to _tid/_size. Normally "", "2", "3"
-      new(void,oid).seqbase([EMAIL PROTECTED]).persists(true).bbpname("tj_" + 
ftiName + "_fragments");
-      bat("tj_" + ftiName + "_fragments").append([EMAIL PROTECTED]);
-      new(oid,oid).persists(true).bbpname("tj_" + ftiName + "_pfpre");
          
-      #
-      # now read the param file
-      #
-      var stemmer        := "nostemming";
-      var tokenizer      := "flex";
-      var tagfilter      := "";
-      var fragsize       := "0";
-      var delay_finalize := "0";
+      var parambat := bat("tj_" + ftiName + "_param");
+      if ( rebuild ) {
+       parambat.replace("status","building");
+       parambat.replace("_last_tijahPre","1");
+       parambat.replace("_last_finalizedPre","0");
+      } else {
+        #
+        # now read the param file
+        #
+        var stemmer        := "nostemming";
+        var tokenizer      := "flex";
+        var tagfilter      := "";
+        var fragsize       := "0";
+        var delay_finalize := "0";
   
-      [EMAIL PROTECTED]() {
+        [EMAIL PROTECTED]() {
          if ( verbose ) 
printf("#TJ:tj_init_collection():param[%s]=\"%s\"\n",$h,$t);
          if ( $h = "stemmer" ) {
              stemmer := $t;
@@ -584,27 +625,44 @@
          } else {
              ERROR("# tj_init_collection() unknown parameter [%s].\n",$h);
          }
+        }
+        #
+        # now set the parameters for this collection
+        #
+        parambat.insert("_version","1.01");
+        parambat.insert("name",ftiName);
+        parambat.insert("height","0");
+        parambat.insert("tokenizer",tokenizer);
+        parambat.insert("stemmer",stemmer);
+        parambat.insert("fragmentSize",fragsize);
+        parambat.insert("tagFilter",tagfilter);
+        parambat.insert("preExpansion","4");
+        parambat.insert("status","building");
+        parambat.insert("_last_tijahPre","1");
+        parambat.insert("_last_finalizedPre","0");
+        parambat.insert("delay_finalize",delay_finalize);
+      }
+      if ( rebuild  ) {
+        # reconstruct the original v_pfc
+       v_pfc := bat("tj_pfc_fti_dep").reverse().select(ftiName).reverse();
+      } else {
+        #
+        # now modify the global fti pfc dependency administration. We may 
ignore
+        # the return value because all dependencies for this collection are 
new.
+        modify_pfc_fti(ftiName,v_pfc);
+      }
+      if ( rebuild ) {
+       if (view_bbp_name().reverse().exist("tj_" + ftiName + "_TermIndex")) {
+               bat("tj_" + ftiName + "_TermIndex").persists(false);
+               bat("tj_" + ftiName + "_Terms").persists(false);
+               bat("tj_" + ftiName + "_TagIndex").persists(false);
+               bat("tj_" + ftiName + "_Tags").persists(false);
+       }
       }
-      #
-      # now set the parameters for this collection
-      #
-      bat("tj_" + ftiName + "_param").insert("_version","1.01");
-      bat("tj_" + ftiName + "_param").insert("name",ftiName);
-      bat("tj_" + ftiName + "_param").insert("height","0");
-      bat("tj_" + ftiName + "_param").insert("tokenizer",tokenizer);
-      bat("tj_" + ftiName + "_param").insert("stemmer",stemmer);
-      bat("tj_" + ftiName + "_param").insert("fragmentSize",fragsize);
-      bat("tj_" + ftiName + "_param").insert("tagFilter",tagfilter);
-      bat("tj_" + ftiName + "_param").insert("preExpansion","4");
-      bat("tj_" + ftiName + "_param").insert("status","building");
-      bat("tj_" + ftiName + "_param").insert("_last_tijahPre","1");
-      bat("tj_" + ftiName + "_param").insert("_last_finalizedPre","0");
-      bat("tj_" + ftiName + "_param").insert("delay_finalize",delay_finalize);
-      #
-      # now modify the global fti pfc dependency administration. We may ignore
-      # the return value because all dependencies for this collection are new.
-      modify_pfc_fti(ftiName,v_pfc);
     });
+    if ( rebuild ) {
+        subcommit(extra_del_bat);
+    }
     subcommit(_tj_collection_str(ftiName));
     lock_unset(coll_lock);
     #
@@ -637,11 +695,6 @@
     if ( verbose ) printf("#TJ:tj_init_collection(\"%s\") 
finished.\n",ftiName);
 }
 
-PROC tj_init_collection(str ftiName, BAT[str,str] param) : void :=
-{
-    tj_init_collection(ftiName,param,new(void,str));
-}
-
 #
 # The tj_extend_collection() adds new pf collection dependencies to an existing
 # collection.
@@ -844,7 +897,7 @@
            curversion := "0.0";
        }
        if ( curversion < "1.0" ) {
-            ERROR("_tj_collection(): pftijah index structure changed, reindex 
collection!!");
+            ERROR("_tj_collection():%s: pftijah index structure changed, 
reindex collection!!",curversion);
        }
        tjCollBat.insert("_globalTerms", bat(_tj_TermBat(ftiName)));
        tjCollBat.insert("_globalTags", bat(_tj_TagBat(ftiName)));
@@ -1030,10 +1083,10 @@
             var prelast  := lng(parambat.find("_last_tijahPre"));
             var fdelta   := prelast - finlast;
            if ( (prelast - finlast) < delfin ) {
-                  if ( false ) printf("#TJ:_tj_finalize_collection(\"%s\") 
delaying finalization (%d < %d).\n",ftiName,int(fdelta),int(delfin));
+                  if ( verbose ) printf("#TJ:_tj_finalize_collection(\"%s\") 
delaying finalization (%d < %d).\n",ftiName,int(fdelta),int(delfin));
                  return;
            } else {
-                  if ( false ) printf("#TJ:_tj_finalize_collection(\"%s\") 
finalization treshhold reached (%d > %d).\n",ftiName,int(fdelta),int(delfin));
+                  if ( verbose ) printf("#TJ:_tj_finalize_collection(\"%s\") 
finalization treshhold reached (%d > %d).\n",ftiName,int(fdelta),int(delfin));
            }
         }
     }
@@ -1150,6 +1203,33 @@
     if (not(isnil(err))) ERROR(err);
 }
 
+
+#####################################################################
+#                                                                   #
+# The query section                                                #
+#                                                                   #
+#####################################################################
+
+PROC run_tijah_query(str ftiName, BAT[str,str] opt, bit use_startnodes, 
BAT[void,oid] nodes, str q) : BAT[oid,dbl] :=
+{
+       if ( verbose ) printf("#TJ:run_tijah_query(\"%s\",..) 
called.\n",ftiName);
+       var parambat := bat("tj_" + ftiName + "_param");
+       var delfin   := lng(parambat.find("delay_finalize"));
+        if ( delfin > lng(0) ) {
+           if ( verbose ) printf("#TJ:run_tijah_query(\"%s\",..) checking 
delayed finalize.\n",ftiName);
+            var finlast  := lng(parambat.find("_last_finalizedPre"));
+            var prelast  := lng(parambat.find("_last_tijahPre"));
+           if ( not(prelast=finlast) ) {
+               if ( verbose ) printf("#TJ:run_tijah_query(\"%s\",..) 
performing delayed finalize (%d != %d).\n",ftiName,int(finlast),int(prelast));
+                var collBat := _tj_collection(ftiName);
+                _tj_finalize_collection(ftiName, collBat, TRUE);
+                _tj_commit(collBat); 
+           }
+       }
+       return _run_tijah_query(opt,use_startnodes,nodes,q);
+}
+
+
 #####################################################################
 #                                                                   #
 # End of the new implementation of the interfaces                   #


-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
_______________________________________________
Monetdb-pf-checkins mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins

Reply via email to