Update of /cvsroot/monetdb/pathfinder/modules/pftijah
In directory sfp-cvsdas-1.v30.ch3.sourceforge.com:/tmp/cvs-serv22252

Modified Files:
      Tag: Nov2009
        pftijah.mx 
Log Message:
The _param BAT was used to keep a string status variable (the last pre) that
was updated for every document during indexing.
This caused the string heap of this BAT (which is only a few tuples long) to
grow with the size of the collection being indexed,
which in turn made the find() operation on this BAT (also performed for each
document indexed) very expensive.
An expensive string search inside a batloop is not good! Replaced this status
variable with an actual MIL variable (of type wrd, by the way, not lng!).




Index: pftijah.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah.mx,v
retrieving revision 1.238.2.5
retrieving revision 1.238.2.6
diff -u -d -r1.238.2.5 -r1.238.2.6
--- pftijah.mx  3 Feb 2010 15:09:30 -0000       1.238.2.5
+++ pftijah.mx  3 Feb 2010 16:12:05 -0000       1.238.2.6
@@ -1669,32 +1669,37 @@
       var t_start := usec();
     
       var commitBats := _tj_create_commitBats();
- 
+
       # get first free collection fragment (first fragment that is not yet 
filled completely)       
       var collBat := _tj_get_collection_frag(ftiName, commitBats);
 
       # set access back to BAT_APPEND
       _tj_set_forwardindex_access(collBat, BAT_APPEND);
-
-      var frag_size := lng(_tj_get_parameter2(collBat, "fragmentSize"));
+      
+      _tj_set_parameter2(collBat, "status", "building");
+      var frag_size := wrd(_tj_get_parameter2(collBat, "fragmentSize"));
+      var last_pre;
       u...@batloop() {
-         _tj_add2collection_frag(ftiName, collBat, $h, $t, shred);
-         
-         # check capacity of current fragment
-         var last_pre := lng(_tj_get_parameter2(collBat, "_last_tijahPre"));
-         if ( last_pre >= frag_size ) {
+        last_pre := _tj_add2collection_frag(ftiName, collBat, $h, $t, shred);
+        
+        # check capacity of current fragment
+        if ( last_pre >= frag_size ) {
+          if ( verbose ) tj_verbose(HASH +"TJ tj_add2collection_frag(\"%s\"), 
fragment limit reached.\n",ftiName);
+            _tj_set_parameter2(collBat, "_last_tijahPre", str(last_pre));
             _tj_finalize_collection_frag(ftiName, collBat, commitBats, false);
             collBat := _tj_get_collection_frag(ftiName, commitBats);
             _tj_set_forwardindex_access(collBat, BAT_APPEND);
-         }
+          }
       }
+      # update params
+      _tj_set_parameter2(collBat, "_last_tijahPre", str(last_pre));
       _tj_finalize_collection_frag(ftiName, collBat, commitBats, false);
       _tj_commit_frag(collBat, commitBats); 
 
       if ( timing ) {
-         var ms := (usec()-t_start)/1000;
-         printf(HASH +"TJ tj_add2collection(BAT): total time = 
%lld.%03llds.\n",/(ms,1000),%(ms,1000));
-       }
+        var ms := (usec()-t_start)/1000;
+        printf(HASH +"TJ tj_add2collection(BAT): total time = 
%lld.%03llds.\n",/(ms,1000),%(ms,1000));
+      }
     });
     lock_unset(coll_lock);
     if ( verbose ) tj_verbose(HASH +"TJ tj_add2collection_frag(\"%s\") 
finished.\n",ftiName);
@@ -1705,34 +1710,34 @@
 # this _tj_add2collection_frag adds a single document to the index (calling 
tj_throw2collection)
 # if necessary, the document is shredded before
 #
-PROC _tj_add2collection_frag(str ftiName, BAT[str,bat] collBat, str uri_loc, 
str uri_name, bit shred) : void
+PROC _tj_add2collection_frag(str ftiName, BAT[str,bat] collBat, str uri_loc, 
str uri_name, bit shred) : wrd
 { 
     var ms;
     if ( verbose ) tj_verbose(HASH +"TJ 
_tj_add2collection_frag(\"%s\",\"%s\",\"%s\") 
start.\n",ftiName,uri_loc,uri_name);
     if ( shred ) {
-       var pf_collection := _tj_get_parameter2(collBat,"pf_collection");
-       if ( isnil(uri_loc) ) {
-           ERROR("_tj_add2collection: should specify doc_uri (and doc_name).");
-       }
-        if ( isnil(uri_name) ) {
-         uri_name := uri_loc;
-       } else if ( uri_name = "" ) {
-         uri_name := uri_loc;
-       }
-        if (not(bat("doc_name").reverse().exist(uri_name))) {
-            var s_start := usec();
-           if ( isnil(pf_collection) ) {
-               shred_doc(uri_loc,uri_name);
-           } else {
-               shred_doc(uri_loc,uri_name,pf_collection,0LL);
-           }
-            if ( timing ) {
-             ms := (usec() - s_start)/1000;
-              printf(HASH +"TJ _tj_add2collection_frag(\"%s\"): shred time = 
%lld.%03llds.\n",uri_name,/(ms,1000),%(ms,1000));
-            }
+      var pf_collection := _tj_get_parameter2(collBat,"pf_collection");
+      if ( isnil(uri_loc) ) {
+          ERROR("_tj_add2collection: should specify doc_uri (and doc_name).");
+      }
+      if ( isnil(uri_name) ) {
+        uri_name := uri_loc;
+      } else if ( uri_name = "" ) {
+        uri_name := uri_loc;
+      }
+      if (not(bat("doc_name").reverse().exist(uri_name))) {
+        var s_start := usec();
+        if ( isnil(pf_collection) ) {
+          shred_doc(uri_loc,uri_name);
         } else {
-            if ( verbose ) tj_verbose(HASH +"TJ _tj_add2collection_frag 
doc(\"%s\") already shredded.\n",uri_name);
-       }
+          shred_doc(uri_loc,uri_name,pf_collection,0LL);
+        }
+        if ( timing ) {
+          ms := (usec() - s_start)/1000;
+          printf(HASH +"TJ _tj_add2collection_frag(\"%s\"): shred time = 
%lld.%03llds.\n",uri_name,/(ms,1000),%(ms,1000));
+        }
+      } else {
+        if ( verbose ) tj_verbose(HASH +"TJ _tj_add2collection_frag 
doc(\"%s\") already shredded.\n",uri_name);
+      }
     }
     var        i_start := usec();
     var xrpc_mode := "";
@@ -1741,16 +1746,16 @@
     _tj_throw2collection(collBat,ws_opt,uri_name);
     ws_destroy(ws_opt);
     
-    #update params
+    # compute last pre
     var lstPre := collBat.find("size").count_wrd() + 1;
-    _tj_set_parameter2(collBat, "_last_tijahPre", str(lstPre));
-    _tj_set_parameter2(collBat, "status", "building");
 
     if ( timing ) {
-       ms := (usec()-i_start)/1000;
-        printf(HASH +"TJ _tj_add2collection_frag(\"%s\"): forward index 
creation time = %lld.%03llds.\n",uri_name,/(ms,1000),%(ms,1000));
+      ms := (usec()-i_start)/1000;
+      printf(HASH +"TJ _tj_add2collection_frag(\"%s\"): forward index creation 
time = %lld.%03llds.\n",uri_name,/(ms,1000),%(ms,1000));
     }
     if ( verbose ) tj_verbose(HASH +"TJ _tj_add2collection_frag(\"%s\") 
finished.\n",ftiName);
+    
+    return lstPre;
 }
 
 # 


------------------------------------------------------------------------------
The Planet: dedicated and managed hosting, cloud storage, colocation
Stay online with enterprise data centers and the best network in the business
Choose flexible plans and management services without long-term contracts
Personal 24x7 support from experience hosting pros just a phone call away.
http://p.sf.net/sfu/theplanet-com
_______________________________________________
Monetdb-pf-checkins mailing list
Monetdb-pf-checkins@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins

Reply via email to