Update of /cvsroot/monetdb/pathfinder/modules/pftijah
In directory sfp-cvsdas-1.v30.ch3.sourceforge.com:/tmp/cvs-serv30239/modules/pftijah

Modified Files:
      Tag: M5XQ
        pftijah.mx 
Log Message:
propagated changes of Sunday Feb 07 2010
from the XQFT branch to the M5XQ branch

  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  2010/02/07 - stmane: modules/pftijah/pftijah.mx,1.246.2.7
  propagated changes of Sunday Feb 07 2010
  from the development trunk to the XQFT branch
  
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    2010/02/07 - stmane: modules/pftijah/pftijah.mx,1.253
    propagated changes of Sunday Feb 07 2010
    from the Feb2010 branch to the development trunk
  
      ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      2010/02/07 - stmane: modules/pftijah/pftijah.mx,1.249.2.3
      propagated changes of Friday Feb 05 2010 - Sunday Feb 07 2010
      from the Nov2009 branch to the Feb2010 branch
  
        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        2010/02/05 - cornuz: modules/pftijah/pftijah.mx,1.238.2.7
        The fragment limit in the new fragmented index is based on the number
        of used PREs rather than on the number of documents, so it is not
        known a priori how many documents will fit in one fragment. For this
        reason, fragmented indexing was done per document: after each document
        the fragment size was checked.
        This introduced two major causes of poor performance and scalability:
  
        1) [performance] a working set was created for each document. This
           cost far more than the document indexing itself.
        2) [scalability] a dbat set was created for each document (using
           _tj_throw2collection instead of _tj_throw2collection_bat). This
           triggered a dbat_extend for each document inserted, with terrible
           consequences as soon as the bats were memory mapped.
  
        Fixed by indexing in batches of documents, so that the overhead is
        amortised. For very large documents this may not be optimal, because
        the fragment limit would be checked too rarely. Setting
        documentSize="large" in <TijahOptions> sets the batch size to 1, which
        reproduces the old behaviour of one document at a time.
        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
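
A note on the option described in the log above: the override is requested
through the options element quoted there. A minimal sketch of just that
element (the fragmentSize value is an illustrative assumption; both attribute
names come from the log and the diff below):

  <TijahOptions documentSize="large" fragmentSize="100000"/>

With the default batch size of 10000 documents (see the diff below), the
workspace setup and the fragment-limit check run once per batch instead of
once per document; documentSize="large" restores the strict per-document
check.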


Index: pftijah.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah.mx,v
retrieving revision 1.226.2.23
retrieving revision 1.226.2.24
diff -u -d -r1.226.2.23 -r1.226.2.24
--- pftijah.mx  4 Feb 2010 10:41:29 -0000       1.226.2.23
+++ pftijah.mx  7 Feb 2010 18:21:57 -0000       1.226.2.24
@@ -1431,6 +1431,7 @@
       var blacklist      := "";
       var fragsize       := str(INT_MAX / 2);
       var delay_finalize := "0";
+      var docsize        := "small";
   
      params@batloop() {
        if ( verbose ) tj_verbose(HASH +"TJ _tj_init_collection_base():param[%s]=\"%s\"\n",$h,$t);
@@ -1442,6 +1443,8 @@
             parambat.insert($h,$t);
         } else if ( $h = "fragmentSize" ) {
             fragsize := $t;
+        } else if ( $h = "documentSize" ) {
+            docsize := toLower($t);
         } else if ( $h = "whitelist" ) {
             whitelist := $t;
         } else if ( $h = "blacklist" ) {
@@ -1460,6 +1463,7 @@
       parambat.insert("tokenizer",tokenizer);
       parambat.insert("stemmer",stemmer);
       parambat.insert("fragmentSize",fragsize);
+      parambat.insert("documentSize",docsize);
       parambat.insert("curFragment","0");
       parambat.insert("preExpansion","4");
       parambat.insert("lastStopWord","0");
@@ -1468,6 +1472,7 @@
       parambat.insert("_last_tijahPre","1");
       parambat.insert("_last_finalizedPre","0");
       parambat.insert("delay_finalize",delay_finalize);
+
       if( not(whitelist = "") )
             parambat.insert("whitelist",whitelist);
       if( not(blacklist = "") )
@@ -1652,17 +1657,17 @@
 "PARAMETERS:\n\
 - str ftiName: the name of the collection.\n\
 - BAT[str,str]: the bat containing the [location,name] pairs of the xml docs.\n\
-- bit shred: when true the doc is shredded when necessary.\n\
+- bit shred: when true the doc is shredded when necessary (deprecated).\n\
 DESCRIPTION:\n\
-The multidocument version of tj_add2collection. The main difference with the\n\
-other method is the [str,str] bat which contains the location of the 
document\n\
-in the head and the name in the tail.\n\
-The advantage of this method is that the collection is finalized after all\n\
-xml documents in the bat are added to the collection.",
+Adds documents to the index. If needed, the index is split into several fragments.\n\
+Each fragment is finalized after it is filled to its maximum capacity.",
 "pftijah");
 PROC tj_add2collection_frag(str ftiName, BAT[str,str] uri, bit shred) : void
 {
     if ( verbose ) tj_verbose(HASH +"TJ tj_add2collection_frag(\"%s\") 
called.\n",ftiName);
+    if (shred)
+      GDKerror("tj_add2collection_frag: shred bit no longer supported");
+      
     var coll_lock := tj_get_collection_lock(ftiName);
     lock_set(coll_lock);
     var err := CATCH({
@@ -1677,20 +1682,37 @@
       _tj_set_forwardindex_access(collBat, BAT_APPEND);
       
       _tj_set_parameter2(collBat, "status", "building");
+
       var frag_size := wrd(_tj_get_parameter2(collBat, "fragmentSize"));
+
+      # for large documents, we need to check the fragment capacity limit after each document
+      # for small documents, this is too expensive, so we do it in larger batches
+      var doc_size := _tj_get_parameter2(collBat, "documentSize");
+      var chunksize := 10000;
+      if (doc_size = "large")
+        chunksize := 1;
+      
+      var first_doc := 0;
+      var last_doc := uri.count()-1;
       var last_pre;
-      uri@batloop() {
-        last_pre := _tj_add2collection_frag(ftiName, collBat, $h, $t, shred);
+      while(first_doc <= last_doc) {
+        var uri_chunk := uri.slice(first_doc, first_doc+chunksize-1);
+        var ws_opt := ws_create(0); 
+        ws_opendoc(ws_opt, uri_chunk.tmark(0@0));
+        _tj_throw2collection_bat(collBat,ws_opt,uri_chunk);
+        ws_destroy(ws_opt);
+        last_pre := collBat.find("size").count_wrd() + 1;
         
         # check capacity of current fragment
         if ( last_pre >= frag_size ) {
           if ( verbose ) tj_verbose(HASH +"TJ tj_add2collection_frag(\"%s\"), 
fragment limit reached.\n",ftiName);
-            _tj_set_parameter2(collBat, "_last_tijahPre", str(last_pre));
-            _tj_finalize_collection_frag(ftiName, collBat, commitBats, false);
-            collBat := _tj_get_collection_frag(ftiName, commitBats);
-            _tj_set_forwardindex_access(collBat, BAT_APPEND);
-          }
-      }
+          _tj_set_parameter2(collBat, "_last_tijahPre", str(last_pre));
+          _tj_finalize_collection_frag(ftiName, collBat, commitBats, false);
+          collBat := _tj_get_collection_frag(ftiName, commitBats);
+          _tj_set_forwardindex_access(collBat, BAT_APPEND);
+        }
+        first_doc := first_doc+chunksize;
+      } 
       # update params
       _tj_set_parameter2(collBat, "_last_tijahPre", str(last_pre));
       _tj_finalize_collection_frag(ftiName, collBat, commitBats, false);
@@ -1706,57 +1728,6 @@
     if (not(isnil(err))) ERROR(err);
 }
 
-#
-# this _tj_add2collection_frag adds a single document to the index (calling tj_throw2collection)
-# if necessary, the document is shredded before
-#
-PROC _tj_add2collection_frag(str ftiName, BAT[str,bat] collBat, str uri_loc, str uri_name, bit shred) : wrd
-{ 
-    var ms;
-    if ( verbose ) tj_verbose(HASH +"TJ 
_tj_add2collection_frag(\"%s\",\"%s\",\"%s\") 
start.\n",ftiName,uri_loc,uri_name);
-    if ( shred ) {
-      var pf_collection := _tj_get_parameter2(collBat,"pf_collection");
-      if ( isnil(uri_loc) ) {
-          ERROR("_tj_add2collection: should specify doc_uri (and doc_name).");
-      }
-      if ( isnil(uri_name) ) {
-        uri_name := uri_loc;
-      } else if ( uri_name = "" ) {
-        uri_name := uri_loc;
-      }
-      if (not(bat("doc_name").reverse().exist(uri_name))) {
-        var s_start := usec();
-        if ( isnil(pf_collection) ) {
-          shred_doc(uri_loc,uri_name);
-        } else {
-          shred_doc(uri_loc,uri_name,pf_collection,0LL);
-        }
-        if ( timing ) {
-          ms := (usec() - s_start)/1000;
-          printf(HASH +"TJ _tj_add2collection_frag(\"%s\"): shred time = 
%lld.%03llds.\n",uri_name,/(ms,1000),%(ms,1000));
-        }
-      } else {
-        if ( verbose ) tj_verbose(HASH +"TJ _tj_add2collection_frag 
doc(\"%s\") already shredded.\n",uri_name);
-      }
-    }
-    var        i_start := usec();
-    var xrpc_mode := "";
-    var ws_opt := ws_create(0); 
-    ws_opendoc(ws_opt, bat(void,str,1).append(uri_name));
-    _tj_throw2collection(collBat,ws_opt,uri_name);
-    ws_destroy(ws_opt);
-    
-    # compute last pre
-    var lstPre := collBat.find("size").count_wrd() + 1;
-
-    if ( timing ) {
-      ms := (usec()-i_start)/1000;
-      printf(HASH +"TJ _tj_add2collection_frag(\"%s\"): forward index creation 
time = %lld.%03llds.\n",uri_name,/(ms,1000),%(ms,1000));
-    }
-    if ( verbose ) tj_verbose(HASH +"TJ _tj_add2collection_frag(\"%s\") 
finished.\n",ftiName);
-    
-    return lstPre;
-}
 
 # 
# _tj_finalize_collection_frag builds an inverted index on the given fragment (if finalization is not delayed)

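For illustration, a minimal MIL sketch of driving the batched loader after
this change. The collection name "mycoll" and the document locations are
hypothetical, the bat constructor mirrors the bat(void,str,1) pattern used in
the removed code above, and the shred bit must now be false (true raises a
GDKerror, as the diff shows):

    # build a [location,name] bat and add both documents in one batched call
    var docs := bat(str,str,2);
    docs.insert("file:/data/doc1.xml","doc1.xml");
    docs.insert("file:/data/doc2.xml","doc2.xml");
    tj_add2collection_frag("mycoll", docs, false);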
