Update of /cvsroot/monetdb/pathfinder/modules/pftijah
In directory 
sfp-cvsdas-1.v30.ch3.sourceforge.com:/tmp/cvs-serv11974/modules/pftijah

Modified Files:
      Tag: XQFT
        pftijah.mx 
Log Message:
propagated changes of Tuesday Feb 09 2010 - Friday Feb 12 2010
from the development trunk to the XQFT branch

  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  2010/02/09 - jflokstra: modules/pftijah/pftijah.mx,1.255
  - implement handling of NEXI programs by the ftc operator. Default behaviour 
is still the precompiled term query but when the term is prefixed by a "%" it 
is interpreted as a NEXI query. So
  
  ftc "%.[about(.,Churchill)]"
  
  is equivalent to:
  
  ftc "Churchill"
  
  but much slower, because it is not precompiled.
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  2010/02/12 - sjoerd: modules/pftijah/pftijah.mx,1.256
  propagated changes of Wednesday Feb 10 2010 - Friday Feb 12 2010
  from the Feb2010 branch to the development trunk
  
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    2010/02/10 - stmane: modules/pftijah/pftijah.mx,1.249.2.5
    propagated changes of Wednesday Feb 10 2010
    from the Nov2009 branch to the Feb2010 branch
  
      ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      2010/02/10 - cornuz: modules/pftijah/pftijah.mx,1.238.2.9
      - re-enable shred bit in tj_add2collection_frag (previously disabled by 
mistake), to shred documents before indexing when needed
      - initialize variable last_pre before the loop for indexing in chunks
      - some minor fixes to ADDHELP text
      ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


Index: pftijah.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah.mx,v
retrieving revision 1.246.2.8
retrieving revision 1.246.2.9
diff -u -d -r1.246.2.8 -r1.246.2.9
--- pftijah.mx  8 Feb 2010 16:07:47 -0000       1.246.2.8
+++ pftijah.mx  12 Feb 2010 11:57:36 -0000      1.246.2.9
@@ -114,7 +114,7 @@
                       BAT[oid,oid] item,
                      BAT[oid,int] kind,
                       BAT[oid,str]  doc_loaded)
-                      : BAT[oid,oid] = CMDpf2tijah_node;
+                      : BAT[void,oid] = CMDpf2tijah_node;
  "Translate Pathfinder node sequence to tijah node sequence"
 
 .COMMAND offsetindex( BAT[void,oid] offset_tid, wrd res_size)
@@ -999,13 +999,13 @@
 
 ADDHELP("tj_add2collection", "flokstra & rode", "Jan 2007",
 "PARAMETERS:\n\
-- str ftiName: the name of the collection.\n
-- str uri_loc: the location of the xml document.\n
-- str uri_name: the name of the xml document (optional).\n
+- str ftiName: the name of the collection.\n\
+- str uri_loc: the location of the xml document.\n\
+- str uri_name: the name of the xml document (optional).\n\
 - bit shred: when true the doc is shredded when necessary.\n\
 DESCRIPTION:\n\
 Add a document to a pftijah collection. The document is indexed and if the \n\
-shred parameter is true it is also shredded in Pathfinder. 
+shred parameter is true it is also shredded in Pathfinder. \n\
 The index is automatically finalized at the end of the method.",
 "pftijah");
 PROC tj_add2collection(str ftiName, str uri_loc, str uri_name, bit shred) : 
void
@@ -1650,14 +1650,14 @@
 PROC tj_add2collection_frag(str ftiName, str uri, str filename, bit shred) : 
void
 {
       var uris := new(str,str).insert(uri, filename);
-      return tj_add2collection_frag(ftiName, uris, shred);
+      tj_add2collection_frag(ftiName, uris, shred);
 }
 
 ADDHELP("tj_add2collection_frag", "flokstra & rode", "Sept 2009",
 "PARAMETERS:\n\
--` str ftiName: the name of the collection.\n
+- str ftiName: the name of the collection.\n\
 - BAT[str,str]: the bat containing the [location,name] pairs of the xml 
docs.\n\
-- bit shred: when true the doc is shredded when necessary (deprecated).\n\
+- bit shred: when true the doc is shredded when necessary.\n\
 DESCRIPTION:\n\
 Adds a documents to the index. If needed, the index is split into several 
fragments.\n\
 Each fragment is finalized after it is filled to its maximum capacity.",
@@ -1665,9 +1665,6 @@
 PROC tj_add2collection_frag(str ftiName, BAT[str,str] uri, bit shred) : void
 {
     if ( verbose ) tj_verbose(HASH +"TJ tj_add2collection_frag(\"%s\") 
called.\n",ftiName);
-    if (shred)
-      ERROR("tj_add2collection_frag: shred bit no longer supported");
-      
     var coll_lock := tj_get_collection_lock(ftiName);
     lock_set(coll_lock);
     var err := CATCH({
@@ -1678,6 +1675,37 @@
       # get first free collection fragment (first fragment that is not yet 
filled completely)       
       var collBat := _tj_get_collection_frag(ftiName, commitBats);
 
+      # shred documents if needed
+      if ( shred ) {
+        var pf_collection := _tj_get_parameter2(collBat,"pf_collection");
+        uri@batloop() {
+          var uri_loc := $h;
+          var uri_name := $t;
+          if ( isnil(uri_loc) ) {
+            ERROR("tj_add2collection_frag: should specify doc_uri (and 
doc_name).");
+          }
+          if ( isnil(uri_name) ) {
+            uri_name := uri_loc;
+          } else if ( uri_name = "" ) {
+            uri_name := uri_loc;
+          }
+          if (not(bat("doc_name").reverse().exist(uri_name))) {
+            var s_start := usec();
+            if ( isnil(pf_collection) ) {
+              shred_doc(uri_loc,uri_name);
+            } else {
+              shred_doc(uri_loc,uri_name,pf_collection,0LL);
+            }
+            if ( timing ) {
+              ms := (usec() - s_start)/1000;
+              printf(HASH +"TJ tj_add2collection_frag(\"%s\"): shred time = 
%lld.%03llds.\n",uri_name,/(ms,1000),%(ms,1000));
+            }
+          } else {
+            if ( verbose ) tj_verbose(HASH +"TJ tj_add2collection_frag 
doc(\"%s\") already shredded.\n",uri_name);
+          }
+        }
+      }
+    
       # set access back to BAT_APPEND
       _tj_set_forwardindex_access(collBat, BAT_APPEND);
       
@@ -1694,7 +1722,7 @@
       
       var first_doc := 0;
       var last_doc := uri.count()-1;
-      var last_pre;
+      var last_pre := collBat.find("size").count_wrd() + 1;
       while(first_doc <= last_doc) {
         var uri_chunk := uri.slice(first_doc, first_doc+chunksize-1);
         var ws_opt := ws_create(0); 
@@ -3221,29 +3249,44 @@
           var xdoc_firstpre := bat("tj_" + ftiName + "0_doc_firstpre");
           var xpfpre := bat("tj_" + ftiName + "0_pfpre");
           var doc_loaded := 
par_ws.fetch(CONT_COLL).join(bat("doc_collection").reverse()).join(bat("doc_name"));
+
+         var tjPre_score;
+
           if ( verbose ) tj_verbose(HASH +" ALG_tj_ftfun_handler: compute 
startnodes\n");
           iter_tjPre := 
pf2tijah_node(true,xdoc_name,xdoc_firstpre,xpfpre,sn_item,[int](sn_kind),doc_loaded);
+         if ( not(ftc_term.startsWith("%")) ) {
+             if ( verbose ) tj_verbose(HASH +" ALG_tj_ftfun_handler: run 
precompiled term search\n");
          
-         # now tokenize the term string and add a default weight
-         var T := tijah_tokenize2bat(ftc_term).reverse().project(dbl(1.0));
-         var Q := tj_prepare_query(T);
+             # now tokenize the term string and add a default weight
+             var T := tijah_tokenize2bat(ftc_term).reverse().project(dbl(1.0));
+             var Q := tj_prepare_query(T);
 
-         # tj_init_termHash(ftiName);
-         # tj_init_tagHash(ftiName);
-         var scorebase := dbl(0.000000);
-         var c_lambda  := dbl(0.800000);
-         var okapi_k1  := dbl(1.200000);
-         var okapi_b   := dbl(0.750000);
-         var downprop  := "max";
-         var upprop    := "max";
-         var andcomb   := "prod";
-         var orcomb    := "sum";
-         var returnall := TRUE;
+             # tj_init_termHash(ftiName);
+             # tj_init_tagHash(ftiName);
+             var scorebase := dbl(0.000000);
+             var c_lambda  := dbl(0.800000);
+             var okapi_k1  := dbl(1.200000);
+             var okapi_b   := dbl(0.750000);
+             var downprop  := "max";
+             var upprop    := "max";
+             var andcomb   := "prod";
+             var orcomb    := "sum";
+             var returnall := TRUE;
 
-         var tjPre_score := 
tj_containing_query_nest_pre_term_NLLR(iter_tjPre.reverse(), Q);
+             tjPre_score := 
tj_containing_query_nest_pre_term_NLLR(iter_tjPre.reverse(), Q);
+         } else {
+             if ( verbose ) tj_verbose(HASH +" ALG_tj_ftfun_handler: run 
compiled  nexi query\n");
+            ftc_term := ftc_term.substring(2); # strip the "%"
+            # the next tmark() is a bit mysterious. The result of pf2tijah_node
+            # should be a [void,oid] but is an [oid,oid]
+            iter_tjPre := iter_tjPre.tmark(0@0);
+            var opt := new(str,str);
+            opt.insert("return-all","true");
+            opt.insert("_query",ftc_term);
+            tjPre_score := 
run_tijah_query("DFLT_FT_INDEX",opt,true,iter_tjPre);
+         }
          var iter_score := iter_tjPre.leftjoin(tjPre_score);
 
-
          result_item_s.append(iter_score);
         });
         lock_unset(tijah_lock);


------------------------------------------------------------------------------
SOLARIS 10 is the OS for Data Centers - provides features such as DTrace,
Predictive Self Healing and Award Winning ZFS. Get Solaris 10 NOW
http://p.sf.net/sfu/solaris-dev2dev
_______________________________________________
Monetdb-pf-checkins mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins

Reply via email to