Update of /cvsroot/monetdb/pathfinder/modules/pftijah
In directory
sfp-cvsdas-1.v30.ch3.sourceforge.com:/tmp/cvs-serv28827/modules/pftijah
Modified Files:
Tag: XQFT
pftijah.mx
Log Message:
propagated changes of Sunday Feb 07 2010
from the development trunk to the XQFT branch
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2010/02/07 - stmane: modules/pftijah/pftijah.mx,1.253
propagated changes of Sunday Feb 07 2010
from the Feb2010 branch to the development trunk
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2010/02/07 - stmane: modules/pftijah/pftijah.mx,1.249.2.3
propagated changes of Friday Feb 05 2010 - Sunday Feb 07 2010
from the Nov2009 branch to the Feb2010 branch
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2010/02/05 - cornuz: modules/pftijah/pftijah.mx,1.238.2.7
The fragment limit in the new fragmented index is based on the number of used
PREs, rather than on the number of documents.
So it is not known a priori how many documents will fit in one fragment.
For this reason, fragmented indexing was done per document: after each
document, the fragment size was checked.
This caused two major problems, one of performance and one of scalability:
1) [performance] a working set was created for each document. This cost
far more than indexing the document itself.
2) [scalability] a dbat set was created for each document (using
_tj_throw2collection instead of _tj_throw2collection_bat). This triggered a
dbat_extend for each document inserted, with terrible consequences as soon as
the bats were memory mapped.
Fixed by indexing in batches of documents, so that the overhead is
amortised. For very large documents this may not be optimal, because the
fragment limit would be checked too rarely. Setting documentSize="large" in
<TijahOptions> sets the batch size to 1, which reproduces the previous
behaviour of indexing one document at a time.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Index: pftijah.mx
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/modules/pftijah/pftijah.mx,v
retrieving revision 1.246.2.6
retrieving revision 1.246.2.7
diff -u -d -r1.246.2.6 -r1.246.2.7
--- pftijah.mx 4 Feb 2010 10:35:23 -0000 1.246.2.6
+++ pftijah.mx 7 Feb 2010 18:15:12 -0000 1.246.2.7
@@ -1431,6 +1431,7 @@
var blacklist := "";
var fragsize := str(INT_MAX / 2);
var delay_finalize := "0";
+ var docsize := "small";
pa...@batloop() {
if ( verbose ) tj_verbose(HASH +"TJ
_tj_init_collection_base():param[%s]=\"%s\"\n",$h,$t);
@@ -1442,6 +1443,8 @@
parambat.insert($h,$t);
} else if ( $h = "fragmentSize" ) {
fragsize := $t;
+ } else if ( $h = "documentSize" ) {
+ docsize := toLower($t);
} else if ( $h = "whitelist" ) {
whitelist := $t;
} else if ( $h = "blacklist" ) {
@@ -1460,6 +1463,7 @@
parambat.insert("tokenizer",tokenizer);
parambat.insert("stemmer",stemmer);
parambat.insert("fragmentSize",fragsize);
+ parambat.insert("documentSize",docsize);
parambat.insert("curFragment","0");
parambat.insert("preExpansion","4");
parambat.insert("lastStopWord","0");
@@ -1468,6 +1472,7 @@
parambat.insert("_last_tijahPre","1");
parambat.insert("_last_finalizedPre","0");
parambat.insert("delay_finalize",delay_finalize);
+
if( not(whitelist = "") )
parambat.insert("whitelist",whitelist);
if( not(blacklist = "") )
@@ -1652,17 +1657,17 @@
"PARAMETERS:\n\
-` str ftiName: the name of the collection.\n
- BAT[str,str]: the bat containing the [location,name] pairs of the xml
docs.\n\
-- bit shred: when true the doc is shredded when necessary.\n\
+- bit shred: when true the doc is shredded when necessary (deprecated).\n\
DESCRIPTION:\n\
-The multidocument version of tj_add2collection. The main difference with the\n\
-other method is the [str,str] bat which contains the location of the
document\n\
-in the head and the name in the tail.\n\
-The advantage of this method is that the collection is finalized after all\n\
-xml documents in the bat are added to the collection.",
+Adds a documents to the index. If needed, the index is split into several
fragments.\n\
+Each fragment is finalized after it is filled to its maximum capacity.",
"pftijah");
PROC tj_add2collection_frag(str ftiName, BAT[str,str] uri, bit shred) : void
{
if ( verbose ) tj_verbose(HASH +"TJ tj_add2collection_frag(\"%s\")
called.\n",ftiName);
+ if (shred)
+ GDKerror("tj_add2collection_frag: shred bit no longer supported");
+
var coll_lock := tj_get_collection_lock(ftiName);
lock_set(coll_lock);
var err := CATCH({
@@ -1677,20 +1682,37 @@
_tj_set_forwardindex_access(collBat, BAT_APPEND);
_tj_set_parameter2(collBat, "status", "building");
+
var frag_size := wrd(_tj_get_parameter2(collBat, "fragmentSize"));
+
+ # for large documents, we need to check the fragment capacity limit for
after each document
+ # for small documents, this is too expensive, so we do it in larger
batches
+ var doc_size := _tj_get_parameter2(collBat, "documentSize");
+ var chunksize := 10000;
+ if (doc_size = "large")
+ chunksize := 1;
+
+ var first_doc := 0;
+ var last_doc := uri.count()-1;
var last_pre;
- u...@batloop() {
- last_pre := _tj_add2collection_frag(ftiName, collBat, $h, $t, shred);
+ while(first_doc <= last_doc) {
+ var uri_chunk := uri.slice(first_doc, first_doc+chunksize-1);
+ var ws_opt := ws_create(0);
+ ws_opendoc(ws_opt, uri_chunk.tmark(0...@0));
+ _tj_throw2collection_bat(collBat,ws_opt,uri_chunk);
+ ws_destroy(ws_opt);
+ last_pre := collBat.find("size").count_wrd() + 1;
# check capacity of current fragment
if ( last_pre >= frag_size ) {
if ( verbose ) tj_verbose(HASH +"TJ tj_add2collection_frag(\"%s\"),
fragment limit reached.\n",ftiName);
- _tj_set_parameter2(collBat, "_last_tijahPre", str(last_pre));
- _tj_finalize_collection_frag(ftiName, collBat, commitBats, false);
- collBat := _tj_get_collection_frag(ftiName, commitBats);
- _tj_set_forwardindex_access(collBat, BAT_APPEND);
- }
- }
+ _tj_set_parameter2(collBat, "_last_tijahPre", str(last_pre));
+ _tj_finalize_collection_frag(ftiName, collBat, commitBats, false);
+ collBat := _tj_get_collection_frag(ftiName, commitBats);
+ _tj_set_forwardindex_access(collBat, BAT_APPEND);
+ }
+ first_doc := first_doc+chunksize;
+ }
# update params
_tj_set_parameter2(collBat, "_last_tijahPre", str(last_pre));
_tj_finalize_collection_frag(ftiName, collBat, commitBats, false);
@@ -1706,57 +1728,6 @@
if (not(isnil(err))) ERROR(err);
}
-#
-# this _tj_add2collection_frag adds a single document to the index (calling
tj_throw2collection)
-# if necessary, the document is shredded before
-#
-PROC _tj_add2collection_frag(str ftiName, BAT[str,bat] collBat, str uri_loc,
str uri_name, bit shred) : wrd
-{
- var ms;
- if ( verbose ) tj_verbose(HASH +"TJ
_tj_add2collection_frag(\"%s\",\"%s\",\"%s\")
start.\n",ftiName,uri_loc,uri_name);
- if ( shred ) {
- var pf_collection := _tj_get_parameter2(collBat,"pf_collection");
- if ( isnil(uri_loc) ) {
- ERROR("_tj_add2collection: should specify doc_uri (and doc_name).");
- }
- if ( isnil(uri_name) ) {
- uri_name := uri_loc;
- } else if ( uri_name = "" ) {
- uri_name := uri_loc;
- }
- if (not(bat("doc_name").reverse().exist(uri_name))) {
- var s_start := usec();
- if ( isnil(pf_collection) ) {
- shred_doc(uri_loc,uri_name);
- } else {
- shred_doc(uri_loc,uri_name,pf_collection,0LL);
- }
- if ( timing ) {
- ms := (usec() - s_start)/1000;
- printf(HASH +"TJ _tj_add2collection_frag(\"%s\"): shred time =
%lld.%03llds.\n",uri_name,/(ms,1000),%(ms,1000));
- }
- } else {
- if ( verbose ) tj_verbose(HASH +"TJ _tj_add2collection_frag
doc(\"%s\") already shredded.\n",uri_name);
- }
- }
- var i_start := usec();
- var xrpc_mode := "";
- var ws_opt := ws_create(0);
- ws_opendoc(ws_opt, bat(void,str,1).append(uri_name));
- _tj_throw2collection(collBat,ws_opt,uri_name);
- ws_destroy(ws_opt);
-
- # compute last pre
- var lstPre := collBat.find("size").count_wrd() + 1;
-
- if ( timing ) {
- ms := (usec()-i_start)/1000;
- printf(HASH +"TJ _tj_add2collection_frag(\"%s\"): forward index creation
time = %lld.%03llds.\n",uri_name,/(ms,1000),%(ms,1000));
- }
- if ( verbose ) tj_verbose(HASH +"TJ _tj_add2collection_frag(\"%s\")
finished.\n",ftiName);
-
- return lstPre;
-}
#
# _tj_finalize_collection_frag builds an inverted index on the given fragment
(if finalization is not delayed)
------------------------------------------------------------------------------
The Planet: dedicated and managed hosting, cloud storage, colocation
Stay online with enterprise data centers and the best network in the business
Choose flexible plans and management services without long-term contracts
Personal 24x7 support from experienced hosting pros just a phone call away.
http://p.sf.net/sfu/theplanet-com
_______________________________________________
Monetdb-pf-checkins mailing list
Monetdb-pf-checkins@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins