Revision: 15926
http://gate.svn.sourceforge.net/gate/?rev=15926&view=rev
Author: valyt
Date: 2012-07-12 15:13:39 +0000 (Thu, 12 Jul 2012)
Log Message:
-----------
Use occurrence counts, instead of document counts, to decide when to dump a
batch. This should be more resilient in the face of varying document sizes.
Modified Paths:
--------------
mimir/trunk/mimir-core/src/gate/mimir/index/Indexer.java
mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MG4JIndexer.java
mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirDirectIndexBuilder.java
mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirIndexBuilder.java
Modified: mimir/trunk/mimir-core/src/gate/mimir/index/Indexer.java
===================================================================
--- mimir/trunk/mimir-core/src/gate/mimir/index/Indexer.java 2012-07-12 13:29:31 UTC (rev 15925)
+++ mimir/trunk/mimir-core/src/gate/mimir/index/Indexer.java 2012-07-12 15:13:39 UTC (rev 15926)
@@ -309,7 +309,7 @@
* The name of the index subdirectory storing MG4J indexes.
*/
public static final String MG4J_INDEX_DIRNAME = "mg4j";
-
+
/**
* The basename used for all MG4J indexes.
*/
Modified: mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MG4JIndexer.java
===================================================================
--- mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MG4JIndexer.java 2012-07-12 13:29:31 UTC (rev 15925)
+++ mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MG4JIndexer.java 2012-07-12 15:13:39 UTC (rev 15926)
@@ -295,8 +295,16 @@
/**
* How many documents should be indexed in memory before dumping a batch to
* disk.
+ * @deprecated use {@link #OCCURRENCES_PER_BATCH} to limit the size of a batch
+ * based on the number of occurrences instead.
*/
public static final int DOCUMENTS_PER_BATCH = 4000;
+
+ /**
+ * How many occurrences to index in each batch. This metric is more reliable,
+ * than document counts, as it does not depend on average document size.
+ */
+ public static final int OCCURRENCES_PER_BATCH = 20 * 1000 * 1000;
/**
* The size of the {@link #inputQueue}.
Modified: mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirDirectIndexBuilder.java
===================================================================
--- mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirDirectIndexBuilder.java 2012-07-12 13:29:31 UTC (rev 15925)
+++ mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirDirectIndexBuilder.java 2012-07-12 15:13:39 UTC (rev 15926)
@@ -14,33 +14,22 @@
*/
package gate.mimir.index.mg4j;
-import static it.unimi.dsi.big.mg4j.tool.Scan.Completeness.COUNTS;
-import static it.unimi.dsi.big.mg4j.tool.Scan.Completeness.POSITIONS;
-import static it.unimi.dsi.io.OutputBitStream.GAMMA;
-import static it.unimi.dsi.io.OutputBitStream.MAX_PRECOMPUTED;
import gate.Annotation;
import gate.mimir.IndexConfig;
import gate.mimir.index.IndexException;
import gate.mimir.index.Indexer;
-import gate.mimir.index.mg4j.MimirIndexBuilder.PostingsList;
import gate.mimir.util.MG4JTools;
import gate.util.GateRuntimeException;
-
import it.unimi.dsi.Util;
import it.unimi.dsi.big.mg4j.index.Index;
import it.unimi.dsi.big.mg4j.index.IndexIterator;
import it.unimi.dsi.big.mg4j.index.IndexReader;
-import it.unimi.dsi.big.mg4j.io.ByteArrayPostingList;
import it.unimi.dsi.big.mg4j.tool.Scan.Completeness;
-import it.unimi.dsi.bits.Fast;
import it.unimi.dsi.logging.ProgressLogger;
import java.io.File;
import java.io.IOException;
-import java.lang.reflect.Field;
-import java.net.URI;
import java.text.NumberFormat;
-import java.util.concurrent.BlockingQueue;
import org.apache.log4j.Logger;
@@ -73,10 +62,8 @@
this.indexBaseName = subIndexName + BASENAME_SUFFIX;
// create the progress logger. We use this.getClass to use the
// logger belonging to a subclass rather than our own.
- this.progressLogger = new ProgressLogger(
- Logger.getLogger(this.getClass()), "documents");
- closed = false;
- closingProgress = 0;
+ progressLogger = new ProgressLogger(
+ Logger.getLogger(this.getClass()), "input terms");
savePositions = false;
}
@@ -118,7 +105,7 @@
// input documentIDs become output termIDs
// input termIDs become output documentIDs
// NB: the variables in this method are named based on output semantics!
-
+ progressLogger.start("Inverting index...");
try {
// open the input index for reading
Index inputIndex = MG4JTools.openMg4jIndex(
@@ -155,7 +142,7 @@
}
termPostings.setDocumentPointer(outputDocId);
termPostings.setCount(count);
- occurrencesInTheCurrentBatch++;
+ occurrencesInTheCurrentBatch += count;
if(termPostings.outOfMemoryError) {
// we are running out of memory, dump batches ASAP to free it up.
indexer.getMg4jIndexer().dumpASAP();
@@ -187,8 +174,12 @@
}
if ( // we have been asked to dump
( dumpBatchASAP ||
- //.. OR we reached the maximum document limit for a batch
- documentPointer == MG4JIndexer.DOCUMENTS_PER_BATCH ) &&
+ //.. OR we reached the maximum occurrences for a batch
+ // We're not storing positions, so the amount of data in the
+ // index is smaller. We increase the number of occurrences / batch
+ // by a factor of 3.
+ occurrencesInTheCurrentBatch > (MG4JIndexer.OCCURRENCES_PER_BATCH * 3)
+ ) &&
// AND there is data to dump
occurrencesInTheCurrentBatch > 0 ){
dumpBatch();
@@ -207,7 +198,7 @@
}
}
-
+ inputIndexReader.close();
// dump the last current batch
flush();
// close the index (combine the batches)
Modified: mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirIndexBuilder.java
===================================================================
--- mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirIndexBuilder.java 2012-07-12 13:29:31 UTC (rev 15925)
+++ mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirIndexBuilder.java 2012-07-12 15:13:39 UTC (rev 15926)
@@ -910,8 +910,8 @@
if (
// we have been asked to dump
( dumpBatchASAP ||
- //.. OR we reached the maximum document limit for a batch
- documentPointer == MG4JIndexer.DOCUMENTS_PER_BATCH
+ //.. OR we reached the maximum occurrences for a batch
+ occurrencesInTheCurrentBatch > MG4JIndexer.OCCURRENCES_PER_BATCH
) &&
// AND there is data to dump
occurrencesInTheCurrentBatch > 0
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and
threat landscape has changed and how IT managers can respond. Discussions
will include endpoint security, mobile security and the latest in malware
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs