Revision: 15926
          http://gate.svn.sourceforge.net/gate/?rev=15926&view=rev
Author:   valyt
Date:     2012-07-12 15:13:39 +0000 (Thu, 12 Jul 2012)
Log Message:
-----------
Use occurrence counts for deciding when to dump a batch, instead of document 
counts. This should be more resilient in the face of different document sizes.

Modified Paths:
--------------
    mimir/trunk/mimir-core/src/gate/mimir/index/Indexer.java
    mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MG4JIndexer.java
    mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirDirectIndexBuilder.java
    mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirIndexBuilder.java

Modified: mimir/trunk/mimir-core/src/gate/mimir/index/Indexer.java
===================================================================
--- mimir/trunk/mimir-core/src/gate/mimir/index/Indexer.java    2012-07-12 13:29:31 UTC (rev 15925)
+++ mimir/trunk/mimir-core/src/gate/mimir/index/Indexer.java    2012-07-12 15:13:39 UTC (rev 15926)
@@ -309,7 +309,7 @@
    * The name of the index subdirectory storing MG4J indexes.
    */
   public static final String MG4J_INDEX_DIRNAME = "mg4j";
-
+  
   /**
    * The basename used for all MG4J indexes.
    */

Modified: mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MG4JIndexer.java
===================================================================
--- mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MG4JIndexer.java   2012-07-12 13:29:31 UTC (rev 15925)
+++ mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MG4JIndexer.java   2012-07-12 15:13:39 UTC (rev 15926)
@@ -295,8 +295,16 @@
   /**
    * How many documents should be indexed in memory before dumping a batch to 
    * disk.
+   * @deprecated use {@link #OCCURRENCES_PER_BATCH} to limit the size of a batch 
+   * based on the number of occurrences instead.
    */
   public static final int DOCUMENTS_PER_BATCH = 4000;
+  
+  /**
+   * How many occurrences to index in each batch. This metric is more reliable, 
+   * than document counts, as it does not depend on average document size. 
+   */
+  public static final int OCCURRENCES_PER_BATCH = 20 * 1000 * 1000;
 
   /**
    * The size of the {@link #inputQueue}. 

Modified: mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirDirectIndexBuilder.java
===================================================================
--- mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirDirectIndexBuilder.java       2012-07-12 13:29:31 UTC (rev 15925)
+++ mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirDirectIndexBuilder.java       2012-07-12 15:13:39 UTC (rev 15926)
@@ -14,33 +14,22 @@
  */
 package gate.mimir.index.mg4j;
 
-import static it.unimi.dsi.big.mg4j.tool.Scan.Completeness.COUNTS;
-import static it.unimi.dsi.big.mg4j.tool.Scan.Completeness.POSITIONS;
-import static it.unimi.dsi.io.OutputBitStream.GAMMA;
-import static it.unimi.dsi.io.OutputBitStream.MAX_PRECOMPUTED;
 import gate.Annotation;
 import gate.mimir.IndexConfig;
 import gate.mimir.index.IndexException;
 import gate.mimir.index.Indexer;
-import gate.mimir.index.mg4j.MimirIndexBuilder.PostingsList;
 import gate.mimir.util.MG4JTools;
 import gate.util.GateRuntimeException;
-
 import it.unimi.dsi.Util;
 import it.unimi.dsi.big.mg4j.index.Index;
 import it.unimi.dsi.big.mg4j.index.IndexIterator;
 import it.unimi.dsi.big.mg4j.index.IndexReader;
-import it.unimi.dsi.big.mg4j.io.ByteArrayPostingList;
 import it.unimi.dsi.big.mg4j.tool.Scan.Completeness;
-import it.unimi.dsi.bits.Fast;
 import it.unimi.dsi.logging.ProgressLogger;
 
 import java.io.File;
 import java.io.IOException;
-import java.lang.reflect.Field;
-import java.net.URI;
 import java.text.NumberFormat;
-import java.util.concurrent.BlockingQueue;
 
 import org.apache.log4j.Logger;
 
@@ -73,10 +62,8 @@
     this.indexBaseName = subIndexName + BASENAME_SUFFIX;
     // create the progress logger.  We use this.getClass to use the
     // logger belonging to a subclass rather than our own.
-    this.progressLogger = new ProgressLogger(
-            Logger.getLogger(this.getClass()), "documents");
-    closed = false;
-    closingProgress = 0;
+    progressLogger = new ProgressLogger(
+            Logger.getLogger(this.getClass()), "input terms");
     savePositions = false;
   }
 
@@ -118,7 +105,7 @@
     // input documentIDs become output termIDs
     // input termIDs become output documentIDs
     // NB: the variables in this method are named based on output semantics! 
-    
+    progressLogger.start("Inverting index...");
     try {
       // open the input index for reading
       Index inputIndex = MG4JTools.openMg4jIndex(
@@ -155,7 +142,7 @@
             }
             termPostings.setDocumentPointer(outputDocId);
             termPostings.setCount(count);
-            occurrencesInTheCurrentBatch++;
+            occurrencesInTheCurrentBatch += count;
             if(termPostings.outOfMemoryError) {
               // we are running out of memory, dump batches ASAP to free it up.
               indexer.getMg4jIndexer().dumpASAP();
@@ -187,8 +174,12 @@
         }
         if ( // we have been asked to dump 
              ( dumpBatchASAP || 
-               //.. OR we reached the maximum document limit for a batch       
-               documentPointer == MG4JIndexer.DOCUMENTS_PER_BATCH ) &&
+               //.. OR we reached the maximum occurrences for a batch       
+               // We're not storing positions, so the amount of data in the 
+               // index is smaller. We increase the number of occurrences / batch
+               // by a factor of 3.
+               occurrencesInTheCurrentBatch > 
(MG4JIndexer.OCCURRENCES_PER_BATCH * 3)
+               ) &&
              // AND there is data to dump
              occurrencesInTheCurrentBatch > 0 ){
           dumpBatch();
@@ -207,7 +198,7 @@
         }
         
       }
-      
+      inputIndexReader.close();
       // dump the last current batch
       flush();
       // close the index (combine the batches)

Modified: mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirIndexBuilder.java
===================================================================
--- mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirIndexBuilder.java     2012-07-12 13:29:31 UTC (rev 15925)
+++ mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirIndexBuilder.java     2012-07-12 15:13:39 UTC (rev 15926)
@@ -910,8 +910,8 @@
         if (
                // we have been asked to dump 
              ( dumpBatchASAP || 
-               //.. OR we reached the maximum document limit for a batch       
-               documentPointer == MG4JIndexer.DOCUMENTS_PER_BATCH
+               //.. OR we reached the maximum occurrences for a batch  
+               occurrencesInTheCurrentBatch > 
MG4JIndexer.OCCURRENCES_PER_BATCH  
              ) &&
              // AND there is data to dump
              occurrencesInTheCurrentBatch > 0

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and 
threat landscape has changed and how IT managers can respond. Discussions 
will include endpoint security, mobile security and the latest in malware 
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to