Revision: 17196
          http://sourceforge.net/p/gate/code/17196
Author:   valyt
Date:     2013-12-20 15:20:21 +0000 (Fri, 20 Dec 2013)
Log Message:
-----------
We now write .sizes and .stats files as well, which brings us back to the same 
on-disk representation of an MG4J index as Mímir version 4.

Started work on designing the clustering of indexes.  

Modified Paths:
--------------
    mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java

Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java 
2013-12-20 12:49:03 UTC (rev 17195)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java 
2013-12-20 15:20:21 UTC (rev 17196)
@@ -26,6 +26,7 @@
 import it.unimi.di.big.mg4j.index.IndexWriter;
 import it.unimi.di.big.mg4j.index.QuasiSuccinctIndex;
 import it.unimi.di.big.mg4j.index.QuasiSuccinctIndexWriter;
+import it.unimi.di.big.mg4j.index.cluster.DocumentalCluster;
 import it.unimi.di.big.mg4j.io.IOFactory;
 import it.unimi.di.big.mg4j.tool.Scan;
 import it.unimi.dsi.big.io.FileLinesCollection;
@@ -45,14 +46,17 @@
 import it.unimi.dsi.io.OutputBitStream;
 import it.unimi.dsi.lang.MutableString;
 import it.unimi.dsi.sux4j.mph.LcpMonotoneMinimalPerfectHashFunction;
+import it.unimi.dsi.util.BloomFilter;
 import it.unimi.dsi.util.Properties;
 
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
+import java.io.PrintStream;
 import java.io.PrintWriter;
 import java.nio.ByteOrder;
+import java.util.List;
 import java.util.concurrent.BlockingQueue;
 
 import org.apache.commons.configuration.ConfigurationException;
@@ -241,6 +245,22 @@
   }
   
   /**
+   * Class representing an MG4J index batch, such as the head or any of the 
+   * tails.
+   */
+  protected static class MG4JIndex {
+    protected File indexDir;
+    protected Index index;
+    protected BloomFilter<Void> termFilter;
+    int numberOfDocuments;
+    int numberOfTerms;
+    long numberOfPostings;
+    long numberOfOccurences;
+    int maxCount;
+    
+  }
+  
+  /**
    * Given a terms file (text file with one term per line) this method 
generates
    * the corresponding termmap file (binary representation of a StringMap).
    * @param termsFile the input file
@@ -333,6 +353,22 @@
   protected MimirIndex parent;
   
   /**
+   * The head index for this atomic index.
+   */
+  protected MG4JIndex head;
+  
+  /**
+   * The tails for this atomic index.
+   */
+  protected List<MG4JIndex> tails;
+  
+  /**
+   * The cluster-view of all the MG4J indexes that are part of this index (i.e.
+   * the head and all the tails). 
+   */
+  protected DocumentalCluster indexCluster;
+  
+  /**
    * A set of properties added to the ones obtained from the index writer when
    * writing out batches.
    */
@@ -372,7 +408,7 @@
   
   
   /**
-   * THe number of documents currently stored in RAM.
+   * The number of documents currently stored in RAM.
    */
   protected long documentsInRAM;
   
@@ -382,6 +418,11 @@
   protected Object2ReferenceOpenHashMap<MutableString, PostingsList> termMap;
   
   /**
+   * The sizes (numbers of terms) for all the documents indexed in RAM.
+   */
+  protected IntArrayList documentSizesInRAM;
+  
+  /**
    * Creates a new AtomicIndex
    * 
    * @param parent the {@link MimirIndex} containing this atomic index.
@@ -411,21 +452,6 @@
     
   }
 
-       protected void initIndex() {
-    if(indexDirectory.exists()) {
-      // opening an existing index
-      //TODO
-    } else {
-      // new index creation
-      indexDirectory.mkdirs();
-      documentPointer = 0;
-    }
-    occurrencesInRAM = 0;
-    maxDocSizeInRAM = -1;
-    documentsInRAM = 0;
-    termMap = new Object2ReferenceOpenHashMap<MutableString, 
-        PostingsList>(INITIAL_TERM_MAP_SIZE, Hash.FAST_LOAD_FACTOR );
-       }
        
   /**
         * Gets the name of this atomic index. This is used as the file name 
for the 
@@ -454,6 +480,32 @@
        }
        
        /**
+        * Starts a new MG4J batch. First time around this will be the head, 
+        * subsequent calls will start a new tail.
+        */
+       protected void newBatch() {
+         occurrencesInRAM = 0;
+    maxDocSizeInRAM = -1;
+    documentsInRAM = 0;
+    // hack to force zero-based batches. This forces us to use a merged cluster
+    // but avoids exceptions during indexing. We'll fix this if possible, 
after 
+    // we get advice on the MG4J mailing list.
+    documentPointer = 0;
+    if(termMap == null) {
+      termMap = new Object2ReferenceOpenHashMap<MutableString, 
+          PostingsList>(INITIAL_TERM_MAP_SIZE, Hash.FAST_LOAD_FACTOR );      
+    } else {
+      termMap.clear();
+      termMap.trim( INITIAL_TERM_MAP_SIZE );
+    } 
+    if(documentSizesInRAM  == null) {
+      documentSizesInRAM = new IntArrayList();
+    } else {
+      documentSizesInRAM.clear();
+    }
+       }
+       
+       /**
         * Writes all the data currently stored in RAM to a new tail index.
         * @throws IOException 
         * @throws IndexException 
@@ -527,6 +579,13 @@
     pw.close();
     generateTermMap(new File(mg4jBasename + DiskBasedIndex.TERMS_EXTENSION),
         new File(mg4jBasename + DiskBasedIndex.TERMMAP_EXTENSION));
+    // write the sizes file
+    File sizesFile = new File(mg4jBasename + DiskBasedIndex.SIZES_EXTENSION);
+    OutputBitStream sizesStream = new OutputBitStream(sizesFile);   
+    for(int docSize : documentSizesInRAM.elements()) {
+      sizesStream.writeGamma(docSize);
+    }
+    sizesStream.close();
     
     // write the actual index
     int maxCount = 0;
@@ -550,6 +609,12 @@
       properties.addAll(additionalProperties);
       Scan.saveProperties( IOFactory.FILESYSTEM_FACTORY, properties, 
           mg4jBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
+      
+      // write stats
+      PrintStream statsPs = new PrintStream(new File(mg4jBasename + 
+          DiskBasedIndex.STATS_EXTENSION));
+      indexWriter.printStats(statsPs);
+      statsPs.close();
     } catch(ConfigurationException e) {
       // this should never happen
       throw new IndexException("Error while saving tail properties", e);
@@ -563,17 +628,8 @@
          
          // clear queued-documents folder
        
-         // clear out internal state, in preparation for the next tail
-         
-         occurrencesInRAM = 0;
-    maxDocSizeInRAM = -1;
-    documentsInRAM = 0;
-    // hack to force zero-based batches. This forces us to use a merged cluster
-    // but avoids exceptions during indexing. We'll fix this if possible, 
after 
-    // we get advice on the MG4J mailing list.
-    documentPointer = 0;
-    termMap.clear();
-    termMap.trim( INITIAL_TERM_MAP_SIZE );
+         // clear out internal state, in preparation for the next tail  
+         newBatch();
        }
        
        /**
@@ -606,7 +662,18 @@
          indexingThread = Thread.currentThread();
          GATEDocument aDocument;
          try{
-         initIndex();
+           // open the index
+           if(indexDirectory.exists()) {
+             // opening an existing index
+             //TODO
+           } else {
+             // new index creation
+             indexDirectory.mkdirs();
+             documentPointer = 0;
+           }
+           
+           // start in-RAM indexing
+           newBatch();
          if(inputQueue != null) {
         while((aDocument = inputQueue.take()) != GATEDocument.END_OF_QUEUE){
           try {
@@ -735,6 +802,8 @@
       // the current document is finished
       int docLength = tokenPosition + 1;
       if(docLength > maxDocSizeInRAM) maxDocSizeInRAM = docLength;
+      documentSizesInRAM.add(docLength);
+      
     } catch (IOException e) {
       throw new IndexException("IO Exception while indexing", e);
     }finally {

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Rapidly troubleshoot problems before they affect your business. Most IT 
organizations don't have a clear picture of how application performance 
affects their revenue. With AppDynamics, you get 100% visibility into your 
Java,.NET, & PHP application. Start your 15-day FREE TRIAL of AppDynamics Pro!
http://pubads.g.doubleclick.net/gampad/clk?id=84349831&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to