[gate-cvs] SF.net SVN: gate:[17192] mimir/branches/5.0

valyt Fri, 20 Dec 2013 02:56:45 -0800

Revision: 17192
          http://sourceforge.net/p/gate/code/17192
Author:   valyt
Date:     2013-12-20 10:56:20 +0000 (Fri, 20 Dec 2013)
Log Message:
-----------
We now seem able to create multi-tailed Token indexes (or at least some files 
on disk that have the correct names).


Modified Paths:
--------------
    mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
    mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java
    mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/MimirIndexBuilder.java
    mimir/branches/5.0/mimir-core/src/gate/mimir/util/MG4JTools.java
    mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java

Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java 
2013-12-20 08:57:15 UTC (rev 17191)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java 
2013-12-20 10:56:20 UTC (rev 17192)
@@ -21,22 +21,31 @@
 import gate.util.GateRuntimeException;
 import it.unimi.di.big.mg4j.index.CompressionFlags;
 import it.unimi.di.big.mg4j.index.DiskBasedIndex;
+import it.unimi.di.big.mg4j.index.Index;
 import it.unimi.di.big.mg4j.index.IndexReader;
 import it.unimi.di.big.mg4j.index.IndexWriter;
 import it.unimi.di.big.mg4j.index.QuasiSuccinctIndex;
 import it.unimi.di.big.mg4j.index.QuasiSuccinctIndexWriter;
 import it.unimi.di.big.mg4j.io.IOFactory;
+import it.unimi.di.big.mg4j.tool.Scan;
+import it.unimi.dsi.big.io.FileLinesCollection;
+import it.unimi.dsi.big.util.ShiftAddXorSignedStringMap;
+import it.unimi.dsi.big.util.StringMap;
 import it.unimi.dsi.bits.Fast;
+import it.unimi.dsi.bits.TransformationStrategies;
 import it.unimi.dsi.fastutil.Arrays;
 import it.unimi.dsi.fastutil.Hash;
 import it.unimi.dsi.fastutil.Swapper;
 import it.unimi.dsi.fastutil.ints.IntArrayList;
 import it.unimi.dsi.fastutil.ints.IntComparator;
 import it.unimi.dsi.fastutil.ints.IntList;
+import it.unimi.dsi.fastutil.io.BinIO;
 import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
 import it.unimi.dsi.fastutil.objects.Object2ReferenceOpenHashMap;
 import it.unimi.dsi.io.OutputBitStream;
 import it.unimi.dsi.lang.MutableString;
+import it.unimi.dsi.sux4j.mph.LcpMonotoneMinimalPerfectHashFunction;
+import it.unimi.dsi.util.Properties;
 
 import java.io.File;
 import java.io.FileOutputStream;
@@ -46,6 +55,7 @@
 import java.nio.ByteOrder;
 import java.util.concurrent.BlockingQueue;
 
+import org.apache.commons.configuration.ConfigurationException;
 import org.apache.log4j.Logger;
 
 import com.google.common.io.PatternFilenameFilter;
@@ -80,7 +90,7 @@
     /**
      * The list of document pointer differentials (differences from 
      * {@link #firstDocumentPointer}). For the sake of easy alignment, we 
-     * actaully store a <tt>0</tt> on the first position.
+     * actually store a <tt>0</tt> on the first position.
      */
     private IntList documentPointersDifferential;
     
@@ -230,6 +240,22 @@
     }
   }
   
+  /**
+   * Given a terms file (text file with one term per line) this method 
generates
+   * the corresponding termmap file (binary representation of a StringMap).
+   * @param termsFile the input file
+   * @param termmapFile the output file
+   * @throws IOException 
+   */
+  public static void generateTermMap(File termsFile, File termmapFile) throws 
IOException {
+    FileLinesCollection fileLinesCollection =
+        new FileLinesCollection(termsFile.getAbsolutePath(), "UTF-8");
+      StringMap<CharSequence> terms = new ShiftAddXorSignedStringMap(
+        fileLinesCollection.iterator(),
+        new LcpMonotoneMinimalPerfectHashFunction<CharSequence>(
+          fileLinesCollection, TransformationStrategies.prefixFreeUtf16()));
+      BinIO.storeObject(terms, termmapFile);    
+  }  
 
   /**
    * The file name (under the current directory for this atomic index) which 
@@ -255,8 +281,6 @@
    */
   public static final String TAIL_FILE_NAME_PREFIX = "tail-";
   
-  
-  
   /**
    * The file name (under the current directory for this atomic index) for the
    * directory containing the documents that have been queued for indexing, 
but 
@@ -284,16 +308,18 @@
   protected String name;
   
   protected File indexDirectory;
+
   
   /**
-   * The number of occurrences stored in this index.
+   * The size (number of terms) for the longest document indexed but not yet 
+   * saved. 
    */
-  protected long totalOccurrences;
+  protected int maxDocSizeInRam = -1;
   
   /**
    * The number of occurrences represented in RAM and not yet written to disk. 
 
    */
-  protected long newOccurrences;
+  protected long occurrencesInRam = 0;
   
   /**
    * How many occurrences to be accumulated in RAM before a new tail batch is
@@ -306,6 +332,12 @@
    */
   protected MimirIndex parent;
   
+  /**
+   * A set of properties added to the ones obtained from the index writer when
+   * writing out batches.
+   */
+  protected Properties additionalProperties;
+  
   protected boolean hasDirectIndex;
   
   protected Thread indexingThread;
@@ -368,6 +400,7 @@
     
     this.currentTerm = new MutableString();
     
+    this.additionalProperties = new Properties();
     
   }
 
@@ -378,14 +411,12 @@
     } else {
       // new index creation
       indexDirectory.mkdirs();
-      
-      totalOccurrences = 0;
-      newOccurrences = 0;
       documentPointer = 0;
-      
-      termMap = new Object2ReferenceOpenHashMap<MutableString, 
-          PostingsList>(INITIAL_TERM_MAP_SIZE, Hash.FAST_LOAD_FACTOR );
-    }    
+    }
+    occurrencesInRam = 0;
+    maxDocSizeInRam = -1;
+    termMap = new Object2ReferenceOpenHashMap<MutableString, 
+        PostingsList>(INITIAL_TERM_MAP_SIZE, Hash.FAST_LOAD_FACTOR );
        }
        
   /**
@@ -417,19 +448,25 @@
        /**
         * Writes all the data currently stored in RAM to a new tail index.
         * @throws IOException 
+        * @throws IndexException 
         */
-       public void writeNewTail() throws IOException {
+       protected void writeCurrentTail() throws IOException, IndexException {
          // find the name for the new tail
-         String[] existingTails = indexDirectory.list(TAILS_FILENAME_FILTER);
          int tailNo = -1;
-         for(String aTail : existingTails) {
-           int aTailNo = 
Integer.parseInt(aTail.substring(TAIL_FILE_NAME_PREFIX.length()));
-           if(aTailNo > tailNo) tailNo = aTailNo;
+         File headDir = new File(indexDirectory, HEAD_FILE_NAME);
+         if(headDir.exists()) {
+           // we have a head, calculate the tail number for this new tail
+           String[] existingTails = indexDirectory.list(TAILS_FILENAME_FILTER);
+           for(String aTail : existingTails) {
+             int aTailNo = 
Integer.parseInt(aTail.substring(TAIL_FILE_NAME_PREFIX.length()));
+             if(aTailNo > tailNo) tailNo = aTailNo;
+           }
+           tailNo++;       
          }
-         tailNo++;
          
          // Open an index writer for the new tail
-         String newTailName = TAIL_FILE_NAME_PREFIX + Integer.toString(tailNo);
+         String newTailName = tailNo == -1 ? HEAD_FILE_NAME : 
+             (TAIL_FILE_NAME_PREFIX + Integer.toString(tailNo));
          File newTailDir = new File(indexDirectory, newTailName);
          newTailDir.mkdir();
          String mg4jBasename = new File(newTailDir, name).getAbsolutePath();
@@ -445,7 +482,7 @@
     int numTerms = termMap.size();
     logger.info( "Generating index for batch " + newTailName + 
             "; documents: " + documentPointer + "; terms:" + numTerms + 
-            "; occurrences: " + newOccurrences );
+            "; occurrences: " + occurrencesInRam );
     
     // We write down all term in appearance order in termArray.
     final MutableString[] termArray = termMap.keySet().toArray(new 
MutableString[ numTerms ]);
@@ -470,7 +507,7 @@
                 termArray[other] = temp;
               }
             });
-         // write the term map
+         // write the terms and termmap files
     PrintWriter pw = new PrintWriter( 
         new OutputStreamWriter(new FastBufferedOutputStream(
             new FileOutputStream(mg4jBasename + 
DiskBasedIndex.TERMS_EXTENSION), 
@@ -480,6 +517,9 @@
       t.println( pw );
     }
     pw.close();
+    generateTermMap(new File(mg4jBasename + DiskBasedIndex.TERMS_EXTENSION),
+        new File(mg4jBasename + DiskBasedIndex.TERMMAP_EXTENSION));
+    
     // write the actual index
     int maxCount = 0;
     for ( int i = 0; i < numTerms; i++ ) {
@@ -487,8 +527,25 @@
       if ( maxCount < postingsList.maxCount ) maxCount = postingsList.maxCount;
       postingsList.write(indexWriter);
     }
-    
     indexWriter.close();
+    // write the index properties
+    try {
+      Properties properties = indexWriter.properties();
+      additionalProperties.setProperty( Index.PropertyKeys.SIZE, 
+          indexWriter.writtenBits());
+      // -1 means unknown
+      additionalProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE, 
+          maxDocSizeInRam);
+      additionalProperties.setProperty( Index.PropertyKeys.MAXCOUNT, maxCount 
);
+      additionalProperties.setProperty( Index.PropertyKeys.OCCURRENCES, 
+          occurrencesInRam );
+      properties.addAll(additionalProperties);
+      Scan.saveProperties( IOFactory.FILESYSTEM_FACTORY, properties, 
+          mg4jBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
+    } catch(ConfigurationException e) {
+      // this should never happen
+      throw new IndexException("Error while saving tail properties", e);
+    }
          // merge new tail into index cluster
     
          if(hasDirectIndex) {
@@ -497,13 +554,16 @@
          }
          
          // clear queued-documents folder
+       
+         // clear out internal state, in preparation for the next tail
          
-         newOccurrences = 0;
-         
-         termMap.clear();
+         occurrencesInRam = 0;
+    maxDocSizeInRam = -1;
+    documentPointer = 0;
+    termMap.clear();
+    termMap.trim( INITIAL_TERM_MAP_SIZE );
        }
        
-       
        /**
         * Combines all the currently existing tails into the head, generating 
a new
         * head index.
@@ -543,13 +603,13 @@
             logger.error("Problem while indexing document!", e);
           }
           //dump batch if needed AND there is data to dump
-          if (occurrencesPerBatch > 0 && newOccurrences > occurrencesPerBatch){
-            writeNewTail();
+          if (occurrencesPerBatch > 0 && occurrencesInRam > 
occurrencesPerBatch){
+            writeCurrentTail();
           }
           outputQueue.put(aDocument);
         }
         // we're done
-        writeNewTail();
+        writeCurrentTail();
         flush();
          }
          }catch(InterruptedException e) {
@@ -658,8 +718,11 @@
             currentTerm.replace(aTerm == null ? "" : aTerm);
             indexCurrentTerm();
           }
-        }        
+        }
       }
+      // the current document is finished
+      int docLength = tokenPosition + 1;
+      if(docLength > maxDocSizeInRam) maxDocSizeInRam = docLength;
     } catch (IOException e) {
       throw new IndexException("IO Exception while indexing", e);
     }finally {
@@ -686,10 +749,29 @@
     //for duplicate values.
     if(termPostings.checkPosition(tokenPosition)){
       termPostings.addPosition(tokenPosition);
-      newOccurrences++;
+      occurrencesInRam++;
     } else {
       logger.debug("Duplicate position");
     }
   }
-  
+
+  public long getOccurrencesPerBatch() {
+    return occurrencesPerBatch;
+  }
+
+  public void setOccurrencesPerBatch(long occurrencesPerBatch) {
+    this.occurrencesPerBatch = occurrencesPerBatch;
+  }
+
+  public boolean isHasDirectIndex() {
+    return hasDirectIndex;
+  }
+
+  public void setHasDirectIndex(boolean hasDirectIndex) {
+    this.hasDirectIndex = hasDirectIndex;
+  }
+
+  public File getIndexDirectory() {
+    return indexDirectory;
+  }
 }

Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java    
2013-12-20 08:57:15 UTC (rev 17191)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java    
2013-12-20 10:56:20 UTC (rev 17192)
@@ -23,7 +23,9 @@
 import gate.mimir.index.mg4j.GATEDocumentFactory;
 import gate.mimir.index.mg4j.zipcollection.DocumentCollectionWriter;
 import gate.mimir.index.mg4j.zipcollection.DocumentData;
+import it.unimi.di.big.mg4j.index.Index;
 import it.unimi.di.big.mg4j.index.TermProcessor;
+import it.unimi.dsi.lang.ObjectParser;
 
 import java.io.File;
 import java.io.IOException;
@@ -118,6 +120,9 @@
       logger.info("Creating zipped collection for field \"" + name + "\"");
       collectionWriter = new DocumentCollectionWriter(indexDirectory);
     }
+    // save the term processor
+    additionalProperties.setProperty(Index.PropertyKeys.TERMPROCESSOR, 
+        ObjectParser.toSpec(termProcessor));
     
     indexingThread = new Thread(this, "Mimir-" + name + " indexing thread");
     indexingThread.start();
@@ -171,7 +176,8 @@
    * order of offset.
    */
   protected Annotation[] getAnnotsToProcess(GATEDocument gateDocument) {
-    return gateDocument.getTokenAnnots();
+    Annotation[] tokens = gateDocument.getTokenAnnots(); 
+    return tokens;
   }
 
   /**
@@ -227,6 +233,4 @@
     }
     super.flush();
   }  
-  
-
 }

Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/MimirIndexBuilder.java
===================================================================
--- 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/MimirIndexBuilder.java  
    2013-12-20 08:57:15 UTC (rev 17191)
+++ 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/MimirIndexBuilder.java  
    2013-12-20 10:56:20 UTC (rev 17192)
@@ -16,14 +16,11 @@
 
 import gate.Annotation;
 import gate.mimir.IndexConfig;
+import gate.mimir.index.AtomicIndex;
 import gate.mimir.index.IndexException;
 import gate.mimir.index.Indexer;
 import gate.util.GateRuntimeException;
 import it.unimi.dsi.Util;
-import it.unimi.dsi.big.io.FileLinesCollection;
-import it.unimi.dsi.big.util.ShiftAddXorSignedStringMap;
-import it.unimi.dsi.big.util.StringMap;
-import it.unimi.dsi.bits.TransformationStrategies;
 import it.unimi.dsi.fastutil.Arrays;
 import it.unimi.dsi.fastutil.Hash;
 import it.unimi.dsi.fastutil.Swapper;
@@ -60,7 +57,6 @@
 import it.unimi.di.big.mg4j.tool.Concatenate;
 import it.unimi.di.big.mg4j.tool.Scan;
 import it.unimi.di.big.mg4j.tool.Scan.Completeness;
-import it.unimi.dsi.sux4j.mph.LcpMonotoneMinimalPerfectHashFunction;
 import it.unimi.dsi.util.Properties;
 
 import java.io.File;
@@ -999,7 +995,7 @@
           .getStringArray(IndexCluster.PropertyKeys.LOCALINDEX);
       combineBatches(inputBasename, getGlobalFile("").getAbsolutePath());
       // save the termMap
-      generateTermMap(getGlobalFile(DiskBasedIndex.TERMS_EXTENSION), 
+      
AtomicIndex.generateTermMap(getGlobalFile(DiskBasedIndex.TERMS_EXTENSION), 
         getGlobalFile(DiskBasedIndex.TERMMAP_EXTENSION));
       // closing completed
       closingProgress = 1;
@@ -1018,23 +1014,6 @@
   }
   
   /**
-   * Given a terms file (text file with one term per line) this method 
generates
-   * the corresponding termmap file (binary representation of a StringMap).
-   * @param termsFile the input file
-   * @param termmapFile the output file
-   * @throws IOException 
-   */
-  public static void generateTermMap(File termsFile, File termmapFile) throws 
IOException {
-    FileLinesCollection fileLinesCollection =
-        new FileLinesCollection(termsFile.getAbsolutePath(), "UTF-8");
-      StringMap<CharSequence> terms = new ShiftAddXorSignedStringMap(
-        fileLinesCollection.iterator(),
-        new LcpMonotoneMinimalPerfectHashFunction<CharSequence>(
-          fileLinesCollection, TransformationStrategies.prefixFreeUtf16()));
-      BinIO.storeObject(terms, termmapFile);    
-  }
-  
-  /**
    * Combines a set of batches. If the provided number of input batches is 
    * greater than {@link #MAXIMUM_BATCHES_TO_COMBINE}, then this method will 
    * start hierarchical batch combination: it will combine 

Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/util/MG4JTools.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/util/MG4JTools.java    
2013-12-20 08:57:15 UTC (rev 17191)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/util/MG4JTools.java    
2013-12-20 10:56:20 UTC (rev 17192)
@@ -14,7 +14,7 @@
  */
 package gate.mimir.util;
 
-import gate.mimir.index.mg4j.MimirIndexBuilder;
+import gate.mimir.index.AtomicIndex;
 import gate.mimir.search.QueryEngine;
 import it.unimi.di.big.mg4j.index.DiskBasedIndex;
 import it.unimi.di.big.mg4j.index.Index;
@@ -66,7 +66,7 @@
         // and generate the new one
         File termsFile = new File(URI.create(indexUri.toString()
           + DiskBasedIndex.TERMS_EXTENSION));
-        MimirIndexBuilder.generateTermMap(termsFile, termMapFile);
+        AtomicIndex.generateTermMap(termsFile, termMapFile);
       } else {
         throw new IOException("Could not rename old termmap file (" + 
             termMapFile.getAbsolutePath() + ").");

Modified: mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java
===================================================================
--- mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java      
2013-12-20 08:57:15 UTC (rev 17191)
+++ mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java      
2013-12-20 10:56:20 UTC (rev 17192)
@@ -230,13 +230,13 @@
         inputQueue, outputQueue, 
         indexConfig.getTokenIndexers()[0], 
         false);
-    
+    ati.setOccurrencesPerBatch(500000);
     File zipFile = new File(args[1]);
     String fileURI = zipFile.toURI().toString();
     ZipFile zip = new ZipFile(args[1]);
     Enumeration<? extends ZipEntry> entries = zip.entries();
     
-    int copies = 1;
+    int copies = 100;
     while(entries.hasMoreElements()) {
       ZipEntry entry = entries.nextElement();
       if(entry.isDirectory()) {
@@ -250,7 +250,6 @@
       }
     }
     ati.close();
-
   }
   
   /**

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Rapidly troubleshoot problems before they affect your business. Most IT 
organizations don't have a clear picture of how application performance 
affects their revenue. With AppDynamics, you get 100% visibility into your 
Java, .NET, & PHP applications. Start your 15-day FREE TRIAL of AppDynamics Pro!
http://pubads.g.doubleclick.net/gampad/clk?id=84349831&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

[gate-cvs] SF.net SVN: gate:[17192] mimir/branches/5.0

Reply via email to