Revision: 17370
http://sourceforge.net/p/gate/code/17370
Author: valyt
Date: 2014-02-20 15:35:58 +0000 (Thu, 20 Feb 2014)
Log Message:
-----------
More Javadocs.
Modified Paths:
--------------
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java    2014-02-20 15:12:35 UTC (rev 17369)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java    2014-02-20 15:35:58 UTC (rev 17370)
@@ -96,20 +96,27 @@
import com.google.common.io.PatternFilenameFilter;
/**
+ * <p>
* An inverted index associating terms with documents. Terms can be either token
- * feature values, or semantic annotation URIs. Optionally, a direct index may
- * also be present.
- *
+ * feature values, or annotations. Optionally, a direct index may also be
+ * present.
+ * </p>
+ * <p>
* An atomic index manages a head index (the principal data) and a set of tail
* indexes (batches containing updates). Additionally, the data representing
* all the new documents that have been queued for indexing since the last tail
* was written are stored in RAM.
- *
+ * </p>
+ * <p>
* When direct indexing is enabled, the term IDs in the direct index are
* different from the term IDs in the inverted index. In the inverted index
* the term IDs are their position in the lexicographically sorted list of all
* terms. In the direct index, the term IDs are their position in the list
* sorted by the time they were first seen during indexing.
+ * </p>
+ * <p>
+ * The head and tail batches can be combined into a new head by a
+ * <em>compact</em> operation.
*/
public abstract class AtomicIndex implements Runnable {
@@ -399,48 +406,54 @@
/**
* Given a terms file (text file with one term per line) this method generates
* the corresponding termmap file (binary representation of a StringMap).
- * Optionally, a {@link BloomFilter} can also be generated, if the suitable
+ * Optionally, a {@link BloomFilter} can also be generated, if the suitable
* target file is provided.
+ *
* @param termsFile the input file
- * @param termmapFile the output file
- * @param bloomFilterFile if not null, the file to be used for writing
- * the {@link BloomFilter} for the index.
- * @throws IOException
+ * @param termmapFile the output termmap file, or <code>null</code> if a
+ * termmap is not required.
+ * @param bloomFilterFile the file to be used for writing the
+ * {@link BloomFilter} for the index, or <code>null</code> if a Bloom filter
+ * is not required.
+ * @throws IOException
*/
- public static void generateTermMap(File termsFile, File termmapFile,
+ public static void generateTermMap(File termsFile, File termmapFile,
File bloomFilterFile) throws IOException {
FileLinesCollection fileLinesCollection =
new FileLinesCollection(termsFile.getAbsolutePath(), "UTF-8");
- StringMap<CharSequence> terms = new ShiftAddXorSignedStringMap(
- fileLinesCollection.iterator(),
- new LcpMonotoneMinimalPerfectHashFunction<CharSequence>(
- fileLinesCollection, TransformationStrategies.prefixFreeUtf16()));
- BinIO.storeObject(terms, termmapFile);
- if(bloomFilterFile != null) {
- BloomFilter<Void> bloomFilter = BloomFilter.create(terms.size64());
- for(MutableString term : fileLinesCollection) {
- bloomFilter.add(term);
- }
- BinIO.storeObject(bloomFilter, bloomFilterFile);
+ if(termmapFile != null) {
+ StringMap<CharSequence> terms =
+ new ShiftAddXorSignedStringMap(
+ fileLinesCollection.iterator(),
+ new LcpMonotoneMinimalPerfectHashFunction<CharSequence>(
+ fileLinesCollection, TransformationStrategies.prefixFreeUtf16()));
+ BinIO.storeObject(terms, termmapFile);
+ }
+
+ if(bloomFilterFile != null) {
+ BloomFilter<Void> bloomFilter = BloomFilter.create(fileLinesCollection.size64());
+ for(MutableString term : fileLinesCollection) {
+ bloomFilter.add(term);
}
+ BinIO.storeObject(bloomFilter, bloomFilterFile);
+ }
}
/**
* Creates a documental cluster from a list of {@link MG4JIndex} values.
*
- * @param subIndexes the indexes to be combined into a cluster
+ * @param batches the indexes to be combined into a cluster
* @param termProcessor the term processor to be used (can be null)
* @return a documental cluster view of the list of indexes provided.
*/
protected final static Index openInvertedIndexCluster(
- List<MG4JIndex> subIndexes,
- TermProcessor termProcessor){
+ List<MG4JIndex> batches, TermProcessor termProcessor){
- if(subIndexes == null || subIndexes.size() == 0) return null;
- if(subIndexes.size() == 1) return subIndexes.get(0).invertedIndex;
+ if(batches == null || batches.size() == 0) return null;
+ if(batches.size() == 1) return batches.get(0).invertedIndex;
// prepare the documental cluster
- Index[] indexes = new Index[subIndexes.size()];
+ Index[] indexes = new Index[batches.size()];
long[] cutPoints = new long[indexes.length];
cutPoints[0] = 0;
int numberOfTerms = -1;
@@ -453,7 +466,7 @@
@SuppressWarnings("unchecked")
BloomFilter<Void> bloomFilters[] = new BloomFilter[indexes.length];
- for(MG4JIndex aSubIndex : subIndexes) {
+ for(MG4JIndex aSubIndex : batches) {
indexes[indexIdx] = aSubIndex.invertedIndex;
if(indexIdx < cutPoints.length - 1) {
cutPoints[indexIdx + 1] = cutPoints[indexIdx] +
@@ -490,15 +503,19 @@
);
}
-
- protected final static Index openDirectIndexCluster(
- List<MG4JIndex> subIndexes){
+ /**
+ * Opens the direct index files from all the batches and combines them into
+ * a {@link LexicalCluster}.
+ * @param batches the batches to be opened.
+ * @return
+ */
+ protected final static Index openDirectIndexCluster(List<MG4JIndex> batches){
- if(subIndexes == null || subIndexes.size() == 0) return null;
- if(subIndexes.size() == 1) return subIndexes.get(0).directIndex;
+ if(batches == null || batches.size() == 0) return null;
+ if(batches.size() == 1) return batches.get(0).directIndex;
// prepare the lexical cluster
- Index[] indexes = new Index[subIndexes.size()];
+ Index[] indexes = new Index[batches.size()];
int[] cutPoints = new int[indexes.length];
cutPoints[0] = 0;
String[] cutPointTerms = new String[indexes.length];
@@ -512,7 +529,7 @@
@SuppressWarnings("unchecked")
BloomFilter<Void> bloomFilters[] = new BloomFilter[indexes.length];
- for(MG4JIndex aSubIndex : subIndexes) {
+ for(MG4JIndex aSubIndex : batches) {
indexes[indexIdx] = aSubIndex.directIndex;
// we build this based on the inverted index, as the cut-points for the
// lexical partitioning are based on document IDs
@@ -1058,11 +1075,8 @@
}
/**
- *
+ * Writes the in-RAM data to a new direct index batch.
* @param batchDir
- * @param termArray the in-RAM terms, sorted lexicographically
- * @throws IOException
- * @throws IndexException
*/
protected void writeDirectIndex(File batchDir)
throws IOException, IndexException {
@@ -1240,8 +1254,7 @@
/**
- * Combines all the currently existing sub-indexes, generating a new
- * head index.
+ * Combines all the currently existing batches, generating a new head index.
* @throws IndexException
* @throws IOException
* @throws ConfigurationException
@@ -1440,8 +1453,8 @@
/**
* Instructs this index to dump to disk all the in-RAM index data at the first
* opportunity.
- * @return a {@link Future} value that, upon completion, will return the number of
- * occurrences written to disk.
+ * @return a {@link Future} value that, upon completion, will return the
+ * number of occurrences written to disk.
* @throws InterruptedException if this thread is interrupted while trying to
* queue the dump request.
*/
@@ -1482,7 +1495,7 @@
}
/**
- * Opens one sub-index, specified as a directory inside this Atom Index's
+ * Opens one sub-index, specified as a directory inside this Atomic Index's
* index directory.
* @param subIndexDirname
* @return
@@ -1770,27 +1783,50 @@
}
}
-
+ /**
+ * Gets the top level directory for this atomic index. This will be a
+ * directory contained in the top level directory of the {@link MimirIndex}
+ * which includes this atomic index.
+ * @return
+ */
public File getIndexDirectory() {
return indexDirectory;
}
+ /**
+ * Gets the top level {@link MimirIndex} to which this atomic index belongs.
+ * @return
+ */
public MimirIndex getParent() {
return parent;
}
+ /**
+ * Gets the input queue used by this atomic index. This queue is used to
+ * submit documents for indexing.
+ * @return
+ */
public BlockingQueue<GATEDocument> getInputQueue() {
return inputQueue;
}
+ /**
+ * Gets the output queue used by this atomic index. This is used to
+ * "return" documents that have finished indexing. Notably, values
+ * in this queue will have their occurrences value (see
+ * {@link GATEDocument#getOccurrences()}) increased by the number of
+ * occurrences generated by indexing the document in this atomic index.
+ *
+ * @return
+ */
public BlockingQueue<GATEDocument> getOutputQueue() {
return outputQueue;
}
/**
- * Gets an {@link Index} value that can be used to search this atomic index.
- * This will normally be a {@link DocumentalCluster} view over all the
- * sub-indexes contained.
+ * Gets the inverted index (an {@link Index} value) that can be used to
+ * search this atomic index. This will normally be a
+ * {@link DocumentalCluster} view over all the batches contained.
* @return
*/
public Index getIndex() {
@@ -1818,18 +1854,32 @@
/**
* Gets the term string for a given direct term ID. The term ID must have been
- * obtained from this index's direct index.
+ * obtained from the direct index of this index.
* @param termId the ID for the term being sought.
* @return the string for the given term.
*/
public CharSequence getDirectTerm(long termId) {
return directTerms.get(termId);
}
-
+
+ /**
+ * Gets the list of direct terms for this index. The terms are sorted by the
+ * time they were first seen, and <strong>not</strong> lexicographically.
+ * @return
+ */
public ObjectBigList<? extends CharSequence> getDirectTerms() {
return directTerms;
}
+ /**
+ * Gets the occurrence count in the whole index for a given direct term,
+ * specified by a direct term ID (which must have been obtained from the
+ * direct index of this index).
+ *
+ * @param directTermId
+ * @return
+ * @throws IOException
+ */
public long getDirectTermOccurenceCount(long directTermId) throws IOException {
String termStr = directTerms.get(directTermId);
// we need to sum up all the counts for this term in the inverted index
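
For reference, a minimal sketch of how the revised generateTermMap contract
documented above (either output may now be skipped by passing null) could be
exercised. This is not part of the commit; the file paths are hypothetical
placeholders chosen only for illustration.

import gate.mimir.index.AtomicIndex;
import java.io.File;
import java.io.IOException;

public class GenerateTermMapExample {
  public static void main(String[] args) throws IOException {
    // Hypothetical paths; the real batch layout is managed inside AtomicIndex.
    File termsFile = new File("batch-0/terms");              // one term per line
    File termmapFile = new File("batch-0/termmap");          // serialised StringMap
    File bloomFilterFile = new File("batch-0/bloomfilter");  // serialised BloomFilter

    // Build both the termmap and the Bloom filter:
    AtomicIndex.generateTermMap(termsFile, termmapFile, bloomFilterFile);

    // Per the new Javadoc, either output can be omitted by passing null:
    AtomicIndex.generateTermMap(termsFile, termmapFile, null);      // termmap only
    AtomicIndex.generateTermMap(termsFile, null, bloomFilterFile);  // Bloom filter only
  }
}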