AtomicIndex.java

valyt Fri, 20 Dec 2013 03:25:37 -0800

Revision: 17194
          http://sourceforge.net/p/gate/code/17194
Author:   valyt
Date:     2013-12-20 11:25:05 +0000 (Fri, 20 Dec 2013)
Log Message:
-----------
Renamed some variables.


Modified Paths:
--------------
    mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java

Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java 
2013-12-20 11:07:26 UTC (rev 17193)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java 
2013-12-20 11:25:05 UTC (rev 17194)
@@ -218,7 +218,7 @@
       flush();
       if(indexWriter instanceof QuasiSuccinctIndexWriter) {
         ((QuasiSuccinctIndexWriter)indexWriter).newInvertedList(frequency,
-            occurrences, sumMaxPos);  
+            occurrences, sumMaxPos);
       } else {
         indexWriter.newInvertedList();
       }
@@ -314,12 +314,12 @@
    * The size (number of terms) for the longest document indexed but not yet 
    * saved. 
    */
-  protected int maxDocSizeInRam = -1;
+  protected int maxDocSizeInRAM = -1;
   
   /**
    * The number of occurrences represented in RAM and not yet written to disk. 
 
    */
-  protected long occurrencesInRam = 0;
+  protected long occurrencesInRAM = 0;
   
   /**
    * How many occurrences to be accumulated in RAM before a new tail batch is
@@ -365,11 +365,18 @@
   protected MutableString currentTerm;
   
   /**
-   * The current document pointer (gets incremented for each document).
+   * The current document pointer (gets incremented for each document). This is
+   * a global counter over all documents in the atomic index, in all batches.
    */
   protected long documentPointer;
   
+  
   /**
+   * The number of documents currently stored in RAM.
+   */
+  protected long documentsInRAM;
+  
+  /**
    * An in-memory inverted index that gets dumped to files for each batch. 
    */
   protected Object2ReferenceOpenHashMap<MutableString, PostingsList> termMap;
@@ -413,8 +420,9 @@
       indexDirectory.mkdirs();
       documentPointer = 0;
     }
-    occurrencesInRam = 0;
-    maxDocSizeInRam = -1;
+    occurrencesInRAM = 0;
+    maxDocSizeInRAM = -1;
+    documentsInRAM = 0;
     termMap = new Object2ReferenceOpenHashMap<MutableString, 
         PostingsList>(INITIAL_TERM_MAP_SIZE, Hash.FAST_LOAD_FACTOR );
        }
@@ -473,19 +481,19 @@
          QuasiSuccinctIndexWriter indexWriter = new QuasiSuccinctIndexWriter(
              IOFactory.FILESYSTEM_FACTORY,
              mg4jBasename,
-             documentPointer,
+             documentsInRAM,
              Fast.mostSignificantBit(QuasiSuccinctIndex.DEFAULT_QUANTUM),
              QuasiSuccinctIndexWriter.DEFAULT_CACHE_SIZE,
              CompressionFlags.DEFAULT_QUASI_SUCCINCT_INDEX,
              ByteOrder.nativeOrder());
          // write the data from RAM
-    int numTerms = termMap.size();
+    int numTermsInRAM = termMap.size();
     logger.info( "Generating index for batch " + newTailName + 
-            "; documents: " + documentPointer + "; terms:" + numTerms + 
-            "; occurrences: " + occurrencesInRam );
+            "; documents: " + documentsInRAM + "; terms:" + numTermsInRAM + 
+            "; occurrences: " + occurrencesInRAM );
     
     // We write down all term in appearance order in termArray.
-    final MutableString[] termArray = termMap.keySet().toArray(new 
MutableString[ numTerms ]);
+    final MutableString[] termArray = termMap.keySet().toArray(new 
MutableString[ numTermsInRAM ]);
     // We sort the terms appearing in the batch and write them on disk.
     Arrays.quickSort(0, termArray.length, 
             new IntComparator() {
@@ -522,7 +530,7 @@
     
     // write the actual index
     int maxCount = 0;
-    for ( int i = 0; i < numTerms; i++ ) {
+    for ( int i = 0; i < numTermsInRAM; i++ ) {
       PostingsList postingsList = termMap.get( termArray[ i ] );
       if ( maxCount < postingsList.maxCount ) maxCount = postingsList.maxCount;
       postingsList.write(indexWriter);
@@ -535,10 +543,10 @@
           indexWriter.writtenBits());
       // -1 means unknown
       additionalProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE, 
-          maxDocSizeInRam);
+          maxDocSizeInRAM);
       additionalProperties.setProperty( Index.PropertyKeys.MAXCOUNT, maxCount 
);
       additionalProperties.setProperty( Index.PropertyKeys.OCCURRENCES, 
-          occurrencesInRam );
+          occurrencesInRAM );
       properties.addAll(additionalProperties);
       Scan.saveProperties( IOFactory.FILESYSTEM_FACTORY, properties, 
           mg4jBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
@@ -557,8 +565,12 @@
        
          // clear out internal state, in preparation for the next tail
          
-         occurrencesInRam = 0;
-    maxDocSizeInRam = -1;
+         occurrencesInRAM = 0;
+    maxDocSizeInRAM = -1;
+    documentsInRAM = 0;
+    // hack to force zero-based batches. This forces us to use a merged cluster
+    // but avoids exceptions during indexing. We'll fix this if possible, after 
+    // we get advice on the MG4J mailing list.
     documentPointer = 0;
     termMap.clear();
     termMap.trim( INITIAL_TERM_MAP_SIZE );
@@ -603,7 +615,7 @@
             logger.error("Problem while indexing document!", e);
           }
           //dump batch if needed AND there is data to dump
-          if (occurrencesPerBatch > 0 && occurrencesInRam > 
occurrencesPerBatch){
+          if (occurrencesPerBatch > 0 && occurrencesInRAM > 
occurrencesPerBatch){
             writeCurrentTail();
           }
           outputQueue.put(aDocument);
@@ -722,12 +734,13 @@
       }
       // the current document is finished
       int docLength = tokenPosition + 1;
-      if(docLength > maxDocSizeInRam) maxDocSizeInRam = docLength;
+      if(docLength > maxDocSizeInRAM) maxDocSizeInRAM = docLength;
     } catch (IOException e) {
       throw new IndexException("IO Exception while indexing", e);
     }finally {
       documentEnding(gateDocument);
       documentPointer++;
+      documentsInRAM++;
     }
   }
   
@@ -749,7 +762,7 @@
     //for duplicate values.
     if(termPostings.checkPosition(tokenPosition)){
       termPostings.addPosition(tokenPosition);
-      occurrencesInRam++;
+      occurrencesInRAM++;
     } else {
       logger.debug("Duplicate position");
     }

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Rapidly troubleshoot problems before they affect your business. Most IT 
organizations don't have a clear picture of how application performance 
affects their revenue. With AppDynamics, you get 100% visibility into your 
Java,.NET, & PHP application. Start your 15-day FREE TRIAL of AppDynamics Pro!
http://pubads.g.doubleclick.net/gampad/clk?id=84349831&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

[gate-cvs] SF.net SVN: gate:[17194] mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java

Reply via email to