Revision: 17244
http://sourceforge.net/p/gate/code/17244
Author: valyt
Date: 2014-01-22 17:30:47 +0000 (Wed, 22 Jan 2014)
Log Message:
-----------
We can now generate a set of files on disk that *look like* a direct index.
Still untested, so no idea whether the data it contains makes any sense at all.
Modified Paths:
--------------
mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java	2014-01-22 16:06:49 UTC (rev 17243)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java	2014-01-22 17:30:47 UTC (rev 17244)
@@ -528,6 +528,14 @@
return occurrencesPerBatch;
}
+ /**
+ * Gets the current estimated number of occurrences stored in RAM.
+ * @return the estimated number of occurrences currently held in RAM.
+ */
+ public long getOccurrencesInRam() {
+ return occurrencesInRam;
+ }
+
public int getIndexingQueueSize() {
return indexingQueueSize;
}
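For illustration only (not part of this commit): a minimal sketch of how client
code might poll the new getter to watch indexing memory pressure. The method
name maybeDumpBatch and the threshold are assumptions, not Mimir API.

    // Hypothetical usage sketch: react when the estimated number of
    // occurrences held in RAM grows past an application-chosen threshold.
    void maybeDumpBatch(gate.mimir.MimirIndex index) {
      final long threshold = 10000000L; // assumed limit, tune as needed
      if(index.getOccurrencesInRam() > threshold) {
        // e.g. trigger writing the in-RAM batches to disk here
      }
    }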
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java	2014-01-22 16:06:49 UTC (rev 17243)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java	2014-01-22 17:30:47 UTC (rev 17244)
@@ -26,6 +26,7 @@
import it.unimi.di.big.mg4j.index.Index;
import it.unimi.di.big.mg4j.index.IndexReader;
import it.unimi.di.big.mg4j.index.IndexWriter;
+import it.unimi.di.big.mg4j.index.NullTermProcessor;
import it.unimi.di.big.mg4j.index.QuasiSuccinctIndex;
import it.unimi.di.big.mg4j.index.QuasiSuccinctIndexWriter;
import it.unimi.di.big.mg4j.index.SkipBitStreamIndexWriter;
@@ -63,6 +64,7 @@
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;
+import it.unimi.dsi.lang.ObjectParser;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.sux4j.mph.LcpMonotoneMinimalPerfectHashFunction;
import it.unimi.dsi.util.BloomFilter;
@@ -79,6 +81,7 @@
import java.net.URISyntaxException;
import java.nio.ByteOrder;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
@@ -225,7 +228,7 @@
if(firstDocumentPointer < 0) firstDocumentPointer = pointer;
if(lastDocumentPointer == -1) {
// this is the first document
- documentPointersDifferential.add(0);
+ documentPointersDifferential.add(0);
} else {
// close previous document
flush();
@@ -251,6 +254,10 @@
}
}
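+ /** Sets the count for the current document directly (used by the direct index code, which stores counts but no positions). */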
+ public void setCount(int count) {
+ this.count = count;
+ }
+
/**
* Checks whether the given position is valid (i.e. greater than the last
* seen positions. If the position is invalid, this means that a call to
@@ -286,8 +293,10 @@
public void write(IndexWriter indexWriter) throws IOException {
flush();
if(indexWriter instanceof QuasiSuccinctIndexWriter) {
- ((QuasiSuccinctIndexWriter)indexWriter).newInvertedList(frequency,
- occurrences, sumMaxPos);
+ ((QuasiSuccinctIndexWriter)indexWriter).newInvertedList(
+ frequency,
+ occurrences,
+ positions != null ? sumMaxPos : 0);
} else {
indexWriter.newInvertedList();
}
@@ -295,16 +304,17 @@
indexWriter.writeFrequency(frequency);
long currDocumentPointer = firstDocumentPointer;
int positionsStart = 0;
-
for(int docId = 0; docId < documentPointersDifferential.size(); docId++) {
currDocumentPointer += documentPointersDifferential.get(docId);
int currCount = counts.get(docId);
OutputBitStream obs = indexWriter.newDocumentRecord();
indexWriter.writeDocumentPointer(obs, currDocumentPointer);
indexWriter.writePositionCount(obs, currCount);
- indexWriter.writeDocumentPositions(obs, positions.elements(),
- positionsStart, currCount, -1);
- positionsStart += currCount;
+ if(positions != null){
+ indexWriter.writeDocumentPositions(obs, positions.elements(),
+ positionsStart, currCount, -1);
+ positionsStart += currCount;
+ }
}
}
@@ -461,6 +471,20 @@
}
/**
+ * Converts a long value into a String containing a zero-padded Hex
+ * representation of the input value. The lexicographic ordering of the
+ * generated strings is the same as the natural order of the corresponding
+ * non-negative long values.
+ *
+ * @param value the value to convert.
+ * @return the string representation.
+ */
+ public static final String longToTerm(long value) {
+ String valueStr = Long.toHexString(value);
+ return "0000000000000000".substring(valueStr.length()) + valueStr;
+ }
+
+ /**
* The file name (under the current directory for this atomic index) which
* stores the principal index.
*/
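As a quick aside, a self-contained (hypothetical) check of the ordering
property documented above; note it holds for non-negative values, since
Long.toHexString of a negative long yields the unsigned two's-complement form:

    // Hypothetical check, not part of the commit: zero-padding to 16 hex
    // digits makes lexicographic string order match numeric order.
    public class LongToTermCheck {
      public static void main(String[] args) {
        String nine = gate.mimir.index.AtomicIndex.longToTerm(9L);  // "0000000000000009"
        String ten = gate.mimir.index.AtomicIndex.longToTerm(10L);  // "000000000000000a"
        System.out.println(nine.compareTo(ten) < 0); // true: same order as 9 < 10
      }
    }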
@@ -561,6 +585,12 @@
*/
protected Properties additionalProperties;
+ /**
+ * A set of properties added to the ones obtained from the direct index writer
+ * when writing out batches.
+ */
+ protected Properties additionalDirectProperties;
+
protected boolean hasDirectIndex;
protected Thread indexingThread;
@@ -662,6 +692,11 @@
this.additionalProperties = new Properties();
+ if(hasDirectIndex) {
+ additionalDirectProperties = new Properties();
+ additionalDirectProperties.setProperty(Index.PropertyKeys.TERMPROCESSOR,
+ ObjectParser.toSpec(NullTermProcessor.getInstance()));
+ }
initIndex();
}
@@ -787,7 +822,8 @@
int numTermsInRAM = termMap.size();
logger.info( "Generating index for batch " + newTailName +
"; documents: " + documentsInRAM + "; terms:" + numTermsInRAM +
- "; occurrences: " + occurrencesInRAM );
+ "; occurrences: " + occurrencesInRAM +
+ " / " + parent.getOccurrencesInRam());
// We write down all terms in appearance order in termArray.
final MutableString[] termArray = termMap.keySet().toArray(new MutableString[ numTermsInRAM ]);
@@ -869,30 +905,31 @@
throw new IndexException("Error while saving tail properties", e);
}
+ if(hasDirectIndex) {
+ writeDirectIndex(newTailDir, termArray);
+ }
+ // update parent
+ parent.subtractOccurrences(occurrencesInRAM);
+
+ // clear out internal state, in preparation for the next tail
+ newBatch();
+
// merge new tail into index cluster
try {
// modify internal state
synchronized(this) {
subIndexes.add(openSubIndex(newTailName));
- indexCluster = openIndexCluster(subIndexes, termProcessor);
+ indexCluster = openIndexCluster(subIndexes, termProcessor);
+ if(hasDirectIndex) {
+ // TODO
+ // merge the new direct batch into the direct cluster
+ }
}
} catch(Exception e) {
throw new IndexException("Could not open the index just written to " +
mg4jBasename , e);
}
-
- if(hasDirectIndex) {
- //TODO
- // dump new direct tail (invert the tail just written)
- // merge new direct tail into direct index cluster
- }
- // update parent
- parent.subtractOccurrences(occurrencesInRAM);
-
- // clear out internal state, in preparation for the next tail
- newBatch();
-
// notify "listeners"
synchronized(this) {
if(tailWriteRequested != null) {
@@ -903,6 +940,151 @@
}
/**
+ * Writes the direct index data for the current batch to disk.
+ * @param batchDir the directory in which the batch files are created.
+ * @param termArray the in-RAM terms, sorted lexicographically
+ * @throws IOException if writing the index files fails.
+ * @throws IndexException if the index properties cannot be saved.
+ */
+ protected void writeDirectIndex(File batchDir, MutableString[] termArray)
+ throws IOException, IndexException {
+ // The index we are writing is a direct index, so we give it new terms
+ // which are actually document IDs, and they have posting lists containing
+ // document IDs, which are actually termIDs.
+
+ //1. invert index data in RAM
+ Object2ReferenceOpenHashMap<MutableString, PostingsList> docMap =
+ new Object2ReferenceOpenHashMap<MutableString,
+ PostingsList>(INITIAL_TERM_MAP_SIZE, Hash.FAST_LOAD_FACTOR );
+ MutableString docIdStr = new MutableString();
+ // we now read the posting lists for all the terms, in ascending term order
+ for(int termId = 0; termId < termArray.length; termId++) {
+ PostingsList termPostings = termMap.get(termArray[termId]);
+ long docPointer = termPostings.firstDocumentPointer;
+ for(int i = 0; i < termPostings.documentPointersDifferential.size(); i++) {
+ docPointer += termPostings.documentPointersDifferential.get(i);
+ int count = termPostings.counts.getInt(i);
+ // convert data to the correct type
+ docIdStr.replace(longToTerm(docPointer));
+ // at this point we have term, document, counts so we can write the data
+ // to the in-RAM direct index
+ PostingsList docPostings = docMap.get(docIdStr);
+ if(docPostings == null) {
+ docPostings = new PostingsList(false);
+ docMap.put(docIdStr.copy(), docPostings);
+ }
+ docPostings.newDocumentPointer(termId);
+ docPostings.setCount(count);
+ docPostings.flush();
+ }
+ }
+
+ // 2. write the data from RAM
+ String mg4jBasename = new File(batchDir, name + "-dir").getAbsolutePath();
+ // copy the default compression flags, and remove positions
+ Map<Component, Coding> flags = new HashMap<Component, Coding>(
+ CompressionFlags.DEFAULT_QUASI_SUCCINCT_INDEX);
+ flags.remove(Component.POSITIONS);
+ QuasiSuccinctIndexWriter directIndexWriter =
+ new QuasiSuccinctIndexWriter(
+ IOFactory.FILESYSTEM_FACTORY,
+ mg4jBasename,
+ termArray.length,
+ Fast.mostSignificantBit(QuasiSuccinctIndex.DEFAULT_QUANTUM),
+ QuasiSuccinctIndexWriter.DEFAULT_CACHE_SIZE,
+ flags,
+ ByteOrder.nativeOrder());
+
+ // sort all the docIds
+ final MutableString[] docArray = docMap.keySet().toArray(new MutableString[ docMap.size() ]);
+ // We sort the document IDs appearing in the batch and write them on disk.
+ Arrays.quickSort(0, docArray.length,
+ new IntComparator() {
+ @Override
+ public int compare(Integer one, Integer other) {
+ return compare(one.intValue(), other.intValue());
+ }
+
+ @Override
+ public int compare(int one, int other) {
+ return docArray[one].compareTo(docArray[other]);
+ }
+ },
+ new Swapper() {
+ @Override
+ public void swap(int one, int other) {
+ MutableString temp = docArray[one];
+ docArray[one] = docArray[other];
+ docArray[other] = temp;
+ }
+ });
+
+ BloomFilter<Void> docBloomFilter = BloomFilter.create(docArray.length);
+ PrintWriter pw = new PrintWriter(
+ new OutputStreamWriter(new FastBufferedOutputStream(
+ new FileOutputStream(mg4jBasename + DiskBasedIndex.TERMS_EXTENSION),
+ 64 * 1024),
+ "UTF-8" ));
+ for (MutableString t : docArray ) {
+ t.println( pw );
+ docBloomFilter.add(t);
+ }
+ pw.close();
+ generateTermMap(new File(mg4jBasename + DiskBasedIndex.TERMS_EXTENSION),
+ new File(mg4jBasename + DiskBasedIndex.TERMMAP_EXTENSION), null);
+ // write the bloom filter
+ BinIO.storeObject(docBloomFilter,
+ new File(mg4jBasename + DocumentalCluster.BLOOM_EXTENSION));
+ // write the sizes file
+ File sizesFile = new File(mg4jBasename + DiskBasedIndex.SIZES_EXTENSION);
+ OutputBitStream sizesStream = new OutputBitStream(sizesFile);
+ int maxTermSize = -1; // -1 means unknown
+ for(MutableString term : termArray) {
+ int termSize = (int)termMap.get(term).frequency;
+ sizesStream.writeGamma(termSize);
+ if(termSize > maxTermSize) maxTermSize = termSize;
+ }
+ sizesStream.close();
+
+ // write the actual index
+ int maxCount = 0;
+ long occurrences = 0;
+ for ( int i = 0; i < docArray.length; i++ ) {
+ PostingsList postingsList = docMap.get( docArray[ i ] );
+ if ( maxCount < postingsList.maxCount ) maxCount = postingsList.maxCount;
+ postingsList.write(directIndexWriter);
+ occurrences += postingsList.occurrences;
+ }
+ directIndexWriter.close();
+ // write the index properties
+ try {
+ Properties properties = directIndexWriter.properties();
+ additionalDirectProperties.setProperty( Index.PropertyKeys.SIZE,
+ directIndexWriter.writtenBits());
+ // -1 means unknown
+ additionalDirectProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE,
+ maxTermSize);
+ additionalDirectProperties.setProperty( Index.PropertyKeys.MAXCOUNT, maxCount );
+ additionalDirectProperties.setProperty( Index.PropertyKeys.OCCURRENCES,
+ occurrences);
+ properties.addAll(additionalDirectProperties);
+ Scan.saveProperties( IOFactory.FILESYSTEM_FACTORY, properties,
+ mg4jBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
+
+ // write stats
+ PrintStream statsPs = new PrintStream(new File(mg4jBasename +
+ DiskBasedIndex.STATS_EXTENSION));
+ directIndexWriter.printStats(statsPs);
+ statsPs.close();
+ } catch(ConfigurationException e) {
+ // this should never happen
+ throw new IndexException("Error while saving tail properties", e);
+ }
+
+ }
+
+
+ /**
* Combines all the currently existing sub-indexes, generating a new
* head index.
* @throws IndexException
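To make the inversion performed by writeDirectIndex above easier to follow,
here is a minimal standalone sketch of the same idea using plain JDK
collections instead of the MG4J/fastutil types. Everything here except
longToTerm is hypothetical, and positions are deliberately dropped, as in the
real code.

    // Hypothetical sketch: invert term -> (docPointer -> count) postings
    // into doc -> (termId -> count) postings. Documents become the "terms"
    // of the direct index, keyed by their zero-padded hex ID; a TreeMap
    // reproduces the sorted order the real code obtains via quickSort.
    import java.util.List;
    import java.util.Map;
    import java.util.TreeMap;

    public class DirectIndexSketch {
      // postingsPerTerm.get(termId) maps document pointer -> in-document count
      public static Map<String, Map<Integer, Integer>> invert(
          List<Map<Long, Integer>> postingsPerTerm) {
        Map<String, Map<Integer, Integer>> docMap =
            new TreeMap<String, Map<Integer, Integer>>();
        for(int termId = 0; termId < postingsPerTerm.size(); termId++) {
          for(Map.Entry<Long, Integer> e :
              postingsPerTerm.get(termId).entrySet()) {
            String docIdStr =
                gate.mimir.index.AtomicIndex.longToTerm(e.getKey());
            Map<Integer, Integer> docPostings = docMap.get(docIdStr);
            if(docPostings == null) {
              docPostings = new TreeMap<Integer, Integer>();
              docMap.put(docIdStr, docPostings);
            }
            docPostings.put(termId, e.getValue());
          }
        }
        return docMap;
      }
    }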
@@ -1320,7 +1502,6 @@
* @return the string for the given term.
*/
public String getTerm(long termId) {
- // TODO: list() is an optional operation, so this may be null
return getIndex().termMap.list().get(termId).toString();
}
Modified: mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java
===================================================================
--- mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java	2014-01-22 16:06:49 UTC (rev 17243)
+++ mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java	2014-01-22 17:30:47 UTC (rev 17244)
@@ -81,9 +81,9 @@
public class Scratch {
public static void main (String[] args) throws Exception {
-// mainIndexer5(args);
+ mainIndexer5(args);
- mainSimple(args);
+// mainSimple(args);
// mainDirectIndexes(args);
// mainBuildDirectIndex(args);
@@ -247,7 +247,7 @@
ZipFile zip = new ZipFile(args[1]);
Enumeration<? extends ZipEntry> entries = zip.entries();
- int copies = 100;
+ int copies = 10;
boolean compress = false;
ResourceData docRd =
Gate.getCreoleRegister().get(DocumentImpl.class.getName());
while(entries.hasMoreElements()) {