mimir

valyt Thu, 12 Jul 2012 06:29:48 -0700

Revision: 15925
          http://gate.svn.sourceforge.net/gate/?rev=15925&view=rev
Author:   valyt
Date:     2012-07-12 13:29:31 +0000 (Thu, 12 Jul 2012)
Log Message:
-----------
- Factored out some utilities for MG4J indexes into a separate class (MG4JTools)
- Inversion of inverted indexes now generates files, without producing 
exceptions. Remains to be seen if those files actually form a valid index.


Modified Paths:
--------------
    
mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirDirectIndexBuilder.java
    mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirIndexBuilder.java
    mimir/trunk/mimir-core/src/gate/mimir/search/QueryEngine.java

Added Paths:
-----------
    mimir/trunk/mimir-core/src/gate/mimir/util/MG4JTools.java

Modified: mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirDirectIndexBuilder.java
===================================================================
--- mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirDirectIndexBuilder.java       2012-07-12 03:12:18 UTC (rev 15924)
+++ mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirDirectIndexBuilder.java       2012-07-12 13:29:31 UTC (rev 15925)
@@ -19,53 +19,65 @@
 import static it.unimi.dsi.io.OutputBitStream.GAMMA;
 import static it.unimi.dsi.io.OutputBitStream.MAX_PRECOMPUTED;
 import gate.Annotation;
+import gate.mimir.IndexConfig;
 import gate.mimir.index.IndexException;
 import gate.mimir.index.Indexer;
+import gate.mimir.index.mg4j.MimirIndexBuilder.PostingsList;
+import gate.mimir.util.MG4JTools;
 import gate.util.GateRuntimeException;
 
 import it.unimi.dsi.Util;
+import it.unimi.dsi.big.mg4j.index.Index;
 import it.unimi.dsi.big.mg4j.index.IndexIterator;
 import it.unimi.dsi.big.mg4j.index.IndexReader;
 import it.unimi.dsi.big.mg4j.io.ByteArrayPostingList;
 import it.unimi.dsi.big.mg4j.tool.Scan.Completeness;
 import it.unimi.dsi.bits.Fast;
+import it.unimi.dsi.logging.ProgressLogger;
 
+import java.io.File;
+import java.io.IOException;
 import java.lang.reflect.Field;
+import java.net.URI;
+import java.text.NumberFormat;
 import java.util.concurrent.BlockingQueue;
 
+import org.apache.log4j.Logger;
+
 /**
  * Class use to transpose an inverted index by building an MG4J index where 
  * terms and documents are used as reverse images of each each other. 
  */
 public class MimirDirectIndexBuilder extends MimirIndexBuilder {
-  static Field countField;
-  static {
-    try {
-      countField = ByteArrayPostingList.class.getDeclaredField("count");
-      countField.setAccessible(true);
-    } catch(Exception e) {
-      throw new AssertionError("Could not acces the " + 
-         ByteArrayPostingList.class.getName() + 
-         ".count field via reflection.");
-    }
-  }
+
+  private static Logger logger = Logger.getLogger(MimirDirectIndexBuilder.class);
   
-  /**
-   * A reader for the (inverted) index being transposed.
-   */
-  private IndexReader inputIndexReader;
+  protected String inputSubindexBasename;
   
+  protected static final String BASENAME_SUFFIX = "-dir";
+  
   /**
-   * @param inputQueue
-   * @param outputQueue
-   * @param indexer
-   * @param baseName
+   * @param indexDirectory the top level directory for the Mímir index being
+   * modified.
+   * @param subIndexName the name for the subindex being modified (e.g. 
+   * &quot;mimir-token-0&quot;).
+   * @throws IndexException 
+   * @throws IOException 
    */
-  public MimirDirectIndexBuilder(BlockingQueue<GATEDocument> inputQueue,
-                                 BlockingQueue<GATEDocument> outputQueue,
-                                 Indexer indexer, String baseName) {
-    super(inputQueue, outputQueue, indexer, baseName);
-    // TODO Auto-generated constructor stub
+  public MimirDirectIndexBuilder(File indexDirectory, String subIndexName) 
+      throws IOException, IndexException {
+    super();
+    this.indexConfig = IndexConfig.readConfigFromFile(
+      new File(indexDirectory, Indexer.INDEX_CONFIG_FILENAME), indexDirectory);
+    this.inputSubindexBasename = subIndexName;
+    this.indexBaseName = subIndexName + BASENAME_SUFFIX;
+    // create the progress logger.  We use this.getClass to use the
+    // logger belonging to a subclass rather than our own.
+    this.progressLogger = new ProgressLogger(
+            Logger.getLogger(this.getClass()), "documents");
+    closed = false;
+    closingProgress = 0;
+    savePositions = false;
   }
 
   /* (non-Javadoc)
@@ -74,8 +86,7 @@
   @Override
   protected Annotation[] getAnnotsToProcess(GATEDocument gateDocument)
     throws IndexException {
-    // TODO Auto-generated method stub
-    return null;
+    throw new UnsupportedOperationException("Not implemented.");
   }
 
   /* (non-Javadoc)
@@ -85,7 +96,7 @@
   protected void calculateStartPositionForAnnotation(Annotation ann,
                                                      GATEDocument gateDocument)
     throws IndexException {
-    // TODO Auto-generated method stub
+    throw new UnsupportedOperationException("Not implemented.");
   }
 
   /* (non-Javadoc)
@@ -93,79 +104,109 @@
    */
   @Override
   protected String[] calculateTermStringForAnnotation(Annotation ann,
-                                                      GATEDocument gateDocument)
-    throws IndexException {
-    // TODO Auto-generated method stub
-    return null;
+      GATEDocument gateDocument) throws IndexException {
+    throw new UnsupportedOperationException("Not implemented.");
   }
 
-  /**
-   * @param args
-   */
-  public static void main(String[] args) {
-    // TODO Auto-generated method stub
-  }
 
+
   /* (non-Javadoc)
    * @see gate.mimir.index.mg4j.MimirIndexBuilder#run()
    */
   @Override
   public void run() {
+    // input documentIDs become output termIDs
+    // input termIDs become output documentIDs
+    // NB: the variables in this method are named based on output semantics! 
+    
     try {
+      // open the input index for reading
+      Index inputIndex = MG4JTools.openMg4jIndex(
+        new File(new File(indexConfig.getIndexDirectory(), 
+          Indexer.MG4J_INDEX_DIRNAME), 
+          Indexer.MG4J_INDEX_BASENAME + "-" + inputSubindexBasename).toURI());
+      IndexReader inputIndexReader = inputIndex.getReader();
+      // open the output index for writing
       initIndex();
-      IndexIterator termIterator = inputIndexReader.nextIterator();
-      while(termIterator != null) {
-        // the current term
-        long termId = termIterator.termNumber();
-        // the current document
-        long docId = termIterator.nextDocument();
-        while(docId != IndexIterator.END_OF_LIST) {
-          // how many times the current term occurs in the current document 
-          int count = termIterator.count();
-          // index the data
-          ByteArrayPostingList postingsList = null; // TODO get from local in-RAM cache
-          if(postingsList == null) {
-            postingsList = new ByteArrayPostingList(null, true, Completeness.COUNTS);
-            //TODO: add to local cache
+      // we are iterating over the input terms (output 'documents')
+      NumberFormat percentNF = NumberFormat.getPercentInstance();
+      IndexIterator inputTermIterator = inputIndexReader.nextIterator();
+      long termsProcessed = 0;
+      while(inputTermIterator != null) {
+        try { // Start: process output document
+          // the current input term ID is an output document ID.
+          long outputDocId = inputTermIterator.termNumber();
+          //zero document related counters
+          tokenPosition = -1;
+          // for each input term, we iterate over its documents
+          // the current input document ID is an output term ID
+          long outputTermId = inputTermIterator.nextDocument();
+          while(outputTermId != IndexIterator.END_OF_LIST && outputTermId != -1) {
+            tokenPosition ++;
+            currentTerm.replace(longToTerm(outputTermId));
+            // how many times the current term occurs in the current document 
+            int count = inputTermIterator.count();
+            //check if we have seen this mention before
+            PostingsList termPostings = termMap.get(currentTerm);
+            if(termPostings == null){
+              //new term -> create a new postings list.
+              termMap.put(currentTerm.copy(), termPostings = new PostingsList(
+                  new byte[ 32 ], true, Completeness.COUNTS));
+            }
+            termPostings.setDocumentPointer(outputDocId);
+            termPostings.setCount(count);
+            occurrencesInTheCurrentBatch++;
+            if(termPostings.outOfMemoryError) {
+              // we are running out of memory, dump batches ASAP to free it up.
+              indexer.getMg4jIndexer().dumpASAP();
+            }
+            // and move to the next output term (input document)
+            outputTermId = inputTermIterator.nextDocument();
+          }          
+        } finally {
+          //write the size of the current document to the sizes stream
+          try {
+            sizesStream.writeGamma(tokenPosition + 1);
+          } catch(IOException e) {
+            throw new IndexException(e);
+          } finally {
+            if(tokenPosition > maxTermPositionInBatch) {
+              maxTermPositionInBatch = tokenPosition;
+            }
+            //increment doc pointer for next doc
+            documentPointer++;
+            progressLogger.update();
           }
-          postingsList.setDocumentPointer(termId);
-          // this is horrible, but can't be avoided due to the BAPL class
-          // keeping lots of things very private.
-          countField.setInt(postingsList, count);
-          // TODO
+        } // End: process output document
+        
+        //dump batch if needed
+        int percAvailableMemory = Util.percAvailableMemory();
+        if(percAvailableMemory < MIN_AVAILABLE_MEMORY) {
+          dumpBatchASAP = true;
+          indexer.getMg4jIndexer().dumpASAP();
         }
+        if ( // we have been asked to dump 
+             ( dumpBatchASAP || 
+               //.. OR we reached the maximum document limit for a batch       
+               documentPointer == MG4JIndexer.DOCUMENTS_PER_BATCH ) &&
+             // AND there is data to dump
+             occurrencesInTheCurrentBatch > 0 ){
+          dumpBatch();
+          //now get ready for the next batch
+          currentBatch++;
+          initBatch();
+        }
         
-        termIterator = inputIndexReader.nextIterator();
+        // and move to the next input term (output 'document')
+        inputTermIterator = inputIndexReader.nextIterator();
+        termsProcessed++;
+        if(termsProcessed % 1000 == 0) {
+          logger.debug("Processed " + 
+              percentNF.format((double)termsProcessed / inputIndex.numberOfTerms) + 
+              " terms");  
+        }
+        
       }
-
-//      while(true){
-//        try {
-//          processDocument(aDocument);
-//        } catch(Throwable e) {
-//          logger.error("Problem while indexing document!", e);
-//        }
-//        //dump batch if needed
-//        int percAvailableMemory = Util.percAvailableMemory();
-//        if(percAvailableMemory < MIN_AVAILABLE_MEMORY) {
-//          dumpBatchASAP = true;
-//          indexer.getMg4jIndexer().dumpASAP();
-//        }
-//        if (
-//               // we have been asked to dump 
-//             ( dumpBatchASAP || 
-//               //.. OR we reached the maximum document limit for a batch     
  
-//               documentPointer == MG4JIndexer.DOCUMENTS_PER_BATCH
-//             ) &&
-//             // AND there is data to dump
-//             occurrencesInTheCurrentBatch > 0
-//           ){
-//          dumpBatch();
-//          //now get ready for the next batch
-//          currentBatch++;
-//          initBatch();
-//        }
-//        outputQueue.put(aDocument);
-//      }
       
       // dump the last current batch
       flush();
@@ -177,5 +218,18 @@
     }
   }
   
+  /**
+   * Converts a long value into a String containing a zero-padded Hex 
+   * representation of the input value. The lexicographic ordering of the 
+   * generated strings is the same as the natural order of the corresponding
+   * long values.
+   *  
+   * @param value the value to convert.
+   * @return the string representation.
+   */
+  public static final String longToTerm(long value) {
+    String valueStr = Long.toHexString(value);
+    return "0000000000000000".substring(valueStr.length()) + valueStr;
+  }
   
 }

Modified: mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirIndexBuilder.java
===================================================================
--- mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirIndexBuilder.java     2012-07-12 03:12:18 UTC (rev 15924)
+++ mimir/trunk/mimir-core/src/gate/mimir/index/mg4j/MimirIndexBuilder.java     2012-07-12 13:29:31 UTC (rev 15925)
@@ -66,6 +66,7 @@
 import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.io.PrintWriter;
+import java.lang.reflect.Field;
 import java.lang.reflect.InvocationTargetException;
 import java.net.URISyntaxException;
 import java.util.EnumMap;
@@ -108,9 +109,35 @@
    * addPosition(int), if the position provided is the same as the previous 
one. 
    */
   protected static class PostingsList extends ByteArrayPostingList{
-    public PostingsList(byte[] a, boolean differential) {
-      super(a, differential, Completeness.POSITIONS);
+    
+    private static final Field countField;
+    
+    static {
+      // Hack to access the normally private count field inside the 
+      // ByteArrayPostingList class.
+      try {
+        countField = ByteArrayPostingList.class.getDeclaredField("count");
+        countField.setAccessible(true);
+      } catch(Exception e) {
+        throw new AssertionError("Could not acces the " + 
+           ByteArrayPostingList.class.getName() + 
+           ".count field via reflection.");
+      }
     }
+    
+    public PostingsList(byte[] a, boolean differential,
+                        Completeness completeness) {
+      super(a, differential, completeness);
+      // TODO Auto-generated constructor stub
+    }
+    
+    public void setCount(int count) {
+      try {
+        countField.setInt(this, count);
+      } catch(Exception e) {
+        new RuntimeException("Could not set counts by reflection.");
+      }
+    }
 
     /**
      * The last seen position.
@@ -177,13 +204,14 @@
   /**
    * Flag showing whether the indexer is closed. 
    */
-  private boolean closed = false;
+  protected boolean closed = false;
   
+  protected boolean savePositions = true;
   /**
    * A value between 0 and 1 representing the progress of the current index 
    * closing operation. 
    */
-  private volatile double closingProgress = 0.0;
+  protected volatile double closingProgress = 0.0;
   
   /**
    * The index configuration.
@@ -308,6 +336,19 @@
    */
   protected String indexBaseName;
   
+  /**
+   * Protected no-op constructor. 
+   * Allows sub-classes to initialise the internal state according to their 
+   * own requirements.
+   */
+  protected MimirIndexBuilder() {  
+    // create the progress logger.  We use this.getClass to use the
+    // logger belonging to a subclass rather than our own.
+    this.progressLogger = new ProgressLogger(
+            Logger.getLogger(this.getClass()), "documents");
+    closed = false;
+    closingProgress = 0;    
+  }
   
   public MimirIndexBuilder(BlockingQueue<GATEDocument> inputQueue,
           BlockingQueue<GATEDocument> outputQueue,
@@ -530,7 +571,8 @@
     if(termPostings == null){
       //new term -> create a new postings list.
       termMap.put( currentTerm.copy(), 
-              termPostings = new PostingsList( new byte[ 32 ], true));
+              termPostings = new PostingsList( new byte[ 32 ], true, 
+                  Completeness.POINTERS));
     }
     //add the current posting to the current postings list
     termPostings.setDocumentPointer(documentPointer);
@@ -612,9 +654,10 @@
       
       final OutputBitStream offsetsStream = new OutputBitStream(
               getBatchFile(DiskBasedIndex.OFFSETS_EXTENSION));
-  
-      final OutputBitStream posLengthsStream = new OutputBitStream(
-              getBatchFile(DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION));
+      final OutputBitStream posLengthsStream = savePositions ?
+          new OutputBitStream(
+            getBatchFile(DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION)) :
+          null;
 
       
       ByteArrayPostingList postingsList;
@@ -647,7 +690,7 @@
         frequenciesStream.writeLongGamma( frequency );
         globCountsStream.writeLongGamma( postingsList.globCount );
         offsetsStream.writeLongGamma( indexStream.writtenBits() - prevOffset );
-        posLengthsStream.writeLongGamma( postingsList.posNumBits );
+        if(savePositions) posLengthsStream.writeLongGamma( postingsList.posNumBits );
         prevOffset = indexStream.writtenBits();
       }
   
@@ -666,7 +709,7 @@
       properties.addProperty( Index.PropertyKeys.CODING, "FREQUENCIES:GAMMA" );
       properties.addProperty( Index.PropertyKeys.CODING, "POINTERS:DELTA" );
       properties.addProperty( Index.PropertyKeys.CODING, "COUNTS:GAMMA" );
-      properties.addProperty( Index.PropertyKeys.CODING, "POSITIONS:DELTA" );
+      if(savePositions) properties.addProperty( Index.PropertyKeys.CODING, "POSITIONS:DELTA" );
       properties.setProperty( Index.PropertyKeys.TERMPROCESSOR, 
               termProcessor == null ? 
               NullTermProcessor.class.getName() :
@@ -679,7 +722,7 @@
       indexStream.close();
       offsetsStream.close();
       globCountsStream.close();
-      posLengthsStream.close();
+      if(savePositions) posLengthsStream.close();
       frequenciesStream.close();
       termMap.clear();
       termMap.trim( INITIAL_TERM_MAP_SIZE );
@@ -1061,10 +1104,19 @@
           NoSuchMethodException{
     if(inputBasenames.length <= MAXIMUM_BATCHES_TO_COMBINE){
       //simple combine
+      
+      Map<Component,Coding> codingFlags;
+      if(savePositions) {
+        codingFlags = CompressionFlags.DEFAULT_STANDARD_INDEX; 
+      } else {
+        codingFlags = new EnumMap<Component, Coding>(CompressionFlags.DEFAULT_STANDARD_INDEX);
+        codingFlags.remove(Component.POSITIONS);
+      }
+      
       new Concatenate(outputBaseName,
               inputBasenames, false, 
               Combine.DEFAULT_BUFFER_SIZE, 
-              CompressionFlags.DEFAULT_STANDARD_INDEX,
+              codingFlags,
               false, true, 
               // BitStreamIndex.DEFAULT_QUANTUM,
               // replaced with optimised automatic calculation

Modified: mimir/trunk/mimir-core/src/gate/mimir/search/QueryEngine.java
===================================================================
--- mimir/trunk/mimir-core/src/gate/mimir/search/QueryEngine.java       2012-07-12 03:12:18 UTC (rev 15924)
+++ mimir/trunk/mimir-core/src/gate/mimir/search/QueryEngine.java       2012-07-12 13:29:31 UTC (rev 15925)
@@ -23,7 +23,6 @@
 import gate.mimir.index.IndexException;
 import gate.mimir.index.Indexer;
 import gate.mimir.index.mg4j.MentionsIndexBuilder;
-import gate.mimir.index.mg4j.MimirIndexBuilder;
 import gate.mimir.index.mg4j.TokenIndexBuilder;
 import gate.mimir.index.mg4j.zipcollection.DocumentCollection;
 import gate.mimir.index.mg4j.zipcollection.DocumentData;
@@ -33,15 +32,12 @@
 import gate.mimir.search.query.parser.ParseException;
 import gate.mimir.search.query.parser.QueryParser;
 import gate.mimir.search.score.MimirScorer;
+import gate.mimir.util.MG4JTools;
 import it.unimi.dsi.fastutil.ints.Int2ObjectLinkedOpenHashMap;
 import it.unimi.dsi.fastutil.ints.IntBigList;
 import it.unimi.dsi.fastutil.ints.IntList;
-import it.unimi.dsi.fastutil.io.BinIO;
 import it.unimi.dsi.fastutil.longs.Long2ObjectLinkedOpenHashMap;
-import it.unimi.dsi.util.Properties;
-import it.unimi.dsi.big.mg4j.index.DiskBasedIndex;
 import it.unimi.dsi.big.mg4j.index.Index;
-import it.unimi.dsi.big.mg4j.index.Index.UriKeys;
 
 import java.io.BufferedInputStream;
 import java.io.BufferedOutputStream;
@@ -59,10 +55,7 @@
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
 import java.util.List;
-import java.util.Map;
 import java.util.Set;
 import java.util.SortedSet;
 import java.util.Timer;
@@ -167,7 +160,7 @@
    * The maximum size of an index that can be loaded in memory (by default 64
    * MB).
    */
-  protected static final long MAX_IN_MEMORY_INDEX = 64 * 1024 * 1024;
+  public static final long MAX_IN_MEMORY_INDEX = 64 * 1024 * 1024;
   
   /**
    * The default value for the document block size.
@@ -799,51 +792,8 @@
   IllegalAccessException, InvocationTargetException,
   NoSuchMethodException {
     // see if the index needs upgrading
-    upgradeIndex(indexUri);
-    // calculate the flags
-    Index theIndex;
-    try {
-      // Optimisations: if the index size (i.e. index + positions files) is
-      // less than 64MB, then we load the index in memory,
-      // otherwise we memory-map it.
-      long size = 0;
-      File aFile =
-        new File(URI.create(indexUri.toString()
-                + DiskBasedIndex.INDEX_EXTENSION));
-      if(aFile.exists()) {
-        size += aFile.length();
-      } else {
-        // no index file!
-        throw new IllegalArgumentException(
-                "Could not locate the index file at " + aFile.getAbsolutePath()
-                + "!");
-      }
-      aFile =
-        new File(URI.create(indexUri.toString()
-                + DiskBasedIndex.POSITIONS_EXTENSION));
-      if(aFile.exists()) {
-        size += aFile.length();
-      } else {
-        // no index file!
-        throw new IllegalArgumentException(
-                "Could not locate the index file at " + aFile.getAbsolutePath()
-                + "!");
-      }
-      String options = "?" + (size <= MAX_IN_MEMORY_INDEX ? 
-          UriKeys.INMEMORY.toString().toLowerCase() + "=1" : 
-          (UriKeys.MAPPED.name().toLowerCase() + "=1;" + 
-           UriKeys.OFFSETSTEP.toString().toLowerCase() + "=-" + 
-           DiskBasedIndex.DEFAULT_OFFSET_STEP ));
-      
-      logger.debug("Opening index: " + indexUri.toString() + options);
-      theIndex = Index.getInstance(indexUri.toString() + options, true, true);
-    } catch(IOException e) {
-      // memory mapping failed
-      logger.info("Memory mapping failed for index " + indexUri
-              + ". Loading as file index instead");
-      // now try to just open it as an on-disk index
-      theIndex = Index.getInstance(indexUri.toString(), true, true);
-    }
+    MG4JTools.upgradeIndex(indexUri);
+    Index theIndex = MG4JTools.openMg4jIndex(indexUri);
     return new IndexReaderPool(theIndex);
   }
   
@@ -855,65 +805,12 @@
    * @throws IOException
    * @throws ClassNotFoundException
    * @throws ConfigurationException 
+   * @deprecated Use {@link MG4JTools#upgradeIndex(URI)} instead
    */
   public static void upgradeIndex(URI indexUri) throws IOException, 
       ClassNotFoundException, ConfigurationException {
-    // check if the term map is 32 bits, and convert if needed.
-    File termMapFile = new File(URI.create(indexUri.toString()
-          + DiskBasedIndex.TERMMAP_EXTENSION));
-    Object termmap = BinIO.loadObject(termMapFile);
-    if(termmap instanceof it.unimi.dsi.util.StringMap) {
-      // 32 bit index: save the old termmap
-      logger.warn("Old index format detected (32 bits term map file); " +
-          "converting to new version. Old files will be backed up with " +
-          "a .32bit extension.");
-      if(termMapFile.renameTo(new File(URI.create(indexUri.toString()
-          + DiskBasedIndex.TERMMAP_EXTENSION + ".32bit")))) {
-        // and generate the new one
-        File termsFile = new File(URI.create(indexUri.toString()
-          + DiskBasedIndex.TERMS_EXTENSION));
-        MimirIndexBuilder.generateTermMap(termsFile, termMapFile);
-      } else {
-        throw new IOException("Could not rename old termmap file (" + 
-            termMapFile.getAbsolutePath() + ").");
+        MG4JTools.upgradeIndex(indexUri);
       }
-    }
-    // check if the .properties file contains any mg4j-standard classes,
-    // and replace all mentions with the equivalent mg4j-big ones
-    File propsFile = new File(URI.create(indexUri.toString()
-      + DiskBasedIndex.PROPERTIES_EXTENSION));
-    Properties indexProps = new Properties(propsFile);
-    indexProps.setAutoSave(false);
-    Iterator<String> keysIter = indexProps.getKeys();
-    String OLDPKG = "it.unimi.dsi.mg4j";
-    String NEWPKG = "it.unimi.dsi.big.mg4j";
-    Map<String, String> newVals = new LinkedHashMap<String, String>();
-    while(keysIter.hasNext()) {
-      String key = keysIter.next();
-      Object value = indexProps.getProperty(key);
-      if(value instanceof String && ((String)value).indexOf(OLDPKG) >= 0) {
-        newVals.put(key, ((String)value).replace(OLDPKG, NEWPKG));
-      }
-    }
-    if(newVals.size() > 0) {
-      // save a backup
-      logger.warn("Old index format detected (32 bits properties file); " +
-          "converting to new version. Old files will be backed up with " +
-          "a .32bit extension.");
-      if(propsFile.renameTo(new File(URI.create(indexUri.toString()
-        + DiskBasedIndex.PROPERTIES_EXTENSION + ".32bit")))) {
-        // update the properties values
-        for(Map.Entry<String, String> newEntry : newVals.entrySet()) {
-          indexProps.setProperty(newEntry.getKey(), newEntry.getValue());
-        }
-        // save the changed props
-        indexProps.save();
-      } else {
-        throw new IOException("Could not rename old properties file (" + 
-            propsFile.getAbsolutePath() + ").");          
-      }
-    }
-  }
   
   /**
    * Marks a given document (identified by its ID) as deleted. Deleted 
documents

Added: mimir/trunk/mimir-core/src/gate/mimir/util/MG4JTools.java
===================================================================
--- mimir/trunk/mimir-core/src/gate/mimir/util/MG4JTools.java   (rev 0)
+++ mimir/trunk/mimir-core/src/gate/mimir/util/MG4JTools.java   2012-07-12 13:29:31 UTC (rev 15925)
@@ -0,0 +1,179 @@
+/*
+ *  MG4JTools.java
+ *
+ *  Copyright (c) 2007-2012, The University of Sheffield.
+ *
+ *  This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html), 
+ *  and is free software, licenced under the GNU Lesser General Public License,
+ *  Version 3, June 2007 (also included with this distribution as file
+ *  LICENCE-LGPL3.html).
+ *  
+ *  Valentin Tablan, 12 Jul 2012
+ *
+ *  $Id$
+ */
+package gate.mimir.util;
+
+import gate.mimir.index.mg4j.MimirIndexBuilder;
+import gate.mimir.search.QueryEngine;
+import it.unimi.dsi.big.mg4j.index.DiskBasedIndex;
+import it.unimi.dsi.big.mg4j.index.Index;
+import it.unimi.dsi.big.mg4j.index.Index.UriKeys;
+import it.unimi.dsi.fastutil.io.BinIO;
+import it.unimi.dsi.util.Properties;
+
+import java.io.File;
+import java.io.IOException;
+import java.lang.reflect.InvocationTargetException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+import org.apache.commons.configuration.ConfigurationException;
+import org.apache.log4j.Logger;
+
+/**
+ * Class providing utility methods for working with MG4J indexes. 
+ */
+public class MG4JTools {
+
+  protected static final Logger logger = Logger.getLogger(MG4JTools.class);
+  
+  /**
+   * Given a index URI (a file URI denoting the index base name for all the 
+   * index files), this method checks if the index is an older version, and 
+   * upgrades it to the current version, making sure it can be opened. 
+   * @param indexUri
+   * @throws IOException
+   * @throws ClassNotFoundException
+   * @throws ConfigurationException 
+   */
+  public static void upgradeIndex(URI indexUri) throws IOException, 
+      ClassNotFoundException, ConfigurationException {
+    // check if the term map is 32 bits, and convert if needed.
+    File termMapFile = new File(URI.create(indexUri.toString()
+          + DiskBasedIndex.TERMMAP_EXTENSION));
+    Object termmap = BinIO.loadObject(termMapFile);
+    if(termmap instanceof it.unimi.dsi.util.StringMap) {
+      // 32 bit index: save the old termmap
+      logger.warn("Old index format detected (32 bits term map file); " +
+          "converting to new version. Old files will be backed up with " +
+          "a .32bit extension.");
+      if(termMapFile.renameTo(new File(URI.create(indexUri.toString()
+          + DiskBasedIndex.TERMMAP_EXTENSION + ".32bit")))) {
+        // and generate the new one
+        File termsFile = new File(URI.create(indexUri.toString()
+          + DiskBasedIndex.TERMS_EXTENSION));
+        MimirIndexBuilder.generateTermMap(termsFile, termMapFile);
+      } else {
+        throw new IOException("Could not rename old termmap file (" + 
+            termMapFile.getAbsolutePath() + ").");
+      }
+    }
+    // check if the .properties file contains any mg4j-standard classes,
+    // and replace all mentions with the equivalent mg4j-big ones
+    File propsFile = new File(URI.create(indexUri.toString()
+      + DiskBasedIndex.PROPERTIES_EXTENSION));
+    Properties indexProps = new Properties(propsFile);
+    indexProps.setAutoSave(false);
+    Iterator<String> keysIter = indexProps.getKeys();
+    String OLDPKG = "it.unimi.dsi.mg4j";
+    String NEWPKG = "it.unimi.dsi.big.mg4j";
+    Map<String, String> newVals = new LinkedHashMap<String, String>();
+    while(keysIter.hasNext()) {
+      String key = keysIter.next();
+      Object value = indexProps.getProperty(key);
+      if(value instanceof String && ((String)value).indexOf(OLDPKG) >= 0) {
+        newVals.put(key, ((String)value).replace(OLDPKG, NEWPKG));
+      }
+    }
+    if(newVals.size() > 0) {
+      // save a backup
+      logger.warn("Old index format detected (32 bits properties file); " +
+          "converting to new version. Old files will be backed up with " +
+          "a .32bit extension.");
+      if(propsFile.renameTo(new File(URI.create(indexUri.toString()
+        + DiskBasedIndex.PROPERTIES_EXTENSION + ".32bit")))) {
+        // update the properties values
+        for(Map.Entry<String, String> newEntry : newVals.entrySet()) {
+          indexProps.setProperty(newEntry.getKey(), newEntry.getValue());
+        }
+        // save the changed props
+        indexProps.save();
+      } else {
+        throw new IOException("Could not rename old properties file (" + 
+            propsFile.getAbsolutePath() + ").");          
+      }
+    }
+  }
+
+  /**
+   * Opens one MG4J index.
+   * 
+   * @param indexUri a URI denoting the basename for the index (a file path 
+   * with the correct basename, but no extension). 
+   * 
+   * @return the MG4J {@link Index} object.
+   * @throws ConfigurationException
+   * @throws SecurityException
+   * @throws IOException
+   * @throws URISyntaxException
+   * @throws ClassNotFoundException
+   * @throws InstantiationException
+   * @throws IllegalAccessException
+   * @throws InvocationTargetException
+   * @throws NoSuchMethodException
+   */
+  public static Index openMg4jIndex(URI indexUri) 
+      throws ConfigurationException, SecurityException, IOException, 
+      URISyntaxException, ClassNotFoundException, InstantiationException, 
+      IllegalAccessException, InvocationTargetException, NoSuchMethodException {
+    Index theIndex = null;
+    String basename = indexUri.toString();
+    try {
+      // Optimisations: if the index size (i.e. index + positions files) is
+      // less than 64MB, then we load the index in memory,
+      // otherwise we memory-map it.
+      long size = 0;
+      File aFile =
+        new File(URI.create(basename + DiskBasedIndex.INDEX_EXTENSION));
+      if(aFile.exists()) {
+        size += aFile.length();
+      } else {
+        // no index file!
+        throw new IllegalArgumentException(
+                "Could not locate the index file at " + aFile.getAbsolutePath()
+                + "!");
+      }
+      aFile =
+        new File(URI.create(basename + DiskBasedIndex.POSITIONS_EXTENSION));
+      if(aFile.exists()) {
+        size += aFile.length();
+      } else {
+        // no index file!
+        throw new IllegalArgumentException(
+                "Could not locate the index file at " + aFile.getAbsolutePath()
+                + "!");
+      }
+      String options = "?" + (size <= QueryEngine.MAX_IN_MEMORY_INDEX ? 
+          UriKeys.INMEMORY.toString().toLowerCase() + "=1" : 
+          (UriKeys.MAPPED.name().toLowerCase() + "=1;" + 
+           UriKeys.OFFSETSTEP.toString().toLowerCase() + "=-" + 
+           DiskBasedIndex.DEFAULT_OFFSET_STEP ));
+      
+      logger.debug("Opening index: " + basename + options);
+      theIndex = Index.getInstance(basename + options, true, true);
+    } catch(IOException e) {
+      // memory mapping failed
+      logger.info("Memory mapping failed for index " + basename
+              + ". Loading as file index instead");
+      // now try to just open it as an on-disk index
+      theIndex = Index.getInstance(basename, true, true);
+    }
+    return theIndex;
+  }
+  
+  
+}


Property changes on: mimir/trunk/mimir-core/src/gate/mimir/util/MG4JTools.java
___________________________________________________________________
Added: svn:mime-type
   + text/plain
Added: svn:keywords
   + Id
Added: svn:eol-style
   + native

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and 
threat landscape has changed and how IT managers can respond. Discussions 
will include endpoint security, mobile security and the latest in malware 
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

[gate-cvs] SF.net SVN: gate:[15925] mimir/trunk/mimir-core/src/gate/mimir

Reply via email to