Revision: 17204
          http://sourceforge.net/p/gate/code/17204
Author:   valyt
Date:     2013-12-23 16:16:04 +0000 (Mon, 23 Dec 2013)
Log Message:
-----------
- basic support for searching now implemented, for the simplest case. 
- we now also write Bloom Filters when creating new indexes, to speed up 
searches in the documental cluster.
- moved the term processor up to AtomicIndex, to allow the search 
implementation to reside at that level.

Modified Paths:
--------------
    mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
    mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java

Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java 
2013-12-21 18:23:19 UTC (rev 17203)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java 
2013-12-23 16:16:04 UTC (rev 17204)
@@ -26,7 +26,11 @@
 import it.unimi.di.big.mg4j.index.IndexWriter;
 import it.unimi.di.big.mg4j.index.QuasiSuccinctIndex;
 import it.unimi.di.big.mg4j.index.QuasiSuccinctIndexWriter;
+import it.unimi.di.big.mg4j.index.TermProcessor;
+import it.unimi.di.big.mg4j.index.Index.UriKeys;
+import it.unimi.di.big.mg4j.index.cluster.ContiguousDocumentalStrategy;
 import it.unimi.di.big.mg4j.index.cluster.DocumentalCluster;
+import it.unimi.di.big.mg4j.index.cluster.DocumentalConcatenatedCluster;
 import it.unimi.di.big.mg4j.io.IOFactory;
 import it.unimi.di.big.mg4j.tool.Scan;
 import it.unimi.dsi.big.io.FileLinesCollection;
@@ -55,7 +59,10 @@
 import java.io.OutputStreamWriter;
 import java.io.PrintStream;
 import java.io.PrintWriter;
+import java.lang.reflect.InvocationTargetException;
+import java.net.URISyntaxException;
 import java.nio.ByteOrder;
+import java.util.ArrayList;
 import java.util.List;
 import java.util.concurrent.BlockingQueue;
 
@@ -257,7 +264,20 @@
     long numberOfPostings;
     long numberOfOccurences;
     int maxCount;
-    
+    public MG4JIndex(Index index, File indexDir, 
+        BloomFilter<Void> termFilter,
+        int numberOfDocuments, int numberOfTerms, long numberOfPostings,
+        long numberOfOccurences, int maxCount) {
+      super();
+      this.index = index;
+      this.indexDir = indexDir;
+      this.termFilter = termFilter;
+      this.numberOfDocuments = numberOfDocuments;
+      this.numberOfTerms = numberOfTerms;
+      this.numberOfPostings = numberOfPostings;
+      this.numberOfOccurences = numberOfOccurences;
+      this.maxCount = maxCount;
+    }
   }
   
   /**
@@ -278,6 +298,60 @@
   }  
 
   /**
+   * Creates a documental cluster from a list of {@link MG4JIndex} values.
+   * 
+   * @param subIndexes the indexes to be combined into a cluster 
+   * @param termProcessor the term processor to be used (can be null)
+   * @return a documental cluster view of the list of indexes provided.
+   */
+  protected final static DocumentalCluster openIndexCluster(
+      List<MG4JIndex> subIndexes,
+      TermProcessor termProcessor){
+    
+    if(subIndexes == null || subIndexes.size() == 0) return null;
+    // prepare the documental cluster
+    Index[] indexes = new Index[subIndexes.size()];
+    long[] cutPoints = new long[indexes.length];
+    cutPoints[0] = 0;
+    int numberOfTerms = -1;
+    int numberOfDocuments = -1;
+    long numberOfPostings = -1;
+    long numberOfOccurences =-1;
+    int maxCount =-1;
+    int indexIdx = 0;
+    
+    for(MG4JIndex aSubIndex : subIndexes) {
+      indexes[indexIdx] = aSubIndex.index;
+      if(indexIdx < cutPoints.length - 1) {
+        cutPoints[indexIdx + 1] = aSubIndex.numberOfDocuments;
+      }
+      numberOfTerms += aSubIndex.numberOfTerms;
+      numberOfDocuments += aSubIndex.numberOfDocuments;
+      numberOfPostings += aSubIndex.numberOfPostings;
+      numberOfOccurences += aSubIndex.numberOfOccurences;
+      if(maxCount < aSubIndex.maxCount) maxCount = aSubIndex.maxCount;
+      indexIdx++;
+    }
+    return new DocumentalConcatenatedCluster(indexes,
+          new ContiguousDocumentalStrategy(cutPoints),
+          false, // flat = all component indexes have the same term list
+          null, // Bloom Filters
+          numberOfDocuments, 
+          numberOfTerms, 
+          numberOfPostings, 
+          numberOfOccurences, 
+          maxCount, 
+          null, // payload
+          true, // hasCounts 
+          true, // hasPositions, 
+          termProcessor, 
+          null, // field 
+          null, // sizes
+          null // properties
+          );
+  }  
+  
+  /**
    * The file name (under the current directory for this atomic index) which 
    * stores the principal index. 
    */
@@ -328,9 +402,13 @@
   protected String name;
   
   protected File indexDirectory;
-
   
   /**
+   * The term processor used to process the feature values being indexed.
+   */
+  protected TermProcessor termProcessor = null;
+  
+  /**
    * The size (number of terms) for the longest document indexed but not yet 
    * saved. 
    */
@@ -353,16 +431,11 @@
   protected MimirIndex parent;
   
   /**
-   * The head index for this atomic index.
+   * A list containing the head and tails of this index.
    */
-  protected MG4JIndex head;
+  protected List<MG4JIndex> subIndexes;
   
   /**
-   * The tails for this atomic index.
-   */
-  protected List<MG4JIndex> tails;
-  
-  /**
    * The cluster-view of all the MG4J indexes that are part of this index (i.e.
    * the head and all the tails). 
    */
@@ -410,7 +483,7 @@
   /**
    * The number of documents currently stored in RAM.
    */
-  protected long documentsInRAM;
+  protected int documentsInRAM;
   
   /**
    * An in-memory inverted index that gets dumped to files for each batch. 
@@ -422,6 +495,7 @@
    */
   protected IntArrayList documentSizesInRAM;
   
+  
   /**
    * Creates a new AtomicIndex
    * 
@@ -450,9 +524,26 @@
     
     this.additionalProperties = new Properties();
     
+    initIndex();
   }
 
-       
+       /**
+        * Opens the index and prepares it for indexing and searching. 
+        */
+       protected void initIndex() {
+    // open the index
+    if(indexDirectory.exists()) {
+      // opening an existing index
+      //TODO
+    } else {
+      // new index creation
+      indexDirectory.mkdirs();
+      documentPointer = 0;
+      subIndexes = new ArrayList<AtomicIndex.MG4JIndex>();
+    }
+    indexCluster = openIndexCluster(subIndexes, termProcessor);
+       }
+               
   /**
         * Gets the name of this atomic index. This is used as the file name 
for the 
         * directory storing the index files.
@@ -567,7 +658,8 @@
                 termArray[other] = temp;
               }
             });
-         // write the terms and termmap files
+         // write the terms, termmap, and bloom filter files
+    BloomFilter<Void> termFilter = BloomFilter.create(numTermsInRAM);
     PrintWriter pw = new PrintWriter( 
         new OutputStreamWriter(new FastBufferedOutputStream(
             new FileOutputStream(mg4jBasename + 
DiskBasedIndex.TERMS_EXTENSION), 
@@ -575,10 +667,14 @@
         "UTF-8" ));
     for (MutableString t : termArray ) {
       t.println( pw );
+      termFilter.add(t.toString());
     }
     pw.close();
     generateTermMap(new File(mg4jBasename + DiskBasedIndex.TERMS_EXTENSION),
         new File(mg4jBasename + DiskBasedIndex.TERMMAP_EXTENSION));
+    // write the bloom filter
+    BinIO.storeObject(termFilter, 
+        new File(mg4jBasename + DocumentalCluster.BLOOM_EXTENSION)); 
     // write the sizes file
     File sizesFile = new File(mg4jBasename + DiskBasedIndex.SIZES_EXTENSION);
     OutputBitStream sizesStream = new OutputBitStream(sizesFile);   
@@ -586,13 +682,14 @@
       sizesStream.writeGamma(docSize);
     }
     sizesStream.close();
-    
+    long postingsInRam = 0;
     // write the actual index
     int maxCount = 0;
     for ( int i = 0; i < numTermsInRAM; i++ ) {
       PostingsList postingsList = termMap.get( termArray[ i ] );
       if ( maxCount < postingsList.maxCount ) maxCount = postingsList.maxCount;
       postingsList.write(indexWriter);
+      postingsInRam += postingsList.frequency;
     }
     indexWriter.close();
     // write the index properties
@@ -619,7 +716,31 @@
       // this should never happen
       throw new IndexException("Error while saving tail properties", e);
     }
-         // merge new tail into index cluster
+         
+    // merge new tail into index cluster
+    Index newIndex = null;
+    try {
+      try{
+        newIndex = Index.getInstance(mg4jBasename + "?" +
+            UriKeys.MAPPED.name().toLowerCase() + "=1;");
+      } catch(IOException e) {
+        // memory mapping failed
+        logger.info("Memory mapping failed for index " + mg4jBasename
+                + ". Loading as file index instead");
+        // now try to just open it as an on-disk index
+        newIndex = Index.getInstance(mg4jBasename, true, true);
+      }
+    } catch(ConfigurationException | ClassNotFoundException | SecurityException
+        | InstantiationException | IllegalAccessException
+        | InvocationTargetException | NoSuchMethodException
+        | URISyntaxException e) {
+      throw new IndexException("Could not open the index just written to " +
+         mg4jBasename , e);
+    }
+    MG4JIndex newIndexData = new MG4JIndex(newIndex, newTailDir, termFilter, 
+        documentsInRAM, numTermsInRAM, postingsInRam, occurrencesInRAM, 
maxCount);
+    subIndexes.add(newIndexData);
+    indexCluster = openIndexCluster(subIndexes, termProcessor);
     
          if(hasDirectIndex) {
            // dump new direct tail (invert the tail just written)
@@ -662,16 +783,6 @@
          indexingThread = Thread.currentThread();
          GATEDocument aDocument;
          try{
-           // open the index
-           if(indexDirectory.exists()) {
-             // opening an existing index
-             //TODO
-           } else {
-             // new index creation
-             indexDirectory.mkdirs();
-             documentPointer = 0;
-           }
-           
            // start in-RAM indexing
            newBatch();
          if(inputQueue != null) {
@@ -856,4 +967,14 @@
   public File getIndexDirectory() {
     return indexDirectory;
   }
+
+  /**
+   * Gets an {@link Index} value that can be used to search this atomic index.
+   * This will normally be a {@link DocumentalCluster} view over all the 
+   * sub-indexes contained. 
+   * @return
+   */
+  public Index getIndex() {
+    return indexCluster;
+  }
 }

Modified: 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java    
2013-12-21 18:23:19 UTC (rev 17203)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java    
2013-12-23 16:16:04 UTC (rev 17204)
@@ -94,13 +94,9 @@
    */
   protected String featureName;
   
+
   
   /**
-   * The term processor used to process the feature values being indexed.
-   */
-  protected TermProcessor termProcessor;
-  
-  /**
    * @param parent
    * @param name
    * @param indexDirectory

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Rapidly troubleshoot problems before they affect your business. Most IT 
organizations don't have a clear picture of how application performance 
affects their revenue. With AppDynamics, you get 100% visibility into your 
Java,.NET, & PHP application. Start your 15-day FREE TRIAL of AppDynamics Pro!
http://pubads.g.doubleclick.net/gampad/clk?id=84349831&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
GATE-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to