Revision: 17253
http://sourceforge.net/p/gate/code/17253
Author: valyt
Date: 2014-01-29 12:01:38 +0000 (Wed, 29 Jan 2014)
Log Message:
-----------
More work on supporting direct indexes:
- Direct terms are not lexicographically sorted, but are instead ordered by
occurrence time;
- this implies that direct term IDs are now different from inverted term IDs.
Not ideal, but can't be helped.
Modified Paths:
--------------
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
mimir/branches/5.0/mimir-core/src/gate/mimir/search/terms/AbstractIndexTermsQuery.java
mimir/branches/5.0/mimir-core/src/gate/mimir/search/terms/TermTypeTermsQuery.java
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
2014-01-28 17:13:53 UTC (rev 17252)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
2014-01-29 12:01:38 UTC (rev 17253)
@@ -60,7 +60,11 @@
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.longs.LongBigList;
+import it.unimi.dsi.fastutil.objects.Object2LongAVLTreeMap;
+import it.unimi.dsi.fastutil.objects.Object2LongMap;
import it.unimi.dsi.fastutil.objects.Object2ReferenceOpenHashMap;
+import it.unimi.dsi.fastutil.objects.ObjectBigArrayBigList;
+import it.unimi.dsi.fastutil.objects.ObjectBigList;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;
@@ -82,6 +86,7 @@
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
@@ -96,14 +101,20 @@
import com.google.common.io.PatternFilenameFilter;
/**
- * An indirect index associating terms with documents. Terms can be either
token
+ * An inverted index associating terms with documents. Terms can be either
token
* feature values, or semantic annotation URIs. Optionally, a direct index may
* also be present.
*
* An atomic index manages a head index (the principal data) and a set of tail
- * indexes (batches containing updates). Additionally, the date representing
+ * indexes (batches containing updates). Additionally, the data representing
* all the new documents that have been queued for indexing since the last tail
- * was written are stored in RAM.
+ * was written are stored in RAM.
+ *
+ * When direct indexing is enabled, the term IDs in the direct index are
+ * different from the term IDs in the inverted index. In the inverted index
+ * the term IDs are their position in the lexicographically sorted list of all
+ * terms. In the direct index, the term IDs are their position in the list
+ * sorted by the time they were first seen during indexing.
*/
public abstract class AtomicIndex implements Runnable {
@@ -137,6 +148,11 @@
}
+ /**
+ * A callable that does nothing. This is used to produce instances of
+ * {@link CustomFuture} which are only used to wait for the completion of an
+ * operation that returns nothing.
+ */
private static final Callable<Void> noOpVoid = new Callable<Void>() {
@Override
public Void call() throws Exception {
@@ -160,37 +176,47 @@
private long lastDocumentPointer = -1;
/**
- * The list of document pointer differentials (differences from
- * {@link #firstDocumentPointer}). For the sake of easy alignment, we
- * actually store a <tt>0</tt> on the first position.
+ * The list of document pointer differentials (differences from the
previous
+ * document pointer). The first document pointer value is stored in
+ * {@link #firstDocumentPointer}, and we store a <tt>0</tt> in the first
+ * position of this list. The actual value of a document pointer for a
given
+ * position is:
+ * <dl>
+ * <dt>pos 0</dt>
+ * <dd>{@link #firstDocumentPointer}</dd>
+ * <dt>pos i</dt>
+ * <dd>pointer at position (i-1) + documentPointersDifferential[i]</dd>
+ * </dl>
*/
private IntList documentPointersDifferential;
/**
- * The list of counts for each document. This list is aligned with
+ * The count (number of terms) for each document. This list is aligned
with
* {@link #documentPointersDifferential}.
*/
private IntList counts;
/**
- * The list of positions in this postings list. For each document at
position
- * <tt>i</i>, there will be counts[i] positions stored in this list.
+ * The list of positions in this postings list. For each document at
+ * position <tt>i</i>, there will be counts[i] positions stored in this
+ * list. This value is <code>non-null</code> only if positions are stored,
+ * which is configured through a construction-time parameter.
*/
private IntArrayList positions;
/**
- * The last seen position.
+ * The last seen position in the current document.
*/
private int lastPosition = -1;
/**
- * The number of position in the current document
+ * The number of positions in the current document
*/
private int count = 0;
/**
- * The maximum count of all the stored documents
+ * The maximum term count of all the stored documents
*/
private int maxCount = 0;
@@ -200,7 +226,7 @@
private long frequency = 0;
/**
- * The total number of occurrences stored.
+ * The total number of term occurrences in all stored documents.
*/
private long occurrences = 0;
@@ -223,7 +249,7 @@
* @param pointer
*/
public void newDocumentPointer(long pointer) {
- // is this really a new document
+ // is this really a new document?
if(pointer != lastDocumentPointer) {
if(firstDocumentPointer < 0) firstDocumentPointer = pointer;
if(lastDocumentPointer == -1) {
@@ -254,13 +280,18 @@
}
}
+ /**
+ * When storing positions, the count is automatically calculated. When not
+ * storing positions, it needs to be explicitly set by calling this method.
+ * @param count
+ */
public void setCount(int count) {
this.count = count;
}
/**
* Checks whether the given position is valid (i.e. greater than the last
- * seen positions. If the position is invalid, this means that a call to
+ * seen position. If the position is invalid, this means that a call to
* {@link #addPosition(int)} with the same value would actually be a
* no-operation.
* @param pos
@@ -286,7 +317,7 @@
}
/**
- * Writes the data contained in this postings list to a index writer
+ * Writes the data contained in this postings list to an index writer
* @param indexWriter
* @throws IOException
*/
@@ -416,7 +447,7 @@
* @param termProcessor the term processor to be used (can be null)
* @return a documental cluster view of the list of indexes provided.
*/
- protected final static Index openIndexCluster(
+ protected final static Index openInvertedIndexCluster(
List<MG4JIndex> subIndexes,
TermProcessor termProcessor){
@@ -508,6 +539,8 @@
*/
public static final String TAIL_FILE_NAME_PREFIX = "tail-";
+
+ public static final String DIRECT_TERMS_FILENAME = "direct.terms";
/**
* The file name (under the current directory for this atomic index) for the
* directory containing the documents that have been queued for indexing,
but
@@ -577,7 +610,7 @@
* The cluster-view of all the MG4J indexes that are part of this index (i.e.
* the head and all the tails).
*/
- protected Index indexCluster;
+ protected Index invertedIndex;
/**
* The direct index for this atomic index. If
@@ -598,8 +631,30 @@
*/
protected Properties additionalDirectProperties;
+ /**
+ * Is the direct indexing enabled? Direct indexes are used to find terms
+ * occurring in given documents. This is the reverse operation to the typical
+ * search, which finds documents containing a given set of terms.
+ */
protected boolean hasDirectIndex;
+ /**
+ * This map associates direct index terms with their IDs. See the note at the
+ * top-level javadocs for this class for a discussion on direct and inverted
+ * term IDs.
+ */
+ protected Object2LongMap<String> directTermIds;
+
+ /**
+ * The terms in the direct index, in the order they were first seen during
+ * indexing.
+ */
+ protected ObjectBigList<String> directTerms;
+
+ /**
+ * The single thread used to index documents. All writes to the index files
+ * are done from this thread.
+ */
protected Thread indexingThread;
/**
@@ -630,7 +685,6 @@
*/
protected long documentPointer;
-
/**
* The number of documents currently stored in RAM.
*/
@@ -686,7 +740,6 @@
boolean hasDirectIndex, TermProcessor termProcessor,
BlockingQueue<GATEDocument> inputQueue,
BlockingQueue<GATEDocument> outputQueue) throws IOException,
IndexException {
- super();
this.parent = parent;
this.name = name;
this.indexDirectory = indexDirectory;
@@ -743,8 +796,30 @@
indexDirectory.mkdirs();
}
synchronized(this) {
- indexCluster = openIndexCluster(subIndexes, termProcessor);
+ invertedIndex = openInvertedIndexCluster(subIndexes, termProcessor);
}
+ // open direct index
+ if(hasDirectIndex) {
+ directTerms = new ObjectBigArrayBigList<String>();
+ directTermIds = new Object2LongAVLTreeMap<String>();
+ directTermIds.defaultReturnValue(-1);
+ File directTermsFile = new File(indexDirectory, DIRECT_TERMS_FILENAME);
+ if(directTermsFile.exists()) {
+ FileLinesCollection fileLines = new FileLinesCollection(
+ directTermsFile.getAbsolutePath(), "UTF-8");
+ Iterator<MutableString> termsIter = fileLines.iterator();
+ long termID = 0;
+ while(termsIter.hasNext()) {
+ String term = termsIter.next().toString();
+ directTerms.add(term);
+ directTermIds.put(term, termID++);
+ }
+ }
+ synchronized(this) {
+ //TODO
+ // open direct index cluster
+ }
+ }
}
/**
@@ -791,11 +866,12 @@
}
/**
- * Writes all the data currently stored in RAM to a new tail index.
+ * Writes all the data currently stored in RAM to a new index batch.
The first
+ * batch is the head index, all other batches are tail indexes.
* @throws IOException
* @throws IndexException
*/
- protected void writeCurrentTail() throws IOException, IndexException {
+ protected void writeCurrentBatch() throws IOException, IndexException {
if(documentsInRAM == 0) return;
// find the name for the new tail
@@ -913,7 +989,7 @@
}
if(hasDirectIndex) {
- writeDirectIndex(newTailDir, termArray);
+ writeDirectIndex(newTailDir);
}
// update parent
parent.subtractOccurrences(occurrencesInRAM);
@@ -926,7 +1002,7 @@
// modify internal state
synchronized(this) {
subIndexes.add(openSubIndex(newTailName));
- indexCluster = openIndexCluster(subIndexes, termProcessor);
+ invertedIndex = openInvertedIndexCluster(subIndexes, termProcessor);
if(hasDirectIndex) {
// TODO
// merge the new direct batch into the direct cluster
@@ -953,7 +1029,7 @@
* @throws IOException
* @throws IndexException
*/
- protected void writeDirectIndex(File batchDir, MutableString[] termArray)
+ protected void writeDirectIndex(File batchDir)
throws IOException, IndexException {
// The index we are writing is a direct index, so we give it new terms
// which are actually document IDs, and they have posting lists containing
@@ -964,27 +1040,44 @@
new Object2ReferenceOpenHashMap<MutableString,
PostingsList>(INITIAL_TERM_MAP_SIZE, Hash.FAST_LOAD_FACTOR );
MutableString docIdStr = new MutableString();
- // we now read the posting lists for all the terms, in ascending term order
- for(int termId = 0; termId < termArray.length; termId++) {
- PostingsList termPostings = termMap.get(termArray[termId]);
- long docPointer = termPostings.firstDocumentPointer;
- for(int i = 0; i < termPostings.documentPointersDifferential.size();
i++) {
- docPointer += termPostings.documentPointersDifferential.get(i);
- int count = termPostings.counts.getInt(i);
- // convert data to the correct type
- docIdStr.replace(longToTerm(docPointer));
- // at this point we have term, document, counts so we can write the
data
- // to the in-RAM direct index
- PostingsList docPostings = docMap.get(docIdStr);
- if(docPostings == null) {
- docPostings = new PostingsList(false);
- docMap.put(docIdStr.copy(), docPostings);
- }
- docPostings.newDocumentPointer(termId);
- docPostings.setCount(count);
- docPostings.flush();
+ // make sure all the terms about to be indexed have a direct ID
+ for(MutableString termMS : termMap.keySet()) {
+ String termString = termMS.toString();
+ long directTermId = directTermIds.getLong(termString);
+ if(directTermId == directTermIds.defaultReturnValue()) {
+ // term not seen before
+ directTerms.add(termString);
+ directTermId = directTerms.size64() -1;
+ directTermIds.put(termString, directTermId);
}
}
+ // we now read the posting lists for all the terms, in ascending term
order
+ //for(int termId = 0; termId < termArray.length; termId++) {
+ MutableString termMS = new MutableString();
+ for(long directTermId = 0; directTermId < directTerms.size64();
directTermId++){
+ String termString = directTerms.get(directTermId);
+ termMS.replace(termString);
+ PostingsList termPostings = termMap.get(termMS);
+ if(termPostings != null) {
+ long docPointer = termPostings.firstDocumentPointer;
+ for(int i = 0; i < termPostings.documentPointersDifferential.size();
i++) {
+ docPointer += termPostings.documentPointersDifferential.get(i);
+ int count = termPostings.counts.getInt(i);
+ // convert data to the correct type
+ docIdStr.replace(longToTerm(docPointer));
+ // at this point we have term, document, counts so we can write the
data
+ // to the in-RAM direct index
+ PostingsList docPostings = docMap.get(docIdStr);
+ if(docPostings == null) {
+ docPostings = new PostingsList(false);
+ docMap.put(docIdStr.copy(), docPostings);
+ }
+ docPostings.newDocumentPointer(directTermId);
+ docPostings.setCount(count);
+ docPostings.flush();
+ }
+ }
+ }
// 2. write the data from RAM
String mg4jBasename = new File(batchDir, name + "-dir").getAbsolutePath();
@@ -996,7 +1089,7 @@
new QuasiSuccinctIndexWriter(
IOFactory.FILESYSTEM_FACTORY,
mg4jBasename,
- termArray.length,
+ directTerms.size64(),
Fast.mostSignificantBit(QuasiSuccinctIndex.DEFAULT_QUANTUM),
QuasiSuccinctIndexWriter.DEFAULT_CACHE_SIZE,
flags,
@@ -1043,11 +1136,17 @@
BinIO.storeObject(docBloomFilter,
new File(mg4jBasename + DocumentalCluster.BLOOM_EXTENSION));
// write the sizes file
+ // this is a list of document sizes (directTerms in our case)
File sizesFile = new File(mg4jBasename + DiskBasedIndex.SIZES_EXTENSION);
OutputBitStream sizesStream = new OutputBitStream(sizesFile);
int maxTermSize = -1; // -1 means unknown
- for(MutableString term : termArray) {
- int termSize = (int)termMap.get(term).frequency;
+ //for(MutableString term : termArray) {
+ for(long directTermId = 0; directTermId < directTerms.size64();
directTermId++){
+ String termString = directTerms.get(directTermId);
+ termMS.replace(termString);
+ PostingsList termPostings = termMap.get(termMS);
+ int termSize = termPostings != null ?
+ (int)termPostings.frequency : 0;
sizesStream.writeGamma(termSize);
if(termSize > maxTermSize) maxTermSize = termSize;
}
@@ -1087,7 +1186,14 @@
// this should never happen
throw new IndexException("Error while saving tail properties", e);
}
-
+ //update the index-wide direct terms file
+ pw = new PrintWriter(new OutputStreamWriter(new FastBufferedOutputStream(
+ new FileOutputStream(new File(indexDirectory, DIRECT_TERMS_FILENAME)),
+ 64 * 1024), "UTF-8" ));
+ for (String t : directTerms ) {
+ pw.println(t);
+ }
+ pw.close();
}
@@ -1140,6 +1246,10 @@
} catch(Exception e) {
throw new IndexException("Exception while combining sub-indexes", e);
}
+
+ if(hasDirectIndex()) {
+ // TODO
+ }
// update the internal state
synchronized(this) {
@@ -1151,7 +1261,7 @@
if(headDir.exists() && headDir.renameTo(headDirOld)){
if(headDirNew.renameTo(headDir)) {
subIndexes.add(0, openSubIndex(HEAD_FILE_NAME));
- indexCluster = openIndexCluster(subIndexes, termProcessor);
+ invertedIndex = openInvertedIndexCluster(subIndexes, termProcessor);
// clean-up: delete old head, used-up tails
if(!gate.util.Files.rmdir(headDirOld)) {
@@ -1278,7 +1388,7 @@
if(aDocument != GATEDocument.END_OF_QUEUE) {
if(aDocument == DUMP_BATCH) {
//dump batch was requested
- writeCurrentTail();
+ writeCurrentBatch();
} else if(aDocument == COMPRESS_INDEX) {
// compress index was requested
try {
@@ -1300,7 +1410,7 @@
}
} else {
// close down
- writeCurrentTail();
+ writeCurrentBatch();
flush();
}
if(aDocument != DUMP_BATCH && aDocument != COMPRESS_INDEX) {
@@ -1491,7 +1601,7 @@
* @return
*/
public Index getIndex() {
- return indexCluster;
+ return invertedIndex;
}
@@ -1507,7 +1617,7 @@
* representations of document IDs (as produced by {@link
#longToTerm(long)}).
* The search results is a set of "document IDs", which are
actually
* term IDs. The actual term string corresponding to the returned term IDs
can
- * be obtained by calling {@link #getTerm(long)}.
+ * be obtained by calling {@link #getDirectTerm(long)}.
*/
public Index getDirectIndex() {
return directIndex;
@@ -1523,21 +1633,21 @@
}
/**
- * Gets the term string for a given term ID. The term ID must have been
+ * Gets the term string for a given direct term ID. The term ID must have
been
* obtained from this index's direct index.
* @param termId the ID for the term being sought.
* @return the string for the given term.
*/
- public String getTerm(long termId) {
- return getIndex().termMap.list().get(termId).toString();
+ public CharSequence getDirectTerm(long termId) {
+ return directTerms.get(termId);
}
- public StringMap<? extends CharSequence> getTermMap() {
- return getIndex().termMap;
+ public ObjectBigList<? extends CharSequence> getDirectTerms() {
+ return directTerms;
}
- public LongBigList getTermOccurenceCounts() throws IOException {
+ public long getDirectTermOccurenceCount(long directTermId) throws
IOException {
//TODO: copy from IndexReaderPool
- return null;
+ return -1;
}
}
Modified:
mimir/branches/5.0/mimir-core/src/gate/mimir/search/terms/AbstractIndexTermsQuery.java
===================================================================
---
mimir/branches/5.0/mimir-core/src/gate/mimir/search/terms/AbstractIndexTermsQuery.java
2014-01-28 17:13:53 UTC (rev 17252)
+++
mimir/branches/5.0/mimir-core/src/gate/mimir/search/terms/AbstractIndexTermsQuery.java
2014-01-29 12:01:38 UTC (rev 17253)
@@ -265,7 +265,7 @@
String termString = null;
// get the term string
try {
- termString = atomicIndex.getTerm(termId);
+ termString = atomicIndex.getDirectTerm(termId).toString();
} catch(Exception e) {
System.err.println("Error reading indirect index term with ID " +
termId);
Modified:
mimir/branches/5.0/mimir-core/src/gate/mimir/search/terms/TermTypeTermsQuery.java
===================================================================
---
mimir/branches/5.0/mimir-core/src/gate/mimir/search/terms/TermTypeTermsQuery.java
2014-01-28 17:13:53 UTC (rev 17252)
+++
mimir/branches/5.0/mimir-core/src/gate/mimir/search/terms/TermTypeTermsQuery.java
2014-01-29 12:01:38 UTC (rev 17253)
@@ -186,8 +186,8 @@
}
// once we have the index reader, we scan the whole dictionary
- termId:for(long i = 0; i < atomicIndex.getTermMap().size64(); i++) {
- String termString = atomicIndex.getTerm(i);
+ termId:for(long i = 0; i < atomicIndex.getDirectTerms().size64(); i++) {
+ String termString = atomicIndex.getDirectTerm(i).toString();
// check this term should be returned
if(indexType == IndexType.ANNOTATIONS &&
!annotationHelper.isMentionUri(termString)) continue termId;
@@ -199,7 +199,7 @@
termDescriptions.add(annotationHelper.describeMention(termString));
}
if(countsEnabled) {
- long termCount = atomicIndex.getTermOccurenceCounts().get(i);
+ long termCount = atomicIndex.getDirectTermOccurenceCount(i);
if(termCount > Integer.MAX_VALUE) {
logger.warn("Term count lenght greater than 32 bit. Data was
pratially lost!");
termCounts.add(Integer.MAX_VALUE);
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
WatchGuard Dimension instantly turns raw network data into actionable
security intelligence. It gives you real-time visual feedback on key
security issues and trends. Skip the complicated setup - simply import
a virtual appliance and go from zero to informed in seconds.
http://pubads.g.doubleclick.net/gampad/clk?id=123612991&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs