mimir

valyt Thu, 20 Feb 2014 07:01:14 -0800

Revision: 17368
          http://sourceforge.net/p/gate/code/17368
Author:   valyt
Date:     2014-02-20 15:00:45 +0000 (Thu, 20 Feb 2014)
Log Message:
-----------
Javadocs for the newly written APIs.


Modified Paths:
--------------
    mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
    mimir/branches/5.0/mimir-core/src/gate/mimir/index/DocumentCollection.java

Added Paths:
-----------
    mimir/branches/5.0/mimir-core/src/gate/mimir/package-info.java

Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java        
2014-02-20 14:52:32 UTC (rev 17367)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java        
2014-02-20 15:00:45 UTC (rev 17368)
@@ -16,6 +16,7 @@
 
 import gate.Document;
 import gate.Gate;
+import gate.creole.AnalyserRunningStrategy;
 import gate.mimir.IndexConfig.SemanticIndexerConfig;
 import gate.mimir.IndexConfig.TokenIndexerConfig;
 import gate.mimir.index.AtomicAnnotationIndex;
@@ -27,6 +28,7 @@
 import gate.mimir.index.IndexException;
 import gate.mimir.search.QueryEngine;
 import gate.util.GateRuntimeException;
+import it.unimi.di.big.mg4j.index.cluster.IndexCluster;
 
 import java.io.BufferedInputStream;
 import java.io.BufferedOutputStream;
@@ -58,17 +60,56 @@
 import org.apache.log4j.Logger;
 
 /**
- * A Mímir index which can index document and answer queries. This class is the
+ * <p>
+ * A Mímir index which can index documents and answer queries. This class is 
the
  * main entry point to the Mímir API.
- * 
+ * </p>
  * A Mímir index is a compound index comprising the following data elements:
  * <ul>
- *   <li>A datastore used by the semantic annotation helpers</li>
- *   <li>one or more indirect sub-indexes</li>
- *   <li>one or more direct sub-indexes (optional)</li>
- *   <li>the zip collection containing the document textual content and 
- *   metadata</li>
+ * <li>one or more sub-indexes (implemented by classes that extend
+ * {@link AtomicIndex}).</li>
+ * <li>a document collection containing the document textual content and
+ * metadata</li>
  * </ul>
+ * <p>
+ * Each sub-index indexes either a certain feature of token annotations 
+ * ({@link AtomicTokenIndex}) or one or more annotation types 
+ * ({@link AtomicAnnotationIndex}).
+ * </p>
+ * <p>
+ * A Mímir index is continually accepting documents to be indexed (through calls
+ * to {@link #indexDocument(Document)}) and can answer queries through the
+ * {@link QueryEngine} instance returned by {@link #getQueryEngine()}.
+ * </p>
+ * <p>
+ * Documents submitted for indexing are initially accumulated in RAM, during
+ * which time they are not available for being searched. After documents in RAM
+ * are written to disk (a <em>sync-to-disk</em> operation), they become
+ * searchable. In-RAM documents are synced to disk after a certain amount of
+ * data has been accumulated (see {@link #setOccurrencesPerBatch(long)}) and
+ * also at regular time intervals (see {@link #setTimeBetweenBatches(int)}).
+ * </p>
+ * <p>
+ * Client code can request a <em>sync to disk</em> operation by calling
+ * {@link #requestSyncToDisk()}.
+ * </p>
+ * <p>
+ * Every sync-to-disk operation causes a new index <em>batch</em> to be created.
+ * All the batches are merged into an {@link IndexCluster} which is then used to
+ * serve queries. If the number of batches gets too large, it can harm
+ * efficiency or the system can run into problems due to too large a number of
+ * files being open. To avoid this, the index batches can be <em>compacted</em>
+ * into a single batch. The index will automatically do that once the number of
+ * batches exceeds {@link IndexConfig#setMaximumBatches(int)}.
+ * </p>
+ * <p>
+ * Client code can request a compact operation by calling
+ * {@link #requestCompactIndex()}.
+ * </p>
+ * <p>
+ * In order to keep its consistency, a Mímir index <strong>must</strong> be
+ * closed orderly by calling {@link #close()} before the JVM is shut down.
+ * </p>
  */
 public class MimirIndex {
   
@@ -410,7 +451,8 @@
   protected QueryEngine queryEngine;
   
   /**
-   * Create a new Index.
+   * Creates a new Mímir index.
+   * 
    * @param indexConfig the configuration for the index.
    * @throws IOException 
    * @throws IndexException 
@@ -423,7 +465,7 @@
   }
   
   /**
-   * Open and existing Index.
+   * Opens an existing Mímir index.
    * @param indexDirectory the on-disk directory containing the index to be 
    * opened.
    * @throws IndexException if the index cannot be opened
@@ -542,7 +584,11 @@
   }
   
   /**
-   * Queues a new document for indexing.
+   * Queues a new document for indexing. The document will first go into the
+   * indexing queue, from where the various sub-indexes take their input. Once
+   * processed, the document data is stored in RAM until a sync-to-disk 
+   * operation occurs. Only after that does the document become searchable. 
+   * 
    * @param document the document to be indexed.
    * @throws InterruptedException if the process of posting the new document
    * to all the input queues is interrupted.
@@ -595,13 +641,16 @@
   }
   
   /**
-   * Asks this index to compact all its batches into a single index. This 
-   * reduces the number of open file handles required. The work happens in 
several 
-   * background threads (one for each sub-index) at the earliest opportunity.
-   * @return a list of futures that can be used to find out when the operation
-   * has completed.
-   * @throws InterruptedException if the current thread has been interrupted
-   * while trying to queue the compaction request.
+   * Asks each of the sub-indexes in this index to compact all their batches
+   * into a single index. This reduces the number of open file handles 
required.
+   * The work happens in several background threads (one for each sub-index) at
+   * the earliest opportunity.
+   * 
+   * @return a list of futures (one for each sub-index) that can be used to 
find
+   *         out when the operation has completed.
+   * @throws InterruptedException
+   *           if the current thread has been interrupted while trying to queue
+   *           the compaction request.
    */
   public List<Future<Void>> requestCompactIndex() throws InterruptedException {
     List<Future<Void>> futures = new ArrayList<Future<Void>>();
@@ -613,6 +662,17 @@
     return futures;
   }
   
+  /**
+   * Requests that the {@link DocumentCollection} contained by this index is 
+   * compacted. This method blocks until the compaction has completed.
+   * 
+   * In normal operation, the index maintains the collection, which includes 
+   * regular compactions, so there should be no reason to call this method.
+   * 
+   * @throws ZipException
+   * @throws IOException
+   * @throws IndexException
+   */
   public void compactDocumentCollection() throws ZipException, IOException, 
IndexException {
     documentCollection.compact();
   }
@@ -630,7 +690,11 @@
   /**
    * Stops this index from accepting any further document for indexing, stops
    * this index from accepting any more queries, finishes indexing all the 
-   * currently queued documents, writes all the files to disk, and returns.
+   * currently queued documents, writes all the files to disk, after which it
+   * returns control to the calling thread.
+   * This may be a lengthy operation, depending on the amount of data that 
still
+   * needs to be written to disk.
+   * 
    * @throws InterruptedException 
    * @throws IOException 
    */
@@ -676,10 +740,21 @@
     logger.info("Index shutdown complete");
   }
 
+  
+  /**
+   * Gets the {@link IndexConfig} value for this index.
+   * @return
+   */
   public IndexConfig getIndexConfig() {
     return indexConfig;
   }
   
+  /**
+   * Returns the {@link QueryEngine} instance that can be used to post queries
+   * to this index. Each index holds one single query engine, so the same value
+   * will always be returned by repeated calls.
+   * @return
+   */
   public QueryEngine getQueryEngine() {
     if(queryEngine == null) {
       queryEngine = new QueryEngine(this);
@@ -687,38 +762,101 @@
     return queryEngine;
   }
 
+  
+  /**
+   * Gets the top level directory for this index.
+   * @return
+   */
   public File getIndexDirectory() {
     return indexDirectory;
   }
 
-  public long getOccurrencesPerBatch() {
-    return occurrencesPerBatch;
-  }
-
   /**
-   * Gets the current estimated number of occurrences in RAM
+   * Gets the current estimated number of occurrences in RAM. An occurrence
+   * represents one term (either a token or an annotation) occurring in an
+   * indexed document. This value can be used as a good measurement of the 
total
+   * amount of data that is currently being stored in RAM and waiting to be
+   * synced to disk.
    * @return
    */
   public long getOccurrencesInRam() {
     return occurrencesInRam;
   }
 
+  /**
+   * Returns the size of the indexing queue. See 
+   * {@link #setIndexingQueueSize(int)} for more comments.
+   * @return
+   */
   public int getIndexingQueueSize() {
     return indexingQueueSize;
   }
 
+  /**
+   * Sets the size of the indexing queue(s) used by this index.
+   * Documents submitted for indexing are held in a queue until the indexers 
+   * become ready to process them. One queue is used for each of the 
+   * sub-indexes. A larger queue size can smooth out bursts of activity, but 
+   * requires more memory (as a larger number of documents may need to be 
stored
+   * at the same time). A smaller value is more economical, but it can lead to
+   * slow-downs when certain documents take too long to index, and can clog up
+   * the queue. Defaults to {@value #DEFAULT_INDEXING_QUEUE_SIZE}.
+   * @param indexingQueueSize
+   */
   public void setIndexingQueueSize(int indexingQueueSize) {
     this.indexingQueueSize = indexingQueueSize;
   }
 
+  /**
+   * Gets the number of occurrences that should be used as a trigger for a sync
+   * to disk operation, leading to the creation of a new index batch.
+   * @return
+   */
+  public long getOccurrencesPerBatch() {
+    return occurrencesPerBatch;
+  }
+  
+  /**
+   * Sets the number of occurrences that should trigger a sync-to-disk 
operation
+   * leading to a new batch being created from the data previously stored in
+   * RAM.
+   * 
+   * An occurrence represents one term (either a token or an annotation)
+   * occurring in an indexed document. This value can be used as a good
+   * measurement of the total amount of data that is currently being stored in
+   * RAM and waiting to be synced to disk.
+   * 
+   * @param occurrencesPerBatch
+   */
   public void setOccurrencesPerBatch(long occurrencesPerBatch) {
     this.occurrencesPerBatch = occurrencesPerBatch;
   }
   
+  
+  /**
+   * Gets the time interval (in milliseconds) between sync-to-disk operations.
+   * This is approximately the maximum amount of time that a document can spend
+   * being stored in RAM (and thus not searchable) after having been submitted
+   * for indexing. The measurement is not precise because of the time spent by
+   * the document in the indexing queue (after being received but before being
+   * processed) and the time taken to write a new index batch to disk.
+   * 
+   * @return
+   */
   public int getTimeBetweenBatches() {
     return getIndexConfig().getTimeBetweenBatches();
   }
 
+  /**
+   * Sets the time interval (in milliseconds) between sync-to-disk operations.
+   * This is approximately the maximum amount of time that a document can spend
+   * being stored in RAM (and thus not searchable) after having been submitted
+   * for indexing. The measurement is not precise because of the time spent by
+   * the document in the indexing queue (after being received but before being
+   * processed) and the time taken to write a new index batch to disk.
+   * 
+   * @param timeBetweenBatches the time interval (in milliseconds) between
+   *          sync-to-disk operations.
+   */  
   public void setTimeBetweenBatches(int timeBetweenBatches) {
     if(indexConfig.getTimeBetweenBatches() != timeBetweenBatches) {
       indexConfig.setTimeBetweenBatches(timeBetweenBatches);
@@ -733,6 +871,13 @@
     }
   }
 
+  /**
+   * Gets the {@link DocumentCollection} instance used by this index. The 
+   * document collection is normally fully managed by the index, so there 
should
+   * be no need to access it directly through this method.
+   * 
+   * @return
+   */
   public DocumentCollection getDocumentCollection() {
     return documentCollection;
   }
@@ -867,6 +1012,14 @@
     }
   }
   
+  
+  /**
+   * Returns the {@link AtomicTokenIndex} responsible for indexing a particular
+   * feature on token annotations.
+   * 
+   * @param featureName
+   * @return
+   */
   public AtomicTokenIndex getTokenIndex(String featureName) {
     if(featureName == null) {
       // return the default token index
@@ -881,6 +1034,13 @@
     return null;
   }
   
+  /**
+   * Returns the {@link AtomicAnnotationIndex} instance responsible for 
indexing
+   * annotations of the type specified.
+   * 
+   * @param annotationType
+   * @return
+   */
   public AtomicAnnotationIndex getAnnotationIndex(String annotationType) {
     for(int i = 0; i < indexConfig.getSemanticIndexers().length; i++) {
       for(String aType : 

Modified: 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/DocumentCollection.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/DocumentCollection.java  
2014-02-20 14:52:32 UTC (rev 17367)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/DocumentCollection.java  
2014-02-20 15:00:45 UTC (rev 17368)
@@ -15,6 +15,7 @@
 package gate.mimir.index;
 
 
+import gate.mimir.MimirIndex;
 import it.unimi.dsi.fastutil.longs.Long2ObjectLinkedOpenHashMap;
 
 import java.io.BufferedOutputStream;
@@ -48,7 +49,8 @@
 
 /**
  * A Mimir document collection. Consists of one or more zip files containing 
- * serialised {@link DocumentData} values.
+ * serialised {@link DocumentData} values. Each {@link MimirIndex} contains a
+ * document collection.
  */
 public class DocumentCollection {
   

Added: mimir/branches/5.0/mimir-core/src/gate/mimir/package-info.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/package-info.java              
                (rev 0)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/package-info.java      
2014-02-20 15:00:45 UTC (rev 17368)
@@ -0,0 +1,27 @@
+/*
+ * package-info.java
+ * 
+ * Copyright (c) 2007-2014, The University of Sheffield.
+ * 
+ * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
+ * and is free software, licenced under the GNU Lesser General Public License,
+ * Version 3, June 2007 (also included with this distribution as file
+ * LICENCE-LGPL3.html).
+ * 
+ * Valentin Tablan, 20 Feb 2014
+ * 
+ * $Id$
+ */
+/**
+ * This is the Mímir Java API. For more high-level information about Mímir, see
+ * the <a href="http://gate.ac.uk/mimir">Mímir home page</a>.  
+ * 
+ * The top level entry point for the Mímir API is the 
+ * {@link gate.mimir.MimirIndex} class, which can be used to create new indexes
+ * or open existing ones. To create a new index you will need to supply a 
+ * properly populated instance of {@link gate.mimir.IndexConfig} to 
+ * {@link gate.mimir.MimirIndex#MimirIndex(IndexConfig)}. To open an
+ * existing index, use {@link gate.mimir.MimirIndex#MimirIndex(java.io.File)}, 
+ * and point it to the existing index directory.
+ */
+package gate.mimir;
\ No newline at end of file


Property changes on: 
mimir/branches/5.0/mimir-core/src/gate/mimir/package-info.java
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+Id
\ No newline at end of property
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Managing the Performance of Cloud-Based Applications
Take advantage of what the Cloud has to offer - Avoid Common Pitfalls.
Read the Whitepaper.
http://pubads.g.doubleclick.net/gampad/clk?id=121054471&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

[gate-cvs] SF.net SVN: gate:[17368] mimir/branches/5.0/mimir-core/src/gate/mimir

Reply via email to