Revision: 17370
http://sourceforge.net/p/gate/code/17370
Author: valyt
Date: 2014-02-20 15:35:58 +0000 (Thu, 20 Feb 2014)
Log Message:
-----------
More Javadocs.
Modified Paths:
--------------
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java    2014-02-20 15:12:35 UTC (rev 17369)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java    2014-02-20 15:35:58 UTC (rev 17370)
@@ -96,20 +96,27 @@
import com.google.common.io.PatternFilenameFilter;
/**
+ * <p>
* An inverted index associating terms with documents. Terms can be either token
- * feature values, or semantic annotation URIs. Optionally, a direct index may
- * also be present.
- *
+ * feature values, or annotations. Optionally, a direct index may also be
+ * present.
+ * </p>
+ * <p>
* An atomic index manages a head index (the principal data) and a set of tail
* indexes (batches containing updates). Additionally, the data representing
* all the new documents that have been queued for indexing since the last tail
* was written are stored in RAM.
- *
+ * </p>
+ * <p>
* When direct indexing is enabled, the term IDs in the direct index are
* different from the term IDs in the inverted index. In the inverted index
* the term IDs are their position in the lexicographically sorted list of all
* terms. In the direct index, the term IDs are their position in the list
* sorted by the time they were first seen during indexing.
+ * </p>
+ * <p>
+ * The head and tail batches can be combined into a new head by a
+ * <em>compact</em> operation.
*/
public abstract class AtomicIndex implements Runnable {
@@ -399,48 +406,54 @@
/**
* Given a terms file (text file with one term per line) this method generates
* the corresponding termmap file (binary representation of a StringMap).
- * Optionally, a {@link BloomFilter} can also be generated, if the suitable
+ * Optionally, a {@link BloomFilter} can also be generated, if the suitable
* target file is provided.
+ *
* @param termsFile the input file
- * @param termmapFile the output file
- * @param bloomFilterFile if not null, the file to be used for writing
- * the {@link BloomFilter} for the index.
- * @throws IOException
+ * @param termmapFile the output termmap file, or <code>null</code> if a
+ * termmap is not required.
+ * @param bloomFilterFile the file to be used for writing the
+ * {@link BloomFilter} for the index, or <code>null</code> if a Bloom filter
+ * is not required.
+ * @throws IOException
*/
- public static void generateTermMap(File termsFile, File termmapFile,
+ public static void generateTermMap(File termsFile, File termmapFile,
File bloomFilterFile) throws IOException {
FileLinesCollection fileLinesCollection =
new FileLinesCollection(termsFile.getAbsolutePath(), "UTF-8");
- StringMap<CharSequence> terms = new ShiftAddXorSignedStringMap(
- fileLinesCollection.iterator(),
- new LcpMonotoneMinimalPerfectHashFunction<CharSequence>(
- fileLinesCollection, TransformationStrategies.prefixFreeUtf16()));
- BinIO.storeObject(terms, termmapFile);
- if(bloomFilterFile != null) {
- BloomFilter<Void> bloomFilter = BloomFilter.create(terms.size64());
- for(MutableString term : fileLinesCollection) {
- bloomFilter.add(term);
- }
- BinIO.storeObject(bloomFilter, bloomFilterFile);
+ if(termmapFile != null) {
+ StringMap<CharSequence> terms =
+ new ShiftAddXorSignedStringMap(
+ fileLinesCollection.iterator(),
+ new LcpMonotoneMinimalPerfectHashFunction<CharSequence>(
+ fileLinesCollection, TransformationStrategies.prefixFreeUtf16()));
+ BinIO.storeObject(terms, termmapFile);
+ }
+
+ if(bloomFilterFile != null) {
+ BloomFilter<Void> bloomFilter = BloomFilter.create(fileLinesCollection.size64());
+ for(MutableString term : fileLinesCollection) {
+ bloomFilter.add(term);
}
+ BinIO.storeObject(bloomFilter, bloomFilterFile);
+ }
}
/**
* Creates a documental cluster from a list of {@link MG4JIndex} values.
*
- * @param subIndexes the indexes to be combined into a cluster
+ * @param batches the indexes to be combined into a cluster
* @param termProcessor the term processor to be used (can be null)
* @return a documental cluster view of the list of indexes provided.
*/
protected final static Index openInvertedIndexCluster(
- List<MG4JIndex> subIndexes,
- TermProcessor termProcessor){
+ List<MG4JIndex> batches, TermProcessor termProcessor){
- if(subIndexes == null || subIndexes.size() == 0) return null;
- if(subIndexes.size() == 1) return subIndexes.get(0).invertedIndex;
+ if(batches == null || batches.size() == 0) return null;
+ if(batches.size() == 1) return batches.get(0).invertedIndex;
// prepare the documental cluster
- Index[] indexes = new Index[subIndexes.size()];
+ Index[] indexes = new Index[batches.size()];
long[] cutPoints = new long[indexes.length];
cutPoints[0] = 0;
int numberOfTerms = -1;
@@ -453,7 +466,7 @@
@SuppressWarnings("unchecked")
BloomFilter<Void> bloomFilters[] = new BloomFilter[indexes.length];
- for(MG4JIndex aSubIndex : subIndexes) {
+ for(MG4JIndex aSubIndex : batches) {
indexes[indexIdx] = aSubIndex.invertedIndex;
if(indexIdx < cutPoints.length - 1) {
cutPoints[indexIdx + 1] = cutPoints[indexIdx] +
@@ -490,15 +503,19 @@
);
}
-
- protected final static Index openDirectIndexCluster(
- List<MG4JIndex> subIndexes){
+ /**
+ * Opens the direct index files from all the batches and combines them into
+ * a {@link LexicalCluster}.
+ * @param batches the batches to be opened.
+ * @return
+ */
+ protected final static Index openDirectIndexCluster(List<MG4JIndex> batches){
- if(subIndexes == null || subIndexes.size() == 0) return null;
- if(subIndexes.size() == 1) return subIndexes.get(0).directIndex;
+ if(batches == null || batches.size() == 0) return null;
+ if(batches.size() == 1) return batches.get(0).directIndex;
// prepare the lexical cluster
- Index[] indexes = new Index[subIndexes.size()];
+ Index[] indexes = new Index[batches.size()];
int[] cutPoints = new int[indexes.length];
cutPoints[0] = 0;
String[] cutPointTerms = new String[indexes.length];
@@ -512,7 +529,7 @@
@SuppressWarnings("unchecked")
BloomFilter<Void> bloomFilters[] = new BloomFilter[indexes.length];
- for(MG4JIndex aSubIndex : subIndexes) {
+ for(MG4JIndex aSubIndex : batches) {
indexes[indexIdx] = aSubIndex.directIndex;
// we build this based on the inverted index, as the cut-points for the
// lexical partitioning are based on document IDs
@@ -1058,11 +1075,8 @@
}
/**
- *
+ * Writes the in-RAM data to a new direct index batch.
* @param batchDir
- * @param termArray the in-RAM terms, sorted lexicographically
- * @throws IOException
- * @throws IndexException
*/
protected void writeDirectIndex(File batchDir)
throws IOException, IndexException {
@@ -1240,8 +1254,7 @@
/**
- * Combines all the currently existing sub-indexes, generating a new
- * head index.
+ * Combines all the currently existing batches, generating a new head index.
* @throws IndexException
* @throws IOException
* @throws ConfigurationException
@@ -1440,8 +1453,8 @@
/**
* Instructs this index to dump to disk all the in-RAM index data at the first
* opportunity.
- * @return a {@link Future} value that, upon completion, will return the number of
- * occurrences written to disk.
+ * @return a {@link Future} value that, upon completion, will return the
+ * number of occurrences written to disk.
* @throws InterruptedException if this thread is interrupted while trying to
* queue the dump request.
*/
@@ -1482,7 +1495,7 @@
}
/**
- * Opens one sub-index, specified as a directory inside this Atom Index's
+ * Opens one sub-index, specified as a directory inside this Atomic Index's
* index directory.
* @param subIndexDirname
* @return
@@ -1770,27 +1783,50 @@
}
}
-
+ /**
+ * Gets the top level directory for this atomic index. This will be a
+ * directory contained in the top level directory of the {@link MimirIndex}
+ * which includes this atomic index.
+ * @return
+ */
public File getIndexDirectory() {
return indexDirectory;
}
+ /**
+ * Gets the top level {@link MimirIndex} to which this atomic index belongs.
+ * @return
+ */
public MimirIndex getParent() {
return parent;
}
+ /**
+ * Gets the input queue used by this atomic index. This queue is used to
+ * submit documents for indexing.
+ * @return
+ */
public BlockingQueue<GATEDocument> getInputQueue() {
return inputQueue;
}
+ /**
+ * Gets the output queue used by this atomic index. This is used to
+ * "return" documents that have finished indexing. Notably, values
+ * in this queue will have their occurrences value (see
+ * {@link GATEDocument#getOccurrences()}) increased by the number of
+ * occurrences generated by indexing the document in this atomic index.
+ *
+ * @return
+ */
public BlockingQueue<GATEDocument> getOutputQueue() {
return outputQueue;
}
/**
- * Gets an {@link Index} value that can be used to search this atomic index.
- * This will normally be a {@link DocumentalCluster} view over all the
- * sub-indexes contained.
+ * Gets the inverted index (an {@link Index} value) that can be used to
+ * search this atomic index. This will normally be a
+ * {@link DocumentalCluster} view over all the batches contained.
* @return
*/
public Index getIndex() {
@@ -1818,18 +1854,32 @@
/**
* Gets the term string for a given direct term ID. The term ID must have been
- * obtained from this index's direct index.
+ * obtained from the direct index of this index.
* @param termId the ID for the term being sought.
* @return the string for the given term.
*/
public CharSequence getDirectTerm(long termId) {
return directTerms.get(termId);
}
-
+
+ /**
+ * Gets the list of direct terms for this index. The terms are sorted by the
+ * time they were first seen, and <strong>not</strong> lexicographically.
+ * @return
+ */
public ObjectBigList<? extends CharSequence> getDirectTerms() {
return directTerms;
}
+ /**
+ * Gets the occurrence count in the whole index for a given direct term,
+ * specified by a direct term ID (which must have been obtained from the
+ * direct index of this index).
+ *
+ * @param directTermId
+ * @return
+ * @throws IOException
+ */
public long getDirectTermOccurenceCount(long directTermId) throws IOException {
String termStr = directTerms.get(directTermId);
// we need to sum up all the counts for this term in the inverted index
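
For reference, a minimal sketch of how the revised generateTermMap contract
documented above (either output may now be skipped by passing null) could be
exercised. This is not part of the commit; the file paths are hypothetical
placeholders chosen only for illustration.

import gate.mimir.index.AtomicIndex;
import java.io.File;
import java.io.IOException;

public class GenerateTermMapExample {
  public static void main(String[] args) throws IOException {
    // Hypothetical paths; the real batch layout is managed inside AtomicIndex.
    File termsFile = new File("batch-0/terms");              // one term per line
    File termmapFile = new File("batch-0/termmap");          // serialised StringMap
    File bloomFilterFile = new File("batch-0/bloomfilter");  // serialised BloomFilter

    // Build both the termmap and the Bloom filter:
    AtomicIndex.generateTermMap(termsFile, termmapFile, bloomFilterFile);

    // Per the new Javadoc, either output can be omitted by passing null:
    AtomicIndex.generateTermMap(termsFile, termmapFile, null);      // termmap only
    AtomicIndex.generateTermMap(termsFile, null, bloomFilterFile);  // Bloom filter only
  }
}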