Revision: 17253
http://sourceforge.net/p/gate/code/17253
Author: valyt
Date: 2014-01-29 12:01:38 +0000 (Wed, 29 Jan 2014)
Log Message:
-----------
More work on supporting direct indexes:
- Direct terms are not lexicographically sorted, but are instead ordered by
occurrence time;
- this implies that direct term IDs are now different from inverted term IDs.
Not ideal, but can't be helped.
Modified Paths:
--------------
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
mimir/branches/5.0/mimir-core/src/gate/mimir/search/terms/AbstractIndexTermsQuery.java
mimir/branches/5.0/mimir-core/src/gate/mimir/search/terms/TermTypeTermsQuery.java
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
2014-01-28 17:13:53 UTC (rev 17252)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
2014-01-29 12:01:38 UTC (rev 17253)
@@ -60,7 +60,11 @@
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.longs.LongBigList;
+import it.unimi.dsi.fastutil.objects.Object2LongAVLTreeMap;
+import it.unimi.dsi.fastutil.objects.Object2LongMap;
import it.unimi.dsi.fastutil.objects.Object2ReferenceOpenHashMap;
+import it.unimi.dsi.fastutil.objects.ObjectBigArrayBigList;
+import it.unimi.dsi.fastutil.objects.ObjectBigList;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;
@@ -82,6 +86,7 @@
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
@@ -96,14 +101,20 @@
import com.google.common.io.PatternFilenameFilter;
/**
- * An indirect index associating terms with documents. Terms can be either
token
+ * An inverted index associating terms with documents. Terms can be either
token
* feature values, or semantic annotation URIs. Optionally, a direct index may
* also be present.
*
* An atomic index manages a head index (the principal data) and a set of tail
- * indexes (batches containing updates). Additionally, the date representing
+ * indexes (batches containing updates). Additionally, the data representing
* all the new documents that have been queued for indexing since the last tail
- * was written are stored in RAM.
+ * was written are stored in RAM.
+ *
+ * When direct indexing is enabled, the term IDs in the direct index are
+ * different from the term IDs in the inverted index. In the inverted index
+ * the term IDs are their position in the lexicographically sorted list of all
+ * terms. In the direct index, the term IDs are their position in the list
+ * sorted by the time they were first seen during indexing.
*/
public abstract class AtomicIndex implements Runnable {
@@ -137,6 +148,11 @@
}
+ /**
+ * A callable that does nothing. This is used to produce instances of
+ * {@link CustomFuture} which are only used to wait for the completion of an
+ * operation that returns nothing.
+ */
private static final Callable<Void> noOpVoid = new Callable<Void>() {
@Override
public Void call() throws Exception {
@@ -160,37 +176,47 @@
private long lastDocumentPointer = -1;
/**
- * The list of document pointer differentials (differences from
- * {@link #firstDocumentPointer}). For the sake of easy alignment, we
- * actually store a <tt>0</tt> on the first position.
+ * The list of document pointer differentials (differences from the
previous
+ * document pointer). The first document pointer value is stored in
+ * {@link #firstDocumentPointer}, and we store a <tt>0</tt> in the first
+ * position of this list. The actual value of a document pointer for a
given
+ * position is:
+ * <dl>
+ * <dt>pos 0</dt>
+ * <dd>{@link #firstDocumentPointer}</dd>
+ * <dt>pos i</dt>
+ * <dd>pointer at position (i-1) + documentPointersDifferential[i]</dd>
+ * </dl>
*/
private IntList documentPointersDifferential;
/**
- * The list of counts for each document. This list is aligned with
+ * The count (number of terms) for each document. This list is aligned
with
* {@link #documentPointersDifferential}.
*/
private IntList counts;
/**
- * The list of positions in this postings list. For each document at
position
- * <tt>i</i>, there will be counts[i] positions stored in this list.
+ * The list of positions in this postings list. For each document at
+ * position <tt>i</i>, there will be counts[i] positions stored in this
+ * list. This value is <code>non-null</code> only if positions are stored,
+ * which is configured through a construction-time parameter.
*/
private IntArrayList positions;
/**
- * The last seen position.
+ * The last seen position in the current document.
*/
private int lastPosition = -1;
/**
- * The number of position in the current document
+ * The number of positions in the current document
*/
private int count = 0;
/**
- * The maximum count of all the stored documents
+ * The maximum term count of all the stored documents
*/
private int maxCount = 0;
@@ -200,7 +226,7 @@
private long frequency = 0;
/**
- * The total number of occurrences stored.
+ * The total number of term occurrences in all stored documents.
*/
private long occurrences = 0;
@@ -223,7 +249,7 @@
* @param pointer
*/
public void newDocumentPointer(long pointer) {
- // is this really a new document
+ // is this really a new document?
if(pointer != lastDocumentPointer) {
if(firstDocumentPointer < 0) firstDocumentPointer = pointer;
if(lastDocumentPointer == -1) {
@@ -254,13 +280,18 @@
}
}
+ /**
+ * When storing positions, the count is automatically calculated. When not
+ * storing positions, it needs to be explicitly set by calling this method.
+ * @param count
+ */
public void setCount(int count) {
this.count = count;
}
/**
* Checks whether the given position is valid (i.e. greater than the last
- * seen positions. If the position is invalid, this means that a call to
+ * seen position. If the position is invalid, this means that a call to
* {@link #addPosition(int)} with the same value would actually be a
* no-operation.
* @param pos
@@ -286,7 +317,7 @@
}
/**
- * Writes the data contained in this postings list to a index writer
+ * Writes the data contained in this postings list to an index writer
* @param indexWriter
* @throws IOException
*/
@@ -416,7 +447,7 @@
* @param termProcessor the term processor to be used (can be null)
* @return a documental cluster view of the list of indexes provided.
*/
- protected final static Index openIndexCluster(
+ protected final static Index openInvertedIndexCluster(
List<MG4JIndex> subIndexes,
TermProcessor termProcessor){
@@ -508,6 +539,8 @@
*/
public static final String TAIL_FILE_NAME_PREFIX = "tail-";
+
+ public static final String DIRECT_TERMS_FILENAME = "direct.terms";
/**
* The file name (under the current directory for this atomic index) for the
* directory containing the documents that have been queued for indexing,
but
@@ -577,7 +610,7 @@
* The cluster-view of all the MG4J indexes that are part of this index (i.e.
* the head and all the tails).
*/
- protected Index indexCluster;
+ protected Index invertedIndex;
/**
* The direct index for this atomic index. If
@@ -598,8 +631,30 @@
*/
protected Properties additionalDirectProperties;
+ /**
+ * Is the direct indexing enabled? Direct indexes are used to find terms
+ * occurring in given documents. This is the reverse operation to the typical
+ * search, which finds documents containing a given set of terms.
+ */
protected boolean hasDirectIndex;
+ /**
+ * This map associates direct index terms with their IDs. See the note at the
+ * top-level javadocs for this class for a discussion on direct and inverted
+ * term IDs.
+ */
+ protected Object2LongMap<String> directTermIds;
+
+ /**
+ * The terms in the direct index, in the order they were first seen during
+ * indexing.
+ */
+ protected ObjectBigList<String> directTerms;
+
+ /**
+ * The single thread used to index documents. All writes to the index files
+ * are done from this thread.
+ */
protected Thread indexingThread;
/**
@@ -630,7 +685,6 @@
*/
protected long documentPointer;
-
/**
* The number of documents currently stored in RAM.
*/
@@ -686,7 +740,6 @@
boolean hasDirectIndex, TermProcessor termProcessor,
BlockingQueue<GATEDocument> inputQueue,
BlockingQueue<GATEDocument> outputQueue) throws IOException,
IndexException {
- super();
this.parent = parent;
this.name = name;
this.indexDirectory = indexDirectory;
@@ -743,8 +796,30 @@
indexDirectory.mkdirs();
}
synchronized(this) {
- indexCluster = openIndexCluster(subIndexes, termProcessor);
+ invertedIndex = openInvertedIndexCluster(subIndexes, termProcessor);
}
+ // open direct index
+ if(hasDirectIndex) {
+ directTerms = new ObjectBigArrayBigList<String>();
+ directTermIds = new Object2LongAVLTreeMap<String>();
+ directTermIds.defaultReturnValue(-1);
+ File directTermsFile = new File(indexDirectory, DIRECT_TERMS_FILENAME);
+ if(directTermsFile.exists()) {
+ FileLinesCollection fileLines = new FileLinesCollection(
+ directTermsFile.getAbsolutePath(), "UTF-8");
+ Iterator<MutableString> termsIter = fileLines.iterator();
+ long termID = 0;
+ while(termsIter.hasNext()) {
+ String term = termsIter.next().toString();
+ directTerms.add(term);
+ directTermIds.put(term, termID++);
+ }
+ }
+ synchronized(this) {
+ //TODO
+ // open direct index cluster
+ }
+ }
}
/**
@@ -791,11 +866,12 @@
}
/**
- * Writes all the data currently stored in RAM to a new tail index.
+ * Writes all the data currently stored in RAM to a new index batch.
The first
+ * batch is the head index, all other batches are tail indexes.
* @throws IOException
* @throws IndexException
*/
- protected void writeCurrentTail() throws IOException, IndexException {
+ protected void writeCurrentBatch() throws IOException, IndexException {
if(documentsInRAM == 0) return;
// find the name for the new tail
@@ -913,7 +989,7 @@
}
if(hasDirectIndex) {
- writeDirectIndex(newTailDir, termArray);
+ writeDirectIndex(newTailDir);
}
// update parent
parent.subtractOccurrences(occurrencesInRAM);
@@ -926,7 +1002,7 @@
// modify internal state
synchronized(this) {
subIndexes.add(openSubIndex(newTailName));
- indexCluster = openIndexCluster(subIndexes, termProcessor);
+ invertedIndex = openInvertedIndexCluster(subIndexes, termProcessor);
if(hasDirectIndex) {
// TODO
// merge the new direct batch into the direct cluster
@@ -953,7 +1029,7 @@
* @throws IOException
* @throws IndexException
*/
- protected void writeDirectIndex(File batchDir, MutableString[] termArray)
+ protected void writeDirectIndex(File batchDir)
throws IOException, IndexException {
// The index we are writing is a direct index, so we give it new terms
// which are actually document IDs, and they have posting lists containing
@@ -964,27 +1040,44 @@
new Object2ReferenceOpenHashMap<MutableString,
PostingsList>(INITIAL_TERM_MAP_SIZE, Hash.FAST_LOAD_FACTOR );
MutableString docIdStr = new MutableString();
- // we now read the posting lists for all the terms, in ascending term order
- for(int termId = 0; termId < termArray.length; termId++) {
- PostingsList termPostings = termMap.get(termArray[termId]);
- long docPointer = termPostings.firstDocumentPointer;
- for(int i = 0; i < termPostings.documentPointersDifferential.size();
i++) {
- docPointer += termPostings.documentPointersDifferential.get(i);
- int count = termPostings.counts.getInt(i);
- // convert data to the correct type
- docIdStr.replace(longToTerm(docPointer));
- // at this point we have term, document, counts so we can write the
data
- // to the in-RAM direct index
- PostingsList docPostings = docMap.get(docIdStr);
- if(docPostings == null) {
- docPostings = new PostingsList(false);
- docMap.put(docIdStr.copy(), docPostings);
- }
- docPostings.newDocumentPointer(termId);
- docPostings.setCount(count);
- docPostings.flush();
+ // make sure all the terms about to be indexed have a direct ID
+ for(MutableString termMS : termMap.keySet()) {
+ String termString = termMS.toString();
+ long directTermId = directTermIds.getLong(termString);
+ if(directTermId == directTermIds.defaultReturnValue()) {
+ // term not seen before
+ directTerms.add(termString);
+ directTermId = directTerms.size64() -1;
+ directTermIds.put(termString, directTermId);
}
}
+ // we now read the posting lists for all the terms, in ascending term
order
+ //for(int termId = 0; termId < termArray.length; termId++) {
+ MutableString termMS = new MutableString();
+ for(long directTermId = 0; directTermId < directTerms.size64();
directTermId++){
+ String termString = directTerms.get(directTermId);
+ termMS.replace(termString);
+ PostingsList termPostings = termMap.get(termMS);
+ if(termPostings != null) {
+ long docPointer = termPostings.firstDocumentPointer;
+ for(int i = 0; i < termPostings.documentPointersDifferential.size();
i++) {
+ docPointer += termPostings.documentPointersDifferential.get(i);
+ int count = termPostings.counts.getInt(i);
+ // convert data to the correct type
+ docIdStr.replace(longToTerm(docPointer));
+ // at this point we have term, document, counts so we can write the
data
+ // to the in-RAM direct index
+ PostingsList docPostings = docMap.get(docIdStr);
+ if(docPostings == null) {
+ docPostings = new PostingsList(false);
+ docMap.put(docIdStr.copy(), docPostings);
+ }
+ docPostings.newDocumentPointer(directTermId);
+ docPostings.setCount(count);
+ docPostings.flush();
+ }
+ }
+ }
// 2. write the data from RAM
String mg4jBasename = new File(batchDir, name + "-dir").getAbsolutePath();
@@ -996,7 +1089,7 @@
new QuasiSuccinctIndexWriter(
IOFactory.FILESYSTEM_FACTORY,
mg4jBasename,
- termArray.length,
+ directTerms.size64(),
Fast.mostSignificantBit(QuasiSuccinctIndex.DEFAULT_QUANTUM),
QuasiSuccinctIndexWriter.DEFAULT_CACHE_SIZE,
flags,
@@ -1043,11 +1136,17 @@
BinIO.storeObject(docBloomFilter,
new File(mg4jBasename + DocumentalCluster.BLOOM_EXTENSION));
// write the sizes file
+ // this is a list of document sizes (directTerms in our case)
File sizesFile = new File(mg4jBasename + DiskBasedIndex.SIZES_EXTENSION);
OutputBitStream sizesStream = new OutputBitStream(sizesFile);
int maxTermSize = -1; // -1 means unknown
- for(MutableString term : termArray) {
- int termSize = (int)termMap.get(term).frequency;
+ //for(MutableString term : termArray) {
+ for(long directTermId = 0; directTermId < directTerms.size64();
directTermId++){
+ String termString = directTerms.get(directTermId);
+ termMS.replace(termString);
+ PostingsList termPostings = termMap.get(termMS);
+ int termSize = termPostings != null ?
+ (int)termPostings.frequency : 0;
sizesStream.writeGamma(termSize);
if(termSize > maxTermSize) maxTermSize = termSize;
}
@@ -1087,7 +1186,14 @@
// this should never happen
throw new IndexException("Error while saving tail properties", e);
}
-
+ //update the index-wide direct terms file
+ pw = new PrintWriter(new OutputStreamWriter(new FastBufferedOutputStream(
+ new FileOutputStream(new File(indexDirectory, DIRECT_TERMS_FILENAME)),
+ 64 * 1024), "UTF-8" ));
+ for (String t : directTerms ) {
+ pw.println(t);
+ }
+ pw.close();
}
@@ -1140,6 +1246,10 @@
} catch(Exception e) {
throw new IndexException("Exception while combining sub-indexes", e);
}
+
+ if(hasDirectIndex()) {
+ // TODO
+ }
// update the internal state
synchronized(this) {
@@ -1151,7 +1261,7 @@
if(headDir.exists() && headDir.renameTo(headDirOld)){
if(headDirNew.renameTo(headDir)) {
subIndexes.add(0, openSubIndex(HEAD_FILE_NAME));
- indexCluster = openIndexCluster(subIndexes, termProcessor);
+ invertedIndex = openInvertedIndexCluster(subIndexes, termProcessor);
// clean-up: delete old head, used-up tails
if(!gate.util.Files.rmdir(headDirOld)) {
@@ -1278,7 +1388,7 @@
if(aDocument != GATEDocument.END_OF_QUEUE) {
if(aDocument == DUMP_BATCH) {
//dump batch was requested
- writeCurrentTail();
+ writeCurrentBatch();
} else if(aDocument == COMPRESS_INDEX) {
// compress index was requested
try {
@@ -1300,7 +1410,7 @@
}
} else {
// close down
- writeCurrentTail();
+ writeCurrentBatch();
flush();
}
if(aDocument != DUMP_BATCH && aDocument != COMPRESS_INDEX) {
@@ -1491,7 +1601,7 @@
* @return
*/
public Index getIndex() {
- return indexCluster;
+ return invertedIndex;
}
@@ -1507,7 +1617,7 @@
* representations of document IDs (as produced by {@link
#longToTerm(long)}).
* The search results is a set of "document IDs", which are
actually
* term IDs. The actual term string corresponding to the returned term IDs
can
- * be obtained by calling {@link #getTerm(long)}.
+ * be obtained by calling {@link #getDirectTerm(long)}.
*/
public Index getDirectIndex() {
return directIndex;
@@ -1523,21 +1633,21 @@
}
/**
- * Gets the term string for a given term ID. The term ID must have been
+ * Gets the term string for a given direct term ID. The term ID must have
been
* obtained from this index's direct index.
* @param termId the ID for the term being sought.
* @return the string for the given term.
*/
- public String getTerm(long termId) {
- return getIndex().termMap.list().get(termId).toString();
+ public CharSequence getDirectTerm(long termId) {
+ return directTerms.get(termId);
}
- public StringMap<? extends CharSequence> getTermMap() {
- return getIndex().termMap;
+ public ObjectBigList<? extends CharSequence> getDirectTerms() {
+ return directTerms;
}
- public LongBigList getTermOccurenceCounts() throws IOException {
+ public long getDirectTermOccurenceCount(long directTermId) throws
IOException {
//TODO: copy from IndexReaderPool
- return null;
+ return -1;
}
}
Modified:
mimir/branches/5.0/mimir-core/src/gate/mimir/search/terms/AbstractIndexTermsQuery.java
===================================================================
---
mimir/branches/5.0/mimir-core/src/gate/mimir/search/terms/AbstractIndexTermsQuery.java
2014-01-28 17:13:53 UTC (rev 17252)
+++
mimir/branches/5.0/mimir-core/src/gate/mimir/search/terms/AbstractIndexTermsQuery.java
2014-01-29 12:01:38 UTC (rev 17253)
@@ -265,7 +265,7 @@
String termString = null;
// get the term string
try {
- termString = atomicIndex.getTerm(termId);
+ termString = atomicIndex.getDirectTerm(termId).toString();
} catch(Exception e) {
System.err.println("Error reading indirect index term with ID " +
termId);
Modified:
mimir/branches/5.0/mimir-core/src/gate/mimir/search/terms/TermTypeTermsQuery.java
===================================================================
---
mimir/branches/5.0/mimir-core/src/gate/mimir/search/terms/TermTypeTermsQuery.java
2014-01-28 17:13:53 UTC (rev 17252)
+++
mimir/branches/5.0/mimir-core/src/gate/mimir/search/terms/TermTypeTermsQuery.java
2014-01-29 12:01:38 UTC (rev 17253)
@@ -186,8 +186,8 @@
}
// once we have the index reader, we scan the whole dictionary
- termId:for(long i = 0; i < atomicIndex.getTermMap().size64(); i++) {
- String termString = atomicIndex.getTerm(i);
+ termId:for(long i = 0; i < atomicIndex.getDirectTerms().size64(); i++) {
+ String termString = atomicIndex.getDirectTerm(i).toString();
// check this term should be returned
if(indexType == IndexType.ANNOTATIONS &&
!annotationHelper.isMentionUri(termString)) continue termId;
@@ -199,7 +199,7 @@
termDescriptions.add(annotationHelper.describeMention(termString));
}
if(countsEnabled) {
- long termCount = atomicIndex.getTermOccurenceCounts().get(i);
+ long termCount = atomicIndex.getDirectTermOccurenceCount(i);
if(termCount > Integer.MAX_VALUE) {
logger.warn("Term count lenght greater than 32 bit. Data was
pratially lost!");
termCounts.add(Integer.MAX_VALUE);
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
WatchGuard Dimension instantly turns raw network data into actionable
security intelligence. It gives you real-time visual feedback on key
security issues and trends. Skip the complicated setup - simply import
a virtual appliance and go from zero to informed in seconds.
http://pubads.g.doubleclick.net/gampad/clk?id=123612991&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs