Revision: 17204
http://sourceforge.net/p/gate/code/17204
Author: valyt
Date: 2013-12-23 16:16:04 +0000 (Mon, 23 Dec 2013)
Log Message:
-----------
- basic support for searching now implemented, for the simplest case.
- we now also write Bloom Filters when creating new indexes, to speed up
searches in the documental cluster.
- moved the term processor up to AtomicIndex, to allow the search
implementation to reside at that level.
Modified Paths:
--------------
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
2013-12-21 18:23:19 UTC (rev 17203)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
2013-12-23 16:16:04 UTC (rev 17204)
@@ -26,7 +26,11 @@
import it.unimi.di.big.mg4j.index.IndexWriter;
import it.unimi.di.big.mg4j.index.QuasiSuccinctIndex;
import it.unimi.di.big.mg4j.index.QuasiSuccinctIndexWriter;
+import it.unimi.di.big.mg4j.index.TermProcessor;
+import it.unimi.di.big.mg4j.index.Index.UriKeys;
+import it.unimi.di.big.mg4j.index.cluster.ContiguousDocumentalStrategy;
import it.unimi.di.big.mg4j.index.cluster.DocumentalCluster;
+import it.unimi.di.big.mg4j.index.cluster.DocumentalConcatenatedCluster;
import it.unimi.di.big.mg4j.io.IOFactory;
import it.unimi.di.big.mg4j.tool.Scan;
import it.unimi.dsi.big.io.FileLinesCollection;
@@ -55,7 +59,10 @@
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.PrintWriter;
+import java.lang.reflect.InvocationTargetException;
+import java.net.URISyntaxException;
import java.nio.ByteOrder;
+import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
@@ -257,7 +264,20 @@
long numberOfPostings;
long numberOfOccurences;
int maxCount;
-
+ /**
+  * Creates the record describing one MG4J sub-index (the head or one of the
+  * tails) together with its summary statistics.
+  *
+  * @param index the opened MG4J index
+  * @param indexDir the directory associated with this sub-index
+  * @param termFilter the Bloom filter built over the terms of this sub-index
+  * @param numberOfDocuments the number of documents in this sub-index
+  * @param numberOfTerms the number of terms in this sub-index
+  * @param numberOfPostings the number of postings in this sub-index
+  * @param numberOfOccurences the number of occurrences in this sub-index
+  * @param maxCount the largest count found in any posting of this sub-index
+  */
+ public MG4JIndex(
+     Index index,
+     File indexDir,
+     BloomFilter<Void> termFilter,
+     int numberOfDocuments,
+     int numberOfTerms,
+     long numberOfPostings,
+     long numberOfOccurences,
+     int maxCount) {
+   // summary statistics
+   this.maxCount = maxCount;
+   this.numberOfOccurences = numberOfOccurences;
+   this.numberOfPostings = numberOfPostings;
+   this.numberOfTerms = numberOfTerms;
+   this.numberOfDocuments = numberOfDocuments;
+   // the index itself and its companions
+   this.termFilter = termFilter;
+   this.indexDir = indexDir;
+   this.index = index;
+ }
}
/**
@@ -278,6 +298,60 @@
}
/**
+ * Creates a documental cluster from a list of {@link MG4JIndex} values.
+ *
+ * @param subIndexes the indexes to be combined into a cluster
+ * @param termProcessor the term processor to be used (can be null)
+ * @return a documental cluster view of the list of indexes provided.
+ */
+ protected final static DocumentalCluster openIndexCluster(
+     List<MG4JIndex> subIndexes,
+     TermProcessor termProcessor){
+
+   // nothing to cluster: callers get null rather than an empty cluster
+   if(subIndexes == null || subIndexes.isEmpty()) return null;
+   // prepare the documental cluster
+   Index[] indexes = new Index[subIndexes.size()];
+   // ContiguousDocumentalStrategy expects one cut point per local index (the
+   // global ID of its first document) plus a final element holding the
+   // overall number of documents, hence indexes.length + 1 values.
+   long[] cutPoints = new long[indexes.length + 1];
+   cutPoints[0] = 0;
+   // the totals start from zero so that they are exact sums over the
+   // sub-indexes (starting from -1 made every total one too small)
+   int numberOfTerms = 0;
+   int numberOfDocuments = 0;
+   long numberOfPostings = 0;
+   long numberOfOccurences = 0;
+   // running maximum; -1 is below any value a sub-index can report
+   int maxCount = -1;
+   int indexIdx = 0;
+
+   for(MG4JIndex aSubIndex : subIndexes) {
+     indexes[indexIdx] = aSubIndex.index;
+     // cut points are cumulative: each sub-index starts where the previous
+     // one ended, and the last entry ends up as the total document count
+     cutPoints[indexIdx + 1] =
+         cutPoints[indexIdx] + aSubIndex.numberOfDocuments;
+     // NOTE(review): a term present in several sub-indexes is counted once
+     // per sub-index, so this sum is an upper bound on the number of distinct
+     // terms in the cluster - confirm this is what the cluster expects.
+     numberOfTerms += aSubIndex.numberOfTerms;
+     numberOfDocuments += aSubIndex.numberOfDocuments;
+     numberOfPostings += aSubIndex.numberOfPostings;
+     numberOfOccurences += aSubIndex.numberOfOccurences;
+     if(maxCount < aSubIndex.maxCount) maxCount = aSubIndex.maxCount;
+     indexIdx++;
+   }
+   return new DocumentalConcatenatedCluster(indexes,
+       new ContiguousDocumentalStrategy(cutPoints),
+       false, // flat = all component indexes have the same term list
+       null, // Bloom Filters
+       numberOfDocuments,
+       numberOfTerms,
+       numberOfPostings,
+       numberOfOccurences,
+       maxCount,
+       null, // payload
+       true, // hasCounts
+       true, // hasPositions,
+       termProcessor,
+       null, // field
+       null, // sizes
+       null // properties
+       );
+ }
+
+ /**
* The file name (under the current directory for this atomic index) which
* stores the principal index.
*/
@@ -328,9 +402,13 @@
protected String name;
protected File indexDirectory;
-
/**
+ * The term processor used to process the feature values being indexed.
+ */
+ protected TermProcessor termProcessor = null;
+
+ /**
* The size (number of terms) for the longest document indexed but not yet
* saved.
*/
@@ -353,16 +431,11 @@
protected MimirIndex parent;
/**
- * The head index for this atomic index.
+ * A list containing the head and tails of this index.
*/
- protected MG4JIndex head;
+ protected List<MG4JIndex> subIndexes;
/**
- * The tails for this atomic index.
- */
- protected List<MG4JIndex> tails;
-
- /**
* The cluster-view of all the MG4J indexes that are part of this index (i.e.
* the head and all the tails).
*/
@@ -410,7 +483,7 @@
/**
* The number of documents currently stored in RAM.
*/
- protected long documentsInRAM;
+ protected int documentsInRAM;
/**
* An in-memory inverted index that gets dumped to files for each batch.
@@ -422,6 +495,7 @@
*/
protected IntArrayList documentSizesInRAM;
+
/**
* Creates a new AtomicIndex
*
@@ -450,9 +524,26 @@
this.additionalProperties = new Properties();
+ initIndex();
}
-
+ /**
+  * Opens the index and prepares it for indexing and searching.
+  *
+  * The list of sub-indexes is always initialised (empty), so that batches
+  * dumped later can be appended to it regardless of whether the index
+  * directory already existed. Loading the sub-indexes of an existing index is
+  * not implemented yet.
+  *
+  * NOTE(review): this appears to be invoked from the constructor, so
+  * {@link #termProcessor} is read with whatever value it holds at that
+  * point - confirm subclasses do not rely on setting it afterwards.
+  */
+ protected void initIndex() {
+   // never leave the list null: the batch-dumping code adds new tails to it
+   subIndexes = new ArrayList<AtomicIndex.MG4JIndex>();
+   // open the index
+   if(indexDirectory.exists()) {
+     // opening an existing index
+     //TODO: load the existing sub-indexes into subIndexes
+   } else {
+     // new index creation
+     indexDirectory.mkdirs();
+     documentPointer = 0;
+   }
+   // with no sub-indexes yet, openIndexCluster returns null
+   indexCluster = openIndexCluster(subIndexes, termProcessor);
+ }
+
/**
* Gets the name of this atomic index. This is used as the file name for the
* directory storing the index files.
@@ -567,7 +658,8 @@
termArray[other] = temp;
}
});
- // write the terms and termmap files
+ // write the terms, termmap, and bloom filter files
+ BloomFilter<Void> termFilter = BloomFilter.create(numTermsInRAM);
PrintWriter pw = new PrintWriter(
new OutputStreamWriter(new FastBufferedOutputStream(
new FileOutputStream(mg4jBasename +
DiskBasedIndex.TERMS_EXTENSION),
@@ -575,10 +667,14 @@
"UTF-8" ));
for (MutableString t : termArray ) {
t.println( pw );
+ termFilter.add(t.toString());
}
pw.close();
generateTermMap(new File(mg4jBasename + DiskBasedIndex.TERMS_EXTENSION),
new File(mg4jBasename + DiskBasedIndex.TERMMAP_EXTENSION));
+ // write the bloom filter
+ BinIO.storeObject(termFilter,
+ new File(mg4jBasename + DocumentalCluster.BLOOM_EXTENSION));
// write the sizes file
File sizesFile = new File(mg4jBasename + DiskBasedIndex.SIZES_EXTENSION);
OutputBitStream sizesStream = new OutputBitStream(sizesFile);
@@ -586,13 +682,14 @@
sizesStream.writeGamma(docSize);
}
sizesStream.close();
-
+ long postingsInRam = 0;
// write the actual index
int maxCount = 0;
for ( int i = 0; i < numTermsInRAM; i++ ) {
PostingsList postingsList = termMap.get( termArray[ i ] );
if ( maxCount < postingsList.maxCount ) maxCount = postingsList.maxCount;
postingsList.write(indexWriter);
+ postingsInRam += postingsList.frequency;
}
indexWriter.close();
// write the index properties
@@ -619,7 +716,31 @@
// this should never happen
throw new IndexException("Error while saving tail properties", e);
}
- // merge new tail into index cluster
+
+ // merge new tail into index cluster
+ Index newIndex = null;
+ try {
+ try{
+ newIndex = Index.getInstance(mg4jBasename + "?" +
+ UriKeys.MAPPED.name().toLowerCase() + "=1;");
+ } catch(IOException e) {
+ // memory mapping failed
+ logger.info("Memory mapping failed for index " + mg4jBasename
+ + ". Loading as file index instead");
+ // now try to just open it as an on-disk index
+ newIndex = Index.getInstance(mg4jBasename, true, true);
+ }
+ } catch(ConfigurationException | ClassNotFoundException | SecurityException
+ | InstantiationException | IllegalAccessException
+ | InvocationTargetException | NoSuchMethodException
+ | URISyntaxException e) {
+ throw new IndexException("Could not open the index just written to " +
+ mg4jBasename , e);
+ }
+ MG4JIndex newIndexData = new MG4JIndex(newIndex, newTailDir, termFilter,
+ documentsInRAM, numTermsInRAM, postingsInRam, occurrencesInRAM,
maxCount);
+ subIndexes.add(newIndexData);
+ indexCluster = openIndexCluster(subIndexes, termProcessor);
if(hasDirectIndex) {
// dump new direct tail (invert the tail just written)
@@ -662,16 +783,6 @@
indexingThread = Thread.currentThread();
GATEDocument aDocument;
try{
- // open the index
- if(indexDirectory.exists()) {
- // opening an existing index
- //TODO
- } else {
- // new index creation
- indexDirectory.mkdirs();
- documentPointer = 0;
- }
-
// start in-RAM indexing
newBatch();
if(inputQueue != null) {
@@ -856,4 +967,14 @@
public File getIndexDirectory() {
return indexDirectory;
}
+
+ /**
+ * Gets an {@link Index} value that can be used to search this atomic index.
+ * This will normally be a {@link DocumentalCluster} view over all the
+ * sub-indexes contained.
+ * @return the current cluster view of this index; this is
+ * <code>null</code> while there are no sub-indexes to combine.
+ */
+ public Index getIndex() {
+ return indexCluster;
+ }
}
Modified:
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java
2013-12-21 18:23:19 UTC (rev 17203)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java
2013-12-23 16:16:04 UTC (rev 17204)
@@ -94,13 +94,9 @@
*/
protected String featureName;
+
/**
- * The term processor used to process the feature values being indexed.
- */
- protected TermProcessor termProcessor;
-
- /**
* @param parent
* @param name
* @param indexDirectory
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Rapidly troubleshoot problems before they affect your business. Most IT
organizations don't have a clear picture of how application performance
affects their revenue. With AppDynamics, you get 100% visibility into your
Java,.NET, & PHP application. Start your 15-day FREE TRIAL of AppDynamics Pro!
http://pubads.g.doubleclick.net/gampad/clk?id=84349831&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs