Revision: 17196
http://sourceforge.net/p/gate/code/17196
Author: valyt
Date: 2013-12-20 15:20:21 +0000 (Fri, 20 Dec 2013)
Log Message:
-----------
We now write .sizes and .stats files as well, which brings us back to the same
on-disk representation of an MG4J index as Mímir version 4.
Started work on designing the clustering of indexes.
Modified Paths:
--------------
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
2013-12-20 12:49:03 UTC (rev 17195)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
2013-12-20 15:20:21 UTC (rev 17196)
@@ -26,6 +26,7 @@
import it.unimi.di.big.mg4j.index.IndexWriter;
import it.unimi.di.big.mg4j.index.QuasiSuccinctIndex;
import it.unimi.di.big.mg4j.index.QuasiSuccinctIndexWriter;
+import it.unimi.di.big.mg4j.index.cluster.DocumentalCluster;
import it.unimi.di.big.mg4j.io.IOFactory;
import it.unimi.di.big.mg4j.tool.Scan;
import it.unimi.dsi.big.io.FileLinesCollection;
@@ -45,14 +46,17 @@
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.sux4j.mph.LcpMonotoneMinimalPerfectHashFunction;
+import it.unimi.dsi.util.BloomFilter;
import it.unimi.dsi.util.Properties;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
+import java.io.PrintStream;
import java.io.PrintWriter;
import java.nio.ByteOrder;
+import java.util.List;
import java.util.concurrent.BlockingQueue;
import org.apache.commons.configuration.ConfigurationException;
@@ -241,6 +245,22 @@
}
/**
+ * Class representing an MG4J index batch, such as the head or any of the
+ * tails.
+ */
+ protected static class MG4JIndex {
+ protected File indexDir;
+ protected Index index;
+ protected BloomFilter<Void> termFilter;
+ int numberOfDocuments;
+ int numberOfTerms;
+ long numberOfPostings;
+ long numberOfOccurences;
+ int maxCount;
+
+ }
+
+ /**
* Given a terms file (text file with one term per line) this method generates
* the corresponding termmap file (binary representation of a StringMap).
* @param termsFile the input file
@@ -333,6 +353,22 @@
protected MimirIndex parent;
/**
+ * The head index for this atomic index.
+ */
+ protected MG4JIndex head;
+
+ /**
+ * The tails for this atomic index.
+ */
+ protected List<MG4JIndex> tails;
+
+ /**
+ * The cluster-view of all the MG4J indexes that are part of this index (i.e.
+ * the head and all the tails).
+ */
+ protected DocumentalCluster indexCluster;
+
+ /**
* A set of properties added to the ones obtained from the index writer when
* writing out batches.
*/
@@ -372,7 +408,7 @@
/**
- * THe number of documents currently stored in RAM.
+ * The number of documents currently stored in RAM.
*/
protected long documentsInRAM;
@@ -382,6 +418,11 @@
protected Object2ReferenceOpenHashMap<MutableString, PostingsList> termMap;
/**
+ * The sizes (numbers of terms) for all the documents indexed in RAM.
+ */
+ protected IntArrayList documentSizesInRAM;
+
+ /**
* Creates a new AtomicIndex
*
* @param parent the {@link MimirIndex} containing this atomic index.
@@ -411,21 +452,6 @@
}
- protected void initIndex() {
- if(indexDirectory.exists()) {
- // opening an existing index
- //TODO
- } else {
- // new index creation
- indexDirectory.mkdirs();
- documentPointer = 0;
- }
- occurrencesInRAM = 0;
- maxDocSizeInRAM = -1;
- documentsInRAM = 0;
- termMap = new Object2ReferenceOpenHashMap<MutableString,
- PostingsList>(INITIAL_TERM_MAP_SIZE, Hash.FAST_LOAD_FACTOR );
- }
/**
* Gets the name of this atomic index. This is used as the file name for the
@@ -454,6 +480,32 @@
}
/**
+ * Starts a new MG4J batch. First time around this will be the head,
+ * subsequent calls will start a new tail.
+ */
+ protected void newBatch() {
+ occurrencesInRAM = 0;
+ maxDocSizeInRAM = -1;
+ documentsInRAM = 0;
+ // hack to force zero-based batches. This forces us to use a merged cluster
+ // but avoids exceptions during indexing. We'll fix this if possible, after
+ // we get advice on the MG4J mailing list.
+ documentPointer = 0;
+ if(termMap == null) {
+ termMap = new Object2ReferenceOpenHashMap<MutableString,
+ PostingsList>(INITIAL_TERM_MAP_SIZE, Hash.FAST_LOAD_FACTOR );
+ } else {
+ termMap.clear();
+ termMap.trim( INITIAL_TERM_MAP_SIZE );
+ }
+ if(documentSizesInRAM == null) {
+ documentSizesInRAM = new IntArrayList();
+ } else {
+ documentSizesInRAM.clear();
+ }
+ }
+
+ /**
* Writes all the data currently stored in RAM to a new tail index.
* @throws IOException
* @throws IndexException
@@ -527,6 +579,13 @@
pw.close();
generateTermMap(new File(mg4jBasename + DiskBasedIndex.TERMS_EXTENSION),
new File(mg4jBasename + DiskBasedIndex.TERMMAP_EXTENSION));
+ // write the sizes file
+ File sizesFile = new File(mg4jBasename + DiskBasedIndex.SIZES_EXTENSION);
+ OutputBitStream sizesStream = new OutputBitStream(sizesFile);
+ for(int docSize : documentSizesInRAM.elements()) {
+ sizesStream.writeGamma(docSize);
+ }
+ sizesStream.close();
// write the actual index
int maxCount = 0;
@@ -550,6 +609,12 @@
properties.addAll(additionalProperties);
Scan.saveProperties( IOFactory.FILESYSTEM_FACTORY, properties,
mg4jBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
+
+ // write stats
+ PrintStream statsPs = new PrintStream(new File(mg4jBasename +
+ DiskBasedIndex.STATS_EXTENSION));
+ indexWriter.printStats(statsPs);
+ statsPs.close();
} catch(ConfigurationException e) {
// this should never happen
throw new IndexException("Error while saving tail properties", e);
@@ -563,17 +628,8 @@
// clear queued-documents folder
- // clear out internal state, in preparation for the next tail
-
- occurrencesInRAM = 0;
- maxDocSizeInRAM = -1;
- documentsInRAM = 0;
- // hack to force zero-based batches. This forces us to use a merged cluster
- // but avoids exceptions during indexing. We'll fix this if possible, after
- // we get advice on the MG4J mailing list.
- documentPointer = 0;
- termMap.clear();
- termMap.trim( INITIAL_TERM_MAP_SIZE );
+ // clear out internal state, in preparation for the next tail
+ newBatch();
}
/**
@@ -606,7 +662,18 @@
indexingThread = Thread.currentThread();
GATEDocument aDocument;
try{
- initIndex();
+ // open the index
+ if(indexDirectory.exists()) {
+ // opening an existing index
+ //TODO
+ } else {
+ // new index creation
+ indexDirectory.mkdirs();
+ documentPointer = 0;
+ }
+
+ // start in-RAM indexing
+ newBatch();
if(inputQueue != null) {
while((aDocument = inputQueue.take()) != GATEDocument.END_OF_QUEUE){
try {
@@ -735,6 +802,8 @@
// the current document is finished
int docLength = tokenPosition + 1;
if(docLength > maxDocSizeInRAM) maxDocSizeInRAM = docLength;
+ documentSizesInRAM.add(docLength);
+
} catch (IOException e) {
throw new IndexException("IO Exception while indexing", e);
}finally {
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Rapidly troubleshoot problems before they affect your business. Most IT
organizations don't have a clear picture of how application performance
affects their revenue. With AppDynamics, you get 100% visibility into your
Java, .NET, & PHP application. Start your 15-day FREE TRIAL of AppDynamics Pro!
http://pubads.g.doubleclick.net/gampad/clk?id=84349831&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs