Revision: 17192
http://sourceforge.net/p/gate/code/17192
Author: valyt
Date: 2013-12-20 10:56:20 +0000 (Fri, 20 Dec 2013)
Log Message:
-----------
We now seem able to create multi-tailed Token indexes (or at least some files
on disk that have the correct names).
Modified Paths:
--------------
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/MimirIndexBuilder.java
mimir/branches/5.0/mimir-core/src/gate/mimir/util/MG4JTools.java
mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
2013-12-20 08:57:15 UTC (rev 17191)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
2013-12-20 10:56:20 UTC (rev 17192)
@@ -21,22 +21,31 @@
import gate.util.GateRuntimeException;
import it.unimi.di.big.mg4j.index.CompressionFlags;
import it.unimi.di.big.mg4j.index.DiskBasedIndex;
+import it.unimi.di.big.mg4j.index.Index;
import it.unimi.di.big.mg4j.index.IndexReader;
import it.unimi.di.big.mg4j.index.IndexWriter;
import it.unimi.di.big.mg4j.index.QuasiSuccinctIndex;
import it.unimi.di.big.mg4j.index.QuasiSuccinctIndexWriter;
import it.unimi.di.big.mg4j.io.IOFactory;
+import it.unimi.di.big.mg4j.tool.Scan;
+import it.unimi.dsi.big.io.FileLinesCollection;
+import it.unimi.dsi.big.util.ShiftAddXorSignedStringMap;
+import it.unimi.dsi.big.util.StringMap;
import it.unimi.dsi.bits.Fast;
+import it.unimi.dsi.bits.TransformationStrategies;
import it.unimi.dsi.fastutil.Arrays;
import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.Swapper;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntComparator;
import it.unimi.dsi.fastutil.ints.IntList;
+import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.objects.Object2ReferenceOpenHashMap;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;
+import it.unimi.dsi.sux4j.mph.LcpMonotoneMinimalPerfectHashFunction;
+import it.unimi.dsi.util.Properties;
import java.io.File;
import java.io.FileOutputStream;
@@ -46,6 +55,7 @@
import java.nio.ByteOrder;
import java.util.concurrent.BlockingQueue;
+import org.apache.commons.configuration.ConfigurationException;
import org.apache.log4j.Logger;
import com.google.common.io.PatternFilenameFilter;
@@ -80,7 +90,7 @@
/**
* The list of document pointer differentials (differences from
* {@link #firstDocumentPointer}). For the sake of easy alignment, we
- * actaully store a <tt>0</tt> on the first position.
+ * actually store a <tt>0</tt> on the first position.
*/
private IntList documentPointersDifferential;
@@ -230,6 +240,22 @@
}
}
+ /**
+ * Given a terms file (text file with one term per line) this method generates
+ * the corresponding termmap file (binary representation of a StringMap).
+ * @param termsFile the input file
+ * @param termmapFile the output file
+ * @throws IOException
+ */
+ public static void generateTermMap(File termsFile, File termmapFile) throws
IOException {
+ FileLinesCollection fileLinesCollection =
+ new FileLinesCollection(termsFile.getAbsolutePath(), "UTF-8");
+ StringMap<CharSequence> terms = new ShiftAddXorSignedStringMap(
+ fileLinesCollection.iterator(),
+ new LcpMonotoneMinimalPerfectHashFunction<CharSequence>(
+ fileLinesCollection, TransformationStrategies.prefixFreeUtf16()));
+ BinIO.storeObject(terms, termmapFile);
+ }
/**
* The file name (under the current directory for this atomic index) which
@@ -255,8 +281,6 @@
*/
public static final String TAIL_FILE_NAME_PREFIX = "tail-";
-
-
/**
* The file name (under the current directory for this atomic index) for the
* directory containing the documents that have been queued for indexing, but
@@ -284,16 +308,18 @@
protected String name;
protected File indexDirectory;
+
/**
- * The number of occurrences stored in this index.
+ * The size (number of terms) for the longest document indexed but not yet
+ * saved.
*/
- protected long totalOccurrences;
+ protected int maxDocSizeInRam = -1;
/**
* The number of occurrences represented in RAM and not yet written to disk.
*/
- protected long newOccurrences;
+ protected long occurrencesInRam = 0;
/**
* How many occurrences to be accumulated in RAM before a new tail batch is
@@ -306,6 +332,12 @@
*/
protected MimirIndex parent;
+ /**
+ * A set of properties added to the ones obtained from the index writer when
+ * writing out batches.
+ */
+ protected Properties additionalProperties;
+
protected boolean hasDirectIndex;
protected Thread indexingThread;
@@ -368,6 +400,7 @@
this.currentTerm = new MutableString();
+ this.additionalProperties = new Properties();
}
@@ -378,14 +411,12 @@
} else {
// new index creation
indexDirectory.mkdirs();
-
- totalOccurrences = 0;
- newOccurrences = 0;
documentPointer = 0;
-
- termMap = new Object2ReferenceOpenHashMap<MutableString,
- PostingsList>(INITIAL_TERM_MAP_SIZE, Hash.FAST_LOAD_FACTOR );
- }
+ }
+ occurrencesInRam = 0;
+ maxDocSizeInRam = -1;
+ termMap = new Object2ReferenceOpenHashMap<MutableString,
+ PostingsList>(INITIAL_TERM_MAP_SIZE, Hash.FAST_LOAD_FACTOR );
}
/**
@@ -417,19 +448,25 @@
/**
* Writes all the data currently stored in RAM to a new tail index.
* @throws IOException
+ * @throws IndexException
*/
- public void writeNewTail() throws IOException {
+ protected void writeCurrentTail() throws IOException, IndexException {
// find the name for the new tail
- String[] existingTails = indexDirectory.list(TAILS_FILENAME_FILTER);
int tailNo = -1;
- for(String aTail : existingTails) {
- int aTailNo =
Integer.parseInt(aTail.substring(TAIL_FILE_NAME_PREFIX.length()));
- if(aTailNo > tailNo) tailNo = aTailNo;
+ File headDir = new File(indexDirectory, HEAD_FILE_NAME);
+ if(headDir.exists()) {
+ // we have a head, calculate the tail number for this new tail
+ String[] existingTails = indexDirectory.list(TAILS_FILENAME_FILTER);
+ for(String aTail : existingTails) {
+ int aTailNo =
Integer.parseInt(aTail.substring(TAIL_FILE_NAME_PREFIX.length()));
+ if(aTailNo > tailNo) tailNo = aTailNo;
+ }
+ tailNo++;
}
- tailNo++;
// Open an index writer for the new tail
- String newTailName = TAIL_FILE_NAME_PREFIX + Integer.toString(tailNo);
+ String newTailName = tailNo == -1 ? HEAD_FILE_NAME :
+ (TAIL_FILE_NAME_PREFIX + Integer.toString(tailNo));
File newTailDir = new File(indexDirectory, newTailName);
newTailDir.mkdir();
String mg4jBasename = new File(newTailDir, name).getAbsolutePath();
@@ -445,7 +482,7 @@
int numTerms = termMap.size();
logger.info( "Generating index for batch " + newTailName +
"; documents: " + documentPointer + "; terms:" + numTerms +
- "; occurrences: " + newOccurrences );
+ "; occurrences: " + occurrencesInRam );
// We write down all term in appearance order in termArray.
final MutableString[] termArray = termMap.keySet().toArray(new
MutableString[ numTerms ]);
@@ -470,7 +507,7 @@
termArray[other] = temp;
}
});
- // write the term map
+ // write the terms and termmap files
PrintWriter pw = new PrintWriter(
new OutputStreamWriter(new FastBufferedOutputStream(
new FileOutputStream(mg4jBasename +
DiskBasedIndex.TERMS_EXTENSION),
@@ -480,6 +517,9 @@
t.println( pw );
}
pw.close();
+ generateTermMap(new File(mg4jBasename + DiskBasedIndex.TERMS_EXTENSION),
+ new File(mg4jBasename + DiskBasedIndex.TERMMAP_EXTENSION));
+
// write the actual index
int maxCount = 0;
for ( int i = 0; i < numTerms; i++ ) {
@@ -487,8 +527,25 @@
if ( maxCount < postingsList.maxCount ) maxCount = postingsList.maxCount;
postingsList.write(indexWriter);
}
-
indexWriter.close();
+ // write the index properties
+ try {
+ Properties properties = indexWriter.properties();
+ additionalProperties.setProperty( Index.PropertyKeys.SIZE,
+ indexWriter.writtenBits());
+ // -1 means unknown
+ additionalProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE,
+ maxDocSizeInRam);
+ additionalProperties.setProperty( Index.PropertyKeys.MAXCOUNT, maxCount
);
+ additionalProperties.setProperty( Index.PropertyKeys.OCCURRENCES,
+ occurrencesInRam );
+ properties.addAll(additionalProperties);
+ Scan.saveProperties( IOFactory.FILESYSTEM_FACTORY, properties,
+ mg4jBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
+ } catch(ConfigurationException e) {
+ // this should never happen
+ throw new IndexException("Error while saving tail properties", e);
+ }
// merge new tail into index cluster
if(hasDirectIndex) {
@@ -497,13 +554,16 @@
}
// clear queued-documents folder
+
+ // clear out internal state, in preparation for the next tail
- newOccurrences = 0;
-
- termMap.clear();
+ occurrencesInRam = 0;
+ maxDocSizeInRam = -1;
+ documentPointer = 0;
+ termMap.clear();
+ termMap.trim( INITIAL_TERM_MAP_SIZE );
}
-
/**
* Combines all the currently existing tails into the head, generating a new
* head index.
@@ -543,13 +603,13 @@
logger.error("Problem while indexing document!", e);
}
//dump batch if needed AND there is data to dump
- if (occurrencesPerBatch > 0 && newOccurrences > occurrencesPerBatch){
- writeNewTail();
+ if (occurrencesPerBatch > 0 && occurrencesInRam >
occurrencesPerBatch){
+ writeCurrentTail();
}
outputQueue.put(aDocument);
}
// we're done
- writeNewTail();
+ writeCurrentTail();
flush();
}
}catch(InterruptedException e) {
@@ -658,8 +718,11 @@
currentTerm.replace(aTerm == null ? "" : aTerm);
indexCurrentTerm();
}
- }
+ }
}
+ // the current document is finished
+ int docLength = tokenPosition + 1;
+ if(docLength > maxDocSizeInRam) maxDocSizeInRam = docLength;
} catch (IOException e) {
throw new IndexException("IO Exception while indexing", e);
}finally {
@@ -686,10 +749,29 @@
//for duplicate values.
if(termPostings.checkPosition(tokenPosition)){
termPostings.addPosition(tokenPosition);
- newOccurrences++;
+ occurrencesInRam++;
} else {
logger.debug("Duplicate position");
}
}
-
+
+ public long getOccurrencesPerBatch() {
+ return occurrencesPerBatch;
+ }
+
+ public void setOccurrencesPerBatch(long occurrencesPerBatch) {
+ this.occurrencesPerBatch = occurrencesPerBatch;
+ }
+
+ public boolean isHasDirectIndex() {
+ return hasDirectIndex;
+ }
+
+ public void setHasDirectIndex(boolean hasDirectIndex) {
+ this.hasDirectIndex = hasDirectIndex;
+ }
+
+ public File getIndexDirectory() {
+ return indexDirectory;
+ }
}
Modified:
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java
2013-12-20 08:57:15 UTC (rev 17191)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java
2013-12-20 10:56:20 UTC (rev 17192)
@@ -23,7 +23,9 @@
import gate.mimir.index.mg4j.GATEDocumentFactory;
import gate.mimir.index.mg4j.zipcollection.DocumentCollectionWriter;
import gate.mimir.index.mg4j.zipcollection.DocumentData;
+import it.unimi.di.big.mg4j.index.Index;
import it.unimi.di.big.mg4j.index.TermProcessor;
+import it.unimi.dsi.lang.ObjectParser;
import java.io.File;
import java.io.IOException;
@@ -118,6 +120,9 @@
logger.info("Creating zipped collection for field \"" + name + "\"");
collectionWriter = new DocumentCollectionWriter(indexDirectory);
}
+ // save the term processor
+ additionalProperties.setProperty(Index.PropertyKeys.TERMPROCESSOR,
+ ObjectParser.toSpec(termProcessor));
indexingThread = new Thread(this, "Mimir-" + name + " indexing thread");
indexingThread.start();
@@ -171,7 +176,8 @@
* order of offset.
*/
protected Annotation[] getAnnotsToProcess(GATEDocument gateDocument) {
- return gateDocument.getTokenAnnots();
+ Annotation[] tokens = gateDocument.getTokenAnnots();
+ return tokens;
}
/**
@@ -227,6 +233,4 @@
}
super.flush();
}
-
-
}
Modified:
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/MimirIndexBuilder.java
===================================================================
---
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/MimirIndexBuilder.java
2013-12-20 08:57:15 UTC (rev 17191)
+++
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/MimirIndexBuilder.java
2013-12-20 10:56:20 UTC (rev 17192)
@@ -16,14 +16,11 @@
import gate.Annotation;
import gate.mimir.IndexConfig;
+import gate.mimir.index.AtomicIndex;
import gate.mimir.index.IndexException;
import gate.mimir.index.Indexer;
import gate.util.GateRuntimeException;
import it.unimi.dsi.Util;
-import it.unimi.dsi.big.io.FileLinesCollection;
-import it.unimi.dsi.big.util.ShiftAddXorSignedStringMap;
-import it.unimi.dsi.big.util.StringMap;
-import it.unimi.dsi.bits.TransformationStrategies;
import it.unimi.dsi.fastutil.Arrays;
import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.Swapper;
@@ -60,7 +57,6 @@
import it.unimi.di.big.mg4j.tool.Concatenate;
import it.unimi.di.big.mg4j.tool.Scan;
import it.unimi.di.big.mg4j.tool.Scan.Completeness;
-import it.unimi.dsi.sux4j.mph.LcpMonotoneMinimalPerfectHashFunction;
import it.unimi.dsi.util.Properties;
import java.io.File;
@@ -999,7 +995,7 @@
.getStringArray(IndexCluster.PropertyKeys.LOCALINDEX);
combineBatches(inputBasename, getGlobalFile("").getAbsolutePath());
// save the termMap
- generateTermMap(getGlobalFile(DiskBasedIndex.TERMS_EXTENSION),
+
AtomicIndex.generateTermMap(getGlobalFile(DiskBasedIndex.TERMS_EXTENSION),
getGlobalFile(DiskBasedIndex.TERMMAP_EXTENSION));
// closing completed
closingProgress = 1;
@@ -1018,23 +1014,6 @@
}
/**
- * Given a terms file (text file with one term per line) this method generates
- * the corresponding termmap file (binary representation of a StringMap).
- * @param termsFile the input file
- * @param termmapFile the output file
- * @throws IOException
- */
- public static void generateTermMap(File termsFile, File termmapFile) throws
IOException {
- FileLinesCollection fileLinesCollection =
- new FileLinesCollection(termsFile.getAbsolutePath(), "UTF-8");
- StringMap<CharSequence> terms = new ShiftAddXorSignedStringMap(
- fileLinesCollection.iterator(),
- new LcpMonotoneMinimalPerfectHashFunction<CharSequence>(
- fileLinesCollection, TransformationStrategies.prefixFreeUtf16()));
- BinIO.storeObject(terms, termmapFile);
- }
-
- /**
* Combines a set of batches. If the provided number of input batches is
* greater than {@link #MAXIMUM_BATCHES_TO_COMBINE}, then this method will
* start hierarchical batch combination: it will combine
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/util/MG4JTools.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/util/MG4JTools.java
2013-12-20 08:57:15 UTC (rev 17191)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/util/MG4JTools.java
2013-12-20 10:56:20 UTC (rev 17192)
@@ -14,7 +14,7 @@
*/
package gate.mimir.util;
-import gate.mimir.index.mg4j.MimirIndexBuilder;
+import gate.mimir.index.AtomicIndex;
import gate.mimir.search.QueryEngine;
import it.unimi.di.big.mg4j.index.DiskBasedIndex;
import it.unimi.di.big.mg4j.index.Index;
@@ -66,7 +66,7 @@
// and generate the new one
File termsFile = new File(URI.create(indexUri.toString()
+ DiskBasedIndex.TERMS_EXTENSION));
- MimirIndexBuilder.generateTermMap(termsFile, termMapFile);
+ AtomicIndex.generateTermMap(termsFile, termMapFile);
} else {
throw new IOException("Could not rename old termmap file (" +
termMapFile.getAbsolutePath() + ").");
Modified: mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java
===================================================================
--- mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java
2013-12-20 08:57:15 UTC (rev 17191)
+++ mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java
2013-12-20 10:56:20 UTC (rev 17192)
@@ -230,13 +230,13 @@
inputQueue, outputQueue,
indexConfig.getTokenIndexers()[0],
false);
-
+ ati.setOccurrencesPerBatch(500000);
File zipFile = new File(args[1]);
String fileURI = zipFile.toURI().toString();
ZipFile zip = new ZipFile(args[1]);
Enumeration<? extends ZipEntry> entries = zip.entries();
- int copies = 1;
+ int copies = 100;
while(entries.hasMoreElements()) {
ZipEntry entry = entries.nextElement();
if(entry.isDirectory()) {
@@ -250,7 +250,6 @@
}
}
ati.close();
-
}
/**
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Rapidly troubleshoot problems before they affect your business. Most IT
organizations don't have a clear picture of how application performance
affects their revenue. With AppDynamics, you get 100% visibility into your
Java, .NET, & PHP application. Start your 15-day FREE TRIAL of AppDynamics Pro!
http://pubads.g.doubleclick.net/gampad/clk?id=84349831&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs