Revision: 17194
http://sourceforge.net/p/gate/code/17194
Author: valyt
Date: 2013-12-20 11:25:05 +0000 (Fri, 20 Dec 2013)
Log Message:
-----------
Renamed some variables.
Modified Paths:
--------------
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
2013-12-20 11:07:26 UTC (rev 17193)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
2013-12-20 11:25:05 UTC (rev 17194)
@@ -218,7 +218,7 @@
flush();
if(indexWriter instanceof QuasiSuccinctIndexWriter) {
((QuasiSuccinctIndexWriter)indexWriter).newInvertedList(frequency,
- occurrences, sumMaxPos);
+ occurrences, sumMaxPos);
} else {
indexWriter.newInvertedList();
}
@@ -314,12 +314,12 @@
* The size (number of terms) for the longest document indexed but not yet
* saved.
*/
- protected int maxDocSizeInRam = -1;
+ protected int maxDocSizeInRAM = -1;
/**
* The number of occurrences represented in RAM and not yet written to disk.
*/
- protected long occurrencesInRam = 0;
+ protected long occurrencesInRAM = 0;
/**
* How many occurrences to be accumulated in RAM before a new tail batch is
@@ -365,11 +365,18 @@
protected MutableString currentTerm;
/**
- * The current document pointer (gets incremented for each document).
+ * The current document pointer (gets incremented for each document). This is
+ * a global for all documents in the atomic index, in all the batches.
*/
protected long documentPointer;
+
/**
+ * The number of documents currently stored in RAM.
+ */
+ protected long documentsInRAM;
+
+ /**
* An in-memory inverted index that gets dumped to files for each batch.
*/
protected Object2ReferenceOpenHashMap<MutableString, PostingsList> termMap;
@@ -413,8 +420,9 @@
indexDirectory.mkdirs();
documentPointer = 0;
}
- occurrencesInRam = 0;
- maxDocSizeInRam = -1;
+ occurrencesInRAM = 0;
+ maxDocSizeInRAM = -1;
+ documentsInRAM = 0;
termMap = new Object2ReferenceOpenHashMap<MutableString,
PostingsList>(INITIAL_TERM_MAP_SIZE, Hash.FAST_LOAD_FACTOR );
}
@@ -473,19 +481,19 @@
QuasiSuccinctIndexWriter indexWriter = new QuasiSuccinctIndexWriter(
IOFactory.FILESYSTEM_FACTORY,
mg4jBasename,
- documentPointer,
+ documentsInRAM,
Fast.mostSignificantBit(QuasiSuccinctIndex.DEFAULT_QUANTUM),
QuasiSuccinctIndexWriter.DEFAULT_CACHE_SIZE,
CompressionFlags.DEFAULT_QUASI_SUCCINCT_INDEX,
ByteOrder.nativeOrder());
// write the data from RAM
- int numTerms = termMap.size();
+ int numTermsInRAM = termMap.size();
logger.info( "Generating index for batch " + newTailName +
- "; documents: " + documentPointer + "; terms:" + numTerms +
- "; occurrences: " + occurrencesInRam );
+ "; documents: " + documentsInRAM + "; terms:" + numTermsInRAM +
+ "; occurrences: " + occurrencesInRAM );
// We write down all terms in appearance order in termArray.
- final MutableString[] termArray = termMap.keySet().toArray(new
MutableString[ numTerms ]);
+ final MutableString[] termArray = termMap.keySet().toArray(new
MutableString[ numTermsInRAM ]);
// We sort the terms appearing in the batch and write them on disk.
Arrays.quickSort(0, termArray.length,
new IntComparator() {
@@ -522,7 +530,7 @@
// write the actual index
int maxCount = 0;
- for ( int i = 0; i < numTerms; i++ ) {
+ for ( int i = 0; i < numTermsInRAM; i++ ) {
PostingsList postingsList = termMap.get( termArray[ i ] );
if ( maxCount < postingsList.maxCount ) maxCount = postingsList.maxCount;
postingsList.write(indexWriter);
@@ -535,10 +543,10 @@
indexWriter.writtenBits());
// -1 means unknown
additionalProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE,
- maxDocSizeInRam);
+ maxDocSizeInRAM);
additionalProperties.setProperty( Index.PropertyKeys.MAXCOUNT, maxCount
);
additionalProperties.setProperty( Index.PropertyKeys.OCCURRENCES,
- occurrencesInRam );
+ occurrencesInRAM );
properties.addAll(additionalProperties);
Scan.saveProperties( IOFactory.FILESYSTEM_FACTORY, properties,
mg4jBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
@@ -557,8 +565,12 @@
// clear out internal state, in preparation for the next tail
- occurrencesInRam = 0;
- maxDocSizeInRam = -1;
+ occurrencesInRAM = 0;
+ maxDocSizeInRAM = -1;
+ documentsInRAM = 0;
+ // hack to force zero-based batches. This forces us to use a merged cluster
+ // but avoids exceptions during indexing. We'll fix this if possible,
after
+ // we get advice on the MG4J mailing list.
documentPointer = 0;
termMap.clear();
termMap.trim( INITIAL_TERM_MAP_SIZE );
@@ -603,7 +615,7 @@
logger.error("Problem while indexing document!", e);
}
//dump batch if needed AND there is data to dump
- if (occurrencesPerBatch > 0 && occurrencesInRam >
occurrencesPerBatch){
+ if (occurrencesPerBatch > 0 && occurrencesInRAM >
occurrencesPerBatch){
writeCurrentTail();
}
outputQueue.put(aDocument);
@@ -722,12 +734,13 @@
}
// the current document is finished
int docLength = tokenPosition + 1;
- if(docLength > maxDocSizeInRam) maxDocSizeInRam = docLength;
+ if(docLength > maxDocSizeInRAM) maxDocSizeInRAM = docLength;
} catch (IOException e) {
throw new IndexException("IO Exception while indexing", e);
}finally {
documentEnding(gateDocument);
documentPointer++;
+ documentsInRAM++;
}
}
@@ -749,7 +762,7 @@
//for duplicate values.
if(termPostings.checkPosition(tokenPosition)){
termPostings.addPosition(tokenPosition);
- occurrencesInRam++;
+ occurrencesInRAM++;
} else {
logger.debug("Duplicate position");
}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Rapidly troubleshoot problems before they affect your business. Most IT
organizations don't have a clear picture of how application performance
affects their revenue. With AppDynamics, you get 100% visibility into your
Java, .NET, & PHP applications. Start your 15-day FREE TRIAL of AppDynamics Pro!
http://pubads.g.doubleclick.net/gampad/clk?id=84349831&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs