Revision: 17244
http://sourceforge.net/p/gate/code/17244
Author: valyt
Date: 2014-01-22 17:30:47 +0000 (Wed, 22 Jan 2014)
Log Message:
-----------
We can now generate a set of files on disk that *look like* a direct index.
Still untested, so no idea whether the data it contains makes any sense at all.
Modified Paths:
--------------
mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java	2014-01-22 16:06:49 UTC (rev 17243)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java	2014-01-22 17:30:47 UTC (rev 17244)
@@ -528,6 +528,14 @@
return occurrencesPerBatch;
}
+ /**
+ * Gets the current estimated number of occurrences stored in RAM.
+ * @return the estimated number of occurrences currently held in RAM.
+ */
+ public long getOccurrencesInRam() {
+ return occurrencesInRam;
+ }
+
public int getIndexingQueueSize() {
return indexingQueueSize;
}
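For illustration only (not part of this commit): a minimal sketch of how client
code might poll the new getter to watch indexing memory pressure. The method
name maybeDumpBatch and the threshold are assumptions, not Mimir API.

    // Hypothetical usage sketch: react when the estimated number of
    // occurrences held in RAM grows past an application-chosen threshold.
    void maybeDumpBatch(gate.mimir.MimirIndex index) {
      final long threshold = 10000000L; // assumed limit, tune as needed
      if(index.getOccurrencesInRam() > threshold) {
        // e.g. trigger writing the in-RAM batches to disk here
      }
    }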
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java	2014-01-22 16:06:49 UTC (rev 17243)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java	2014-01-22 17:30:47 UTC (rev 17244)
@@ -26,6 +26,7 @@
import it.unimi.di.big.mg4j.index.Index;
import it.unimi.di.big.mg4j.index.IndexReader;
import it.unimi.di.big.mg4j.index.IndexWriter;
+import it.unimi.di.big.mg4j.index.NullTermProcessor;
import it.unimi.di.big.mg4j.index.QuasiSuccinctIndex;
import it.unimi.di.big.mg4j.index.QuasiSuccinctIndexWriter;
import it.unimi.di.big.mg4j.index.SkipBitStreamIndexWriter;
@@ -63,6 +64,7 @@
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;
+import it.unimi.dsi.lang.ObjectParser;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.sux4j.mph.LcpMonotoneMinimalPerfectHashFunction;
import it.unimi.dsi.util.BloomFilter;
@@ -79,6 +81,7 @@
import java.net.URISyntaxException;
import java.nio.ByteOrder;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
@@ -225,7 +228,7 @@
if(firstDocumentPointer < 0) firstDocumentPointer = pointer;
if(lastDocumentPointer == -1) {
// this is the first document
- documentPointersDifferential.add(0);
+ documentPointersDifferential.add(0);
} else {
// close previous document
flush();
@@ -251,6 +254,10 @@
}
}
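+ /** Sets the count for the current document directly (used by the direct index code, which stores counts but no positions). */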
+ public void setCount(int count) {
+ this.count = count;
+ }
+
/**
* Checks whether the given position is valid (i.e. greater than the last
* seen positions. If the position is invalid, this means that a call to
@@ -286,8 +293,10 @@
public void write(IndexWriter indexWriter) throws IOException {
flush();
if(indexWriter instanceof QuasiSuccinctIndexWriter) {
- ((QuasiSuccinctIndexWriter)indexWriter).newInvertedList(frequency,
- occurrences, sumMaxPos);
+ ((QuasiSuccinctIndexWriter)indexWriter).newInvertedList(
+ frequency,
+ occurrences,
+ positions != null ? sumMaxPos : 0);
} else {
indexWriter.newInvertedList();
}
@@ -295,16 +304,17 @@
indexWriter.writeFrequency(frequency);
long currDocumentPointer = firstDocumentPointer;
int positionsStart = 0;
-
for(int docId = 0; docId < documentPointersDifferential.size(); docId++) {
currDocumentPointer += documentPointersDifferential.get(docId);
int currCount = counts.get(docId);
OutputBitStream obs = indexWriter.newDocumentRecord();
indexWriter.writeDocumentPointer(obs, currDocumentPointer);
indexWriter.writePositionCount(obs, currCount);
- indexWriter.writeDocumentPositions(obs, positions.elements(),
- positionsStart, currCount, -1);
- positionsStart += currCount;
+ if(positions != null){
+ indexWriter.writeDocumentPositions(obs, positions.elements(),
+ positionsStart, currCount, -1);
+ positionsStart += currCount;
+ }
}
}
@@ -461,6 +471,20 @@
}
/**
+ * Converts a long value into a String containing a zero-padded Hex
+ * representation of the input value. The lexicographic ordering of the
+ * generated strings is the same as the natural order of the corresponding
+ * non-negative long values.
+ *
+ * @param value the value to convert.
+ * @return the string representation.
+ */
+ public static final String longToTerm(long value) {
+ String valueStr = Long.toHexString(value);
+ return "0000000000000000".substring(valueStr.length()) + valueStr;
+ }
+
+ /**
* The file name (under the current directory for this atomic index) which
* stores the principal index.
*/
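As a quick aside, a self-contained (hypothetical) check of the ordering
property documented above; note it holds for non-negative values, since
Long.toHexString of a negative long yields the unsigned two's-complement form:

    // Hypothetical check, not part of the commit: zero-padding to 16 hex
    // digits makes lexicographic string order match numeric order.
    public class LongToTermCheck {
      public static void main(String[] args) {
        String nine = gate.mimir.index.AtomicIndex.longToTerm(9L);  // "0000000000000009"
        String ten = gate.mimir.index.AtomicIndex.longToTerm(10L);  // "000000000000000a"
        System.out.println(nine.compareTo(ten) < 0); // true: same order as 9 < 10
      }
    }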
@@ -561,6 +585,12 @@
*/
protected Properties additionalProperties;
+ /**
+ * A set of properties added to the ones obtained from the direct index writer
+ * when writing out batches.
+ */
+ protected Properties additionalDirectProperties;
+
protected boolean hasDirectIndex;
protected Thread indexingThread;
@@ -662,6 +692,11 @@
this.additionalProperties = new Properties();
+ if(hasDirectIndex) {
+ additionalDirectProperties = new Properties();
+ additionalDirectProperties.setProperty(Index.PropertyKeys.TERMPROCESSOR,
+ ObjectParser.toSpec(NullTermProcessor.getInstance()));
+ }
initIndex();
}
@@ -787,7 +822,8 @@
int numTermsInRAM = termMap.size();
logger.info( "Generating index for batch " + newTailName +
"; documents: " + documentsInRAM + "; terms:" + numTermsInRAM +
- "; occurrences: " + occurrencesInRAM );
+ "; occurrences: " + occurrencesInRAM +
+ " / " + parent.getOccurrencesInRam());
// We write down all terms in appearance order in termArray.
final MutableString[] termArray = termMap.keySet().toArray(new MutableString[ numTermsInRAM ]);
@@ -869,30 +905,31 @@
throw new IndexException("Error while saving tail properties", e);
}
+ if(hasDirectIndex) {
+ writeDirectIndex(newTailDir, termArray);
+ }
+ // update parent
+ parent.subtractOccurrences(occurrencesInRAM);
+
+ // clear out internal state, in preparation for the next tail
+ newBatch();
+
// merge new tail into index cluster
try {
// modify internal state
synchronized(this) {
subIndexes.add(openSubIndex(newTailName));
- indexCluster = openIndexCluster(subIndexes, termProcessor);
+ indexCluster = openIndexCluster(subIndexes, termProcessor);
+ if(hasDirectIndex) {
+ // TODO
+ // merge the new direct batch into the direct cluster
+ }
}
} catch(Exception e) {
throw new IndexException("Could not open the index just written to " +
mg4jBasename , e);
}
-
- if(hasDirectIndex) {
- //TODO
- // dump new direct tail (invert the tail just written)
- // merge new direct tail into direct index cluster
- }
- // update parent
- parent.subtractOccurrences(occurrencesInRAM);
-
- // clear out internal state, in preparation for the next tail
- newBatch();
-
// notify "listeners"
synchronized(this) {
if(tailWriteRequested != null) {
@@ -903,6 +940,151 @@
}
/**
+ * Writes the direct index data for the current batch to disk.
+ * @param batchDir the directory in which the batch files are created.
+ * @param termArray the in-RAM terms, sorted lexicographically
+ * @throws IOException if writing the index files fails.
+ * @throws IndexException if the index properties cannot be saved.
+ */
+ protected void writeDirectIndex(File batchDir, MutableString[] termArray)
+ throws IOException, IndexException {
+ // The index we are writing is a direct index, so we give it new terms
+ // which are actually document IDs, and they have posting lists containing
+ // document IDs, which are actually termIDs.
+
+ //1. invert index data in RAM
+ Object2ReferenceOpenHashMap<MutableString, PostingsList> docMap =
+ new Object2ReferenceOpenHashMap<MutableString,
+ PostingsList>(INITIAL_TERM_MAP_SIZE, Hash.FAST_LOAD_FACTOR );
+ MutableString docIdStr = new MutableString();
+ // we now read the posting lists for all the terms, in ascending term order
+ for(int termId = 0; termId < termArray.length; termId++) {
+ PostingsList termPostings = termMap.get(termArray[termId]);
+ long docPointer = termPostings.firstDocumentPointer;
+ for(int i = 0; i < termPostings.documentPointersDifferential.size(); i++) {
+ docPointer += termPostings.documentPointersDifferential.get(i);
+ int count = termPostings.counts.getInt(i);
+ // convert data to the correct type
+ docIdStr.replace(longToTerm(docPointer));
+ // at this point we have term, document, counts so we can write the data
+ // to the in-RAM direct index
+ PostingsList docPostings = docMap.get(docIdStr);
+ if(docPostings == null) {
+ docPostings = new PostingsList(false);
+ docMap.put(docIdStr.copy(), docPostings);
+ }
+ docPostings.newDocumentPointer(termId);
+ docPostings.setCount(count);
+ docPostings.flush();
+ }
+ }
+
+ // 2. write the data from RAM
+ String mg4jBasename = new File(batchDir, name + "-dir").getAbsolutePath();
+ // copy the default compression flags, and remove positions
+ Map<Component, Coding> flags = new HashMap<Component, Coding>(
+ CompressionFlags.DEFAULT_QUASI_SUCCINCT_INDEX);
+ flags.remove(Component.POSITIONS);
+ QuasiSuccinctIndexWriter directIndexWriter =
+ new QuasiSuccinctIndexWriter(
+ IOFactory.FILESYSTEM_FACTORY,
+ mg4jBasename,
+ termArray.length,
+ Fast.mostSignificantBit(QuasiSuccinctIndex.DEFAULT_QUANTUM),
+ QuasiSuccinctIndexWriter.DEFAULT_CACHE_SIZE,
+ flags,
+ ByteOrder.nativeOrder());
+
+ // sort all the docIds
+ final MutableString[] docArray = docMap.keySet().toArray(new MutableString[ docMap.size() ]);
+ // We sort the document IDs appearing in the batch and write them on disk.
+ Arrays.quickSort(0, docArray.length,
+ new IntComparator() {
+ @Override
+ public int compare(Integer one, Integer other) {
+ return compare(one.intValue(), other.intValue());
+ }
+
+ @Override
+ public int compare(int one, int other) {
+ return docArray[one].compareTo(docArray[other]);
+ }
+ },
+ new Swapper() {
+ @Override
+ public void swap(int one, int other) {
+ MutableString temp = docArray[one];
+ docArray[one] = docArray[other];
+ docArray[other] = temp;
+ }
+ });
+
+ BloomFilter<Void> docBloomFilter = BloomFilter.create(docArray.length);
+ PrintWriter pw = new PrintWriter(
+ new OutputStreamWriter(new FastBufferedOutputStream(
+ new FileOutputStream(mg4jBasename + DiskBasedIndex.TERMS_EXTENSION),
+ 64 * 1024),
+ "UTF-8" ));
+ for (MutableString t : docArray ) {
+ t.println( pw );
+ docBloomFilter.add(t);
+ }
+ pw.close();
+ generateTermMap(new File(mg4jBasename + DiskBasedIndex.TERMS_EXTENSION),
+ new File(mg4jBasename + DiskBasedIndex.TERMMAP_EXTENSION), null);
+ // write the bloom filter
+ BinIO.storeObject(docBloomFilter,
+ new File(mg4jBasename + DocumentalCluster.BLOOM_EXTENSION));
+ // write the sizes file
+ File sizesFile = new File(mg4jBasename + DiskBasedIndex.SIZES_EXTENSION);
+ OutputBitStream sizesStream = new OutputBitStream(sizesFile);
+ int maxTermSize = -1; // -1 means unknown
+ for(MutableString term : termArray) {
+ int termSize = (int)termMap.get(term).frequency;
+ sizesStream.writeGamma(termSize);
+ if(termSize > maxTermSize) maxTermSize = termSize;
+ }
+ sizesStream.close();
+
+ // write the actual index
+ int maxCount = 0;
+ long occurrences = 0;
+ for ( int i = 0; i < docArray.length; i++ ) {
+ PostingsList postingsList = docMap.get( docArray[ i ] );
+ if ( maxCount < postingsList.maxCount ) maxCount = postingsList.maxCount;
+ postingsList.write(directIndexWriter);
+ occurrences += postingsList.occurrences;
+ }
+ directIndexWriter.close();
+ // write the index properties
+ try {
+ Properties properties = directIndexWriter.properties();
+ additionalDirectProperties.setProperty( Index.PropertyKeys.SIZE,
+ directIndexWriter.writtenBits());
+ // -1 means unknown
+ additionalDirectProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE,
+ maxTermSize);
+ additionalDirectProperties.setProperty( Index.PropertyKeys.MAXCOUNT, maxCount );
+ additionalDirectProperties.setProperty( Index.PropertyKeys.OCCURRENCES,
+ occurrences);
+ properties.addAll(additionalDirectProperties);
+ Scan.saveProperties( IOFactory.FILESYSTEM_FACTORY, properties,
+ mg4jBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
+
+ // write stats
+ PrintStream statsPs = new PrintStream(new File(mg4jBasename +
+ DiskBasedIndex.STATS_EXTENSION));
+ directIndexWriter.printStats(statsPs);
+ statsPs.close();
+ } catch(ConfigurationException e) {
+ // this should never happen
+ throw new IndexException("Error while saving tail properties", e);
+ }
+
+ }
+
+
+ /**
* Combines all the currently existing sub-indexes, generating a new
* head index.
* @throws IndexException
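To make the inversion performed by writeDirectIndex above easier to follow,
here is a minimal standalone sketch of the same idea using plain JDK
collections instead of the MG4J/fastutil types. Everything here except
longToTerm is hypothetical, and positions are deliberately dropped, as in the
real code.

    // Hypothetical sketch: invert term -> (docPointer -> count) postings
    // into doc -> (termId -> count) postings. Documents become the "terms"
    // of the direct index, keyed by their zero-padded hex ID; a TreeMap
    // reproduces the sorted order the real code obtains via quickSort.
    import java.util.List;
    import java.util.Map;
    import java.util.TreeMap;

    public class DirectIndexSketch {
      // postingsPerTerm.get(termId) maps document pointer -> in-document count
      public static Map<String, Map<Integer, Integer>> invert(
          List<Map<Long, Integer>> postingsPerTerm) {
        Map<String, Map<Integer, Integer>> docMap =
            new TreeMap<String, Map<Integer, Integer>>();
        for(int termId = 0; termId < postingsPerTerm.size(); termId++) {
          for(Map.Entry<Long, Integer> e :
              postingsPerTerm.get(termId).entrySet()) {
            String docIdStr =
                gate.mimir.index.AtomicIndex.longToTerm(e.getKey());
            Map<Integer, Integer> docPostings = docMap.get(docIdStr);
            if(docPostings == null) {
              docPostings = new TreeMap<Integer, Integer>();
              docMap.put(docIdStr, docPostings);
            }
            docPostings.put(termId, e.getValue());
          }
        }
        return docMap;
      }
    }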
@@ -1320,7 +1502,6 @@
* @return the string for the given term.
*/
public String getTerm(long termId) {
- // TODO: list() is an optional operation, so this may be null
return getIndex().termMap.list().get(termId).toString();
}
Modified: mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java
===================================================================
--- mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java	2014-01-22 16:06:49 UTC (rev 17243)
+++ mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java	2014-01-22 17:30:47 UTC (rev 17244)
@@ -81,9 +81,9 @@
public class Scratch {
public static void main (String[] args) throws Exception {
-// mainIndexer5(args);
+ mainIndexer5(args);
- mainSimple(args);
+// mainSimple(args);
// mainDirectIndexes(args);
// mainBuildDirectIndex(args);
@@ -247,7 +247,7 @@
ZipFile zip = new ZipFile(args[1]);
Enumeration<? extends ZipEntry> entries = zip.entries();
- int copies = 100;
+ int copies = 10;
boolean compress = false;
ResourceData docRd =
Gate.getCreoleRegister().get(DocumentImpl.class.getName());
while(entries.hasMoreElements()) {