Revision: 17248
http://sourceforge.net/p/gate/code/17248
Author: valyt
Date: 2014-01-27 17:17:30 +0000 (Mon, 27 Jan 2014)
Log Message:
-----------
A more sensible implementation for the document collection functionality. I'm
hoping this will be a better starting point for the
compact-in-a-background-thread function.
Modified Paths:
--------------
mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollection.java
mimir/branches/5.0/mimir-core/src/gate/mimir/search/QueryEngine.java
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java 2014-01-24 17:07:59 UTC (rev 17247)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java 2014-01-27 17:17:30 UTC (rev 17248)
@@ -535,10 +535,10 @@
* @param documentID
* the ID of the document to be obtained.
* @return the {@link DocumentData} associated with the given document ID.
- * @throws IndexException
+ * @throws IOException
*/
public synchronized DocumentData getDocumentData(long documentID)
- throws IndexException {
+ throws IndexException, IOException {
if(isDeleted(documentID)) {
throw new IndexException("Invalid document ID " + documentID);
}
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollection.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollection.java 2014-01-24 17:07:59 UTC (rev 17247)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollection.java 2014-01-27 17:17:30 UTC (rev 17248)
@@ -16,15 +16,31 @@
import gate.mimir.index.IndexException;
-import gate.mimir.index.Indexer;
-import it.unimi.dsi.fastutil.ints.IntArrayList;
-import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.longs.Long2ObjectLinkedOpenHashMap;
-import java.io.*;
-import java.util.*;
-import java.util.concurrent.BlockingQueue;
-import java.util.zip.*;
+import java.io.BufferedOutputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Enumeration;
+import java.util.List;
+import java.util.NoSuchElementException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipException;
+import java.util.zip.ZipFile;
+import java.util.zip.ZipOutputStream;
import org.apache.log4j.Logger;
@@ -35,6 +51,7 @@
* serialised {@link DocumentData} values.
*/
public class DocumentCollection {
+
/**
* The maximum number of documents to be stored in the document cache.
*/
@@ -49,22 +66,151 @@
protected static final int INPUT_BUFFER_SIZE = 100;
/**
- * A simple {@link FilenameFilter} that only accepts the zip files that are
- * part of a collection.
- *
- * In order to be accepted, the file name needs to be in the form:
- * "{@value Indexer#MIMIR_COLLECTION_BASENAME}-number{@value Indexer#MIMIR_COLLECTION_EXTENSION}"
+ * Class representing one of the collection (zip) files.
*/
- private class CollectionFilenameFilter implements FilenameFilter{
- public boolean accept(File dir, String name) {
- return getZipFileId(name) != -1;
+ protected static class CollectionFile implements Comparable<CollectionFile> {
+ /**
+ * The filename for the zip collection.
+ */
+ public static final String MIMIR_COLLECTION_BASENAME = "mimir-collection-";
+
+ /**
+ * The file extension used for the mimir-specific relocatable zip collection
+ * definition.
+ */
+ public static final String MIMIR_COLLECTION_EXTENSION = ".zip";
+
+ /**
+ * Regex pattern that recognises a valid collection file name and its parts.
+ * The following capturing groups can be used when a match occurs:
+ * <ul>
+ * <li>1: the collection file ID</li>
+ * <li>2: the collection file number (the numeric part of the ID)</li>
+ * <li>3: (optional) the collection file suffix (the non-numeric part of the ID)</li>
+ * </ul>
+ */
+ protected static final Pattern MIMIR_COLLECTION_PATTERN = Pattern.compile(
+ "\\Q" + MIMIR_COLLECTION_BASENAME + "\\E((\\d+)(?:-([a-zA-Z]+))?)\\Q"+
+ MIMIR_COLLECTION_EXTENSION + "\\E");
+
+ protected static FilenameFilter FILENAME_FILTER = new FilenameFilter() {
+ @Override
+ public boolean accept(File dir, String name) {
+ return MIMIR_COLLECTION_PATTERN.matcher(name).matches();
+ }
+ };
+
+ protected ZipFile zipFile;
+
+ protected long firstEntry;
+
+ protected long lastEntry;
+
+ /**
+ * Each collection file has a number, and optionally a suffix. For example
+ * in "mimir-collection-0-a.zip", the number is 0, and the suffix
+ * is a.
+ */
+ protected int collectionFileNumber;
+
+
+ /**
+ * The size in bytes of the underlying file.
+ */
+ protected long length;
+
+ /**
+ * The number of documents contained.
+ */
+ protected int documentCount;
+
+ /**
+ * Given the name of a zip file, this method returns its ID: the part of the
+ * file name between the prefix ({@value DocumentCollection#MIMIR_COLLECTION_BASENAME}) and
+ * the suffix ({@value DocumentCollection#MIMIR_COLLECTION_EXTENSION}), or <code>null</code> if
+ * the name is not that of a valid collection file.
+ * @param fileName the file name to be parsed.
+ * @return the ID of the file, or <code>null</code>.
+ */
+ protected static String getCollectionFileId(String fileName){
+ Matcher m = MIMIR_COLLECTION_PATTERN.matcher(fileName);
+ return m.matches() ? m.group(1) : null;
}
+
+ protected static int getCollectionFileNumber(String fileName){
+ Matcher m = MIMIR_COLLECTION_PATTERN.matcher(fileName);
+ return m.matches() ? Integer.parseInt(m.group(2)) : -1;
+ }
+
+ public static String getCollectionFileName(String id) {
+ return MIMIR_COLLECTION_BASENAME + id + MIMIR_COLLECTION_EXTENSION;
+ }
+
+ public CollectionFile(File file) throws ZipException, IOException {
+
+ zipFile = new ZipFile(file);
+ Enumeration<? extends ZipEntry> entries = zipFile.entries();
+ firstEntry = Long.MAX_VALUE;
+ lastEntry = -1;
+ documentCount = 0;
+ while(entries.hasMoreElements()) {
+ ZipEntry anEntry = entries.nextElement();
+ String entryName = anEntry.getName();
+ try {
+ long entryId = Long.parseLong(entryName);
+ //update the current maximum and minimum
+ if(entryId > lastEntry) lastEntry = entryId;
+ if(entryId < firstEntry) firstEntry = entryId;
+ documentCount++;
+ } catch(NumberFormatException e) {
+ //not parseable -> we'll ignore this entry.
+ logger.warn("Unparseable zip entry name: " + entryName);
+ }
+ }
+ if(firstEntry == Long.MAX_VALUE) firstEntry = -1;
+ length = file.length();
+ }
+
+ @Override
+ public int compareTo(CollectionFile o) {
+ return Long.compare(firstEntry, o.firstEntry);
+ }
+
+ public boolean containsDocument(long documentID) {
+ return firstEntry <= documentID &&
+ documentID <= lastEntry &&
+ zipFile.getEntry(Long.toString(documentID)) != null;
+ }
+
+ public DocumentData getDocumentData(Long documentID) throws IOException {
+ ZipEntry entry = zipFile.getEntry(Long.toString(documentID));
+ if(entry == null) throw new NoSuchElementException(
+ "No entry found for document ID " + documentID);
+ ObjectInputStream ois = null;
+ try {
+ ois = new ObjectInputStream(
+ zipFile.getInputStream(entry));
+ return (DocumentData) ois.readObject();
+ } catch(ClassNotFoundException e) {
+ //invalid data read from the zip file
+ throw new IOException("Invalid data read from zip file!", e);
+ } finally {
+ if(ois != null) ois.close();
+ }
+ }
+
+
+ public void close() throws IOException {
+ zipFile.close();
+ }
}
+
+
/**
* The zip files containing the document collection.
*/
- protected List<ZipFile> zipFiles = null;
+ protected List<CollectionFile> collectionFiles = null;
private static Logger logger = Logger.getLogger(DocumentCollection.class);
@@ -74,15 +220,6 @@
protected File indexDirectory;
/**
- * The maximum entry number in each zip file. This array is aligned with
- * {@link #zipFiles}. The zip file at position <code>i</code> in
- * {@link #zipFiles} will contain the entries with numbers between
- * <code>maxEntries[i-1] + 1</code> and <code>maxEntries[i]</code>, inclusive.
- * By convention, <code>maxEntries[-1]=-1</code>.
- */
- protected IntList maxEntries = null;
-
- /**
* A cache of {@link DocumentData} values used for returning the various
* document details (title, URI, text).
*/
@@ -145,7 +282,7 @@
* The ID for the next document to be written. This value is initialised to 0
* and then is automatically incremented whenever a new document is written.
*/
- protected long documentId;
+ protected long nextDocumentId;
/**
@@ -154,28 +291,6 @@
protected int zipFileId;
/**
- * Given the name of a zip file, this method returns its ID (the numeric part
- * of the name), or -1 if the name is not that of a valid collection file.
- * @param fileName the file name to be parsed.
- * @return the ID of the file, or -1.
- */
- protected static int getZipFileId(String fileName){
- if(fileName.startsWith(Indexer.MIMIR_COLLECTION_BASENAME + "-") &&
- fileName.endsWith(Indexer.MIMIR_COLLECTION_EXTENSION)){
- String numberPart = fileName.substring(
- Indexer.MIMIR_COLLECTION_BASENAME.length() + 1,
- fileName.length() - Indexer.MIMIR_COLLECTION_EXTENSION.length());
-
- try {
- return Integer.parseInt(numberPart);
- } catch(NumberFormatException e) {
- //non-parseable
- return -1;
- }
- }
- return -1;
- }
- /**
* Opens a zip file and creates a DocumentCollection object for accessing the
* document data.
* @param indexDirectory
@@ -185,130 +300,72 @@
public DocumentCollection(File indexDirectory) throws IOException {
this.indexDirectory = indexDirectory;
- zipFiles = new ArrayList<ZipFile>();
- maxEntries = new IntArrayList();
+ collectionFiles = new ArrayList<CollectionFile>();
// prepare for reading
- for(File aCollectionFile : enumerateCollectionFiles()) {
- openCollectionFile(aCollectionFile);
+ for(File aCollectionFile :
indexDirectory.listFiles(CollectionFile.FILENAME_FILTER)) {
+ collectionFiles.add(new CollectionFile(aCollectionFile));
}
+ Collections.sort(collectionFiles);
+ // sanity check
+ for(int i = 0; i < collectionFiles.size() - 1; i++) {
+ CollectionFile first = collectionFiles.get(i);
+ CollectionFile second = collectionFiles.get(i + 1);
+ if(first.lastEntry >= second.firstEntry) {
+ throw new IOException(
+ "Invalid entries distribution: collection file " +
+ second.zipFile.getName() +
+ " contains an entry named \"" + second.firstEntry +
+ "\", but an entry with a larger-or-equal ID was " +
+ "already seen in a previous collection file!");
+ }
+ }
documentCache = new Long2ObjectLinkedOpenHashMap<DocumentData>();
// prepare for writing
byteArrayOS = new ByteArrayOutputStream();
- documentId = maxEntries.isEmpty() ? 0 :
- (maxEntries.getInt(maxEntries.size() -1) + 1);
- zipFileId = zipFiles.size();
+ nextDocumentId = collectionFiles.isEmpty() ? 0 :
+ (collectionFiles.get(collectionFiles.size() - 1).lastEntry + 1);
+ zipFileId = collectionFiles.size();
inputBuffer = new Long2ObjectLinkedOpenHashMap<DocumentData>();
}
+
/**
- * Gets the collection file in order.
- * @return
- */
- protected File[] enumerateCollectionFiles() {
- File[] collectionFiles = indexDirectory.listFiles(
- new CollectionFilenameFilter());
- //sort the files by ID
- Arrays.sort(collectionFiles, new Comparator<File>(){
- public int compare(File o1, File o2) {
- return getZipFileId(o1.getName()) - getZipFileId(o2.getName());
- }
- });
- return collectionFiles;
- }
-
- /**
- * Adds a new zip file to the collection.
- * @throws IndexException
- */
- protected synchronized void openCollectionFile(File collectionFile) throws
IOException {
- try {
- //for each file, open a ZipFile, parse the entries, set the maxEntry
value.
- ZipFile aZipFile = new ZipFile(collectionFile);
- int fileId = getZipFileId(collectionFile.getName());
- zipFiles.add(aZipFile);
- Enumeration<? extends ZipEntry> entries = aZipFile.entries();
- int maxEntryInFile = -1;
- while(entries.hasMoreElements()){
- ZipEntry anEntry = entries.nextElement();
- String entryName = anEntry.getName();
- try {
- int entryId = Integer.parseInt(entryName);
- //sanity check
- if(fileId > 0 && entryId <= maxEntries.get(fileId-1)){
- throw new IOException(
- "Invalid entries distribution: collection file " +
- collectionFile.getAbsolutePath() +
- " contains an entry named \"" + entryName +
- "\", but an entry with a larger-or-equal ID was " +
- "already seen in a previous collection file!");
- }
- //update the current maximum
- if(entryId > maxEntryInFile) maxEntryInFile = entryId;
- } catch(NumberFormatException e) {
- //not parseable -> we'll ignore this entry.
- logger.warn("Unparseable zip entry name: " + entryName);
- }
- }
- maxEntries.add(maxEntryInFile);
- } catch(ZipException e) {
- throw new IOException("Problem while reading collection file " +
- collectionFile.getAbsolutePath(), e);
- }
- }
-
- /**
* Gets the document data for a given document ID.
* @param documentID the ID of the document to be retrieved.
* @return a {@link DocumentData} object for the requested document ID.
* @throws IOException if there are problems accessing the underlying zip file;
* @throws NoSuchElementException if the requested document ID is not found.
*/
- public DocumentData getDocumentData(long documentID) throws IndexException{
+ public DocumentData getDocumentData(long documentID) throws IOException{
if(closed) throw new IllegalStateException(
"This document collection has already been closed!");
DocumentData documentData = null;
- if(documentID > maxEntries.get(maxEntries.size() - 1)) {
+ if(collectionFiles.isEmpty() ||
+ documentID > collectionFiles.get(collectionFiles.size() - 1).lastEntry)
{
// it's a new document that's not yet available from the zip files
documentData = inputBuffer.get(documentID);
- // (or a wrong ID)
- if(documentData == null) throw new NoSuchElementException(
- "No entry found for document ID " + documentID);
} else {
// it's an old document. Try the cache first
documentData = documentCache.getAndMoveToFirst(documentID);
if(documentData == null) {
// cache miss: we need to actually load it
//locate the right zip file
- int zipFileId = 0;
- while(zipFileId < maxEntries.size() && documentID >
maxEntries.get(zipFileId)){
- zipFileId++;
- }
- if(zipFileId >= maxEntries.size()){
- //entry not found (entry number too large)
- throw new NoSuchElementException("No entry found for document ID " +
- documentID + ". Document ID too large for this collection!");
- }
-
- ZipEntry entry =
zipFiles.get(zipFileId).getEntry(Long.toString(documentID));
- if(entry == null) throw new NoSuchElementException(
- "No entry found for document ID " + documentID);
- try {
- ObjectInputStream ois = new
ObjectInputStream(zipFiles.get(zipFileId).getInputStream(entry));
- documentData = (DocumentData) ois.readObject();
- ois.close();
- documentCache.putAndMoveToFirst(documentID, documentData);
- if(documentCache.size() > DOCUMENT_DATA_CACHE_SIZE) {
- documentCache.removeLast();
+ files: for(CollectionFile aColFile : collectionFiles) {
+ if(aColFile.containsDocument(documentID)) {
+ // we found the collection file containing the document
+ documentData = aColFile.getDocumentData(nextDocumentId);
+ documentCache.putAndMoveToFirst(documentID, documentData);
+ if(documentCache.size() > DOCUMENT_DATA_CACHE_SIZE) {
+ documentCache.removeLast();
+ }
+ break files;
}
- } catch(ClassNotFoundException e) {
- //invalid data read from the zip file
- throw new IndexException("Invalid data read from zip file!", e);
- } catch(IOException e) {
- throw new IndexException("Exception reading zip file!", e);
}
}
}
+ if(documentData == null) throw new NoSuchElementException(
+ "No entry found for document ID " + documentID);
return documentData;
}
@@ -335,13 +392,13 @@
//move to the next zip file
closeZipFile();
// open the newly-closed zip file in read mode
- openCollectionFile(zipFile);
+ collectionFiles.add(new CollectionFile(zipFile));
zipFileId++;
openZipFile();
}
// create a new entry in the current zip file
- ZipEntry entry = new ZipEntry(Long.toString(documentId++));
+ ZipEntry entry = new ZipEntry(Long.toString(nextDocumentId++));
zipOuputStream.putNextEntry(entry);
//write the data
byteArrayOS.writeTo(zipOuputStream);
@@ -355,7 +412,7 @@
throw new IndexException("Problem while accessing the collection file",
e);
} finally {
// save the document data to the input buffer
- inputBuffer.put(documentId, document);
+ inputBuffer.put(nextDocumentId, document);
}
}
@@ -366,10 +423,8 @@
* be opened for writing.
*/
protected void openZipFile() throws IndexException{
- zipFile = new File(indexDirectory,
- Indexer.MIMIR_COLLECTION_BASENAME +
- "-" + zipFileId +
- Indexer.MIMIR_COLLECTION_EXTENSION);
+ zipFile = new File(indexDirectory,
+ CollectionFile.getCollectionFileName(Integer.toString(zipFileId)));
if(zipFile.exists()) throw new IndexException("Collection zip file (" +
zipFile.getAbsolutePath() + ") already exists!");
@@ -404,16 +459,16 @@
closeZipFile();
// close the reader
closed = true;
- if(zipFiles != null){
- for(ZipFile aZipFile : zipFiles){
+ if(collectionFiles != null){
+ for(CollectionFile colFile : collectionFiles){
try {
- aZipFile.close();
+ colFile.close();
} catch(IOException e) {
// ignore
}
}
- zipFiles.clear();
- zipFiles = null;
+ collectionFiles.clear();
+ collectionFiles = null;
}
documentCache.clear();
}
@@ -424,7 +479,7 @@
ZipOutputStream outputStream = null;
long outFileSize = 0;
int outFileEntries = 0;
- for(File inputFile : enumerateCollectionFiles()) {
+ for(File inputFile :
indexDirectory.listFiles(CollectionFile.FILENAME_FILTER)) {
ZipFile inputZipFile = new ZipFile(inputFile);
if(outputStream == null) {
// we're not currently writing because all files so far have been OK
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/search/QueryEngine.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/search/QueryEngine.java 2014-01-24 17:07:59 UTC (rev 17247)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/search/QueryEngine.java 2014-01-27 17:17:30 UTC (rev 17248)
@@ -614,7 +614,12 @@
*/
public String[][] getRightContext(Binding hit, int numTokens)
throws IndexException {
- DocumentData docData = index.getDocumentData(hit.getDocumentId());
+ DocumentData docData;
+ try {
+ docData = index.getDocumentData(hit.getDocumentId());
+ } catch(IOException e) {
+ throw new IndexException(e);
+ }
int startOffset = hit.getTermPosition() + hit.getLength();
if(startOffset >= docData.getTokens().length) {
// hit is at the end of the document
@@ -653,7 +658,11 @@
*/
public String[][] getText(long documentID, int termPosition, int length)
throws IndexException {
- return index.getDocumentData(documentID).getText(termPosition, length);
+ try {
+ return index.getDocumentData(documentID).getText(termPosition, length);
+ } catch(IOException e) {
+ throw new IndexException(e);
+ }
}
/**
@@ -679,11 +688,19 @@
}
public String getDocumentTitle(long docID) throws IndexException {
- return index.getDocumentData(docID).getDocumentTitle();
+ try {
+ return index.getDocumentData(docID).getDocumentTitle();
+ } catch(IOException e) {
+ throw new IndexException(e);
+ }
}
public String getDocumentURI(long docID) throws IndexException {
- return index.getDocumentData(docID).getDocumentURI();
+ try {
+ return index.getDocumentData(docID).getDocumentURI();
+ } catch(IOException e) {
+ throw new IndexException(e);
+ }
}
/**
@@ -700,7 +717,11 @@
*/
public Serializable getDocumentMetadataField(long docID, String fieldName)
throws IndexException {
- return index.getDocumentData(docID).getMetadataField(fieldName);
+ try {
+ return index.getDocumentData(docID).getMetadataField(fieldName);
+ } catch(IOException e) {
+ throw new IndexException(e);
+ }
}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
CenturyLink Cloud: The Leader in Enterprise Cloud Services.
Learn Why More Businesses Are Choosing CenturyLink Cloud For
Critical Workloads, Development Environments & Everything In Between.
Get a Quote or Start a Free Trial Today.
http://pubads.g.doubleclick.net/gampad/clk?id=119420431&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs