mimir

valyt Mon, 27 Jan 2014 09:22:07 -0800

Revision: 17248
          http://sourceforge.net/p/gate/code/17248
Author:   valyt
Date:     2014-01-27 17:17:30 +0000 (Mon, 27 Jan 2014)
Log Message:
-----------
A more sensible implementation for the document collection functionality. I'm 
hoping this will be a better starting point for the 
compact-in-a-background-thread function.


Modified Paths:
--------------
    mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
    
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollection.java
    mimir/branches/5.0/mimir-core/src/gate/mimir/search/QueryEngine.java

Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java        
2014-01-24 17:07:59 UTC (rev 17247)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java        
2014-01-27 17:17:30 UTC (rev 17248)
@@ -535,10 +535,10 @@
    * @param documentID
    *          the ID of the document to be obtained.
    * @return the {@link DocumentData} associated with the given document ID.
-   * @throws IndexException
+   * @throws IOException 
    */
   public synchronized DocumentData getDocumentData(long documentID)
-  throws IndexException {
+  throws IndexException, IOException {
     if(isDeleted(documentID)) {
       throw new IndexException("Invalid document ID " + documentID);
     }

Modified: 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollection.java
===================================================================
--- 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollection.java
       2014-01-24 17:07:59 UTC (rev 17247)
+++ 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollection.java
       2014-01-27 17:17:30 UTC (rev 17248)
@@ -16,15 +16,31 @@
 
 
 import gate.mimir.index.IndexException;
-import gate.mimir.index.Indexer;
-import it.unimi.dsi.fastutil.ints.IntArrayList;
-import it.unimi.dsi.fastutil.ints.IntList;
 import it.unimi.dsi.fastutil.longs.Long2ObjectLinkedOpenHashMap;
 
-import java.io.*;
-import java.util.*;
-import java.util.concurrent.BlockingQueue;
-import java.util.zip.*;
+import java.io.BufferedOutputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Enumeration;
+import java.util.List;
+import java.util.NoSuchElementException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipException;
+import java.util.zip.ZipFile;
+import java.util.zip.ZipOutputStream;
 
 import org.apache.log4j.Logger;
 
@@ -35,6 +51,7 @@
  * serialised {@link DocumentData} values.
  */
 public class DocumentCollection {
+  
   /**
    * The maximum number of documents to be stored in the document cache.
    */
@@ -49,22 +66,151 @@
   protected static final int INPUT_BUFFER_SIZE = 100;
   
   /**
-   * A simple {@link FilenameFilter} that only accepts the zip files that are
-   * part of a collection.
-   * 
-   * In order to be accepted, the file name needs to be in the form:
-   * &quot;{@value Indexer#MIMIR_COLLECTION_BASENAME}-number{@value 
Indexer#MIMIR_COLLECTION_EXTENSION}&quot;
+   * Class representing one of the collection (zip) files.
    */
-  private class CollectionFilenameFilter implements FilenameFilter{
-    public boolean accept(File dir, String name) {
-      return getZipFileId(name) != -1;
+  protected static class CollectionFile implements Comparable<CollectionFile> {
+    /**
+     * The filename for the zip collection.
+     */
+    public static final String MIMIR_COLLECTION_BASENAME = "mimir-collection-";
+    
+    /**
+     * The file extension used for the mimir-specific relocatable zip 
collection
+     * definition.
+     */
+    public static final String MIMIR_COLLECTION_EXTENSION = ".zip";
+    
+    /**
+     * Regex pattern that recognises a valid collection file name and its 
parts.
+     * The following capturing groups can be used when a match occurs:
+     * <ul>
+     *   <li>1: the collection file ID</li>
+     *   <li>2: the collection file number (the numeric part of the ID)</li>
+     *   <li>3: (optional) the collection file suffix (the non-numeric part of 
the ID)</li>
+     * </ul>   
+     */
+    protected static final Pattern MIMIR_COLLECTION_PATTERN = Pattern.compile(
+        "\\Q" + MIMIR_COLLECTION_BASENAME + "\\E((\\d+)(?:-([a-zA-Z]+))?)\\Q"+
+        MIMIR_COLLECTION_EXTENSION + "\\E");
+    
+    protected static FilenameFilter FILENAME_FILTER = new FilenameFilter() {
+      @Override
+      public boolean accept(File dir, String name) {
+        return MIMIR_COLLECTION_PATTERN.matcher(name).matches();
+      }
+    };
+    
+         protected ZipFile zipFile;
+         
+         protected long firstEntry;
+         
+         protected long lastEntry;
+
+         /**
+          * Each collection file has a number, and optionally a suffix. For 
example
+          * in &quot;mimir-collection-0-a.zip&quot;, the number is 0, and the 
suffix
+          * is a.
+          */
+         protected int collectionFileNumber;
+         
+         
+         /**
+          * The size in bytes of the underlying file.
+          */
+         protected long length;
+         
+         /**
+          * The number of documents contained.
+          */
+         protected int documentCount;
+
+         /**
+     * Given the name of a zip file, this method returns its ID: the part of 
the 
+     * file name between the prefix ({@value 
CollectionFile#MIMIR_COLLECTION_BASENAME}) and
+     * the suffix ({@value CollectionFile#MIMIR_COLLECTION_EXTENSION}), or 
<code>null</code> if 
+     * the name is not that of a valid collection file.
+     * @param fileName the file name to be parsed.
+     * @return the ID of the file, or <code>null</code>.
+     */
+    protected static String getCollectionFileId(String fileName){
+      Matcher m = MIMIR_COLLECTION_PATTERN.matcher(fileName);
+      return m.matches() ? m.group(1) : null;
     }
+
+    protected static int getCollectionFileNumber(String fileName){
+      Matcher m = MIMIR_COLLECTION_PATTERN.matcher(fileName);
+      return m.matches() ? Integer.parseInt(m.group(2)) : -1;
+    }
+    
+    public static String getCollectionFileName(String id) {
+      return MIMIR_COLLECTION_BASENAME + id + MIMIR_COLLECTION_EXTENSION;
+    }
+    
+         public CollectionFile(File file) throws ZipException, IOException {
+           
+      zipFile = new ZipFile(file);      
+      Enumeration<? extends ZipEntry> entries = zipFile.entries();
+      firstEntry = Long.MAX_VALUE;
+      lastEntry = -1;
+      documentCount = 0;
+      while(entries.hasMoreElements()) {
+        ZipEntry anEntry = entries.nextElement();
+        String entryName = anEntry.getName();
+        try {
+          long entryId = Long.parseLong(entryName);
+          //update the current maximum and minimum
+          if(entryId > lastEntry) lastEntry = entryId;
+          if(entryId < firstEntry) firstEntry = entryId;
+          documentCount++;
+        } catch(NumberFormatException e) {
+          //not parseable -> we'll ignore this entry.
+          logger.warn("Unparseable zip entry name: " + entryName);
+        }
+      }
+      if(firstEntry == Long.MAX_VALUE) firstEntry = -1;
+      length = file.length();
+         }
+         
+    @Override
+    public int compareTo(CollectionFile o) {
+      return Long.compare(firstEntry, o.firstEntry);
+    }
+    
+    public boolean containsDocument(long documentID) {
+      return firstEntry <= documentID && 
+          documentID <= lastEntry &&
+          zipFile.getEntry(Long.toString(documentID)) != null;
+    }
+    
+    public DocumentData getDocumentData(Long documentID) throws IOException {
+      ZipEntry entry = zipFile.getEntry(Long.toString(documentID));
+      if(entry == null) throw new NoSuchElementException(
+          "No entry found for document ID " + documentID);
+      ObjectInputStream ois = null;
+      try {
+        ois = new ObjectInputStream(
+            zipFile.getInputStream(entry));
+        return (DocumentData) ois.readObject();
+      } catch(ClassNotFoundException e) {
+        //invalid data read from the zip file
+        throw new IOException("Invalid data read from zip file!", e);
+      } finally {
+        if(ois != null) ois.close();
+      }
+    }
+    
+    
+    public void close() throws IOException {
+      zipFile.close();
+    }
   }
+
   
+  
   /**
    * The zip files containing the document collection.
    */
-  protected List<ZipFile> zipFiles = null;
+  protected List<CollectionFile> collectionFiles = null;
   
   private static Logger logger = Logger.getLogger(DocumentCollection.class);
   
@@ -74,15 +220,6 @@
   protected File indexDirectory;
   
   /**
-   * The maximum entry number in each zip file. This array is aligned with 
-   * {@link #zipFiles}. The zip file at position <code>i</code> in 
-   * {@link #zipFiles} will contain the entries with numbers between 
-   * <code>maxEntries[i-1] + 1</code> and <code>maxEntries[i]</code>, 
inclusive.
-   * By convention, <code>maxEntries[-1]=-1</code>.
-   */
-  protected IntList maxEntries = null;
-  
-  /**
    * A cache of {@link DocumentData} values used for returning the various
    * document details (title, URI, text).
    */
@@ -145,7 +282,7 @@
    * The ID for the next document to be written. This value is initialised to 0
    * and then is automatically incremented whenever a new document is written.
    */
-  protected long documentId;
+  protected long nextDocumentId;
   
 
   /**
@@ -154,28 +291,6 @@
   protected int zipFileId;
   
   /**
-   * Given the name of a zip file, this method returns its ID (the numeric 
part 
-   * of the name), or -1 if the name is not that of a valid collection file.
-   * @param fileName the file name to be parsed.
-   * @return the ID of the file, or -1.
-   */
-  protected static int getZipFileId(String fileName){
-    if(fileName.startsWith(Indexer.MIMIR_COLLECTION_BASENAME + "-") &&
-            fileName.endsWith(Indexer.MIMIR_COLLECTION_EXTENSION)){
-      String numberPart = fileName.substring(
-             Indexer.MIMIR_COLLECTION_BASENAME.length() + 1,
-             fileName.length() - Indexer.MIMIR_COLLECTION_EXTENSION.length());
-      
-      try {
-        return Integer.parseInt(numberPart);
-      } catch(NumberFormatException e) {
-        //non-parseable
-        return -1;
-      }
-    }
-    return -1; 
-  }
-  /**
    * Opens a zip file and creates a DocumentCollection object for accessing 
the 
    * document data.
    * @param indexDirectory
@@ -185,130 +300,72 @@
   public DocumentCollection(File indexDirectory) throws IOException {
     this.indexDirectory = indexDirectory;
     
-    zipFiles = new ArrayList<ZipFile>();
-    maxEntries = new IntArrayList();
+    collectionFiles = new ArrayList<CollectionFile>();
     // prepare for reading
-    for(File aCollectionFile : enumerateCollectionFiles()) {
-      openCollectionFile(aCollectionFile);
+    for(File aCollectionFile : 
indexDirectory.listFiles(CollectionFile.FILENAME_FILTER)) {
+      collectionFiles.add(new CollectionFile(aCollectionFile));
     }
+    Collections.sort(collectionFiles);
+    // sanity check
+    for(int i = 0;  i < collectionFiles.size() - 1; i++) {
+      CollectionFile first = collectionFiles.get(i);
+      CollectionFile second = collectionFiles.get(i + 1);
+      if(first.lastEntry >= second.firstEntry) {
+        throw new IOException(
+            "Invalid entries distribution: collection file " + 
+            second.zipFile.getName() + 
+            " contains an entry named \"" + second.firstEntry + 
+            "\", but an entry with a larger-or-equal ID was " +
+            "already seen in a previous collection file!");          
+      }
+    }
     documentCache = new Long2ObjectLinkedOpenHashMap<DocumentData>();
     
     // prepare for writing
     byteArrayOS = new ByteArrayOutputStream();
-    documentId = maxEntries.isEmpty() ? 0 : 
-        (maxEntries.getInt(maxEntries.size() -1) + 1);
-    zipFileId = zipFiles.size();
+    nextDocumentId = collectionFiles.isEmpty() ? 0 : 
+        (collectionFiles.get(collectionFiles.size() - 1).lastEntry + 1);
+    zipFileId = collectionFiles.size();
     inputBuffer = new Long2ObjectLinkedOpenHashMap<DocumentData>();
   }
   
+
   /**
-   * Gets the collection file in order.
-   * @return
-   */
-  protected File[] enumerateCollectionFiles() {
-    File[] collectionFiles = indexDirectory.listFiles(
-        new CollectionFilenameFilter());
-    //sort the files by ID
-    Arrays.sort(collectionFiles, new Comparator<File>(){
-      public int compare(File o1, File o2) {
-        return getZipFileId(o1.getName()) - getZipFileId(o2.getName());
-      }
-    });
-    return collectionFiles;
-  }
-  
-  /**
-   * Adds a new zip file to the collection.
-   * @throws IndexException 
-   */
-  protected synchronized void openCollectionFile(File collectionFile) throws 
IOException {
-    try {
-      //for each file, open a ZipFile, parse the entries, set the maxEntry 
value.
-      ZipFile aZipFile = new ZipFile(collectionFile);
-      int fileId = getZipFileId(collectionFile.getName());
-      zipFiles.add(aZipFile);
-      Enumeration<? extends ZipEntry> entries = aZipFile.entries();
-      int maxEntryInFile = -1;
-      while(entries.hasMoreElements()){
-        ZipEntry anEntry = entries.nextElement();
-        String entryName = anEntry.getName();
-        try {
-          int entryId = Integer.parseInt(entryName);
-          //sanity check
-          if(fileId > 0 && entryId <= maxEntries.get(fileId-1)){
-            throw new IOException(
-                    "Invalid entries distribution: collection file " + 
-                    collectionFile.getAbsolutePath() + 
-                    " contains an entry named \"" + entryName + 
-                    "\", but an entry with a larger-or-equal ID was " +
-                    "already seen in a previous collection file!");
-          }
-          //update the current maximum
-          if(entryId > maxEntryInFile) maxEntryInFile = entryId;
-        } catch(NumberFormatException e) {
-          //not parseable -> we'll ignore this entry.
-          logger.warn("Unparseable zip entry name: " + entryName);
-        }
-      }
-      maxEntries.add(maxEntryInFile);
-    } catch(ZipException e) {
-      throw new IOException("Problem while reading collection file " + 
-              collectionFile.getAbsolutePath(), e);
-    }
-  }
-  
-  /**
    * Gets the document data for a given document ID.
    * @param documentID the ID of the document to be retrieved.
    * @return a {@link DocumentData} object for the requested document ID.
    * @throws IOException if there are problems accessing the underlying zip 
file; 
    * @throws NoSuchElementException if the requested document ID is not found.
    */
-  public DocumentData getDocumentData(long documentID) throws IndexException{
+  public DocumentData getDocumentData(long documentID) throws IOException{
     if(closed) throw new IllegalStateException(
             "This document collection has already been closed!");
     DocumentData documentData = null;
-    if(documentID > maxEntries.get(maxEntries.size() - 1)) {
+    if(collectionFiles.isEmpty() ||
+       documentID > collectionFiles.get(collectionFiles.size() - 1).lastEntry) 
{
       // it's a new document that's not yet available from the zip files
       documentData = inputBuffer.get(documentID);
-      // (or a wrong ID)
-      if(documentData == null) throw new NoSuchElementException(
-          "No entry found for document ID " + documentID);
     } else {
       // it's an old document. Try the cache first
       documentData = documentCache.getAndMoveToFirst(documentID);
       if(documentData == null) {
         // cache miss: we need to actually load it
         //locate the right zip file
-        int zipFileId = 0;
-        while(zipFileId < maxEntries.size() && documentID > 
maxEntries.get(zipFileId)){
-          zipFileId++;
-        }
-        if(zipFileId >= maxEntries.size()){
-          //entry not found (entry number too large)
-          throw new NoSuchElementException("No entry found for document ID " + 
-                  documentID + ". Document ID too large for this collection!");
-        }
-        
-        ZipEntry entry = 
zipFiles.get(zipFileId).getEntry(Long.toString(documentID));
-        if(entry == null) throw new NoSuchElementException(
-            "No entry found for document ID " + documentID);
-        try {
-          ObjectInputStream ois = new 
ObjectInputStream(zipFiles.get(zipFileId).getInputStream(entry));
-          documentData = (DocumentData) ois.readObject();
-          ois.close();
-          documentCache.putAndMoveToFirst(documentID, documentData);
-          if(documentCache.size() > DOCUMENT_DATA_CACHE_SIZE) {
-            documentCache.removeLast();
+        files: for(CollectionFile aColFile : collectionFiles) {
+          if(aColFile.containsDocument(documentID)) {
+            // found the file holding it: load the requested ID (not the write counter)
+            documentData = aColFile.getDocumentData(documentID);
+            documentCache.putAndMoveToFirst(documentID, documentData);
+            if(documentCache.size() > DOCUMENT_DATA_CACHE_SIZE) {
+              documentCache.removeLast();
+            }
+            break files;
           }
-        } catch(ClassNotFoundException e) {
-          //invalid data read from the zip file
-          throw new IndexException("Invalid data read from zip file!", e);
-        } catch(IOException e) {
-          throw new IndexException("Exception reading zip file!", e);
         }
       }
     }
+    if(documentData == null) throw new NoSuchElementException(
+        "No entry found for document ID " + documentID);
     return documentData;  
   }
   
@@ -335,13 +392,13 @@
         //move to the next zip file
         closeZipFile();
         // open the newly-closed zip file in read mode
-        openCollectionFile(zipFile);
+        collectionFiles.add(new CollectionFile(zipFile));
         zipFileId++;
         openZipFile();
       }
 
       // create a new entry in the current zip file
-      ZipEntry entry = new ZipEntry(Long.toString(documentId++));
+      ZipEntry entry = new ZipEntry(Long.toString(nextDocumentId++));
       zipOuputStream.putNextEntry(entry);
       //write the data
       byteArrayOS.writeTo(zipOuputStream);
@@ -355,7 +412,7 @@
       throw new IndexException("Problem while accessing the collection file", 
e);
     } finally {
       // save the document data to the input buffer
-      inputBuffer.put(documentId, document);
+      inputBuffer.put(nextDocumentId, document);
     }
   }
   
@@ -366,10 +423,8 @@
    * be opened for writing.
    */
   protected void openZipFile() throws IndexException{
-    zipFile = new File(indexDirectory, 
-            Indexer.MIMIR_COLLECTION_BASENAME + 
-            "-" + zipFileId +
-            Indexer.MIMIR_COLLECTION_EXTENSION);
+    zipFile = new File(indexDirectory,
+        CollectionFile.getCollectionFileName(Integer.toString(zipFileId)));
     if(zipFile.exists()) throw new IndexException("Collection zip file (" + 
             zipFile.getAbsolutePath() + ") already exists!");
     
@@ -404,16 +459,16 @@
     closeZipFile();
     // close the reader
     closed = true;
-    if(zipFiles != null){
-      for(ZipFile aZipFile : zipFiles){
+    if(collectionFiles != null){
+      for(CollectionFile colFile : collectionFiles){
         try {
-          aZipFile.close();
+          colFile.close();
         } catch(IOException e) {
           // ignore
         }
       }
-      zipFiles.clear();
-      zipFiles = null;      
+      collectionFiles.clear();
+      collectionFiles = null;      
     }
     documentCache.clear();
   }
@@ -424,7 +479,7 @@
     ZipOutputStream outputStream = null;
     long outFileSize = 0;
     int outFileEntries = 0;
-    for(File inputFile : enumerateCollectionFiles()) {
+    for(File inputFile : 
indexDirectory.listFiles(CollectionFile.FILENAME_FILTER)) {
       ZipFile inputZipFile = new ZipFile(inputFile);
       if(outputStream == null) {
         // we're not currently writing because all files so far have been OK

Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/search/QueryEngine.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/search/QueryEngine.java        
2014-01-24 17:07:59 UTC (rev 17247)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/search/QueryEngine.java        
2014-01-27 17:17:30 UTC (rev 17248)
@@ -614,7 +614,12 @@
    */
   public String[][] getRightContext(Binding hit, int numTokens)
   throws IndexException {
-    DocumentData docData = index.getDocumentData(hit.getDocumentId());
+    DocumentData docData;
+    try {
+      docData = index.getDocumentData(hit.getDocumentId());
+    } catch(IOException e) {
+      throw new IndexException(e);
+    }
     int startOffset = hit.getTermPosition() + hit.getLength();
     if(startOffset >= docData.getTokens().length) {
       // hit is at the end of the document
@@ -653,7 +658,11 @@
    */
   public String[][] getText(long documentID, int termPosition, int length)
   throws IndexException {
-    return index.getDocumentData(documentID).getText(termPosition, length);
+    try {
+      return index.getDocumentData(documentID).getText(termPosition, length);
+    } catch(IOException e) {
+      throw new IndexException(e); 
+    }
   }
 
   /**
@@ -679,11 +688,19 @@
   }
 
   public String getDocumentTitle(long docID) throws IndexException {
-    return index.getDocumentData(docID).getDocumentTitle();
+    try {
+      return index.getDocumentData(docID).getDocumentTitle();
+    } catch(IOException e) {
+      throw new IndexException(e);
+    }
   }
 
   public String getDocumentURI(long docID) throws IndexException {
-    return index.getDocumentData(docID).getDocumentURI();
+    try {
+      return index.getDocumentData(docID).getDocumentURI();
+    } catch(IOException e) {
+      throw new IndexException(e);
+    }
   }
 
   /**
@@ -700,7 +717,11 @@
    */
   public Serializable getDocumentMetadataField(long docID, String fieldName) 
       throws IndexException {
-    return index.getDocumentData(docID).getMetadataField(fieldName);
+    try {
+      return index.getDocumentData(docID).getMetadataField(fieldName);
+    } catch(IOException e) {
+      throw new IndexException(e);
+    }
   }
   
 

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
CenturyLink Cloud: The Leader in Enterprise Cloud Services.
Learn Why More Businesses Are Choosing CenturyLink Cloud For
Critical Workloads, Development Environments & Everything In Between.
Get a Quote or Start a Free Trial Today. 
http://pubads.g.doubleclick.net/gampad/clk?id=119420431&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

[gate-cvs] SF.net SVN: gate:[17248] mimir/branches/5.0/mimir-core/src/gate/mimir

Reply via email to