mimir

valyt Fri, 14 Feb 2014 07:33:48 -0800

Revision: 17308
          http://sourceforge.net/p/gate/code/17308
Author:   valyt
Date:     2014-02-14 15:33:24 +0000 (Fri, 14 Feb 2014)
Log Message:
-----------
- MimirIndex: finessing the index maintenance in the background threads.
- AtomicIndex: charset encoder/decoder are not thread-safe, so they cannot be static fields.


Modified Paths:
--------------
    mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
    mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java
    mimir/branches/5.0/mimir-core/src/gate/mimir/index/DocumentCollection.java

Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java        2014-02-14 11:47:27 UTC (rev 17307)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java        2014-02-14 15:33:24 UTC (rev 17308)
@@ -175,6 +175,8 @@
    *   to disk. It finds these by consuming the {@link Future}s in 
    *   {@link MimirIndex#syncRequests}.</li>
    *   <li>Compact the index when too many on-disk batches have been created.</li>
+   *   <li>compact the document collection when too many archive files have 
+   *   been created.</li>
    * </ul>
    */
   protected class IndexMaintenanceRunner2 implements Runnable {
@@ -195,7 +197,30 @@
                   break;
                 }
               }
-              if(compactNeeded) compactIndexSync();
+              if(compactNeeded){
+                logger.debug("Compacting sub-indexes");
+                compactIndexSync();
+              }
+              if(documentCollection.getArchiveCount() >  indexConfig.getMaximumBatches()) {
+                try {
+                  logger.debug("Compacting document collection");
+                  compactDocumentCollection();
+                } catch(Exception e) {
+                  logger.error("Error while compacting document collection. "
+                      + "Index is now invalid. Closing index to avoid further damage.",
+                      e);
+                  try {
+                    close();
+                  } catch(InterruptedException e1) {
+                    logger.error("Received interrupt request while closing "
+                        + "operation in progress", e);
+                    Thread.currentThread().interrupt();
+                  } catch(IOException e1) {
+                    logger.error("Further IO exception while closing index.", e1);
+                  }
+                }
+              }
+              
             }
           } catch(ExecutionException e) {
             // a sync request has failed. The index may be damaged, so we will
@@ -209,13 +234,10 @@
                   + "to close index.", e1);
             }
           }
+          aTask = syncRequests.take();
         }
       } catch(InterruptedException e) {
-        if(closed) {
-          // we've just been told to give up: yay, holiday!
-        } else {
-          Thread.currentThread().interrupt();
-        }
+        Thread.currentThread().interrupt();
       }
     }
     
@@ -226,22 +248,7 @@
      */
     protected void compactIndexSync() throws InterruptedException {
       List<Future<Void>> futures = requestCompactIndex();
-      try {
-        compactDocumentCollection();
-      } catch(Exception e) {
-        logger.error("Error while compacting document collection. "
-            + "Index is now invalid. Closing index to avoid further damage.",
-            e);
-        try {
-          close();
-        } catch(InterruptedException e1) {
-          logger.error("Received interrupt request while closing "
-              + "operation in progress", e);
-          Thread.currentThread().interrupt();
-        } catch(IOException e1) {
-          logger.error("Further IO exception while closing index.", e1);
-        }
-      }
+
       for(Future<Void> f : futures){
         try {
           f.get();
@@ -295,7 +302,7 @@
    * {@link TimerTask} used to regularly dump the latest document to an on-disk
    * batch, allowing them to become searchable.
    */
-  protected class DumpToDiskTask extends TimerTask {
+  protected class SyncToDiskTask extends TimerTask {
     @Override
     public void run() {
       if(occurrencesInRam > 0) {
@@ -370,7 +377,7 @@
    * documents become searcheable after at most {@link #timeBetweenBatches} #
    * milliseconds.
    */
-  private volatile transient DumpToDiskTask syncToDiskTask;
+  private volatile transient SyncToDiskTask syncToDiskTask;
   
   /**
    * The token indexes, in the order they are listed in the {@link #indexConfig}.
@@ -486,29 +493,42 @@
     occurrencesInRam = 0;
     syncRequests = new LinkedBlockingQueue<Future<Long>>();
     
+    // #####################
+    // Prepare for searching
+    // #####################
+    readDeletedDocs();
+    
+    // #####################
+    // Index maintenance 
+    // #####################
     // start the documents collector thread
     maintenanceThread = new Thread(new IndexMaintenanceRunner(),
         indexDirectory.getAbsolutePath() + " index maintenance");
+    maintenanceThread.setUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler() {
+      @Override
+      public void uncaughtException(Thread t, Throwable e) {
+        logger.error("Uncaught exception in background tread", e);
+      }
+    });
     maintenanceThread.start();
     
     // start the occurrences subtractor thread
     maintenanceThread2 = new Thread(
         new IndexMaintenanceRunner2(),
-        indexDirectory.getAbsolutePath() + " Occurrences subtractor");
+        indexDirectory.getAbsolutePath() + " index maintenance 2");
     maintenanceThread2.setPriority(Thread.MIN_PRIORITY);
+    maintenanceThread2.setUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler() {
+      @Override
+      public void uncaughtException(Thread t, Throwable e) {
+        logger.error("Uncaught exception in background tread", e);
+      }
+    });
     maintenanceThread2.start();
     
-    // #####################
-    // Prepare for searching
-    // #####################
-    readDeletedDocs();
-    
-    // #####################
-    // Index maintenance 
-    // #####################
+    // start the timer for regular sync-ing, and maintenance of the deleted docs
     maintenanceTimer = new Timer("Mímir index maintenance timer");
     synchronized(maintenanceTimer) {
-      syncToDiskTask = new DumpToDiskTask();
+      syncToDiskTask = new SyncToDiskTask();
       maintenanceTimer.schedule(syncToDiskTask, 
           indexConfig.getTimeBetweenBatches(), 
           indexConfig.getTimeBetweenBatches());
@@ -702,7 +722,7 @@
         if(syncToDiskTask != null) {
           syncToDiskTask.cancel();
         }
-        syncToDiskTask = new DumpToDiskTask();
+        syncToDiskTask = new SyncToDiskTask();
         maintenanceTimer.schedule(syncToDiskTask, timeBetweenBatches, 
             timeBetweenBatches);
       }

Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java    2014-02-14 11:47:27 UTC (rev 17307)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java    2014-02-14 15:33:24 UTC (rev 17308)
@@ -53,21 +53,10 @@
   private static final String[] DO_NOT_INDEX = new String[]{};
   
   
-  protected static final CharsetEncoder UTF8_CHARSET_ENCODER = Charset.forName("UTF-8").newEncoder();
+  protected final CharsetEncoder UTF8_CHARSET_ENCODER = Charset.forName("UTF-8").newEncoder();
   
-  protected static final CharsetDecoder UTF8_CHARSET_DECODER = Charset.forName("UTF-8").newDecoder();
+  protected final CharsetDecoder UTF8_CHARSET_DECODER = Charset.forName("UTF-8").newDecoder();
   
-  static {
-    try {
-      UTF8_CHARSET_ENCODER.replaceWith("[?]".getBytes("UTF-8"));
-      UTF8_CHARSET_ENCODER.onMalformedInput(CodingErrorAction.REPLACE);
-      UTF8_CHARSET_ENCODER.onUnmappableCharacter(CodingErrorAction.REPLACE);
-    } catch(UnsupportedEncodingException e) {
-      // this should never happen
-      throw new RuntimeException("UTF-8 not supported");
-    }
-  }
-  
   /**
    * Is this token index responsible for writing the zip collection?
    */
@@ -130,6 +119,15 @@
     additionalProperties.setProperty(Index.PropertyKeys.TERMPROCESSOR, 
         ObjectParser.toSpec(termProcessor));
     
+    try {
+      UTF8_CHARSET_ENCODER.replaceWith("[?]".getBytes("UTF-8"));
+      UTF8_CHARSET_ENCODER.onMalformedInput(CodingErrorAction.REPLACE);
+      UTF8_CHARSET_ENCODER.onUnmappableCharacter(CodingErrorAction.REPLACE);
+    } catch(UnsupportedEncodingException e) {
+      // this should never happen
+      throw new RuntimeException("UTF-8 not supported");
+    }
+    
     indexingThread = new Thread(this, "Mimir-" + name + " indexing thread");
     indexingThread.start();
   }
@@ -212,16 +210,19 @@
     String value = (String)tokenFeatures.get(featureName);
     // make sure we get valid UTF-8 content
    // illegal strings will simply be rendered as "[UNMAPPED]"
-    try {
-      CharBuffer cb = CharBuffer.wrap(value);
-      ByteBuffer bb = UTF8_CHARSET_ENCODER.encode(cb);
-      cb = UTF8_CHARSET_DECODER.decode(bb);
-      value  = cb.toString();
-    } catch(CharacterCodingException e) {
-      // this should not happen
-      value = null;
-      logger.error("Error while normalizing input", e);
+    if(value != null) {
+      try {
+        CharBuffer cb = CharBuffer.wrap(value);
+        ByteBuffer bb = UTF8_CHARSET_ENCODER.encode(cb);
+        cb = UTF8_CHARSET_DECODER.decode(bb);
+        value  = cb.toString();
+      } catch(CharacterCodingException e) {
+        // this should not happen
+        value = null;
+        logger.error("Error while normalizing input", e);
+      }      
     }
+
     
     currentTerm.replace(value == null ? "" : value);
     //save the *unprocessed* term to the collection, if required.

Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/DocumentCollection.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/DocumentCollection.java  2014-02-14 11:47:27 UTC (rev 17307)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/DocumentCollection.java  2014-02-14 15:33:24 UTC (rev 17308)
@@ -518,7 +518,16 @@
     documentCache.clear();
   }
   
+  
   /**
+   * Returns the number of archive files in this collection.
+   * @return
+   */
+  public int getArchiveCount() {
+    return collectionFiles.size();
+  }
+  
+  /**
    * Combines multiple smaller collection files into larger ones. If multiple
    * consecutive collection files can be combined without exceeding the maximum
    * permitted sizes ({@link #ZIP_FILE_MAX_ENTRIES} and 

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Android apps run on BlackBerry 10
Introducing the new BlackBerry 10.2.1 Runtime for Android apps.
Now with support for Jelly Bean, Bluetooth, Mapview and more.
Get your Android app in front of a whole new audience.  Start now.
http://pubads.g.doubleclick.net/gampad/clk?id=124407151&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

[gate-cvs] SF.net SVN: gate:[17308] mimir/branches/5.0/mimir-core/src/gate/mimir

Reply via email to