Revision: 17303
          http://sourceforge.net/p/gate/code/17303
Author:   valyt
Date:     2014-02-13 18:44:43 +0000 (Thu, 13 Feb 2014)
Log Message:
-----------
Copied non-UTF content guard from the trunk.

Modified Paths:
--------------
    mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java

Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java    2014-02-13 18:35:43 UTC (rev 17302)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java    2014-02-13 18:44:43 UTC (rev 17303)
@@ -24,6 +24,14 @@
 
 import java.io.File;
 import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CodingErrorAction;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.concurrent.BlockingQueue;
@@ -44,6 +52,22 @@
    */
   private static final String[] DO_NOT_INDEX = new String[]{};
   
+  
+  protected static final CharsetEncoder UTF8_CHARSET_ENCODER = Charset.forName("UTF-8").newEncoder();
+  
+  protected static final CharsetDecoder UTF8_CHARSET_DECODER = Charset.forName("UTF-8").newDecoder();
+  
+  static {
+    try {
+      UTF8_CHARSET_ENCODER.replaceWith("[UNMAPPED]".getBytes("UTF-8"));
+     UTF8_CHARSET_ENCODER.onMalformedInput(CodingErrorAction.REPLACE);
+      UTF8_CHARSET_ENCODER.onUnmappableCharacter(CodingErrorAction.REPLACE);
+    } catch(UnsupportedEncodingException e) {
+      // this should never happen
+     throw new RuntimeException("UTF-8 not supported");
+    }
+  }
+  
   /**
    * Is this token index responsible for writing the zip collection?
    */
@@ -186,6 +210,19 @@
           GATEDocument gateDocument) throws IndexException {
     FeatureMap tokenFeatures = ann.getFeatures();
     String value = (String)tokenFeatures.get(featureName);
+    // make sure we get valid UTF-8 content
+   // illegal strings will simply be rendered as "[UNMAPPED]"
+    try {
+      CharBuffer cb = CharBuffer.wrap(value);
+      ByteBuffer bb = UTF8_CHARSET_ENCODER.encode(cb);
+      cb = UTF8_CHARSET_DECODER.decode(bb);
+      value  = cb.toString();
+    } catch(CharacterCodingException e) {
+      // this should not happen
+      value = null;
+      logger.error("Error while normalizing input", e);
+    }
+    
     currentTerm.replace(value == null ? "" : value);
     //save the *unprocessed* term to the collection, if required.
     if(zipCollectionEnabled) {

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Android apps run on BlackBerry 10
Introducing the new BlackBerry 10.2.1 Runtime for Android apps.
Now with support for Jelly Bean, Bluetooth, Mapview and more.
Get your Android app in front of a whole new audience.  Start now.
http://pubads.g.doubleclick.net/gampad/clk?id=124407151&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to