Author: rwesten
Date: Fri Nov 23 05:29:16 2012
New Revision: 1412756

URL: http://svn.apache.org/viewvc?rev=1412756&view=rev
Log:
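fix for STANBOL-813: filter invalid characters on the String itself instead of on its UTF-8 bytes, keeping tabs and line breaks intact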

Modified:
    
stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
    
stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java

Modified: 
stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java?rev=1412756&r1=1412755&r2=1412756&view=diff
==============================================================================
--- 
stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
 (original)
+++ 
stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
 Fri Nov 23 05:29:16 2012
@@ -445,17 +445,26 @@ public abstract class NEREngineCore 
         if (null == text) {
             return null;
         }
-        Charset UTF8 = Charset.forName("UTF-8");
-        byte[] bytes = text.getBytes(UTF8);
-        for (int i = 0; i < bytes.length; i++) {
-            byte ch = bytes[i];
+        StringBuilder sb = null; //initialised on the first replacement
+        for (int i = 0; i < text.length(); i++) {
+            int ch = text.codePointAt(i);
             // remove any characters outside the valid UTF-8 range as well as 
all control characters
             // except tabs and new lines
-            if (!((ch > 31 && ch < 253) || ch == '\t' || ch == '\n' || ch == 
'\r')) {
-                bytes[i] = ' ';
+            //NOTE: rwesten (2012-11-21) replaced the original check with the 
one
+            // found at 
http://blog.mark-mclaren.info/2007/02/invalid-xml-characters-when-valid-utf8_5873.html
+            if (!((ch == 0x9) ||
+                    (ch == 0xA) ||
+                    (ch == 0xD) ||
+                    ((ch >= 0x20) && (ch <= 0xD7FF)) ||
+                    ((ch >= 0xE000) && (ch <= 0xFFFD)) ||
+                    ((ch >= 0x10000) && (ch <= 0x10FFFF)))){
+                if(sb == null){
+                    sb = new StringBuilder(text);
+                }
+                sb.setCharAt(i, ' ');
             }
         }
-        return new String(bytes, UTF8);
+        return sb == null ? text : sb.toString();
     }
 
     /**

Modified: 
stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java?rev=1412756&r1=1412755&r2=1412756&view=diff
==============================================================================
--- 
stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
 (original)
+++ 
stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
 Fri Nov 23 05:29:16 2012
@@ -51,7 +51,7 @@ public class TestNamedEntityExtractionEn
             + " geologist who lived in New Zealand and worked at the 
University of Otago.";
     
     public static final String SINGLE_SENTENCE_WITH_CONTROL_CHARS = "Dr 
Patrick Marshall (1869 - November 1950) was a" 
-               + " \u0014geologist\u0015 who lived in New Zealand and worked 
at the University of Otago.";
+               + " \u0014geologist\u0015 who lived in New\tZealand and worked 
at the University\nof Otago.";
 
     public static final String MULTI_SENTENCES = "The life of Patrick 
Marshall\n\n"
             + "Dr Patrick Marshall (1869 - November 1950) was a"
@@ -135,6 +135,8 @@ public class TestNamedEntityExtractionEn
         NameOccurrence firstOccurrence = pmOccurrences.get(0);
         assertEquals("Patrick Marshall", firstOccurrence.name);
         assertFalse(firstOccurrence.context.contains("\u0014"));
+        assertTrue(firstOccurrence.context.contains("\t"));
+        assertTrue(firstOccurrence.context.contains("\n"));
     }
     
     @Test


Reply via email to