Author: rwesten
Date: Fri Nov 23 05:29:16 2012
New Revision: 1412756
URL: http://svn.apache.org/viewvc?rev=1412756&view=rev
Log:
fix for STANBOL-813
Modified:
stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
Modified:
stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java?rev=1412756&r1=1412755&r2=1412756&view=diff
==============================================================================
---
stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
(original)
+++
stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
Fri Nov 23 05:29:16 2012
@@ -445,17 +445,26 @@ public abstract class NEREngineCore
if (null == text) {
return null;
}
- Charset UTF8 = Charset.forName("UTF-8");
- byte[] bytes = text.getBytes(UTF8);
- for (int i = 0; i < bytes.length; i++) {
- byte ch = bytes[i];
+ StringBuilder sb = null; //initialised on the first replacement
+ for (int i = 0; i < text.length(); i++) {
+ int ch = text.codePointAt(i);
// remove any characters outside the valid UTF-8 range as well as all control characters
// except tabs and new lines
- if (!((ch > 31 && ch < 253) || ch == '\t' || ch == '\n' || ch == '\r')) {
- bytes[i] = ' ';
+ //NOTE: rewesten (2012-11-21) replaced the original check with the one
+ // found at http://blog.mark-mclaren.info/2007/02/invalid-xml-characters-when-valid-utf8_5873.html
+ if (!((ch == 0x9) ||
+ (ch == 0xA) ||
+ (ch == 0xD) ||
+ ((ch >= 0x20) && (ch <= 0xD7FF)) ||
+ ((ch >= 0xE000) && (ch <= 0xFFFD)) ||
+ ((ch >= 0x10000) && (ch <= 0x10FFFF)))){
+ if(sb == null){
+ sb = new StringBuilder(text);
+ }
+ sb.setCharAt(i, ' ');
}
}
- return new String(bytes, UTF8);
+ return sb == null ? text : sb.toString();
}
/**
Modified:
stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java?rev=1412756&r1=1412755&r2=1412756&view=diff
==============================================================================
---
stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
(original)
+++
stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
Fri Nov 23 05:29:16 2012
@@ -51,7 +51,7 @@ public class TestNamedEntityExtractionEn
+ " geologist who lived in New Zealand and worked at the University of Otago.";
public static final String SINGLE_SENTENCE_WITH_CONTROL_CHARS = "Dr Patrick Marshall (1869 - November 1950) was a"
- + " \u0014geologist\u0015 who lived in New Zealand and worked at the University of Otago.";
+ + " \u0014geologist\u0015 who lived in New\tZealand and worked at the University\nof Otago.";
public static final String MULTI_SENTENCES = "The life of Patrick Marshall\n\n"
+ "Dr Patrick Marshall (1869 - November 1950) was a"
@@ -135,6 +135,8 @@ public class TestNamedEntityExtractionEn
NameOccurrence firstOccurrence = pmOccurrences.get(0);
assertEquals("Patrick Marshall", firstOccurrence.name);
assertFalse(firstOccurrence.context.contains("\u0014"));
+ assertTrue(firstOccurrence.context.contains("\t"));
+ assertTrue(firstOccurrence.context.contains("\n"));
}
@Test