Author: mikemccand
Date: Mon Oct 3 18:25:47 2011
New Revision: 1178494
URL: http://svn.apache.org/viewvc?rev=1178494&view=rev
Log:
TIKA-711: correctly handle optional hyphen from Word docs (.doc)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1178494&r1=1178493&r2=1178494&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Mon Oct 3 18:25:47 2011
@@ -8,6 +8,10 @@ Release 0.11 - Current Development
* TIKA-733: Try to be robust when an RTF has too many closing {'s vs
opening {'s.
+ * TIKA-711: From Word (.doc) documents we now extract optional hyphen
+ as Unicode zero-width space (U+200B), and non-breaking hyphen as
+ Unicode non-breaking hyphen (U+2011).
+
Release 0.10 - 09/25/2011
The most notable changes in Tika 0.10 over previous releases are:
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1178494&r1=1178493&r2=1178494&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Mon Oct 3 18:25:47 2011
@@ -50,6 +50,10 @@ import org.xml.sax.helpers.AttributesImp
public class WordExtractor extends AbstractPOIFSExtractor {
+ private static final char RECORD_SEPARATOR = 30;
+ private static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011';
+ private static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b';
+
public WordExtractor(ParseContext context) {
super(context);
}
@@ -271,6 +275,14 @@ public class WordExtractor extends Abstr
// Strip the table cell end marker
text = text.substring(0, text.length()-1);
}
+
+ // Copied from POI's org/apache/poi/hwpf/converter/AbstractWordConverter.processCharacters:
+
+ // Non-breaking hyphens are returned as char 30
+ text = text.replace((char) 30, UNICODECHAR_NONBREAKING_HYPHEN);
+
+ // Non-required hyphens to zero-width space
+ text = text.replace((char) 31, UNICODECHAR_ZERO_WIDTH_SPACE);
xhtml.characters(text);
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java?rev=1178494&r1=1178493&r2=1178494&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java Mon Oct 3 18:25:47 2011
@@ -164,15 +164,14 @@ public class TestParsers extends TikaTes
}
public void testOptionalHyphen() throws Exception {
- // TIKA-711: re-enable doc once it's fixed
- //final String[] extensions = new String[] {"ppt", "pptx", "doc", "docx", "rtf", "pdf"};
- final String[] extensions = new String[] {"ppt", "pptx", "docx", "rtf", "pdf"};
+ final String[] extensions = new String[] {"ppt", "pptx", "doc", "docx", "rtf", "pdf"};
for(String extension : extensions) {
File file = getResourceAsFile("/test-documents/testOptionalHyphen." + extension);
String content = ParseUtils.getStringContent(file, tc);
assertTrue("optional hyphen was not handled for '" + extension + "' file type: " + content,
content.contains("optionalhyphen") ||
content.contains("optional\u00adhyphen") || // soft hyphen
+ content.contains("optional\u200bhyphen") || // zero width space
content.contains("optional\u2027")); // hyphenation point
}