This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new d1a8bff TIKA-2459 -- fix special character handling
d1a8bff is described below
commit d1a8bff9faacb828a1039f7cc2c7f9e1f1d5e3fd
Author: tballison <[email protected]>
AuthorDate: Fri Sep 8 12:47:40 2017 -0400
TIKA-2459 -- fix special character handling
---
.../org/apache/tika/parser/microsoft/WordExtractor.java | 2 ++
.../apache/tika/parser/microsoft/WordParserTest.java | 6 ++++++
.../testWORD_specialControlCharacter1415.doc | Bin 0 -> 25600 bytes
3 files changed, 8 insertions(+)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index ff07fef..569c881 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -489,6 +489,8 @@ public class WordExtractor extends AbstractPOIFSExtractor {
controls = new ArrayList<CharacterRun>();
}
break;
+ } else if (cr.text().equals("\u0014\u0015")) {
+ has14 = true;
} else {
if (has14) {
texts.add(cr);
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index b399d09..b70ba72 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -621,5 +621,11 @@ public class WordParserTest extends TikaTest {
assertContains("\\s\\up 10(\u3068\u3046\u304D\u3087\u3046),\u6771\u4EAC",
getXML("testWORD_phonetic.doc").xml);
}
+
+ @Test
+ public void testSpecialControlCharacter() throws Exception {
+ //TIKA-2459
+ assertContains("Paragraph one", getXML("testWORD_specialControlCharacter1415.doc").xml);
+ }
}
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_specialControlCharacter1415.doc b/tika-parsers/src/test/resources/test-documents/testWORD_specialControlCharacter1415.doc
new file mode 100644
index 0000000..919126c
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_specialControlCharacter1415.doc differ
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].