This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new d1a8bff  TIKA-2459 -- fix special character handling
d1a8bff is described below

commit d1a8bff9faacb828a1039f7cc2c7f9e1f1d5e3fd
Author: tballison <[email protected]>
AuthorDate: Fri Sep 8 12:47:40 2017 -0400

    TIKA-2459 -- fix special character handling
---
 .../org/apache/tika/parser/microsoft/WordExtractor.java |   2 ++
 .../apache/tika/parser/microsoft/WordParserTest.java    |   6 ++++++
 .../testWORD_specialControlCharacter1415.doc            | Bin 0 -> 25600 bytes
 3 files changed, 8 insertions(+)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index ff07fef..569c881 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -489,6 +489,8 @@ public class WordExtractor extends AbstractPOIFSExtractor {
                     controls = new ArrayList<CharacterRun>();
                 }
                 break;
+            } else if (cr.text().equals("\u0014\u0015")) {
+                has14 = true;
             } else {
                 if (has14) {
                     texts.add(cr);
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index b399d09..b70ba72 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -621,5 +621,11 @@ public class WordParserTest extends TikaTest {
         assertContains("\\s\\up 10(\u3068\u3046\u304D\u3087\u3046),\u6771\u4EAC",
                 getXML("testWORD_phonetic.doc").xml);
     }
+
+    @Test
+    public void testSpecialControlCharacter() throws Exception {
+        //TIKA-2459
+        assertContains("Paragraph one", getXML("testWORD_specialControlCharacter1415.doc").xml);
+    }
 }
 
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_specialControlCharacter1415.doc b/tika-parsers/src/test/resources/test-documents/testWORD_specialControlCharacter1415.doc
new file mode 100644
index 0000000..919126c
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_specialControlCharacter1415.doc differ

-- 
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].

Reply via email to