This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new add8b39  TIKA-3282 fix non-ascii characters in onenote (#400)
add8b39 is described below

commit add8b39f0d194b119544443acf3b3908ac7e9726
Author: AdrianD-intrafind <[email protected]>
AuthorDate: Thu Jan 28 14:36:45 2021 +0100

    TIKA-3282 fix non-ascii characters in onenote (#400)
    
    * TIKA-3282 fix non-ascii characters in onenote
---
 .../parser/microsoft/onenote/OneNoteTreeWalker.java     |   2 +-
 .../parser/microsoft/onenote/OneNoteParserTest.java     |   8 ++++++++
 .../resources/test-documents/testOneNoteNonAscii.one    | Bin 0 -> 13528 bytes
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
index 6d94acf..5553bf0 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
@@ -445,7 +445,7 @@ class OneNoteTreeWalker {
                 }
                 ByteBuffer buf = ByteBuffer.allocate(content.size());
                 dif.read(buf);
-                propMap.put("dataAscii", new String(buf.array(), StandardCharsets.US_ASCII));
+                propMap.put("dataAscii", new String(buf.array(), StandardCharsets.ISO_8859_1));
                 xhtml.startElement(P);
                 xhtml.characters((String) propMap.get("dataAscii"));
                 xhtml.endElement(P);
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java
index d5d1639..4d5a59b 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java
@@ -206,6 +206,14 @@ public class OneNoteParserTest extends TikaTest {
             "application/vnd.openxmlformats-officedocument.wordprocessingml.document".equals(ml.get("Content-Type"))));
     }
 
+    @Test
+    public void testOneNoteWithNonAsciiCharacter() throws Exception {
+        String txt = getText("testOneNoteNonAscii.one");
+
+        assertContains("äöüß", txt);
+        assertContains("的是\uD83D\uDE0A", txt);
+    }
+
     private void assertNoJunk(String txt) {
         //Should not include font names in the text
         assertNotContained("Calibri", txt);
diff --git 
a/tika-parsers/src/test/resources/test-documents/testOneNoteNonAscii.one b/tika-parsers/src/test/resources/test-documents/testOneNoteNonAscii.one
new file mode 100644
index 0000000..1adc9a7
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testOneNoteNonAscii.one differ

Reply via email to