This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new add8b39 TIKA-3282 fix non-ascii characters in onenote (#400)
add8b39 is described below
commit add8b39f0d194b119544443acf3b3908ac7e9726
Author: AdrianD-intrafind <[email protected]>
AuthorDate: Thu Jan 28 14:36:45 2021 +0100
TIKA-3282 fix non-ascii characters in onenote (#400)
* TIKA-3282 fix non-ascii characters in onenote
---
.../parser/microsoft/onenote/OneNoteTreeWalker.java | 2 +-
.../parser/microsoft/onenote/OneNoteParserTest.java | 8 ++++++++
.../resources/test-documents/testOneNoteNonAscii.one | Bin 0 -> 13528 bytes
3 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
index 6d94acf..5553bf0 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
@@ -445,7 +445,7 @@ class OneNoteTreeWalker {
}
ByteBuffer buf = ByteBuffer.allocate(content.size());
dif.read(buf);
- propMap.put("dataAscii", new String(buf.array(), StandardCharsets.US_ASCII));
+ propMap.put("dataAscii", new String(buf.array(), StandardCharsets.ISO_8859_1));
xhtml.startElement(P);
xhtml.characters((String) propMap.get("dataAscii"));
xhtml.endElement(P);
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java
index d5d1639..4d5a59b 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java
@@ -206,6 +206,14 @@ public class OneNoteParserTest extends TikaTest {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document".equals(ml.get("Content-Type"))));
}
+ @Test
+ public void testOneNoteWithNonAsciiCharacter() throws Exception {
+ String txt = getText("testOneNoteNonAscii.one");
+
+ assertContains("äöüß", txt);
+ assertContains("的是\uD83D\uDE0A", txt);
+ }
+
private void assertNoJunk(String txt) {
//Should not include font names in the text
assertNotContained("Calibri", txt);
diff --git a/tika-parsers/src/test/resources/test-documents/testOneNoteNonAscii.one b/tika-parsers/src/test/resources/test-documents/testOneNoteNonAscii.one
new file mode 100644
index 0000000..1adc9a7
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testOneNoteNonAscii.one differ