Author: fanningpj
Date: Tue Aug 30 19:46:32 2022
New Revision: 1903780

URL: http://svn.apache.org/viewvc?rev=1903780&view=rev
Log:
[TIKA-3388] issue with non-ascii chars in file name of embedded OLE object

Added:
    poi/trunk/test-data/document/tika-3388.docx   (with props)
Modified:
    poi/trunk/poi-ooxml/src/test/java/org/apache/poi/xwpf/TestXWPFBugs.java
    poi/trunk/poi/src/main/java/org/apache/poi/poifs/filesystem/Ole10Native.java
    poi/trunk/poi/src/main/java/org/apache/poi/util/StringUtil.java

Modified: poi/trunk/poi-ooxml/src/test/java/org/apache/poi/xwpf/TestXWPFBugs.java
URL: http://svn.apache.org/viewvc/poi/trunk/poi-ooxml/src/test/java/org/apache/poi/xwpf/TestXWPFBugs.java?rev=1903780&r1=1903779&r2=1903780&view=diff
==============================================================================
--- poi/trunk/poi-ooxml/src/test/java/org/apache/poi/xwpf/TestXWPFBugs.java (original)
+++ poi/trunk/poi-ooxml/src/test/java/org/apache/poi/xwpf/TestXWPFBugs.java Tue Aug 30 19:46:32 2022
@@ -32,10 +32,14 @@ import org.apache.commons.compress.archi
 import org.apache.commons.compress.archivers.zip.ZipFile;
 import org.apache.poi.POIDataSamples;
 import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackagePartName;
+import org.apache.poi.openxml4j.opc.PackagingURIHelper;
 import org.apache.poi.poifs.crypt.CipherAlgorithm;
 import org.apache.poi.poifs.crypt.Decryptor;
 import org.apache.poi.poifs.crypt.EncryptionInfo;
 import org.apache.poi.poifs.crypt.HashAlgorithm;
+import org.apache.poi.poifs.filesystem.Ole10Native;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
@@ -159,4 +163,22 @@ class TestXWPFBugs {
             assertEquals(731, document.getParagraphs().size());
         }
     }
+
+    @Test
+    void tika3388() throws Exception {
+        try (XWPFDocument document = new XWPFDocument(samples.openResourceAsStream("tika-3388.docx"))) {
+            assertEquals(1, document.getParagraphs().size());
+            PackagePartName partName = PackagingURIHelper.createPartName("/word/embeddings/oleObject1.bin");
+            PackagePart part = document.getPackage().getPart(partName);
+            assertNotNull(part);
+            try (
+                    InputStream partStream = part.getInputStream();
+                    POIFSFileSystem poifs = new POIFSFileSystem(partStream)
+            ) {
+                Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(poifs);
+                assertEquals("C:\\Users\\ross\\AppData\\Local\\Microsoft\\Windows\\INetCache\\Content.Word\\約翰的測試文件\uD83D\uDD96.msg",
+                        ole.getFileName());
+            }
+        }
+    }
 }

Modified: poi/trunk/poi/src/main/java/org/apache/poi/poifs/filesystem/Ole10Native.java
URL: http://svn.apache.org/viewvc/poi/trunk/poi/src/main/java/org/apache/poi/poifs/filesystem/Ole10Native.java?rev=1903780&r1=1903779&r2=1903780&view=diff
==============================================================================
--- poi/trunk/poi/src/main/java/org/apache/poi/poifs/filesystem/Ole10Native.java (original)
+++ poi/trunk/poi/src/main/java/org/apache/poi/poifs/filesystem/Ole10Native.java Tue Aug 30 19:46:32 2022
@@ -48,7 +48,7 @@ public class Ole10Native {
 
 
     public static final String OLE10_NATIVE = "\u0001Ole10Native";
-    private static final Charset ISO1 = StandardCharsets.ISO_8859_1;
+    private static final Charset UTF8 = StandardCharsets.UTF_8;
     // arbitrarily selected; may need to increase
     private static final int DEFAULT_MAX_RECORD_LENGTH = 100_000_000;
     private static int MAX_RECORD_LENGTH = DEFAULT_MAX_RECORD_LENGTH;
@@ -407,14 +407,14 @@ public class Ole10Native {
                     // total size, will be determined later ..
 
                     leos.writeShort(getFlags1());
-                    leos.write(getLabel().getBytes(ISO1));
+                    leos.write(getLabel().getBytes(UTF8));
                     leos.write(0);
-                    leos.write(getFileName().getBytes(ISO1));
+                    leos.write(getFileName().getBytes(UTF8));
                     leos.write(0);
                     leos.writeShort(getFlags2());
                     leos.writeShort(getUnknown1());
                     leos.writeInt(getCommand().length() + 1);
-                    leos.write(getCommand().getBytes(ISO1));
+                    leos.write(getCommand().getBytes(UTF8));
                     leos.write(0);
                     leos.writeInt(getDataSize());
                     leos.write(getDataBuffer());

Modified: poi/trunk/poi/src/main/java/org/apache/poi/util/StringUtil.java
URL: http://svn.apache.org/viewvc/poi/trunk/poi/src/main/java/org/apache/poi/util/StringUtil.java?rev=1903780&r1=1903779&r2=1903780&view=diff
==============================================================================
--- poi/trunk/poi/src/main/java/org/apache/poi/util/StringUtil.java (original)
+++ poi/trunk/poi/src/main/java/org/apache/poi/util/StringUtil.java Tue Aug 30 19:46:32 2022
@@ -135,13 +135,13 @@ public final class StringUtil {
             final int offset,
             final int len) {
         int len_to_use = Math.min(len, string.length - offset);
-        return new String(string, offset, len_to_use, ISO_8859_1);
+        return new String(string, offset, len_to_use, UTF8);
     }
 
     public static String readCompressedUnicode(LittleEndianInput in, int nChars) {
         byte[] buf = IOUtils.safelyAllocate(nChars, MAX_RECORD_LENGTH);
         in.readFully(buf);
-        return new String(buf, ISO_8859_1);
+        return new String(buf, UTF8);
     }
 
     /**

Added: poi/trunk/test-data/document/tika-3388.docx
URL: http://svn.apache.org/viewvc/poi/trunk/test-data/document/tika-3388.docx?rev=1903780&view=auto
==============================================================================
Binary file - no diff available.

Propchange: poi/trunk/test-data/document/tika-3388.docx
------------------------------------------------------------------------------
--- svn:mime-type (added)
+++ svn:mime-type Tue Aug 30 19:46:32 2022
@@ -0,0 +1 @@
+application/vnd.openxmlformats-officedocument.wordprocessingml.document



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to