On Sun, Jan 12, 2014 at 3:14 PM, Stefano Fornari
<[email protected]> wrote:

> Hi All,
> attached is the patch. See https://issues.apache.org/jira/browse/TIKA-1078 for
> some more details.
> Indeed with this I intend to release the right to use the code for any
> purpose.
>
> Let me know if it is ok, or anything can be improved.
> Regards,
>
> Ste
>
>
> On Sun, Jan 12, 2014 at 11:07 AM, Stefano Fornari <
> [email protected]> wrote:
>
>> Hi All,
>>
>> I'd like to fix this one as a way to get familiar with tika.
>> I have a couple of questions:
>>
>> 1. As far as I understand it (and based on the tests I have done) the
>> problem here is with characters that the different file systems do not
>> allow in file names, not with special (i.e. non-ASCII or non-UTF-8)
>> characters. Can anyone confirm?
>> 2. Is there any general policy in Tika development I should follow wrt
>> the Java version? Shall I stick to a particular version of Java, or can I
>> go with Java 7?
>>
>>
>> --
>> Ste
>>
>
>
>


-- 
Ste
Index: tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
===================================================================
--- tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java	(revision 1557531)
+++ tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java	(working copy)
@@ -91,6 +91,7 @@
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 import com.google.gson.Gson;
+import org.apache.tika.io.FilenameUtils;
 
 /**
  * Simple command line interface for Apache Tika.
@@ -712,11 +713,10 @@
                 name = relID + "_" + name;
             }
 
-            File outputFile = new File(extractDir, name);
-            File parent = outputFile.getParentFile();
-            if (!parent.exists()) {
-                if (!parent.mkdirs()) {
-                    throw new IOException("unable to create directory \"" + parent + "\"");
+            File outputFile = new File(extractDir, FilenameUtils.normalize(name));
+            if (!extractDir.exists()) {
+                if (!extractDir.mkdirs()) {
+                    throw new IOException("unable to create directory \"" + extractDir + "\"");
                 }
             }
             System.out.println("Extracting '"+name+"' ("+contentType+") to " + outputFile);
@@ -740,7 +740,16 @@
                     IOUtils.copy(inputStream, os);
                 }
             } catch (Exception e) {
-                logger.warn("Ignoring unexpected exception trying to save embedded file " + name, e);
+                //
+                // being a CLI program messages should go to the stderr too
+                //
+                String msg = String.format(
+                    "Ignoring unexpected exception trying to save embedded file %s (%s)",
+                    name,
+                    e.getMessage()
+                );
+                System.err.println(msg);
+                logger.warn(msg, e);
             } finally {
                 if (os != null) {
                     os.close();
Index: tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
===================================================================
--- tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java	(revision 0)
+++ tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java	(working copy)
@@ -0,0 +1,94 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.io;
+
+/**
+ * Utilities for turning arbitrary names into file names that can be created
+ * on the common operating systems and file systems.
+ *
+ * @author ste
+ */
+public class FilenameUtils {
+
+    /**
+     * Characters that are not allowed in a portable file name: the ASCII
+     * control characters 0x00-0x1F followed by the printable characters
+     * reserved by at least one common OS or file system. The double quote is
+     * included because Windows rejects it in file names.
+     */
+    public final static char[] RESERVED_FILENAME_CHARACTERS = {
+        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+        0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
+        '?', '/', '\\', ':', '*', '<', '>', '|', '"'
+    };
+
+    /** Number of control characters at the head of the reserved table. */
+    private final static int CONTROL_CHARACTERS = 32;
+
+    /**
+     * The printable reserved characters; sized from the table so that adding
+     * a character to the table cannot be forgotten here.
+     */
+    private final static String RESERVED = new String(
+        RESERVED_FILENAME_CHARACTERS,
+        CONTROL_CHARACTERS,
+        RESERVED_FILENAME_CHARACTERS.length - CONTROL_CHARACTERS
+    );
+
+    /** Upper-case hexadecimal digits used to build the %XX escapes. */
+    private final static String HEX_DIGITS = "0123456789ABCDEF";
+
+    /**
+     * Scans the given file name for characters reserved on different OSs and
+     * file systems and returns a sanitized version of the name in which each
+     * reserved character is replaced by <code>%</code> followed by its
+     * two-digit upper-case hexadecimal code.
+     *
+     * For example <code>why?.zip</code> will be converted into <code>why%3F.zip</code>.
+     * The percent sign itself is not escaped, so the conversion is not
+     * reversible in general.
+     *
+     * @param name the file name to be normalized - NOT NULL
+     *
+     * @return the normalized file name
+     *
+     * @throws IllegalArgumentException if name is null
+     */
+    public static String normalize(final String name) {
+        if (name == null) {
+            throw new IllegalArgumentException("name cannot be null");
+        }
+
+        final StringBuilder sb = new StringBuilder(name.length());
+
+        for (int i = 0; i < name.length(); ++i) {
+            final char c = name.charAt(i);
+            if ((c < ' ') || (RESERVED.indexOf(c) >= 0)) {
+                // every reserved character is below 0x80: two digits suffice
+                sb.append('%')
+                  .append(HEX_DIGITS.charAt(c >> 4))
+                  .append(HEX_DIGITS.charAt(c & 0x0F));
+            } else {
+                sb.append(c);
+            }
+        }
+
+        return sb.toString();
+    }
+}
Index: tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
===================================================================
--- tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java	(revision 0)
+++ tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java	(working copy)
@@ -0,0 +1,109 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.io;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import org.junit.Test;
+
+/**
+ * Tests for {@link FilenameUtils#normalize(String)}.
+ *
+ * Different filesystems and operating systems have different restrictions
+ * on the names that can be used for files and directories.
+ * FilenameUtils.normalize() returns a cross platform file name in which each
+ * special character is replaced by a hexadecimal escape of the form
+ * <code>%XX</code>. For example why?.zip will be converted into why%3F.zip.
+ * The reserved characters are the ones listed in
+ * FilenameUtils.RESERVED_FILENAME_CHARACTERS.
+ *
+ * @see <a href="http://en.wikipedia.org/wiki/Filename#Comparison_of_filename_limitations">Comparison of filename limitations</a>
+ *
+ * @author ste
+ */
+public class FilenameUtilsTest {
+
+    /** A name without reserved characters is returned unchanged. */
+    @Test
+    public void normalizeNothingTodo() throws Exception {
+        final String TEST_NAME = "test.zip";
+
+        assertEquals(TEST_NAME, FilenameUtils.normalize(TEST_NAME));
+    }
+
+    /** A null name is rejected with an IllegalArgumentException that names the parameter. */
+    @Test
+    public void normalizeWithNull() throws Exception {
+        try {
+            FilenameUtils.normalize(null);
+            fail("missing check for null parameters");
+        } catch (IllegalArgumentException x) {
+            assertTrue(x.getMessage().contains("name"));
+            assertTrue(x.getMessage().contains("not be null"));
+        }
+    }
+
+    /** A single reserved character is escaped wherever it appears in the name. */
+    @Test
+    public void normalizeWithReservedChar() throws Exception {
+        final String[] TEST_NAMES = {
+            "test?.txt", "?test.txt", "test.txt?", "?test?txt?"
+        };
+        final String[] EXPECTED_NAMES = {
+            "test%3F.txt", "%3Ftest.txt", "test.txt%3F", "%3Ftest%3Ftxt%3F"
+        };
+
+        for (int i=0; i<TEST_NAMES.length; ++i) {
+            // the message identifies the failing name without printing to stdout
+            assertEquals(
+                "checking " + TEST_NAMES[i],
+                EXPECTED_NAMES[i], FilenameUtils.normalize(TEST_NAMES[i])
+            );
+        }
+    }
+
+    /** Several different reserved and control characters are all escaped. */
+    @Test
+    public void normalizeWithReservedChars() throws Exception {
+        final String TEST_NAME =
+            "?a/b\nc\td\re*f\\g:h<i>j.txt|";
+        final String EXPECTED_NAME =
+            "%3Fa%2Fb%0Ac%09d%0De%2Af%5Cg%3Ah%3Ci%3Ej.txt%7C";
+
+        assertEquals(EXPECTED_NAME, FilenameUtils.normalize(TEST_NAME));
+    }
+
+    /** Every control character 0x00-0x1F is escaped with two hex digits. */
+    @Test
+    public void normalizeWithNotPrintableChars() throws Exception {
+        final String TEST_NAME = new String(
+            new char[] {
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                '.',
+                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+            }
+        );
+        final String EXPECTED_NAME =
+            "%00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F" +
+            "." +
+            "%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F";
+
+        assertEquals(EXPECTED_NAME, FilenameUtils.normalize(TEST_NAME));
+    }
+}

Reply via email to