On Sun, Jan 12, 2014 at 3:14 PM, Stefano Fornari
<[email protected]> wrote:
> Hi All,
> Attached is the patch. See https://issues.apache.org/jira/browse/TIKA-1078 for
> some more details.
> Indeed with this I intend to release the right to use the code for any
> purpose.
>
> Let me know if it is ok, or if anything can be improved.
> Regards,
>
> Ste
>
>
> On Sun, Jan 12, 2014 at 11:07 AM, Stefano Fornari <
> [email protected]> wrote:
>
>> Hi All,
>>
>> I'd like to fix this one as a way to get familiar with tika.
>> I have a couple of questions:
>>
>> 1. As far as I understand it (and based on the tests I have done), the
>> problem here is with special characters that the different file systems do
>> not allow in file names, not with "special" (i.e. non-ASCII or non-UTF-8)
>> characters. Can anyone confirm?
>> 2. Is there any general policy in Tika development I should follow wrt the
>> Java version? Shall I stick to a particular version of Java, or can I go
>> with Java 7?
>>
>>
>> --
>> Ste
>>
>
>
>
--
Ste
Index: tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
===================================================================
--- tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (revision 1557531)
+++ tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (working copy)
@@ -91,6 +91,7 @@
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import com.google.gson.Gson;
+import org.apache.tika.io.FilenameUtils;
/**
* Simple command line interface for Apache Tika.
@@ -712,11 +713,10 @@
name = relID + "_" + name;
}
- File outputFile = new File(extractDir, name);
- File parent = outputFile.getParentFile();
- if (!parent.exists()) {
- if (!parent.mkdirs()) {
- throw new IOException("unable to create directory \"" + parent + "\"");
+ File outputFile = new File(extractDir, FilenameUtils.normalize(name));
+ if (!extractDir.exists()) {
+ if (!extractDir.mkdirs()) {
+ throw new IOException("unable to create directory \"" + extractDir + "\"");
}
}
System.out.println("Extracting '"+name+"' ("+contentType+") to " + outputFile);
@@ -740,7 +740,16 @@
IOUtils.copy(inputStream, os);
}
} catch (Exception e) {
- logger.warn("Ignoring unexpected exception trying to save embedded file " + name, e);
+ //
+ // being a CLI program messages should go to the stderr too
+ //
+ String msg = String.format(
+ "Ignoring unexpected exception trying to save embedded file %s (%s)",
+ name,
+ e.getMessage()
+ );
+ System.err.println(msg);
+ logger.warn(msg, e);
} finally {
if (os != null) {
os.close();
Index: tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
===================================================================
--- tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java (revision 0)
+++ tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java (working copy)
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.io;
+
+/**
+ * Static helper that escapes characters which common file systems do not allow in file names.
+ * @author ste
+ */
+public class FilenameUtils {
+
+    /**
+     * Characters that {@link #normalize(String)} escapes: the ASCII control
+     * codes 0x00-0x1F, the characters that Windows or Unix file systems reject
+     * in file names, and the escape character '%' itself.
+     */
+    public final static char[] RESERVED_FILENAME_CHARACTERS = {
+        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+        0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
+        '?', '/', '\\', ':', '*', '<', '>', '|', '"', '%'
+    };
+
+    // Copied once at class load, so writes to the public array cannot alter escaping.
+    private final static String RESERVED = new String(RESERVED_FILENAME_CHARACTERS);
+    private final static String HEX_DIGITS = "0123456789ABCDEF";
+
+    /**
+     * Returns a copy of the given file name in which every reserved character
+     * is replaced by '%' followed by its two-digit upper-case hexadecimal code.
+     * '%' is escaped too (as %25), so distinct names never normalize to the
+     * same result. For example <code>why?.zip</code> becomes <code>why%3F.zip</code>.
+     *
+     * @param name the file name to be normalized - NOT NULL
+     * @return the normalized file name
+     * @throws IllegalArgumentException if name is null
+     */
+    public static String normalize(final String name) {
+        if (name == null) {
+            throw new IllegalArgumentException("name cannot be null");
+        }
+        StringBuilder sb = new StringBuilder(name.length());
+        for (int i = 0; i < name.length(); ++i) {
+            char c = name.charAt(i);
+            if (RESERVED.indexOf(c) >= 0) {
+                // All reserved characters are below 0x80: two hex digits always suffice.
+                sb.append('%').append(HEX_DIGITS.charAt((c >> 4) & 0x0F)).append(HEX_DIGITS.charAt(c & 0x0F));
+            } else {
+                sb.append(c);
+            }
+        }
+        return sb.toString();
+    }
+}
Index: tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
===================================================================
--- tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java (revision 0)
+++ tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java (working copy)
@@ -0,0 +1,103 @@
+package org.apache.tika.io;
+
+
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertTrue;
+import static junit.framework.Assert.fail;
+import org.junit.Test;
+
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Unit tests for {@link FilenameUtils#normalize(String)}.
+ *
+ * Different filesystems and operating systems have different restrictions
+ * on the names that can be used for files and directories.
+ * FilenameUtils.normalize() returns a cross-platform file name in which each
+ * special character is replaced by '%' followed by its two-digit hexadecimal
+ * code. For example why?.zip will be converted into why%3F.zip
+ *
+ * The reserved characters are the ones listed in
+ * FilenameUtils.RESERVED_FILENAME_CHARACTERS.
+ *
+ * @see <a href="http://en.wikipedia.org/wiki/Filename#Comparison_of_filename_limitations">Comparison of filename limitations</a>
+ * @author ste
+ */
+public class FilenameUtilsTest {
+
+    /** A name without reserved characters is returned unchanged. */
+    @Test
+    public void normalizeNothingTodo() throws Exception {
+        final String plainName = "test.zip";
+        assertEquals(plainName, FilenameUtils.normalize(plainName));
+    }
+
+    /** A null name is rejected with an IllegalArgumentException that names the parameter. */
+    @Test
+    public void normalizeWithNull() throws Exception {
+        try {
+            FilenameUtils.normalize(null);
+            fail("missing check for null parameters");
+        } catch (IllegalArgumentException x) {
+            final String message = x.getMessage();
+            assertTrue(message.contains("name"));
+            assertTrue(message.contains("not be null"));
+        }
+    }
+
+    /** A single reserved character is escaped wherever it occurs in the name. */
+    @Test
+    public void normalizeWithReservedChar() throws Exception {
+        // each row is { input, expected }
+        final String[][] cases = {
+            {"test?.txt", "test%3F.txt"},
+            {"?test.txt", "%3Ftest.txt"},
+            {"test.txt?", "test.txt%3F"},
+            {"?test?txt?", "%3Ftest%3Ftxt%3F"}
+        };
+
+        for (String[] row : cases) {
+            System.out.println("checking " + row[0]);
+            assertEquals(row[1], FilenameUtils.normalize(row[0]));
+        }
+    }
+
+    /** Several different reserved characters in one name are all escaped. */
+    @Test
+    public void normalizeWithReservedChars() throws Exception {
+        final String escaped = FilenameUtils.normalize("?a/b\nc\td\re*f\\g:h<i>j.txt|");
+
+        assertEquals("%3Fa%2Fb%0Ac%09d%0De%2Af%5Cg%3Ah%3Ci%3Ej.txt%7C", escaped);
+    }
+
+    /** Every control character from 0x00 to 0x1F is escaped with two hex digits. */
+    @Test
+    public void normalizeWithNotPrintableChars() throws Exception {
+        // control characters 0x00-0x0F, a dot, then 0x10-0x1F
+        final StringBuilder input = new StringBuilder();
+        for (char c = 0; c < 32; ++c) {
+            if (c == 16) {
+                input.append('.');
+            }
+            input.append(c);
+        }
+
+        assertEquals(
+            "%00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F.%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F",
+            FilenameUtils.normalize(input.toString()));
+    }
+}