Author: snagel Date: Sat Jul 5 20:36:33 2014 New Revision: 1608130 URL: http://svn.apache.org/r1608130 Log: NUTCH-1605 MIME type detector recognizes xlsx as zip file
Added: nutch/branches/2.x/src/test/org/apache/nutch/util/TestMimeUtil.java (with props) nutch/branches/2.x/src/testresources/test-mime-util/ nutch/branches/2.x/src/testresources/test-mime-util/test.xlsx (with props) nutch/trunk/src/test/org/apache/nutch/util/TestMimeUtil.java (with props) nutch/trunk/src/testresources/test-mime-util/ nutch/trunk/src/testresources/test-mime-util/test.xlsx (with props) Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1608130&r1=1608129&r2=1608130&view=diff ============================================================================== --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Sat Jul 5 20:36:33 2014 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1605 MIME type detector recognizes xlsx as zip file (snagel) + * NUTCH-385 Improve description of thread related configuration for Fetcher (jnioche,lufeng) * NUTCH-1798 Crawl script not calling index command correctly (Aaron Bedward via jnioche) Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java?rev=1608130&r1=1608129&r2=1608130&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java Sat Jul 5 20:36:33 2014 @@ -19,13 +19,16 @@ package org.apache.nutch.util; // JDK imports import java.io.File; +import java.io.IOException; +import java.io.InputStream; // Hadoop imports import org.apache.hadoop.conf.Configuration; // Tika imports import org.apache.tika.Tika; -import org.apache.tika.config.TikaConfig; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MimeType; import org.apache.tika.mime.MimeTypeException; import org.apache.tika.mime.MimeTypes; @@ -128,10 +131,10 @@ public final class MimeUtil { * strategies available within Tika. First, the mime type provided in * <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}. * Then the cleaned mime type is looked up in the underlying Tika - * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType} is - * found, then that mime type is used, otherwise URL resolution is - * used to try and determine the mime type. If that means is unsuccessful, and - * if <code>mime.type.magic</code> is enabled in {@link NutchConfiguration}, + * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType} + * is found, then that mime type is used, otherwise URL resolution is + * used to try and determine the mime type. However, if + * <code>mime.type.magic</code> is enabled in {@link NutchConfiguration}, * then mime type magic resolution is used to try and obtain a * better-than-the-default approximation of the {@link MimeType}. * @@ -145,24 +148,19 @@ public final class MimeUtil { */ public String autoResolveContentType(String typeName, String url, byte[] data) { String retType = null; - String magicType = null; MimeType type = null; String cleanedMimeType = null; - try { - cleanedMimeType = MimeUtil.cleanMimeType(typeName) != null ? this.mimeTypes - .forName(MimeUtil.cleanMimeType(typeName)).getName() - : null; - } catch (MimeTypeException mte) { - // Seems to be a malformed mime type name... - } - + cleanedMimeType = MimeUtil.cleanMimeType(typeName); // first try to get the type from the cleaned type name - try { - type = cleanedMimeType != null ? this.mimeTypes.forName(cleanedMimeType) - : null; - } catch (MimeTypeException e) { - type = null; + if (cleanedMimeType != null) { + try { + type = mimeTypes.forName(cleanedMimeType); + cleanedMimeType = type.getName(); + } catch (MimeTypeException mte) { + // Seems to be a malformed mime type name... + cleanedMimeType = null; + } } // if returned null, or if it's the default type then try url resolution @@ -172,8 +170,6 @@ public final class MimeUtil { // mime-type, then guess a mime-type from the url pattern try { - TikaConfig tikaConfig = TikaConfig.getDefaultConfig(); - Tika tika = new Tika(tikaConfig); retType = tika.detect(url) != null ? tika.detect(url) : null; } catch (Exception e) { String message = "Problem loading default Tika configuration"; @@ -189,10 +185,21 @@ public final class MimeUtil { // if it is, and it's not the default mime type, then go with the mime type // returned by the magic if (this.mimeMagic) { - magicType = tika.detect(data); + String magicType = null; + // pass URL (file name) and (cleansed) content type from protocol to Tika + Metadata tikaMeta = new Metadata(); + tikaMeta.add(Metadata.RESOURCE_NAME_KEY, url); + tikaMeta.add(Metadata.CONTENT_TYPE, + (cleanedMimeType != null ? cleanedMimeType : typeName)); + try { + InputStream stream = TikaInputStream.get(data); + try { + magicType = tika.detect(stream, tikaMeta); + } finally { + stream.close(); + } + } catch (IOException ignore) {} - // Deprecated in Tika 1.0 See https://issues.apache.org/jira/browse/NUTCH-1230 - //MimeType magicType = this.mimeTypes.getMimeType(data); if (magicType != null && !magicType.equals(MimeTypes.OCTET_STREAM) && !magicType.equals(MimeTypes.PLAIN_TEXT) && retType != null && !retType.equals(magicType)) { Added: nutch/branches/2.x/src/test/org/apache/nutch/util/TestMimeUtil.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/TestMimeUtil.java?rev=1608130&view=auto ============================================================================== --- nutch/branches/2.x/src/test/org/apache/nutch/util/TestMimeUtil.java (added) +++ nutch/branches/2.x/src/test/org/apache/nutch/util/TestMimeUtil.java Sat Jul 5 20:36:33 2014 @@ -0,0 +1,130 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.util; + +import java.io.File; +import java.io.IOException; +import java.nio.charset.Charset; + +import org.apache.hadoop.conf.Configuration; + +import com.google.common.io.Files; + +import junit.framework.TestCase; + +public class TestMimeUtil extends TestCase { + + public static String urlPrefix = "http://localhost/"; + + private static Charset defaultCharset = Charset.forName("UTF-8"); + + private File sampleDir = new File(System.getProperty("test.build.data", "."), + "test-mime-util"); + + /** test data, every element on "test page": + * <ol> + * <li>MIME type</li> + * <li>file name (last URL path element)</li> + * <li>Content-Type (HTTP header)</li> + * <li>content: if empty, do not test MIME magic</li> + * </ol> + */ + public static String[][] textBasedFormats = { + { + "text/html", + "test.html", + "text/html; charset=utf-8", + "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" " + + "\"http://www.w3.org/TR/html4/loose.dtd\">\n" + + "<html>\n<head>\n" + + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />\n" + + "</head>\n<body>Hello, World!</body></html>" }, + { + "text/html", + "test.html", + "", // no Content-Type in HTTP header => test URL pattern + "<!DOCTYPE html>\n<html>\n<head>\n" + + "</head>\n<body>Hello, World!</body></html>" }, + { + "application/xhtml+xml", + "test.html", + "application/xhtml+xml; charset=utf-8", + "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">" + + "<html>\n<head>\n" + + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />" + + "</head>\n<body>Hello, World!</body></html>" } + }; + + public static String[][] binaryFiles = { + { + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "test.xlsx", + "" } + }; + + private String getMimeType(String url, File file, String contentType, + boolean useMagic) throws IOException { + return getMimeType(url, Files.toByteArray(file), contentType, useMagic); + } + + private String getMimeType(String url, byte[] bytes, String contentType, + boolean useMagic) { + Configuration conf = NutchConfiguration.create(); + conf.setBoolean("mime.type.magic", useMagic); + MimeUtil mimeUtil = new MimeUtil(conf); + return mimeUtil.autoResolveContentType(contentType, url, bytes); + } + + /** use HTTP Content-Type, URL pattern, and MIME magic */ + public void testWithMimeMagic() { + for (String[] testPage : textBasedFormats) { + String mimeType = getMimeType(urlPrefix, + testPage[3].getBytes(defaultCharset), testPage[2], true); + assertEquals("", testPage[0], mimeType); + } + } + + /** use only HTTP Content-Type (if given) and URL pattern */ + public void testWithoutMimeMagic() { + for (String[] testPage : textBasedFormats) { + String mimeType = getMimeType(urlPrefix + testPage[1], + testPage[3].getBytes(defaultCharset), testPage[2], false); + assertEquals("", testPage[0], mimeType); + } + } + + /** use only MIME magic (detection from content bytes) */ + public void testOnlyMimeMagic() { + for (String[] testPage : textBasedFormats) { + String mimeType = getMimeType(urlPrefix, + testPage[3].getBytes(defaultCharset), "", true); + assertEquals("", testPage[0], mimeType); + } + } + + /** test binary file formats (real files) */ + public void testBinaryFiles() throws IOException { + for (String[] testPage : binaryFiles) { + File dataFile = new File(sampleDir, testPage[1]); + String mimeType = getMimeType(urlPrefix + testPage[1], + dataFile, testPage[2], false); + assertEquals("", testPage[0], mimeType); + } + } + +} Propchange: nutch/branches/2.x/src/test/org/apache/nutch/util/TestMimeUtil.java ------------------------------------------------------------------------------ svn:eol-style = native Added: nutch/branches/2.x/src/testresources/test-mime-util/test.xlsx URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/testresources/test-mime-util/test.xlsx?rev=1608130&view=auto ============================================================================== Binary file - no diff available. Propchange: nutch/branches/2.x/src/testresources/test-mime-util/test.xlsx ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1608130&r1=1608129&r2=1608130&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Sat Jul 5 20:36:33 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development +* NUTCH-1605 MIME type detector recognizes xlsx as zip file (snagel) + * NUTCH-1802 Move TestbedProxy to test environment (jnioche) * NUTCH-1803 Put test dependencies in a separate lib dir (jnioche) Modified: nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java?rev=1608130&r1=1608129&r2=1608130&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java Sat Jul 5 20:36:33 2014 @@ -19,13 +19,16 @@ package org.apache.nutch.util; // JDK imports import java.io.File; +import java.io.IOException; +import java.io.InputStream; // Hadoop imports import org.apache.hadoop.conf.Configuration; // Tika imports import org.apache.tika.Tika; -import org.apache.tika.config.TikaConfig; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MimeType; import org.apache.tika.mime.MimeTypeException; import org.apache.tika.mime.MimeTypes; @@ -128,10 +131,10 @@ public final class MimeUtil { * strategies available within Tika. First, the mime type provided in * <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}. * Then the cleaned mime type is looked up in the underlying Tika - * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType} is - * found, then that mime type is used, otherwise URL resolution is - * used to try and determine the mime type. If that means is unsuccessful, and - * if <code>mime.type.magic</code> is enabled in {@link NutchConfiguration}, + * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType} + * is found, then that mime type is used, otherwise URL resolution is + * used to try and determine the mime type. However, if + * <code>mime.type.magic</code> is enabled in {@link NutchConfiguration}, * then mime type magic resolution is used to try and obtain a * better-than-the-default approximation of the {@link MimeType}. * @@ -145,24 +148,19 @@ public final class MimeUtil { */ public String autoResolveContentType(String typeName, String url, byte[] data) { String retType = null; - String magicType = null; MimeType type = null; String cleanedMimeType = null; - try { - cleanedMimeType = MimeUtil.cleanMimeType(typeName) != null ? this.mimeTypes - .forName(MimeUtil.cleanMimeType(typeName)).getName() - : null; - } catch (MimeTypeException mte) { - // Seems to be a malformed mime type name... - } - + cleanedMimeType = MimeUtil.cleanMimeType(typeName); // first try to get the type from the cleaned type name - try { - type = cleanedMimeType != null ? this.mimeTypes.forName(cleanedMimeType) - : null; - } catch (MimeTypeException e) { - type = null; + if (cleanedMimeType != null) { + try { + type = mimeTypes.forName(cleanedMimeType); + cleanedMimeType = type.getName(); + } catch (MimeTypeException mte) { + // Seems to be a malformed mime type name... + cleanedMimeType = null; + } } // if returned null, or if it's the default type then try url resolution @@ -171,8 +169,6 @@ public final class MimeUtil { // If no mime-type header, or cannot find a corresponding registered // mime-type, then guess a mime-type from the url pattern try { - TikaConfig tikaConfig = TikaConfig.getDefaultConfig(); - Tika tika = new Tika(tikaConfig); retType = tika.detect(url) != null ? tika.detect(url) : null; } catch (Exception e) { String message = "Problem loading default Tika configuration"; @@ -188,10 +184,21 @@ public final class MimeUtil { // if it is, and it's not the default mime type, then go with the mime type // returned by the magic if (this.mimeMagic) { - magicType = tika.detect(data); + String magicType = null; + // pass URL (file name) and (cleansed) content type from protocol to Tika + Metadata tikaMeta = new Metadata(); + tikaMeta.add(Metadata.RESOURCE_NAME_KEY, url); + tikaMeta.add(Metadata.CONTENT_TYPE, + (cleanedMimeType != null ? cleanedMimeType : typeName)); + try { + InputStream stream = TikaInputStream.get(data); + try { + magicType = tika.detect(stream, tikaMeta); + } finally { + stream.close(); + } + } catch (IOException ignore) {} - // Deprecated in Tika 1.0 See https://issues.apache.org/jira/browse/NUTCH-1230 - //MimeType magicType = this.mimeTypes.getMimeType(data); if (magicType != null && !magicType.equals(MimeTypes.OCTET_STREAM) && !magicType.equals(MimeTypes.PLAIN_TEXT) && retType != null && !retType.equals(magicType)) { Added: nutch/trunk/src/test/org/apache/nutch/util/TestMimeUtil.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/util/TestMimeUtil.java?rev=1608130&view=auto ============================================================================== --- nutch/trunk/src/test/org/apache/nutch/util/TestMimeUtil.java (added) +++ nutch/trunk/src/test/org/apache/nutch/util/TestMimeUtil.java Sat Jul 5 20:36:33 2014 @@ -0,0 +1,130 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.util; + +import java.io.File; +import java.io.IOException; +import java.nio.charset.Charset; + +import org.apache.hadoop.conf.Configuration; + +import com.google.common.io.Files; + +import junit.framework.TestCase; + +public class TestMimeUtil extends TestCase { + + public static String urlPrefix = "http://localhost/"; + + private static Charset defaultCharset = Charset.forName("UTF-8"); + + private File sampleDir = new File(System.getProperty("test.build.data", "."), + "test-mime-util"); + + /** test data, every element on "test page": + * <ol> + * <li>MIME type</li> + * <li>file name (last URL path element)</li> + * <li>Content-Type (HTTP header)</li> + * <li>content: if empty, do not test MIME magic</li> + * </ol> + */ + public static String[][] textBasedFormats = { + { + "text/html", + "test.html", + "text/html; charset=utf-8", + "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" " + + "\"http://www.w3.org/TR/html4/loose.dtd\">\n" + + "<html>\n<head>\n" + + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />\n" + + "</head>\n<body>Hello, World!</body></html>" }, + { + "text/html", + "test.html", + "", // no Content-Type in HTTP header => test URL pattern + "<!DOCTYPE html>\n<html>\n<head>\n" + + "</head>\n<body>Hello, World!</body></html>" }, + { + "application/xhtml+xml", + "test.html", + "application/xhtml+xml; charset=utf-8", + "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">" + + "<html>\n<head>\n" + + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />" + + "</head>\n<body>Hello, World!</body></html>" } + }; + + public static String[][] binaryFiles = { + { + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "test.xlsx", + "" } + }; + + private String getMimeType(String url, File file, String contentType, + boolean useMagic) throws IOException { + return getMimeType(url, Files.toByteArray(file), contentType, useMagic); + } + + private String getMimeType(String url, byte[] bytes, String contentType, + boolean useMagic) { + Configuration conf = NutchConfiguration.create(); + conf.setBoolean("mime.type.magic", useMagic); + MimeUtil mimeUtil = new MimeUtil(conf); + return mimeUtil.autoResolveContentType(contentType, url, bytes); + } + + /** use HTTP Content-Type, URL pattern, and MIME magic */ + public void testWithMimeMagic() { + for (String[] testPage : textBasedFormats) { + String mimeType = getMimeType(urlPrefix, + testPage[3].getBytes(defaultCharset), testPage[2], true); + assertEquals("", testPage[0], mimeType); + } + } + + /** use only HTTP Content-Type (if given) and URL pattern */ + public void testWithoutMimeMagic() { + for (String[] testPage : textBasedFormats) { + String mimeType = getMimeType(urlPrefix + testPage[1], + testPage[3].getBytes(defaultCharset), testPage[2], false); + assertEquals("", testPage[0], mimeType); + } + } + + /** use only MIME magic (detection from content bytes) */ + public void testOnlyMimeMagic() { + for (String[] testPage : textBasedFormats) { + String mimeType = getMimeType(urlPrefix, + testPage[3].getBytes(defaultCharset), "", true); + assertEquals("", testPage[0], mimeType); + } + } + + /** test binary file formats (real files) */ + public void testBinaryFiles() throws IOException { + for (String[] testPage : binaryFiles) { + File dataFile = new File(sampleDir, testPage[1]); + String mimeType = getMimeType(urlPrefix + testPage[1], + dataFile, testPage[2], false); + assertEquals("", testPage[0], mimeType); + } + } + +} Propchange: nutch/trunk/src/test/org/apache/nutch/util/TestMimeUtil.java ------------------------------------------------------------------------------ svn:eol-style = native Added: nutch/trunk/src/testresources/test-mime-util/test.xlsx URL: http://svn.apache.org/viewvc/nutch/trunk/src/testresources/test-mime-util/test.xlsx?rev=1608130&view=auto ============================================================================== Binary file - no diff available. Propchange: nutch/trunk/src/testresources/test-mime-util/test.xlsx ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream