Author: jerome Date: Sat Dec 10 15:47:18 2005 New Revision: 355809 URL: http://svn.apache.org/viewcvs?rev=355809&view=rev Log: Content-Type resolution enhancements: * Resolution moved from protocol plugins to Content constructor * Best content-type guessing policy * Some unit tests added
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeTypes.java lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?rev=355809&r1=355808&r2=355809&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Sat Dec 10 15:47:18 2005 @@ -22,6 +22,10 @@ import org.apache.nutch.io.*; import org.apache.nutch.fs.*; import org.apache.nutch.util.*; +import org.apache.nutch.util.mime.MimeType; +import org.apache.nutch.util.mime.MimeTypes; +import org.apache.nutch.util.mime.MimeTypeException; + public final class Content extends VersionedWritable { @@ -29,6 +33,14 @@ private final static byte VERSION = 1; + /** A flag that tells if magic resolution must be performed */ + private final static boolean MAGIC = + NutchConf.get().getBoolean("mime.type.magic", true); + + /** Get the MimeTypes resolver instance. 
*/ + private final static MimeTypes MIME = + MimeTypes.get(NutchConf.get().get("mime.types.file")); + private String url; private String base; private byte[] content; @@ -38,18 +50,17 @@ public Content() {} public Content(String url, String base, byte[] content, String contentType, - Properties metadata){ + Properties metadata) { if (url == null) throw new IllegalArgumentException("null url"); if (base == null) throw new IllegalArgumentException("null base"); if (content == null) throw new IllegalArgumentException("null content"); - if (contentType == null) throw new IllegalArgumentException("null type"); if (metadata == null) throw new IllegalArgumentException("null metadata"); this.url = url; this.base = base; this.content = content; - this.contentType = contentType; + this.contentType = getContentType(contentType, url, content); this.metadata = metadata; } @@ -185,4 +196,33 @@ nfs.close(); } } + + private String getContentType(String typeName, String url, byte[] data) { + + MimeType type = null; + try { + typeName = MimeType.clean(typeName); + type = typeName == null ? null : MIME.forName(typeName); + } catch (MimeTypeException mte) { + // Seems to be a malformed mime type name... + } + + if (typeName == null || type == null || !type.matches(url)) { + // If no mime-type header, or cannot find a corresponding registered + // mime-type, or the one found doesn't match the url pattern + // it should be, then guess a mime-type from the url pattern + type = MIME.getMimeType(url); + typeName = type == null ? typeName : type.getName(); + } + if (typeName == null || type == null || + (MAGIC && type.hasMagic() && !type.matches(data))) { + // If no mime-type already found, or the one found doesn't match + // the magic bytes it should be, then, guess a mime-type from the + // document content (magic bytes) + type = MIME.getMimeType(data); + typeName = type == null ? 
typeName : type.getName(); + } + return typeName; + } + } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java?rev=355809&r1=355808&r2=355809&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java Sat Dec 10 15:47:18 2005 @@ -227,11 +227,21 @@ return minLength; } - boolean hasMagic() { + public boolean hasMagic() { return (magics.size() > 0); } - boolean matches(byte[] data) { + public boolean matches(String url) { + boolean match = false; + int index = url.lastIndexOf('.'); + if ((index != -1) && (index < url.length()-1)) { + // There's an extension, so try to find if it matches one of ours + match = extensions.contains(url.substring(index + 1)); + } + return match; + } + + public boolean matches(byte[] data) { if (!hasMagic()) { return false; } Magic tested = null; Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeTypes.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeTypes.java?rev=355809&r1=355808&r2=355809&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeTypes.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeTypes.java Sat Dec 10 15:47:18 2005 @@ -39,7 +39,10 @@ public final static String DEFAULT = "application/octet-stream"; /** All the registered MimeTypes */ - private ArrayList types = new ArrayList(); + private ArrayList types = new ArrayList(); + + /** All the registered MimeType indexed by name */ + private HashMap typesIdx = new HashMap(); /** MimeTypes indexed on the file extension */ private Map extIdx = new HashMap(); @@ -211,7 +214,14 @@ } 
return mimeType; } - + + /** + * Return a MimeType from its name. + */ + public MimeType forName(String name) { + return (MimeType) typesIdx.get(name); + } + /** * Return the minimum length of data to provide to analyzing methods * based on the document's content in order to check all the known @@ -241,6 +251,7 @@ * @param type is the mime-type to add. */ void add(MimeType type) { + typesIdx.put(type.getName(), type); types.add(type); // Update minLentgth minLength = Math.max(minLength, type.getMinLength()); Modified: lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=355809&r1=355808&r2=355809&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Sat Dec 10 15:47:18 2005 @@ -25,9 +25,6 @@ // Nutch imports import org.apache.nutch.protocol.Content; -import org.apache.nutch.util.NutchConf; -import org.apache.nutch.util.mime.MimeType; -import org.apache.nutch.util.mime.MimeTypes; /************************************ @@ -58,15 +55,6 @@ ***********************************/ public class FileResponse { - /** A flag that tells if magic resolution must be performed */ - private final static boolean MAGIC = - NutchConf.get().getBoolean("mime.type.magic", true); - - /** Get the MimeTypes resolver instance. 
*/ - private final static MimeTypes MIME = - MimeTypes.get(NutchConf.get().get("mime.types.file")); - - private String orig; private String base; private byte[] content; @@ -201,15 +189,8 @@ hdrs.put("Last-Modified", this.file.httpDateFormat.toString(f.lastModified())); - MimeType contentType = null; - if (MAGIC) { - contentType = MIME.getMimeType(f.getName(), this.content); - } else { - contentType = MIME.getMimeType(f.getName()); - } - if (contentType != null) { - hdrs.put("Content-Type", contentType.getName()); - } + hdrs.put("Content-Type", ""); // No Content-Type at file protocol level + this.headers.putAll(hdrs); // response code Modified: lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=355809&r1=355808&r2=355809&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java Sat Dec 10 15:47:18 2005 @@ -26,10 +26,6 @@ import org.apache.nutch.protocol.Content; -import org.apache.nutch.util.NutchConf; -import org.apache.nutch.util.mime.MimeType; -import org.apache.nutch.util.mime.MimeTypes; - import java.net.InetAddress; import java.net.URL; @@ -58,15 +54,7 @@ * @author John Xing ***********************************/ public class FtpResponse { - - /** A flag that tells if magic resolution must be performed */ - private final static boolean MAGIC = - NutchConf.get().getBoolean("mime.type.magic", true); - - /** Get the MimeTypes resolver instance. 
*/ - private final static MimeTypes MIME = - MimeTypes.get(NutchConf.get().get("mime.types.file")); - + private String orig; private String base; private byte[] content; @@ -314,16 +302,6 @@ ftp.httpDateFormat.toString(ftpFile.getTimestamp())); this.content = os.toByteArray(); - MimeType contentType = null; - if (MAGIC) { - contentType = MIME.getMimeType(path, this.content); - } else { - contentType = MIME.getMimeType(path); - } - if (contentType != null) { - this.headers.put("Content-Type", contentType.getName()); - } - // // approximate bytes sent and read // if (this.httpAccounting != null) { // this.httpAccounting.incrementBytesSent(path.length()); @@ -359,16 +337,6 @@ this.headers.put("Last-Modified", ftp.httpDateFormat.toString(ftpFile.getTimestamp())); this.content = os.toByteArray(); - - MimeType contentType = null; - if (MAGIC) { - contentType = MIME.getMimeType(path, this.content); - } else { - contentType = MIME.getMimeType(path); - } - if (contentType != null) { - this.headers.put("Content-Type", contentType.getName()); - } // // approximate bytes sent and read // if (this.httpAccounting != null) { Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=355809&r1=355808&r2=355809&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Sat Dec 10 15:47:18 2005 @@ -35,22 +35,10 @@ import org.apache.nutch.protocol.ProtocolException; import org.apache.nutch.util.GZIPUtils; -import org.apache.nutch.util.NutchConf; -import org.apache.nutch.util.mime.MimeType; -import org.apache.nutch.util.mime.MimeTypes; /** An 
HTTP response. */ public class HttpResponse { - - /** A flag that tells if magic resolution must be performed */ - private final static boolean MAGIC = - NutchConf.get().getBoolean("mime.type.magic", true); - - /** Get the MimeTypes resolver instance. */ - private final static MimeTypes MIME = - MimeTypes.get(NutchConf.get().get("mime.types.file")); - private String orig; private String base; @@ -69,21 +57,9 @@ public byte[] getContent() { return content; } public Content toContent() { - String contentType = getHeader("Content-Type"); - if (contentType == null) { - MimeType type = null; - if (MAGIC) { - type = MIME.getMimeType(orig, content); - } else { - type = MIME.getMimeType(orig); - } - if (type != null) { - contentType = type.getName(); - } else { - contentType = ""; - } - } - return new Content(orig, base, content, contentType, headers); + return new Content(orig, base, content, + getHeader("Content-Type"), + headers); } public HttpResponse(URL url) throws ProtocolException, IOException { Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=355809&r1=355808&r2=355809&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Sat Dec 10 15:47:18 2005 @@ -4,9 +4,6 @@ package org.apache.nutch.protocol.httpclient; import org.apache.nutch.protocol.Content; -import org.apache.nutch.util.NutchConf; -import org.apache.nutch.util.mime.MimeType; -import org.apache.nutch.util.mime.MimeTypes; import org.apache.commons.httpclient.Header; import 
org.apache.commons.httpclient.HttpVersion; @@ -24,13 +21,6 @@ * An HTTP response. */ public class HttpResponse { - /** A flag that tells if magic resolution must be performed */ - private final static boolean MAGIC = - NutchConf.get().getBoolean("mime.type.magic", true); - - /** Get the MimeTypes resolver instance. */ - private final static MimeTypes MIME = - MimeTypes.get(NutchConf.get().get("mime.types.file")); private String orig; @@ -63,22 +53,10 @@ } public Content toContent() { - String contentType = getHeader("Content-Type"); - if (contentType == null) { - MimeType type = null; - if (MAGIC) { - type = MIME.getMimeType(orig, content); - } else { - type = MIME.getMimeType(orig); - } - if (type != null) { - contentType = type.getName(); - } else { - contentType = ""; - } - } - if (content == null) content = EMPTY_CONTENT; - return new Content(orig, base, content, contentType, headers); + return new Content(orig, base, + (content == null ? EMPTY_CONTENT : content), + getHeader("Content-Type"), + headers); } public HttpResponse(URL url) throws IOException { Modified: lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java?rev=355809&r1=355808&r2=355809&view=diff ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Sat Dec 10 15:47:18 2005 @@ -42,5 +42,59 @@ TestWritable.testWritable(r); } + + /** Unit tests for getContentType(String, String, byte[]) method. 
*/ + public void testGetContentType() throws Exception { + Content c = null; + Properties p = new Properties(); + + c = new Content("http://www.foo.com/", + "http://www.foo.com/", + "".getBytes("UTF8"), + "text/html; charset=UTF-8", p); + assertEquals("text/html", c.getContentType()); + + c = new Content("http://www.foo.com/foo.html", + "http://www.foo.com/", + "".getBytes("UTF8"), + "", p); + assertEquals("text/html", c.getContentType()); + + c = new Content("http://www.foo.com/foo.html", + "http://www.foo.com/", + "".getBytes("UTF8"), + null, p); + assertEquals("text/html", c.getContentType()); + + c = new Content("http://www.foo.com/", + "http://www.foo.com/", + "<html></html>".getBytes("UTF8"), + "", p); + assertEquals("text/html", c.getContentType()); + + c = new Content("http://www.foo.com/foo.html", + "http://www.foo.com/", + "<html></html>".getBytes("UTF8"), + "text/plain", p); + assertEquals("text/html", c.getContentType()); + + c = new Content("http://www.foo.com/foo.png", + "http://www.foo.com/", + "<html></html>".getBytes("UTF8"), + "text/plain", p); + assertEquals("text/html", c.getContentType()); + + c = new Content("http://www.foo.com/", + "http://www.foo.com/", + "".getBytes("UTF8"), + "", p); + assertEquals("", c.getContentType()); + + c = new Content("http://www.foo.com/", + "http://www.foo.com/", + "".getBytes("UTF8"), + null, p); + assertNull(c.getContentType()); + } }