Author: jerome
Date: Sat Dec 10 15:47:18 2005
New Revision: 355809
URL: http://svn.apache.org/viewcvs?rev=355809&view=rev
Log:
Content-Type resolution enhancements:
* Resolution moved from protocol plugins to Content constructor
* Best content-type guessing policy
* Some unit tests added
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeTypes.java
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?rev=355809&r1=355808&r2=355809&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Sat Dec
10 15:47:18 2005
@@ -22,6 +22,10 @@
import org.apache.nutch.io.*;
import org.apache.nutch.fs.*;
import org.apache.nutch.util.*;
+import org.apache.nutch.util.mime.MimeType;
+import org.apache.nutch.util.mime.MimeTypes;
+import org.apache.nutch.util.mime.MimeTypeException;
+
public final class Content extends VersionedWritable {
@@ -29,6 +33,14 @@
private final static byte VERSION = 1;
+ /** A flag that tells if magic resolution must be performed */
+ private final static boolean MAGIC =
+ NutchConf.get().getBoolean("mime.type.magic", true);
+
+ /** Get the MimeTypes resolver instance. */
+ private final static MimeTypes MIME =
+ MimeTypes.get(NutchConf.get().get("mime.types.file"));
+
private String url;
private String base;
private byte[] content;
@@ -38,18 +50,17 @@
public Content() {}
public Content(String url, String base, byte[] content, String contentType,
- Properties metadata){
+ Properties metadata) {
if (url == null) throw new IllegalArgumentException("null url");
if (base == null) throw new IllegalArgumentException("null base");
if (content == null) throw new IllegalArgumentException("null content");
- if (contentType == null) throw new IllegalArgumentException("null type");
if (metadata == null) throw new IllegalArgumentException("null metadata");
this.url = url;
this.base = base;
this.content = content;
- this.contentType = contentType;
+ this.contentType = getContentType(contentType, url, content);
this.metadata = metadata;
}
@@ -185,4 +196,33 @@
nfs.close();
}
}
+
+ private String getContentType(String typeName, String url, byte[] data) {
+ // Resolve the content-type name from: declared name, then url, then data.
+ MimeType type = null;
+ try {
+ typeName = MimeType.clean(typeName);
+ type = typeName == null ? null : MIME.forName(typeName);
+ } catch (MimeTypeException mte) {
+ // Seems to be a malformed mime type name: nothing is done, type stays null
+ }
+ // Below, a guess only replaces typeName when it yields a type (see ternaries).
+ if (typeName == null || type == null || !type.matches(url)) {
+ // If there is no mime-type header, or no registered mime-type
+ // corresponds to it, or the one found doesn't match the url pattern
+ // it should be, then guess a mime-type from the url pattern
+ type = MIME.getMimeType(url);
+ typeName = type == null ? typeName : type.getName();
+ }
+ if (typeName == null || type == null || // NOTE(review): these two tests are not gated by MAGIC - confirm intended
+ (MAGIC && type.hasMagic() && !type.matches(data))) {
+ // If no mime-type has been found yet, or the one found doesn't match
+ // the magic bytes it should be, then guess a mime-type from the
+ // document content (magic bytes)
+ type = MIME.getMimeType(data);
+ typeName = type == null ? typeName : type.getName();
+ }
+ return typeName;
+ }
+
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java?rev=355809&r1=355808&r2=355809&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java Sat
Dec 10 15:47:18 2005
@@ -227,11 +227,21 @@
return minLength;
}
- boolean hasMagic() {
+ public boolean hasMagic() {
return (magics.size() > 0);
}
- boolean matches(byte[] data) {
+ public boolean matches(String url) { // true when the text after url's last '.' is in extensions
+ boolean match = false;
+ int index = url.lastIndexOf('.'); // NOTE(review): searches the whole url string, not only its path - confirm
+ if ((index != -1) && (index < url.length()-1)) { // a '.' exists and is not the last character
+ // Text after the last '.' is taken as the extension: check whether it is one of mine
+ match = extensions.contains(url.substring(index + 1));
+ }
+ return match;
+ }
+
+ public boolean matches(byte[] data) {
if (!hasMagic()) { return false; }
Magic tested = null;
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeTypes.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeTypes.java?rev=355809&r1=355808&r2=355809&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeTypes.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeTypes.java Sat
Dec 10 15:47:18 2005
@@ -39,7 +39,10 @@
public final static String DEFAULT = "application/octet-stream";
/** All the registered MimeTypes */
- private ArrayList types = new ArrayList();
+ private ArrayList types = new ArrayList();
+
+ /** All the registered MimeType indexed by name */
+ private HashMap typesIdx = new HashMap();
/** MimeTypes indexed on the file extension */
private Map extIdx = new HashMap();
@@ -211,7 +214,14 @@
}
return mimeType;
}
-
+
+ /**
+ * Return the MimeType stored in typesIdx under this name (null if there is none).
+ */
+ public MimeType forName(String name) {
+ return (MimeType) typesIdx.get(name); // plain HashMap lookup, hence the cast
+ }
+
/**
* Return the minimum length of data to provide to analyzing methods
* based on the document's content in order to check all the known
@@ -241,6 +251,7 @@
* @param type is the mime-type to add.
*/
void add(MimeType type) {
+ typesIdx.put(type.getName(), type);
types.add(type);
// Update minLength
minLength = Math.max(minLength, type.getMinLength());
Modified:
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=355809&r1=355808&r2=355809&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
Sat Dec 10 15:47:18 2005
@@ -25,9 +25,6 @@
// Nutch imports
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
/************************************
@@ -58,15 +55,6 @@
***********************************/
public class FileResponse {
- /** A flag that tells if magic resolution must be performed */
- private final static boolean MAGIC =
- NutchConf.get().getBoolean("mime.type.magic", true);
-
- /** Get the MimeTypes resolver instance. */
- private final static MimeTypes MIME =
- MimeTypes.get(NutchConf.get().get("mime.types.file"));
-
-
private String orig;
private String base;
private byte[] content;
@@ -201,15 +189,8 @@
hdrs.put("Last-Modified",
this.file.httpDateFormat.toString(f.lastModified()));
- MimeType contentType = null;
- if (MAGIC) {
- contentType = MIME.getMimeType(f.getName(), this.content);
- } else {
- contentType = MIME.getMimeType(f.getName());
- }
- if (contentType != null) {
- hdrs.put("Content-Type", contentType.getName());
- }
+ hdrs.put("Content-Type", ""); // No Content-Type at file protocol level
+
this.headers.putAll(hdrs);
// response code
Modified:
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=355809&r1=355808&r2=355809&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
Sat Dec 10 15:47:18 2005
@@ -26,10 +26,6 @@
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
-
import java.net.InetAddress;
import java.net.URL;
@@ -58,15 +54,7 @@
* @author John Xing
***********************************/
public class FtpResponse {
-
- /** A flag that tells if magic resolution must be performed */
- private final static boolean MAGIC =
- NutchConf.get().getBoolean("mime.type.magic", true);
-
- /** Get the MimeTypes resolver instance. */
- private final static MimeTypes MIME =
- MimeTypes.get(NutchConf.get().get("mime.types.file"));
-
+
private String orig;
private String base;
private byte[] content;
@@ -314,16 +302,6 @@
ftp.httpDateFormat.toString(ftpFile.getTimestamp()));
this.content = os.toByteArray();
- MimeType contentType = null;
- if (MAGIC) {
- contentType = MIME.getMimeType(path, this.content);
- } else {
- contentType = MIME.getMimeType(path);
- }
- if (contentType != null) {
- this.headers.put("Content-Type", contentType.getName());
- }
-
// // approximate bytes sent and read
// if (this.httpAccounting != null) {
// this.httpAccounting.incrementBytesSent(path.length());
@@ -359,16 +337,6 @@
this.headers.put("Last-Modified",
ftp.httpDateFormat.toString(ftpFile.getTimestamp()));
this.content = os.toByteArray();
-
- MimeType contentType = null;
- if (MAGIC) {
- contentType = MIME.getMimeType(path, this.content);
- } else {
- contentType = MIME.getMimeType(path);
- }
- if (contentType != null) {
- this.headers.put("Content-Type", contentType.getName());
- }
// // approximate bytes sent and read
// if (this.httpAccounting != null) {
Modified:
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=355809&r1=355808&r2=355809&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
Sat Dec 10 15:47:18 2005
@@ -35,22 +35,10 @@
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.util.GZIPUtils;
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
/** An HTTP response. */
public class HttpResponse {
-
- /** A flag that tells if magic resolution must be performed */
- private final static boolean MAGIC =
- NutchConf.get().getBoolean("mime.type.magic", true);
-
- /** Get the MimeTypes resolver instance. */
- private final static MimeTypes MIME =
- MimeTypes.get(NutchConf.get().get("mime.types.file"));
-
private String orig;
private String base;
@@ -69,21 +57,9 @@
public byte[] getContent() { return content; }
public Content toContent() {
- String contentType = getHeader("Content-Type");
- if (contentType == null) {
- MimeType type = null;
- if (MAGIC) {
- type = MIME.getMimeType(orig, content);
- } else {
- type = MIME.getMimeType(orig);
- }
- if (type != null) {
- contentType = type.getName();
- } else {
- contentType = "";
- }
- }
- return new Content(orig, base, content, contentType, headers);
+ return new Content(orig, base, content,
+ getHeader("Content-Type"),
+ headers);
}
public HttpResponse(URL url) throws ProtocolException, IOException {
Modified:
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=355809&r1=355808&r2=355809&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
Sat Dec 10 15:47:18 2005
@@ -4,9 +4,6 @@
package org.apache.nutch.protocol.httpclient;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpVersion;
@@ -24,13 +21,6 @@
* An HTTP response.
*/
public class HttpResponse {
- /** A flag that tells if magic resolution must be performed */
- private final static boolean MAGIC =
- NutchConf.get().getBoolean("mime.type.magic", true);
-
- /** Get the MimeTypes resolver instance. */
- private final static MimeTypes MIME =
- MimeTypes.get(NutchConf.get().get("mime.types.file"));
private String orig;
@@ -63,22 +53,10 @@
}
public Content toContent() {
- String contentType = getHeader("Content-Type");
- if (contentType == null) {
- MimeType type = null;
- if (MAGIC) {
- type = MIME.getMimeType(orig, content);
- } else {
- type = MIME.getMimeType(orig);
- }
- if (type != null) {
- contentType = type.getName();
- } else {
- contentType = "";
- }
- }
- if (content == null) content = EMPTY_CONTENT;
- return new Content(orig, base, content, contentType, headers);
+ return new Content(orig, base,
+ (content == null ? EMPTY_CONTENT : content),
+ getHeader("Content-Type"),
+ headers);
}
public HttpResponse(URL url) throws IOException {
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java?rev=355809&r1=355808&r2=355809&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
(original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Sat
Dec 10 15:47:18 2005
@@ -42,5 +42,59 @@
TestWritable.testWritable(r);
}
+
+ /** Unit tests for getContentType(String, String, byte[]), exercised through the Content constructor. */
+ public void testGetContentType() throws Exception {
+ Content c = null;
+ Properties p = new Properties();
+ // Declared type carrying a charset parameter, site root url, empty data
+ c = new Content("http://www.foo.com/",
+ "http://www.foo.com/",
+ "".getBytes("UTF8"),
+ "text/html; charset=UTF-8", p);
+ assertEquals("text/html", c.getContentType());
+ // Empty declared type, ".html" url, empty data
+ c = new Content("http://www.foo.com/foo.html",
+ "http://www.foo.com/",
+ "".getBytes("UTF8"),
+ "", p);
+ assertEquals("text/html", c.getContentType());
+ // Null declared type, ".html" url, empty data
+ c = new Content("http://www.foo.com/foo.html",
+ "http://www.foo.com/",
+ "".getBytes("UTF8"),
+ null, p);
+ assertEquals("text/html", c.getContentType());
+ // Empty declared type, site root url, html markup as data
+ c = new Content("http://www.foo.com/",
+ "http://www.foo.com/",
+ "<html></html>".getBytes("UTF8"),
+ "", p);
+ assertEquals("text/html", c.getContentType());
+ // Declared "text/plain", ".html" url, html markup as data
+ c = new Content("http://www.foo.com/foo.html",
+ "http://www.foo.com/",
+ "<html></html>".getBytes("UTF8"),
+ "text/plain", p);
+ assertEquals("text/html", c.getContentType());
+ // Declared "text/plain", ".png" url, html markup as data
+ c = new Content("http://www.foo.com/foo.png",
+ "http://www.foo.com/",
+ "<html></html>".getBytes("UTF8"),
+ "text/plain", p);
+ assertEquals("text/html", c.getContentType());
+ // Empty declared type, site root url, empty data: "" is expected back
+ c = new Content("http://www.foo.com/",
+ "http://www.foo.com/",
+ "".getBytes("UTF8"),
+ "", p);
+ assertEquals("", c.getContentType());
+ // Null declared type, site root url, empty data: null is expected back
+ c = new Content("http://www.foo.com/",
+ "http://www.foo.com/",
+ "".getBytes("UTF8"),
+ null, p);
+ assertNull(c.getContentType());
+ }
}