Author: jerome
Date: Sat Dec 10 15:47:18 2005
New Revision: 355809

URL: http://svn.apache.org/viewcvs?rev=355809&view=rev
Log:
Content-Type resolution enhancements:
* Resolution moved from protocol plugins to Content constructor
* Best content-type guessing policy
* Some unit tests added

Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
    lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java
    lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeTypes.java
    
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
    
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
    
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
    
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
    lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?rev=355809&r1=355808&r2=355809&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Sat Dec 
10 15:47:18 2005
@@ -22,6 +22,10 @@
 import org.apache.nutch.io.*;
 import org.apache.nutch.fs.*;
 import org.apache.nutch.util.*;
+import org.apache.nutch.util.mime.MimeType;
+import org.apache.nutch.util.mime.MimeTypes;
+import org.apache.nutch.util.mime.MimeTypeException;
+
 
 public final class Content extends VersionedWritable {
 
@@ -29,6 +33,14 @@
 
   private final static byte VERSION = 1;
 
+  /** A flag that tells if magic resolution must be performed */
+  private final static boolean MAGIC =
+        NutchConf.get().getBoolean("mime.type.magic", true);
+
+  /** Get the MimeTypes resolver instance. */
+  private final static MimeTypes MIME = 
+        MimeTypes.get(NutchConf.get().get("mime.types.file"));
+
   private String url;
   private String base;
   private byte[] content;
@@ -38,18 +50,17 @@
   public Content() {}
     
   public Content(String url, String base, byte[] content, String contentType,
-                 Properties metadata){
+                 Properties metadata) {
 
     if (url == null) throw new IllegalArgumentException("null url");
     if (base == null) throw new IllegalArgumentException("null base");
     if (content == null) throw new IllegalArgumentException("null content");
-    if (contentType == null) throw new IllegalArgumentException("null type");
     if (metadata == null) throw new IllegalArgumentException("null metadata");
 
     this.url = url;
     this.base = base;
     this.content = content;
-    this.contentType = contentType;
+    this.contentType = getContentType(contentType, url, content);
     this.metadata = metadata;
   }
 
@@ -185,4 +196,33 @@
       nfs.close();
     }
   }
+
+  private String getContentType(String typeName, String url, byte[] data) {
+    
+    MimeType type = null;
+    try {
+        typeName = MimeType.clean(typeName);
+        type = typeName == null ? null : MIME.forName(typeName);
+    } catch (MimeTypeException mte) {
+        // Seems to be a malformed mime type name...
+    }
+
+    if (typeName == null || type == null || !type.matches(url)) {
+      // If no mime-type header, or cannot find a corresponding registered
+      // mime-type, or the one found doesn't match the url pattern
+      // it should be, then guess a mime-type from the url pattern
+      type = MIME.getMimeType(url);
+      typeName = type == null ? typeName : type.getName();
+    }
+    if (typeName == null || type == null ||
+        (MAGIC && type.hasMagic() && !type.matches(data))) {
+      // If no mime-type already found, or the one found doesn't match
+      // the magic bytes it should be, then, guess a mime-type from the
+      // document content (magic bytes)
+      type = MIME.getMimeType(data);
+      typeName = type == null ? typeName : type.getName();
+    }
+    return typeName;
+  }
+
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java?rev=355809&r1=355808&r2=355809&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeType.java Sat 
Dec 10 15:47:18 2005
@@ -227,11 +227,21 @@
         return minLength;
     }
     
-    boolean hasMagic() {
+    public boolean hasMagic() {
         return (magics.size() > 0);
     }
     
-    boolean matches(byte[] data) {
+    public boolean matches(String url) {
+        boolean match = false;
+        int index = url.lastIndexOf('.');
+        if ((index != -1) && (index < url.length()-1)) {
+            // There's an extension, so try to find if it matches one of mine
+            match = extensions.contains(url.substring(index + 1));
+         }
+         return match;
+    }
+
+    public boolean matches(byte[] data) {
         if (!hasMagic()) { return false; }
         
         Magic tested = null;

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeTypes.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeTypes.java?rev=355809&r1=355808&r2=355809&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeTypes.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/MimeTypes.java Sat 
Dec 10 15:47:18 2005
@@ -39,7 +39,10 @@
     public final static String DEFAULT = "application/octet-stream";
 
     /** All the registered MimeTypes */
-    private ArrayList types = new ArrayList();    
+    private ArrayList types = new ArrayList();
+
+    /** All the registered MimeType indexed by name */
+    private HashMap typesIdx = new HashMap();
 
     /** MimeTypes indexed on the file extension */
     private Map extIdx = new HashMap();
@@ -211,7 +214,14 @@
         }
         return mimeType;
     }
-    
+   
+   /**
+    * Return a MimeType from its name.
+    */
+   public MimeType forName(String name) {
+      return (MimeType) typesIdx.get(name);
+   }
+
     /**
      * Return the minimum length of data to provide to analyzing methods
      * based on the document's content in order to check all the known
@@ -241,6 +251,7 @@
      * @param type is the mime-type to add.
      */
     void add(MimeType type) {
+        typesIdx.put(type.getName(), type);
         types.add(type);
         // Update minLentgth
         minLength = Math.max(minLength, type.getMinLength());

Modified: 
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=355809&r1=355808&r2=355809&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
 Sat Dec 10 15:47:18 2005
@@ -25,9 +25,6 @@
 
 // Nutch imports
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
 
 
 /************************************
@@ -58,15 +55,6 @@
  ***********************************/
 public class FileResponse {
 
-  /** A flag that tells if magic resolution must be performed */
-  private final static boolean MAGIC =
-        NutchConf.get().getBoolean("mime.type.magic", true);
-
-  /** Get the MimeTypes resolver instance. */
-  private final static MimeTypes MIME = 
-        MimeTypes.get(NutchConf.get().get("mime.types.file"));
-
-
   private String orig;
   private String base;
   private byte[] content;
@@ -201,15 +189,8 @@
     hdrs.put("Last-Modified",
       this.file.httpDateFormat.toString(f.lastModified()));
 
-    MimeType contentType = null;
-    if (MAGIC) {
-      contentType = MIME.getMimeType(f.getName(), this.content);
-    } else {
-      contentType = MIME.getMimeType(f.getName());
-    }
-    if (contentType != null) {
-        hdrs.put("Content-Type", contentType.getName());
-    }
+    hdrs.put("Content-Type", "");   // No Content-Type at file protocol level
+
     this.headers.putAll(hdrs);
 
     // response code

Modified: 
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=355809&r1=355808&r2=355809&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
 Sat Dec 10 15:47:18 2005
@@ -26,10 +26,6 @@
 
 import org.apache.nutch.protocol.Content;
 
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
-
 import java.net.InetAddress;
 import java.net.URL;
 
@@ -58,15 +54,7 @@
  * @author John Xing
  ***********************************/
 public class FtpResponse {
-    
-  /** A flag that tells if magic resolution must be performed */
-  private final static boolean MAGIC =
-        NutchConf.get().getBoolean("mime.type.magic", true);
-
-  /** Get the MimeTypes resolver instance. */
-  private final static MimeTypes MIME = 
-        MimeTypes.get(NutchConf.get().get("mime.types.file"));
-    
+
   private String orig;
   private String base;
   private byte[] content;
@@ -314,16 +302,6 @@
         ftp.httpDateFormat.toString(ftpFile.getTimestamp()));
       this.content = os.toByteArray();
 
-      MimeType contentType = null;
-      if (MAGIC) {
-        contentType = MIME.getMimeType(path, this.content);
-      } else {
-        contentType = MIME.getMimeType(path);
-      }
-      if (contentType != null) {
-        this.headers.put("Content-Type", contentType.getName());
-      }
-
 //      // approximate bytes sent and read
 //      if (this.httpAccounting != null) {
 //        this.httpAccounting.incrementBytesSent(path.length());
@@ -359,16 +337,6 @@
       this.headers.put("Last-Modified",
         ftp.httpDateFormat.toString(ftpFile.getTimestamp()));
       this.content = os.toByteArray();
-
-      MimeType contentType = null;
-      if (MAGIC) {
-        contentType = MIME.getMimeType(path, this.content);
-      } else {
-        contentType = MIME.getMimeType(path);
-      }
-      if (contentType != null) {
-        this.headers.put("Content-Type", contentType.getName());
-      }
 
 //      // approximate bytes sent and read
 //      if (this.httpAccounting != null) {

Modified: 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=355809&r1=355808&r2=355809&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
 Sat Dec 10 15:47:18 2005
@@ -35,22 +35,10 @@
 import org.apache.nutch.protocol.ProtocolException;
 
 import org.apache.nutch.util.GZIPUtils;
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
 
 
 /** An HTTP response. */
 public class HttpResponse {
-
-  /** A flag that tells if magic resolution must be performed */
-  private final static boolean MAGIC =
-        NutchConf.get().getBoolean("mime.type.magic", true);
-
-  /** Get the MimeTypes resolver instance. */
-  private final static MimeTypes MIME = 
-        MimeTypes.get(NutchConf.get().get("mime.types.file"));
-
   
   private String orig;
   private String base;
@@ -69,21 +57,9 @@
   public byte[] getContent() { return content; }
 
   public Content toContent() {
-    String contentType = getHeader("Content-Type");
-    if (contentType == null) {
-      MimeType type = null;
-      if (MAGIC) {
-        type = MIME.getMimeType(orig, content);
-      } else {
-        type = MIME.getMimeType(orig);
-      }
-      if (type != null) {
-          contentType = type.getName();
-      } else {
-          contentType = "";
-      }
-    }
-    return new Content(orig, base, content, contentType, headers);
+    return new Content(orig, base, content,
+                       getHeader("Content-Type"),
+                       headers);
   }
 
   public HttpResponse(URL url) throws ProtocolException, IOException {

Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=355809&r1=355808&r2=355809&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 Sat Dec 10 15:47:18 2005
@@ -4,9 +4,6 @@
 package org.apache.nutch.protocol.httpclient;
 
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
 
 import org.apache.commons.httpclient.Header;
 import org.apache.commons.httpclient.HttpVersion;
@@ -24,13 +21,6 @@
  * An HTTP response.
  */
 public class HttpResponse {
-  /** A flag that tells if magic resolution must be performed */
-  private final static boolean MAGIC =
-        NutchConf.get().getBoolean("mime.type.magic", true);
-
-  /** Get the MimeTypes resolver instance. */
-  private final static MimeTypes MIME = 
-        MimeTypes.get(NutchConf.get().get("mime.types.file"));
 
   private String orig;
 
@@ -63,22 +53,10 @@
   }
 
   public Content toContent() {
-    String contentType = getHeader("Content-Type");
-    if (contentType == null) {
-      MimeType type = null;
-      if (MAGIC) {
-        type = MIME.getMimeType(orig, content);
-      } else {
-        type = MIME.getMimeType(orig);
-      }
-      if (type != null) {
-          contentType = type.getName();
-      } else {
-          contentType = "";
-      }
-    }
-    if (content == null) content = EMPTY_CONTENT;
-    return new Content(orig, base, content, contentType, headers);
+    return new Content(orig, base,
+                       (content == null ? EMPTY_CONTENT : content),
+                       getHeader("Content-Type"),
+                       headers);
   }
 
   public HttpResponse(URL url) throws IOException {

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java?rev=355809&r1=355808&r2=355809&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java 
(original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Sat 
Dec 10 15:47:18 2005
@@ -42,5 +42,59 @@
                         
     TestWritable.testWritable(r);
   }
+
+  /** Unit tests for getContentType(String, String, byte[]) method. */
+  public void testGetContentType() throws Exception {
+    Content c = null;
+    Properties p = new Properties();
+
+    c = new Content("http://www.foo.com/",
+                    "http://www.foo.com/",
+                    "".getBytes("UTF8"),
+                    "text/html; charset=UTF-8", p);
+    assertEquals("text/html", c.getContentType());
+
+    c = new Content("http://www.foo.com/foo.html",
+                    "http://www.foo.com/",
+                    "".getBytes("UTF8"),
+                    "", p);
+    assertEquals("text/html", c.getContentType());
+
+    c = new Content("http://www.foo.com/foo.html",
+                    "http://www.foo.com/",
+                    "".getBytes("UTF8"),
+                    null, p);
+    assertEquals("text/html", c.getContentType());
+
+    c = new Content("http://www.foo.com/",
+                    "http://www.foo.com/",
+                    "<html></html>".getBytes("UTF8"),
+                    "", p);
+    assertEquals("text/html", c.getContentType());
+
+    c = new Content("http://www.foo.com/foo.html",
+                    "http://www.foo.com/",
+                    "<html></html>".getBytes("UTF8"),
+                    "text/plain", p);
+    assertEquals("text/html", c.getContentType());
+
+    c = new Content("http://www.foo.com/foo.png",
+                    "http://www.foo.com/",
+                    "<html></html>".getBytes("UTF8"),
+                    "text/plain", p);
+    assertEquals("text/html", c.getContentType());
+
+    c = new Content("http://www.foo.com/",
+                    "http://www.foo.com/",
+                    "".getBytes("UTF8"),
+                    "", p);
+    assertEquals("", c.getContentType());
+
+    c = new Content("http://www.foo.com/",
+                    "http://www.foo.com/",
+                    "".getBytes("UTF8"),
+                    null, p);
+    assertNull(c.getContentType());
+  }
        
 }


Reply via email to