Author: siren
Date: Thu Feb 19 10:25:47 2009
New Revision: 745808

URL: http://svn.apache.org/viewvc?rev=745808&view=rev
Log:
NUTCH-695 - incorrect mime type detection by MoreIndexingFilter plugin, 
contributed by Dmitry Lihachev

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
    lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745808&r1=745807&r2=745808&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu Feb 19 10:25:47 2009
@@ -348,6 +348,9 @@
 
 130. NUTCH-563 - Include custom fields in BasicQueryFilter
      (Julien Nioche via siren)
+     
+131. NUTCH-695 - incorrect mime type detection by MoreIndexingFilter plugin
+     (Dmitry Lihachev via siren)
 
 Release 0.9 - 2007-04-02
 

Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=745808&r1=745807&r2=745808&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Thu Feb 19 10:25:47 2009
@@ -199,20 +199,20 @@
     MimeType mimeType = null;
     String contentType = data.getMeta(Response.CONTENT_TYPE);
     if (contentType == null) {
-        // Note by Jerome Charron on 20050415:
-        // Content Type not solved by a previous plugin
-        // Or unable to solve it... Trying to find it
-        // Should be better to use the doc content too
-        // (using MimeTypes.getMimeType(byte[], String), but I don't know
-        // which field it is?
-        // if (MAGIC) {
-        //   contentType = MIME.getMimeType(url, content);
-        // } else {
-        //   contentType = MIME.getMimeType(url);
-        // }
-        mimeType = MIME.getMimeType(url);
+      // Note by Jerome Charron on 20050415:
+      // Content Type not solved by a previous plugin
+      // Or unable to solve it... Trying to find it
+      // Should be better to use the doc content too
+      // (using MimeTypes.getMimeType(byte[], String), but I don't know
+      // which field it is?
+      // if (MAGIC) {
+      //   contentType = MIME.getMimeType(url, content);
+      // } else {
+      //   contentType = MIME.getMimeType(url);
+      // }
+      mimeType = MIME.getMimeType(url);
     } else {
-            mimeType = MIME.forName(contentType);
+      mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
     }
         
     // Checks if we solved the content-type.

Modified: lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java?rev=745808&r1=745807&r2=745808&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java Thu Feb 19 10:25:47 2009
@@ -16,10 +16,30 @@
  */
 package org.apache.nutch.indexer.more;
 
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+
 import junit.framework.TestCase;
 
 public class TestMoreIndexingFilter extends TestCase {
 
+  public void testContentType() throws IndexingException {
+    Configuration conf = NutchConfiguration.create();
+    assertContentType(conf, "text/html", "text/html");
+    assertContentType(conf, "text/html; charset=UTF-8", "text/html");
+  }
+  
   public void testGetParts() {
     String[] parts = MoreIndexingFilter.getParts("text/html");
     assertParts(parts, 2, "text", "html");
@@ -32,4 +52,15 @@
       assertEquals(expected[i], parts[i]);
     }
   }
+  
+  private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
+    Metadata metadata = new Metadata();
+    metadata.add(Response.CONTENT_TYPE, source);
+    MoreIndexingFilter filter = new MoreIndexingFilter();
+    filter.setConf(conf);
+    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
+        new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
+        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
+    assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
+  }
 }


Reply via email to