Author: siren
Date: Thu Feb 19 10:25:47 2009
New Revision: 745808

URL: http://svn.apache.org/viewvc?rev=745808&view=rev
Log:
NUTCH-695 - incorrect mime type detection by MoreIndexingFilter plugin, contributed by Dmitry Lihachev

Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745808&r1=745807&r2=745808&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu Feb 19 10:25:47 2009 @@ -348,6 +348,9 @@ 130. NUTCH-563 - Include custom fields in BasicQueryFilter (Julien Nioche via siren) + +131. NUTCH-695 - incorrect mime type detection by MoreIndexingFilter plugin + (Dmitry Lihachev via siren) Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=745808&r1=745807&r2=745808&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original) +++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Thu Feb 19 10:25:47 2009 @@ -199,20 +199,20 @@ MimeType mimeType = null; String contentType = data.getMeta(Response.CONTENT_TYPE); if (contentType == null) { - // Note by Jerome Charron on 20050415: - // Content Type not solved by a previous plugin - // Or unable to solve it... Trying to find it - // Should be better to use the doc content too - // (using MimeTypes.getMimeType(byte[], String), but I don't know - // which field it is? 
- // if (MAGIC) { - // contentType = MIME.getMimeType(url, content); - // } else { - // contentType = MIME.getMimeType(url); - // } - mimeType = MIME.getMimeType(url); + // Note by Jerome Charron on 20050415: + // Content Type not solved by a previous plugin + // Or unable to solve it... Trying to find it + // Should be better to use the doc content too + // (using MimeTypes.getMimeType(byte[], String), but I don't know + // which field it is? + // if (MAGIC) { + // contentType = MIME.getMimeType(url, content); + // } else { + // contentType = MIME.getMimeType(url); + // } + mimeType = MIME.getMimeType(url); } else { - mimeType = MIME.forName(contentType); + mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType)); } // Checks if we solved the content-type. Modified: lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java?rev=745808&r1=745807&r2=745808&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java (original) +++ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java Thu Feb 19 10:25:47 2009 @@ -16,10 +16,30 @@ */ package org.apache.nutch.indexer.more; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; 
+import org.apache.nutch.util.NutchConfiguration; + import junit.framework.TestCase; public class TestMoreIndexingFilter extends TestCase { + public void testContentType() throws IndexingException { + Configuration conf = NutchConfiguration.create(); + assertContentType(conf, "text/html", "text/html"); + assertContentType(conf, "text/html; charset=UTF-8", "text/html"); + } + public void testGetParts() { String[] parts = MoreIndexingFilter.getParts("text/html"); assertParts(parts, 2, "text", "html"); @@ -32,4 +52,15 @@ assertEquals(expected[i], parts[i]); } } + + private void assertContentType(Configuration conf, String source, String expected) throws IndexingException { + Metadata metadata = new Metadata(); + metadata.add(Response.CONTENT_TYPE, source); + MoreIndexingFilter filter = new MoreIndexingFilter(); + filter.setConf(conf); + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData( + new ParseStatus(), "title", new Outlink[0], metadata)), new Text( + "http://www.example.com/"), new CrawlDatum(), new Inlinks()); + assertEquals("mime type not detected", expected, doc.getFieldValue("type")); + } }