Author: snagel
Date: Wed Jan 7 22:25:18 2015
New Revision: 1650181

URL: http://svn.apache.org/r1650181
Log:
NUTCH-1140 index-more plugin, resetTitle creates multiple values in title field

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
    nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1650181&r1=1650180&r2=1650181&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jan 7 22:25:18 2015
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1140 index-more plugin, resetTitle creates multiple values in title field (Joe Liedtke, kaveh minooie via snagel)
+
 * NUTCH-1904 Schema for Solr4 doesn't include _version_ field (mattmann)
 
 * NUTCH-1897 Easier debugging of plugin XML errors (markus)

Modified: nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1650181&r1=1650180&r2=1650181&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Wed Jan 7 22:25:18 2015
@@ -289,7 +289,7 @@ public class MoreIndexingFilter implemen
 
   private NutchDocument resetTitle(NutchDocument doc, ParseData data, String url) {
     String contentDisposition = data.getMeta(Metadata.CONTENT_DISPOSITION);
-    if (contentDisposition == null)
+    if (contentDisposition == null || doc.getFieldValue("title") != null)
       return doc;
 
     for (int i=0; i<patterns.length; i++) {

Modified: nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java?rev=1650181&r1=1650180&r2=1650181&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java Wed Jan 7 22:25:18 2015
@@ -82,11 +82,21 @@ public class TestMoreIndexingFilter {
     MoreIndexingFilter filter = new MoreIndexingFilter();
     filter.setConf(conf);
 
-    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
-      new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
-        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
+    Text url = new Text("http://www.example.com/");
+    ParseImpl parseImpl = new ParseImpl("text", new ParseData(
+      new ParseStatus(), "title", new Outlink[0], metadata));
+
+    NutchDocument doc = new NutchDocument();
+    doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
 
     Assert.assertEquals("content-disposition not detected", "filename.ext", doc.getFieldValue("title"));
+
+    /* NUTCH-1140: do not add second title to avoid a multi-valued title field */
+    doc = new NutchDocument();
+    doc.add("title", "title");
+    doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
+    Assert.assertEquals("do not add second title by content-disposition",
+      "title", doc.getFieldValue("title"));
   }
 
   private void assertParts(String[] parts, int count, String... expected) {