Author: markus Date: Mon Sep 12 12:16:53 2011 New Revision: 1169707 URL: http://svn.apache.org/viewvc?rev=1169707&view=rev Log: NUTCH-1105 Max content length option for index-basic
Modified: nutch/branches/branch-1.4/CHANGES.txt nutch/branches/branch-1.4/conf/nutch-default.xml nutch/branches/branch-1.4/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Modified: nutch/branches/branch-1.4/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/CHANGES.txt?rev=1169707&r1=1169706&r2=1169707&view=diff ============================================================================== --- nutch/branches/branch-1.4/CHANGES.txt (original) +++ nutch/branches/branch-1.4/CHANGES.txt Mon Sep 12 12:16:53 2011 @@ -2,6 +2,8 @@ Nutch Change Log Release 1.4 - Current development +* NUTCH-1105 Max content length option for index-basic (markus) + * NUTCH-940 static field plugin (Claudio Martella via lewismc) * NUTCH-914 Implement Apache Project Branding Requirements (lewismc) Modified: nutch/branches/branch-1.4/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/conf/nutch-default.xml?rev=1169707&r1=1169706&r2=1169707&view=diff ============================================================================== --- nutch/branches/branch-1.4/conf/nutch-default.xml (original) +++ nutch/branches/branch-1.4/conf/nutch-default.xml Mon Sep 12 12:16:53 2011 @@ -762,6 +762,14 @@ </description> </property> +<property> + <name>indexer.max.content.length</name> + <value>-1</value> + <description>The maximum number of characters of a content that are indexed. + Content beyond the limit is truncated. A value of -1 disables this check. + </description> +</property> + <!-- URL normalizer properties --> <property> Modified: nutch/branches/branch-1.4/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1169707&r1=1169706&r2=1169707&view=diff ============================================================================== --- nutch/branches/branch-1.4/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original) +++ nutch/branches/branch-1.4/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Mon Sep 12 12:16:53 2011 @@ -42,6 +42,7 @@ public class BasicIndexingFilter impleme public static final Log LOG = LogFactory.getLog(BasicIndexingFilter.class); private int MAX_TITLE_LENGTH; + private int MAX_CONTENT_LENGTH; private Configuration conf; public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) @@ -70,8 +71,14 @@ public class BasicIndexingFilter impleme } doc.add("url", reprUrlString == null ? urlString : reprUrlString); - doc.add("content", parse.getText()); - + + // content + String content = parse.getText(); + if (MAX_CONTENT_LENGTH > -1 && content.length() > MAX_CONTENT_LENGTH) { + content = content.substring(0, MAX_CONTENT_LENGTH); + } + doc.add("content", content); + // title String title = parse.getData().getTitle(); if (title.length() > MAX_TITLE_LENGTH) { // truncate title if needed @@ -88,7 +95,7 @@ public class BasicIndexingFilter impleme if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) { doc.add("cache", caching); } - + // add timestamp when fetched, for deduplication doc.add("tstamp", new Date(datum.getFetchTime())); @@ -98,6 +105,7 @@ public class BasicIndexingFilter impleme public void setConf(Configuration conf) { this.conf = conf; this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100); + this.MAX_CONTENT_LENGTH = conf.getInt("indexer.max.content.length", -1); } public Configuration getConf() {