Author: markus
Date: Mon Sep 12 12:16:53 2011
New Revision: 1169707

URL: http://svn.apache.org/viewvc?rev=1169707&view=rev
Log:
NUTCH-1105 Max content length option for index-basic

Modified:
    nutch/branches/branch-1.4/CHANGES.txt
    nutch/branches/branch-1.4/conf/nutch-default.xml
    nutch/branches/branch-1.4/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java

Modified: nutch/branches/branch-1.4/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/CHANGES.txt?rev=1169707&r1=1169706&r2=1169707&view=diff
==============================================================================
--- nutch/branches/branch-1.4/CHANGES.txt (original)
+++ nutch/branches/branch-1.4/CHANGES.txt Mon Sep 12 12:16:53 2011
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 1.4 - Current development
 
+* NUTCH-1105 Max content length option for index-basic (markus)
+
 * NUTCH-940 static field plugin (Claudio Martella via lewismc)
 
 * NUTCH-914 Implement Apache Project Branding Requirements (lewismc)

Modified: nutch/branches/branch-1.4/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/conf/nutch-default.xml?rev=1169707&r1=1169706&r2=1169707&view=diff
==============================================================================
--- nutch/branches/branch-1.4/conf/nutch-default.xml (original)
+++ nutch/branches/branch-1.4/conf/nutch-default.xml Mon Sep 12 12:16:53 2011
@@ -762,6 +762,14 @@
   </description>
 </property>
 
+<property>
+  <name>indexer.max.content.length</name>
+  <value>-1</value>
+  <description>The maximum number of characters of a content that are indexed.
+  Content beyond the limit is truncated. A value of -1 disables this check.
+  </description>
+</property>
+
 <!-- URL normalizer properties -->
 
 <property>

Modified: nutch/branches/branch-1.4/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1169707&r1=1169706&r2=1169707&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ nutch/branches/branch-1.4/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Mon Sep 12 12:16:53 2011
@@ -42,6 +42,7 @@ public class BasicIndexingFilter impleme
   public static final Log LOG = LogFactory.getLog(BasicIndexingFilter.class);
 
   private int MAX_TITLE_LENGTH;
+  private int MAX_CONTENT_LENGTH;
   private Configuration conf;
 
   public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
@@ -70,8 +71,14 @@ public class BasicIndexingFilter impleme
     }
 
     doc.add("url", reprUrlString == null ? urlString : reprUrlString);
-    doc.add("content", parse.getText());
-    
+
+    // content
+    String content = parse.getText();
+    if (MAX_CONTENT_LENGTH > -1 && content.length() > MAX_CONTENT_LENGTH) {
+      content = content.substring(0, MAX_CONTENT_LENGTH);
+    }
+    doc.add("content", content);
+
     // title
     String title = parse.getData().getTitle();
     if (title.length() > MAX_TITLE_LENGTH) {      // truncate title if needed
@@ -88,7 +95,7 @@ public class BasicIndexingFilter impleme
     if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
       doc.add("cache", caching);
     }
-    
+
     // add timestamp when fetched, for deduplication
     doc.add("tstamp", new Date(datum.getFetchTime()));
 
@@ -98,6 +105,7 @@ public class BasicIndexingFilter impleme
   public void setConf(Configuration conf) {
     this.conf = conf;
     this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
+    this.MAX_CONTENT_LENGTH = conf.getInt("indexer.max.content.length", -1);
   }
 
   public Configuration getConf() {


Reply via email to