Author: lewismc
Date: Mon Feb  8 18:05:46 2016
New Revision: 1729218

URL: http://svn.apache.org/viewvc?rev=1729218&view=rev
Log:
NUTCH-1314 Impose a limit on the length of outlink target urls

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/conf/nutch-default.xml
    nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1729218&r1=1729217&r2=1729218&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Feb  8 18:05:46 2016
@@ -2,7 +2,7 @@ Nutch Change Log
 
 Nutch 2.4 Development
 
-Put Jira release report here
+ * NUTCH-1314 Impose a limit on the length of outlink target urls (ferdy, lewismc, tejasp, Canan Girgin, Tien Nguyen Manh)
 
 Nutch 2.3.1 Release 22092015 (ddmmyyyy)
 Release Report - http://s.apache.org/nutch_2.3.1

Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1729218&r1=1729217&r2=1729218&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Mon Feb  8 18:05:46 2016
@@ -998,6 +998,20 @@
 </property>
 
 <property>
+  <name>parser.html.outlinks.max.target.length</name>
+  <value>3000</value>
+  <description>The maximum number of characters permitted in an outlink urls target.
+  </description>
+</property>
+
+<property>
+  <name>parser.html.outlinks.max.target.length</name>
+  <value>3000</value>
+  <description>The maximum number of characters permitted in an outlink urls target.
+  </description>
+</property>
+
+<property>
   <name>htmlparsefilter.order</name>
   <value></value>
   <description>The order by which HTMLParse filters are applied.

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java?rev=1729218&r1=1729217&r2=1729218&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java Mon Feb  8 18:05:46 2016
@@ -70,6 +70,7 @@ public class ParseUtil extends Configure
   public static final Logger LOG = LoggerFactory.getLogger(ParseUtil.class);
 
   private static final int DEFAULT_MAX_PARSE_TIME = 30;
+  private static final int DEFAULT_OUTLINKS_MAX_TARGET_LENGTH = 3000;
 
   private Configuration conf;
   private Signature sig;
@@ -80,6 +81,7 @@ public class ParseUtil extends Configure
   private ParserFactory parserFactory;
   /** Parser timeout set to 30 sec by default. Set -1 to deactivate **/
   private int maxParseTime;
+  private int maxTargetLength;
   private ExecutorService executorService;
 
   /**
@@ -100,6 +102,7 @@ public class ParseUtil extends Configure
   public void setConf(Configuration conf) {
     this.conf = conf;
     parserFactory = new ParserFactory(conf);
+    maxTargetLength = conf.getInt("parser.html.outlinks.max.target.length", DEFAULT_OUTLINKS_MAX_TARGET_LENGTH);
     if (conf.getBoolean("parse.sitemap", false)) {
       maxParseTime = conf.getInt("parser.timeout", DEFAULT_MAX_PARSE_TIME);
     } else {
@@ -373,6 +376,9 @@ public class ParseUtil extends Configure
         for (int i = 0; validCount < outlinksToStore
             && i < outlinks.length; i++, validCount++) {
           String toUrl = outlinks[i].getToUrl();
+          if (toUrl.length() > maxTargetLength) {
+            continue; // skip it
+          }
           String toHost;
           if (ignoreExternalLinks) {
             try {


Reply via email to