Author: lewismc
Date: Mon Feb 8 18:05:46 2016
New Revision: 1729218
URL: http://svn.apache.org/viewvc?rev=1729218&view=rev
Log:
NUTCH-1314 Impose a limit on the length of outlink target urls
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/nutch-default.xml
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1729218&r1=1729217&r2=1729218&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Feb 8 18:05:46 2016
@@ -2,7 +2,7 @@ Nutch Change Log
Nutch 2.4 Development
-Put Jira release report here
+ * NUTCH-1314 Impose a limit on the length of outlink target urls (ferdy,
lewismc, tejasp, Canan Girgin, Tien Nguyen Manh)
Nutch 2.3.1 Release 22092015 (ddmmyyyy)
Release Report - http://s.apache.org/nutch_2.3.1
Modified: nutch/branches/2.x/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1729218&r1=1729217&r2=1729218&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Mon Feb 8 18:05:46 2016
@@ -998,6 +998,20 @@
</property>
<property>
+ <name>parser.html.outlinks.max.target.length</name>
+ <value>3000</value>
+ <description>The maximum number of characters permitted in an outlink's
target URL.
+ </description>
+</property>
+
+<property>
+ <name>parser.html.outlinks.max.target.length</name>
+ <value>3000</value>
+ <description>The maximum number of characters permitted in an outlink's
target URL.
+ </description>
+</property>
+
+<property>
<name>htmlparsefilter.order</name>
<value></value>
<description>The order by which HTMLParse filters are applied.
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java?rev=1729218&r1=1729217&r2=1729218&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java Mon Feb
8 18:05:46 2016
@@ -70,6 +70,7 @@ public class ParseUtil extends Configure
public static final Logger LOG = LoggerFactory.getLogger(ParseUtil.class);
private static final int DEFAULT_MAX_PARSE_TIME = 30;
+ private static final int DEFAULT_OUTLINKS_MAX_TARGET_LENGTH = 3000;
private Configuration conf;
private Signature sig;
@@ -80,6 +81,7 @@ public class ParseUtil extends Configure
private ParserFactory parserFactory;
/** Parser timeout set to 30 sec by default. Set -1 to deactivate **/
private int maxParseTime;
+ private int maxTargetLength;
private ExecutorService executorService;
/**
@@ -100,6 +102,7 @@ public class ParseUtil extends Configure
public void setConf(Configuration conf) {
this.conf = conf;
parserFactory = new ParserFactory(conf);
+ maxTargetLength = conf.getInt("parser.html.outlinks.max.target.length",
DEFAULT_OUTLINKS_MAX_TARGET_LENGTH);
if (conf.getBoolean("parse.sitemap", false)) {
maxParseTime = conf.getInt("parser.timeout", DEFAULT_MAX_PARSE_TIME);
} else {
@@ -373,6 +376,9 @@ public class ParseUtil extends Configure
for (int i = 0; validCount < outlinksToStore
&& i < outlinks.length; i++, validCount++) {
String toUrl = outlinks[i].getToUrl();
+ if (toUrl.length() > maxTargetLength) {
+ continue; // skip it
+ }
String toHost;
if (ignoreExternalLinks) {
try {