Author: mattmann
Date: Sat Apr 18 16:32:42 2015
New Revision: 1674536
URL: http://svn.apache.org/r1674536
Log:
Fix for NUTCH-1989: Handling invalid URLs in CommonCrawlDataDumper, contributed
by Giuseppe Totaro.
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/ivy/ivy.xml
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1674536&r1=1674535&r2=1674536&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Apr 18 16:32:42 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1989 Handling invalid URLs in CommonCrawlDataDumper (Giuseppe Totaro
via mattmann)
+
* NUTCH-1988 Make nested output directory dump optional (Michael Joyce via
mattmann)
* NUTCH-1927 Create a whitelist of IPs/hostnames to allow skipping of
RobotRules parsing (mattmann, snagel)
Modified: nutch/trunk/ivy/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1674536&r1=1674535&r2=1674536&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Sat Apr 18 16:32:42 2015
@@ -43,6 +43,8 @@
<dependency org="commons-lang" name="commons-lang" rev="2.6"
conf="*->default" />
+ <dependency org="commons-validator" name="commons-validator" rev="1.4.1"
+ conf="*->default" />
<dependency org="commons-collections" name="commons-collections"
rev="3.1" conf="*->default" />
<dependency org="commons-httpclient" name="commons-httpclient"
Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1674536&r1=1674535&r2=1674536&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Sat
Apr 18 16:32:42 2015
@@ -49,6 +49,7 @@ import org.apache.commons.compress.compr
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.validator.routines.UrlValidator;
//Hadoop
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
@@ -384,6 +385,12 @@ public class CommonCrawlDataDumper {
reader.getCurrentValue(content);
Metadata metadata =
content.getMetadata();
String url = key.toString();
+
+ UrlValidator urlValidator = new UrlValidator();
+ if (!urlValidator.isValid(url)) {
+ LOG.warn("Not valid URL detected: " + url);
+ }
+
String baseName =
FilenameUtils.getBaseName(url);
String extension =
FilenameUtils.getExtension(url);