Author: mattmann
Date: Sat Apr 18 16:32:42 2015
New Revision: 1674536

URL: http://svn.apache.org/r1674536
Log:
Fix for NUTCH-1989: Handling invalid URLs in CommonCrawlDataDumper, contributed
by Giuseppe Totaro.

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/ivy/ivy.xml
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1674536&r1=1674535&r2=1674536&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Apr 18 16:32:42 2015
@@ -2,6 +2,8 @@ Nutch Change Log
  
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1989 Handling invalid URLs in CommonCrawlDataDumper (Giuseppe Totaro via mattmann)
+
 * NUTCH-1988 Make nested output directory dump optional (Michael Joyce via mattmann)
 
 * NUTCH-1927 Create a whitelist of IPs/hostnames to allow skipping of RobotRules parsing (mattmann, snagel)

Modified: nutch/trunk/ivy/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1674536&r1=1674535&r2=1674536&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Sat Apr 18 16:32:42 2015
@@ -43,6 +43,8 @@
                
                <dependency org="commons-lang" name="commons-lang" rev="2.6"
                        conf="*->default" />
+               <dependency org="commons-validator" name="commons-validator" rev="1.4.1"
+                       conf="*->default" />
                <dependency org="commons-collections" name="commons-collections"
                        rev="3.1" conf="*->default" />
                <dependency org="commons-httpclient" name="commons-httpclient"

Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1674536&r1=1674535&r2=1674536&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Sat 
Apr 18 16:32:42 2015
@@ -49,6 +49,7 @@ import org.apache.commons.compress.compr
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.FilenameUtils;
 
+import org.apache.commons.validator.routines.UrlValidator;
 //Hadoop
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
@@ -384,6 +385,12 @@ public class CommonCrawlDataDumper {
                                        reader.getCurrentValue(content);
                                        Metadata metadata = 
content.getMetadata();
                                        String url = key.toString();
+                                       
+                                       UrlValidator urlValidator = new 
UrlValidator();
+                                       if (!urlValidator.isValid(url)) {
+                                               LOG.warn("Not valid URL 
detected: " + url);
+                                       }
+                                       
                                        String baseName = 
FilenameUtils.getBaseName(url);
                                        String extension = 
FilenameUtils.getExtension(url);
                                        


Reply via email to