Author: lewismc
Date: Mon May 21 16:56:02 2012
New Revision: 1341105

URL: http://svn.apache.org/viewvc?rev=1341105&view=rev
Log:
Commit to address NUTCH-1361 and update CHANGES.txt

Modified:
    nutch/branches/nutchgora/CHANGES.txt
    nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/GeneratorMapper.java
    nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/GeneratorReducer.java

Modified: nutch/branches/nutchgora/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/CHANGES.txt?rev=1341105&r1=1341104&r2=1341105&view=diff
==============================================================================
--- nutch/branches/nutchgora/CHANGES.txt (original)
+++ nutch/branches/nutchgora/CHANGES.txt Mon May 21 16:56:02 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release nutchgora - Current Development
 
+* NUTCH-1361 Fix mishandling of malformed urls in generator job (Jason Trost via lewismc)
+
 * NUTCH-1360 Support the storing of IP address connected to when web crawling (lewismc)
 
 * NUTCH-1366 speed up indexing by eliminating the indexreducer (ferdy)

Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/GeneratorMapper.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/GeneratorMapper.java?rev=1341105&r1=1341104&r2=1341105&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/GeneratorMapper.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/GeneratorMapper.java Mon May 21 16:56:02 2012
@@ -17,6 +17,7 @@
 package org.apache.nutch.crawl;
 
 import java.io.IOException;
+import java.net.MalformedURLException;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.crawl.GeneratorJob.SelectorEntry;
@@ -63,6 +64,8 @@ extends GoraMapper<String, WebPage, Sele
     } catch (URLFilterException e) {
       GeneratorJob.LOG.warn("Couldn't filter url: " + url + " (" + 
e.getMessage() + ")");
       return;
+    } catch (MalformedURLException e) {
+      GeneratorJob.LOG.warn("Couldn't filter url: " + url + " (" + 
e.getMessage() +")");
     }
 
     // check fetch schedule

Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/GeneratorReducer.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/GeneratorReducer.java?rev=1341105&r1=1341104&r2=1341105&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/GeneratorReducer.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/GeneratorReducer.java Mon May 21 16:56:02 2012
@@ -17,6 +17,7 @@
 package org.apache.nutch.crawl;
 
 import java.io.IOException;
+import java.net.MalformedURLException;
 import java.util.HashMap;
 import java.util.Map;
 
@@ -73,7 +74,11 @@ extends GoraReducer<SelectorEntry, WebPa
       }
 
       Mark.GENERATE_MARK.putMark(page, batchId);
-      context.write(TableUtil.reverseUrl(key.url), page);
+      try {
+        context.write(TableUtil.reverseUrl(key.url), page);
+      } catch (MalformedURLException e) {
+        continue;
+      }
       context.getCounter("Generator", "GENERATE_MARK").increment(1);
       count++;
     }


Reply via email to