Author: lewismc
Date: Mon May 21 16:56:02 2012
New Revision: 1341105
URL: http://svn.apache.org/viewvc?rev=1341105&view=rev
Log:
Address NUTCH-1361 (mishandling of malformed URLs in the generator job) and update CHANGES.txt
Modified:
nutch/branches/nutchgora/CHANGES.txt
nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/GeneratorMapper.java
nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/GeneratorReducer.java
Modified: nutch/branches/nutchgora/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/nutchgora/CHANGES.txt?rev=1341105&r1=1341104&r2=1341105&view=diff
==============================================================================
--- nutch/branches/nutchgora/CHANGES.txt (original)
+++ nutch/branches/nutchgora/CHANGES.txt Mon May 21 16:56:02 2012
@@ -2,6 +2,8 @@ Nutch Change Log
Release nutchgora - Current Development
+* NUTCH-1361 Fix mishandling of malformed urls in generator job (Jason Trost
via lewismc)
+
* NUTCH-1360 Support the storing of IP address connected to when web crawling
(lewismc)
* NUTCH-1366 speed up indexing by eliminating the indexreducer (ferdy)
Modified:
nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/GeneratorMapper.java
URL:
http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/GeneratorMapper.java?rev=1341105&r1=1341104&r2=1341105&view=diff
==============================================================================
---
nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/GeneratorMapper.java
(original)
+++
nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/GeneratorMapper.java
Mon May 21 16:56:02 2012
@@ -17,6 +17,7 @@
package org.apache.nutch.crawl;
import java.io.IOException;
+import java.net.MalformedURLException;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.GeneratorJob.SelectorEntry;
@@ -63,6 +64,8 @@ extends GoraMapper<String, WebPage, Sele
} catch (URLFilterException e) {
GeneratorJob.LOG.warn("Couldn't filter url: " + url + " (" +
e.getMessage() + ")");
return;
+ } catch (MalformedURLException e) {
+ GeneratorJob.LOG.warn("Couldn't filter url: " + url + " (" +
e.getMessage() +")");
}
// check fetch schedule
Modified:
nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/GeneratorReducer.java
URL:
http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/GeneratorReducer.java?rev=1341105&r1=1341104&r2=1341105&view=diff
==============================================================================
---
nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/GeneratorReducer.java
(original)
+++
nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/GeneratorReducer.java
Mon May 21 16:56:02 2012
@@ -17,6 +17,7 @@
package org.apache.nutch.crawl;
import java.io.IOException;
+import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.Map;
@@ -73,7 +74,11 @@ extends GoraReducer<SelectorEntry, WebPa
}
Mark.GENERATE_MARK.putMark(page, batchId);
- context.write(TableUtil.reverseUrl(key.url), page);
+ try {
+ context.write(TableUtil.reverseUrl(key.url), page);
+ } catch (MalformedURLException e) {
+ continue;
+ }
context.getCounter("Generator", "GENERATE_MARK").increment(1);
count++;
}