Author: ferdy
Date: Thu Apr 26 09:00:17 2012
New Revision: 1330722
URL: http://svn.apache.org/viewvc?rev=1330722&view=rev
Log:
NUTCH-1340 Increase scalability by only removing markers when they actually
exist for DbUpdateReducer
Modified:
nutch/branches/nutchgora/CHANGES.txt
nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
nutch/branches/nutchgora/src/java/org/apache/nutch/storage/Mark.java
Modified: nutch/branches/nutchgora/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/nutchgora/CHANGES.txt?rev=1330722&r1=1330721&r2=1330722&view=diff
==============================================================================
--- nutch/branches/nutchgora/CHANGES.txt (original)
+++ nutch/branches/nutchgora/CHANGES.txt Thu Apr 26 09:00:17 2012
@@ -2,6 +2,8 @@ Nutch Change Log
Release nutchgora - Current Development
+* NUTCH-1340 Increase scalability by only removing markers when they actually
exist for DbUpdateReducer (ferdy)
+
* NUTCH-1333 Introduce AvroStore, DataFileAvroStore and Accumulo Datastore
implementations (lewismc)
* NUTCH-1312 Nutchgora to send HTTP-accept header (ferdy)
Modified:
nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
URL:
http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/DbUpdateReducer.java?rev=1330722&r1=1330721&r2=1330722&view=diff
==============================================================================
---
nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
(original)
+++
nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
Thu Apr 26 09:00:17 2012
@@ -175,11 +175,14 @@ extends GoraReducer<UrlWithScore, NutchW
}
// clear markers
-
- page.removeFromMetadata(FetcherJob.REDIRECT_DISCOVERED);
- Mark.GENERATE_MARK.removeMark(page);
- Mark.FETCH_MARK.removeMark(page);
- Utf8 mark = Mark.PARSE_MARK.removeMark(page);
+ // But only delete when they exist. This is much faster for the underlying
+ // store. The markers are on the input anyway.
+ if (page.getFromMetadata(FetcherJob.REDIRECT_DISCOVERED) != null) {
+ page.removeFromMetadata(FetcherJob.REDIRECT_DISCOVERED);
+ }
+ Mark.GENERATE_MARK.removeMarkIfExist(page);
+ Mark.FETCH_MARK.removeMarkIfExist(page);
+ Utf8 mark = Mark.PARSE_MARK.removeMarkIfExist(page);
if (mark != null) {
Mark.UPDATEDB_MARK.putMark(page, mark);
}
Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/storage/Mark.java
URL:
http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/storage/Mark.java?rev=1330722&r1=1330721&r2=1330722&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/storage/Mark.java
(original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/storage/Mark.java Thu
Apr 26 09:00:17 2012
@@ -43,4 +43,16 @@ public enum Mark {
public Utf8 checkMark(WebPage page) {
return page.getFromMarkers(name);
}
+
+ /**
+ * Remove the mark only if the mark is present on the page.
+ * @param page The page to remove the mark from.
+ * @return The removed mark's value, or null if the mark was not present.
+ */
+ public Utf8 removeMarkIfExist(WebPage page) {
+ if (page.getFromMarkers(name) != null) {
+ return page.removeFromMarkers(name);
+ }
+ return null;
+ }
}