[ https://issues.apache.org/jira/browse/NUTCH-2565?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16519443#comment-16519443 ]

ASF GitHub Bot commented on NUTCH-2565:
---------------------------------------

sebastian-nagel closed pull request #311: - fix for NUTCH-2565 contributed by Jurian Broertjes
URL: https://github.com/apache/nutch/pull/311
 
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

diff --git a/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java b/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
index 38fde9f02..a3209894b 100755
--- a/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
+++ b/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
@@ -157,7 +157,11 @@ public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum,
    * @return the date as a long.
    */
   public long calculateLastFetchTime(CrawlDatum datum) {
-    return datum.getFetchTime() - (long) datum.getFetchInterval() * 1000;
+    if (datum.getStatus() == CrawlDatum.STATUS_DB_UNFETCHED) {
+      return 0L;
+    } else {
+      return datum.getFetchTime() - (long) datum.getFetchInterval() * 1000;
+    }
   }
 
   /**
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
index d8756fd5e..475ee855d 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
@@ -31,6 +31,7 @@
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapreduce.Job;
@@ -70,8 +71,6 @@
 
   public static class Merger extends
       Reducer<Text, CrawlDatum, Text, CrawlDatum> {
-    private org.apache.hadoop.io.MapWritable meta;
-    private CrawlDatum res = new CrawlDatum();
     private FetchSchedule schedule;
 
     public void close() throws IOException {
@@ -85,40 +84,40 @@ public void setup(Reducer.Context context) {
     public void reduce(Text key, Iterable<CrawlDatum> values,
         Context context)
         throws IOException, InterruptedException {
-      long resTime = 0L;
-      boolean resSet = false;
-      long valTime = 0L;
-      meta = new org.apache.hadoop.io.MapWritable();
+
+      CrawlDatum res = new CrawlDatum();
+      res.setFetchTime(-1); // We want everything to be newer!
+      MapWritable meta = new MapWritable();
+
       for (CrawlDatum val : values) {
-        if (!resSet) {
-          res.set(val);
-          resSet = true;
-          resTime = schedule.calculateLastFetchTime(res);
-          for (Entry<Writable, Writable> e : res.getMetaData().entrySet()) {
-            meta.put(e.getKey(), e.getValue());
-          }
-          continue;
-        }
-        // compute last fetch time, and pick the latest
-        valTime = schedule.calculateLastFetchTime(val);
-        if (valTime > resTime) {
+        if (isNewer(res, val)) {
           // collect all metadata, newer values override older values
-          for (Entry<Writable, Writable> e : val.getMetaData().entrySet()) {
-            meta.put(e.getKey(), e.getValue());
-          }
+          meta = mergeMeta(val.getMetaData(), meta);
           res.set(val);
-          resTime = valTime;
         } else {
-          // insert older metadata before newer
-          for (Entry<Writable, Writable> e : meta.entrySet()) {
-            val.getMetaData().put(e.getKey(), e.getValue());
-          }
-          meta = val.getMetaData();
+          // overwrite older metadata with current metadata
+          meta = mergeMeta(meta, val.getMetaData());
         }
       }
+
       res.setMetaData(meta);
       context.write(key, res);
     }
+
+    // Determine which CrawlDatum is the latest, according to calculateLastFetchTime()
+    // and getFetchTime() as fallback in case calculateLastFetchTime()s are equal (eg: DB_UNFETCHED)
+    private boolean isNewer(CrawlDatum cd1, CrawlDatum cd2) {
+      return schedule.calculateLastFetchTime(cd2) > schedule.calculateLastFetchTime(cd1)
+        || schedule.calculateLastFetchTime(cd2) == schedule.calculateLastFetchTime(cd1)
+        && cd2.getFetchTime() > cd1.getFetchTime();
+    }
+
+    private MapWritable mergeMeta(MapWritable from, MapWritable to) {
+      for (Entry<Writable, Writable> e : from.entrySet()) {
+        to.put(e.getKey(), e.getValue());
+      }
+      return to;
+    }
   }
 
   public CrawlDbMerger() {
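
For readers following along, here is a minimal, runnable sketch of the ordering
the patch establishes. The Datum class is a hypothetical stand-in for
org.apache.nutch.crawl.CrawlDatum, and the two methods mirror the patched
calculateLastFetchTime() and isNewer() above; treat it as an illustration, not
the actual Nutch code:

// Hypothetical, self-contained sketch of the patched merge ordering.
public class MergeOrderSketch {

  /** Simplified stand-in for org.apache.nutch.crawl.CrawlDatum. */
  static class Datum {
    static final byte STATUS_DB_UNFETCHED = 0x01;
    static final byte STATUS_DB_FETCHED = 0x02;
    byte status;
    long fetchTime;    // milliseconds since epoch
    int fetchInterval; // seconds

    Datum(byte status, long fetchTime, int fetchInterval) {
      this.status = status;
      this.fetchTime = fetchTime;
      this.fetchInterval = fetchInterval;
    }
  }

  // Mirrors the patched AbstractFetchSchedule.calculateLastFetchTime():
  // unfetched datums report 0, so they never look "fresher" than fetched ones.
  static long calculateLastFetchTime(Datum d) {
    if (d.status == Datum.STATUS_DB_UNFETCHED) {
      return 0L;
    }
    return d.fetchTime - (long) d.fetchInterval * 1000;
  }

  // Mirrors CrawlDbMerger.Merger.isNewer(): compare last fetch times first,
  // fall back to raw fetchTime when they are equal (e.g. two unfetched datums).
  static boolean isNewer(Datum cd1, Datum cd2) {
    long t1 = calculateLastFetchTime(cd1);
    long t2 = calculateLastFetchTime(cd2);
    return t2 > t1 || (t2 == t1 && cd2.fetchTime > cd1.fetchTime);
  }

  public static void main(String[] args) {
    Datum fetched = new Datum(Datum.STATUS_DB_FETCHED, 1_000_000_000_000L, 2_592_000);
    Datum unfetched = new Datum(Datum.STATUS_DB_UNFETCHED, 1_100_000_000_000L, 2_592_000);
    // Pre-patch, the unfetched datum's bogus (fetchTime - interval) value could
    // exceed the fetched datum's real last fetch time; post-patch it is 0.
    System.out.println(isNewer(fetched, unfetched)); // false: the fetched entry wins
  }
}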


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> MergeDB incorrectly handles unfetched CrawlDatums
> -------------------------------------------------
>
>                 Key: NUTCH-2565
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2565
>             Project: Nutch
>          Issue Type: Bug
>          Components: crawldb
>    Affects Versions: 1.14
>            Reporter: Jurian Broertjes
>            Priority: Minor
>             Fix For: 1.15
>
>
> I ran into this issue when merging a CrawlDb originating from sitemaps into 
> our normal CrawlDb. CrawlDatums are merged based on the output of 
> AbstractFetchSchedule::calculateLastFetchTime(). When CrawlDatums are 
> unfetched, this can cause the merge to overwrite fetchTime and other 
> metadata.
> I assume this is a bug and have a simple fix for it that checks whether the 
> CrawlDatum has status db_unfetched.
>  
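
To make the failure mode concrete, here is an illustrative worked example (the
numbers are hypothetical, not taken from the report). Before the patch,
calculateLastFetchTime() was applied to unfetched datums as well, so a freshly
injected sitemap entry could look "newer" than an entry that had actually been
fetched:

  fetched entry:    fetchTime = 2018-06-01, fetchInterval = 30 days
                    lastFetchTime = 2018-06-01 - 30 days = 2018-05-02
  unfetched entry:  fetchTime = 2018-06-15 (injection time), fetchInterval = 30 days
                    "lastFetchTime" = 2018-06-15 - 30 days = 2018-05-16

Since 2018-05-16 > 2018-05-02, the never-fetched entry won the merge and
overwrote the fetched datum's fetchTime and metadata. With the patch, the
unfetched entry's last fetch time is 0, so the fetched entry wins; ties between
two unfetched entries (both 0) fall back to getFetchTime().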



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
