Author: jnioche
Date: Tue Mar 30 08:35:49 2010
New Revision: 929039

URL: http://svn.apache.org/viewvc?rev=929039&view=rev
Log:
NUTCH-785: Fetcher: copy metadata from the origin URL when redirecting and call
scfilters.initialScore on the newly created URL

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=929039&r1=929038&r2=929039&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Mar 30 08:35:49 2010
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Unreleased Changes
 
+* NUTCH-785 Copy metadata from origin URL when redirecting in Fetcher + call scfilters.initialScore on newly created URL (jnioche)
+
 * NUTCH-779 Mechanism for passing metadata from parse to crawldb (jnioche)
 
 * NUTCH-784 CrawlDBScanner (jnioche)

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=929039&r1=929038&r2=929039&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue Mar 30 08:35:49 2010
@@ -46,6 +46,7 @@ import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.net.*;
 import org.apache.nutch.protocol.*;
 import org.apache.nutch.parse.*;
+import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
 import org.apache.nutch.util.*;
 
@@ -656,6 +657,9 @@ public class Fetcher extends Configured 
                   if (redirUrl != null) {
                     CrawlDatum newDatum = new 
CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
                         fit.datum.getFetchInterval(), fit.datum.getScore());
+                    // transfer existing metadata to the redir
+                    newDatum.getMetaData().putAll(fit.datum.getMetaData());
+                    scfilters.initialScore(redirUrl, newDatum);
                     if (reprUrl != null) {
                       newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
                           new Text(reprUrl));
@@ -694,6 +698,9 @@ public class Fetcher extends Configured 
                 if (redirUrl != null) {
                   CrawlDatum newDatum = new 
CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
                       fit.datum.getFetchInterval(), fit.datum.getScore());
+                  // transfer existing metadata
+                  newDatum.getMetaData().putAll(fit.datum.getMetaData());
+                  scfilters.initialScore(redirUrl, newDatum);
                   if (reprUrl != null) {
                     newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
                         new Text(reprUrl));
@@ -809,6 +816,13 @@ public class Fetcher extends Configured 
         } else {
           CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED,
               datum.getFetchInterval());
+          // transfer existing metadata 
+          newDatum.getMetaData().putAll(datum.getMetaData());
+          try {
+            scfilters.initialScore(url, newDatum);
+          } catch (ScoringFilterException e) {
+            e.printStackTrace();
+          }
           if (reprUrl != null) {
             newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
                 new Text(reprUrl));


Reply via email to