Author: jnioche Date: Tue Mar 30 08:35:49 2010 New Revision: 929039 URL: http://svn.apache.org/viewvc?rev=929039&view=rev Log: NUTCH-785: Fetcher: copy metadata from origin URL when redirecting + call scfilters.initialScore on newly created URL
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=929039&r1=929038&r2=929039&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Mar 30 08:35:49 2010 @@ -2,6 +2,8 @@ Nutch Change Log Unreleased Changes +* NUTCH-785 Copy metadata from origin URL when redirecting in Fetcher + call scfilters.initialScore on newly created URL (jnioche) + * NUTCH-779 Mechanism for passing metadata from parse to crawldb (jnioche) * NUTCH-784 CrawlDBScanner (jnioche) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=929039&r1=929038&r2=929039&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue Mar 30 08:35:49 2010 @@ -46,6 +46,7 @@ import org.apache.nutch.metadata.Nutch; import org.apache.nutch.net.*; import org.apache.nutch.protocol.*; import org.apache.nutch.parse.*; +import org.apache.nutch.scoring.ScoringFilterException; import org.apache.nutch.scoring.ScoringFilters; import org.apache.nutch.util.*; @@ -656,6 +657,9 @@ public class Fetcher extends Configured if (redirUrl != null) { CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, fit.datum.getFetchInterval(), fit.datum.getScore()); + // transfer existing metadata to the redir + newDatum.getMetaData().putAll(fit.datum.getMetaData()); + scfilters.initialScore(redirUrl, newDatum); if (reprUrl != null) { newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl)); @@ -694,6 +698,9 @@ public class Fetcher 
extends Configured if (redirUrl != null) { CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, fit.datum.getFetchInterval(), fit.datum.getScore()); + // transfer existing metadata + newDatum.getMetaData().putAll(fit.datum.getMetaData()); + scfilters.initialScore(redirUrl, newDatum); if (reprUrl != null) { newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl)); @@ -809,6 +816,13 @@ public class Fetcher extends Configured } else { CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED, datum.getFetchInterval()); + // transfer existing metadata + newDatum.getMetaData().putAll(datum.getMetaData()); + try { + scfilters.initialScore(url, newDatum); + } catch (ScoringFilterException e) { + e.printStackTrace(); + } if (reprUrl != null) { newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));