Author: snagel Date: Sat Apr 5 17:06:04 2014 New Revision: 1585144 URL: http://svn.apache.org/r1585144 Log: NUTCH-1735 code dedup fetcher queue redirects
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1585144&r1=1585143&r2=1585144&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Sat Apr 5 17:06:04 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development +* NUTCH-1735 code dedup fetcher queue redirects (snagel) + * NUTCH-1745 Upgrade to ElasticSearch 1.1.0 (jnioche) * NUTCH-1645 Junit Test Case for Adaptive Fetch Schedule class (Yasin Kılınç, lufeng, Sertac TURKEL via snagel) Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1585144&r1=1585143&r2=1585144&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Sat Apr 5 17:06:04 2014 @@ -731,25 +731,7 @@ public class Fetcher extends Configured refreshTime < Fetcher.PERM_REFRESH_TIME, Fetcher.CONTENT_REDIR); if (redirUrl != null) { - CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, - fit.datum.getFetchInterval(), fit.datum.getScore()); - // transfer existing metadata to the redir - newDatum.getMetaData().putAll(fit.datum.getMetaData()); - scfilters.initialScore(redirUrl, newDatum); - if (reprUrl != null) { - newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, - new Text(reprUrl)); - } - fit = FetchItem.create(redirUrl, newDatum, queueMode); - if (fit != null) { - FetchItemQueue fiq = - fetchQueues.getFetchItemQueue(fit.queueID); - fiq.addInProgressFetchItem(fit); - } else { - // stop redirecting - redirecting = false; - reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1); - } + queueRedirect(redirUrl, fit); } } break; @@ -772,25 +754,7 @@ public class Fetcher extends Configured urlString, newUrl, temp, Fetcher.PROTOCOL_REDIR); if (redirUrl != null) { - CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, - fit.datum.getFetchInterval(), fit.datum.getScore()); - // transfer existing metadata - newDatum.getMetaData().putAll(fit.datum.getMetaData()); - scfilters.initialScore(redirUrl, newDatum); - if (reprUrl != null) { - newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, - new Text(reprUrl)); - } - fit = FetchItem.create(redirUrl, newDatum, queueMode); - if (fit != null) { - FetchItemQueue fiq = - fetchQueues.getFetchItemQueue(fit.queueID); - fiq.addInProgressFetchItem(fit); - } else { - // stop redirecting - redirecting = false; - reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1); - } + queueRedirect(redirUrl, fit); } else { // stop redirecting redirecting = false; @@ -918,6 +882,28 @@ public class Fetcher extends Configured } } + private void queueRedirect(Text redirUrl, FetchItem fit) throws ScoringFilterException { + CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, + fit.datum.getFetchInterval(), fit.datum.getScore()); + // transfer all existing metadata to the redirect + newDatum.getMetaData().putAll(fit.datum.getMetaData()); + scfilters.initialScore(redirUrl, newDatum); + if (reprUrl != null) { + newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, + new Text(reprUrl)); + } + fit = FetchItem.create(redirUrl, newDatum, queueMode); + if (fit != null) { + FetchItemQueue fiq = + fetchQueues.getFetchItemQueue(fit.queueID); + fiq.addInProgressFetchItem(fit); + } else { + // stop redirecting + redirecting = false; + reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1); + } + } + private void logError(Text url, String message) { if (LOG.isInfoEnabled()) { LOG.info("fetch of " + url + " failed with: " + message);