Author: lewismc
Date: Sat Nov 2 14:16:28 2013
New Revision: 1538200
URL: http://svn.apache.org/r1538200
Log:
NUTCH-1588 Port NUTCH-1245 (URL gone with 404 after db.fetch.interval.max stays
db_unfetched in CrawlDb and is generated over and over again) to 2.x
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1538200&r1=1538199&r2=1538200&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sat Nov 2 14:16:28 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1588 Port NUTCH-1245 URL gone with 404 after db.fetch.interval.max
stays db_unfetched in CrawlDb and is generated over and over again to 2.x
(Talat UYARER via lewismc)
+
* NUTCH-1650 Adaptive Fetch Scheduler interval Wrong Set (Talat UYARER via
lewismc)
* NUTCH-1413 Record response time (Yasin KILINC, Talat UYARER, snagel via
lewismc)
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=1538200&r1=1538199&r2=1538200&view=diff
==============================================================================
---
nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
(original)
+++
nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
Sat Nov 2 14:16:28 2013
@@ -101,9 +101,8 @@ implements FetchSchedule {
/**
* This method specifies how to schedule refetching of pages
- * marked as GONE. Default implementation increases fetchInterval by 50%,
- * and if it exceeds the <code>maxInterval</code> it calls
- * {@link #forceRefetch(Text, CrawlDatum, boolean)}.
+ * marked as GONE. Default implementation increases fetchInterval by 50%
+ * but the value may never exceed <code>maxInterval</code>.
* @param url URL of the page
* @param page
* @return adjusted page information, including all original information.
@@ -112,14 +111,17 @@ implements FetchSchedule {
* information from {@param datum}.
*/
@Override
- public void setPageGoneSchedule(String url, WebPage page,
- long prevFetchTime, long prevModifiedTime, long fetchTime) {
+ public void setPageGoneSchedule(String url, WebPage page, long prevFetchTime,
+ long prevModifiedTime, long fetchTime) {
// no page is truly GONE ... just increase the interval by 50%
// and try much later.
- int newFetchInterval = (int) (page.getFetchInterval() * 1.5f);
- page.setFetchInterval(newFetchInterval);
- page.setFetchTime(fetchTime + newFetchInterval * 1000L);
- if (maxInterval < newFetchInterval) forceRefetch(url, page, false);
+ if ((page.getFetchInterval() * 1.5f) < maxInterval) {
+ int newFetchInterval = (int) (page.getFetchInterval() * 1.5f);
+ page.setFetchInterval(newFetchInterval);
+ } else {
+ page.setFetchInterval((int) (maxInterval * 0.9f));
+ }
+ page.setFetchTime(fetchTime + page.getFetchInterval() * 1000L);
}
/**