Author: lewismc
Date: Sat Nov  2 14:16:28 2013
New Revision: 1538200

URL: http://svn.apache.org/r1538200
Log:
NUTCH-1588 Port NUTCH-1245 URL gone with 404 after db.fetch.interval.max stays 
db_unfetched in CrawlDb and is generated over and over again to 2.x

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1538200&r1=1538199&r2=1538200&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sat Nov  2 14:16:28 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1588 Port NUTCH-1245 URL gone with 404 after db.fetch.interval.max stays db_unfetched in CrawlDb and is generated over and over again to 2.x (Talat UYARER via lewismc)
+
 * NUTCH-1650 Adaptive Fetch Scheduler interval Wrong Set (Talat UYARER via lewismc)
 
 * NUTCH-1413 Record response time (Yasin KILINC, Talat UYARER, snagel via lewismc)

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=1538200&r1=1538199&r2=1538200&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Sat Nov  2 14:16:28 2013
@@ -101,9 +101,8 @@ implements FetchSchedule {
 
   /**
    * This method specifies how to schedule refetching of pages
-   * marked as GONE. Default implementation increases fetchInterval by 50%,
-   * and if it exceeds the <code>maxInterval</code> it calls
-   * {@link #forceRefetch(Text, CrawlDatum, boolean)}.
+   * marked as GONE. Default implementation increases fetchInterval by 50%
+   * but the value may never exceed <code>maxInterval</code>.
    * @param url URL of the page
    * @param page
    * @return adjusted page information, including all original information.
@@ -112,14 +111,17 @@ implements FetchSchedule {
    * information from {@param datum}.
    */
   @Override
-  public void setPageGoneSchedule(String url, WebPage page,
-          long prevFetchTime, long prevModifiedTime, long fetchTime) {
+  public void setPageGoneSchedule(String url, WebPage page, long prevFetchTime,
+      long prevModifiedTime, long fetchTime) {
     // no page is truly GONE ... just increase the interval by 50%
     // and try much later.
-    int newFetchInterval = (int) (page.getFetchInterval() * 1.5f);
-    page.setFetchInterval(newFetchInterval);
-    page.setFetchTime(fetchTime + newFetchInterval * 1000L);
-    if (maxInterval < newFetchInterval) forceRefetch(url, page, false);
+    if ((page.getFetchInterval() * 1.5f) < maxInterval) {
+      int newFetchInterval = (int) (page.getFetchInterval() * 1.5f);
+      page.setFetchInterval(newFetchInterval);
+    } else {
+      page.setFetchInterval((int) (maxInterval * 0.9f));
+    }
+    page.setFetchTime(fetchTime + page.getFetchInterval() * 1000L);
   }
 
   /**


Reply via email to