[nutch] branch master updated: NUTCH-2748 Fetch status gone (redirect exceeded) not to overwrite existing items in CrawlDb - new configuration property `http.redirect.max.exceeded.skip`: * if true skip redirect targets if http.redirect.max is exceeded * if false (default): store the redirect targets with status "linked" - log whether exceeded redirects are "skipped" or "linked"

snagel Mon, 02 Dec 2019 03:45:57 -0800

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git



The following commit(s) were added to refs/heads/master by this push:
     new 969a194  NUTCH-2748 Fetch status gone (redirect exceeded) not to 
overwrite existing items in CrawlDb - new configuration property 
`http.redirect.max.exceeded.skip`:   * if true skip redirect targets if 
http.redirect.max is exceeded   * if false (default): store the redirect 
targets with status "linked" - log whether exceeded redirects are "skipped" or 
"linked"
     new ac9c435  Merge pull request #485 from 
sebastian-nagel/NUTCH-2748-redir-exceeded
969a194 is described below

commit 969a1943939703e524f7e50185dfa03db8bd419b
Author: Sebastian Nagel <[email protected]>
AuthorDate: Fri Nov 8 13:21:05 2019 +0100

    NUTCH-2748 Fetch status gone (redirect exceeded) not to overwrite existing 
items in CrawlDb
    - new configuration property `http.redirect.max.exceeded.skip`:
      * if true skip redirect targets if http.redirect.max is exceeded
      * if false (default): store the redirect targets with status "linked"
    - log whether exceeded redirects are "skipped" or "linked"
---
 conf/nutch-default.xml                             | 30 ++++++++----
 .../org/apache/nutch/fetcher/FetcherThread.java    | 55 ++++++++++++----------
 2 files changed, 52 insertions(+), 33 deletions(-)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 01f4578..58db620 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -331,15 +331,6 @@
 </property>
 
 <property>
-  <name>http.redirect.max</name>
-  <value>0</value>
-  <description>The maximum number of redirects the fetcher will follow when
-  trying to fetch a page. If set to negative or 0, fetcher won't immediately
-  follow redirected URLs, instead it will record them for later fetching.
-  </description>
-</property>
-
-<property>
   <name>http.useHttp11</name>
   <value>true</value>
   <description>
@@ -1197,6 +1188,27 @@
   <description>Whether fetcher will normalize URLs (with the configured URL normalizers).</description>
 </property>
 
+<property>
+  <name>http.redirect.max</name>
+  <value>0</value>
+  <description>The maximum number of redirects the fetcher will follow when
+  trying to fetch a page. If set to negative or 0, fetcher won't immediately
+  follow redirected URLs, instead it will record them for later fetching.
+  </description>
+</property>
+
+<property>
+  <name>http.redirect.max.exceeded.skip</name>
+  <value>false</value>
+  <description>
+    Whether to skip the last URL in a redirect chain when redirects
+    are followed (http.redirect.max > 0) and the maximum number of redirects
+    in a chain is exceeded (redirect_count > http.redirect.max).
+    If not skipped, the redirect target URLs are stored as `linked`
+    and fetched in one of the following cycles. See also NUTCH-2748.
+  </description>
+</property>
+
 <!--  any23 plugin properties -->
 
 <property>
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index e52b9ea..e3cf411 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -87,6 +87,7 @@ public class FetcherThread extends Thread {
   private long maxCrawlDelay;
   private String queueMode;
   private int maxRedirect;
+  private boolean maxRedirectExceededSkip = false;
   private String reprUrl;
   private boolean redirecting;
   private int redirectCount;
@@ -197,7 +198,10 @@ public class FetcherThread extends Thread {
     queueMode = FetchItemQueues.checkQueueMode(queueMode);
     LOG.info("{} {} Using queue mode : {}", getName(),
         Thread.currentThread().getId(), queueMode);
+
     this.maxRedirect = conf.getInt("http.redirect.max", 3);
+    this.maxRedirectExceededSkip = conf
+        .getBoolean("http.redirect.max.exceeded.skip", false);
 
     int maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
     maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE
@@ -449,12 +453,18 @@ public class FetcherThread extends Thread {
             if (redirecting && redirectCount > maxRedirect) {
               ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
               if (LOG.isInfoEnabled()) {
-                LOG.info("{} {} - redirect count exceeded {}", getName(),
-                    Thread.currentThread().getId(), fit.url);
+                LOG.info("{} {} - redirect count exceeded {} ({})", getName(),
+                    Thread.currentThread().getId(), fit.url,
+                    maxRedirectExceededSkip ? "skipped" : "linked");
+              }
+              if (maxRedirectExceededSkip) {
+                // skip redirect target when redirect count is exceeded
+              } else {
+                Text newUrl = new Text(status.getMessage());
+                CrawlDatum newDatum = createRedirDatum(newUrl, fit,
+                    CrawlDatum.STATUS_LINKED);
+                output(newUrl, newDatum, null, null, CrawlDatum.STATUS_LINKED);
               }
-              output(fit.url, fit.datum, null,
-                  ProtocolStatus.STATUS_REDIR_EXCEEDED,
-                  CrawlDatum.STATUS_FETCH_GONE);
             }
 
           } while (redirecting && (redirectCount <= maxRedirect));
@@ -550,36 +560,33 @@ public class FetcherThread extends Thread {
       LOG.debug(" - {} redirect to {} (fetching now)", redirType, url);
       return url;
     } else {
-      CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED,
-          fit.datum.getFetchInterval(), fit.datum.getScore());
-      // transfer existing metadata
-      newDatum.getMetaData().putAll(fit.datum.getMetaData());
-      try {
-        scfilters.initialScore(url, newDatum);
-      } catch (ScoringFilterException e) {
-        e.printStackTrace();
-      }
-      if (reprUrl != null) {
-        newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
-            new Text(reprUrl));
-      }
+      CrawlDatum newDatum = createRedirDatum(url, fit, CrawlDatum.STATUS_LINKED);
       output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
       LOG.debug(" - {} redirect to {} (fetching later)", redirType, url);
       return null;
     }
   }
 
-  private FetchItem queueRedirect(Text redirUrl, FetchItem fit)
-      throws ScoringFilterException {
-    CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
-        fit.datum.getFetchInterval(), fit.datum.getScore());
-    // transfer all existing metadata to the redirect
+  private CrawlDatum createRedirDatum(Text redirUrl, FetchItem fit, byte status) {
+    CrawlDatum newDatum = new CrawlDatum(status, fit.datum.getFetchInterval(),
+        fit.datum.getScore());
+    // transfer existing metadata
     newDatum.getMetaData().putAll(fit.datum.getMetaData());
-    scfilters.initialScore(redirUrl, newDatum);
+    try {
+      scfilters.initialScore(redirUrl, newDatum);
+    } catch (ScoringFilterException e) {
+      LOG.error("Scoring filtering failed for {}: ", redirUrl, e);
+    }
     if (reprUrl != null) {
       newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
           new Text(reprUrl));
     }
+    return newDatum;
+  }
+
+  private FetchItem queueRedirect(Text redirUrl, FetchItem fit)
+      throws ScoringFilterException {
+    CrawlDatum newDatum = createRedirDatum(redirUrl, fit, CrawlDatum.STATUS_DB_UNFETCHED);
     fit = FetchItem.create(redirUrl, newDatum, queueMode);
     if (fit != null) {
       FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);

Reply via email to