[ 
https://issues.apache.org/jira/browse/NUTCH-2623?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16641180#comment-16641180
 ] 

ASF GitHub Bot commented on NUTCH-2623:
---------------------------------------

sebastian-nagel closed pull request #369: NUTCH-2623 Fetcher to guarantee delay 
for same host/domain/ip independent of http/https protocol
URL: https://github.com/apache/nutch/pull/369
 
 
   

This is a PR merged from a forked repository. Because GitHub hides the
original diff of a foreign pull request (one from a fork) once it has been
merged, the diff is reproduced below for the sake of provenance:

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 51710f7c7..9f57af26e 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -979,8 +979,9 @@
 <property>
   <name>fetcher.queue.mode</name>
   <value>byHost</value>
-  <description>Determines how to put URLs into queues. Default value is 
'byHost', 
-  also takes 'byDomain' or 'byIP'. 
+  <description>Determines how to put URLs into queues. Default value
+  is 'byHost', also takes 'byDomain' or 'byIP'. Crawl delays are
+  implemented on the level of fetcher queues.
   </description>
 </property>
 
diff --git a/src/java/org/apache/nutch/fetcher/FetchItem.java 
b/src/java/org/apache/nutch/fetcher/FetchItem.java
index 538fc5f30..f56ed2536 100644
--- a/src/java/org/apache/nutch/fetcher/FetchItem.java
+++ b/src/java/org/apache/nutch/fetcher/FetchItem.java
@@ -20,6 +20,7 @@
 import java.net.InetAddress;
 import java.net.URL;
 import java.net.UnknownHostException;
+import java.util.Locale;
 
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
@@ -65,7 +66,6 @@ public static FetchItem create(Text url, CrawlDatum datum, 
String queueMode) {
 
   public static FetchItem create(Text url, CrawlDatum datum,
       String queueMode, int outlinkDepth) {
-    String queueID;
     URL u = null;
     try {
       u = new URL(url.toString());
@@ -73,7 +73,6 @@ public static FetchItem create(Text url, CrawlDatum datum,
       LOG.warn("Cannot parse url: " + url, e);
       return null;
     }
-    final String proto = u.getProtocol().toLowerCase();
     String key;
     if (FetchItemQueues.QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) {
       try {
@@ -85,21 +84,20 @@ public static FetchItem create(Text url, CrawlDatum datum,
         return null;
       }
     } else if (FetchItemQueues.QUEUE_MODE_DOMAIN.equalsIgnoreCase(queueMode)) {
-      key = URLUtil.getDomainName(u);
+      key = URLUtil.getDomainName(u).toLowerCase(Locale.ROOT);
       if (key == null) {
         LOG.warn("Unknown domain for url: " + url
             + ", using URL string as key");
         key = u.toExternalForm();
       }
     } else {
-      key = u.getHost();
+      key = u.getHost().toLowerCase(Locale.ROOT);
       if (key == null) {
         LOG.warn("Unknown host for url: " + url + ", using URL string as key");
         key = u.toExternalForm();
       }
     }
-    queueID = proto + "://" + key.toLowerCase();
-    return new FetchItem(url, u, datum, queueID, outlinkDepth);
+    return new FetchItem(url, u, datum, key, outlinkDepth);
   }
 
   public CrawlDatum getDatum() {
diff --git a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java 
b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
index 4259aa7bd..407d00fb6 100644
--- a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
+++ b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
@@ -57,14 +57,7 @@ public FetchItemQueues(Configuration conf) {
     this.conf = conf;
     this.maxThreads = conf.getInt("fetcher.threads.per.queue", 1);
     queueMode = conf.get("fetcher.queue.mode", QUEUE_MODE_HOST);
-    // check that the mode is known
-    if (!queueMode.equals(QUEUE_MODE_IP)
-        && !queueMode.equals(QUEUE_MODE_DOMAIN)
-        && !queueMode.equals(QUEUE_MODE_HOST)) {
-      LOG.error("Unknown partition mode : " + queueMode
-          + " - forcing to byHost");
-      queueMode = QUEUE_MODE_HOST;
-    }
+    queueMode = checkQueueMode(queueMode);
     LOG.info("Using queue mode : " + queueMode);
 
     this.crawlDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 
1000);
@@ -75,6 +68,24 @@ public FetchItemQueues(Configuration conf) {
         "fetcher.max.exceptions.per.queue", -1);
   }
 
+  /**
+   * Check whether queue mode is valid, fall-back to default mode if not.
+   * 
+   * @param queueMode
+   *          queue mode to check
+   * @return valid queue mode or default
+   */
+  protected static String checkQueueMode(String queueMode) {
+    // check that the mode is known
+    if (!queueMode.equals(QUEUE_MODE_IP)
+        && !queueMode.equals(QUEUE_MODE_DOMAIN)
+        && !queueMode.equals(QUEUE_MODE_HOST)) {
+      LOG.error("Unknown partition mode : {} - forcing to byHost", queueMode);
+      queueMode = QUEUE_MODE_HOST;
+    }
+    return queueMode;
+  }
+
   public int getTotalSize() {
     return totalSize.get();
   }
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java 
b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 088b0fdd9..bfcc3741e 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -186,13 +186,7 @@ public FetcherThread(Configuration conf, AtomicInteger 
activeThreads, FetchItemQ
     
     queueMode = conf.get("fetcher.queue.mode",
         FetchItemQueues.QUEUE_MODE_HOST);
-    // check that the mode is known
-    if (!queueMode.equals(FetchItemQueues.QUEUE_MODE_IP)
-        && !queueMode.equals(FetchItemQueues.QUEUE_MODE_DOMAIN)
-        && !queueMode.equals(FetchItemQueues.QUEUE_MODE_HOST)) {
-      LOG.error("Unknown partition mode : {} - forcing to byHost", queueMode);
-      queueMode = FetchItemQueues.QUEUE_MODE_HOST;
-    }
+    queueMode = FetchItemQueues.checkQueueMode(queueMode);
     LOG.info("{} {} Using queue mode : {}", getName(),
         Thread.currentThread().getId(), queueMode);
     this.maxRedirect = conf.getInt("http.redirect.max", 3);


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> Fetcher to guarantee delay for same host/domain/ip independent of http/https 
> protocol
> -------------------------------------------------------------------------------------
>
>                 Key: NUTCH-2623
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2623
>             Project: Nutch
>          Issue Type: Improvement
>          Components: fetcher
>    Affects Versions: 1.14
>            Reporter: Sebastian Nagel
>            Priority: Minor
>             Fix For: 1.16
>
>
> Fetcher uses a combination of protocol and host/domain/ip as the ID for fetch
> item queues, see
> [FetchItem.java|https://github.com/apache/nutch/blob/2b93a66/src/java/org/apache/nutch/fetcher/FetchItem.java#L101].
> Because each protocol gets its own queue, the crawl delay is not guaranteed
> when both http:// and https:// URLs are fetched from the same host/domain/ip.
> In the example below, with a large delay of 30 sec., both URLs are fetched
> within milliseconds of each other:
> {noformat}
> 2018-07-23 14:54:39,834 INFO fetcher.FetcherThread - FetcherThread 24 
> fetching http://nutch.apache.org/ (queue crawl delay=30000ms)
> 2018-07-23 14:54:39,846 INFO fetcher.FetcherThread - FetcherThread 23 
> fetching https://nutch.apache.org/ (queue crawl delay=30000ms)
> {noformat}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to