This is an automated email from the ASF dual-hosted git repository.

markus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 54f73bf  NUTCH-2725 Plugin lib-http to support per-host configurable cookies
54f73bf is described below

commit 54f73bf78ded8b66ba262270d069232417bbe391
Author: Markus Jelsma <mar...@apache.org>
AuthorDate: Mon Jul 29 12:44:49 2019 +0200

    NUTCH-2725 Plugin lib-http to support per-host configurable cookies
---
 conf/cookies.txt                                   |  3 ++
 conf/nutch-default.xml                             |  8 ++++
 .../apache/nutch/protocol/http/api/HttpBase.java   | 56 ++++++++++++++++++++++
 .../apache/nutch/protocol/http/HttpResponse.java   | 23 ++++++---
 .../nutch/protocol/httpclient/HttpResponse.java    | 17 +++++--
 .../nutch/protocol/okhttp/OkHttpResponse.java      | 19 ++++++--
 6 files changed, 111 insertions(+), 15 deletions(-)

diff --git a/conf/cookies.txt b/conf/cookies.txt
new file mode 100644
index 0000000..f75f220
--- /dev/null
+++ b/conf/cookies.txt
@@ -0,0 +1,3 @@
+# Optional per-host configurable cookies. Format:
+#
+# <host>\t<cookie>
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index a9ce899..e88991c 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -190,6 +190,14 @@
 </property>
 
 <property>
+  <name>http.agent.host.cookie.file</name>
+  <value>cookies.txt</value>
+  <description>
+    File containing per-host configured cookies.
+  </description>
+</property>
+
+<property>
   <name>http.agent.host</name>
   <value></value>
   <description>Name or IP address of the host on which the Nutch crawler
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index bcc2e29..4b91f9c 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -28,6 +28,7 @@ import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ThreadLocalRandom;
 
@@ -45,6 +46,7 @@ import org.apache.nutch.protocol.ProtocolStatus;
 import org.apache.nutch.util.GZIPUtils;
 import org.apache.nutch.util.MimeUtil;
 import org.apache.nutch.util.DeflateUtils;
+import org.apache.nutch.util.URLUtil;
 import org.apache.hadoop.util.StringUtils;
 
 import org.apache.hadoop.conf.Configuration;
@@ -66,6 +68,9 @@ public abstract class HttpBase implements Protocol {
   private HttpRobotRulesParser robots = null;
 
   private ArrayList<String> userAgentNames = null;
+  
+  /** Mapping hostnames to cookies */
+  private Map<String, String> hostCookies = null;
 
   /** The proxy hostname. */
   protected String proxyHost = null;
@@ -257,6 +262,42 @@ public abstract class HttpBase implements Protocol {
             .warn("Falling back to fixed user agent set via property http.agent.name");
       }
     }
+    
+    // If cookies are enabled, try to load a per-host cookie file
+    if (enableCookieHeader) {
+      String cookieFile = conf.get("http.agent.host.cookie.file", "cookies.txt");
+      BufferedReader br = null;
+      try {
+        Reader reader = conf.getConfResourceAsReader(cookieFile);
+        br = new BufferedReader(reader);
+        hostCookies = new HashMap<String,String>();
+        String word = "";
+        while ((word = br.readLine()) != null) {
+          if (!word.trim().isEmpty()) {
+            if (word.indexOf("#") == -1) { // skip comment
+              String[] parts = word.split("\t");
+              if (parts.length == 2) {
+                hostCookies.put(parts[0], parts[1]);
+              } else {
+                LOG.warn("Unable to parse cookie file correctly at: " + word);
+              }
+            }
+          }
+        }
+      } catch (Exception e) {
+        logger.warn("Failed to read http.agent.host.cookie.file {}: {}", cookieFile,
+            StringUtils.stringifyException(e));
+        hostCookies = null;
+      } finally {
+        if (br != null) {
+          try {
+            br.close();
+          } catch (IOException e) {
+            // ignore
+          }
+        }
+      }
+    }
 
     String[] protocols = conf.getStrings("http.tls.supported.protocols",
         "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
@@ -479,6 +520,21 @@ public abstract class HttpBase implements Protocol {
     }
     return userAgent;
   }
+  
+  /**
+   * If per-host cookies are configured, this method will look it up
+   * for the given url.
+   *
+   * @param url the url to look-up a cookie for
+   * @return the cookie or null
+   */
+  public String getCookie(URL url) {
+    if (hostCookies != null) {
+      return hostCookies.get(url.getHost());
+    }
+    
+    return null;
+  }
 
   /**
    * Value of "Accept-Language" request header sent by Nutch.
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
index 5a4b1ef..2d75b1c 100644
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
+++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
@@ -213,13 +213,22 @@ public class HttpResponse implements Response {
         reqStr.append("\r\n");
       }
 
-      if (http.isCookieEnabled()
-          && datum.getMetaData().containsKey(HttpBase.COOKIE)) {
-        String cookie = ((Text) datum.getMetaData().get(HttpBase.COOKIE))
-            .toString();
-        reqStr.append("Cookie: ");
-        reqStr.append(cookie);
-        reqStr.append("\r\n");
+      if (http.isCookieEnabled()) {
+        String cookie = null;
+        
+        if (datum.getMetaData().containsKey(HttpBase.COOKIE)) {
+          cookie = ((Text)datum.getMetaData().get(HttpBase.COOKIE)).toString();
+        }
+        
+        if (cookie == null) {
+          cookie = http.getCookie(url);
+        }
+        
+        if (cookie != null) {
+          reqStr.append("Cookie: ");
+          reqStr.append(cookie);
+          reqStr.append("\r\n");
+        }
       }
 
       if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
index 3628d91..010f5ca 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
@@ -100,9 +100,20 @@ public class HttpResponse implements Response {
     // XXX little danger in retrying...
     // params.setParameter(HttpMethodParams.RETRY_HANDLER, null);
     
-    if (http.isCookieEnabled() && datum.getMetaData().containsKey(http.COOKIE)) {
-      String cookie = ((Text)datum.getMetaData().get(http.COOKIE)).toString();
-      get.addRequestHeader("Cookie", cookie);
+    if (http.isCookieEnabled()) {
+      String cookie = null;
+      
+      if (datum.getMetaData().containsKey(http.COOKIE)) {
+        cookie = ((Text)datum.getMetaData().get(http.COOKIE)).toString();
+      }
+      
+      if (cookie == null) {
+        cookie = http.getCookie(url);
+      }
+      
+      if (cookie != null) {
+        get.addRequestHeader("Cookie", cookie);
+      }
     }
     
     try {
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
index 3230413..d7d4cdf 100644
--- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
@@ -84,11 +84,20 @@ public class OkHttpResponse implements Response {
           HttpDateFormat.toString(datum.getModifiedTime()));
     }
 
-    if (okhttp.isCookieEnabled()
-        && datum.getMetaData().containsKey(HttpBase.COOKIE)) {
-      String cookie = ((Text) datum.getMetaData().get(HttpBase.COOKIE))
-          .toString();
-      rb.header("Cookie", cookie);
+    if (okhttp.isCookieEnabled()) {
+      String cookie = null;
+      
+      if (datum.getMetaData().containsKey(HttpBase.COOKIE)) {
+        cookie = ((Text)datum.getMetaData().get(HttpBase.COOKIE)).toString();
+      }
+      
+      if (cookie == null) {
+        cookie = okhttp.getCookie(url);
+      }
+      
+      if (cookie != null) {
+        rb.header("Cookie", cookie);
+      }
     }
 
     Request request = rb.build();

Reply via email to