This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new 54f73bf NUTCH-2725 Plugin lib-http to support per-host configurable cookies 54f73bf is described below commit 54f73bf78ded8b66ba262270d069232417bbe391 Author: Markus Jelsma <mar...@apache.org> AuthorDate: Mon Jul 29 12:44:49 2019 +0200 NUTCH-2725 Plugin lib-http to support per-host configurable cookies --- conf/cookies.txt | 3 ++ conf/nutch-default.xml | 8 ++++ .../apache/nutch/protocol/http/api/HttpBase.java | 56 ++++++++++++++++++++++ .../apache/nutch/protocol/http/HttpResponse.java | 23 ++++++--- .../nutch/protocol/httpclient/HttpResponse.java | 17 +++++-- .../nutch/protocol/okhttp/OkHttpResponse.java | 19 ++++++-- 6 files changed, 111 insertions(+), 15 deletions(-) diff --git a/conf/cookies.txt b/conf/cookies.txt new file mode 100644 index 0000000..f75f220 --- /dev/null +++ b/conf/cookies.txt @@ -0,0 +1,3 @@ +# Optional per-host configurable cookies. Format: +# +# <host>\t<cookie> diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index a9ce899..e88991c 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -190,6 +190,14 @@ </property> <property> + <name>http.agent.host.cookie.file</name> + <value>cookies.txt</value> + <description> + File containing per-host configured cookies. + </description> +</property> + +<property> <name>http.agent.host</name> <value></value> <description>Name or IP address of the host on which the Nutch crawler diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java index bcc2e29..4b91f9c 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java @@ -28,6 +28,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.concurrent.ThreadLocalRandom; @@ -45,6 +46,7 @@ import org.apache.nutch.protocol.ProtocolStatus; import org.apache.nutch.util.GZIPUtils; import org.apache.nutch.util.MimeUtil; import org.apache.nutch.util.DeflateUtils; +import org.apache.nutch.util.URLUtil; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.conf.Configuration; @@ -66,6 +68,9 @@ public abstract class HttpBase implements Protocol { private HttpRobotRulesParser robots = null; private ArrayList<String> userAgentNames = null; + + /** Mapping hostnames to cookies */ + private Map<String, String> hostCookies = null; /** The proxy hostname. */ protected String proxyHost = null; @@ -257,6 +262,42 @@ public abstract class HttpBase implements Protocol { .warn("Falling back to fixed user agent set via property http.agent.name"); } } + + // If cookies are enabled, try to load a per-host cookie file + if (enableCookieHeader) { + String cookieFile = conf.get("http.agent.host.cookie.file", "cookies.txt"); + BufferedReader br = null; + try { + Reader reader = conf.getConfResourceAsReader(cookieFile); + br = new BufferedReader(reader); + hostCookies = new HashMap<String,String>(); + String word = ""; + while ((word = br.readLine()) != null) { + if (!word.trim().isEmpty()) { + if (word.indexOf("#") == -1) { // skip comment + String[] parts = word.split("\t"); + if (parts.length == 2) { + hostCookies.put(parts[0], parts[1]); + } else { + LOG.warn("Unable to parse cookie file correctly at: " + word); + } + } + } + } + } catch (Exception e) { + logger.warn("Failed to read http.agent.host.cookie.file {}: {}", cookieFile, + StringUtils.stringifyException(e)); + hostCookies = null; + } finally { + if (br != null) { + try { + br.close(); + } catch (IOException e) { + // ignore + } + } + } + } String[] protocols = conf.getStrings("http.tls.supported.protocols", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3"); @@ -479,6 +520,21 @@ public abstract class HttpBase implements Protocol { } return userAgent; } + + /** + * If per-host cookies are configured, this method will look it up + * for the given url. + * + * @param url the url to look-up a cookie for + * @return the cookie or null + */ + public String getCookie(URL url) { + if (hostCookies != null) { + return hostCookies.get(url.getHost()); + } + + return null; + } /** * Value of "Accept-Language" request header sent by Nutch. diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java index 5a4b1ef..2d75b1c 100644 --- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java +++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java @@ -213,13 +213,22 @@ public class HttpResponse implements Response { reqStr.append("\r\n"); } - if (http.isCookieEnabled() - && datum.getMetaData().containsKey(HttpBase.COOKIE)) { - String cookie = ((Text) datum.getMetaData().get(HttpBase.COOKIE)) - .toString(); - reqStr.append("Cookie: "); - reqStr.append(cookie); - reqStr.append("\r\n"); + if (http.isCookieEnabled()) { + String cookie = null; + + if (datum.getMetaData().containsKey(HttpBase.COOKIE)) { + cookie = ((Text)datum.getMetaData().get(HttpBase.COOKIE)).toString(); + } + + if (cookie == null) { + cookie = http.getCookie(url); + } + + if (cookie != null) { + reqStr.append("Cookie: "); + reqStr.append(cookie); + reqStr.append("\r\n"); + } } if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) { diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java index 3628d91..010f5ca 100644 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java +++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java @@ -100,9 +100,20 @@ public class HttpResponse implements Response { // XXX little danger in retrying... // params.setParameter(HttpMethodParams.RETRY_HANDLER, null); - if (http.isCookieEnabled() && datum.getMetaData().containsKey(http.COOKIE)) { - String cookie = ((Text)datum.getMetaData().get(http.COOKIE)).toString(); - get.addRequestHeader("Cookie", cookie); + if (http.isCookieEnabled()) { + String cookie = null; + + if (datum.getMetaData().containsKey(http.COOKIE)) { + cookie = ((Text)datum.getMetaData().get(http.COOKIE)).toString(); + } + + if (cookie == null) { + cookie = http.getCookie(url); + } + + if (cookie != null) { + get.addRequestHeader("Cookie", cookie); + } } try { diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java index 3230413..d7d4cdf 100644 --- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java +++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java @@ -84,11 +84,20 @@ public class OkHttpResponse implements Response { HttpDateFormat.toString(datum.getModifiedTime())); } - if (okhttp.isCookieEnabled() - && datum.getMetaData().containsKey(HttpBase.COOKIE)) { - String cookie = ((Text) datum.getMetaData().get(HttpBase.COOKIE)) - .toString(); - rb.header("Cookie", cookie); + if (okhttp.isCookieEnabled()) { + String cookie = null; + + if (datum.getMetaData().containsKey(HttpBase.COOKIE)) { + cookie = ((Text)datum.getMetaData().get(HttpBase.COOKIE)).toString(); + } + + if (cookie == null) { + cookie = okhttp.getCookie(url); + } + + if (cookie != null) { + rb.header("Cookie", cookie); + } } Request request = rb.build();