Repository: nutch Updated Branches: refs/heads/master 9a9c4b32b -> 217fad16b
NUTCH-2355 Protocol plugins to set cookie if Cookie metadata field is present Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/217fad16 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/217fad16 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/217fad16 Branch: refs/heads/master Commit: 217fad16bfdea0494390e8f170d9350cf06657ef Parents: 9a9c4b3 Author: Markus Jelsma <[email protected]> Authored: Tue Feb 21 11:55:33 2017 +0100 Committer: Markus Jelsma <[email protected]> Committed: Tue Feb 21 11:55:33 2017 +0100 ---------------------------------------------------------------------- conf/nutch-default.xml | 8 ++++++++ .../org/apache/nutch/protocol/http/api/HttpBase.java | 15 +++++++++++++-- .../org/apache/nutch/protocol/http/HttpResponse.java | 11 +++++++++-- .../nutch/protocol/httpclient/HttpResponse.java | 7 +++++++ 4 files changed, 37 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/217fad16/conf/nutch-default.xml ---------------------------------------------------------------------- diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index ea7df89..08fb8a0 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -344,6 +344,14 @@ </description> </property> +<property> + <name>http.enable.cookie.header</name> + <value>true</value> + <description>Whether Nutch sends an HTTP Cookie header. The cookie value + is read from the CrawlDatum Cookie metadata field. + </description> +</property> + <!-- FTP properties --> <property> http://git-wip-us.apache.org/repos/asf/nutch/blob/217fad16/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java index 41b63e3..eb3eb60 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java @@ -59,6 +59,8 @@ public abstract class HttpBase implements Protocol { public static final Text RESPONSE_TIME = new Text("_rs_"); + public static final Text COOKIE = new Text("Cookie"); + public static final int BUFFER_SIZE = 8 * 1024; private static final byte[] EMPTY_CONTENT = new byte[0]; @@ -124,7 +126,10 @@ public abstract class HttpBase implements Protocol { protected Set<String> tlsPreferredCipherSuites; /** Configuration directive for If-Modified-Since HTTP header */ - public boolean enableIfModifiedsinceHeader = true; + protected boolean enableIfModifiedsinceHeader = true; + + /** Controls whether or not to set Cookie HTTP header based on CrawlDatum metadata */ + protected boolean enableCookieHeader = true; /** Creates a new instance of HttpBase */ public HttpBase() { @@ -157,6 +162,7 @@ public abstract class HttpBase implements Protocol { this.useHttp11 = conf.getBoolean("http.useHttp11", false); this.responseTime = conf.getBoolean("http.store.responsetime", true); this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true); + this.enableCookieHeader = conf.getBoolean("http.enable.cookie.header", true); this.robots.setConf(conf); // NUTCH-1941: read list of alternating agent names @@ -369,6 +375,10 @@ public abstract class HttpBase implements Protocol { public boolean isIfModifiedSinceEnabled() { return enableIfModifiedsinceHeader; } + + public boolean isCookieEnabled() { + return enableCookieHeader; + } public int getMaxContent() { return maxContent; @@ -458,6 +468,7 @@ public abstract class HttpBase implements Protocol { logger.info("http.agent = " + userAgent); logger.info("http.accept.language = " + acceptLanguage); logger.info("http.accept = " + accept); + logger.info("http.enable.cookie.header = " + isCookieEnabled()); } } @@ -584,4 +595,4 @@ public abstract class HttpBase implements Protocol { } return hm; } -} +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/217fad16/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java index f6d7e4d..d984dc4 100644 --- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java +++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java @@ -34,6 +34,7 @@ import javax.net.ssl.SSLSocket; import javax.net.ssl.SSLSocketFactory; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.SpellCheckedMetadata; @@ -58,7 +59,7 @@ public class HttpResponse implements Response { private Metadata headers = new SpellCheckedMetadata(); // used for storing the http headers verbatim private StringBuffer httpHeaders; - + protected enum Scheme { HTTP, HTTPS, } @@ -195,6 +196,13 @@ public class HttpResponse implements Response { reqStr.append("Accept: "); reqStr.append(this.http.getAccept()); reqStr.append("\r\n"); + + if (http.isCookieEnabled() && datum.getMetaData().containsKey(http.COOKIE)) { + String cookie = ((Text)datum.getMetaData().get(http.COOKIE)).toString(); + reqStr.append("Cookie: "); + reqStr.append(cookie); + reqStr.append("\r\n"); + } if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) { reqStr.append("If-Modified-Since: " + HttpDateFormat @@ -554,5 +562,4 @@ public class HttpResponse implements Response { in.unread(value); return value; } - } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/217fad16/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java index f074af2..6041e13 100644 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java +++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java @@ -39,6 +39,7 @@ import org.apache.nutch.metadata.SpellCheckedMetadata; import org.apache.nutch.net.protocols.HttpDateFormat; import org.apache.nutch.net.protocols.Response; import org.apache.nutch.protocol.http.api.HttpBase; +import org.apache.hadoop.io.Text; /** * An HTTP response. @@ -96,6 +97,12 @@ public class HttpResponse implements Response { // XXX the request body was sent the method is not retried, so there is // XXX little danger in retrying... // params.setParameter(HttpMethodParams.RETRY_HANDLER, null); + + if (http.isCookieEnabled() && datum.getMetaData().containsKey(http.COOKIE)) { + String cookie = ((Text)datum.getMetaData().get(http.COOKIE)).toString(); + get.addRequestHeader("Cookie", cookie); + } + try { HttpClient client = Http.getClient(); client.getParams().setParameter("http.useragent", http.getUserAgent()); // NUTCH-1941
