Author: markus
Date: Tue Mar 3 13:16:39 2015
New Revision: 1663698
URL: http://svn.apache.org/r1663698
Log:
NUTCH-1921 Optionally disable HTTP If-Modified-Since header
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1663698&r1=1663697&r2=1663698&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Mar 3 13:16:39 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1921 Optionally disable HTTP if-modified-since header (markus)
+
* NUTCH-1933 nutch-selenium plugin (Mo Omer, Mohammad Al-Moshin, lewismc)
* NUTCH-827 HTTP POST Authentication (Jasper van Veghel, yuanyun.cn, snagel,
lewismc)
Modified: nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1663698&r1=1663697&r2=1663698&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Tue Mar 3 13:16:39 2015
@@ -297,6 +297,17 @@
</description>
</property>
+<property>
+ <name>http.enable.if.modified.since.header</name>
+ <value>true</value>
+ <description>Whether Nutch sends an HTTP If-Modified-Since header. When enabled,
+ it saves bandwidth by not downloading pages for which the server responds with
+ HTTP 304 Not Modified. URLs that are not downloaded are not passed through
+ parse or indexing filters. If you regularly modify filters, you should force
+ Nutch to also download unmodified pages by disabling this feature.
+ </description>
+</property>
+
<!-- FTP properties -->
<property>
Modified:
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1663698&r1=1663697&r2=1663698&view=diff
==============================================================================
---
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
(original)
+++
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
Tue Mar 3 13:16:39 2015
@@ -107,6 +107,9 @@ public abstract class HttpBase implement
/** Which TLS/SSL cipher suites to support */
protected Set<String> tlsPreferredCipherSuites;
+
+ /** Whether to send the If-Modified-Since request header; set from http.enable.if.modified.since.header */
+ public boolean enableIfModifiedsinceHeader = true;
/** Creates a new instance of HttpBase */
public HttpBase() {
@@ -137,6 +140,7 @@ public abstract class HttpBase implement
// backward-compatible default setting
this.useHttp11 = conf.getBoolean("http.useHttp11", false);
this.responseTime = conf.getBoolean("http.store.responsetime", true);
+ this.enableIfModifiedsinceHeader =
conf.getBoolean("http.enable.if.modified.since.header", true);
this.robots.setConf(conf);
String[] protocols = conf.getStrings("http.tls.supported.protocols",
@@ -298,6 +302,10 @@ public abstract class HttpBase implement
public int getTimeout() {
return timeout;
}
+
+ public boolean isIfModifiedSinceEnabled() {
+ return enableIfModifiedsinceHeader;
+ }
public int getMaxContent() {
return maxContent;
Modified:
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1663698&r1=1663697&r2=1663698&view=diff
==============================================================================
---
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
(original)
+++
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
Tue Mar 3 13:16:39 2015
@@ -192,7 +192,7 @@ public class HttpResponse implements Res
reqStr.append(this.http.getAccept());
reqStr.append("\r\n");
- if (datum.getModifiedTime() > 0) {
+ if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
reqStr.append("If-Modified-Since: "
+ HttpDateFormat.toString(datum.getModifiedTime()));
reqStr.append("\r\n");
Modified:
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=1663698&r1=1663697&r2=1663698&view=diff
==============================================================================
---
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
(original)
+++
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
Tue Mar 3 13:16:39 2015
@@ -74,7 +74,7 @@ public class HttpResponse implements Res
GetMethod get = new GetMethod(url.toString());
get.setFollowRedirects(followRedirects);
get.setDoAuthentication(true);
- if (datum.getModifiedTime() > 0) {
+ if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
get.setRequestHeader("If-Modified-Since",
HttpDateFormat.toString(datum.getModifiedTime()));
}