Author: markus
Date: Tue Mar  3 13:16:39 2015
New Revision: 1663698

URL: http://svn.apache.org/r1663698
Log:
NUTCH-1921 Optionally disable HTTP if-modified-since header

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
    
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
    
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1663698&r1=1663697&r2=1663698&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Mar  3 13:16:39 2015
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1921 Optionally disable HTTP if-modified-since header (markus)
+
 * NUTCH-1933 nutch-selenium plugin (Mo Omer, Mohammad Al-Moshin, lewismc)
 
 * NUTCH-827 HTTP POST Authentication (Jasper van Veghel, yuanyun.cn, snagel, lewismc)

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1663698&r1=1663697&r2=1663698&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Tue Mar  3 13:16:39 2015
@@ -297,6 +297,17 @@
   </description>
 </property>
 
+<property>
+  <name>http.enable.if.modified.since.header</name>
+  <value>true</value>
+  <description>Whether Nutch sends an HTTP If-Modified-Since header. It reduces
+  bandwidth when enabled by not downloading pages that respond with an HTTP
+  304 Not Modified status. URLs that are not downloaded are not passed through
+  parse or indexing filters. If you regularly modify filters, you should force
+  Nutch to also download unmodified pages by disabling this feature.
+  </description>
+</property>
+
 <!-- FTP properties -->
 
 <property>

Modified: 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1663698&r1=1663697&r2=1663698&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 (original)
+++ 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 Tue Mar  3 13:16:39 2015
@@ -107,6 +107,9 @@ public abstract class HttpBase implement
 
   /** Which TLS/SSL cipher suites to support */
   protected Set<String> tlsPreferredCipherSuites;
+  
+  /** Configuration directive for If-Modified-Since HTTP header */
+  public boolean enableIfModifiedsinceHeader = true;
 
   /** Creates a new instance of HttpBase */
   public HttpBase() {
@@ -137,6 +140,7 @@ public abstract class HttpBase implement
     // backward-compatible default setting
     this.useHttp11 = conf.getBoolean("http.useHttp11", false);
     this.responseTime = conf.getBoolean("http.store.responsetime", true);
+    this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true);
     this.robots.setConf(conf);
 
     String[] protocols = conf.getStrings("http.tls.supported.protocols",
@@ -298,6 +302,10 @@ public abstract class HttpBase implement
   public int getTimeout() {
     return timeout;
   }
+  
+  public boolean isIfModifiedSinceEnabled() {
+    return enableIfModifiedsinceHeader;
+  }
 
   public int getMaxContent() {
     return maxContent;

Modified: 
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1663698&r1=1663697&r2=1663698&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
 (original)
+++ 
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
 Tue Mar  3 13:16:39 2015
@@ -192,7 +192,7 @@ public class HttpResponse implements Res
       reqStr.append(this.http.getAccept());
       reqStr.append("\r\n");
 
-      if (datum.getModifiedTime() > 0) {
+      if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
         reqStr.append("If-Modified-Since: "
             + HttpDateFormat.toString(datum.getModifiedTime()));
         reqStr.append("\r\n");

Modified: 
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=1663698&r1=1663697&r2=1663698&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 (original)
+++ 
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 Tue Mar  3 13:16:39 2015
@@ -74,7 +74,7 @@ public class HttpResponse implements Res
     GetMethod get = new GetMethod(url.toString());
     get.setFollowRedirects(followRedirects);
     get.setDoAuthentication(true);
-    if (datum.getModifiedTime() > 0) {
+    if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
       get.setRequestHeader("If-Modified-Since",
           HttpDateFormat.toString(datum.getModifiedTime()));
     }


Reply via email to