Author: markus Date: Tue Mar 3 13:16:39 2015 New Revision: 1663698 URL: http://svn.apache.org/r1663698 Log: NUTCH-1921 Optionally disable HTTP if-modified-since header
Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1663698&r1=1663697&r2=1663698&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Mar 3 13:16:39 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1921 Optionally disable HTTP if-modified-since header (markus) + * NUTCH-1933 nutch-selenium plugin (Mo Omer, Mohammad Al-Moshin, lewismc) * NUTCH-827 HTTP POST Authentication (Jasper van Veghel, yuanyun.cn, snagel, lewismc) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1663698&r1=1663697&r2=1663698&view=diff ============================================================================== --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Tue Mar 3 13:16:39 2015 @@ -297,6 +297,17 @@ </description> </property> +<property> + <name>http.enable.if.modified.since.header</name> + <value>true</value> + <description>Whether Nutch sends an HTTP If-Modified-Since header. It reduces + bandwidth when enabled by not downloading pages that respond with an HTTP + Not-Modified header. URL's that are not downloaded are not passed through + parse or indexing filters. If you regularly modify filters, you should force + Nutch to also download unmodified pages by disabling this feature. 
+ </description> +</property> + <!-- FTP properties --> <property> Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1663698&r1=1663697&r2=1663698&view=diff ============================================================================== --- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Tue Mar 3 13:16:39 2015 @@ -107,6 +107,9 @@ public abstract class HttpBase implement /** Which TLS/SSL cipher suites to support */ protected Set<String> tlsPreferredCipherSuites; + + /** Configuration directive for If-Modified-Since HTTP header */ + public boolean enableIfModifiedsinceHeader = true; /** Creates a new instance of HttpBase */ public HttpBase() { @@ -137,6 +140,7 @@ public abstract class HttpBase implement // backward-compatible default setting this.useHttp11 = conf.getBoolean("http.useHttp11", false); this.responseTime = conf.getBoolean("http.store.responsetime", true); + this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true); this.robots.setConf(conf); String[] protocols = conf.getStrings("http.tls.supported.protocols", @@ -298,6 +302,10 @@ public abstract class HttpBase implement public int getTimeout() { return timeout; } + + public boolean isIfModifiedSinceEnabled() { + return enableIfModifiedsinceHeader; + } public int getMaxContent() { return maxContent; Modified: nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1663698&r1=1663697&r2=1663698&view=diff 
============================================================================== --- nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original) +++ nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Tue Mar 3 13:16:39 2015 @@ -192,7 +192,7 @@ public class HttpResponse implements Res reqStr.append(this.http.getAccept()); reqStr.append("\r\n"); - if (datum.getModifiedTime() > 0) { + if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) { reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime())); reqStr.append("\r\n"); Modified: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=1663698&r1=1663697&r2=1663698&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original) +++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Tue Mar 3 13:16:39 2015 @@ -74,7 +74,7 @@ public class HttpResponse implements Res GetMethod get = new GetMethod(url.toString()); get.setFollowRedirects(followRedirects); get.setDoAuthentication(true); - if (datum.getModifiedTime() > 0) { + if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) { get.setRequestHeader("If-Modified-Since", HttpDateFormat.toString(datum.getModifiedTime())); }