Author: lewismc
Date: Mon Jun 11 20:27:20 2012
New Revision: 1348993
URL: http://svn.apache.org/viewvc?rev=1348993&view=rev
Log:
commit to address NUTCH-1360 and update to CHANGES.txt
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1348993&r1=1348992&r2=1348993&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Jun 11 20:27:20 2012
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-1360 Support the storing of IP address connected to when web crawling
(lewismc)
+
* NUTCH-1262 Map `duplicating` content-types to a single type (markus)
* NUTCH-1384 Typo in ParseSegments's run-method (Matthias Agethle via markus)
Modified: nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1348993&r1=1348992&r2=1348993&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Mon Jun 11 20:27:20 2012
@@ -255,6 +255,13 @@
</description>
</property>
+<property>
+ <name>http.store.ip.address</name>
+ <value>false</value>
+ <description>Enables us to capture the specific IP address of the
+ host which we connect to in order to fetch a page.</description>
+</property>
+
<!-- FTP properties -->
<property>
Modified: nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java?rev=1348993&r1=1348992&r2=1348993&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java Mon Jun 11
20:27:20 2012
@@ -45,5 +45,7 @@ public interface HttpHeaders {
public final static String LAST_MODIFIED = "Last-Modified";
public final static String LOCATION = "Location";
+
+ public final static String IP_ADDRESS = "_ip";
}
Modified:
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1348993&r1=1348992&r2=1348993&view=diff
==============================================================================
---
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
(original)
+++
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
Mon Jun 11 20:27:20 2012
@@ -20,7 +20,7 @@ package org.apache.nutch.protocol.http.a
import java.io.IOException;
import java.net.URL;
-// Commons Logging imports
+// Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -71,14 +71,17 @@ public abstract class HttpBase implement
/** The Nutch 'User-Agent' request header */
protected String userAgent = getAgentString(
"NutchCVS", null, "Nutch",
- "http://lucene.apache.org/nutch/bot.html",
- "[email protected]");
+ "http://nutch.apache.org/bot.html",
+ "[email protected]");
/** The "Accept-Language" request header value. */
protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3";
/** The "Accept" request header value. */
protected String accept =
"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
+
+ /** Flag read from the "http.store.ip.address" property (default false). */
+ protected boolean ip_header = false;
/** The default logger */
private final static Logger LOGGER = LoggerFactory.getLogger(HttpBase.class);
@@ -120,6 +123,7 @@ public abstract class HttpBase implement
.get("http.agent.description"), conf.get("http.agent.url"),
conf.get("http.agent.email"));
this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
this.accept = conf.get("http.accept", accept);
+ this.ip_header = conf.getBoolean("http.store.ip.address", false);
// backward-compatible default setting
this.useHttp11 = conf.getBoolean("http.useHttp11", false);
this.robots.setConf(conf);
@@ -247,6 +251,10 @@ public abstract class HttpBase implement
return useHttp11;
}
+ public boolean getIP_Header(){
+ return ip_header;
+ }
+
private static String getAgentString(String agentName,
String agentVersion,
String agentDesc,
@@ -301,6 +309,7 @@ public abstract class HttpBase implement
logger.info("http.agent = " + userAgent);
logger.info("http.accept.language = " + acceptLanguage);
logger.info("http.accept = " + accept);
+ logger.info("http.store.ip.address = " + ip_header);
}
}
Modified:
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1348993&r1=1348992&r2=1348993&view=diff
==============================================================================
---
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
(original)
+++
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
Mon Jun 11 20:27:20 2012
@@ -93,7 +93,9 @@ public class HttpResponse implements Res
int sockPort = http.useProxy() ? http.getProxyPort() : port;
InetSocketAddress sockAddr= new InetSocketAddress(sockHost, sockPort);
socket.connect(sockAddr, http.getTimeout());
-
+
+ headers.set("_ip", socket.getInetAddress().getHostAddress());
+
// make request
OutputStream req = socket.getOutputStream();
@@ -110,6 +112,12 @@ public class HttpResponse implements Res
reqStr.append(host);
reqStr.append(portString);
reqStr.append("\r\n");
+
+ if(this.http.getConf().getBoolean("http.store.ip.address", true)) {
+ reqStr.append("_ip: ");
+ reqStr.append(http.getIP_Header());
+ reqStr.append("\r\n");
+ }
reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
@@ -432,5 +440,5 @@ public class HttpResponse implements Res
in.unread(value);
return value;
}
-
+
}