Author: lewismc
Date: Mon Jun 11 20:27:20 2012
New Revision: 1348993

URL: http://svn.apache.org/viewvc?rev=1348993&view=rev
Log:
commit to address NUTCH-1360 and update to CHANGES.txt

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java
    
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
    
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1348993&r1=1348992&r2=1348993&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Jun 11 20:27:20 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk) Current Development:
 
+* NUTCH-1360 Support the storing of IP address connected to when web crawling (lewismc)
+
 * NUTCH-1262 Map `duplicating` content-types to a single type (markus)
 
 * NUTCH-1384 Typo in ParseSegments's run-method (Matthias Agethle via markus)

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1348993&r1=1348992&r2=1348993&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Mon Jun 11 20:27:20 2012
@@ -255,6 +255,13 @@
   </description>
 </property>
 
+<property>
+  <name>http.store.ip.address</name>
+  <value>false</value>
+  <description>Enables us to capture the specific IP address of the 
+  host which we connect to to fetch a page.</description>
+</property>
+
 <!-- FTP properties -->
 
 <property>

Modified: nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java?rev=1348993&r1=1348992&r2=1348993&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java Mon Jun 11 20:27:20 2012
@@ -45,5 +45,7 @@ public interface HttpHeaders {
   public final static String LAST_MODIFIED = "Last-Modified";
   
   public final static String LOCATION = "Location";
+  
+  public final static String IP_ADDRESS = "_ip";
 
 }

Modified: 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1348993&r1=1348992&r2=1348993&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Mon Jun 11 20:27:20 2012
@@ -20,7 +20,7 @@ package org.apache.nutch.protocol.http.a
 import java.io.IOException;
 import java.net.URL;
 
-// Commons Logging imports
+// Logging imports
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -71,14 +71,17 @@ public abstract class HttpBase implement
   /** The Nutch 'User-Agent' request header */
   protected String userAgent = getAgentString(
                         "NutchCVS", null, "Nutch",
-                        "http://lucene.apache.org/nutch/bot.html",
-                        "[email protected]");
+                        "http://nutch.apache.org/bot.html",
+                        "[email protected]");
 
   /** The "Accept-Language" request header value. */
   protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3";
 
   /** The "Accept" request header value. */
   protected String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
+  
+  /** The "_ip" request header value. */
+  protected boolean ip_header = false;
     
   /** The default logger */
   private final static Logger LOGGER = LoggerFactory.getLogger(HttpBase.class);
@@ -120,6 +123,7 @@ public abstract class HttpBase implement
               .get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email"));
       this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
       this.accept = conf.get("http.accept", accept);
+      this.ip_header = conf.getBoolean("http.store.ip.address", false);
       // backward-compatible default setting
       this.useHttp11 = conf.getBoolean("http.useHttp11", false);
       this.robots.setConf(conf);
@@ -247,6 +251,10 @@ public abstract class HttpBase implement
     return useHttp11;
   }
   
+  public boolean getIP_Header(){
+         return ip_header;
+  }
+  
   private static String getAgentString(String agentName,
                                        String agentVersion,
                                        String agentDesc,
@@ -301,6 +309,7 @@ public abstract class HttpBase implement
       logger.info("http.agent = " + userAgent);
       logger.info("http.accept.language = " + acceptLanguage);
       logger.info("http.accept = " + accept);
+      logger.info("http.store.ip.address = " + ip_header);
     }
   }
   

Modified: 
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1348993&r1=1348992&r2=1348993&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Mon Jun 11 20:27:20 2012
@@ -93,7 +93,9 @@ public class HttpResponse implements Res
       int sockPort = http.useProxy() ? http.getProxyPort() : port;
       InetSocketAddress sockAddr= new InetSocketAddress(sockHost, sockPort);
       socket.connect(sockAddr, http.getTimeout());
-
+      
+      headers.set("_ip", socket.getInetAddress().getHostAddress());
+      
       // make request
       OutputStream req = socket.getOutputStream();
 
@@ -110,6 +112,12 @@ public class HttpResponse implements Res
       reqStr.append(host);
       reqStr.append(portString);
       reqStr.append("\r\n");
+      
+      if(this.http.getConf().getBoolean("http.store.ip.address", true)) {
+        reqStr.append("_ip: ");
+        reqStr.append(http.getIP_Header());
+        reqStr.append("\r\n");
+      }
 
       reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
 
@@ -432,5 +440,5 @@ public class HttpResponse implements Res
     in.unread(value);
     return value;
   }
-
+  
 }


Reply via email to