Author: lewismc
Date: Mon Dec 23 17:10:58 2013
New Revision: 1553151

URL: http://svn.apache.org/r1553151
Log:
NUTCH-1360 Support the storing of IP address connected to when web crawling

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
    
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
    
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
    
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
    
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1553151&r1=1553150&r2=1553151&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Dec 23 17:10:58 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Development Trunk
 
+* NUTCH-1360 Support the storing of IP address connected to when web crawling 
(lewismc, ferdy and Yasin Kılınç)
+
 * NUTCH-1681 In URLUtil.java, toUNICODE method does not work correctly 
(İlhami KALKAN, snagel via markus)
 
 * NUTCH-1668 Remove package org.apache.nutch.indexer.solr (jnioche)

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1553151&r1=1553150&r2=1553151&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Mon Dec 23 17:10:58 2013
@@ -22,6 +22,18 @@
 
 <configuration>
 
+<!-- general properties  -->
+
+<property>
+  <name>store.ip.address</name>
+  <value>false</value>
+  <description>Enables us to capture the specific IP address 
+  (InetSocketAddress) of the host which we connect to via 
+  the given protocol. Currently supported by protocol-ftp and 
+  protocol-http.
+  </description>
+</property>
+
 <!-- file properties -->
 
 <property>

Modified: 
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=1553151&r1=1553150&r2=1553151&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
 (original)
+++ 
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
 Mon Dec 23 17:10:58 2013
@@ -19,9 +19,6 @@ package org.apache.nutch.protocol.file;
 
 // JDK imports
 import java.net.URL;
-import java.net.URI;
-import java.util.Date;
-import java.util.TreeMap;
 import java.io.IOException;
 import java.io.UnsupportedEncodingException;
 
@@ -96,6 +93,15 @@ public class FileResponse {
         getHeader(Response.CONTENT_TYPE), headers, this.conf);
   }
 
+  /**
+   * Public constructor; builds the response for the given URL and crawl datum.
+   * @param url
+   * @param datum
+   * @param file
+   * @param conf
+   * @throws FileException
+   * @throws IOException
+   */
   public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
     throws FileException, IOException {
 
@@ -232,7 +238,11 @@ public class FileResponse {
     this.code = 200; // http OK
   }
 
-  // get dir list as http response
+  /**
+   * get dir list as http response
+   * @param f
+   * @throws IOException
+   */
   private void getDirAsHttpResponse(java.io.File f) throws IOException {
 
     String path = f.toString();
@@ -253,7 +263,13 @@ public class FileResponse {
     this.code = 200; // http OK
   }
 
-  // generate html page from dir list
+  /**
+   * generate html page from dir list
+   * @param list
+   * @param path
+   * @param includeDotDot
+   * @return
+   */
   private byte[] list2html(java.io.File[] list, String path,
       boolean includeDotDot) {
 

Modified: 
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java?rev=1553151&r1=1553150&r2=1553151&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
 (original)
+++ 
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
 Mon Dec 23 17:10:58 2013
@@ -51,14 +51,14 @@ import org.apache.commons.net.ftp.FTPCon
  * ftp server implementations are hardly uniform and none seems to follow
  * RFCs whole-heartedly. We have no choice, but assume common denominator
  * as following:
- * (1) Use stream mode for data tranfer. Block mode will be better for
+ * (1) Use stream mode for data transfer. Block mode will be better for
  *     multiple file downloading and partial file downloading. However
  *     not every ftpd has block mode support.
  * (2) Use passive mode for data connection.
- *     So nutch will work if we run behind firewall.
+ *     So Nutch will work if we run behind firewall.
  * (3) Data connection is opened/closed per ftp command for the reasons
  *     listed in (1). There are ftp servers out there,
- *     when partial downloading is enforeced by closing data channel
+ *     when partial downloading is enforced by closing data channel
  *     socket on our client side, the server side immediately closes
  *     control channel (socket). Our codes deal with such a bad behavior.
  * (4) LIST is used to obtain remote file attributes if possible.
@@ -82,7 +82,7 @@ public class Client extends FTP
 //    private FTPFileEntryParser __entryParser;
     private String __systemName;
 
-    // constructor
+    /** Public default constructor */
     public Client()
     {
         __initDefaults();
@@ -150,7 +150,14 @@ public class Client extends FTP
         __passivePort = index;
     }
 
-    // open passive data connection socket
+    /** 
+     * open a passive data connection socket
+     * @param command
+     * @param arg
+     * @return
+     * @throws IOException
+     * @throws FtpExceptionCanNotHaveDataConnection
+     */
     protected Socket __openPassiveDataConnection(int command, String arg)
       throws IOException, FtpExceptionCanNotHaveDataConnection {
         Socket socket;
@@ -314,7 +321,17 @@ public class Client extends FTP
         return FTPReply.isPositiveCompletion(quit());
     }
 
-    // retrieve list reply for path
+    /**
+     * retrieve list reply for path
+     * @param path
+     * @param entries
+     * @param limit
+     * @param parser
+     * @throws IOException
+     * @throws FtpExceptionCanNotHaveDataConnection
+     * @throws FtpExceptionUnknownForcedDataClose
+     * @throws FtpExceptionControlClosedByForcedDataClose
+     */
     public void retrieveList(String path, List<FTPFile> entries, int limit,
       FTPFileEntryParser parser)
       throws IOException,
@@ -380,7 +397,16 @@ public class Client extends FTP
 
     }
 
-    // retrieve file for path
+    /**
+     * retrieve file for path
+     * @param path
+     * @param os
+     * @param limit
+     * @throws IOException
+     * @throws FtpExceptionCanNotHaveDataConnection
+     * @throws FtpExceptionUnknownForcedDataClose
+     * @throws FtpExceptionControlClosedByForcedDataClose
+     */
     public void retrieveFile(String path, OutputStream os, int limit)
       throws IOException,
         FtpExceptionCanNotHaveDataConnection,
@@ -450,7 +476,11 @@ public class Client extends FTP
 
     }
 
-    // reply check after closing data connection
+    /**
+     * reply check after closing data connection
+     * @param reply
+     * @return
+     */
     private boolean _notBadReply(int reply) {
 
       if (FTPReply.isPositiveCompletion(reply)) {

Modified: 
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=1553151&r1=1553150&r2=1553151&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
 (original)
+++ 
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
 Mon Dec 23 17:10:58 2013
@@ -22,21 +22,17 @@ import org.apache.commons.net.ftp.FTPFil
 import org.apache.commons.net.ftp.FTPReply;
 import org.apache.commons.net.ftp.parser.DefaultFTPFileEntryParserFactory;
 import org.apache.commons.net.ftp.parser.ParserInitializationException;
-
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.net.protocols.Response;
-
 import org.apache.hadoop.conf.Configuration;
 
 import java.net.InetAddress;
 import java.net.URL;
-
 import java.util.List;
 import java.util.LinkedList;
-
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 
@@ -111,6 +107,10 @@ public class FtpResponse {
       }
 
       InetAddress addr = InetAddress.getByName(url.getHost());
+      if (addr != null
+          && conf.getBoolean("store.ip.address", false) == true) {
+        headers.add("_ip_", addr.getHostAddress());
+      }
 
       // idled too long, remote server or ourselves may have timed out,
       // should start anew.

Modified: 
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java?rev=1553151&r1=1553150&r2=1553151&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
 (original)
+++ 
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
 Mon Dec 23 17:10:58 2013
@@ -40,10 +40,18 @@ public class Http extends HttpBase {
   public static final Logger LOG = LoggerFactory.getLogger(Http.class);
 
 
+  /**
+   * Public default constructor.
+   */
   public Http() {
     super(LOG);
   }
 
+  /**
+   * Set the {@link org.apache.hadoop.conf.Configuration}
+   * object.
+   * @param conf
+   */
   public void setConf(Configuration conf) {
     super.setConf(conf);
 //    Level logLevel = Level.WARNING;

Modified: 
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1553151&r1=1553150&r2=1553151&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
 (original)
+++ 
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
 Mon Dec 23 17:10:58 2013
@@ -28,7 +28,8 @@ import java.net.InetSocketAddress;
 import java.net.Socket;
 import java.net.URL;
 
-// Nutch imports
+import org.apache.hadoop.conf.Configuration;
+
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.SpellCheckedMetadata;
@@ -38,10 +39,10 @@ import org.apache.nutch.protocol.Protoco
 import org.apache.nutch.protocol.http.api.HttpBase;
 import org.apache.nutch.protocol.http.api.HttpException;
 
-
 /** An HTTP response. */
 public class HttpResponse implements Response {
  
+  private Configuration conf;
   private HttpBase http; 
   private URL url;
   private String orig;
@@ -50,6 +51,14 @@ public class HttpResponse implements Res
   private int code;
   private Metadata headers = new SpellCheckedMetadata();
 
+  /**
+   * Public constructor; builds the response for the given URL and crawl datum.
+   * @param http
+   * @param url
+   * @param datum
+   * @throws ProtocolException
+   * @throws IOException
+   */
   public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
     throws ProtocolException, IOException {
 
@@ -93,6 +102,12 @@ public class HttpResponse implements Res
       int sockPort = http.useProxy() ? http.getProxyPort() : port;
       InetSocketAddress sockAddr= new InetSocketAddress(sockHost, sockPort);
       socket.connect(sockAddr, http.getTimeout());
+      
+      this.conf = http.getConf();
+      if (sockAddr != null
+          && conf.getBoolean("store.ip.address", false) == true) {
+        headers.add("_ip_", sockAddr.getAddress().getHostAddress());
+      }
 
       // make request
       OutputStream req = socket.getOutputStream();
@@ -236,6 +251,14 @@ public class HttpResponse implements Res
     content = out.toByteArray();
   }
 
+  /**
+   * 
+   * @param in
+   * @param line
+   * @throws HttpException
+   * @throws IOException
+   */
+  @SuppressWarnings("unused")
   private void readChunkedContent(PushbackInputStream in,  
                                   StringBuffer line) 
     throws HttpException, IOException {


Reply via email to