Author: lewismc
Date: Sat Nov  2 20:52:19 2013
New Revision: 1538280

URL: http://svn.apache.org/r1538280
Log:
NUTCH-1360 Support the storing of IP address connected to when web crawling

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/conf/nutch-default.xml
    
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
    
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
    
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
    
nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
    
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
    
nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1538280&r1=1538279&r2=1538280&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sat Nov  2 20:52:19 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1360 Support the storing of IP address connected to when web crawling 
(ferdy, lewismc, Yasin Kılınç)
+
 * NUTCH-1588 Port NUTCH-1245 URL gone with 404 after db.fetch.interval.max 
stays db_unfetched in CrawlDb and is generated over and over again to 2.x 
(Talat UYARER via lewismc)
 
 * NUTCH-1650 Adaptive Fetch Scheduler interval Wrong Set (Talat UYARER via 
lewismc)

Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1538280&r1=1538279&r2=1538280&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Sat Nov  2 20:52:19 2013
@@ -22,6 +22,17 @@
 
 <configuration>
 
+<!-- general properties  -->
+
+<property>
+  <name>store.ip.address</name>
+  <value>false</value>
+  <description>Enables us to capture the specific IP address 
+  (InetSocketAddress) of the host which we connect to via 
+  the given protocol.
+  </description>
+</property>
+
 <!-- file properties -->
 
 <property>

Modified: 
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=1538280&r1=1538279&r2=1538280&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
 Sat Nov  2 20:52:19 2013
@@ -29,7 +29,6 @@ import org.apache.nutch.net.protocols.Re
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.MimeUtil;
-import org.apache.tika.mime.MimeType;
 
 
 /************************************

Modified: 
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java?rev=1538280&r1=1538279&r2=1538280&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
 Sat Nov  2 20:52:19 2013
@@ -22,21 +22,18 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.OutputStream;
-
 import java.net.InetAddress;
 import java.net.Socket;
-
 import java.util.List;
 //import java.util.LinkedList;
 
-import org.apache.commons.net.MalformedServerReplyException;
 
+import org.apache.commons.net.MalformedServerReplyException;
 import org.apache.commons.net.ftp.FTP;
 import org.apache.commons.net.ftp.FTPCommand;
 import org.apache.commons.net.ftp.FTPFile;
 import org.apache.commons.net.ftp.FTPFileEntryParser;
 import org.apache.commons.net.ftp.FTPReply;
-
 import org.apache.commons.net.ftp.FTPConnectionClosedException;
 
 /***********************************************
@@ -51,7 +48,7 @@ import org.apache.commons.net.ftp.FTPCon
  * ftp server implementations are hardly uniform and none seems to follow
  * RFCs whole-heartedly. We have no choice, but assume common denominator
  * as following:
- * (1) Use stream mode for data tranfer. Block mode will be better for
+ * (1) Use stream mode for data transfer. Block mode will be better for
  *     multiple file downloading and partial file downloading. However
  *     not every ftpd has block mode support.
  * (2) Use passive mode for data connection.
@@ -150,7 +147,14 @@ public class Client extends FTP
         __passivePort = index;
     }
 
-    // open passive data connection socket
+    /**
+     * open a passive data connection socket
+     * @param command
+     * @param arg
+     * @return
+     * @throws IOException
+     * @throws FtpExceptionCanNotHaveDataConnection
+     */
     protected Socket __openPassiveDataConnection(int command, String arg)
       throws IOException, FtpExceptionCanNotHaveDataConnection {
         Socket socket;
@@ -314,7 +318,17 @@ public class Client extends FTP
         return FTPReply.isPositiveCompletion(quit());
     }
 
-    // retrieve list reply for path
+    /**
+     * Retrieve a list reply for path
+     * @param path
+     * @param entries
+     * @param limit
+     * @param parser
+     * @throws IOException
+     * @throws FtpExceptionCanNotHaveDataConnection
+     * @throws FtpExceptionUnknownForcedDataClose
+     * @throws FtpExceptionControlClosedByForcedDataClose
+     */
     public void retrieveList(String path, List<FTPFile> entries, int limit,
       FTPFileEntryParser parser)
       throws IOException,
@@ -380,7 +394,16 @@ public class Client extends FTP
 
     }
 
-    // retrieve file for path
+    /**
+     * Retrieve a file for path
+     * @param path
+     * @param os
+     * @param limit
+     * @throws IOException
+     * @throws FtpExceptionCanNotHaveDataConnection
+     * @throws FtpExceptionUnknownForcedDataClose
+     * @throws FtpExceptionControlClosedByForcedDataClose
+     */
     public void retrieveFile(String path, OutputStream os, int limit)
       throws IOException,
         FtpExceptionCanNotHaveDataConnection,

Modified: 
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=1538280&r1=1538279&r2=1538280&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
 Sat Nov  2 20:52:19 2013
@@ -22,9 +22,11 @@ import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.net.InetAddress;
 import java.net.URL;
+import java.nio.ByteBuffer;
 import java.util.LinkedList;
 import java.util.List;
 
+import org.apache.avro.util.Utf8;
 import org.apache.commons.net.ftp.FTP;
 import org.apache.commons.net.ftp.FTPFile;
 import org.apache.commons.net.ftp.FTPReply;
@@ -111,6 +113,12 @@ public class FtpResponse {
       }
 
       InetAddress addr = InetAddress.getByName(url.getHost());
+      if (addr != null
+          && conf.getBoolean("store.ip.address", false) == true) {
+        String ipString = addr.getHostAddress(); //get the ip address
+        page.putToMetadata(new Utf8("_ip_"),
+          ByteBuffer.wrap(ipString.getBytes()));
+      }
 
       // idled too long, remote server or ourselves may have timed out,
       // should start anew.

Modified: 
nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1538280&r1=1538279&r2=1538280&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
 Sat Nov  2 20:52:19 2013
@@ -27,8 +27,10 @@ import java.io.PushbackInputStream;
 import java.net.InetSocketAddress;
 import java.net.Socket;
 import java.net.URL;
+import java.nio.ByteBuffer;
 
 import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.SpellCheckedMetadata;
 import org.apache.nutch.net.protocols.HttpDateFormat;
@@ -37,6 +39,7 @@ import org.apache.nutch.protocol.Protoco
 import org.apache.nutch.protocol.http.api.HttpBase;
 import org.apache.nutch.protocol.http.api.HttpException;
 import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.NutchConfiguration;
 
 /** An HTTP response. */
 public class HttpResponse implements Response {
@@ -89,6 +92,14 @@ public class HttpResponse implements Res
       int sockPort = http.useProxy() ? http.getProxyPort() : port;
       InetSocketAddress sockAddr= new InetSocketAddress(sockHost, sockPort);
       socket.connect(sockAddr, http.getTimeout());
+      
+      Configuration conf = NutchConfiguration.create();
+      if (sockAddr != null
+          && conf.getBoolean("store.ip.address", false) == true) {
+        String ipString = sockAddr.getAddress().getHostAddress(); //get the ip address
+        page.putToMetadata(new Utf8("_ip_"),
+          ByteBuffer.wrap(ipString.getBytes()));
+      }
 
       // make request
       OutputStream req = socket.getOutputStream();

Modified: 
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java?rev=1538280&r1=1538279&r2=1538280&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
 Sat Nov  2 20:52:19 2013
@@ -1,4 +1,4 @@
-/*
+/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
@@ -29,8 +29,6 @@ import java.security.cert.X509Certificat
 import javax.net.ssl.TrustManagerFactory;
 import javax.net.ssl.TrustManager;
 import javax.net.ssl.X509TrustManager;
-import org.slf4j.Logger; 
-import org.slf4j.LoggerFactory;
 
 public class DummyX509TrustManager implements X509TrustManager
 {

Modified: 
nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java?rev=1538280&r1=1538279&r2=1538280&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java
 Sat Nov  2 20:52:19 2013
@@ -32,7 +32,6 @@ import java.util.concurrent.BlockingQueu
 
 //APACHE imports
 import org.apache.hadoop.conf.Configuration;
-import org.apache.log4j.Logger;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.Content;
@@ -52,13 +51,16 @@ import com.jcraft.jsch.ChannelSftp.LsEnt
 
 import crawlercommons.robots.BaseRobotRules;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 /**
  * This class uses the Jsch package to fetch content using the Sftp protocol.
  * 
  */
 public class Sftp implements Protocol {
 
-  private static final Logger logger = Logger.getLogger(Sftp.class);
+  private static final Logger logger = LoggerFactory.getLogger(Sftp.class);
   private static final Map<String, BlockingQueue<ChannelSftp>> channelSftpByHostMap = new Hashtable<String, BlockingQueue<ChannelSftp>>();
 
   private Configuration configuration;


Reply via email to