Author: lewismc
Date: Mon Dec 23 17:10:58 2013
New Revision: 1553151
URL: http://svn.apache.org/r1553151
Log:
NUTCH-1360 Support the storing of IP address connected to when web crawling
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1553151&r1=1553150&r2=1553151&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Dec 23 17:10:58 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Development Trunk
+* NUTCH-1360 Support the storing of IP address connected to when web crawling
(lewismc, ferdy and Yasin Kılınç)
+
* NUTCH-1681 In URLUtil.java, toUNICODE method does not work correctly
(İlhami KALKAN, snagel via markus)
* NUTCH-1668 Remove package org.apache.nutch.indexer.solr (jnioche)
Modified: nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1553151&r1=1553150&r2=1553151&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Mon Dec 23 17:10:58 2013
@@ -22,6 +22,18 @@
<configuration>
+<!-- general properties -->
+
+<property>
+ <name>store.ip.address</name>
+ <value>false</value>
+ <description>Enables us to capture the specific IP address
+ (InetSocketAddress) of the host which we connect to via
+ the given protocol. Currently supported are protocol-ftp and
+ protocol-http.
+ </description>
+</property>
+
<!-- file properties -->
<property>
Modified:
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=1553151&r1=1553150&r2=1553151&view=diff
==============================================================================
---
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
(original)
+++
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
Mon Dec 23 17:10:58 2013
@@ -19,9 +19,6 @@ package org.apache.nutch.protocol.file;
// JDK imports
import java.net.URL;
-import java.net.URI;
-import java.util.Date;
-import java.util.TreeMap;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
@@ -96,6 +93,15 @@ public class FileResponse {
getHeader(Response.CONTENT_TYPE), headers, this.conf);
}
+ /**
+ * Default public constructor
+ * @param url
+ * @param datum
+ * @param file
+ * @param conf
+ * @throws FileException
+ * @throws IOException
+ */
public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
throws FileException, IOException {
@@ -232,7 +238,11 @@ public class FileResponse {
this.code = 200; // http OK
}
- // get dir list as http response
+ /**
+ * get dir list as http response
+ * @param f
+ * @throws IOException
+ */
private void getDirAsHttpResponse(java.io.File f) throws IOException {
String path = f.toString();
@@ -253,7 +263,13 @@ public class FileResponse {
this.code = 200; // http OK
}
- // generate html page from dir list
+ /**
+ * generate html page from dir list
+ * @param list
+ * @param path
+ * @param includeDotDot
+ * @return
+ */
private byte[] list2html(java.io.File[] list, String path,
boolean includeDotDot) {
Modified:
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java?rev=1553151&r1=1553150&r2=1553151&view=diff
==============================================================================
---
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
(original)
+++
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
Mon Dec 23 17:10:58 2013
@@ -51,14 +51,14 @@ import org.apache.commons.net.ftp.FTPCon
* ftp server implementations are hardly uniform and none seems to follow
* RFCs whole-heartedly. We have no choice, but assume common denominator
* as following:
- * (1) Use stream mode for data tranfer. Block mode will be better for
+ * (1) Use stream mode for data transfer. Block mode will be better for
* multiple file downloading and partial file downloading. However
* not every ftpd has block mode support.
* (2) Use passive mode for data connection.
- * So nutch will work if we run behind firewall.
+ * So Nutch will work if we run behind firewall.
* (3) Data connection is opened/closed per ftp command for the reasons
* listed in (1). There are ftp servers out there,
- * when partial downloading is enforeced by closing data channel
+ * when partial downloading is enforced by closing data channel
* socket on our client side, the server side immediately closes
* control channel (socket). Our codes deal with such a bad behavior.
* (4) LIST is used to obtain remote file attributes if possible.
@@ -82,7 +82,7 @@ public class Client extends FTP
// private FTPFileEntryParser __entryParser;
private String __systemName;
- // constructor
+ /** Public default constructor */
public Client()
{
__initDefaults();
@@ -150,7 +150,14 @@ public class Client extends FTP
__passivePort = index;
}
- // open passive data connection socket
+ /**
+ * open a passive data connection socket
+ * @param command
+ * @param arg
+ * @return
+ * @throws IOException
+ * @throws FtpExceptionCanNotHaveDataConnection
+ */
protected Socket __openPassiveDataConnection(int command, String arg)
throws IOException, FtpExceptionCanNotHaveDataConnection {
Socket socket;
@@ -314,7 +321,17 @@ public class Client extends FTP
return FTPReply.isPositiveCompletion(quit());
}
- // retrieve list reply for path
+ /**
+ * retrieve list reply for path
+ * @param path
+ * @param entries
+ * @param limit
+ * @param parser
+ * @throws IOException
+ * @throws FtpExceptionCanNotHaveDataConnection
+ * @throws FtpExceptionUnknownForcedDataClose
+ * @throws FtpExceptionControlClosedByForcedDataClose
+ */
public void retrieveList(String path, List<FTPFile> entries, int limit,
FTPFileEntryParser parser)
throws IOException,
@@ -380,7 +397,16 @@ public class Client extends FTP
}
- // retrieve file for path
+ /**
+ * retrieve file for path
+ * @param path
+ * @param os
+ * @param limit
+ * @throws IOException
+ * @throws FtpExceptionCanNotHaveDataConnection
+ * @throws FtpExceptionUnknownForcedDataClose
+ * @throws FtpExceptionControlClosedByForcedDataClose
+ */
public void retrieveFile(String path, OutputStream os, int limit)
throws IOException,
FtpExceptionCanNotHaveDataConnection,
@@ -450,7 +476,11 @@ public class Client extends FTP
}
- // reply check after closing data connection
+ /**
+ * reply check after closing data connection
+ * @param reply
+ * @return
+ */
private boolean _notBadReply(int reply) {
if (FTPReply.isPositiveCompletion(reply)) {
Modified:
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=1553151&r1=1553150&r2=1553151&view=diff
==============================================================================
---
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
(original)
+++
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
Mon Dec 23 17:10:58 2013
@@ -22,21 +22,17 @@ import org.apache.commons.net.ftp.FTPFil
import org.apache.commons.net.ftp.FTPReply;
import org.apache.commons.net.ftp.parser.DefaultFTPFileEntryParserFactory;
import org.apache.commons.net.ftp.parser.ParserInitializationException;
-
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
-
import org.apache.hadoop.conf.Configuration;
import java.net.InetAddress;
import java.net.URL;
-
import java.util.List;
import java.util.LinkedList;
-
import java.io.ByteArrayOutputStream;
import java.io.IOException;
@@ -111,6 +107,10 @@ public class FtpResponse {
}
InetAddress addr = InetAddress.getByName(url.getHost());
+ if (addr != null
+ && conf.getBoolean("store.ip.address", false) == true) {
+ headers.add("_ip_", addr.getHostAddress());
+ }
// idled too long, remote server or ourselves may have timed out,
// should start anew.
Modified:
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java?rev=1553151&r1=1553150&r2=1553151&view=diff
==============================================================================
---
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
(original)
+++
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
Mon Dec 23 17:10:58 2013
@@ -40,10 +40,18 @@ public class Http extends HttpBase {
public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+ /**
+ * Public default constructor.
+ */
public Http() {
super(LOG);
}
+ /**
+ * Set the {@link org.apache.hadoop.conf.Configuration}
+ * object.
+ * @param conf
+ */
public void setConf(Configuration conf) {
super.setConf(conf);
// Level logLevel = Level.WARNING;
Modified:
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1553151&r1=1553150&r2=1553151&view=diff
==============================================================================
---
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
(original)
+++
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
Mon Dec 23 17:10:58 2013
@@ -28,7 +28,8 @@ import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URL;
-// Nutch imports
+import org.apache.hadoop.conf.Configuration;
+
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.SpellCheckedMetadata;
@@ -38,10 +39,10 @@ import org.apache.nutch.protocol.Protoco
import org.apache.nutch.protocol.http.api.HttpBase;
import org.apache.nutch.protocol.http.api.HttpException;
-
/** An HTTP response. */
public class HttpResponse implements Response {
+ private Configuration conf;
private HttpBase http;
private URL url;
private String orig;
@@ -50,6 +51,14 @@ public class HttpResponse implements Res
private int code;
private Metadata headers = new SpellCheckedMetadata();
+ /**
+ * Default public constructor.
+ * @param http
+ * @param url
+ * @param datum
+ * @throws ProtocolException
+ * @throws IOException
+ */
public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
throws ProtocolException, IOException {
@@ -93,6 +102,12 @@ public class HttpResponse implements Res
int sockPort = http.useProxy() ? http.getProxyPort() : port;
InetSocketAddress sockAddr= new InetSocketAddress(sockHost, sockPort);
socket.connect(sockAddr, http.getTimeout());
+
+ this.conf = http.getConf();
+ if (sockAddr != null
+ && conf.getBoolean("store.ip.address", false) == true) {
+ headers.add("_ip_", sockAddr.getAddress().getHostAddress());
+ }
// make request
OutputStream req = socket.getOutputStream();
@@ -236,6 +251,14 @@ public class HttpResponse implements Res
content = out.toByteArray();
}
+ /**
+ *
+ * @param in
+ * @param line
+ * @throws HttpException
+ * @throws IOException
+ */
+ @SuppressWarnings("unused")
private void readChunkedContent(PushbackInputStream in,
StringBuffer line)
throws HttpException, IOException {