Author: lewismc
Date: Sat Nov 2 20:52:19 2013
New Revision: 1538280
URL: http://svn.apache.org/r1538280
Log:
NUTCH-1360 Support the storing of IP address connected to when web crawling
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/nutch-default.xml
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1538280&r1=1538279&r2=1538280&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sat Nov 2 20:52:19 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1360 Support the storing of IP address connected to when web crawling
(ferdy, lewismc, Yasin Kılınç)
+
* NUTCH-1588 Port NUTCH-1245 URL gone with 404 after db.fetch.interval.max
stays db_unfetched in CrawlDb and is generated over and over again to 2.x
(Talat UYARER via lewismc)
* NUTCH-1650 Adaptive Fetch Scheduler interval Wrong Set (Talat UYARER via
lewismc)
Modified: nutch/branches/2.x/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1538280&r1=1538279&r2=1538280&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Sat Nov 2 20:52:19 2013
@@ -22,6 +22,17 @@
<configuration>
+<!-- general properties -->
+
+<property>
+ <name>store.ip.address</name>
+ <value>false</value>
+ <description>Enables us to capture the specific IP address
+ (InetSocketAddress) of the host which we connect to via
+ the given protocol.
+ </description>
+</property>
+
<!-- file properties -->
<property>
Modified:
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=1538280&r1=1538279&r2=1538280&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
(original)
+++
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
Sat Nov 2 20:52:19 2013
@@ -29,7 +29,6 @@ import org.apache.nutch.net.protocols.Re
import org.apache.nutch.protocol.Content;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.MimeUtil;
-import org.apache.tika.mime.MimeType;
/************************************
Modified:
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java?rev=1538280&r1=1538279&r2=1538280&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
(original)
+++
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
Sat Nov 2 20:52:19 2013
@@ -22,21 +22,18 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
-
import java.net.InetAddress;
import java.net.Socket;
-
import java.util.List;
//import java.util.LinkedList;
-import org.apache.commons.net.MalformedServerReplyException;
+import org.apache.commons.net.MalformedServerReplyException;
import org.apache.commons.net.ftp.FTP;
import org.apache.commons.net.ftp.FTPCommand;
import org.apache.commons.net.ftp.FTPFile;
import org.apache.commons.net.ftp.FTPFileEntryParser;
import org.apache.commons.net.ftp.FTPReply;
-
import org.apache.commons.net.ftp.FTPConnectionClosedException;
/***********************************************
@@ -51,7 +48,7 @@ import org.apache.commons.net.ftp.FTPCon
* ftp server implementations are hardly uniform and none seems to follow
* RFCs whole-heartedly. We have no choice, but assume common denominator
* as following:
- * (1) Use stream mode for data tranfer. Block mode will be better for
+ * (1) Use stream mode for data transfer. Block mode will be better for
* multiple file downloading and partial file downloading. However
* not every ftpd has block mode support.
* (2) Use passive mode for data connection.
@@ -150,7 +147,14 @@ public class Client extends FTP
__passivePort = index;
}
- // open passive data connection socket
+ /**
+ * open a passive data connection socket
+ * @param command
+ * @param arg
+ * @return
+ * @throws IOException
+ * @throws FtpExceptionCanNotHaveDataConnection
+ */
protected Socket __openPassiveDataConnection(int command, String arg)
throws IOException, FtpExceptionCanNotHaveDataConnection {
Socket socket;
@@ -314,7 +318,17 @@ public class Client extends FTP
return FTPReply.isPositiveCompletion(quit());
}
- // retrieve list reply for path
+ /**
+ * Retrieve a list reply for path
+ * @param path
+ * @param entries
+ * @param limit
+ * @param parser
+ * @throws IOException
+ * @throws FtpExceptionCanNotHaveDataConnection
+ * @throws FtpExceptionUnknownForcedDataClose
+ * @throws FtpExceptionControlClosedByForcedDataClose
+ */
public void retrieveList(String path, List<FTPFile> entries, int limit,
FTPFileEntryParser parser)
throws IOException,
@@ -380,7 +394,16 @@ public class Client extends FTP
}
- // retrieve file for path
+ /**
+ * Retrieve a file for path
+ * @param path
+ * @param os
+ * @param limit
+ * @throws IOException
+ * @throws FtpExceptionCanNotHaveDataConnection
+ * @throws FtpExceptionUnknownForcedDataClose
+ * @throws FtpExceptionControlClosedByForcedDataClose
+ */
public void retrieveFile(String path, OutputStream os, int limit)
throws IOException,
FtpExceptionCanNotHaveDataConnection,
Modified:
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=1538280&r1=1538279&r2=1538280&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
(original)
+++
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
Sat Nov 2 20:52:19 2013
@@ -22,9 +22,11 @@ import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.InetAddress;
import java.net.URL;
+import java.nio.ByteBuffer;
import java.util.LinkedList;
import java.util.List;
+import org.apache.avro.util.Utf8;
import org.apache.commons.net.ftp.FTP;
import org.apache.commons.net.ftp.FTPFile;
import org.apache.commons.net.ftp.FTPReply;
@@ -111,6 +113,12 @@ public class FtpResponse {
}
InetAddress addr = InetAddress.getByName(url.getHost());
+ if (addr != null
+ && conf.getBoolean("store.ip.address", false) == true) {
+ String ipString = addr.getHostAddress(); //get the ip address
+ page.putToMetadata(new Utf8("_ip_"),
+ ByteBuffer.wrap(ipString.getBytes()));
+ }
// idled too long, remote server or ourselves may have timed out,
// should start anew.
Modified:
nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1538280&r1=1538279&r2=1538280&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
(original)
+++
nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
Sat Nov 2 20:52:19 2013
@@ -27,8 +27,10 @@ import java.io.PushbackInputStream;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URL;
+import java.nio.ByteBuffer;
import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.SpellCheckedMetadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
@@ -37,6 +39,7 @@ import org.apache.nutch.protocol.Protoco
import org.apache.nutch.protocol.http.api.HttpBase;
import org.apache.nutch.protocol.http.api.HttpException;
import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.NutchConfiguration;
/** An HTTP response. */
public class HttpResponse implements Response {
@@ -89,6 +92,14 @@ public class HttpResponse implements Res
int sockPort = http.useProxy() ? http.getProxyPort() : port;
InetSocketAddress sockAddr= new InetSocketAddress(sockHost, sockPort);
socket.connect(sockAddr, http.getTimeout());
+
+ Configuration conf = NutchConfiguration.create();
+ if (sockAddr != null
+ && conf.getBoolean("store.ip.address", false) == true) {
+ String ipString = sockAddr.getAddress().getHostAddress(); //get the ip address
+ page.putToMetadata(new Utf8("_ip_"),
+ ByteBuffer.wrap(ipString.getBytes()));
+ }
// make request
OutputStream req = socket.getOutputStream();
Modified:
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java?rev=1538280&r1=1538279&r2=1538280&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
(original)
+++
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
Sat Nov 2 20:52:19 2013
@@ -1,4 +1,4 @@
-/*
+/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -29,8 +29,6 @@ import java.security.cert.X509Certificat
import javax.net.ssl.TrustManagerFactory;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
public class DummyX509TrustManager implements X509TrustManager
{
Modified:
nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java?rev=1538280&r1=1538279&r2=1538280&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java
(original)
+++
nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java
Sat Nov 2 20:52:19 2013
@@ -32,7 +32,6 @@ import java.util.concurrent.BlockingQueu
//APACHE imports
import org.apache.hadoop.conf.Configuration;
-import org.apache.log4j.Logger;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
@@ -52,13 +51,16 @@ import com.jcraft.jsch.ChannelSftp.LsEnt
import crawlercommons.robots.BaseRobotRules;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
/**
* This class uses the Jsch package to fetch content using the Sftp protocol.
*
*/
public class Sftp implements Protocol {
- private static final Logger logger = Logger.getLogger(Sftp.class);
+ private static final Logger logger = LoggerFactory.getLogger(Sftp.class);
private static final Map<String, BlockingQueue<ChannelSftp>>
channelSftpByHostMap = new Hashtable<String, BlockingQueue<ChannelSftp>>();
private Configuration configuration;