http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Client.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Client.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Client.java new file mode 100644 index 0000000..da25d87 --- /dev/null +++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Client.java @@ -0,0 +1,595 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.protocol.ftp; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStream; + +import java.net.InetAddress; +import java.net.Socket; + +import java.util.List; +//import java.util.LinkedList; + +import org.apache.commons.net.MalformedServerReplyException; + +import org.apache.commons.net.ftp.FTP; +import org.apache.commons.net.ftp.FTPCommand; +import org.apache.commons.net.ftp.FTPFile; +import org.apache.commons.net.ftp.FTPFileEntryParser; +import org.apache.commons.net.ftp.FTPReply; + +import org.apache.commons.net.ftp.FTPConnectionClosedException; + +/*********************************************** + * Client.java encapsulates functionalities necessary for nutch to get dir list + * and retrieve file from an FTP server. This class takes care of all low level + * details of interacting with an FTP server and provides a convenient higher + * level interface. + * + * Modified from FtpClient.java in apache commons-net. + * + * Notes by John Xing: ftp server implementations are hardly uniform and none + * seems to follow RFCs whole-heartedly. We have no choice, but assume common + * denominator as following: (1) Use stream mode for data transfer. Block mode + * will be better for multiple file downloading and partial file downloading. + * However not every ftpd has block mode support. (2) Use passive mode for data + * connection. So Nutch will work if we run behind firewall. (3) Data connection + * is opened/closed per ftp command for the reasons listed in (1). There are ftp + * servers out there, when partial downloading is enforced by closing data + * channel socket on our client side, the server side immediately closes control + * channel (socket). Our codes deal with such a bad behavior. (4) LIST is used + * to obtain remote file attributes if possible. MDTM & SIZE would be nice, but + * not as ubiquitously implemented as LIST. 
(5) Avoid using ABOR in single + * thread? Do not use it at all. + * + * About exceptions: Some specific exceptions are re-thrown as one of + * FtpException*.java In fact, each function throws FtpException*.java or pass + * IOException. + * + * @author John Xing + ***********************************************/ + +public class Client extends FTP { + private int __dataTimeout; + private int __passivePort; + private String __passiveHost; + // private int __fileType, __fileFormat; + private boolean __remoteVerificationEnabled; + // private FTPFileEntryParser __entryParser; + private String __systemName; + + /** Public default constructor */ + public Client() { + __initDefaults(); + __dataTimeout = -1; + __remoteVerificationEnabled = true; + } + + // defaults when initialize + private void __initDefaults() { + __passiveHost = null; + __passivePort = -1; + __systemName = null; + // __fileType = FTP.ASCII_FILE_TYPE; + // __fileFormat = FTP.NON_PRINT_TEXT_FORMAT; + // __entryParser = null; + } + + /** + * Parses the reply to PASV and stores the announced data host and port in + * __passiveHost and __passivePort. The reply must contain a group of the form + * (h1,h2,h3,h4,p1,p2); the host is h1.h2.h3.h4 and the port is (p1 << 8) | p2. + * + * @param reply + * first line of the server reply to PASV + * @throws MalformedServerReplyException + * if the parenthesised group is missing, does not hold exactly six + * comma separated fields, or a port field is not an integer + */ + private void __parsePassiveModeReply(String reply) + throws MalformedServerReplyException { + int open = reply.indexOf('('); + int close = reply.indexOf(')'); + + // without this check a reply lacking "(...)" escaped as an unchecked + // StringIndexOutOfBoundsException instead of the declared exception + if (open < 0 || close <= open) + throw new MalformedServerReplyException( + "Could not parse passive host information.\nServer Reply: " + reply); + + String[] fields = reply.substring(open + 1, close).trim().split(","); + + // four host fields followed by two port fields + if (fields.length != 6) + throw new MalformedServerReplyException( + "Could not parse passive host information.\nServer Reply: " + reply); + + int port; + try { + port = (Integer.parseInt(fields[4]) << 8) | Integer.parseInt(fields[5]); + } catch (NumberFormatException e) { + throw new MalformedServerReplyException( + "Could not parse passive host information.\nServer Reply: " + reply); + } + + StringBuilder host = new StringBuilder(24); + host.append(fields[0]).append('.').append(fields[1]).append('.'); + host.append(fields[2]).append('.').append(fields[3]); + + __passiveHost = host.toString(); + __passivePort = port; + } +
+ /** + * Opens a passive mode data connection and sends the given command over the + * control connection. + * + * @param command + * FTPCommand constant of the command that uses the data connection + * @param arg + * argument sent with the command + * @return the connected data socket, or null if the server did not answer the + * command with a positive preliminary reply + * @throws IOException + * if an I/O error occurs on the control or data connection + * @throws FtpExceptionCanNotHaveDataConnection + * if PASV fails, its reply cannot be parsed, or remote verification + * rejects the data connection + */ + protected Socket __openPassiveDataConnection(int command, String arg) + throws IOException, FtpExceptionCanNotHaveDataConnection { + Socket socket; + + // // 20040317, xing, accommodate ill-behaved servers, see below + // int port_previous = __passivePort; + + if (pasv() != FTPReply.ENTERING_PASSIVE_MODE) + throw new FtpExceptionCanNotHaveDataConnection("pasv() failed. " + + getReplyString()); + + try { + __parsePassiveModeReply(getReplyStrings()[0]); + } catch (MalformedServerReplyException e) { + throw new FtpExceptionCanNotHaveDataConnection(e.getMessage()); + } + + // // 20040317, xing, accommodate ill-behaved servers, see above + // int count = 0; + // System.err.println("__passivePort "+__passivePort); + // System.err.println("port_previous "+port_previous); + // while (__passivePort == port_previous) { + // // just quit if too many tries. make it an exception here? + // if (count++ > 10) + // return null; + // // slow down further for each new try + // Thread.sleep(500*count); + // if (pasv() != FTPReply.ENTERING_PASSIVE_MODE) + // throw new FtpExceptionCanNotHaveDataConnection( + // "pasv() failed. 
" + getReplyString()); + // //return null; + // try { + // __parsePassiveModeReply(getReplyStrings()[0]); + // } catch (MalformedServerReplyException e) { + // throw new FtpExceptionCanNotHaveDataConnection(e.getMessage()); + // } + // } + + socket = _socketFactory_.createSocket(__passiveHost, __passivePort); + + // close the data socket on every path that does not hand it to the caller, + // including an IOException thrown by sendCommand() or setSoTimeout() + boolean handedOver = false; + try { + if (!FTPReply.isPositivePreliminary(sendCommand(command, arg))) + return null; + + if (__remoteVerificationEnabled && !verifyRemote(socket)) { + InetAddress host1, host2; + + host1 = socket.getInetAddress(); + host2 = getRemoteAddress(); + + // our precaution + throw new FtpExceptionCanNotHaveDataConnection( + "Host attempting data connection " + host1.getHostAddress() + + " is not same as server " + host2.getHostAddress() + + " So we intentionally close it for security precaution."); + } + + if (__dataTimeout >= 0) + socket.setSoTimeout(__dataTimeout); + + handedOver = true; + return socket; + } finally { + if (!handedOver) { + try { + socket.close(); + } catch (IOException ignored) { + // best effort: do not mask the outcome of the try block + } + } + } + } + + /*** + * Sets the timeout in milliseconds to use for data connection. set + * immediately after opening the data connection. + ***/ + public void setDataTimeout(int timeout) { + __dataTimeout = timeout; + } + + /*** + * Closes the connection to the FTP server and restores connection parameters + * to the default values. + * <p> + * + * @exception IOException + * If an error occurs while disconnecting. + ***/ + public void disconnect() throws IOException { + __initDefaults(); + super.disconnect(); + // no worry for data connection, since we always close it + // in every ftp command that involves data connection + } + + /*** + * Enable or disable verification that the remote host taking part of a data + * connection is the same as the host to which the control connection is + * attached. The default is for verification to be enabled. You may set this + * value at any time, whether the FTPClient is currently connected or not. + * <p> + * + * @param enable + * True to enable verification, false to disable verification. 
+ ***/ + public void setRemoteVerificationEnabled(boolean enable) { + // read by __openPassiveDataConnection() each time a data connection is opened + __remoteVerificationEnabled = enable; + } + + /*** + * Return whether or not verification of the remote host participating in data + * connections is enabled. The default behavior is for verification to be + * enabled. + * <p> + * + * @return True if verification is enabled, false if not. + ***/ + public boolean isRemoteVerificationEnabled() { + return __remoteVerificationEnabled; + } + + /*** + * Login to the FTP server using the provided username and password. The + * password is only sent when the reply to USER is a positive intermediate one. + * <p> + * + * @param username + * The username to login under. + * @param password + * The password to use. + * @return True if successfully completed, false if not. + * @exception FTPConnectionClosedException + * If the FTP server prematurely closes the connection as a + * result of the client being idle or some other reason causing + * the server to send FTP reply code 421. This exception may be + * caught either as an IOException or independently as itself. + * @exception IOException + * If an I/O error occurs while either sending a command to the + * server or receiving a reply from the server. + ***/ + public boolean login(String username, String password) throws IOException { + user(username); + + // USER alone was accepted, no password is needed + if (FTPReply.isPositiveCompletion(getReplyCode())) + return true; + + // If we get here, we either have an error code, or an intermediate + // reply requesting password. + if (!FTPReply.isPositiveIntermediate(getReplyCode())) + return false; + + return FTPReply.isPositiveCompletion(pass(password)); + } + + /*** + * Logout of the FTP server by sending the QUIT command. + * <p> + * + * @return True if successfully completed, false if not. + * @exception FTPConnectionClosedException + * If the FTP server prematurely closes the connection as a + * result of the client being idle or some other reason causing + * the server to send FTP reply code 421. This exception may be + * caught either as an IOException or independently as itself. 
+ * @exception IOException + * If an I/O error occurs while either sending a command to the + * server or receiving a reply from the server. + ***/ + public boolean logout() throws IOException { + return FTPReply.isPositiveCompletion(quit()); + } + + /** + * retrieve list reply for path + * + * @param path + * @param entries + * @param limit + * @param parser + * @throws IOException + * @throws FtpExceptionCanNotHaveDataConnection + * @throws FtpExceptionUnknownForcedDataClose + * @throws FtpExceptionControlClosedByForcedDataClose + */ + public void retrieveList(String path, List<FTPFile> entries, int limit, + FTPFileEntryParser parser) throws IOException, + FtpExceptionCanNotHaveDataConnection, FtpExceptionUnknownForcedDataClose, + FtpExceptionControlClosedByForcedDataClose { + Socket socket = __openPassiveDataConnection(FTPCommand.LIST, path); + + if (socket == null) + throw new FtpExceptionCanNotHaveDataConnection("LIST " + + ((path == null) ? "" : path)); + + BufferedReader reader = new BufferedReader(new InputStreamReader( + socket.getInputStream())); + + // force-close data channel socket, when download limit is reached + // boolean mandatory_close = false; + + // List entries = new LinkedList(); + int count = 0; + String line = parser.readNextEntry(reader); + while (line != null) { + FTPFile ftpFile = parser.parseFTPEntry(line); + // skip non-formatted lines + if (ftpFile == null) { + line = parser.readNextEntry(reader); + continue; + } + entries.add(ftpFile); + count += line.length(); + // impose download limit if limit >= 0, otherwise no limit + // here, cut off is up to the line when total bytes is just over limit + if (limit >= 0 && count > limit) { + // mandatory_close = true; + break; + } + line = parser.readNextEntry(reader); + } + + // if (mandatory_close) + // you always close here, no matter mandatory_close or not. + // however different ftp servers respond differently, see below. 
+ socket.close(); + + // scenarios: + // (1) mandatory_close is false, download limit not reached + // no special care here + // (2) mandatory_close is true, download limit is reached + // different servers have different reply codes: + + try { + int reply = getReply(); + if (!_notBadReply(reply)) + throw new FtpExceptionUnknownForcedDataClose(getReplyString()); + } catch (FTPConnectionClosedException e) { + // some ftp servers will close control channel if data channel socket + // is closed by our end before all data has been read out. Check: + // tux414.q-tam.hp.com FTP server (hp.com version whp02) + // so must catch FTPConnectionClosedException thrown by getReply() above + // disconnect(); + throw new FtpExceptionControlClosedByForcedDataClose(e.getMessage()); + } + + } + + /** + * retrieve file for path + * + * @param path + * @param os + * @param limit + * @throws IOException + * @throws FtpExceptionCanNotHaveDataConnection + * @throws FtpExceptionUnknownForcedDataClose + * @throws FtpExceptionControlClosedByForcedDataClose + */ + public void retrieveFile(String path, OutputStream os, int limit) + throws IOException, FtpExceptionCanNotHaveDataConnection, + FtpExceptionUnknownForcedDataClose, + FtpExceptionControlClosedByForcedDataClose { + + Socket socket = __openPassiveDataConnection(FTPCommand.RETR, path); + + if (socket == null) + throw new FtpExceptionCanNotHaveDataConnection("RETR " + + ((path == null) ? "" : path)); + + InputStream input = socket.getInputStream(); + + // 20040318, xing, treat everything as BINARY_FILE_TYPE for now + // do we ever need ASCII_FILE_TYPE? + // if (__fileType == ASCII_FILE_TYPE) + // input = new FromNetASCIIInputStream(input); + + // fixme, should we instruct server here for binary file type? 
+ + // force-close data channel socket + // boolean mandatory_close = false; + + int len; + int count = 0; + byte[] buf = new byte[org.apache.commons.net.io.Util.DEFAULT_COPY_BUFFER_SIZE]; + while ((len = input.read(buf, 0, buf.length)) != -1) { + count += len; + // impose download limit if limit >= 0, otherwise no limit + // here, cut off is exactly of limit bytes + if (limit >= 0 && count > limit) { + os.write(buf, 0, len - (count - limit)); + // mandatory_close = true; + break; + } + os.write(buf, 0, len); + os.flush(); + } + + // if (mandatory_close) + // you always close here, no matter mandatory_close or not. + // however different ftp servers respond differently, see below. + socket.close(); + + // scenarios: + // (1) mandatory_close is false, download limit not reached + // no special care here + // (2) mandatory_close is true, download limit is reached + // different servers have different reply codes: + + // do not need this + // sendCommand("ABOR"); + + try { + int reply = getReply(); + if (!_notBadReply(reply)) + throw new FtpExceptionUnknownForcedDataClose(getReplyString()); + } catch (FTPConnectionClosedException e) { + // some ftp servers will close control channel if data channel socket + // is closed by our end before all data has been read out. Check: + // tux414.q-tam.hp.com FTP server (hp.com version whp02) + // so must catch FTPConnectionClosedException thrown by getReply() above + // disconnect(); + throw new FtpExceptionControlClosedByForcedDataClose(e.getMessage()); + } + + } + + /** + * reply check after closing data connection + * + * @param reply + * @return + */ + private boolean _notBadReply(int reply) { + + if (FTPReply.isPositiveCompletion(reply)) { + // do nothing + } else if (reply == 426) { // FTPReply.TRANSFER_ABORTED + // some ftp servers reply 426, e.g., + // foggy FTP server (Version wu-2.6.2(2) + // there is second reply witing? no! 
+ // getReply(); + } else if (reply == 450) { // FTPReply.FILE_ACTION_NOT_TAKEN + // some ftp servers reply 450, e.g., + // ProFTPD [ftp.kernel.org] + // there is second reply witing? no! + // getReply(); + } else if (reply == 451) { // FTPReply.ACTION_ABORTED + // some ftp servers reply 451, e.g., + // ProFTPD [ftp.kernel.org] + // there is second reply witing? no! + // getReply(); + } else if (reply == 451) { // FTPReply.ACTION_ABORTED + } else { + // what other kind of ftp server out there? + return false; + } + + return true; + } + + /*** + * Sets the file type to be transferred. This should be one of + * <code> FTP.ASCII_FILE_TYPE </code>, <code> FTP.IMAGE_FILE_TYPE </code>, + * etc. The file type only needs to be set when you want to change the type. + * After changing it, the new type stays in effect until you change it again. + * The default file type is <code> FTP.ASCII_FILE_TYPE </code> if this method + * is never called. + * <p> + * + * @param fileType + * The <code> _FILE_TYPE </code> constant indcating the type of file. + * @return True if successfully completed, false if not. + * @exception FTPConnectionClosedException + * If the FTP server prematurely closes the connection as a + * result of the client being idle or some other reason causing + * the server to send FTP reply code 421. This exception may be + * caught either as an IOException or independently as itself. + * @exception IOException + * If an I/O error occurs while either sending a command to the + * server or receiving a reply from the server. + ***/ + public boolean setFileType(int fileType) throws IOException { + if (FTPReply.isPositiveCompletion(type(fileType))) { + /* + * __fileType = fileType; __fileFormat = FTP.NON_PRINT_TEXT_FORMAT; + */ + return true; + } + return false; + } + + /*** + * Fetches the system type name from the server and returns the string. This + * value is cached for the duration of the connection after the first call to + * this method. 
In other words, only the first time that you invoke this + * method will it issue a SYST command to the FTP server. FTPClient will + * remember the value and return the cached value until a call to disconnect. + * <p> + * + * @return The system type name obtained from the server. null if the + * information could not be obtained. + * @exception FTPConnectionClosedException + * If the FTP server prematurely closes the connection as a + * result of the client being idle or some other reason causing + * the server to send FTP reply code 421. This exception may be + * caught either as an IOException or independently as itself. + * @exception IOException + * If an I/O error occurs while either sending a command to the + * server or receiving a reply from the server. + ***/ + public String getSystemName() throws IOException, FtpExceptionBadSystResponse { + // if (syst() == FTPReply.NAME_SYSTEM_TYPE) + // Technically, we should expect a NAME_SYSTEM_TYPE response, but + // in practice FTP servers deviate, so we soften the condition to + // a positive completion. + if (__systemName == null && FTPReply.isPositiveCompletion(syst())) { + __systemName = (getReplyStrings()[0]).substring(4); + } else { + throw new FtpExceptionBadSystResponse("Bad response of SYST: " + + getReplyString()); + } + + return __systemName; + } + + /*** + * Sends a NOOP command to the FTP server. This is useful for preventing + * server timeouts. + * <p> + * + * @return True if successfully completed, false if not. + * @exception FTPConnectionClosedException + * If the FTP server prematurely closes the connection as a + * result of the client being idle or some other reason causing + * the server to send FTP reply code 421. This exception may be + * caught either as an IOException or independently as itself. + * @exception IOException + * If an I/O error occurs while either sending a command to the + * server or receiving a reply from the server. 
+ ***/ + public boolean sendNoOp() throws IOException { + return FTPReply.isPositiveCompletion(noop()); + } + + // client.stat(path); + // client.sendCommand("STAT"); + // client.sendCommand("STAT",path); + // client.sendCommand("MDTM",path); + // client.sendCommand("SIZE",path); + // client.sendCommand("HELP","SITE"); + // client.sendCommand("SYST"); + // client.setRestartOffset(120); + +}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Ftp.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Ftp.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Ftp.java new file mode 100644 index 0000000..772f3bb --- /dev/null +++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Ftp.java @@ -0,0 +1,267 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.protocol.ftp; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.commons.net.ftp.FTPFileEntryParser; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.hadoop.io.Text; +import org.apache.nutch.net.protocols.Response; + +import org.apache.hadoop.conf.Configuration; + +import org.apache.nutch.protocol.Content; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.protocol.ProtocolStatus; +import crawlercommons.robots.BaseRobotRules; + +import java.net.URL; + +import java.io.IOException; + +/** + * This class is a protocol plugin used for ftp: scheme. It creates + * {@link FtpResponse} object and gets the content of the url from it. + * Configurable parameters are {@code ftp.username}, {@code ftp.password}, + * {@code ftp.content.limit}, {@code ftp.timeout}, {@code ftp.server.timeout}, + * {@code ftp.keep.connection} and {@code ftp.follow.talk}. + * For details see "FTP properties" section in {@code nutch-default.xml}. + */ +public class Ftp implements Protocol { + + public static final Logger LOG = LoggerFactory.getLogger(Ftp.class); + + private static final int BUFFER_SIZE = 16384; // 16*1024 = 16384 + + // number of redirects getProtocolOutput() follows before giving up + static final int MAX_REDIRECTS = 5; + + // set from ftp.timeout in setConf() (default 10000) or by setTimeout() + int timeout; + + // set from ftp.content.limit in setConf() (default 64 * 1024) or by + // setMaxContentLength() + int maxContentLength; + + // set from ftp.username and ftp.password in setConf() + String userName; + String passWord; + + // typical/default server timeout is 120*1000 millisec. + // better be conservative here + int serverTimeout; + + // when to have client start anew + long renewalTime = -1; + + // set from ftp.keep.connection in setConf() or by setKeepConnection() + boolean keepConnection; + + // set from ftp.follow.talk in setConf() or by setFollowTalk() + boolean followTalk; + + // ftp client + Client client = null; + // ftp dir list entry parser + FTPFileEntryParser parser = null; + + private Configuration conf; + + private FtpRobotRulesParser robots = null; + + // constructor + public Ftp() { + robots = new FtpRobotRulesParser(); + } + + /** Set the timeout. 
*/ + public void setTimeout(int to) { + timeout = to; + } + + /** Set the point at which content is truncated. */ + public void setMaxContentLength(int length) { + maxContentLength = length; + } + + /** Set followTalk */ + public void setFollowTalk(boolean followTalk) { + this.followTalk = followTalk; + } + + /** Set keepConnection */ + public void setKeepConnection(boolean keepConnection) { + this.keepConnection = keepConnection; + } + + /** + * Creates a {@link FtpResponse} object corresponding to the url and returns a + * {@link ProtocolOutput} object as per the content received. Responses with a + * code from 300 to 399 are followed through their Location header, at most + * MAX_REDIRECTS times. The code of every response is stored in the datum meta + * data under Nutch.PROTOCOL_STATUS_CODE_KEY. + * + * @param url + * Text containing the ftp url + * @param datum + * The CrawlDatum object corresponding to the url + * + * @return {@link ProtocolOutput} holding the content for code 200; for any + * exception, including too many redirects and other response codes, + * a {@link ProtocolOutput} with null content and a + * {@link ProtocolStatus} built from that exception + */ + public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) { + String urlString = url.toString(); + try { + URL u = new URL(urlString); + + int redirects = 0; + + while (true) { + FtpResponse response; + response = new FtpResponse(u, datum, this, getConf()); // make a request + + int code = response.getCode(); + datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY, + new Text(Integer.toString(code))); + + + if (code == 200) { // got a good response + return new ProtocolOutput(response.toContent()); // return it + + } else if (code >= 300 && code < 400) { // handle redirect + if (redirects == MAX_REDIRECTS) + throw new FtpException("Too many redirects: " + url); + u = new URL(response.getHeader("Location")); + redirects++; + if (LOG.isTraceEnabled()) { + LOG.trace("redirect to " + u); + } + } else { // convert to exception + throw new FtpError(code); + } + } + } catch (Exception e) { + return new ProtocolOutput(null, new ProtocolStatus(e)); + } + } + + /** + * Best effort clean-up of a still connected client: logs out and disconnects. + * A failing logout no longer prevents the disconnect. + */ + protected void finalize() { + if (this.client == null || !this.client.isConnected()) + return; + + try { + this.client.logout(); + } catch (IOException e) { + // do nothing, still disconnect below + } + + try { + this.client.disconnect(); + } catch (IOException e) { + // do nothing + } + } + + /** For debugging. 
*/ + public static void main(String[] args) throws Exception { + int timeout = Integer.MIN_VALUE; + int maxContentLength = Integer.MIN_VALUE; + String logLevel = "info"; + boolean followTalk = false; + boolean keepConnection = false; + boolean dumpContent = false; + String urlString = null; + + String usage = "Usage: Ftp [-logLevel level] [-followTalk] [-keepConnection] [-timeout N] [-maxContentLength L] [-dumpContent] url"; + + if (args.length == 0) { + System.err.println(usage); + System.exit(-1); + } + + for (int i = 0; i < args.length; i++) { + // an option that consumes the following argument must not come last, + // otherwise args[++i] below runs past the end of the array + boolean takesValue = args[i].equals("-logLevel") + || args[i].equals("-timeout") + || args[i].equals("-maxContentLength"); + if (takesValue && i == args.length - 1) { + System.err.println(usage); + System.exit(-1); + } + + if (args[i].equals("-logLevel")) { + logLevel = args[++i]; + } else if (args[i].equals("-followTalk")) { + followTalk = true; + } else if (args[i].equals("-keepConnection")) { + keepConnection = true; + } else if (args[i].equals("-timeout")) { + timeout = Integer.parseInt(args[++i]) * 1000; + } else if (args[i].equals("-maxContentLength")) { + maxContentLength = Integer.parseInt(args[++i]); + } else if (args[i].equals("-dumpContent")) { + dumpContent = true; + } else if (i != args.length - 1) { + System.err.println(usage); + System.exit(-1); + } else { + urlString = args[i]; + } + } + + // a flag in last position leaves no url on the command line + if (urlString == null) { + System.err.println(usage); + System.exit(-1); + } + + // NOTE(review): setConf() is never called here, so getConf() returns null + // and userName/passWord stay unset - confirm whether FtpResponse copes + Ftp ftp = new Ftp(); + + ftp.setFollowTalk(followTalk); + ftp.setKeepConnection(keepConnection); + + if (timeout != Integer.MIN_VALUE) // set timeout + ftp.setTimeout(timeout); + + if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength + ftp.setMaxContentLength(maxContentLength); + + // set log level + // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase())); + + ProtocolOutput output = ftp.getProtocolOutput(new Text(urlString), + new CrawlDatum()); + Content content = output.getContent(); + + // getProtocolOutput() reports a failed fetch with null content + if (content == null) { + System.err.println("Fetch failed: " + output.getStatus()); + System.exit(-1); + } + + System.err.println("Content-Type: " + content.getContentType()); + System.err.println("Content-Length: " + + content.getMetadata().get(Response.CONTENT_LENGTH)); + System.err.println("Last-Modified: " + + content.getMetadata().get(Response.LAST_MODIFIED)); + if (dumpContent) { + System.out.print(new String(content.getContent())); + } + + ftp = null; + } + + /** 
+ * Set the {@link Configuration} object and read the FTP settings from it: + * {@code ftp.content.limit} (default 64 * 1024), {@code ftp.timeout} (default + * 10000), {@code ftp.username} (default "anonymous"), {@code ftp.password} + * (default "anonymous@example.com"), {@code ftp.server.timeout} (default + * 60 * 1000), {@code ftp.keep.connection} and {@code ftp.follow.talk} (both + * default false). The configuration is also passed on to the robots parser. + * + * @param conf + * configuration to read from; must not be null + */ + public void setConf(Configuration conf) { + this.conf = conf; + this.maxContentLength = conf.getInt("ftp.content.limit", 64 * 1024); + this.timeout = conf.getInt("ftp.timeout", 10000); + this.userName = conf.get("ftp.username", "anonymous"); + // the literal was replaced by the mail archive's address obfuscation + // placeholder; restored to the anonymous login default + this.passWord = conf.get("ftp.password", "anonymous@example.com"); + this.serverTimeout = conf.getInt("ftp.server.timeout", 60 * 1000); + this.keepConnection = conf.getBoolean("ftp.keep.connection", false); + this.followTalk = conf.getBoolean("ftp.follow.talk", false); + this.robots.setConf(conf); + } + + /** + * Get the {@link Configuration} object + */ + public Configuration getConf() { + return this.conf; + } + + /** + * Get the robots rules for a given url + */ + public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) { + return robots.getRobotRulesSet(this, url); + } + + /** + * Get the fixed buffer size constant of this plugin. + * + * @return BUFFER_SIZE, 16384 + */ + public int getBufferSize() { + return BUFFER_SIZE; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpError.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpError.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpError.java new file mode 100644 index 0000000..b63a67e --- /dev/null +++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpError.java @@ -0,0 +1,36 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.protocol.ftp; + +/** + * Thrown for Ftp error codes. + */ +public class FtpError extends FtpException { + + private int code; + + public int getCode(int code) { + return code; + } + + public FtpError(int code) { + super("Ftp Error: " + code); + this.code = code; + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpException.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpException.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpException.java new file mode 100644 index 0000000..5a29668 --- /dev/null +++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpException.java @@ -0,0 +1,46 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.protocol.ftp; + +import org.apache.nutch.protocol.ProtocolException; + +/*** + * Superclass for important exceptions thrown during FTP talk, that must be + * handled with care. Every constructor only forwards its arguments to the + * matching {@link ProtocolException} constructor. + * + * @author John Xing + */ +public class FtpException extends ProtocolException { + + /** Creates an exception without message and cause. */ + public FtpException() { + super(); + } + + /** + * @param message + * detail message passed to the superclass + */ + public FtpException(String message) { + super(message); + } + + /** + * @param message + * detail message passed to the superclass + * @param cause + * cause passed to the superclass + */ + public FtpException(String message, Throwable cause) { + super(message, cause); + } + + /** + * @param cause + * cause passed to the superclass + */ + public FtpException(Throwable cause) { + super(cause); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java new file mode 100644 index 0000000..689ac8e --- /dev/null +++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java @@ -0,0 +1,29 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.protocol.ftp; + +/** + * Exception indicating bad reply of SYST command. + * + * @author John Xing + */ +public class FtpExceptionBadSystResponse extends FtpException { + FtpExceptionBadSystResponse(String msg) { + super(msg); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java new file mode 100644 index 0000000..9f35b74 --- /dev/null +++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java @@ -0,0 +1,29 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.protocol.ftp; + +/** + * Exception indicating failure of opening data connection. + * + * @author John Xing + */ +public class FtpExceptionCanNotHaveDataConnection extends FtpException { + FtpExceptionCanNotHaveDataConnection(String msg) { + super(msg); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java new file mode 100644 index 0000000..c058fcb --- /dev/null +++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java @@ -0,0 +1,30 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.protocol.ftp; + +/** + * Exception indicating control channel is closed by server end, due to forced + * closure of data channel at client (our) end. + * + * @author John Xing + */ +public class FtpExceptionControlClosedByForcedDataClose extends FtpException { + FtpExceptionControlClosedByForcedDataClose(String msg) { + super(msg); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java new file mode 100644 index 0000000..9083d7c --- /dev/null +++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java @@ -0,0 +1,30 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.protocol.ftp; + +/** + * Exception indicating unrecognizable reply from server after forced closure of + * data channel by client (our) side. + * + * @author John Xing + */ +public class FtpExceptionUnknownForcedDataClose extends FtpException { + FtpExceptionUnknownForcedDataClose(String msg) { + super(msg); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpResponse.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpResponse.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpResponse.java new file mode 100644 index 0000000..f7c7c6d --- /dev/null +++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpResponse.java @@ -0,0 +1,521 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.protocol.ftp; + +import org.apache.commons.net.ftp.FTP; +import org.apache.commons.net.ftp.FTPFile; +import org.apache.commons.net.ftp.FTPReply; +import org.apache.commons.net.ftp.parser.DefaultFTPFileEntryParserFactory; +import org.apache.commons.net.ftp.parser.ParserInitializationException; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.HttpDateFormat; +import org.apache.nutch.net.protocols.Response; +import org.apache.hadoop.conf.Configuration; + +import java.net.InetAddress; +import java.net.URL; +import java.util.List; +import java.util.LinkedList; +import java.io.ByteArrayOutputStream; +import java.io.IOException; + +/** + * FtpResponse.java mimics ftp replies as http response. It tries its best to + * follow http's way for headers, response codes as well as exceptions. + * + * Comments: In this class, all FtpException*.java thrown by Client.java and + * some important commons-net exceptions passed by Client.java must have been + * properly dealt with. They'd better not be leaked to the caller of this class. + */ +public class FtpResponse { + + private String orig; + private String base; + private byte[] content; + private static final byte[] EMPTY_CONTENT = new byte[0]; + private int code; + private Metadata headers = new Metadata(); + + private final Ftp ftp; + private Configuration conf; + + /** Returns the response code. 
*/ + public int getCode() { + return code; + } + + /** Returns the value of a named header. */ + public String getHeader(String name) { + return headers.get(name); + } + + public byte[] getContent() { + return content; + } + + public Content toContent() { + return new Content(orig, base, (content != null ? content : EMPTY_CONTENT), + getHeader(Response.CONTENT_TYPE), headers, this.conf); + } + + public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf) + throws FtpException, IOException { + + this.orig = url.toString(); + this.base = url.toString(); + this.ftp = ftp; + this.conf = conf; + + if (!"ftp".equals(url.getProtocol())) + throw new FtpException("Not a ftp url:" + url); + + if (url.getPath() != url.getFile()) { + if (Ftp.LOG.isWarnEnabled()) { + Ftp.LOG.warn("url.getPath() != url.getFile(): " + url); + } + } + + String path = "".equals(url.getPath()) ? "/" : url.getPath(); + + try { + + if (ftp.followTalk) { + if (Ftp.LOG.isInfoEnabled()) { + Ftp.LOG.info("fetching " + url); + } + } else { + if (Ftp.LOG.isTraceEnabled()) { + Ftp.LOG.trace("fetching " + url); + } + } + + InetAddress addr = InetAddress.getByName(url.getHost()); + if (addr != null && conf.getBoolean("store.ip.address", false) == true) { + headers.add("_ip_", addr.getHostAddress()); + } + + // idled too long, remote server or ourselves may have timed out, + // should start anew. + if (ftp.client != null && ftp.keepConnection + && ftp.renewalTime < System.currentTimeMillis()) { + if (Ftp.LOG.isInfoEnabled()) { + Ftp.LOG.info("delete client because idled too long"); + } + ftp.client = null; + } + + // start anew if needed + if (ftp.client == null) { + if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { + Ftp.LOG.info("start client"); + } + // the real client + ftp.client = new Client(); + // when to renew, take the lesser + // ftp.renewalTime = System.currentTimeMillis() + // + ((ftp.timeout<ftp.serverTimeout) ? 
ftp.timeout : + // ftp.serverTimeout); + + // timeout for control connection + ftp.client.setDefaultTimeout(ftp.timeout); + // timeout for data connection + ftp.client.setDataTimeout(ftp.timeout); + + // follow ftp talk? + if (ftp.followTalk) + ftp.client.addProtocolCommandListener(new PrintCommandListener( + Ftp.LOG)); + } + + // quit from previous site if at a different site now + if (ftp.client.isConnected()) { + InetAddress remoteAddress = ftp.client.getRemoteAddress(); + if (!addr.equals(remoteAddress)) { + if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { + Ftp.LOG.info("disconnect from " + remoteAddress + + " before connect to " + addr); + } + // quit from current site + ftp.client.logout(); + ftp.client.disconnect(); + } + } + + // connect to current site if needed + if (!ftp.client.isConnected()) { + + if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { + Ftp.LOG.info("connect to " + addr); + } + + ftp.client.connect(addr); + if (!FTPReply.isPositiveCompletion(ftp.client.getReplyCode())) { + ftp.client.disconnect(); + if (Ftp.LOG.isWarnEnabled()) { + Ftp.LOG.warn("ftp.client.connect() failed: " + addr + " " + + ftp.client.getReplyString()); + } + this.code = 500; // http Internal Server Error + return; + } + + if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { + Ftp.LOG.info("log into " + addr); + } + + if (!ftp.client.login(ftp.userName, ftp.passWord)) { + // login failed. + // please note that some server may return 421 immediately + // after USER anonymous, thus ftp.client.login() won't return false, + // but throw exception, which then will be handled by caller + // (not dealt with here at all) . 
+ ftp.client.disconnect(); + if (Ftp.LOG.isWarnEnabled()) { + Ftp.LOG.warn("ftp.client.login() failed: " + addr); + } + this.code = 401; // http Unauthorized + return; + } + + // insist on binary file type + if (!ftp.client.setFileType(FTP.BINARY_FILE_TYPE)) { + ftp.client.logout(); + ftp.client.disconnect(); + if (Ftp.LOG.isWarnEnabled()) { + Ftp.LOG.warn("ftp.client.setFileType() failed: " + addr); + } + this.code = 500; // http Internal Server Error + return; + } + + if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { + Ftp.LOG.info("set parser for " + addr); + } + + // SYST is valid only after login + try { + ftp.parser = null; + String parserKey = ftp.client.getSystemName(); + // some server reports as UNKNOWN Type: L8, but in fact UNIX Type: L8 + if (parserKey.startsWith("UNKNOWN Type: L8")) + parserKey = "UNIX Type: L8"; + ftp.parser = (new DefaultFTPFileEntryParserFactory()) + .createFileEntryParser(parserKey); + } catch (FtpExceptionBadSystResponse e) { + if (Ftp.LOG.isWarnEnabled()) { + Ftp.LOG + .warn("ftp.client.getSystemName() failed: " + addr + " " + e); + } + ftp.parser = null; + } catch (ParserInitializationException e) { + // ParserInitializationException is RuntimeException defined in + // org.apache.commons.net.ftp.parser.ParserInitializationException + if (Ftp.LOG.isWarnEnabled()) { + Ftp.LOG.warn("createFileEntryParser() failed. 
" + addr + " " + e); + } + ftp.parser = null; + } finally { + if (ftp.parser == null) { + // do not log as severe, otherwise + // FetcherThread/RequestScheduler will abort + if (Ftp.LOG.isWarnEnabled()) { + Ftp.LOG.warn("ftp.parser is null: " + addr); + } + ftp.client.logout(); + ftp.client.disconnect(); + this.code = 500; // http Internal Server Error + return; + } + } + + } else { + if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { + Ftp.LOG.info("use existing connection"); + } + } + + this.content = null; + + if (path.endsWith("/")) { + getDirAsHttpResponse(path, datum.getModifiedTime()); + } else { + getFileAsHttpResponse(path, datum.getModifiedTime()); + } + + // reset next renewalTime, take the lesser + if (ftp.client != null && ftp.keepConnection) { + ftp.renewalTime = System.currentTimeMillis() + + ((ftp.timeout < ftp.serverTimeout) ? ftp.timeout + : ftp.serverTimeout); + if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { + Ftp.LOG.info("reset renewalTime to " + + HttpDateFormat.toString(ftp.renewalTime)); + } + } + + // getDirAsHttpResponse() or getFileAsHttpResponse() above + // may have deleted ftp.client + if (ftp.client != null && !ftp.keepConnection) { + if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { + Ftp.LOG.info("disconnect from " + addr); + } + ftp.client.logout(); + ftp.client.disconnect(); + } + + } catch (Exception e) { + if (Ftp.LOG.isWarnEnabled()) { + Ftp.LOG.warn("Error: ", e); + } + // for any un-foreseen exception (run time exception or not), + // do ultimate clean and leave ftp.client for garbage collection + if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { + Ftp.LOG.info("delete client due to exception"); + } + ftp.client = null; + // or do explicit garbage collection? + // System.gc(); + // can we be less dramatic, using the following instead? 
+ // probably unnecessary for our practical purpose here + // try { + // ftp.client.logout(); + // ftp.client.disconnect(); + // } + throw new FtpException(e); + // throw e; + } + + } + + // get ftp file as http response + private void getFileAsHttpResponse(String path, long lastModified) + throws IOException { + + ByteArrayOutputStream os = null; + List<FTPFile> list = null; + + try { + // first get its possible attributes + list = new LinkedList<FTPFile>(); + ftp.client.retrieveList(path, list, ftp.maxContentLength, ftp.parser); + + FTPFile ftpFile = (FTPFile) list.get(0); + this.headers.set(Response.CONTENT_LENGTH, + new Long(ftpFile.getSize()).toString()); + this.headers.set(Response.LAST_MODIFIED, + HttpDateFormat.toString(ftpFile.getTimestamp())); + // don't retrieve the file if not changed. + if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) { + code = 304; + return; + } + os = new ByteArrayOutputStream(ftp.getBufferSize()); + ftp.client.retrieveFile(path, os, ftp.maxContentLength); + + this.content = os.toByteArray(); + + // // approximate bytes sent and read + // if (this.httpAccounting != null) { + // this.httpAccounting.incrementBytesSent(path.length()); + // this.httpAccounting.incrementBytesRead(this.content.length); + // } + + this.code = 200; // http OK + + } catch (FtpExceptionControlClosedByForcedDataClose e) { + + // control connection is off, clean up + // ftp.client.disconnect(); + if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { + Ftp.LOG.info("delete client because server cut off control channel: " + + e); + } + ftp.client = null; + + // in case this FtpExceptionControlClosedByForcedDataClose is + // thrown by retrieveList() (not retrieveFile()) above, + if (os == null) { // indicating throwing by retrieveList() + // throw new FtpException("fail to get attibutes: "+path); + if (Ftp.LOG.isWarnEnabled()) { + Ftp.LOG + .warn("Please try larger maxContentLength for ftp.client.retrieveList(). 
" + + e); + } + // in a way, this is our request fault + this.code = 400; // http Bad request + return; + } + + FTPFile ftpFile = (FTPFile) list.get(0); + this.headers.set(Response.CONTENT_LENGTH, + new Long(ftpFile.getSize()).toString()); + // this.headers.put("content-type", "text/html"); + this.headers.set(Response.LAST_MODIFIED, + HttpDateFormat.toString(ftpFile.getTimestamp())); + this.content = os.toByteArray(); + if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) { + code = 304; + return; + } + + // // approximate bytes sent and read + // if (this.httpAccounting != null) { + // this.httpAccounting.incrementBytesSent(path.length()); + // this.httpAccounting.incrementBytesRead(this.content.length); + // } + + this.code = 200; // http OK + + } catch (FtpExceptionCanNotHaveDataConnection e) { + + if (FTPReply.isPositiveCompletion(ftp.client.cwd(path))) { + // it is not a file, but dir, so redirect as a dir + this.headers.set(Response.LOCATION, path + "/"); + this.code = 300; // http redirect + // fixme, should we do ftp.client.cwd("/"), back to top dir? + } else { + // it is not a dir either + this.code = 404; // http Not Found + } + + } catch (FtpExceptionUnknownForcedDataClose e) { + // Please note control channel is still live. + // in a way, this is our request fault + if (Ftp.LOG.isWarnEnabled()) { + Ftp.LOG.warn("Unrecognized reply after forced close of data channel. " + + "If this is acceptable, please modify Client.java accordingly. " + + e); + } + this.code = 400; // http Bad Request + } + + } + + // get ftp dir list as http response + private void getDirAsHttpResponse(String path, long lastModified) + throws IOException { + List<FTPFile> list = new LinkedList<FTPFile>(); + + try { + + // change to that dir first + if (!FTPReply.isPositiveCompletion(ftp.client.cwd(path))) { + this.code = 404; // http Not Found + return; + } + + // fixme, should we do ftp.client.cwd("/"), back to top dir? 
+ + ftp.client.retrieveList(null, list, ftp.maxContentLength, ftp.parser); + this.content = list2html(list, path, "/".equals(path) ? false : true); + this.headers.set(Response.CONTENT_LENGTH, + new Integer(this.content.length).toString()); + this.headers.set(Response.CONTENT_TYPE, "text/html"); + // this.headers.put("Last-Modified", null); + + // // approximate bytes sent and read + // if (this.httpAccounting != null) { + // this.httpAccounting.incrementBytesSent(path.length()); + // this.httpAccounting.incrementBytesRead(this.content.length); + // } + + this.code = 200; // http OK + + } catch (FtpExceptionControlClosedByForcedDataClose e) { + + // control connection is off, clean up + // ftp.client.disconnect(); + if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { + Ftp.LOG.info("delete client because server cut off control channel: " + + e); + } + ftp.client = null; + + this.content = list2html(list, path, "/".equals(path) ? false : true); + this.headers.set(Response.CONTENT_LENGTH, + new Integer(this.content.length).toString()); + this.headers.set(Response.CONTENT_TYPE, "text/html"); + // this.headers.put("Last-Modified", null); + + // // approximate bytes sent and read + // if (this.httpAccounting != null) { + // this.httpAccounting.incrementBytesSent(path.length()); + // this.httpAccounting.incrementBytesRead(this.content.length); + // } + + this.code = 200; // http OK + + } catch (FtpExceptionUnknownForcedDataClose e) { + // Please note control channel is still live. + // in a way, this is our request fault + if (Ftp.LOG.isWarnEnabled()) { + Ftp.LOG.warn("Unrecognized reply after forced close of data channel. " + + "If this is acceptable, please modify Client.java accordingly. 
" + + e); + } + this.code = 400; // http Bad Request + } catch (FtpExceptionCanNotHaveDataConnection e) { + if (Ftp.LOG.isWarnEnabled()) { + Ftp.LOG.warn("" + e); + } + this.code = 500; // http Iternal Server Error + } + + } + + // generate html page from ftp dir list + private byte[] list2html(List<FTPFile> list, String path, + boolean includeDotDot) { + + // StringBuffer x = new + // StringBuffer("<!doctype html public \"-//ietf//dtd html//en\"><html><head>"); + StringBuffer x = new StringBuffer("<html><head>"); + x.append("<title>Index of " + path + "</title></head>\n"); + x.append("<body><h1>Index of " + path + "</h1><pre>\n"); + + if (includeDotDot) { + x.append("<a href='../'>../</a>\t-\t-\t-\n"); + } + + for (int i = 0; i < list.size(); i++) { + FTPFile f = (FTPFile) list.get(i); + String name = f.getName(); + String time = HttpDateFormat.toString(f.getTimestamp()); + if (f.isDirectory()) { + // some ftp server LIST "." and "..", we skip them here + if (name.equals(".") || name.equals("..")) + continue; + x.append("<a href='" + name + "/" + "'>" + name + "/</a>\t"); + x.append(time + "\t-\n"); + } else if (f.isFile()) { + x.append("<a href='" + name + "'>" + name + "</a>\t"); + x.append(time + "\t" + f.getSize() + "\n"); + } else { + // ignore isSymbolicLink() + // ignore isUnknown() + } + } + + x.append("</pre></body></html>\n"); + + return new String(x).getBytes(); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java new file mode 100644 index 0000000..3764864 --- /dev/null +++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java 
@@ -0,0 +1,121 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.protocol.ftp; + +import java.net.URL; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.protocol.ProtocolStatus; +import org.apache.nutch.protocol.RobotRulesParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import crawlercommons.robots.BaseRobotRules; +import crawlercommons.robots.SimpleRobotRules; + +/** + * This class is used for parsing robots for urls belonging to FTP protocol. It + * extends the generic {@link RobotRulesParser} class and contains Ftp protocol + * specific implementation for obtaining the robots file. 
+ */ +public class FtpRobotRulesParser extends RobotRulesParser { + + private static final String CONTENT_TYPE = "text/plain"; + public static final Logger LOG = LoggerFactory + .getLogger(FtpRobotRulesParser.class); + + FtpRobotRulesParser() { + } + + public FtpRobotRulesParser(Configuration conf) { + super(conf); + } + + /** + * The hosts for which the caching of robots rules is yet to be done, it sends + * a Ftp request to the host corresponding to the {@link URL} passed, gets + * robots file, parses the rules and caches the rules object to avoid re-work + * in future. + * + * @param ftp + * The {@link Protocol} object + * @param url + * URL + * + * @return robotRules A {@link BaseRobotRules} object for the rules + */ + public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url) { + + String protocol = url.getProtocol().toLowerCase(); // normalize to lower + // case + String host = url.getHost().toLowerCase(); // normalize to lower case + + if (LOG.isTraceEnabled() && isWhiteListed(url)) { + LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url); + } + + BaseRobotRules robotRules = CACHE.get(protocol + ":" + host); + + if (robotRules != null) { + return robotRules; // cached rule + } else if (LOG.isTraceEnabled()) { + LOG.trace("cache miss " + url); + } + + boolean cacheRule = true; + + if (isWhiteListed(url)) { + // check in advance whether a host is whitelisted + // (we do not need to fetch robots.txt) + robotRules = EMPTY_RULES; + LOG.info("Whitelisted host found for: {}", url); + LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", host); + + } else { + try { + Text robotsUrl = new Text(new URL(url, "/robots.txt").toString()); + ProtocolOutput output = ((Ftp) ftp).getProtocolOutput(robotsUrl, + new CrawlDatum()); + ProtocolStatus status = output.getStatus(); + + if (status.getCode() == ProtocolStatus.SUCCESS) { + robotRules = parseRules(url.toString(), output.getContent() + .getContent(), CONTENT_TYPE, agentNames); + } 
else { + robotRules = EMPTY_RULES; // use default rules + } + } catch (Throwable t) { + if (LOG.isInfoEnabled()) { + LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString()); + } + cacheRule = false; // try again later to fetch robots.txt + robotRules = EMPTY_RULES; + } + + } + + if (cacheRule) + CACHE.put(protocol + ":" + host, robotRules); // cache rules for host + + return robotRules; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java new file mode 100644 index 0000000..c68eac8 --- /dev/null +++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java @@ -0,0 +1,71 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.protocol.ftp; + +import java.io.BufferedReader; +import java.io.StringReader; +import java.io.IOException; + +import org.slf4j.Logger; + +import org.apache.commons.net.ProtocolCommandEvent; +import org.apache.commons.net.ProtocolCommandListener; + +/*** + * This is a support class for logging all ftp command/reply traffic. + * + * @author John Xing + ***/ +public class PrintCommandListener implements ProtocolCommandListener { + private Logger __logger; + + public PrintCommandListener(Logger logger) { + __logger = logger; + } + + public void protocolCommandSent(ProtocolCommandEvent event) { + try { + __logIt(event); + } catch (IOException e) { + if (__logger.isInfoEnabled()) { + __logger.info("PrintCommandListener.protocolCommandSent(): " + e); + } + } + } + + public void protocolReplyReceived(ProtocolCommandEvent event) { + try { + __logIt(event); + } catch (IOException e) { + if (__logger.isInfoEnabled()) { + __logger.info("PrintCommandListener.protocolReplyReceived(): " + e); + } + } + } + + private void __logIt(ProtocolCommandEvent event) throws IOException { + if (!__logger.isInfoEnabled()) { + return; + } + BufferedReader br = new BufferedReader(new StringReader(event.getMessage())); + String line; + while ((line = br.readLine()) != null) { + __logger.info("ftp> " + line); + } + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/package.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/package.html b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/package.html new file mode 100644 index 0000000..d936930 --- /dev/null +++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/package.html @@ -0,0 +1,5 @@ +<html> +<body> +<p>Protocol plugin which supports retrieving documents via the ftp 
protocol.</p><p></p> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-htmlunit/build.xml b/nutch-plugins/protocol-htmlunit/build.xml new file mode 100644 index 0000000..899214c --- /dev/null +++ b/nutch-plugins/protocol-htmlunit/build.xml @@ -0,0 +1,37 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="protocol-htmlunit" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Build compilation dependencies --> + <target name="deps-jar"> + <ant target="jar" inheritall="false" dir="../lib-http"/> + <ant target="jar" inheritall="false" dir="../lib-htmlunit"/> + </target> + + <!-- Add compilation dependencies to classpath --> + <path id="plugin.deps"> + <fileset dir="${nutch.root}/build"> + <include name="**/lib-http/*.jar" /> + <include name="**/lib-htmlunit/*.jar" /> + </fileset> + <pathelement location="${build.dir}/test/conf"/> + </path> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-htmlunit/ivy.xml b/nutch-plugins/protocol-htmlunit/ivy.xml new file mode 100644 index 0000000..8aa78d2 --- /dev/null +++ b/nutch-plugins/protocol-htmlunit/ivy.xml @@ -0,0 +1,38 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-htmlunit/plugin.xml b/nutch-plugins/protocol-htmlunit/plugin.xml new file mode 100644 index 0000000..36bcb80 --- /dev/null +++ b/nutch-plugins/protocol-htmlunit/plugin.xml @@ -0,0 +1,51 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="protocol-htmlunit" + name="HtmlUnit Protocol Plug-in" + version="1.0.0" + provider-name="nutch.apache.org"> + + <runtime> + <library name="protocol-htmlunit.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + <import plugin="lib-http"/> + <import plugin="lib-htmlunit"/> + </requires> + + <extension id="org.apache.nutch.protocol.http" + name="HttpProtocol" + point="org.apache.nutch.protocol.Protocol"> + + <implementation id="org.apache.nutch.protocol.htmlunit.Http" + class="org.apache.nutch.protocol.htmlunit.Http"> + <parameter name="protocolName" value="http"/> + </implementation> + + <implementation id="org.apache.nutch.protocol.htmlunit.Http" + class="org.apache.nutch.protocol.htmlunit.Http"> + <parameter name="protocolName" value="https"/> + </implementation> + + </extension> +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-htmlunit/pom.xml b/nutch-plugins/protocol-htmlunit/pom.xml new file mode 100644 index 0000000..e5a57d7 --- /dev/null +++ b/nutch-plugins/protocol-htmlunit/pom.xml @@ -0,0 +1,51 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>protocol-htmlunit</artifactId> + <packaging>jar</packaging> + + <name>protocol-htmlunit</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + + <dependencies> + <dependency> + <groupId>org.apache.nutch</groupId> + <artifactId>lib-htmlunit</artifactId> + <version>${project.parent.version}</version> + </dependency> + <dependency> + <groupId>org.apache.nutch</groupId> + <artifactId>lib-http</artifactId> + <version>${project.parent.version}</version> + </dependency> + </dependencies> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/Http.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/Http.java b/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/Http.java new file mode 100644 index 0000000..c40ed69 --- /dev/null +++ b/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/Http.java @@ -0,0 +1,63 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.htmlunit; + +import java.io.IOException; +import java.net.URL; + +import org.apache.hadoop.conf.Configuration; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.protocol.http.api.HttpBase; +import org.apache.nutch.util.NutchConfiguration; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class Http extends HttpBase { + + public static final Logger LOG = LoggerFactory.getLogger(Http.class); + + /** + * Default constructor. + */ + public Http() { + super(LOG); + } + + /** + * Set the {@link org.apache.hadoop.conf.Configuration} object. + * + * @param conf + */ + public void setConf(Configuration conf) { + super.setConf(conf); + } + + public static void main(String[] args) throws Exception { + Http http = new Http(); + http.setConf(NutchConfiguration.create()); + main(http, args); + } + + protected Response getResponse(URL url, CrawlDatum datum, boolean redirect) + throws ProtocolException, IOException { + return new HttpResponse(this, url, datum); + } +}
