http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java deleted file mode 100644 index f7c7c6d..0000000 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java +++ /dev/null @@ -1,521 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.protocol.ftp; - -import org.apache.commons.net.ftp.FTP; -import org.apache.commons.net.ftp.FTPFile; -import org.apache.commons.net.ftp.FTPReply; -import org.apache.commons.net.ftp.parser.DefaultFTPFileEntryParserFactory; -import org.apache.commons.net.ftp.parser.ParserInitializationException; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.net.protocols.HttpDateFormat; -import org.apache.nutch.net.protocols.Response; -import org.apache.hadoop.conf.Configuration; - -import java.net.InetAddress; -import java.net.URL; -import java.util.List; -import java.util.LinkedList; -import java.io.ByteArrayOutputStream; -import java.io.IOException; - -/** - * FtpResponse.java mimics ftp replies as http response. It tries its best to - * follow http's way for headers, response codes as well as exceptions. - * - * Comments: In this class, all FtpException*.java thrown by Client.java and - * some important commons-net exceptions passed by Client.java must have been - * properly dealt with. They'd better not be leaked to the caller of this class. - */ -public class FtpResponse { - - private String orig; - private String base; - private byte[] content; - private static final byte[] EMPTY_CONTENT = new byte[0]; - private int code; - private Metadata headers = new Metadata(); - - private final Ftp ftp; - private Configuration conf; - - /** Returns the response code. */ - public int getCode() { - return code; - } - - /** Returns the value of a named header. */ - public String getHeader(String name) { - return headers.get(name); - } - - public byte[] getContent() { - return content; - } - - public Content toContent() { - return new Content(orig, base, (content != null ? content : EMPTY_CONTENT), - getHeader(Response.CONTENT_TYPE), headers, this.conf); - } - - public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf) - throws FtpException, IOException { - - this.orig = url.toString(); - this.base = url.toString(); - this.ftp = ftp; - this.conf = conf; - - if (!"ftp".equals(url.getProtocol())) - throw new FtpException("Not a ftp url:" + url); - - if (url.getPath() != url.getFile()) { - if (Ftp.LOG.isWarnEnabled()) { - Ftp.LOG.warn("url.getPath() != url.getFile(): " + url); - } - } - - String path = "".equals(url.getPath()) ? "/" : url.getPath(); - - try { - - if (ftp.followTalk) { - if (Ftp.LOG.isInfoEnabled()) { - Ftp.LOG.info("fetching " + url); - } - } else { - if (Ftp.LOG.isTraceEnabled()) { - Ftp.LOG.trace("fetching " + url); - } - } - - InetAddress addr = InetAddress.getByName(url.getHost()); - if (addr != null && conf.getBoolean("store.ip.address", false) == true) { - headers.add("_ip_", addr.getHostAddress()); - } - - // idled too long, remote server or ourselves may have timed out, - // should start anew. - if (ftp.client != null && ftp.keepConnection - && ftp.renewalTime < System.currentTimeMillis()) { - if (Ftp.LOG.isInfoEnabled()) { - Ftp.LOG.info("delete client because idled too long"); - } - ftp.client = null; - } - - // start anew if needed - if (ftp.client == null) { - if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { - Ftp.LOG.info("start client"); - } - // the real client - ftp.client = new Client(); - // when to renew, take the lesser - // ftp.renewalTime = System.currentTimeMillis() - // + ((ftp.timeout<ftp.serverTimeout) ? ftp.timeout : - // ftp.serverTimeout); - - // timeout for control connection - ftp.client.setDefaultTimeout(ftp.timeout); - // timeout for data connection - ftp.client.setDataTimeout(ftp.timeout); - - // follow ftp talk? - if (ftp.followTalk) - ftp.client.addProtocolCommandListener(new PrintCommandListener( - Ftp.LOG)); - } - - // quit from previous site if at a different site now - if (ftp.client.isConnected()) { - InetAddress remoteAddress = ftp.client.getRemoteAddress(); - if (!addr.equals(remoteAddress)) { - if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { - Ftp.LOG.info("disconnect from " + remoteAddress - + " before connect to " + addr); - } - // quit from current site - ftp.client.logout(); - ftp.client.disconnect(); - } - } - - // connect to current site if needed - if (!ftp.client.isConnected()) { - - if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { - Ftp.LOG.info("connect to " + addr); - } - - ftp.client.connect(addr); - if (!FTPReply.isPositiveCompletion(ftp.client.getReplyCode())) { - ftp.client.disconnect(); - if (Ftp.LOG.isWarnEnabled()) { - Ftp.LOG.warn("ftp.client.connect() failed: " + addr + " " - + ftp.client.getReplyString()); - } - this.code = 500; // http Internal Server Error - return; - } - - if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { - Ftp.LOG.info("log into " + addr); - } - - if (!ftp.client.login(ftp.userName, ftp.passWord)) { - // login failed. - // please note that some server may return 421 immediately - // after USER anonymous, thus ftp.client.login() won't return false, - // but throw exception, which then will be handled by caller - // (not dealt with here at all) . - ftp.client.disconnect(); - if (Ftp.LOG.isWarnEnabled()) { - Ftp.LOG.warn("ftp.client.login() failed: " + addr); - } - this.code = 401; // http Unauthorized - return; - } - - // insist on binary file type - if (!ftp.client.setFileType(FTP.BINARY_FILE_TYPE)) { - ftp.client.logout(); - ftp.client.disconnect(); - if (Ftp.LOG.isWarnEnabled()) { - Ftp.LOG.warn("ftp.client.setFileType() failed: " + addr); - } - this.code = 500; // http Internal Server Error - return; - } - - if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { - Ftp.LOG.info("set parser for " + addr); - } - - // SYST is valid only after login - try { - ftp.parser = null; - String parserKey = ftp.client.getSystemName(); - // some server reports as UNKNOWN Type: L8, but in fact UNIX Type: L8 - if (parserKey.startsWith("UNKNOWN Type: L8")) - parserKey = "UNIX Type: L8"; - ftp.parser = (new DefaultFTPFileEntryParserFactory()) - .createFileEntryParser(parserKey); - } catch (FtpExceptionBadSystResponse e) { - if (Ftp.LOG.isWarnEnabled()) { - Ftp.LOG - .warn("ftp.client.getSystemName() failed: " + addr + " " + e); - } - ftp.parser = null; - } catch (ParserInitializationException e) { - // ParserInitializationException is RuntimeException defined in - // org.apache.commons.net.ftp.parser.ParserInitializationException - if (Ftp.LOG.isWarnEnabled()) { - Ftp.LOG.warn("createFileEntryParser() failed. " + addr + " " + e); - } - ftp.parser = null; - } finally { - if (ftp.parser == null) { - // do not log as severe, otherwise - // FetcherThread/RequestScheduler will abort - if (Ftp.LOG.isWarnEnabled()) { - Ftp.LOG.warn("ftp.parser is null: " + addr); - } - ftp.client.logout(); - ftp.client.disconnect(); - this.code = 500; // http Internal Server Error - return; - } - } - - } else { - if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { - Ftp.LOG.info("use existing connection"); - } - } - - this.content = null; - - if (path.endsWith("/")) { - getDirAsHttpResponse(path, datum.getModifiedTime()); - } else { - getFileAsHttpResponse(path, datum.getModifiedTime()); - } - - // reset next renewalTime, take the lesser - if (ftp.client != null && ftp.keepConnection) { - ftp.renewalTime = System.currentTimeMillis() - + ((ftp.timeout < ftp.serverTimeout) ? ftp.timeout - : ftp.serverTimeout); - if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { - Ftp.LOG.info("reset renewalTime to " - + HttpDateFormat.toString(ftp.renewalTime)); - } - } - - // getDirAsHttpResponse() or getFileAsHttpResponse() above - // may have deleted ftp.client - if (ftp.client != null && !ftp.keepConnection) { - if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { - Ftp.LOG.info("disconnect from " + addr); - } - ftp.client.logout(); - ftp.client.disconnect(); - } - - } catch (Exception e) { - if (Ftp.LOG.isWarnEnabled()) { - Ftp.LOG.warn("Error: ", e); - } - // for any un-foreseen exception (run time exception or not), - // do ultimate clean and leave ftp.client for garbage collection - if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { - Ftp.LOG.info("delete client due to exception"); - } - ftp.client = null; - // or do explicit garbage collection? - // System.gc(); - // can we be less dramatic, using the following instead? - // probably unnecessary for our practical purpose here - // try { - // ftp.client.logout(); - // ftp.client.disconnect(); - // } - throw new FtpException(e); - // throw e; - } - - } - - // get ftp file as http response - private void getFileAsHttpResponse(String path, long lastModified) - throws IOException { - - ByteArrayOutputStream os = null; - List<FTPFile> list = null; - - try { - // first get its possible attributes - list = new LinkedList<FTPFile>(); - ftp.client.retrieveList(path, list, ftp.maxContentLength, ftp.parser); - - FTPFile ftpFile = (FTPFile) list.get(0); - this.headers.set(Response.CONTENT_LENGTH, - new Long(ftpFile.getSize()).toString()); - this.headers.set(Response.LAST_MODIFIED, - HttpDateFormat.toString(ftpFile.getTimestamp())); - // don't retrieve the file if not changed. - if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) { - code = 304; - return; - } - os = new ByteArrayOutputStream(ftp.getBufferSize()); - ftp.client.retrieveFile(path, os, ftp.maxContentLength); - - this.content = os.toByteArray(); - - // // approximate bytes sent and read - // if (this.httpAccounting != null) { - // this.httpAccounting.incrementBytesSent(path.length()); - // this.httpAccounting.incrementBytesRead(this.content.length); - // } - - this.code = 200; // http OK - - } catch (FtpExceptionControlClosedByForcedDataClose e) { - - // control connection is off, clean up - // ftp.client.disconnect(); - if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { - Ftp.LOG.info("delete client because server cut off control channel: " - + e); - } - ftp.client = null; - - // in case this FtpExceptionControlClosedByForcedDataClose is - // thrown by retrieveList() (not retrieveFile()) above, - if (os == null) { // indicating throwing by retrieveList() - // throw new FtpException("fail to get attibutes: "+path); - if (Ftp.LOG.isWarnEnabled()) { - Ftp.LOG - .warn("Please try larger maxContentLength for ftp.client.retrieveList(). " - + e); - } - // in a way, this is our request fault - this.code = 400; // http Bad request - return; - } - - FTPFile ftpFile = (FTPFile) list.get(0); - this.headers.set(Response.CONTENT_LENGTH, - new Long(ftpFile.getSize()).toString()); - // this.headers.put("content-type", "text/html"); - this.headers.set(Response.LAST_MODIFIED, - HttpDateFormat.toString(ftpFile.getTimestamp())); - this.content = os.toByteArray(); - if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) { - code = 304; - return; - } - - // // approximate bytes sent and read - // if (this.httpAccounting != null) { - // this.httpAccounting.incrementBytesSent(path.length()); - // this.httpAccounting.incrementBytesRead(this.content.length); - // } - - this.code = 200; // http OK - - } catch (FtpExceptionCanNotHaveDataConnection e) { - - if (FTPReply.isPositiveCompletion(ftp.client.cwd(path))) { - // it is not a file, but dir, so redirect as a dir - this.headers.set(Response.LOCATION, path + "/"); - this.code = 300; // http redirect - // fixme, should we do ftp.client.cwd("/"), back to top dir? - } else { - // it is not a dir either - this.code = 404; // http Not Found - } - - } catch (FtpExceptionUnknownForcedDataClose e) { - // Please note control channel is still live. - // in a way, this is our request fault - if (Ftp.LOG.isWarnEnabled()) { - Ftp.LOG.warn("Unrecognized reply after forced close of data channel. " - + "If this is acceptable, please modify Client.java accordingly. " - + e); - } - this.code = 400; // http Bad Request - } - - } - - // get ftp dir list as http response - private void getDirAsHttpResponse(String path, long lastModified) - throws IOException { - List<FTPFile> list = new LinkedList<FTPFile>(); - - try { - - // change to that dir first - if (!FTPReply.isPositiveCompletion(ftp.client.cwd(path))) { - this.code = 404; // http Not Found - return; - } - - // fixme, should we do ftp.client.cwd("/"), back to top dir? - - ftp.client.retrieveList(null, list, ftp.maxContentLength, ftp.parser); - this.content = list2html(list, path, "/".equals(path) ? false : true); - this.headers.set(Response.CONTENT_LENGTH, - new Integer(this.content.length).toString()); - this.headers.set(Response.CONTENT_TYPE, "text/html"); - // this.headers.put("Last-Modified", null); - - // // approximate bytes sent and read - // if (this.httpAccounting != null) { - // this.httpAccounting.incrementBytesSent(path.length()); - // this.httpAccounting.incrementBytesRead(this.content.length); - // } - - this.code = 200; // http OK - - } catch (FtpExceptionControlClosedByForcedDataClose e) { - - // control connection is off, clean up - // ftp.client.disconnect(); - if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { - Ftp.LOG.info("delete client because server cut off control channel: " - + e); - } - ftp.client = null; - - this.content = list2html(list, path, "/".equals(path) ? false : true); - this.headers.set(Response.CONTENT_LENGTH, - new Integer(this.content.length).toString()); - this.headers.set(Response.CONTENT_TYPE, "text/html"); - // this.headers.put("Last-Modified", null); - - // // approximate bytes sent and read - // if (this.httpAccounting != null) { - // this.httpAccounting.incrementBytesSent(path.length()); - // this.httpAccounting.incrementBytesRead(this.content.length); - // } - - this.code = 200; // http OK - - } catch (FtpExceptionUnknownForcedDataClose e) { - // Please note control channel is still live. - // in a way, this is our request fault - if (Ftp.LOG.isWarnEnabled()) { - Ftp.LOG.warn("Unrecognized reply after forced close of data channel. " - + "If this is acceptable, please modify Client.java accordingly. " - + e); - } - this.code = 400; // http Bad Request - } catch (FtpExceptionCanNotHaveDataConnection e) { - if (Ftp.LOG.isWarnEnabled()) { - Ftp.LOG.warn("" + e); - } - this.code = 500; // http Iternal Server Error - } - - } - - // generate html page from ftp dir list - private byte[] list2html(List<FTPFile> list, String path, - boolean includeDotDot) { - - // StringBuffer x = new - // StringBuffer("<!doctype html public \"-//ietf//dtd html//en\"><html><head>"); - StringBuffer x = new StringBuffer("<html><head>"); - x.append("<title>Index of " + path + "</title></head>\n"); - x.append("<body><h1>Index of " + path + "</h1><pre>\n"); - - if (includeDotDot) { - x.append("<a href='../'>../</a>\t-\t-\t-\n"); - } - - for (int i = 0; i < list.size(); i++) { - FTPFile f = (FTPFile) list.get(i); - String name = f.getName(); - String time = HttpDateFormat.toString(f.getTimestamp()); - if (f.isDirectory()) { - // some ftp server LIST "." and "..", we skip them here - if (name.equals(".") || name.equals("..")) - continue; - x.append("<a href='" + name + "/" + "'>" + name + "/</a>\t"); - x.append(time + "\t-\n"); - } else if (f.isFile()) { - x.append("<a href='" + name + "'>" + name + "</a>\t"); - x.append(time + "\t" + f.getSize() + "\n"); - } else { - // ignore isSymbolicLink() - // ignore isUnknown() - } - } - - x.append("</pre></body></html>\n"); - - return new String(x).getBytes(); - } - -}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java deleted file mode 100644 index 3764864..0000000 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java +++ /dev/null @@ -1,121 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.protocol.ftp; - -import java.net.URL; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.ProtocolOutput; -import org.apache.nutch.protocol.ProtocolStatus; -import org.apache.nutch.protocol.RobotRulesParser; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import crawlercommons.robots.BaseRobotRules; -import crawlercommons.robots.SimpleRobotRules; - -/** - * This class is used for parsing robots for urls belonging to FTP protocol. It - * extends the generic {@link RobotRulesParser} class and contains Ftp protocol - * specific implementation for obtaining the robots file. - */ -public class FtpRobotRulesParser extends RobotRulesParser { - - private static final String CONTENT_TYPE = "text/plain"; - public static final Logger LOG = LoggerFactory - .getLogger(FtpRobotRulesParser.class); - - FtpRobotRulesParser() { - } - - public FtpRobotRulesParser(Configuration conf) { - super(conf); - } - - /** - * The hosts for which the caching of robots rules is yet to be done, it sends - * a Ftp request to the host corresponding to the {@link URL} passed, gets - * robots file, parses the rules and caches the rules object to avoid re-work - * in future. - * - * @param ftp - * The {@link Protocol} object - * @param url - * URL - * - * @return robotRules A {@link BaseRobotRules} object for the rules - */ - public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url) { - - String protocol = url.getProtocol().toLowerCase(); // normalize to lower - // case - String host = url.getHost().toLowerCase(); // normalize to lower case - - if (LOG.isTraceEnabled() && isWhiteListed(url)) { - LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url); - } - - BaseRobotRules robotRules = CACHE.get(protocol + ":" + host); - - if (robotRules != null) { - return robotRules; // cached rule - } else if (LOG.isTraceEnabled()) { - LOG.trace("cache miss " + url); - } - - boolean cacheRule = true; - - if (isWhiteListed(url)) { - // check in advance whether a host is whitelisted - // (we do not need to fetch robots.txt) - robotRules = EMPTY_RULES; - LOG.info("Whitelisted host found for: {}", url); - LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", host); - - } else { - try { - Text robotsUrl = new Text(new URL(url, "/robots.txt").toString()); - ProtocolOutput output = ((Ftp) ftp).getProtocolOutput(robotsUrl, - new CrawlDatum()); - ProtocolStatus status = output.getStatus(); - - if (status.getCode() == ProtocolStatus.SUCCESS) { - robotRules = parseRules(url.toString(), output.getContent() - .getContent(), CONTENT_TYPE, agentNames); - } else { - robotRules = EMPTY_RULES; // use default rules - } - } catch (Throwable t) { - if (LOG.isInfoEnabled()) { - LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString()); - } - cacheRule = false; // try again later to fetch robots.txt - robotRules = EMPTY_RULES; - } - - } - - if (cacheRule) - CACHE.put(protocol + ":" + host, robotRules); // cache rules for host - - return robotRules; - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java deleted file mode 100644 index c68eac8..0000000 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java +++ /dev/null @@ -1,71 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.protocol.ftp; - -import java.io.BufferedReader; -import java.io.StringReader; -import java.io.IOException; - -import org.slf4j.Logger; - -import org.apache.commons.net.ProtocolCommandEvent; -import org.apache.commons.net.ProtocolCommandListener; - -/*** - * This is a support class for logging all ftp command/reply traffic. - * - * @author John Xing - ***/ -public class PrintCommandListener implements ProtocolCommandListener { - private Logger __logger; - - public PrintCommandListener(Logger logger) { - __logger = logger; - } - - public void protocolCommandSent(ProtocolCommandEvent event) { - try { - __logIt(event); - } catch (IOException e) { - if (__logger.isInfoEnabled()) { - __logger.info("PrintCommandListener.protocolCommandSent(): " + e); - } - } - } - - public void protocolReplyReceived(ProtocolCommandEvent event) { - try { - __logIt(event); - } catch (IOException e) { - if (__logger.isInfoEnabled()) { - __logger.info("PrintCommandListener.protocolReplyReceived(): " + e); - } - } - } - - private void __logIt(ProtocolCommandEvent event) throws IOException { - if (!__logger.isInfoEnabled()) { - return; - } - BufferedReader br = new BufferedReader(new StringReader(event.getMessage())); - String line; - while ((line = br.readLine()) != null) { - __logger.info("ftp> " + line); - } - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/package.html ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/package.html b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/package.html deleted file mode 100644 index d936930..0000000 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the ftp protocol.</p><p></p> -</body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-htmlunit/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-htmlunit/build.xml b/src/plugin/protocol-htmlunit/build.xml deleted file mode 100644 index 899214c..0000000 --- a/src/plugin/protocol-htmlunit/build.xml +++ /dev/null @@ -1,37 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="protocol-htmlunit" default="jar-core"> - - <import file="../build-plugin.xml"/> - - <!-- Build compilation dependencies --> - <target name="deps-jar"> - <ant target="jar" inheritall="false" dir="../lib-http"/> - <ant target="jar" inheritall="false" dir="../lib-htmlunit"/> - </target> - - <!-- Add compilation dependencies to classpath --> - <path id="plugin.deps"> - <fileset dir="${nutch.root}/build"> - <include name="**/lib-http/*.jar" /> - <include name="**/lib-htmlunit/*.jar" /> - </fileset> - <pathelement location="${build.dir}/test/conf"/> - </path> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-htmlunit/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-htmlunit/ivy.xml b/src/plugin/protocol-htmlunit/ivy.xml deleted file mode 100644 index 8aa78d2..0000000 --- a/src/plugin/protocol-htmlunit/ivy.xml +++ /dev/null @@ -1,38 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-htmlunit/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-htmlunit/plugin.xml b/src/plugin/protocol-htmlunit/plugin.xml deleted file mode 100644 index 36bcb80..0000000 --- a/src/plugin/protocol-htmlunit/plugin.xml +++ /dev/null @@ -1,51 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin - id="protocol-htmlunit" - name="HtmlUnit Protocol Plug-in" - version="1.0.0" - provider-name="nutch.apache.org"> - - <runtime> - <library name="protocol-htmlunit.jar"> - <export name="*"/> - </library> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints"/> - <import plugin="lib-http"/> - <import plugin="lib-htmlunit"/> - </requires> - - <extension id="org.apache.nutch.protocol.http" - name="HttpProtocol" - point="org.apache.nutch.protocol.Protocol"> - - <implementation id="org.apache.nutch.protocol.htmlunit.Http" - class="org.apache.nutch.protocol.htmlunit.Http"> - <parameter name="protocolName" value="http"/> - </implementation> - - <implementation id="org.apache.nutch.protocol.htmlunit.Http" - class="org.apache.nutch.protocol.htmlunit.Http"> - <parameter name="protocolName" value="https"/> - </implementation> - - </extension> -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java deleted file mode 100644 index c40ed69..0000000 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java +++ /dev/null @@ -1,63 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.protocol.htmlunit; - -import java.io.IOException; -import java.net.URL; - -import org.apache.hadoop.conf.Configuration; - -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.protocol.http.api.HttpBase; -import org.apache.nutch.util.NutchConfiguration; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class Http extends HttpBase { - - public static final Logger LOG = LoggerFactory.getLogger(Http.class); - - /** - * Default constructor. - */ - public Http() { - super(LOG); - } - - /** - * Set the {@link org.apache.hadoop.conf.Configuration} object. - * - * @param conf - */ - public void setConf(Configuration conf) { - super.setConf(conf); - } - - public static void main(String[] args) throws Exception { - Http http = new Http(); - http.setConf(NutchConfiguration.create()); - main(http, args); - } - - protected Response getResponse(URL url, CrawlDatum datum, boolean redirect) - throws ProtocolException, IOException { - return new HttpResponse(this, url, datum); - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java deleted file mode 100644 index 8b1a031..0000000 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java +++ /dev/null @@ -1,573 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * <p/> - * http://www.apache.org/licenses/LICENSE-2.0 - * <p/> - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.protocol.htmlunit; - -import java.io.BufferedInputStream; -import java.io.ByteArrayOutputStream; -import java.io.EOFException; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.io.PushbackInputStream; -import java.net.InetSocketAddress; -import java.net.Socket; -import java.net.URL; -import java.util.Arrays; -import java.util.HashSet; -import java.util.Set; - -import javax.net.ssl.SSLSocket; -import javax.net.ssl.SSLSocketFactory; - -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.metadata.SpellCheckedMetadata; -import org.apache.nutch.net.protocols.HttpDateFormat; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.protocol.http.api.HttpBase; -import org.apache.nutch.protocol.http.api.HttpException; - -/** - * An HTTP response. - */ -public class HttpResponse implements Response { - - private Configuration conf; - private HttpBase http; - private URL url; - private String orig; - private String base; - private byte[] content; - private int code; - private Metadata headers = new SpellCheckedMetadata(); - // used for storing the http headers verbatim - private StringBuffer httpHeaders; - - protected enum Scheme { - HTTP, HTTPS, - } - - /** - * Default public constructor. - * - * @param http - * @param url - * @param datum - * @throws ProtocolException - * @throws IOException - */ - public HttpResponse(HttpBase http, URL url, CrawlDatum datum) - throws ProtocolException, IOException { - - this.http = http; - this.url = url; - this.orig = url.toString(); - this.base = url.toString(); - - Scheme scheme = null; - - if ("http".equals(url.getProtocol())) { - scheme = Scheme.HTTP; - } else if ("https".equals(url.getProtocol())) { - scheme = Scheme.HTTPS; - } else { - throw new HttpException("Unknown scheme (not http/https) for url:" + url); - } - - if (Http.LOG.isTraceEnabled()) { - Http.LOG.trace("fetching " + url); - } - - String path = "".equals(url.getFile()) ? "/" : url.getFile(); - - // some servers will redirect a request with a host line like - // "Host: <hostname>:80" to "http://<hpstname>/<orig_path>"- they - // don't want the :80... - - String host = url.getHost(); - int port; - String portString; - if (url.getPort() == -1) { - if (scheme == Scheme.HTTP) { - port = 80; - } else { - port = 443; - } - portString = ""; - } else { - port = url.getPort(); - portString = ":" + port; - } - Socket socket = null; - - try { - socket = new Socket(); // create the socket - socket.setSoTimeout(http.getTimeout()); - - // connect - String sockHost = http.useProxy(url) ? http.getProxyHost() : host; - int sockPort = http.useProxy(url) ? http.getProxyPort() : port; - InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort); - socket.connect(sockAddr, http.getTimeout()); - - if (scheme == Scheme.HTTPS) { - SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory - .getDefault(); - SSLSocket sslsocket = (SSLSocket) factory - .createSocket(socket, sockHost, sockPort, true); - sslsocket.setUseClientMode(true); - - // Get the protocols and ciphers supported by this JVM - Set<String> protocols = new HashSet<String>( - Arrays.asList(sslsocket.getSupportedProtocols())); - Set<String> ciphers = new HashSet<String>( - Arrays.asList(sslsocket.getSupportedCipherSuites())); - - // Intersect with preferred protocols and ciphers - protocols.retainAll(http.getTlsPreferredProtocols()); - ciphers.retainAll(http.getTlsPreferredCipherSuites()); - - sslsocket.setEnabledProtocols( - protocols.toArray(new String[protocols.size()])); - sslsocket.setEnabledCipherSuites( - ciphers.toArray(new String[ciphers.size()])); - - sslsocket.startHandshake(); - socket = sslsocket; - } - - this.conf = http.getConf(); - if (sockAddr != null - && conf.getBoolean("store.ip.address", false) == true) { - headers.add("_ip_", sockAddr.getAddress().getHostAddress()); - } - - // make request - OutputStream req = socket.getOutputStream(); - - StringBuffer reqStr = new StringBuffer("GET "); - if (http.useProxy(url)) { - reqStr.append(url.getProtocol() + "://" + host + portString + path); - } else { - reqStr.append(path); - } - - reqStr.append(" HTTP/1.0\r\n"); - - reqStr.append("Host: "); - reqStr.append(host); - reqStr.append(portString); - reqStr.append("\r\n"); - - reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n"); - - String userAgent = http.getUserAgent(); - if ((userAgent == null) || (userAgent.length() == 0)) { - if (Http.LOG.isErrorEnabled()) { - Http.LOG.error("User-agent is not set!"); - } - } else { - reqStr.append("User-Agent: "); - reqStr.append(userAgent); - reqStr.append("\r\n"); - } - - reqStr.append("Accept-Language: "); - reqStr.append(this.http.getAcceptLanguage()); - reqStr.append("\r\n"); - - reqStr.append("Accept: "); - reqStr.append(this.http.getAccept()); - reqStr.append("\r\n"); - - if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) { - reqStr.append("If-Modified-Since: " + HttpDateFormat - .toString(datum.getModifiedTime())); - reqStr.append("\r\n"); - } - reqStr.append("\r\n"); - - // store the request in the metadata? - if (conf.getBoolean("store.http.request", false) == true) { - headers.add("_request_", reqStr.toString()); - } - - byte[] reqBytes = reqStr.toString().getBytes(); - - req.write(reqBytes); - req.flush(); - - PushbackInputStream in = // process response - new PushbackInputStream( - new BufferedInputStream(socket.getInputStream(), - Http.BUFFER_SIZE), Http.BUFFER_SIZE); - - StringBuffer line = new StringBuffer(); - - // store the http headers verbatim - if (conf.getBoolean("store.http.headers", false) == true) { - httpHeaders = new StringBuffer(); - } - - headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis())); - - boolean haveSeenNonContinueStatus = false; - while (!haveSeenNonContinueStatus) { - // parse status code line - this.code = parseStatusLine(in, line); - if (httpHeaders != null) - httpHeaders.append(line).append("\n"); - // parse headers - parseHeaders(in, line, httpHeaders); - haveSeenNonContinueStatus = code != 100; // 100 is "Continue" - } - - // Get Content type header - String contentType = getHeader(Response.CONTENT_TYPE); - - // handle with HtmlUnit only if content type in HTML or XHTML - if (contentType != null) { - if (contentType.contains("text/html") || contentType.contains("application/xhtml")) { - readContentFromHtmlUnit(url); - } else { - String transferEncoding = getHeader(Response.TRANSFER_ENCODING); - if (transferEncoding != null && "chunked" - .equalsIgnoreCase(transferEncoding.trim())) { - readChunkedContent(in, line); - } else { - readPlainContent(in); - } - - String contentEncoding = getHeader(Response.CONTENT_ENCODING); - if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) { - content = http.processGzipEncoded(content, url); - } else if ("deflate".equals(contentEncoding)) { - content = http.processDeflateEncoded(content, url); - } else { - // store the headers verbatim only if the response was not compressed - // as the content length reported with not match otherwise - if (httpHeaders != null) { - headers.add("_response.headers_", httpHeaders.toString()); - } - if (Http.LOG.isTraceEnabled()) { - Http.LOG.trace("fetched " + content.length + " bytes from " + url); - } - } - } - } - - } finally { - if (socket != null) - socket.close(); - } - - } - - /* - * ------------------------- * <implementation:Response> * - * ------------------------- - */ - - public URL getUrl() { - return url; - } - - public int getCode() { - return code; - } - - public String getHeader(String name) { - return headers.get(name); - } - - public Metadata getHeaders() { - return headers; - } - - public byte[] getContent() { - return content; - } - - /* - * ------------------------- * <implementation:Response> * - * ------------------------- - */ - - private void readContentFromHtmlUnit(URL url) throws IOException { - String page = HtmlUnitWebDriver.getHtmlPage(url.toString(), conf); - content = page.getBytes("UTF-8"); - } - - private void readPlainContent(InputStream in) - throws HttpException, IOException { - - int contentLength = Integer.MAX_VALUE; // get content length - String contentLengthString = headers.get(Response.CONTENT_LENGTH); - if (contentLengthString != null) { - contentLengthString = contentLengthString.trim(); - try { - if (!contentLengthString.isEmpty()) - contentLength = Integer.parseInt(contentLengthString); - } catch (NumberFormatException e) { - throw new HttpException("bad content length: " + contentLengthString); - } - } - if (http.getMaxContent() >= 0 && contentLength > http - .getMaxContent()) // limit - // download - // size - contentLength = http.getMaxContent(); - - ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE); - byte[] bytes = new byte[Http.BUFFER_SIZE]; - int length = 0; - - // do not try to read if the contentLength is 0 - if (contentLength == 0) { - content = new byte[0]; - return; - } - - // read content - int i = in.read(bytes); - while (i != -1) { - out.write(bytes, 0, i); - length += i; - if (length >= contentLength) { - break; - } - if ((length + Http.BUFFER_SIZE) > contentLength) { - // reading next chunk may hit contentLength, - // must limit number of bytes read - i = in.read(bytes, 0, (contentLength - length)); - } else { - i = in.read(bytes); - } - } - content = out.toByteArray(); - } - - /** - * @param in - * @param line - * @throws HttpException - * @throws IOException - */ - private void readChunkedContent(PushbackInputStream in, StringBuffer line) - throws HttpException, IOException { - boolean doneChunks = false; - int contentBytesRead = 0; - byte[] bytes = new byte[Http.BUFFER_SIZE]; - ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE); - - while (!doneChunks) { - if (Http.LOG.isTraceEnabled()) { - Http.LOG.trace("Http: starting chunk"); - } - - readLine(in, line, false); - - String chunkLenStr; - // if (LOG.isTraceEnabled()) { LOG.trace("chunk-header: '" + line + "'"); - // } - - int pos = line.indexOf(";"); - if (pos < 0) { - chunkLenStr = line.toString(); - } else { - chunkLenStr = line.substring(0, pos); - // if (LOG.isTraceEnabled()) { LOG.trace("got chunk-ext: " + - // line.substring(pos+1)); } - } - chunkLenStr = chunkLenStr.trim(); - int chunkLen; - try { - chunkLen = Integer.parseInt(chunkLenStr, 16); - } catch (NumberFormatException e) { - throw new HttpException("bad chunk length: " + line.toString()); - } - - if (chunkLen == 0) { - doneChunks = true; - break; - } - - if (http.getMaxContent() >= 0 && (contentBytesRead + chunkLen) > http - .getMaxContent()) - chunkLen = http.getMaxContent() - contentBytesRead; - - // read one chunk - int chunkBytesRead = 0; - while (chunkBytesRead < chunkLen) { - - int toRead = (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ? - (chunkLen - chunkBytesRead) : - Http.BUFFER_SIZE; - int len = in.read(bytes, 0, toRead); - - if (len == -1) - throw new HttpException("chunk eof after " + contentBytesRead - + " bytes in successful chunks" + " and " + chunkBytesRead - + " in current chunk"); - - // DANGER!!! Will printed GZIPed stuff right to your - // terminal! - // if (LOG.isTraceEnabled()) { LOG.trace("read: " + new String(bytes, 0, - // len)); } - - out.write(bytes, 0, len); - chunkBytesRead += len; - } - - readLine(in, line, false); - - } - - if (!doneChunks) { - if (contentBytesRead != http.getMaxContent()) - throw new HttpException("chunk eof: !doneChunk && didn't max out"); - return; - } - - content = out.toByteArray(); - parseHeaders(in, line, null); - - } - - private int parseStatusLine(PushbackInputStream in, StringBuffer line) - throws IOException, HttpException { - readLine(in, line, false); - - int codeStart = line.indexOf(" "); - int codeEnd = line.indexOf(" ", codeStart + 1); - - // handle lines with no plaintext result code, ie: - // "HTTP/1.1 200" vs "HTTP/1.1 200 OK" - if (codeEnd == -1) - codeEnd = line.length(); - - int code; - try { - code = Integer.parseInt(line.substring(codeStart + 1, codeEnd)); - } catch (NumberFormatException e) { - throw new HttpException( - "bad status line '" + line + "': " + e.getMessage(), e); - } - - return code; - } - - private void processHeaderLine(StringBuffer line) - throws IOException, HttpException { - - int colonIndex = line.indexOf(":"); // key is up to colon - if (colonIndex == -1) { - int i; - for (i = 0; i < line.length(); i++) - if (!Character.isWhitespace(line.charAt(i))) - break; - if (i == line.length()) - return; - throw new HttpException("No colon in header:" + line); - } - String key = line.substring(0, colonIndex); - - int valueStart = colonIndex + 1; // skip whitespace - while (valueStart < line.length()) { - int c = line.charAt(valueStart); - if (c != ' ' && c != '\t') - break; - valueStart++; - } - String value = line.substring(valueStart); - headers.set(key, value); - } - - // Adds headers to our headers Metadata - private void parseHeaders(PushbackInputStream in, StringBuffer line, - StringBuffer httpHeaders) throws IOException, HttpException { - - while (readLine(in, line, true) != 0) { - - if (httpHeaders != null) - httpHeaders.append(line).append("\n"); - - // handle HTTP responses with missing blank line after headers - int pos; - if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ( - (pos = line.indexOf("<HTML")) != -1) || ((pos = line.indexOf("<html")) - != -1)) { - - in.unread(line.substring(pos).getBytes("UTF-8")); - line.setLength(pos); - - try { - // TODO: (CM) We don't know the header names here - // since we're just handling them generically. It would - // be nice to provide some sort of mapping function here - // for the returned header names to the standard metadata - // names in the ParseData class - processHeaderLine(line); - } catch (Exception e) { - // fixme: - Http.LOG.warn("Error: ", e); - } - return; - } - - processHeaderLine(line); - } - } - - private static int readLine(PushbackInputStream in, StringBuffer line, - boolean allowContinuedLine) throws IOException { - line.setLength(0); - for (int c = in.read(); c != -1; c = in.read()) { - switch (c) { - case '\r': - if (peek(in) == '\n') { - in.read(); - } - case '\n': - if (line.length() > 0) { - // at EOL -- check for continued line if the current - // (possibly continued) line wasn't blank - if (allowContinuedLine) - switch (peek(in)) { - case ' ': - case '\t': // line is continued - in.read(); - continue; - } - } - return line.length(); // else complete - default: - line.append((char) c); - } - } - throw new EOFException(); - } - - private static int peek(PushbackInputStream in) throws IOException { - int value = in.read(); - in.unread(value); - return value; - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html deleted file mode 100644 index 4181951..0000000 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-http/build.xml b/src/plugin/protocol-http/build.xml deleted file mode 100755 index 30720f1..0000000 --- a/src/plugin/protocol-http/build.xml +++ /dev/null @@ -1,50 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="protocol-http" default="jar-core"> - - <import file="../build-plugin.xml"/> - - <!-- Build compilation dependencies --> - <target name="deps-jar"> - <ant target="jar" inheritall="false" dir="../lib-http"/> - </target> - - <!-- Add compilation dependencies to classpath --> - <path id="plugin.deps"> - <fileset dir="${nutch.root}/build"> - <include name="**/lib-http/*.jar" /> - </fileset> - <pathelement location="${build.dir}/test/conf"/> - </path> - - <!-- Deploy Unit test dependencies --> - <target name="deps-test"> - <ant target="deploy" inheritall="false" dir="../lib-http"/> - <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> - <copy toDir="${build.test}"> - <fileset dir="${src.test}" excludes="**/*.java"/> - </copy> - </target> - - <!-- for junit test --> - <mkdir dir="${build.test}/data" /> - <copy todir="${build.test}/data"> - <fileset dir="jsp"/> - </copy> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-http/ivy.xml b/src/plugin/protocol-http/ivy.xml deleted file mode 100644 index 1a86d68..0000000 --- a/src/plugin/protocol-http/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/jsp/basic-http.jsp ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-http/jsp/basic-http.jsp b/src/plugin/protocol-http/jsp/basic-http.jsp deleted file mode 100644 index bf1f8bd..0000000 --- a/src/plugin/protocol-http/jsp/basic-http.jsp +++ /dev/null @@ -1,44 +0,0 @@ -<%-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---%><%-- - Example JSP Page to Test Protocol-Http Plugin ---%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><% -String path = request.getContextPath(); -String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/"; -%> - -<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> -<html> - <head> - <base href="<%=basePath%>"> - - <title>HelloWorld</title> - <meta http-equiv="content-type" content="text/html;charset=utf-8" /> - <meta name="Language" content="en" /> - <meta http-equiv="pragma" content="no-cache"> - <meta http-equiv="cache-control" content="no-cache"> - <meta http-equiv="expires" content="0"> - <meta http-equiv="keywords" content="keyword1,keyword2,keyword3"> - <meta http-equiv="description" content="This is my page"> - <!-- - <link rel="stylesheet" type="text/css" href="styles.css"> - --> - </head> - - <body> - Hello World!!! <br> - </body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/jsp/brokenpage.jsp ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-http/jsp/brokenpage.jsp b/src/plugin/protocol-http/jsp/brokenpage.jsp deleted file mode 100644 index f3f7c4a..0000000 --- a/src/plugin/protocol-http/jsp/brokenpage.jsp +++ /dev/null @@ -1,47 +0,0 @@ -<%-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---%><%-- - Example JSP Page to Test Protocol-Http Plugin ---%> - -@ page language="java" import="java.util.*" pageEncoding="UTF-8" - -String path = request.getContextPath(); -String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/"; - - -<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> -<html> - <head> - <base href="<%=basePath%>"> - - <title>HelloWorld</title> - <meta http-equiv="content-type" content="text/html;charset=utf-8" /> - <meta name="Language" content="en" /> - <meta http-equiv="pragma" content="no-cache"> - <meta http-equiv="cache-control" content="no-cache"> - <meta http-equiv="expires" content="0"> - <meta http-equiv="keywords" content="keyword1,keyword2,keyword3"> - <meta http-equiv="description" content="This is my page"> - <!-- - <link rel="stylesheet" type="text/css" href="styles.css"> - --> - </head> - - <body> - Hello World!!! <br> - </body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/jsp/redirect301.jsp ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-http/jsp/redirect301.jsp b/src/plugin/protocol-http/jsp/redirect301.jsp deleted file mode 100644 index 1100b89..0000000 --- a/src/plugin/protocol-http/jsp/redirect301.jsp +++ /dev/null @@ -1,49 +0,0 @@ -<%-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---%><%-- - Example JSP Page to Test Protocol-Http Plugin ---%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><% -String path = request.getContextPath(); -String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/"; -%> - -<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> -<html> - <head> - <base href="<%=basePath%>"> - - <title>My JSP page</title> - - <meta http-equiv="pragma" content="no-cache"> - <meta http-equiv="cache-control" content="no-cache"> - <meta http-equiv="expires" content="0"> - <meta http-equiv="keywords" content="keyword1,keyword2,keyword3"> - <meta http-equiv="description" content="This is my page"> - <!-- - <link rel="stylesheet" type="text/css" href="styles.css"> - --> - - </head> - - <body> - <% - response.setStatus(301); - response.setHeader( "Location", "http://nutch.apache.org"); - response.setHeader( "Connection", "close" ); - %> - You are redirected by JSP<br> - </body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/jsp/redirect302.jsp ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-http/jsp/redirect302.jsp b/src/plugin/protocol-http/jsp/redirect302.jsp deleted file mode 100644 index 8a250d9..0000000 --- a/src/plugin/protocol-http/jsp/redirect302.jsp +++ /dev/null @@ -1,49 +0,0 @@ -<%-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---%><%-- - Example JSP Page to Test Protocol-Http Plugin ---%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><% -String path = request.getContextPath(); -String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/"; -%> - -<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> -<html> - <head> - <base href="<%=basePath%>"> - - <title>My JSP page</title> - - <meta http-equiv="pragma" content="no-cache"> - <meta http-equiv="cache-control" content="no-cache"> - <meta http-equiv="expires" content="0"> - <meta http-equiv="keywords" content="keyword1,keyword2,keyword3"> - <meta http-equiv="description" content="This is my page"> - <!-- - <link rel="stylesheet" type="text/css" href="styles.css"> - --> - - </head> - - <body> - <% - response.setStatus(302); - response.setHeader( "Location", "http://nutch.apache.org"); - response.setHeader( "Connection", "close" ); - %> - You are sucessfully redirected by JSP<br> - </body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-http/plugin.xml b/src/plugin/protocol-http/plugin.xml deleted file mode 100755 index 8770b10..0000000 --- a/src/plugin/protocol-http/plugin.xml +++ /dev/null @@ -1,51 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin - id="protocol-http" - name="Http Protocol Plug-in" - version="1.0.0" - provider-name="nutch.org"> - - <runtime> - <library name="protocol-http.jar"> - <export name="*"/> - </library> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints"/> - <import plugin="lib-http"/> - </requires> - - <extension id="org.apache.nutch.protocol.http" - name="HttpProtocol" - point="org.apache.nutch.protocol.Protocol"> - - <implementation id="org.apache.nutch.protocol.http.Http" - class="org.apache.nutch.protocol.http.Http"> - <parameter name="protocolName" value="http"/> - </implementation> - - <implementation id="org.apache.nutch.protocol.http.Http" - class="org.apache.nutch.protocol.http.Http"> - <parameter name="protocolName" value="https"/> - </implementation> - - </extension> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java deleted file mode 100755 index 56f9f4f..0000000 --- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java +++ /dev/null @@ -1,73 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.protocol.http; - -// JDK imports -import java.io.IOException; -import java.net.URL; - -// Commons Logging imports -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -// Hadoop imports -import org.apache.hadoop.conf.Configuration; - -// Nutch imports -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.protocol.http.api.HttpBase; -import org.apache.nutch.util.NutchConfiguration; - -public class Http extends HttpBase { - - public static final Logger LOG = LoggerFactory.getLogger(Http.class); - - /** - * Public default constructor. - */ - public Http() { - super(LOG); - } - - /** - * Set the {@link org.apache.hadoop.conf.Configuration} object. - * - * @param conf - */ - public void setConf(Configuration conf) { - super.setConf(conf); - // Level logLevel = Level.WARNING; - // if (conf.getBoolean("http.verbose", false)) { - // logLevel = Level.FINE; - // } - // LOG.setLevel(logLevel); - } - - public static void main(String[] args) throws Exception { - Http http = new Http(); - http.setConf(NutchConfiguration.create()); - main(http, args); - } - - protected Response getResponse(URL url, CrawlDatum datum, boolean redirect) - throws ProtocolException, IOException { - return new HttpResponse(this, url, datum); - } - -}
