http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java deleted file mode 100644 index f6d7e4d..0000000 --- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java +++ /dev/null @@ -1,558 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * <p/> - * http://www.apache.org/licenses/LICENSE-2.0 - * <p/> - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
package org.apache.nutch.protocol.http;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PushbackInputStream;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import javax.net.ssl.SSLSocket;
import javax.net.ssl.SSLSocketFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.SpellCheckedMetadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.http.api.HttpBase;
import org.apache.nutch.protocol.http.api.HttpException;

/**
 * An HTTP response.
 *
 * <p>
 * Constructing an instance performs the whole exchange: it opens a socket
 * (optionally wrapped in TLS), sends a single HTTP/1.0 GET request, parses the
 * status line and headers, and reads the body (plain or chunked), decoding
 * gzip/deflate content through the owning {@link HttpBase}.
 * </p>
 */
public class HttpResponse implements Response {

  private Configuration conf;
  private HttpBase http;
  private URL url;
  private String orig;
  private String base;
  private byte[] content;
  private int code;
  private Metadata headers = new SpellCheckedMetadata();
  // used for storing the http headers verbatim
  private StringBuffer httpHeaders;

  protected enum Scheme {
    HTTP, HTTPS,
  }

  /**
   * Fetches <code>url</code> and populates this response from the server's
   * reply.
   *
   * @param http
   *          the protocol instance supplying timeouts, proxy settings, TLS
   *          preferences, request header values and the content limit
   * @param url
   *          the URL to fetch; its protocol must be <code>http</code> or
   *          <code>https</code>
   * @param datum
   *          crawl datum whose modified time is sent as
   *          <code>If-Modified-Since</code> when that feature is enabled
   * @throws ProtocolException
   *           if the scheme is unsupported or the response is malformed
   * @throws IOException
   *           on any socket or stream failure
   */
  public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
      throws ProtocolException, IOException {

    this.http = http;
    this.url = url;
    this.orig = url.toString();
    this.base = url.toString();

    Scheme scheme;
    if ("http".equals(url.getProtocol())) {
      scheme = Scheme.HTTP;
    } else if ("https".equals(url.getProtocol())) {
      scheme = Scheme.HTTPS;
    } else {
      throw new HttpException("Unknown scheme (not http/https) for url:" + url);
    }

    if (Http.LOG.isTraceEnabled()) {
      Http.LOG.trace("fetching " + url);
    }

    String path = "".equals(url.getFile()) ? "/" : url.getFile();

    // some servers will redirect a request with a host line like
    // "Host: <hostname>:80" to "http://<hostname>/<orig_path>"- they
    // don't want the :80...

    String host = url.getHost();
    int port;
    String portString;
    if (url.getPort() == -1) {
      port = (scheme == Scheme.HTTP) ? 80 : 443;
      portString = "";
    } else {
      port = url.getPort();
      portString = ":" + port;
    }

    Socket socket = null;

    try {
      socket = new Socket(); // create the socket
      socket.setSoTimeout(http.getTimeout());

      // connect, either directly or to the configured proxy
      boolean useProxy = http.useProxy(url);
      String sockHost = useProxy ? http.getProxyHost() : host;
      int sockPort = useProxy ? http.getProxyPort() : port;
      InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
      socket.connect(sockAddr, http.getTimeout());

      if (scheme == Scheme.HTTPS) {
        // only reassigned once the handshake succeeded; on failure the plain
        // socket is still referenced and gets closed in the finally block
        socket = startTls(socket, sockHost, sockPort);
      }

      this.conf = http.getConf();
      if (conf.getBoolean("store.ip.address", false)) {
        headers.add("_ip_", sockAddr.getAddress().getHostAddress());
      }

      // make request
      OutputStream req = socket.getOutputStream();
      String request = buildRequest(useProxy, host, portString, path, datum);

      // store the request in the metadata?
      if (conf.getBoolean("store.http.request", false)) {
        headers.add("_request_", request);
      }

      // encode explicitly instead of relying on the platform default charset
      req.write(request.getBytes("UTF-8"));
      req.flush();

      PushbackInputStream in = // process response
          new PushbackInputStream(
              new BufferedInputStream(socket.getInputStream(),
                  Http.BUFFER_SIZE), Http.BUFFER_SIZE);

      StringBuffer line = new StringBuffer();

      // store the http headers verbatim
      if (conf.getBoolean("store.http.headers", false)) {
        httpHeaders = new StringBuffer();
      }

      headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis()));

      // skip over any number of interim "100 Continue" responses
      do {
        // parse status code line
        this.code = parseStatusLine(in, line);
        if (httpHeaders != null) {
          httpHeaders.append(line).append("\n");
        }
        // parse headers
        parseHeaders(in, line, httpHeaders);
      } while (code == 100); // 100 is "Continue"

      String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
      if (transferEncoding != null
          && "chunked".equalsIgnoreCase(transferEncoding.trim())) {
        readChunkedContent(in, line);
      } else {
        readPlainContent(in);
      }

      String contentEncoding = getHeader(Response.CONTENT_ENCODING);
      if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
        content = http.processGzipEncoded(content, url);
      } else if ("deflate".equals(contentEncoding)) {
        content = http.processDeflateEncoded(content, url);
      } else {
        // store the headers verbatim only if the response was not compressed
        // as the content length reported will not match otherwise
        if (httpHeaders != null) {
          headers.add("_response.headers_", httpHeaders.toString());
        }
        if (Http.LOG.isTraceEnabled()) {
          Http.LOG.trace("fetched " + content.length + " bytes from " + url);
        }
      }

    } finally {
      if (socket != null) {
        socket.close();
      }
    }

  }

  /**
   * Layers a TLS client socket over an already connected plain socket,
   * restricts it to the preferred protocols and cipher suites that this JVM
   * supports, and performs the handshake.
   */
  private Socket startTls(Socket socket, String sockHost, int sockPort)
      throws IOException {
    SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory
        .getDefault();
    SSLSocket sslsocket = (SSLSocket) factory
        .createSocket(socket, sockHost, sockPort, true);
    sslsocket.setUseClientMode(true);

    // Get the protocols and ciphers supported by this JVM
    Set<String> protocols = new HashSet<String>(
        Arrays.asList(sslsocket.getSupportedProtocols()));
    Set<String> ciphers = new HashSet<String>(
        Arrays.asList(sslsocket.getSupportedCipherSuites()));

    // Intersect with preferred protocols and ciphers
    protocols.retainAll(http.getTlsPreferredProtocols());
    ciphers.retainAll(http.getTlsPreferredCipherSuites());

    sslsocket.setEnabledProtocols(
        protocols.toArray(new String[protocols.size()]));
    sslsocket.setEnabledCipherSuites(
        ciphers.toArray(new String[ciphers.size()]));

    sslsocket.startHandshake();
    return sslsocket;
  }

  /**
   * Builds the complete HTTP/1.0 GET request (request line, headers and the
   * terminating blank line). An absolute URI is used in the request line when
   * going through a proxy.
   */
  private String buildRequest(boolean useProxy, String host,
      String portString, String path, CrawlDatum datum) {
    StringBuffer reqStr = new StringBuffer("GET ");
    if (useProxy) {
      reqStr.append(url.getProtocol() + "://" + host + portString + path);
    } else {
      reqStr.append(path);
    }

    reqStr.append(" HTTP/1.0\r\n");

    reqStr.append("Host: ");
    reqStr.append(host);
    reqStr.append(portString);
    reqStr.append("\r\n");

    reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");

    String userAgent = http.getUserAgent();
    if ((userAgent == null) || (userAgent.length() == 0)) {
      if (Http.LOG.isErrorEnabled()) {
        Http.LOG.error("User-agent is not set!");
      }
    } else {
      reqStr.append("User-Agent: ");
      reqStr.append(userAgent);
      reqStr.append("\r\n");
    }

    reqStr.append("Accept-Language: ");
    reqStr.append(this.http.getAcceptLanguage());
    reqStr.append("\r\n");

    reqStr.append("Accept: ");
    reqStr.append(this.http.getAccept());
    reqStr.append("\r\n");

    if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
      reqStr.append("If-Modified-Since: " + HttpDateFormat
          .toString(datum.getModifiedTime()));
      reqStr.append("\r\n");
    }
    reqStr.append("\r\n");
    return reqStr.toString();
  }

  /*
   * ------------------------- * <implementation:Response> *
   * -------------------------
   */

  public URL getUrl() {
    return url;
  }

  public int getCode() {
    return code;
  }

  public String getHeader(String name) {
    return headers.get(name);
  }

  public Metadata getHeaders() {
    return headers;
  }

  public byte[] getContent() {
    return content;
  }

  /*
   * ------------------------- * </implementation:Response> *
   * -------------------------
   */

  /**
   * Reads a non-chunked body into {@link #content}. At most
   * <code>Content-Length</code> bytes are read (or until end of stream when
   * the header is absent, empty or negative), further capped by
   * <code>http.getMaxContent()</code> when that limit is non-negative.
   *
   * @param in
   *          stream positioned at the first body byte
   * @throws HttpException
   *           if the Content-Length header is not a valid integer
   * @throws IOException
   *           on read failure
   */
  private void readPlainContent(InputStream in)
      throws HttpException, IOException {

    int contentLength = Integer.MAX_VALUE; // default: read until end of stream
    String contentLengthString = headers.get(Response.CONTENT_LENGTH);
    if (contentLengthString != null) {
      contentLengthString = contentLengthString.trim();
      try {
        if (!contentLengthString.isEmpty()) {
          contentLength = Integer.parseInt(contentLengthString);
        }
      } catch (NumberFormatException e) {
        throw new HttpException("bad content length: " + contentLengthString,
            e);
      }
      if (contentLength < 0) {
        // a negative length is meaningless; treat it like a missing header
        contentLength = Integer.MAX_VALUE;
      }
    }

    // limit download size
    int maxContent = http.getMaxContent();
    if (maxContent >= 0 && contentLength > maxContent) {
      contentLength = maxContent;
    }

    // do not try to read if the contentLength is 0
    if (contentLength == 0) {
      content = new byte[0];
      return;
    }

    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
    byte[] bytes = new byte[Http.BUFFER_SIZE];
    int length = 0;

    // read content; every read (including the first) is capped so that no
    // more than contentLength bytes are ever stored
    while (length < contentLength) {
      int toRead = Math.min(bytes.length, contentLength - length);
      int n = in.read(bytes, 0, toRead);
      if (n == -1) {
        break;
      }
      out.write(bytes, 0, n);
      length += n;
    }
    content = out.toByteArray();
  }

  /**
   * Reads a body sent with <code>Transfer-Encoding: chunked</code> into
   * {@link #content}. When <code>http.getMaxContent()</code> is non-negative
   * the total number of body bytes stored never exceeds it; once the limit is
   * hit the remainder of the stream (including trailers) is not read.
   *
   * @param in
   *          stream positioned at the first chunk-size line
   * @param line
   *          scratch buffer reused for line reads
   * @throws HttpException
   *           if a chunk size is not valid hexadecimal or the stream ends
   *           inside a chunk
   * @throws IOException
   *           on read failure
   */
  private void readChunkedContent(PushbackInputStream in, StringBuffer line)
      throws HttpException, IOException {
    boolean doneChunks = false;
    int contentBytesRead = 0;
    byte[] bytes = new byte[Http.BUFFER_SIZE];
    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
    int maxContent = http.getMaxContent();

    while (!doneChunks) {
      if (Http.LOG.isTraceEnabled()) {
        Http.LOG.trace("Http: starting chunk");
      }

      readLine(in, line, false);

      // the chunk size may be followed by ";chunk-extension", which is ignored
      String chunkLenStr;
      int pos = line.indexOf(";");
      if (pos < 0) {
        chunkLenStr = line.toString();
      } else {
        chunkLenStr = line.substring(0, pos);
      }
      chunkLenStr = chunkLenStr.trim();
      int chunkLen;
      try {
        chunkLen = Integer.parseInt(chunkLenStr, 16);
      } catch (NumberFormatException e) {
        throw new HttpException("bad chunk length: " + line.toString(), e);
      }

      if (chunkLen == 0) {
        doneChunks = true;
        break;
      }

      // long arithmetic: the sum of two ints may overflow
      boolean truncated = false;
      if (maxContent >= 0
          && ((long) contentBytesRead + chunkLen) > maxContent) {
        chunkLen = maxContent - contentBytesRead;
        truncated = true;
      }

      // read one chunk
      int chunkBytesRead = 0;
      while (chunkBytesRead < chunkLen) {

        int toRead = Math.min(chunkLen - chunkBytesRead, Http.BUFFER_SIZE);
        int len = in.read(bytes, 0, toRead);

        if (len == -1) {
          throw new HttpException("chunk eof after " + contentBytesRead
              + " bytes in successful chunks" + " and " + chunkBytesRead
              + " in current chunk");
        }

        // DANGER!!! Logging the bytes here would print GZIPed stuff right to
        // your terminal!

        out.write(bytes, 0, len);
        chunkBytesRead += len;
      }
      contentBytesRead += chunkBytesRead;

      if (truncated) {
        // content limit reached in the middle of a chunk: the stream is no
        // longer positioned at a chunk boundary, so stop parsing here
        break;
      }

      // consume the CRLF that terminates the chunk data
      readLine(in, line, false);

    }

    content = out.toByteArray();
    if (doneChunks) {
      // the last chunk may be followed by trailer headers
      parseHeaders(in, line, null);
    }

  }

  /**
   * Reads the status line and extracts the numeric status code, accepting
   * lines with or without a reason phrase ("HTTP/1.1 200" and
   * "HTTP/1.1 200 OK").
   *
   * @return the HTTP status code
   * @throws HttpException
   *           if the code is missing or not an integer
   */
  private int parseStatusLine(PushbackInputStream in, StringBuffer line)
      throws IOException, HttpException {
    readLine(in, line, false);

    int codeStart = line.indexOf(" ");
    int codeEnd = line.indexOf(" ", codeStart + 1);

    // handle lines with no plaintext result code, ie:
    // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
    if (codeEnd == -1) {
      codeEnd = line.length();
    }

    try {
      return Integer.parseInt(line.substring(codeStart + 1, codeEnd));
    } catch (NumberFormatException e) {
      throw new HttpException(
          "bad status line '" + line + "': " + e.getMessage(), e);
    }
  }

  /**
   * Splits one header line at the first colon and stores it in
   * {@link #headers}. Whitespace-only lines are ignored.
   *
   * @throws HttpException
   *           if a non-blank line contains no colon
   */
  private void processHeaderLine(StringBuffer line)
      throws IOException, HttpException {

    int colonIndex = line.indexOf(":"); // key is up to colon
    if (colonIndex == -1) {
      for (int i = 0; i < line.length(); i++) {
        if (!Character.isWhitespace(line.charAt(i))) {
          throw new HttpException("No colon in header:" + line);
        }
      }
      return; // blank line: nothing to record
    }
    String key = line.substring(0, colonIndex);

    int valueStart = colonIndex + 1; // skip whitespace
    while (valueStart < line.length()) {
      char c = line.charAt(valueStart);
      if (c != ' ' && c != '\t') {
        break;
      }
      valueStart++;
    }
    String value = line.substring(valueStart);
    headers.set(key, value);
  }

  /**
   * Reads header lines up to the terminating blank line and adds them to our
   * headers Metadata. If a line contains the start of an HTML document (the
   * server omitted the blank line), the HTML part is pushed back onto the
   * stream so it is read as body content.
   *
   * @param httpHeaders
   *          if non-null, every raw line read is appended here verbatim
   */
  private void parseHeaders(PushbackInputStream in, StringBuffer line,
      StringBuffer httpHeaders) throws IOException, HttpException {

    while (readLine(in, line, true) != 0) {

      if (httpHeaders != null) {
        httpHeaders.append(line).append("\n");
      }

      // handle HTTP responses with missing blank line after headers
      int pos;
      if (((pos = line.indexOf("<!DOCTYPE")) != -1)
          || ((pos = line.indexOf("<HTML")) != -1)
          || ((pos = line.indexOf("<html")) != -1)) {

        // readLine() stored each byte as one char, so ISO-8859-1 restores the
        // exact original bytes (UTF-8 would alter every byte >= 0x80)
        in.unread(line.substring(pos).getBytes("ISO-8859-1"));
        line.setLength(pos);

        try {
          // TODO: (CM) We don't know the header names here
          // since we're just handling them generically. It would
          // be nice to provide some sort of mapping function here
          // for the returned header names to the standard metadata
          // names in the ParseData class
          processHeaderLine(line);
        } catch (Exception e) {
          // best effort: whatever preceded the HTML may not be a valid header
          Http.LOG.warn("Error: ", e);
        }
        return;
      }

      processHeaderLine(line);
    }
  }

  /**
   * Reads one line terminated by CR, LF or CRLF into <code>line</code>
   * (without the terminator), storing each byte as one char.
   *
   * @param allowContinuedLine
   *          if true, a non-empty line followed by a line starting with a
   *          space or tab is treated as continued; the terminator and that
   *          first whitespace character are dropped and reading goes on
   * @return the length of the line read
   * @throws EOFException
   *           if the stream ends before a line terminator is seen
   */
  private static int readLine(PushbackInputStream in, StringBuffer line,
      boolean allowContinuedLine) throws IOException {
    line.setLength(0);
    for (int c = in.read(); c != -1; c = in.read()) {
      switch (c) {
      case '\r':
        if (peek(in) == '\n') {
          in.read();
        }
        // fall through: a bare CR ends the line just like LF
      case '\n':
        // at EOL -- check for continued line if the current
        // (possibly continued) line wasn't blank
        if (line.length() > 0 && allowContinuedLine) {
          int next = peek(in);
          if (next == ' ' || next == '\t') { // line is continued
            in.read();
            continue;
          }
        }
        return line.length(); // else complete
      default:
        line.append((char) c);
      }
    }
    throw new EOFException();
  }

  /**
   * Returns the next byte of the stream without consuming it, or -1 at end of
   * stream.
   */
  private static int peek(PushbackInputStream in) throws IOException {
    int value = in.read();
    if (value != -1) {
      // never push back the EOF marker: unread(-1) would insert a 0xFF byte
      in.unread(value);
    }
    return value;
  }

}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/package.html ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/package.html b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/package.html deleted file mode 100644 index 34d1d1c..0000000 --- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/src/test/conf/nutch-site-test.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-http/src/test/conf/nutch-site-test.xml b/src/plugin/protocol-http/src/test/conf/nutch-site-test.xml deleted file mode 100644 index a9afd78..0000000 --- a/src/plugin/protocol-http/src/test/conf/nutch-site-test.xml +++ /dev/null @@ -1,52 +0,0 @@ -<?xml version="1.0"?> -<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> - -<configuration> - -<property> - <name>http.robots.agents</name> - <value>Nutch-Test,*</value> - <description></description> -</property> - -<property> - <name>http.agent.name</name> - <value>Nutch-Test</value> - <description></description> -</property> - -<property> - <name>http.agent.description</name> - <value>Nutch protocol-httpclient test</value> - <description></description> -</property> - -<property> - <name>http.auth.file</name> - <value>httpclient-auth-test.xml</value> - <description></description> -</property> - -<property> - <name>http.timeout</name> - <value>60000</value> - <description></description> -</property> - -</configuration> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java deleted file mode 100644 index 7dd9e9b..0000000 --- a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.protocol.http; - -import static org.junit.Assert.assertEquals; - -import java.net.URL; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ProtocolOutput; -import org.junit.After; -import org.junit.Test; -import org.mortbay.jetty.Server; -import org.mortbay.jetty.nio.SelectChannelConnector; -import org.mortbay.jetty.servlet.Context; -import org.mortbay.jetty.servlet.ServletHolder; - -/** - * Test cases for protocol-http - */ -public class TestProtocolHttp { - private static final String RES_DIR = System.getProperty("test.data", "."); - - private Http http; - private Server server; - private Context root; - private Configuration conf; - private int port; - - public void setUp(boolean redirection) throws Exception { - conf = new Configuration(); - conf.addResource("nutch-default.xml"); - conf.addResource("nutch-site-test.xml"); - - http = new Http(); - http.setConf(conf); - - server = new Server(); - - if (redirection) { - root = new Context(server, "/redirection", Context.SESSIONS); - root.setAttribute("newContextURL", "/redirect"); - } else { - root = new Context(server, "/", Context.SESSIONS); - } - - ServletHolder sh = new ServletHolder( - org.apache.jasper.servlet.JspServlet.class); - root.addServlet(sh, "*.jsp"); - root.setResourceBase(RES_DIR); - } - - @After - public void tearDown() throws Exception { - server.stop(); - } - - @Test - public void testStatusCode() throws Exception { - startServer(47504, false); - fetchPage("/basic-http.jsp", 200); - fetchPage("/redirect301.jsp", 301); - fetchPage("/redirect302.jsp", 302); - fetchPage("/nonexists.html", 404); - fetchPage("/brokenpage.jsp", 500); - } - - @Test - public void 
testRedirectionJetty() throws Exception { - // Redirection via Jetty - startServer(47503, true); - fetchPage("/redirection", 302); - } - - /** - * Starts the Jetty server at a specified port and redirection parameter. - * - * @param portno - * Port number. - * @param redirection - * whether redirection - */ - private void startServer(int portno, boolean redirection) throws Exception { - port = portno; - setUp(redirection); - SelectChannelConnector connector = new SelectChannelConnector(); - connector.setHost("127.0.0.1"); - connector.setPort(port); - - server.addConnector(connector); - server.start(); - } - - /** - * Fetches the specified <code>page</code> from the local Jetty server and - * checks whether the HTTP response status code matches with the expected - * code. Also use jsp pages for redirection. - * - * @param page - * Page to be fetched. - * @param expectedCode - * HTTP response status code expected while fetching the page. - */ - private void fetchPage(String page, int expectedCode) throws Exception { - URL url = new URL("http", "127.0.0.1", port, page); - CrawlDatum crawlDatum = new CrawlDatum(); - Response response = http.getResponse(url, crawlDatum, true); - ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()), - crawlDatum); - Content content = out.getContent(); - assertEquals("HTTP Status Code for " + url, expectedCode, - response.getCode()); - - if (page.compareTo("/nonexists.html") != 0 - && page.compareTo("/brokenpage.jsp") != 0 - && page.compareTo("/redirection") != 0) { - assertEquals("ContentType " + url, "text/html", - content.getContentType()); - } - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/build.xml b/src/plugin/protocol-httpclient/build.xml deleted file mode 100644 index b66eb97..0000000 --- a/src/plugin/protocol-httpclient/build.xml +++ 
/dev/null @@ -1,45 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="protocol-httpclient" default="jar-core"> - - <import file="../build-plugin.xml"/> - - <target name="deps-jar"> - <ant target="jar" inheritall="false" dir="../lib-http"/> - </target> - - <path id="plugin.deps"> - <fileset dir="${nutch.root}/build"> - <include name="**/lib-http/*.jar" /> - </fileset> - <pathelement location="${build.dir}/test/conf"/> - </path> - - <target name="deps-test"> - <copy toDir="${build.test}"> - <fileset dir="${src.test}" excludes="**/*.java"/> - </copy> - </target> - - <!-- for junit test --> - <mkdir dir="${build.test}/data" /> - <copy todir="${build.test}/data"> - <fileset dir="jsp"/> - </copy> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/ivy.xml b/src/plugin/protocol-httpclient/ivy.xml deleted file mode 100644 index 00b6f07..0000000 --- a/src/plugin/protocol-httpclient/ivy.xml +++ /dev/null @@ -1,42 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor 
license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - <dependency org="org.jsoup" name="jsoup" rev="1.8.1" /> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/jsp/basic.jsp ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/jsp/basic.jsp b/src/plugin/protocol-httpclient/jsp/basic.jsp deleted file mode 100644 index c5bfb89..0000000 --- a/src/plugin/protocol-httpclient/jsp/basic.jsp +++ /dev/null @@ -1,74 +0,0 @@ -<%-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. 
- The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---%><%-- - This JSP demonstrates basic authentication. When this JSP page is - requested with no query parameters, then the user must enter the - username as 'userx' and password as 'passx' when prompted for - authentication. Apart from this there are a few other test cases, - which can be used by passing a test case number as query parameter in - the following manner: basic.jsp?case=1, basic.jsp?case=2, etc. - The credentials for each test case can be easily figured out from the - code below. 
- - Author: Susam Pal ---%><%@ page - import = "sun.misc.BASE64Decoder" -%><% - String authHeader = request.getHeader("Authorization"); - String realm = null; - String username = null; - String password = null; - int testCase = 0; - try { - testCase = Integer.parseInt(request.getParameter("case")); - } catch (Exception ex) { - // do nothing - } - switch (testCase) { - case 1: - realm = "realm1"; username = "user1"; password = "pass1"; - break; - - case 2: - realm = "realm2"; username = "user2"; password = "pass2"; - break; - - default: - realm = "realmx"; username = "userx"; password = "passx"; - break; - } - - boolean authenticated = false; - if (authHeader != null && authHeader.toUpperCase().startsWith("BASIC")) { - String creds[] = new String(new BASE64Decoder().decodeBuffer( - authHeader.substring(6))).split(":", 2); - if (creds[0].equals(username) && creds[1].equals(password)) - authenticated = true; - } - if (!authenticated) { - response.setHeader("WWW-Authenticate", "Basic realm=\"" + realm + "\""); - response.sendError(response.SC_UNAUTHORIZED); - } else { -%> -<html> -<head><title>Basic Authentication Test</title></head> -<body> -<p>Hi <%= username %>, you have been successfully authenticated.</p> -</body> -</html> -<% - } -%> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/jsp/cookies.jsp ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/jsp/cookies.jsp b/src/plugin/protocol-httpclient/jsp/cookies.jsp deleted file mode 100644 index ae2ace2..0000000 --- a/src/plugin/protocol-httpclient/jsp/cookies.jsp +++ /dev/null @@ -1,63 +0,0 @@ -<%-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. 
- The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---%><%-- - This JSP tests whether the client can remember cookies. When the JSP - is fetched for the first time without any query parameters, it sets - a few cookies in the client. On a second request, with the query - parameter, 'cookie=yes', it checks whether all the client has sent - the cookies. If the cookies are found, HTTP 200 response is returned. - If the cookies are not found, HTTP 403 response is returned. - - Author: Susam Pal ---%><% - String cookieParam = request.getParameter("cookie"); - if (!"yes".equals(cookieParam)) { // Send cookies - response.addCookie(new Cookie("var1", "val1")); - response.addCookie(new Cookie("var2", "val2")); -%> -<html> -<head><title>Cookies Set</title></head> -<body><p>Cookies have been set.</p></body> -</html> -<% - } else { // Check cookies - int cookiesCount = 0; - - Cookie[] cookies = request.getCookies(); - if (cookies != null) { - for (int i = 0; i < cookies.length; i++) { - if (cookies[i].getName().equals("var1") - && cookies[i].getValue().equals("val1")) - cookiesCount++; - - if (cookies[i].getName().equals("var2") - && cookies[i].getValue().equals("val2")) - cookiesCount++; - } - } - - if (cookiesCount != 2) { - response.sendError(response.SC_FORBIDDEN); - } else { -%> -<html> -<head><title>Cookies Found</title></head> -<body><p>Cookies found!</p></body> -</html> -<% - } - } -%> \ No newline at end of file 
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/jsp/digest.jsp ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/jsp/digest.jsp b/src/plugin/protocol-httpclient/jsp/digest.jsp deleted file mode 100644 index c657484..0000000 --- a/src/plugin/protocol-httpclient/jsp/digest.jsp +++ /dev/null @@ -1,68 +0,0 @@ -<%-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---%><%-- - This JSP tests digest authentication. It generates an HTTP response - with authorization header for digest authentication and checks the - user-name supplied by the client. It does not check the other - parameters and hashes as controlled JUnit tests would be performed - against this and only the proper submission of credentials need to - be tested. 
- - Author: Susam Pal ---%><%@ page - import = "java.util.StringTokenizer" - import = "java.util.HashMap" -%><% - String username = "digest_user"; - String authHeader = request.getHeader("Authorization"); - - boolean authenticated = false; - if (authHeader != null && authHeader.toUpperCase().startsWith("DIGEST")) { - HashMap map = new HashMap(); - StringTokenizer tokenizer = new StringTokenizer( - authHeader.substring(7).trim(), ","); - while (tokenizer.hasMoreTokens()) { - String[] param = tokenizer.nextToken().trim().split("=", 2); - if (param[1].charAt(0) == '"') { - param[1] = param[1].substring(1, param[1].length() - 1); - } - map.put(param[0], param[1]); - } - - if (username.equals((String)map.get("username"))) - authenticated = true; - } - - if (!authenticated) { - String realm = "realm=\"realm1\""; - String qop = "qop=\"auth,auth-int\""; - String nonce = "nonce=\"dcd98b7102dd2f0e8b11d0f600bfb0c093\""; - String opaque = "opaque=\"5ccc069c403ebaf9f0171e9517f40e41\""; - - response.setHeader("WWW-Authenticate", "Digest " + realm + ", " - + qop + ", " + nonce + ", " + opaque); - response.sendError(response.SC_UNAUTHORIZED); - } else { -%> -<html> -<head><title>Digest Authentication Test</title></head> -<body> -<p>Hi <%= username %>, you have been successfully authenticated.</p> -</body> -</html> -<% - } -%> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/jsp/noauth.jsp ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/jsp/noauth.jsp b/src/plugin/protocol-httpclient/jsp/noauth.jsp deleted file mode 100644 index c726b0f..0000000 --- a/src/plugin/protocol-httpclient/jsp/noauth.jsp +++ /dev/null @@ -1,36 +0,0 @@ -<%-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. 
- The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---%><%-- - This JSP tests whether the client is sending any pre-emptive - authentication headers. The client is expected not to send pre-emptive - authentication headers. If such authentication headers are found, this - JSP will return an HTTP 403 response; HTTP 200 response otherwise. - - Author: Susam Pal ---%><% - if (request.getHeader("Authorization") != null) { - response.sendError(response.SC_UNAUTHORIZED); - } else { -%> -<html> -<head><title>No authorization headers found</title></head> -<body> -<p>No authorization headers found.</p> -</body> -</html> -<% - } -%> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/jsp/ntlm.jsp ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/jsp/ntlm.jsp b/src/plugin/protocol-httpclient/jsp/ntlm.jsp deleted file mode 100644 index 6ad921e..0000000 --- a/src/plugin/protocol-httpclient/jsp/ntlm.jsp +++ /dev/null @@ -1,89 +0,0 @@ -<%-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---%><%-- - This JSP tests NTLM authentication. It generates an HTTP response - with authorization header for NTLM authentication and checks the - user-name supplied by the client. It does not check the other - parameters and hashes as controlled JUnit tests would be performed - against this and only the proper submission of credentials need to - be tested. - - Author: Susam Pal ---%><%@ page - import = "sun.misc.BASE64Decoder" - import = "sun.misc.BASE64Encoder" -%><% - String authHeader = request.getHeader("Authorization"); - String username = null; - String domain = null; - String host = null; - - boolean authenticated = false; - if (authHeader != null && authHeader.startsWith("NTLM")) { - byte[] msg = new BASE64Decoder().decodeBuffer( - authHeader.substring(5)); - if (msg[8] == 1) { - byte[] type2msg = { - 'N', 'T', 'L', 'M', 'S', 'S', 'P', 0, // NTLMSSP Signature - 2, 0, 0, 0, // Type 2 Indicator - 10, 0, 10, 0, 32, 0, 0, 0, // length, offset - 0x00, 0x02, (byte) 0x81, 0, // Flags - 1, 2, 3, 4, 5, 6, 7, 8, // Challenge - 'N', 'U', 'T', 'C', 'H' // NUTCH (Domain) - }; - response.setHeader("WWW-Authenticate", "NTLM " - + new BASE64Encoder().encodeBuffer(type2msg)); - response.sendError(response.SC_UNAUTHORIZED); - return; - } else if (msg[8] == 3) { - int length; - int offset; - - // Get domain name - length = msg[30] + msg[31] * 256; - offset = msg[32] + msg[33] * 256; - domain = new String(msg, offset, length); - - // Get user name - length = msg[38] + msg[39] * 256; - offset = msg[40] + msg[41] * 256; - username = new String(msg, offset, length); - - // Get 
password - length = msg[46] + msg[47] * 256; - offset = msg[48] + msg[49] * 256; - host = new String(msg, offset, length); - - if ("ntlm_user".equalsIgnoreCase(username) - && "NUTCH".equalsIgnoreCase(domain)) - authenticated = true; - } - } - - if (!authenticated) { - response.setHeader("WWW-Authenticate", "NTLM"); - response.sendError(response.SC_UNAUTHORIZED); - } else { -%> -<html> -<head>NTLM Authentication Test</head> -<body> -<p>Hi <%= username %>, You have been successfully authenticated.</p> -</body> -</html> -<% - } -%> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/plugin.xml b/src/plugin/protocol-httpclient/plugin.xml deleted file mode 100644 index 1747713..0000000 --- a/src/plugin/protocol-httpclient/plugin.xml +++ /dev/null @@ -1,58 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<plugin - id="protocol-httpclient" - name="Http / Https Protocol Plug-in" - version="1.0.0" - provider-name="nutch.org"> - - <runtime> - <library name="protocol-httpclient.jar"> - <export name="*"/> - </library> - <library name="jsoup-1.8.1.jar"/> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints"/> - <import plugin="lib-http"/> - </requires> - - <extension id="org.apache.nutch.protocol.httpclient" - name="HttpProtocol" - point="org.apache.nutch.protocol.Protocol"> - - <implementation id="org.apache.nutch.protocol.httpclient.Http" - class="org.apache.nutch.protocol.httpclient.Http"> - <parameter name="protocolName" value="http"/> - </implementation> - - </extension> - - <extension id="org.apache.nutch.protocol.https" - name="HttpsProtocol" - point="org.apache.nutch.protocol.Protocol"> - - <implementation id="org.apache.nutch.protocol.httpclient.Http" - class="org.apache.nutch.protocol.httpclient.Http"> - <parameter name="protocolName" value="https"/> - </implementation> - - </extension> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java deleted file mode 100644 index afcf24a..0000000 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * Based on EasySSLProtocolSocketFactory from commons-httpclient: - * - * $Header: - * /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons//httpclient/src/contrib/org/apache/commons/httpclient/contrib/ssl/DummySSLProtocolSocketFactory.java,v - * 1.7 2004/06/11 19:26:27 olegk Exp $ $Revision$ $Date: 2005-02-26 05:01:52 - * -0800 (Sat, 26 Feb 2005) $ - */ - -package org.apache.nutch.protocol.httpclient; - -import java.io.IOException; -import java.net.InetAddress; -import java.net.Socket; -import java.net.UnknownHostException; - -import org.apache.commons.httpclient.ConnectTimeoutException; -import org.apache.commons.httpclient.HttpClientError; -import org.apache.commons.httpclient.params.HttpConnectionParams; -import org.apache.commons.httpclient.protocol.ControllerThreadSocketFactory; -import org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.net.ssl.SSLContext; -import javax.net.ssl.TrustManager; - -public class DummySSLProtocolSocketFactory implements - SecureProtocolSocketFactory { - - /** Logger object for this class. */ - private static final Logger LOG = LoggerFactory - .getLogger(DummySSLProtocolSocketFactory.class); - - private SSLContext sslcontext = null; - - /** - * Constructor for DummySSLProtocolSocketFactory. 
- */ - public DummySSLProtocolSocketFactory() { - super(); - } - - private static SSLContext createEasySSLContext() { - try { - SSLContext context = SSLContext.getInstance("SSL"); - context.init(null, - new TrustManager[] { new DummyX509TrustManager(null) }, null); - return context; - } catch (Exception e) { - if (LOG.isErrorEnabled()) { - LOG.error(e.getMessage(), e); - } - throw new HttpClientError(e.toString()); - } - } - - private SSLContext getSSLContext() { - if (this.sslcontext == null) { - this.sslcontext = createEasySSLContext(); - } - return this.sslcontext; - } - - /** - * @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(String,int,InetAddress,int) - */ - public Socket createSocket(String host, int port, InetAddress clientHost, - int clientPort) throws IOException, UnknownHostException { - - return getSSLContext().getSocketFactory().createSocket(host, port, - clientHost, clientPort); - } - - /** - * Attempts to get a new socket connection to the given host within the given - * time limit. - * <p> - * To circumvent the limitations of older JREs that do not support connect - * timeout a controller thread is executed. The controller thread attempts to - * create a new socket within the given limit of time. 
If socket constructor - * does not return until the timeout expires, the controller terminates and - * throws an {@link ConnectTimeoutException} - * </p> - * - * @param host - * the host name/IP - * @param port - * the port on the host - * @param localAddress - * the local host name/IP to bind the socket to - * @param localPort - * the port on the local machine - * @param params - * {@link HttpConnectionParams Http connection parameters} - * - * @return Socket a new socket - * - * @throws IOException - * if an I/O error occurs while creating the socket - * @throws UnknownHostException - * if the IP address of the host cannot be determined - */ - public Socket createSocket(final String host, final int port, - final InetAddress localAddress, final int localPort, - final HttpConnectionParams params) throws IOException, - UnknownHostException, ConnectTimeoutException { - if (params == null) { - throw new IllegalArgumentException("Parameters may not be null"); - } - int timeout = params.getConnectionTimeout(); - if (timeout == 0) { - return createSocket(host, port, localAddress, localPort); - } else { - // To be eventually deprecated when migrated to Java 1.4 or above - return ControllerThreadSocketFactory.createSocket(this, host, port, - localAddress, localPort, timeout); - } - } - - /** - * @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(String,int) - */ - public Socket createSocket(String host, int port) throws IOException, - UnknownHostException { - return getSSLContext().getSocketFactory().createSocket(host, port); - } - - /** - * @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(Socket,String,int,boolean) - */ - public Socket createSocket(Socket socket, String host, int port, - boolean autoClose) throws IOException, UnknownHostException { - return getSSLContext().getSocketFactory().createSocket(socket, host, port, - autoClose); - } - - public boolean equals(Object obj) { - return ((obj != 
null) && obj.getClass().equals( - DummySSLProtocolSocketFactory.class)); - } - - public int hashCode() { - return DummySSLProtocolSocketFactory.class.hashCode(); - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java deleted file mode 100644 index b5509cc..0000000 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * Based on EasyX509TrustManager from commons-httpclient. 
- */ - -package org.apache.nutch.protocol.httpclient; - -import java.security.KeyStore; -import java.security.KeyStoreException; -import java.security.NoSuchAlgorithmException; -import java.security.cert.CertificateException; -import java.security.cert.X509Certificate; - -import javax.net.ssl.TrustManagerFactory; -import javax.net.ssl.TrustManager; -import javax.net.ssl.X509TrustManager; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class DummyX509TrustManager implements X509TrustManager { - private X509TrustManager standardTrustManager = null; - - /** Logger object for this class. */ - private static final Logger LOG = LoggerFactory - .getLogger(DummyX509TrustManager.class); - - /** - * Constructor for DummyX509TrustManager. - */ - public DummyX509TrustManager(KeyStore keystore) - throws NoSuchAlgorithmException, KeyStoreException { - super(); - String algo = TrustManagerFactory.getDefaultAlgorithm(); - TrustManagerFactory factory = TrustManagerFactory.getInstance(algo); - factory.init(keystore); - TrustManager[] trustmanagers = factory.getTrustManagers(); - if (trustmanagers.length == 0) { - throw new NoSuchAlgorithmException(algo + " trust manager not supported"); - } - this.standardTrustManager = (X509TrustManager) trustmanagers[0]; - } - - /** - * @see javax.net.ssl.X509TrustManager#checkClientTrusted(X509Certificate[], - * String) - */ - public boolean isClientTrusted(X509Certificate[] certificates) { - return true; - } - - /** - * @see javax.net.ssl.X509TrustManager#checkServerTrusted(X509Certificate[], - * String) - */ - public boolean isServerTrusted(X509Certificate[] certificates) { - return true; - } - - /** - * @see javax.net.ssl.X509TrustManager#getAcceptedIssuers() - */ - public X509Certificate[] getAcceptedIssuers() { - return this.standardTrustManager.getAcceptedIssuers(); - } - - public void checkClientTrusted(X509Certificate[] arg0, String arg1) - throws CertificateException { - // do nothing - - } - - public void 
checkServerTrusted(X509Certificate[] arg0, String arg1) - throws CertificateException { - // do nothing - - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java deleted file mode 100644 index 75506ce..0000000 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java +++ /dev/null @@ -1,572 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.nutch.protocol.httpclient; - -// JDK imports -import java.io.InputStream; -import java.io.IOException; -import java.net.URL; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; - -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.parsers.ParserConfigurationException; - -import org.xml.sax.SAXException; -import org.w3c.dom.Document; -import org.w3c.dom.Element; -import org.w3c.dom.NodeList; -import org.w3c.dom.Node; - -// Slf4j Logging imports -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -// HTTP Client imports -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.HostConfiguration; -import org.apache.commons.httpclient.HttpClient; -import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; -import org.apache.commons.httpclient.NTCredentials; -import org.apache.commons.httpclient.auth.AuthScope; -import org.apache.commons.httpclient.params.HttpConnectionManagerParams; -import org.apache.commons.httpclient.protocol.Protocol; -import org.apache.commons.httpclient.protocol.ProtocolSocketFactory; -// NUTCH-1929 Consider implementing dependency injection for crawl HTTPS sites that use self signed certificates -//import org.apache.commons.httpclient.protocol.SSLProtocolSocketFactory; - -import org.apache.commons.lang.StringUtils; -// Nutch imports -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.protocol.http.api.HttpBase; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; - -/** - * <p> - * This class is a protocol plugin that configures an HTTP client for Basic, - * Digest and NTLM authentication schemes for web server as well as proxy - * server. It takes care of HTTPS protocol as well as cookies in a single fetch - * session. 
- * </p> - * <p> - * Documentation can be found on the Nutch <a - * href="https://wiki.apache.org/nutch/HttpAuthenticationSchemes" - * >HttpAuthenticationSchemes</a> wiki page. - * </p> - * <p> - * The original description of the motivation to support <a - * href="https://wiki.apache.org/nutch/HttpPostAuthentication" - * >HttpPostAuthentication</a> is also included on the Nutch wiki. Additionally - * HttpPostAuthentication development is documented at the <a - * href="https://issues.apache.org/jira/browse/NUTCH-827">NUTCH-827</a> Jira - * issue. - * - * @author Susam Pal - */ -public class Http extends HttpBase { - - public static final Logger LOG = LoggerFactory.getLogger(Http.class); - - private static MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager(); - - // Since the Configuration has not yet been set, - // then an unconfigured client is returned. - private static HttpClient client = new HttpClient(connectionManager); - private static String defaultUsername; - private static String defaultPassword; - private static String defaultRealm; - private static String defaultScheme; - private static String authFile; - private static String agentHost; - private static boolean authRulesRead = false; - private static Configuration conf; - - private int maxThreadsTotal = 10; - - private String proxyUsername; - private String proxyPassword; - private String proxyRealm; - - private static HttpFormAuthConfigurer formConfigurer; - - /** - * Returns the configured HTTP client. - * - * @return HTTP client - */ - static synchronized HttpClient getClient() { - return client; - } - - /** - * Constructs this plugin. - */ - public Http() { - super(LOG); - } - - /** - * Reads the configuration from the Nutch configuration files and sets the - * configuration. 
- * - * @param conf - * Configuration - */ - public void setConf(Configuration conf) { - super.setConf(conf); - this.conf = conf; - this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10); - this.proxyUsername = conf.get("http.proxy.username", ""); - this.proxyPassword = conf.get("http.proxy.password", ""); - this.proxyRealm = conf.get("http.proxy.realm", ""); - agentHost = conf.get("http.agent.host", ""); - authFile = conf.get("http.auth.file", ""); - configureClient(); - try { - setCredentials(); - } catch (Exception ex) { - if (LOG.isErrorEnabled()) { - LOG.error("Could not read " + authFile + " : " + ex.getMessage()); - } - } - } - - /** - * Main method. - * - * @param args - * Command line arguments - */ - public static void main(String[] args) throws Exception { - Http http = new Http(); - http.setConf(NutchConfiguration.create()); - main(http, args); - } - - /** - * Fetches the <code>url</code> with a configured HTTP client and gets the - * response. - * - * @param url - * URL to be fetched - * @param datum - * Crawl data - * @param redirect - * Follow redirects if and only if true - * @return HTTP response - */ - protected Response getResponse(URL url, CrawlDatum datum, boolean redirect) - throws ProtocolException, IOException { - resolveCredentials(url); - return new HttpResponse(this, url, datum, redirect); - } - - /** - * Configures the HTTP client - */ - private void configureClient() { - - // Set up an HTTPS socket factory that accepts self-signed certs. 
- // ProtocolSocketFactory factory = new SSLProtocolSocketFactory(); - ProtocolSocketFactory factory = new DummySSLProtocolSocketFactory(); - Protocol https = new Protocol("https", factory, 443); - Protocol.registerProtocol("https", https); - - HttpConnectionManagerParams params = connectionManager.getParams(); - params.setConnectionTimeout(timeout); - params.setSoTimeout(timeout); - params.setSendBufferSize(BUFFER_SIZE); - params.setReceiveBufferSize(BUFFER_SIZE); - - // -------------------------------------------------------------------------------- - // NUTCH-1836: Modification to increase the number of available connections - // for multi-threaded crawls. - // -------------------------------------------------------------------------------- - params.setMaxTotalConnections(conf.getInt( - "mapred.tasktracker.map.tasks.maximum", 5) - * conf.getInt("fetcher.threads.fetch", maxThreadsTotal)); - - // Also set max connections per host to maxThreadsTotal since all threads - // might be used to fetch from the same host - otherwise timeout errors can - // occur - params.setDefaultMaxConnectionsPerHost(conf.getInt( - "fetcher.threads.fetch", maxThreadsTotal)); - - // executeMethod(HttpMethod) seems to ignore the connection timeout on the - // connection manager. - // set it explicitly on the HttpClient. 
- client.getParams().setConnectionManagerTimeout(timeout); - - HostConfiguration hostConf = client.getHostConfiguration(); - ArrayList<Header> headers = new ArrayList<Header>(); - // Set the User Agent in the header - // headers.add(new Header("User-Agent", userAgent)); //NUTCH-1941 - // prefer English - headers.add(new Header("Accept-Language", acceptLanguage)); - // prefer UTF-8 - headers.add(new Header("Accept-Charset", "utf-8,ISO-8859-1;q=0.7,*;q=0.7")); - // prefer understandable formats - headers - .add(new Header( - "Accept", - "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5")); - // accept gzipped content - headers.add(new Header("Accept-Encoding", "x-gzip, gzip, deflate")); - hostConf.getParams().setParameter("http.default-headers", headers); - - // HTTP proxy server details - if (useProxy) { - hostConf.setProxy(proxyHost, proxyPort); - - if (proxyUsername.length() > 0) { - - AuthScope proxyAuthScope = getAuthScope(this.proxyHost, this.proxyPort, - this.proxyRealm); - - NTCredentials proxyCredentials = new NTCredentials(this.proxyUsername, - this.proxyPassword, Http.agentHost, this.proxyRealm); - - client.getState().setProxyCredentials(proxyAuthScope, proxyCredentials); - } - } - - } - - /** - * Reads authentication configuration file (defined as 'http.auth.file' in - * Nutch configuration file) and sets the credentials for the configured - * authentication scopes in the HTTP client object. - * - * @throws ParserConfigurationException - * If a document builder can not be created. - * @throws SAXException - * If any parsing error occurs. - * @throws IOException - * If any I/O error occurs. 
- */ - private static synchronized void setCredentials() - throws ParserConfigurationException, SAXException, IOException { - - if (authRulesRead) - return; - - authRulesRead = true; // Avoid re-attempting to read - - InputStream is = conf.getConfResourceAsInputStream(authFile); - if (is != null) { - Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder() - .parse(is); - - Element rootElement = doc.getDocumentElement(); - if (!"auth-configuration".equals(rootElement.getTagName())) { - if (LOG.isWarnEnabled()) - LOG.warn("Bad auth conf file: root element <" - + rootElement.getTagName() + "> found in " + authFile - + " - must be <auth-configuration>"); - } - - // For each set of credentials - NodeList credList = rootElement.getChildNodes(); - for (int i = 0; i < credList.getLength(); i++) { - Node credNode = credList.item(i); - if (!(credNode instanceof Element)) - continue; - - Element credElement = (Element) credNode; - if (!"credentials".equals(credElement.getTagName())) { - if (LOG.isWarnEnabled()) - LOG.warn("Bad auth conf file: Element <" + credElement.getTagName() - + "> not recognized in " + authFile - + " - expected <credentials>"); - continue; - } - - String authMethod = credElement.getAttribute("authMethod"); - // read http form post auth info - if (StringUtils.isNotBlank(authMethod)) { - formConfigurer = readFormAuthConfigurer(credElement, authMethod); - continue; - } - - String username = credElement.getAttribute("username"); - String password = credElement.getAttribute("password"); - - // For each authentication scope - NodeList scopeList = credElement.getChildNodes(); - for (int j = 0; j < scopeList.getLength(); j++) { - Node scopeNode = scopeList.item(j); - if (!(scopeNode instanceof Element)) - continue; - - Element scopeElement = (Element) scopeNode; - - if ("default".equals(scopeElement.getTagName())) { - - // Determine realm and scheme, if any - String realm = scopeElement.getAttribute("realm"); - String scheme = 
scopeElement.getAttribute("scheme"); - - // Set default credentials - defaultUsername = username; - defaultPassword = password; - defaultRealm = realm; - defaultScheme = scheme; - - if (LOG.isTraceEnabled()) { - LOG.trace("Credentials - username: " + username - + "; set as default" + " for realm: " + realm + "; scheme: " - + scheme); - } - - } else if ("authscope".equals(scopeElement.getTagName())) { - - // Determine authentication scope details - String host = scopeElement.getAttribute("host"); - int port = -1; // For setting port to AuthScope.ANY_PORT - try { - port = Integer.parseInt(scopeElement.getAttribute("port")); - } catch (Exception ex) { - // do nothing, port is already set to any port - } - String realm = scopeElement.getAttribute("realm"); - String scheme = scopeElement.getAttribute("scheme"); - - // Set credentials for the determined scope - AuthScope authScope = getAuthScope(host, port, realm, scheme); - NTCredentials credentials = new NTCredentials(username, password, - agentHost, realm); - - client.getState().setCredentials(authScope, credentials); - - if (LOG.isTraceEnabled()) { - LOG.trace("Credentials - username: " + username - + "; set for AuthScope - " + "host: " + host + "; port: " - + port + "; realm: " + realm + "; scheme: " + scheme); - } - - } else { - if (LOG.isWarnEnabled()) - LOG.warn("Bad auth conf file: Element <" - + scopeElement.getTagName() + "> not recognized in " - + authFile + " - expected <authscope>"); - } - } - is.close(); - } - } - } - - /** - * <auth-configuration> <credentials authMethod="formAuth" loginUrl="loginUrl" - * loginFormId="loginFormId" loginRedirect="true"> <loginPostData> <field - * name="username" value="user1"/> </loginPostData> <additionalPostHeaders> - * <field name="header1" value="vaule1"/> </additionalPostHeaders> - * <removedFormFields> <field name="header1"/> </removedFormFields> - * </credentials> </auth-configuration> - */ - private static HttpFormAuthConfigurer readFormAuthConfigurer( - Element 
credElement, String authMethod) { - if ("formAuth".equals(authMethod)) { - HttpFormAuthConfigurer formConfigurer = new HttpFormAuthConfigurer(); - - String str = credElement.getAttribute("loginUrl"); - if (StringUtils.isNotBlank(str)) { - formConfigurer.setLoginUrl(str.trim()); - } else { - throw new IllegalArgumentException("Must set loginUrl."); - } - str = credElement.getAttribute("loginFormId"); - if (StringUtils.isNotBlank(str)) { - formConfigurer.setLoginFormId(str.trim()); - } else { - throw new IllegalArgumentException("Must set loginFormId."); - } - str = credElement.getAttribute("loginRedirect"); - if (StringUtils.isNotBlank(str)) { - formConfigurer.setLoginRedirect(Boolean.parseBoolean(str)); - } - - NodeList nodeList = credElement.getChildNodes(); - for (int j = 0; j < nodeList.getLength(); j++) { - Node node = nodeList.item(j); - if (!(node instanceof Element)) - continue; - - Element element = (Element) node; - if ("loginPostData".equals(element.getTagName())) { - Map<String, String> loginPostData = new HashMap<String, String>(); - NodeList childNodes = element.getChildNodes(); - for (int k = 0; k < childNodes.getLength(); k++) { - Node fieldNode = childNodes.item(k); - if (!(fieldNode instanceof Element)) - continue; - - Element fieldElement = (Element) fieldNode; - String name = fieldElement.getAttribute("name"); - String value = fieldElement.getAttribute("value"); - loginPostData.put(name, value); - } - formConfigurer.setLoginPostData(loginPostData); - } else if ("additionalPostHeaders".equals(element.getTagName())) { - Map<String, String> additionalPostHeaders = new HashMap<String, String>(); - NodeList childNodes = element.getChildNodes(); - for (int k = 0; k < childNodes.getLength(); k++) { - Node fieldNode = childNodes.item(k); - if (!(fieldNode instanceof Element)) - continue; - - Element fieldElement = (Element) fieldNode; - String name = fieldElement.getAttribute("name"); - String value = fieldElement.getAttribute("value"); - 
additionalPostHeaders.put(name, value); - } - formConfigurer.setAdditionalPostHeaders(additionalPostHeaders); - } else if ("removedFormFields".equals(element.getTagName())) { - Set<String> removedFormFields = new HashSet<String>(); - NodeList childNodes = element.getChildNodes(); - for (int k = 0; k < childNodes.getLength(); k++) { - Node fieldNode = childNodes.item(k); - if (!(fieldNode instanceof Element)) - continue; - - Element fieldElement = (Element) fieldNode; - String name = fieldElement.getAttribute("name"); - removedFormFields.add(name); - } - formConfigurer.setRemovedFormFields(removedFormFields); - } - } - - return formConfigurer; - } else { - throw new IllegalArgumentException("Unsupported authMethod: " - + authMethod); - } - } - - /** - * If credentials for the authentication scope determined from the specified - * <code>url</code> is not already set in the HTTP client, then this method - * sets the default credentials to fetch the specified <code>url</code>. If - * credentials are found for the authentication scope, the method returns - * without altering the client. 
- * - * @param url - * URL to be fetched - */ - private void resolveCredentials(URL url) { - - if (formConfigurer != null) { - HttpFormAuthentication formAuther = new HttpFormAuthentication( - formConfigurer, client, this); - try { - formAuther.login(); - } catch (Exception e) { - throw new RuntimeException(e); - } - - return; - } - - if (defaultUsername != null && defaultUsername.length() > 0) { - - int port = url.getPort(); - if (port == -1) { - if ("https".equals(url.getProtocol())) - port = 443; - else - port = 80; - } - - AuthScope scope = new AuthScope(url.getHost(), port); - - if (client.getState().getCredentials(scope) != null) { - if (LOG.isTraceEnabled()) - LOG.trace("Pre-configured credentials with scope - host: " - + url.getHost() + "; port: " + port + "; found for url: " + url); - - // Credentials are already configured, so do nothing and return - return; - } - - if (LOG.isTraceEnabled()) - LOG.trace("Pre-configured credentials with scope - host: " - + url.getHost() + "; port: " + port + "; not found for url: " + url); - - AuthScope serverAuthScope = getAuthScope(url.getHost(), port, - defaultRealm, defaultScheme); - - NTCredentials serverCredentials = new NTCredentials(defaultUsername, - defaultPassword, agentHost, defaultRealm); - - client.getState().setCredentials(serverAuthScope, serverCredentials); - } - } - - /** - * Returns an authentication scope for the specified <code>host</code>, - * <code>port</code>, <code>realm</code> and <code>scheme</code>. - * - * @param host - * Host name or address. - * @param port - * Port number. - * @param realm - * Authentication realm. - * @param scheme - * Authentication scheme. 
- */ - private static AuthScope getAuthScope(String host, int port, String realm, - String scheme) { - - if (host.length() == 0) - host = null; - - if (port < 0) - port = -1; - - if (realm.length() == 0) - realm = null; - - if (scheme.length() == 0) - scheme = null; - - return new AuthScope(host, port, realm, scheme); - } - - /** - * Returns an authentication scope for the specified <code>host</code>, - * <code>port</code> and <code>realm</code>. - * - * @param host - * Host name or address. - * @param port - * Port number. - * @param realm - * Authentication realm. - */ - private static AuthScope getAuthScope(String host, int port, String realm) { - - return getAuthScope(host, port, realm, ""); - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java deleted file mode 100644 index 54dc905..0000000 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java +++ /dev/null @@ -1,45 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */
package org.apache.nutch.protocol.httpclient;

import java.util.List;

/**
 * Minimal contract every HTTP authentication implementation has to offer.
 *
 * @see HttpAuthenticationFactory
 *
 * @author Matt Tencati
 */
public interface HttpAuthentication {

  /**
   * Returns the credentials produced by this authentication object.
   *
   * @return the credentials; may be <code>null</code>
   */
  List<String> getCredentials();

  /**
   * Returns the realm this authentication object was created with.
   *
   * @return the realm value
   */
  String getRealm();

}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java deleted file mode 100644 index daff5ec..0000000 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java +++ /dev/null @@ -1,71 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */
package org.apache.nutch.protocol.httpclient;

/**
 * Signals a problem while creating Authentication objects. It may later also
 * serve to collect authentication failures that occur during an Http protocol
 * transfer, so that the user can be shown which credentials a future fetch
 * requires.
 *
 * @author Matt Tencati
 */
public class HttpAuthenticationException extends Exception {

  /**
   * Creates an exception without a detail message.
   */
  public HttpAuthenticationException() {
    // The superclass no-argument constructor is invoked implicitly.
  }

  /**
   * Creates an exception carrying the given detail message.
   *
   * @param message
   *          the detail message, available later through
   *          {@link Throwable#getMessage()}
   */
  public HttpAuthenticationException(String message) {
    super(message);
  }

  /**
   * Creates an exception carrying the given detail message and cause.
   *
   * @param message
   *          the detail message, available later through
   *          {@link Throwable#getMessage()}
   * @param cause
   *          the underlying cause, available later through
   *          {@link #getCause()}
   */
  public HttpAuthenticationException(String message, Throwable cause) {
    super(message, cause);
  }

  /**
   * Creates an exception wrapping the given cause.
   *
   * @param cause
   *          the underlying cause, available later through
   *          {@link #getCause()}
   */
  public HttpAuthenticationException(Throwable cause) {
    super(cause);
  }

}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java deleted file mode 100644 index 064a6d0..0000000 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java +++ /dev/null @@ -1,98 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */
package org.apache.nutch.protocol.httpclient;

// JDK imports
import java.util.ArrayList;
import java.util.Collection;

// Slf4j Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Hadoop imports
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configurable;

// Nutch imports
import org.apache.nutch.metadata.Metadata;

/**
 * Provides the Http protocol implementation with the ability to authenticate
 * when prompted. The goal is to provide multiple authentication types but for
 * now just the {@link HttpBasicAuthentication} authentication type is provided.
 *
 * @see HttpBasicAuthentication
 * @see Http
 * @see HttpResponse
 *
 * @author Matt Tencati
 */
public class HttpAuthenticationFactory implements Configurable {

  /**
   * The HTTP Authentication (WWW-Authenticate) header which is returned by a
   * webserver requiring authentication.
   */
  public static final String WWW_AUTHENTICATE = "WWW-Authenticate";

  public static final Logger LOG = LoggerFactory
      .getLogger(HttpAuthenticationFactory.class);

  /** Configuration handed to the authentication lookups. */
  private Configuration conf = null;

  /**
   * Creates a factory using the given configuration.
   *
   * @param conf
   *          the configuration passed on to the authentication lookups
   */
  public HttpAuthenticationFactory(Configuration conf) {
    setConf(conf);
  }

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  @Override
  public Configuration getConf() {
    return conf;
  }

  /**
   * Looks up an authentication object for the challenge found in the
   * {@link #WWW_AUTHENTICATE} value of the given response headers.
   *
   * @param header
   *          response headers; may be <code>null</code>
   * @return the authentication for the challenge, or <code>null</code> if
   *         <code>header</code> is <code>null</code>, if it carries no
   *         {@link #WWW_AUTHENTICATE} value, if no authentication could be
   *         created for the challenge, or if the lookup threw an exception
   *         (which is logged)
   */
  public HttpAuthentication findAuthentication(Metadata header) {

    if (header == null)
      return null;

    try {
      Collection<String> challenge = new ArrayList<String>();

      // A response without a WWW-Authenticate header yields null here. Skip
      // it instead of running into a NullPointerException below that would
      // only be caught and logged as an error.
      String headerValue = header.get(WWW_AUTHENTICATE);
      if (headerValue != null) {
        challenge.add(headerValue);
      }

      for (String challengeString : challenge) {
        // NOTE(review): a bare NTLM challenge is replaced by a fixed Basic
        // challenge before the lookup; the reason is not visible here.
        if (challengeString.equals("NTLM"))
          challengeString = "Basic realm=techweb";

        if (LOG.isTraceEnabled())
          LOG.trace("Checking challengeString=" + challengeString);

        HttpAuthentication auth = HttpBasicAuthentication.getAuthentication(
            challengeString, conf);
        if (auth != null)
          return auth;

        // TODO Add additional Authentication lookups here
      }
    } catch (Exception e) {
      LOG.error("Error: ", e);
    }
    return null;
  }
}
