http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java new file mode 100644 index 0000000..8b1a031 --- /dev/null +++ b/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java @@ -0,0 +1,573 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.protocol.htmlunit; + +import java.io.BufferedInputStream; +import java.io.ByteArrayOutputStream; +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.PushbackInputStream; +import java.net.InetSocketAddress; +import java.net.Socket; +import java.net.URL; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import javax.net.ssl.SSLSocket; +import javax.net.ssl.SSLSocketFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.SpellCheckedMetadata; +import org.apache.nutch.net.protocols.HttpDateFormat; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.protocol.http.api.HttpBase; +import org.apache.nutch.protocol.http.api.HttpException; + +/** + * An HTTP response. + */ +public class HttpResponse implements Response { + + private Configuration conf; + private HttpBase http; + private URL url; + private String orig; + private String base; + private byte[] content; + private int code; + private Metadata headers = new SpellCheckedMetadata(); + // used for storing the http headers verbatim + private StringBuffer httpHeaders; + + protected enum Scheme { + HTTP, HTTPS, + } + + /** + * Default public constructor. 
+ * + * @param http + * @param url + * @param datum + * @throws ProtocolException + * @throws IOException + */ + public HttpResponse(HttpBase http, URL url, CrawlDatum datum) + throws ProtocolException, IOException { + + this.http = http; + this.url = url; + this.orig = url.toString(); + this.base = url.toString(); + + Scheme scheme = null; + + if ("http".equals(url.getProtocol())) { + scheme = Scheme.HTTP; + } else if ("https".equals(url.getProtocol())) { + scheme = Scheme.HTTPS; + } else { + throw new HttpException("Unknown scheme (not http/https) for url:" + url); + } + + if (Http.LOG.isTraceEnabled()) { + Http.LOG.trace("fetching " + url); + } + + String path = "".equals(url.getFile()) ? "/" : url.getFile(); + + // some servers will redirect a request with a host line like + // "Host: <hostname>:80" to "http://<hpstname>/<orig_path>"- they + // don't want the :80... + + String host = url.getHost(); + int port; + String portString; + if (url.getPort() == -1) { + if (scheme == Scheme.HTTP) { + port = 80; + } else { + port = 443; + } + portString = ""; + } else { + port = url.getPort(); + portString = ":" + port; + } + Socket socket = null; + + try { + socket = new Socket(); // create the socket + socket.setSoTimeout(http.getTimeout()); + + // connect + String sockHost = http.useProxy(url) ? http.getProxyHost() : host; + int sockPort = http.useProxy(url) ? 
http.getProxyPort() : port; + InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort); + socket.connect(sockAddr, http.getTimeout()); + + if (scheme == Scheme.HTTPS) { + SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory + .getDefault(); + SSLSocket sslsocket = (SSLSocket) factory + .createSocket(socket, sockHost, sockPort, true); + sslsocket.setUseClientMode(true); + + // Get the protocols and ciphers supported by this JVM + Set<String> protocols = new HashSet<String>( + Arrays.asList(sslsocket.getSupportedProtocols())); + Set<String> ciphers = new HashSet<String>( + Arrays.asList(sslsocket.getSupportedCipherSuites())); + + // Intersect with preferred protocols and ciphers + protocols.retainAll(http.getTlsPreferredProtocols()); + ciphers.retainAll(http.getTlsPreferredCipherSuites()); + + sslsocket.setEnabledProtocols( + protocols.toArray(new String[protocols.size()])); + sslsocket.setEnabledCipherSuites( + ciphers.toArray(new String[ciphers.size()])); + + sslsocket.startHandshake(); + socket = sslsocket; + } + + this.conf = http.getConf(); + if (sockAddr != null + && conf.getBoolean("store.ip.address", false) == true) { + headers.add("_ip_", sockAddr.getAddress().getHostAddress()); + } + + // make request + OutputStream req = socket.getOutputStream(); + + StringBuffer reqStr = new StringBuffer("GET "); + if (http.useProxy(url)) { + reqStr.append(url.getProtocol() + "://" + host + portString + path); + } else { + reqStr.append(path); + } + + reqStr.append(" HTTP/1.0\r\n"); + + reqStr.append("Host: "); + reqStr.append(host); + reqStr.append(portString); + reqStr.append("\r\n"); + + reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n"); + + String userAgent = http.getUserAgent(); + if ((userAgent == null) || (userAgent.length() == 0)) { + if (Http.LOG.isErrorEnabled()) { + Http.LOG.error("User-agent is not set!"); + } + } else { + reqStr.append("User-Agent: "); + reqStr.append(userAgent); + reqStr.append("\r\n"); + } + + 
reqStr.append("Accept-Language: "); + reqStr.append(this.http.getAcceptLanguage()); + reqStr.append("\r\n"); + + reqStr.append("Accept: "); + reqStr.append(this.http.getAccept()); + reqStr.append("\r\n"); + + if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) { + reqStr.append("If-Modified-Since: " + HttpDateFormat + .toString(datum.getModifiedTime())); + reqStr.append("\r\n"); + } + reqStr.append("\r\n"); + + // store the request in the metadata? + if (conf.getBoolean("store.http.request", false) == true) { + headers.add("_request_", reqStr.toString()); + } + + byte[] reqBytes = reqStr.toString().getBytes(); + + req.write(reqBytes); + req.flush(); + + PushbackInputStream in = // process response + new PushbackInputStream( + new BufferedInputStream(socket.getInputStream(), + Http.BUFFER_SIZE), Http.BUFFER_SIZE); + + StringBuffer line = new StringBuffer(); + + // store the http headers verbatim + if (conf.getBoolean("store.http.headers", false) == true) { + httpHeaders = new StringBuffer(); + } + + headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis())); + + boolean haveSeenNonContinueStatus = false; + while (!haveSeenNonContinueStatus) { + // parse status code line + this.code = parseStatusLine(in, line); + if (httpHeaders != null) + httpHeaders.append(line).append("\n"); + // parse headers + parseHeaders(in, line, httpHeaders); + haveSeenNonContinueStatus = code != 100; // 100 is "Continue" + } + + // Get Content type header + String contentType = getHeader(Response.CONTENT_TYPE); + + // handle with HtmlUnit only if content type in HTML or XHTML + if (contentType != null) { + if (contentType.contains("text/html") || contentType.contains("application/xhtml")) { + readContentFromHtmlUnit(url); + } else { + String transferEncoding = getHeader(Response.TRANSFER_ENCODING); + if (transferEncoding != null && "chunked" + .equalsIgnoreCase(transferEncoding.trim())) { + readChunkedContent(in, line); + } else { + readPlainContent(in); + } 
+ + String contentEncoding = getHeader(Response.CONTENT_ENCODING); + if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) { + content = http.processGzipEncoded(content, url); + } else if ("deflate".equals(contentEncoding)) { + content = http.processDeflateEncoded(content, url); + } else { + // store the headers verbatim only if the response was not compressed + // as the content length reported with not match otherwise + if (httpHeaders != null) { + headers.add("_response.headers_", httpHeaders.toString()); + } + if (Http.LOG.isTraceEnabled()) { + Http.LOG.trace("fetched " + content.length + " bytes from " + url); + } + } + } + } + + } finally { + if (socket != null) + socket.close(); + } + + } + + /* + * ------------------------- * <implementation:Response> * + * ------------------------- + */ + + public URL getUrl() { + return url; + } + + public int getCode() { + return code; + } + + public String getHeader(String name) { + return headers.get(name); + } + + public Metadata getHeaders() { + return headers; + } + + public byte[] getContent() { + return content; + } + + /* + * ------------------------- * <implementation:Response> * + * ------------------------- + */ + + private void readContentFromHtmlUnit(URL url) throws IOException { + String page = HtmlUnitWebDriver.getHtmlPage(url.toString(), conf); + content = page.getBytes("UTF-8"); + } + + private void readPlainContent(InputStream in) + throws HttpException, IOException { + + int contentLength = Integer.MAX_VALUE; // get content length + String contentLengthString = headers.get(Response.CONTENT_LENGTH); + if (contentLengthString != null) { + contentLengthString = contentLengthString.trim(); + try { + if (!contentLengthString.isEmpty()) + contentLength = Integer.parseInt(contentLengthString); + } catch (NumberFormatException e) { + throw new HttpException("bad content length: " + contentLengthString); + } + } + if (http.getMaxContent() >= 0 && contentLength > http + .getMaxContent()) // 
limit + // download + // size + contentLength = http.getMaxContent(); + + ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE); + byte[] bytes = new byte[Http.BUFFER_SIZE]; + int length = 0; + + // do not try to read if the contentLength is 0 + if (contentLength == 0) { + content = new byte[0]; + return; + } + + // read content + int i = in.read(bytes); + while (i != -1) { + out.write(bytes, 0, i); + length += i; + if (length >= contentLength) { + break; + } + if ((length + Http.BUFFER_SIZE) > contentLength) { + // reading next chunk may hit contentLength, + // must limit number of bytes read + i = in.read(bytes, 0, (contentLength - length)); + } else { + i = in.read(bytes); + } + } + content = out.toByteArray(); + } + + /** + * @param in + * @param line + * @throws HttpException + * @throws IOException + */ + private void readChunkedContent(PushbackInputStream in, StringBuffer line) + throws HttpException, IOException { + boolean doneChunks = false; + int contentBytesRead = 0; + byte[] bytes = new byte[Http.BUFFER_SIZE]; + ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE); + + while (!doneChunks) { + if (Http.LOG.isTraceEnabled()) { + Http.LOG.trace("Http: starting chunk"); + } + + readLine(in, line, false); + + String chunkLenStr; + // if (LOG.isTraceEnabled()) { LOG.trace("chunk-header: '" + line + "'"); + // } + + int pos = line.indexOf(";"); + if (pos < 0) { + chunkLenStr = line.toString(); + } else { + chunkLenStr = line.substring(0, pos); + // if (LOG.isTraceEnabled()) { LOG.trace("got chunk-ext: " + + // line.substring(pos+1)); } + } + chunkLenStr = chunkLenStr.trim(); + int chunkLen; + try { + chunkLen = Integer.parseInt(chunkLenStr, 16); + } catch (NumberFormatException e) { + throw new HttpException("bad chunk length: " + line.toString()); + } + + if (chunkLen == 0) { + doneChunks = true; + break; + } + + if (http.getMaxContent() >= 0 && (contentBytesRead + chunkLen) > http + .getMaxContent()) + chunkLen = 
http.getMaxContent() - contentBytesRead; + + // read one chunk + int chunkBytesRead = 0; + while (chunkBytesRead < chunkLen) { + + int toRead = (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ? + (chunkLen - chunkBytesRead) : + Http.BUFFER_SIZE; + int len = in.read(bytes, 0, toRead); + + if (len == -1) + throw new HttpException("chunk eof after " + contentBytesRead + + " bytes in successful chunks" + " and " + chunkBytesRead + + " in current chunk"); + + // DANGER!!! Will printed GZIPed stuff right to your + // terminal! + // if (LOG.isTraceEnabled()) { LOG.trace("read: " + new String(bytes, 0, + // len)); } + + out.write(bytes, 0, len); + chunkBytesRead += len; + } + + readLine(in, line, false); + + } + + if (!doneChunks) { + if (contentBytesRead != http.getMaxContent()) + throw new HttpException("chunk eof: !doneChunk && didn't max out"); + return; + } + + content = out.toByteArray(); + parseHeaders(in, line, null); + + } + + private int parseStatusLine(PushbackInputStream in, StringBuffer line) + throws IOException, HttpException { + readLine(in, line, false); + + int codeStart = line.indexOf(" "); + int codeEnd = line.indexOf(" ", codeStart + 1); + + // handle lines with no plaintext result code, ie: + // "HTTP/1.1 200" vs "HTTP/1.1 200 OK" + if (codeEnd == -1) + codeEnd = line.length(); + + int code; + try { + code = Integer.parseInt(line.substring(codeStart + 1, codeEnd)); + } catch (NumberFormatException e) { + throw new HttpException( + "bad status line '" + line + "': " + e.getMessage(), e); + } + + return code; + } + + private void processHeaderLine(StringBuffer line) + throws IOException, HttpException { + + int colonIndex = line.indexOf(":"); // key is up to colon + if (colonIndex == -1) { + int i; + for (i = 0; i < line.length(); i++) + if (!Character.isWhitespace(line.charAt(i))) + break; + if (i == line.length()) + return; + throw new HttpException("No colon in header:" + line); + } + String key = line.substring(0, colonIndex); + + int valueStart = 
colonIndex + 1; // skip whitespace + while (valueStart < line.length()) { + int c = line.charAt(valueStart); + if (c != ' ' && c != '\t') + break; + valueStart++; + } + String value = line.substring(valueStart); + headers.set(key, value); + } + + // Adds headers to our headers Metadata + private void parseHeaders(PushbackInputStream in, StringBuffer line, + StringBuffer httpHeaders) throws IOException, HttpException { + + while (readLine(in, line, true) != 0) { + + if (httpHeaders != null) + httpHeaders.append(line).append("\n"); + + // handle HTTP responses with missing blank line after headers + int pos; + if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ( + (pos = line.indexOf("<HTML")) != -1) || ((pos = line.indexOf("<html")) + != -1)) { + + in.unread(line.substring(pos).getBytes("UTF-8")); + line.setLength(pos); + + try { + // TODO: (CM) We don't know the header names here + // since we're just handling them generically. It would + // be nice to provide some sort of mapping function here + // for the returned header names to the standard metadata + // names in the ParseData class + processHeaderLine(line); + } catch (Exception e) { + // fixme: + Http.LOG.warn("Error: ", e); + } + return; + } + + processHeaderLine(line); + } + } + + private static int readLine(PushbackInputStream in, StringBuffer line, + boolean allowContinuedLine) throws IOException { + line.setLength(0); + for (int c = in.read(); c != -1; c = in.read()) { + switch (c) { + case '\r': + if (peek(in) == '\n') { + in.read(); + } + case '\n': + if (line.length() > 0) { + // at EOL -- check for continued line if the current + // (possibly continued) line wasn't blank + if (allowContinuedLine) + switch (peek(in)) { + case ' ': + case '\t': // line is continued + in.read(); + continue; + } + } + return line.length(); // else complete + default: + line.append((char) c); + } + } + throw new EOFException(); + } + + private static int peek(PushbackInputStream in) throws IOException { + int value = 
in.read(); + in.unread(value); + return value; + } + +}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/package.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/package.html b/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/package.html new file mode 100644 index 0000000..4181951 --- /dev/null +++ b/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/package.html @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +<html> +<body> +<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-http/build.xml b/nutch-plugins/protocol-http/build.xml new file mode 100755 index 0000000..30720f1 --- /dev/null +++ b/nutch-plugins/protocol-http/build.xml @@ -0,0 +1,50 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="protocol-http" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Build compilation dependencies --> + <target name="deps-jar"> + <ant target="jar" inheritall="false" dir="../lib-http"/> + </target> + + <!-- Add compilation dependencies to classpath --> + <path id="plugin.deps"> + <fileset dir="${nutch.root}/build"> + <include name="**/lib-http/*.jar" /> + </fileset> + <pathelement location="${build.dir}/test/conf"/> + </path> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../lib-http"/> + <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> + <copy toDir="${build.test}"> + <fileset dir="${src.test}" excludes="**/*.java"/> + </copy> + </target> + + <!-- for junit test --> + <mkdir dir="${build.test}/data" /> + <copy todir="${build.test}/data"> + <fileset dir="jsp"/> + </copy> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-http/ivy.xml b/nutch-plugins/protocol-http/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/protocol-http/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/jsp/basic-http.jsp ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-http/jsp/basic-http.jsp b/nutch-plugins/protocol-http/jsp/basic-http.jsp new file mode 100644 index 0000000..bf1f8bd --- /dev/null +++ b/nutch-plugins/protocol-http/jsp/basic-http.jsp @@ -0,0 +1,44 @@ +<%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--%><%-- + Example JSP Page to Test Protocol-Http Plugin +--%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><% +String path = request.getContextPath(); +String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/"; +%> + +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> +<html> + <head> + <base href="<%=basePath%>"> + + <title>HelloWorld</title> + <meta http-equiv="content-type" content="text/html;charset=utf-8" /> + <meta name="Language" content="en" /> + <meta http-equiv="pragma" content="no-cache"> + <meta http-equiv="cache-control" content="no-cache"> + <meta http-equiv="expires" content="0"> + <meta http-equiv="keywords" content="keyword1,keyword2,keyword3"> + <meta http-equiv="description" content="This is my page"> + <!-- + <link rel="stylesheet" type="text/css" href="styles.css"> + --> + </head> + + <body> + Hello World!!! <br> + </body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/jsp/brokenpage.jsp ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-http/jsp/brokenpage.jsp b/nutch-plugins/protocol-http/jsp/brokenpage.jsp new file mode 100644 index 0000000..f3f7c4a --- /dev/null +++ b/nutch-plugins/protocol-http/jsp/brokenpage.jsp @@ -0,0 +1,47 @@ +<%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--%><%-- + Example JSP Page to Test Protocol-Http Plugin +--%> + +@ page language="java" import="java.util.*" pageEncoding="UTF-8" + +String path = request.getContextPath(); +String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/"; + + +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> +<html> + <head> + <base href="<%=basePath%>"> + + <title>HelloWorld</title> + <meta http-equiv="content-type" content="text/html;charset=utf-8" /> + <meta name="Language" content="en" /> + <meta http-equiv="pragma" content="no-cache"> + <meta http-equiv="cache-control" content="no-cache"> + <meta http-equiv="expires" content="0"> + <meta http-equiv="keywords" content="keyword1,keyword2,keyword3"> + <meta http-equiv="description" content="This is my page"> + <!-- + <link rel="stylesheet" type="text/css" href="styles.css"> + --> + </head> + + <body> + Hello World!!! 
<br> + </body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/jsp/redirect301.jsp ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-http/jsp/redirect301.jsp b/nutch-plugins/protocol-http/jsp/redirect301.jsp new file mode 100644 index 0000000..1100b89 --- /dev/null +++ b/nutch-plugins/protocol-http/jsp/redirect301.jsp @@ -0,0 +1,49 @@ +<%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--%><%-- + Example JSP Page to Test Protocol-Http Plugin +--%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><% +String path = request.getContextPath(); +String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/"; +%> + +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> +<html> + <head> + <base href="<%=basePath%>"> + + <title>My JSP page</title> + + <meta http-equiv="pragma" content="no-cache"> + <meta http-equiv="cache-control" content="no-cache"> + <meta http-equiv="expires" content="0"> + <meta http-equiv="keywords" content="keyword1,keyword2,keyword3"> + <meta http-equiv="description" content="This is my page"> + <!-- + <link rel="stylesheet" type="text/css" href="styles.css"> + --> + + </head> + + <body> + <% + response.setStatus(301); + response.setHeader( "Location", "http://nutch.apache.org"); + response.setHeader( "Connection", "close" ); + %> + You are redirected by JSP<br> + </body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/jsp/redirect302.jsp ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-http/jsp/redirect302.jsp b/nutch-plugins/protocol-http/jsp/redirect302.jsp new file mode 100644 index 0000000..8a250d9 --- /dev/null +++ b/nutch-plugins/protocol-http/jsp/redirect302.jsp @@ -0,0 +1,49 @@ +<%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--%><%-- + Example JSP Page to Test Protocol-Http Plugin +--%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><% +String path = request.getContextPath(); +String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/"; +%> + +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> +<html> + <head> + <base href="<%=basePath%>"> + + <title>My JSP page</title> + + <meta http-equiv="pragma" content="no-cache"> + <meta http-equiv="cache-control" content="no-cache"> + <meta http-equiv="expires" content="0"> + <meta http-equiv="keywords" content="keyword1,keyword2,keyword3"> + <meta http-equiv="description" content="This is my page"> + <!-- + <link rel="stylesheet" type="text/css" href="styles.css"> + --> + + </head> + + <body> + <% + response.setStatus(302); + response.setHeader( "Location", "http://nutch.apache.org"); + response.setHeader( "Connection", "close" ); + %> + You are sucessfully redirected by JSP<br> + </body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-http/plugin.xml b/nutch-plugins/protocol-http/plugin.xml new file mode 100755 index 0000000..8770b10 --- /dev/null +++ b/nutch-plugins/protocol-http/plugin.xml @@ -0,0 +1,51 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="protocol-http" + name="Http Protocol Plug-in" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="protocol-http.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + <import plugin="lib-http"/> + </requires> + + <extension id="org.apache.nutch.protocol.http" + name="HttpProtocol" + point="org.apache.nutch.protocol.Protocol"> + + <implementation id="org.apache.nutch.protocol.http.Http" + class="org.apache.nutch.protocol.http.Http"> + <parameter name="protocolName" value="http"/> + </implementation> + + <implementation id="org.apache.nutch.protocol.http.Http" + class="org.apache.nutch.protocol.http.Http"> + <parameter name="protocolName" value="https"/> + </implementation> + + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-http/pom.xml b/nutch-plugins/protocol-http/pom.xml new file mode 100644 index 0000000..e7ade28 --- /dev/null +++ b/nutch-plugins/protocol-http/pom.xml @@ -0,0 +1,57 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. 
See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>protocol-http</artifactId> + <packaging>jar</packaging> + + <name>protocol-http</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + <dependencies> + <dependency> + <groupId>org.apache.nutch</groupId> + <artifactId>lib-http</artifactId> + <version>${project.parent.version}</version> + </dependency> + <dependency> + <groupId> org.mortbay.jetty</groupId> + <artifactId>jetty</artifactId> + <version>6.1.26</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId> org.mortbay.jetty</groupId> + <artifactId>jsp-2.1</artifactId> + <version>6.1.14</version> + <scope>test</scope> + </dependency> + </dependencies> + +</project> 
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/Http.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/Http.java b/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/Http.java new file mode 100755 index 0000000..56f9f4f --- /dev/null +++ b/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/Http.java @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.nutch.protocol.http;
+
+// JDK imports
+import java.io.IOException;
+import java.net.URL;
+
+// SLF4J logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * Protocol plugin entry point for http/https. All protocol handling is
+ * inherited from {@link HttpBase}; this class only supplies the logger and
+ * creates the socket-based {@link HttpResponse} for each request.
+ */
+public class Http extends HttpBase {
+
+  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+
+  /**
+   * Creates the plugin and hands its logger to {@link HttpBase}.
+   */
+  public Http() {
+    super(LOG);
+  }
+
+  /**
+   * Set the {@link org.apache.hadoop.conf.Configuration} object. Delegates
+   * entirely to the superclass.
+   *
+   * @param conf
+   *          configuration passed on to {@link HttpBase}
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+  }
+
+  /**
+   * Command-line entry point: builds a plugin instance configured from
+   * {@link NutchConfiguration#create()} and passes it, together with the
+   * arguments, to the two-argument <code>main</code> (not defined in this
+   * class; presumably inherited from {@link HttpBase}).
+   *
+   * @param args
+   *          command-line arguments, forwarded unchanged
+   * @throws Exception
+   *           whatever the delegated <code>main</code> throws
+   */
+  public static void main(String[] args) throws Exception {
+    Http http = new Http();
+    http.setConf(NutchConfiguration.create());
+    main(http, args);
+  }
+
+  /**
+   * Fetches <code>url</code> by constructing a new {@link HttpResponse}.
+   *
+   * @param url
+   *          URL to fetch
+   * @param datum
+   *          crawl datum passed to the response
+   * @param redirect
+   *          not used by this implementation
+   * @return the response read from the server
+   * @throws ProtocolException
+   *           if the response cannot be obtained or parsed
+   * @throws IOException
+   *           on socket errors
+   */
+  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+      throws ProtocolException, IOException {
+    return new HttpResponse(this, url, datum);
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/HttpResponse.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/HttpResponse.java b/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/HttpResponse.java
new file mode 100644
index 0000000..f6d7e4d
--- /dev/null
+++ b/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/HttpResponse.java
@@ -0,0 +1,558 @@
+/**
+ * Licensed to
the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.http; + +import java.io.BufferedInputStream; +import java.io.ByteArrayOutputStream; +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.PushbackInputStream; +import java.net.InetSocketAddress; +import java.net.Socket; +import java.net.URL; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import javax.net.ssl.SSLSocket; +import javax.net.ssl.SSLSocketFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.SpellCheckedMetadata; +import org.apache.nutch.net.protocols.HttpDateFormat; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.protocol.http.api.HttpBase; +import org.apache.nutch.protocol.http.api.HttpException; + +/** + * An HTTP response. 
+ */
+public class HttpResponse implements Response {
+
+  private Configuration conf;
+  private HttpBase http;
+  private URL url;
+  private String orig;
+  private String base;
+  private byte[] content;
+  private int code;
+  private Metadata headers = new SpellCheckedMetadata();
+  // used for storing the http headers verbatim
+  private StringBuffer httpHeaders;
+
+  protected enum Scheme {
+    HTTP, HTTPS,
+  }
+
+  /**
+   * Opens a socket to the target host (or to the configured proxy), sends an
+   * HTTP/1.0 GET request for <code>url</code> and reads the status line,
+   * headers and body of the response. The socket is always closed before the
+   * constructor returns.
+   *
+   * @param http
+   *          plugin instance supplying timeout, proxy, TLS, user agent and
+   *          content limit settings
+   * @param url
+   *          URL to fetch; its protocol must be http or https
+   * @param datum
+   *          crawl datum; its modified time is used for the
+   *          If-Modified-Since header when that feature is enabled
+   * @throws ProtocolException
+   *           if the scheme is unsupported or the response is malformed
+   * @throws IOException
+   *           on socket errors or a premature end of the response
+   */
+  public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
+      throws ProtocolException, IOException {
+
+    this.http = http;
+    this.url = url;
+    this.orig = url.toString();
+    this.base = url.toString();
+
+    Scheme scheme = null;
+
+    if ("http".equals(url.getProtocol())) {
+      scheme = Scheme.HTTP;
+    } else if ("https".equals(url.getProtocol())) {
+      scheme = Scheme.HTTPS;
+    } else {
+      throw new HttpException("Unknown scheme (not http/https) for url:" + url);
+    }
+
+    if (Http.LOG.isTraceEnabled()) {
+      Http.LOG.trace("fetching " + url);
+    }
+
+    String path = "".equals(url.getFile()) ? "/" : url.getFile();
+
+    // some servers will redirect a request with a host line like
+    // "Host: <hostname>:80" to "http://<hostname>/<orig_path>"- they
+    // don't want the :80...
+
+    String host = url.getHost();
+    int port;
+    String portString;
+    if (url.getPort() == -1) {
+      if (scheme == Scheme.HTTP) {
+        port = 80;
+      } else {
+        port = 443;
+      }
+      portString = "";
+    } else {
+      port = url.getPort();
+      portString = ":" + port;
+    }
+    Socket socket = null;
+
+    try {
+      socket = new Socket(); // create the socket
+      socket.setSoTimeout(http.getTimeout());
+
+      // connect
+      String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
+      int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
+      InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
+      socket.connect(sockAddr, http.getTimeout());
+
+      if (scheme == Scheme.HTTPS) {
+        SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory
+            .getDefault();
+        SSLSocket sslsocket = (SSLSocket) factory
+            .createSocket(socket, sockHost, sockPort, true);
+        sslsocket.setUseClientMode(true);
+
+        // Get the protocols and ciphers supported by this JVM
+        Set<String> protocols = new HashSet<String>(
+            Arrays.asList(sslsocket.getSupportedProtocols()));
+        Set<String> ciphers = new HashSet<String>(
+            Arrays.asList(sslsocket.getSupportedCipherSuites()));
+
+        // Intersect with preferred protocols and ciphers
+        protocols.retainAll(http.getTlsPreferredProtocols());
+        ciphers.retainAll(http.getTlsPreferredCipherSuites());
+
+        sslsocket.setEnabledProtocols(
+            protocols.toArray(new String[protocols.size()]));
+        sslsocket.setEnabledCipherSuites(
+            ciphers.toArray(new String[ciphers.size()]));
+
+        sslsocket.startHandshake();
+        socket = sslsocket;
+      }
+
+      this.conf = http.getConf();
+      if (conf.getBoolean("store.ip.address", false)) {
+        headers.add("_ip_", sockAddr.getAddress().getHostAddress());
+      }
+
+      // make request
+      OutputStream req = socket.getOutputStream();
+
+      StringBuffer reqStr = new StringBuffer("GET ");
+      if (http.useProxy(url)) {
+        // a proxy needs the absolute URL in the request line
+        reqStr.append(url.getProtocol() + "://" + host + portString + path);
+      } else {
+        reqStr.append(path);
+      }
+
+      reqStr.append(" HTTP/1.0\r\n");
+
+      reqStr.append("Host: ");
+      reqStr.append(host);
+      reqStr.append(portString);
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
+
+      String userAgent = http.getUserAgent();
+      if ((userAgent == null) || (userAgent.length() == 0)) {
+        if (Http.LOG.isErrorEnabled()) {
+          Http.LOG.error("User-agent is not set!");
+        }
+      } else {
+        reqStr.append("User-Agent: ");
+        reqStr.append(userAgent);
+        reqStr.append("\r\n");
+      }
+
+      reqStr.append("Accept-Language: ");
+      reqStr.append(this.http.getAcceptLanguage());
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept: ");
+      reqStr.append(this.http.getAccept());
+      reqStr.append("\r\n");
+
+      if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
+        reqStr.append("If-Modified-Since: " + HttpDateFormat
+            .toString(datum.getModifiedTime()));
+        reqStr.append("\r\n");
+      }
+      reqStr.append("\r\n");
+
+      // store the request in the metadata?
+      if (conf.getBoolean("store.http.request", false)) {
+        headers.add("_request_", reqStr.toString());
+      }
+
+      byte[] reqBytes = reqStr.toString().getBytes();
+
+      req.write(reqBytes);
+      req.flush();
+
+      PushbackInputStream in = // process response
+          new PushbackInputStream(
+              new BufferedInputStream(socket.getInputStream(),
+                  Http.BUFFER_SIZE), Http.BUFFER_SIZE);
+
+      StringBuffer line = new StringBuffer();
+
+      // store the http headers verbatim
+      if (conf.getBoolean("store.http.headers", false)) {
+        httpHeaders = new StringBuffer();
+      }
+
+      headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis()));
+
+      boolean haveSeenNonContinueStatus = false;
+      while (!haveSeenNonContinueStatus) {
+        // parse status code line
+        this.code = parseStatusLine(in, line);
+        if (httpHeaders != null)
+          httpHeaders.append(line).append("\n");
+        // parse headers
+        parseHeaders(in, line, httpHeaders);
+        haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
+      }
+
+      String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
+      if (transferEncoding != null && "chunked"
+          .equalsIgnoreCase(transferEncoding.trim())) {
+        readChunkedContent(in, line);
+      } else {
+        readPlainContent(in);
+      }
+
+      String contentEncoding = getHeader(Response.CONTENT_ENCODING);
+      if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
+        content = http.processGzipEncoded(content, url);
+      } else if ("deflate".equals(contentEncoding)) {
+        content = http.processDeflateEncoded(content, url);
+      } else {
+        // store the headers verbatim only if the response was not compressed
+        // as the content length reported will not match otherwise
+        if (httpHeaders != null) {
+          headers.add("_response.headers_", httpHeaders.toString());
+        }
+        if (Http.LOG.isTraceEnabled()) {
+          Http.LOG.trace("fetched " + content.length + " bytes from " + url);
+        }
+      }
+
+    } finally {
+      if (socket != null)
+        socket.close();
+    }
+
+  }
+
+  /*
+   * ------------------------- * <implementation:Response> *
+   * -------------------------
+   */
+
+  public URL getUrl() {
+    return url;
+  }
+
+  public int getCode() {
+    return code;
+  }
+
+  public String getHeader(String name) {
+    return headers.get(name);
+  }
+
+  public Metadata getHeaders() {
+    return headers;
+  }
+
+  public byte[] getContent() {
+    return content;
+  }
+
+  /*
+   * ------------------------- * </implementation:Response> *
+   * -------------------------
+   */
+
+  /**
+   * Reads a non-chunked body into {@link #content}. Reading stops at end of
+   * stream, after Content-Length bytes (when the header is present and not
+   * empty), or after the configured maximum content size, whichever comes
+   * first.
+   *
+   * @param in
+   *          stream positioned at the first body byte
+   * @throws HttpException
+   *           if the Content-Length header is not a valid integer
+   * @throws IOException
+   *           on read errors
+   */
+  private void readPlainContent(InputStream in)
+      throws HttpException, IOException {
+
+    int contentLength = Integer.MAX_VALUE; // get content length
+    String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+    if (contentLengthString != null) {
+      contentLengthString = contentLengthString.trim();
+      try {
+        if (!contentLengthString.isEmpty())
+          contentLength = Integer.parseInt(contentLengthString);
+      } catch (NumberFormatException e) {
+        throw new HttpException("bad content length: " + contentLengthString);
+      }
+    }
+    // limit download size
+    if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent())
+      contentLength = http.getMaxContent();
+
+    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
+    byte[] bytes = new byte[Http.BUFFER_SIZE];
+    int length = 0;
+
+    // do not try to read if the contentLength is 0
+    if (contentLength == 0) {
+      content = new byte[0];
+      return;
+    }
+
+    // read content
+    int i = in.read(bytes);
+    while (i != -1) {
+      out.write(bytes, 0, i);
+      length += i;
+      if (length >= contentLength) {
+        break;
+      }
+      if ((length + Http.BUFFER_SIZE) > contentLength) {
+        // reading next chunk may hit contentLength,
+        // must limit number of bytes read
+        i = in.read(bytes, 0, (contentLength - length));
+      } else {
+        i = in.read(bytes);
+      }
+    }
+    content = out.toByteArray();
+  }
+
+  /**
+   * Reads a body sent with chunked transfer encoding into {@link #content}.
+   * Chunk extensions (anything after ';' in the chunk-size line) are ignored.
+   * When the configured maximum content size is reached the body is truncated
+   * at that size and the remaining chunks and trailing headers are not read.
+   *
+   * @param in
+   *          stream positioned at the first chunk-size line
+   * @param line
+   *          scratch buffer reused for reading lines
+   * @throws HttpException
+   *           if a chunk size is not a valid non-negative hexadecimal number
+   *           or the stream ends in the middle of a chunk
+   * @throws IOException
+   *           on read errors
+   */
+  private void readChunkedContent(PushbackInputStream in, StringBuffer line)
+      throws HttpException, IOException {
+    boolean doneChunks = false;
+    int contentBytesRead = 0;
+    byte[] bytes = new byte[Http.BUFFER_SIZE];
+    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
+    final int maxContent = http.getMaxContent();
+
+    while (!doneChunks) {
+      if (Http.LOG.isTraceEnabled()) {
+        Http.LOG.trace("Http: starting chunk");
+      }
+
+      readLine(in, line, false);
+
+      // the chunk size is everything before an optional ";chunk-extension"
+      String chunkLenStr;
+      int pos = line.indexOf(";");
+      if (pos < 0) {
+        chunkLenStr = line.toString();
+      } else {
+        chunkLenStr = line.substring(0, pos);
+      }
+      chunkLenStr = chunkLenStr.trim();
+      int chunkLen;
+      try {
+        chunkLen = Integer.parseInt(chunkLenStr, 16);
+      } catch (NumberFormatException e) {
+        throw new HttpException("bad chunk length: " + line.toString());
+      }
+      if (chunkLen < 0) {
+        // parseInt accepts a leading '-', a chunk size must not be negative
+        throw new HttpException("bad chunk length: " + line.toString());
+      }
+
+      if (chunkLen == 0) {
+        doneChunks = true;
+        break;
+      }
+
+      // trim the chunk to the remaining budget; written as a subtraction so
+      // that a huge chunk size cannot overflow the comparison
+      if (maxContent >= 0 && chunkLen > maxContent - contentBytesRead)
+        chunkLen = maxContent - contentBytesRead;
+
+      // read one chunk
+      int chunkBytesRead = 0;
+      while (chunkBytesRead < chunkLen) {
+
+        int toRead = Math.min(chunkLen - chunkBytesRead, Http.BUFFER_SIZE);
+        int len = in.read(bytes, 0, toRead);
+
+        if (len == -1)
+          throw new HttpException("chunk eof after " + contentBytesRead
+              + " bytes in successful chunks" + " and " + chunkBytesRead
+              + " in current chunk");
+
+        // DANGER!!! Tracing the bytes here would print GZIPed stuff right
+        // to your terminal!
+
+        out.write(bytes, 0, len);
+        chunkBytesRead += len;
+      }
+      contentBytesRead += chunkBytesRead;
+
+      if (maxContent >= 0 && contentBytesRead >= maxContent) {
+        // content limit reached: stop here, the stream is positioned inside
+        // or right after a chunk and must not be parsed any further
+        break;
+      }
+
+      // consume the CRLF that terminates the chunk data
+      readLine(in, line, false);
+
+    }
+
+    content = out.toByteArray();
+
+    if (!doneChunks) {
+      // body was truncated at the content limit, no trailers to read
+      return;
+    }
+
+    // read trailing headers after the last chunk
+    parseHeaders(in, line, null);
+
+  }
+
+  /**
+   * Reads the status line and extracts the numeric status code, i.e. the
+   * token between the first and the second space (or the end of the line).
+   *
+   * @param in
+   *          stream positioned at the status line
+   * @param line
+   *          buffer that receives the status line
+   * @return the status code
+   * @throws IOException
+   *           on read errors or end of stream
+   * @throws HttpException
+   *           if the status code is not a number
+   */
+  private int parseStatusLine(PushbackInputStream in, StringBuffer line)
+      throws IOException, HttpException {
+    readLine(in, line, false);
+
+    int codeStart = line.indexOf(" ");
+    int codeEnd = line.indexOf(" ", codeStart + 1);
+
+    // handle lines with no plaintext result code, ie:
+    // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
+    if (codeEnd == -1)
+      codeEnd = line.length();
+
+    int code;
+    try {
+      code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
+    } catch (NumberFormatException e) {
+      throw new HttpException(
+          "bad status line '" + line + "': " + e.getMessage(), e);
+    }
+
+    return code;
+  }
+
+  /**
+   * Splits one header line at the first colon and stores it in
+   * {@link #headers}, replacing any previous value for the same name. Lines
+   * consisting only of whitespace are ignored.
+   *
+   * @param line
+   *          header line without line terminator
+   * @throws HttpException
+   *           if a non-blank line contains no colon
+   */
+  private void processHeaderLine(StringBuffer line)
+      throws IOException, HttpException {
+
+    int colonIndex = line.indexOf(":"); // key is up to colon
+    if (colonIndex == -1) {
+      int i;
+      for (i = 0; i < line.length(); i++)
+        if (!Character.isWhitespace(line.charAt(i)))
+          break;
+      if (i == line.length())
+        return;
+      throw new HttpException("No colon in header:" + line);
+    }
+    String key = line.substring(0, colonIndex);
+
+    int valueStart = colonIndex + 1; // skip whitespace
+    while (valueStart < line.length()) {
+      int c = line.charAt(valueStart);
+      if (c != ' ' && c != '\t')
+        break;
+      valueStart++;
+    }
+    String value = line.substring(valueStart);
+    headers.set(key, value);
+  }
+
+  /**
+   * Reads header lines up to the blank line that ends the header section and
+   * adds them to {@link #headers}. If a line contains the start of an HTML
+   * document (the server omitted the blank line), the HTML part is pushed
+   * back onto the stream and header parsing stops.
+   *
+   * @param in
+   *          stream positioned at the first header line
+   * @param line
+   *          scratch buffer reused for reading lines
+   * @param httpHeaders
+   *          if not null, every header line is appended verbatim
+   */
+  private void parseHeaders(PushbackInputStream in, StringBuffer line,
+      StringBuffer httpHeaders) throws IOException, HttpException {
+
+    while (readLine(in, line, true) != 0) {
+
+      if (httpHeaders != null)
+        httpHeaders.append(line).append("\n");
+
+      // handle HTTP responses with missing blank line after headers
+      int pos;
+      if (((pos = line.indexOf("<!DOCTYPE")) != -1) || (
+          (pos = line.indexOf("<HTML")) != -1) || ((pos = line.indexOf("<html"))
+          != -1)) {
+
+        in.unread(line.substring(pos).getBytes("UTF-8"));
+        line.setLength(pos);
+
+        try {
+          // TODO: (CM) We don't know the header names here
+          // since we're just handling them generically. It would
+          // be nice to provide some sort of mapping function here
+          // for the returned header names to the standard metadata
+          // names in the ParseData class
+          processHeaderLine(line);
+        } catch (Exception e) {
+          // fixme:
+          Http.LOG.warn("Error: ", e);
+        }
+        return;
+      }
+
+      processHeaderLine(line);
+    }
+  }
+
+  /**
+   * Reads one line terminated by CR, LF or CRLF into <code>line</code>
+   * (terminator excluded).
+   *
+   * @param in
+   *          stream to read from
+   * @param line
+   *          buffer that is cleared and then receives the line
+   * @param allowContinuedLine
+   *          if true, a non-empty line followed by a line starting with a
+   *          space or tab is treated as continued and both are joined
+   * @return length of the line read
+   * @throws EOFException
+   *           if the stream ends before a line terminator is seen
+   */
+  private static int readLine(PushbackInputStream in, StringBuffer line,
+      boolean allowContinuedLine) throws IOException {
+    line.setLength(0);
+    for (int c = in.read(); c != -1; c = in.read()) {
+      switch (c) {
+      case '\r':
+        if (peek(in) == '\n') {
+          in.read();
+        }
+        // fall through: CR alone and CRLF both end the line
+      case '\n':
+        if (line.length() > 0) {
+          // at EOL -- check for continued line if the current
+          // (possibly continued) line wasn't blank
+          if (allowContinuedLine)
+            switch (peek(in)) {
+            case ' ':
+            case '\t': // line is continued
+              in.read();
+              continue;
+            }
+        }
+        return line.length(); // else complete
+      default:
+        line.append((char) c);
+      }
+    }
+    throw new EOFException();
+  }
+
+  /**
+   * Returns the next byte of the stream without consuming it, or -1 at end
+   * of stream.
+   */
+  private static int peek(PushbackInputStream in) throws IOException {
+    int value = in.read();
+    if (value != -1) {
+      // never push back the EOF marker: unread(-1) would insert a 0xFF byte
+      in.unread(value);
+    }
+    return value;
+  }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/package.html
b/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/package.html new file mode 100644 index 0000000..34d1d1c --- /dev/null +++ b/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/package.html @@ -0,0 +1,5 @@ +<html> +<body> +<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/src/test/conf/nutch-site-test.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-http/src/test/conf/nutch-site-test.xml b/nutch-plugins/protocol-http/src/test/conf/nutch-site-test.xml new file mode 100644 index 0000000..a9afd78 --- /dev/null +++ b/nutch-plugins/protocol-http/src/test/conf/nutch-site-test.xml @@ -0,0 +1,52 @@ +<?xml version="1.0"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<configuration> + +<property> + <name>http.robots.agents</name> + <value>Nutch-Test,*</value> + <description></description> +</property> + +<property> + <name>http.agent.name</name> + <value>Nutch-Test</value> + <description></description> +</property> + +<property> + <name>http.agent.description</name> + <value>Nutch protocol-httpclient test</value> + <description></description> +</property> + +<property> + <name>http.auth.file</name> + <value>httpclient-auth-test.xml</value> + <description></description> +</property> + +<property> + <name>http.timeout</name> + <value>60000</value> + <description></description> +</property> + +</configuration> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/src/test/java/org/apache/nutch/protocol/http/TestProtocolHttp.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-http/src/test/java/org/apache/nutch/protocol/http/TestProtocolHttp.java b/nutch-plugins/protocol-http/src/test/java/org/apache/nutch/protocol/http/TestProtocolHttp.java new file mode 100644 index 0000000..7dd9e9b --- /dev/null +++ b/nutch-plugins/protocol-http/src/test/java/org/apache/nutch/protocol/http/TestProtocolHttp.java @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.http;
+
+import static org.junit.Assert.assertEquals;
+
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.junit.After;
+import org.junit.Test;
+import org.mortbay.jetty.Server;
+import org.mortbay.jetty.nio.SelectChannelConnector;
+import org.mortbay.jetty.servlet.Context;
+import org.mortbay.jetty.servlet.ServletHolder;
+
+/**
+ * Test cases for protocol-http
+ */
+public class TestProtocolHttp {
+  private static final String RES_DIR = System.getProperty("test.data", ".");
+
+  private Http http;
+  private Server server;
+  private Context root;
+  private Configuration conf;
+  private int port;
+
+  /**
+   * Creates the plugin under test and an unstarted Jetty server that serves
+   * the JSP pages found in {@link #RES_DIR}.
+   *
+   * @param redirection
+   *          if true the context is mounted at "/redirection" instead of "/"
+   */
+  public void setUp(boolean redirection) throws Exception {
+    conf = new Configuration();
+    conf.addResource("nutch-default.xml");
+    conf.addResource("nutch-site-test.xml");
+
+    http = new Http();
+    http.setConf(conf);
+
+    server = new Server();
+
+    if (redirection) {
+      root = new Context(server, "/redirection", Context.SESSIONS);
+      root.setAttribute("newContextURL", "/redirect");
+    } else {
+      root = new Context(server, "/", Context.SESSIONS);
+    }
+
+    ServletHolder sh = new ServletHolder(
+        org.apache.jasper.servlet.JspServlet.class);
+    root.addServlet(sh, "*.jsp");
+    root.setResourceBase(RES_DIR);
+  }
+
+  /**
+   * Stops the Jetty server, if one was created.
+   */
+  @After
+  public void tearDown() throws Exception {
+    // setUp may have failed before the server was created; do not add a
+    // NullPointerException on top of the original failure
+    if (server != null) {
+      server.stop();
+    }
+  }
+
+  @Test
+  public void testStatusCode() throws Exception {
+    startServer(47504, false);
+    fetchPage("/basic-http.jsp", 200);
+    fetchPage("/redirect301.jsp", 301);
+    fetchPage("/redirect302.jsp", 302);
+    fetchPage("/nonexists.html", 404);
+    fetchPage("/brokenpage.jsp", 500);
+  }
+
+  @Test
+  public void testRedirectionJetty() throws Exception {
+    // Redirection via Jetty
+    startServer(47503, true);
+    fetchPage("/redirection", 302);
+  }
+
+  /**
+   * Starts the Jetty server at a specified port and redirection parameter.
+   *
+   * @param portno
+   *          Port number.
+   * @param redirection
+   *          whether redirection
+   */
+  private void startServer(int portno, boolean redirection) throws Exception {
+    port = portno;
+    setUp(redirection);
+    SelectChannelConnector connector = new SelectChannelConnector();
+    connector.setHost("127.0.0.1");
+    connector.setPort(port);
+
+    server.addConnector(connector);
+    server.start();
+  }
+
+  /**
+   * Fetches the specified <code>page</code> from the local Jetty server and
+   * checks whether the HTTP response status code matches with the expected
+   * code. Also use jsp pages for redirection. The content type is checked
+   * only for pages that are expected to return an HTML document.
+   *
+   * @param page
+   *          Page to be fetched.
+   * @param expectedCode
+   *          HTTP response status code expected while fetching the page.
+   */
+  private void fetchPage(String page, int expectedCode) throws Exception {
+    URL url = new URL("http", "127.0.0.1", port, page);
+    CrawlDatum crawlDatum = new CrawlDatum();
+    Response response = http.getResponse(url, crawlDatum, true);
+    ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()),
+        crawlDatum);
+    Content content = out.getContent();
+    assertEquals("HTTP Status Code for " + url, expectedCode,
+        response.getCode());
+
+    if (!"/nonexists.html".equals(page)
+        && !"/brokenpage.jsp".equals(page)
+        && !"/redirection".equals(page)) {
+      assertEquals("ContentType " + url, "text/html",
+          content.getContentType());
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/build.xml b/nutch-plugins/protocol-httpclient/build.xml
new file mode 100644
index 0000000..b66eb97
--- /dev/null
+++
b/nutch-plugins/protocol-httpclient/build.xml @@ -0,0 +1,45 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="protocol-httpclient" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <target name="deps-jar"> + <ant target="jar" inheritall="false" dir="../lib-http"/> + </target> + + <path id="plugin.deps"> + <fileset dir="${nutch.root}/build"> + <include name="**/lib-http/*.jar" /> + </fileset> + <pathelement location="${build.dir}/test/conf"/> + </path> + + <target name="deps-test"> + <copy toDir="${build.test}"> + <fileset dir="${src.test}" excludes="**/*.java"/> + </copy> + </target> + + <!-- for junit test --> + <mkdir dir="${build.test}/data" /> + <copy todir="${build.test}/data"> + <fileset dir="jsp"/> + </copy> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-httpclient/ivy.xml b/nutch-plugins/protocol-httpclient/ivy.xml new file mode 100644 index 0000000..00b6f07 --- /dev/null +++ b/nutch-plugins/protocol-httpclient/ivy.xml @@ -0,0 +1,42 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software 
Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + <dependency org="org.jsoup" name="jsoup" rev="1.8.1" /> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/jsp/basic.jsp ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-httpclient/jsp/basic.jsp b/nutch-plugins/protocol-httpclient/jsp/basic.jsp new file mode 100644 index 0000000..c5bfb89 --- /dev/null +++ b/nutch-plugins/protocol-httpclient/jsp/basic.jsp @@ -0,0 +1,74 @@ +<%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--%><%-- + This JSP demonstrates basic authentication. When this JSP page is + requested with no query parameters, then the user must enter the + username as 'userx' and password as 'passx' when prompted for + authentication. Apart from this there are a few other test cases, + which can be used by passing a test case number as query parameter in + the following manner: basic.jsp?case=1, basic.jsp?case=2, etc. + The credentials for each test case can be easily figured out from the + code below. 
+ + Author: Susam Pal +--%><%@ page + import = "sun.misc.BASE64Decoder" +%><% + String authHeader = request.getHeader("Authorization"); + String realm = null; + String username = null; + String password = null; + int testCase = 0; + try { + testCase = Integer.parseInt(request.getParameter("case")); + } catch (Exception ex) { + // do nothing + } + switch (testCase) { + case 1: + realm = "realm1"; username = "user1"; password = "pass1"; + break; + + case 2: + realm = "realm2"; username = "user2"; password = "pass2"; + break; + + default: + realm = "realmx"; username = "userx"; password = "passx"; + break; + } + + boolean authenticated = false; + if (authHeader != null && authHeader.toUpperCase().startsWith("BASIC")) { + String creds[] = new String(new BASE64Decoder().decodeBuffer( + authHeader.substring(6))).split(":", 2); + if (creds[0].equals(username) && creds[1].equals(password)) + authenticated = true; + } + if (!authenticated) { + response.setHeader("WWW-Authenticate", "Basic realm=\"" + realm + "\""); + response.sendError(response.SC_UNAUTHORIZED); + } else { +%> +<html> +<head><title>Basic Authentication Test</title></head> +<body> +<p>Hi <%= username %>, you have been successfully authenticated.</p> +</body> +</html> +<% + } +%> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/jsp/cookies.jsp ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-httpclient/jsp/cookies.jsp b/nutch-plugins/protocol-httpclient/jsp/cookies.jsp new file mode 100644 index 0000000..ae2ace2 --- /dev/null +++ b/nutch-plugins/protocol-httpclient/jsp/cookies.jsp @@ -0,0 +1,63 @@ +<%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--%><%-- + This JSP tests whether the client can remember cookies. When the JSP + is fetched for the first time without any query parameters, it sets + a few cookies in the client. On a second request, with the query + parameter, 'cookie=yes', it checks whether the client has sent all + the cookies. If the cookies are found, HTTP 200 response is returned. + If the cookies are not found, HTTP 403 response is returned. + + Author: Susam Pal +--%><% + String cookieParam = request.getParameter("cookie"); + if (!"yes".equals(cookieParam)) { // Send cookies + response.addCookie(new Cookie("var1", "val1")); + response.addCookie(new Cookie("var2", "val2")); +%> +<html> +<head><title>Cookies Set</title></head> +<body><p>Cookies have been set.</p></body> +</html> +<% + } else { // Check cookies + int cookiesCount = 0; + + Cookie[] cookies = request.getCookies(); + if (cookies != null) { + for (int i = 0; i < cookies.length; i++) { + if (cookies[i].getName().equals("var1") + && cookies[i].getValue().equals("val1")) + cookiesCount++; + + if (cookies[i].getName().equals("var2") + && cookies[i].getValue().equals("val2")) + cookiesCount++; + } + } + + if (cookiesCount != 2) { + response.sendError(response.SC_FORBIDDEN); + } else { +%> +<html> +<head><title>Cookies Found</title></head> +<body><p>Cookies found!</p></body> +</html> +<% + } + } +%> \ No newline at end of file 
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/jsp/digest.jsp ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-httpclient/jsp/digest.jsp b/nutch-plugins/protocol-httpclient/jsp/digest.jsp new file mode 100644 index 0000000..c657484 --- /dev/null +++ b/nutch-plugins/protocol-httpclient/jsp/digest.jsp @@ -0,0 +1,68 @@ +<%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--%><%-- + This JSP tests digest authentication. It generates an HTTP response + with authorization header for digest authentication and checks the + user-name supplied by the client. It does not check the other + parameters and hashes as controlled JUnit tests would be performed + against this and only the proper submission of credentials need to + be tested. 
+ + Author: Susam Pal +--%><%@ page + import = "java.util.StringTokenizer" + import = "java.util.HashMap" +%><% + String username = "digest_user"; + String authHeader = request.getHeader("Authorization"); + + boolean authenticated = false; + if (authHeader != null && authHeader.toUpperCase().startsWith("DIGEST")) { + HashMap map = new HashMap(); + StringTokenizer tokenizer = new StringTokenizer( + authHeader.substring(7).trim(), ","); + while (tokenizer.hasMoreTokens()) { + String[] param = tokenizer.nextToken().trim().split("=", 2); + if (param[1].charAt(0) == '"') { + param[1] = param[1].substring(1, param[1].length() - 1); + } + map.put(param[0], param[1]); + } + + if (username.equals((String)map.get("username"))) + authenticated = true; + } + + if (!authenticated) { + String realm = "realm=\"realm1\""; + String qop = "qop=\"auth,auth-int\""; + String nonce = "nonce=\"dcd98b7102dd2f0e8b11d0f600bfb0c093\""; + String opaque = "opaque=\"5ccc069c403ebaf9f0171e9517f40e41\""; + + response.setHeader("WWW-Authenticate", "Digest " + realm + ", " + + qop + ", " + nonce + ", " + opaque); + response.sendError(response.SC_UNAUTHORIZED); + } else { +%> +<html> +<head><title>Digest Authentication Test</title></head> +<body> +<p>Hi <%= username %>, you have been successfully authenticated.</p> +</body> +</html> +<% + } +%> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/jsp/noauth.jsp ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-httpclient/jsp/noauth.jsp b/nutch-plugins/protocol-httpclient/jsp/noauth.jsp new file mode 100644 index 0000000..c726b0f --- /dev/null +++ b/nutch-plugins/protocol-httpclient/jsp/noauth.jsp @@ -0,0 +1,36 @@ +<%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--%><%-- + This JSP tests whether the client is sending any pre-emptive + authentication headers. The client is expected not to send pre-emptive + authentication headers. If such authentication headers are found, this + JSP will return an HTTP 401 response; HTTP 200 response otherwise. + + Author: Susam Pal +--%><% + if (request.getHeader("Authorization") != null) { + response.sendError(response.SC_UNAUTHORIZED); + } else { +%> +<html> +<head><title>No authorization headers found</title></head> +<body> +<p>No authorization headers found.</p> +</body> +</html> +<% + } +%> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/jsp/ntlm.jsp ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-httpclient/jsp/ntlm.jsp b/nutch-plugins/protocol-httpclient/jsp/ntlm.jsp new file mode 100644 index 0000000..6ad921e --- /dev/null +++ b/nutch-plugins/protocol-httpclient/jsp/ntlm.jsp @@ -0,0 +1,89 @@ +<%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--%><%-- + This JSP tests NTLM authentication. It generates an HTTP response + with authorization header for NTLM authentication and checks the + user-name supplied by the client. It does not check the other + parameters and hashes as controlled JUnit tests would be performed + against this and only the proper submission of credentials need to + be tested. + + Author: Susam Pal +--%><%@ page + import = "sun.misc.BASE64Decoder" + import = "sun.misc.BASE64Encoder" +%><% + String authHeader = request.getHeader("Authorization"); + String username = null; + String domain = null; + String host = null; + + boolean authenticated = false; + if (authHeader != null && authHeader.startsWith("NTLM")) { + byte[] msg = new BASE64Decoder().decodeBuffer( + authHeader.substring(5)); + if (msg[8] == 1) { + byte[] type2msg = { + 'N', 'T', 'L', 'M', 'S', 'S', 'P', 0, // NTLMSSP Signature + 2, 0, 0, 0, // Type 2 Indicator + 10, 0, 10, 0, 32, 0, 0, 0, // length, offset + 0x00, 0x02, (byte) 0x81, 0, // Flags + 1, 2, 3, 4, 5, 6, 7, 8, // Challenge + 'N', 'U', 'T', 'C', 'H' // NUTCH (Domain) + }; + response.setHeader("WWW-Authenticate", "NTLM " + + new BASE64Encoder().encodeBuffer(type2msg)); + response.sendError(response.SC_UNAUTHORIZED); + return; + } else if (msg[8] == 3) { + int length; + int offset; + + // Get domain name + length = msg[30] + msg[31] * 256; + offset = msg[32] + msg[33] * 256; + domain = new String(msg, offset, length); + + // Get user name + length = msg[38] + msg[39] * 256; + offset = msg[40] + msg[41] * 256; + username = new String(msg, offset, length); + + // Get 
host name + length = msg[46] + msg[47] * 256; + offset = msg[48] + msg[49] * 256; + host = new String(msg, offset, length); + + if ("ntlm_user".equalsIgnoreCase(username) + && "NUTCH".equalsIgnoreCase(domain)) + authenticated = true; + } + } + + if (!authenticated) { + response.setHeader("WWW-Authenticate", "NTLM"); + response.sendError(response.SC_UNAUTHORIZED); + } else { +%> +<html> +<head>NTLM Authentication Test</head> +<body> +<p>Hi <%= username %>, You have been successfully authenticated.</p> +</body> +</html> +<% + } +%> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-httpclient/plugin.xml b/nutch-plugins/protocol-httpclient/plugin.xml new file mode 100644 index 0000000..1747713 --- /dev/null +++ b/nutch-plugins/protocol-httpclient/plugin.xml @@ -0,0 +1,58 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="protocol-httpclient" + name="Http / Https Protocol Plug-in" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="protocol-httpclient.jar"> + <export name="*"/> + </library> + <library name="jsoup-1.8.1.jar"/> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + <import plugin="lib-http"/> + </requires> + + <extension id="org.apache.nutch.protocol.httpclient" + name="HttpProtocol" + point="org.apache.nutch.protocol.Protocol"> + + <implementation id="org.apache.nutch.protocol.httpclient.Http" + class="org.apache.nutch.protocol.httpclient.Http"> + <parameter name="protocolName" value="http"/> + </implementation> + + </extension> + + <extension id="org.apache.nutch.protocol.https" + name="HttpsProtocol" + point="org.apache.nutch.protocol.Protocol"> + + <implementation id="org.apache.nutch.protocol.httpclient.Http" + class="org.apache.nutch.protocol.httpclient.Http"> + <parameter name="protocolName" value="https"/> + </implementation> + + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-httpclient/pom.xml b/nutch-plugins/protocol-httpclient/pom.xml new file mode 100644 index 0000000..2f2fc7c --- /dev/null +++ b/nutch-plugins/protocol-httpclient/pom.xml @@ -0,0 +1,62 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. 
You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>protocol-httpclient</artifactId> + <packaging>jar</packaging> + + <name>protocol-httpclient</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + <dependencies> + <dependency> + <groupId>org.jsoup</groupId> + <artifactId>jsoup</artifactId> + <version>1.8.1</version> + </dependency> + <dependency> + <groupId>org.apache.nutch</groupId> + <artifactId>lib-http</artifactId> + <version>${project.parent.version}</version> + </dependency> + <dependency> + <groupId> org.mortbay.jetty</groupId> + <artifactId>jetty</artifactId> + <version>6.1.26</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId> org.mortbay.jetty</groupId> + <artifactId>jsp-2.1</artifactId> + <version>6.1.14</version> + <scope>test</scope> + </dependency> + </dependencies> + +</project>
