Author: dogacan Date: Fri Jan 4 11:48:32 2008 New Revision: 608972 URL: http://svn.apache.org/viewvc?rev=608972&view=rev Log: NUTCH-559 - NTLM, Basic and Digest Authentication schemes for web/proxy. Contributed by Susam Pal.
Added: lucene/nutch/trunk/conf/httpclient-auth.xml.template lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/ lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/basic.jsp lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/cookies.jsp lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/digest.jsp lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/noauth.jsp lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/ntlm.jsp lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/httpclient-auth-test.xml lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/nutch-site-test.xml lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package.html Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=608972&r1=608971&r2=608972&view=diff ============================================================================== --- 
lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Jan 4 11:48:32 2008 @@ -179,6 +179,9 @@ 61. NUTCH-586 - Add option to run compiled classes without job file (enis via ab) +62. NUTCH-559 - NTLM, Basic and Digest Authentication schemes for web/proxy + server. (Susam Pal via dogacan) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Added: lucene/nutch/trunk/conf/httpclient-auth.xml.template URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/httpclient-auth.xml.template?rev=608972&view=auto ============================================================================== --- lucene/nutch/trunk/conf/httpclient-auth.xml.template (added) +++ lucene/nutch/trunk/conf/httpclient-auth.xml.template Fri Jan 4 11:48:32 2008 @@ -0,0 +1,61 @@ +<?xml version="1.0"?> +<!-- + This is the authentication configuration file for protocol-httpclient. + Different credentials for different authentication scopes can be + configured in this file. If a set of credentials is configured for a + particular authentication scope (i.e. particular host, port number, + scheme and realm), then that set of credentials would be sent only to + servers falling under the specified authentication scope. Apart from + this at most one set of credentials can be configured as 'default'. + + When authentication is required to fetch a resource from a web-server, + the authentication-scope is determined from the host, port, scheme and + realm (if present) obtained from the URL of the page and the + authentication headers in the HTTP response. If it matches any + 'authscope' in this configuration file, then the 'credentials' for + that 'authscope' is used for authentication. Otherwise, it would use + the 'default' set of credentials (with an exception which is described + in the next paragraph), if present. If any attribute is missing, it + would match all values for that attribute. 
+ + If there are several pages having different authentication realms and + schemes on the same web-server (same host and port, but different + realms and schemes), and credentials for one or more of the realms and + schemes for that web-server are specified, then the 'default' + credentials would be ignored completely for that web-server (for that + host and port). So, credentials to handle all realms and schemes for + that server may be specified explicitly by adding an extra 'authscope' + tag with the 'realm' and 'scheme' attributes missing for that server. + This is demonstrated by the last 'authscope' tag for 'example:8080' in + the following example. + + Example:- + <credentials username="susam" password="masus"> + <default realm="sso"/> + <authscope host="192.168.101.33" port="80" realm="login"/> + <authscope host="example" port="8080" realm="blogs"/> + <authscope host="example" port="8080" realm="wiki"/> + <authscope host="example" port="80" realm="quiz" scheme="NTLM"/> + </credentials> + <credentials username="admin" password="nimda"> + <authscope host="example" port="8080"/> + </credentials> + + In the above example, 'example:8080' server has pages with multiple + authentication realms. The first set of credentials would be used for + 'blogs' and 'wiki' authentication realms. The second set of + credentials would be used for all other realms. For 'login' realm of + '192.168.101.33', the first set of credentials would be used. For any + other realm of '192.168.101.33', authentication would not be done. For + the NTLM authentication required by 'example:80', the first set of + credentials would be used. For 'sso' realms of all other servers, the + first set of credentials would be used, since it is configured as + 'default'. + + NTLM does not use the notion of realms. The domain name may be + specified as the value for 'realm' attribute in case of NTLM. 
+--> + +<auth-configuration> + +</auth-configuration> Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=608972&r1=608971&r2=608972&view=diff ============================================================================== --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Fri Jan 4 11:48:32 2008 @@ -119,6 +119,15 @@ </property> <property> + <name>http.agent.host</name> + <value></value> + <description>Name or IP address of the host on which the Nutch crawler + would be running. Currently this is used by 'protocol-httpclient' + plugin. + </description> +</property> + +<property> <name>http.timeout</name> <value>10000</value> <description>The default network timeout, in milliseconds.</description> @@ -152,6 +161,48 @@ <name>http.proxy.port</name> <value></value> <description>The proxy port.</description> +</property> + +<property> + <name>http.proxy.username</name> + <value></value> + <description>Username for proxy. This will be used by + 'protocol-httpclient', if the proxy server requests basic, digest + and/or NTLM authentication. To use this, 'protocol-httpclient' must + be present in the value of 'plugin.includes' property. + NOTE: For NTLM authentication, do not prefix the username with the + domain, i.e. 'susam' is correct whereas 'DOMAIN\susam' is incorrect. + </description> +</property> + +<property> + <name>http.proxy.password</name> + <value></value> + <description>Password for proxy. This will be used by + 'protocol-httpclient', if the proxy server requests basic, digest + and/or NTLM authentication. To use this, 'protocol-httpclient' must + be present in the value of 'plugin.includes' property. + </description> +</property> + +<property> + <name>http.proxy.realm</name> + <value></value> + <description>Authentication realm for proxy. 
Do not define a value + if realm is not required or authentication should take place for any + realm. NTLM does not use the notion of realms. Specify the domain name + of NTLM authentication as the value for this property. To use this, + 'protocol-httpclient' must be present in the value of + 'plugin.includes' property. + </description> +</property> + +<property> + <name>http.auth.file</name> + <value>httpclient-auth.xml</value> + <description>Authentication configuration file for + 'protocol-httpclient' plugin. + </description> </property> <property> Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=608972&r1=608971&r2=608972&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Fri Jan 4 11:48:32 2008 @@ -89,6 +89,7 @@ <ant dir="languageidentifier" target="test"/> <ant dir="lib-http" target="test"/> <ant dir="ontology" target="test"/> + <ant dir="protocol-httpclient" target="test"/> <!--ant dir="parse-ext" target="test"/--> <ant dir="parse-html" target="test"/> <!-- <ant dir="parse-mp3" target="test"/> --> Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml?rev=608972&r1=608971&r2=608972&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml Fri Jan 4 11:48:32 2008 @@ -27,6 +27,23 @@ <fileset dir="${nutch.root}/build"> <include name="**/lib-http/*.jar" /> </fileset> + <fileset dir="${nutch.root}/lib/jetty-ext"> + <include name="*.jar"/> + <exclude name="ant.jar"/> + </fileset> + <pathelement location="${build.dir}/test/conf"/> </path> + + <target name="deps-test"> + <copy 
toDir="${build.test}"> + <fileset dir="${src.test}" excludes="**/*.java"/> + </copy> + </target> + + <!-- for junit test --> + <mkdir dir="${build.test}/data" /> + <copy todir="${build.test}/data"> + <fileset dir="jsp"/> + </copy> </project> Added: lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/basic.jsp URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/basic.jsp?rev=608972&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/basic.jsp (added) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/basic.jsp Fri Jan 4 11:48:32 2008 @@ -0,0 +1,77 @@ +<%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--%> +<%-- + This JSP demonstrates basic authentication. When this JSP page is + requested with no query parameters, then the user must enter the + username as 'userx' and password as 'passx' when prompted for + authentication. Apart from this there are a few other test cases, + which can be used by passing a test case number as query parameter in + the following manner: basic.jsp?case=1, basic.jsp?case=2, etc. + The credentials for each test case can be easily figured out from the + code below. 
+ + Author: Susam Pal +--%> +<%@ page + import = "sun.misc.BASE64Decoder" +%> +<% + String authHeader = request.getHeader("Authorization"); + String realm = null; + String username = null; + String password = null; + int testCase = 0; + try { + testCase = Integer.parseInt(request.getParameter("case")); + } catch (Exception ex) { + // do nothing + } + switch (testCase) { + case 1: + realm = "realm1"; username = "user1"; password = "pass1"; + break; + + case 2: + realm = "realm2"; username = "user2"; password = "pass2"; + break; + + default: + realm = "realmx"; username = "userx"; password = "passx"; + break; + } + + boolean authenticated = false; + if (authHeader != null && authHeader.toUpperCase().startsWith("BASIC")) { + String creds[] = new String(new BASE64Decoder().decodeBuffer( + authHeader.substring(6))).split(":", 2); + if (creds[0].equals(username) && creds[1].equals(password)) + authenticated = true; + } + if (!authenticated) { + response.setHeader("WWW-Authenticate", "Basic realm=\"" + realm + "\""); + response.sendError(response.SC_UNAUTHORIZED); + } else { +%> +<html> +<head><title>Basic Authentication Test</title></head> +<body> +<p>Hi <%= username %>, you have been successfully authenticated.</p> +</body> +</html> +<% + } +%> Added: lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/cookies.jsp URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/cookies.jsp?rev=608972&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/cookies.jsp (added) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/cookies.jsp Fri Jan 4 11:48:32 2008 @@ -0,0 +1,65 @@ +<%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--%> +<%-- + This JSP tests whether the client can remember cookies. When the JSP + is fetched for the first time without any query parameters, it sets + a few cookies in the client. On a second request, with the query + parameter, 'cookie=yes', it checks whether the client has sent all + the cookies. If the cookies are found, HTTP 200 response is returned. + If the cookies are not found, HTTP 403 response is returned. + + Author: Susam Pal +--%> +<% + String cookieParam = request.getParameter("cookie"); + if (!"yes".equals(cookieParam)) { // Send cookies + response.addCookie(new Cookie("var1", "val1")); + response.addCookie(new Cookie("var2", "val2")); +%> +<html> +<head><title>Cookies Set</title></head> +<body><p>Cookies have been set.</p></body> +</html> +<% + } else { // Check cookies + int cookiesCount = 0; + + Cookie[] cookies = request.getCookies(); + if (cookies != null) { + for (int i = 0; i < cookies.length; i++) { + if (cookies[i].getName().equals("var1") + && cookies[i].getValue().equals("val1")) + cookiesCount++; + + if (cookies[i].getName().equals("var2") + && cookies[i].getValue().equals("val2")) + cookiesCount++; + } + } + + if (cookiesCount != 2) { + response.sendError(response.SC_FORBIDDEN); + } else { +%> +<html> +<head><title>Cookies Found</title></head> +<body><p>Cookies found!</p></body> +</html> +<% + } + } +%> Added: lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/digest.jsp URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/digest.jsp?rev=608972&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/digest.jsp (added) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/digest.jsp Fri Jan 4 11:48:32 2008 @@ -0,0 +1,71 @@ +<%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--%> +<%-- + This JSP tests digest authentication. It generates an HTTP response + with authorization header for digest authentication and checks the + user-name supplied by the client. It does not check the other + parameters and hashes as controlled JUnit tests would be performed + against this and only the proper submission of credentials need to + be tested. 
+ + Author: Susam Pal +--%> +<%@ page + import = "java.util.StringTokenizer" + import = "java.util.HashMap" +%> +<% + String username = "digest_user"; + String authHeader = request.getHeader("Authorization"); + + boolean authenticated = false; + if (authHeader != null && authHeader.toUpperCase().startsWith("DIGEST")) { + HashMap map = new HashMap(); + StringTokenizer tokenizer = new StringTokenizer( + authHeader.substring(7).trim(), ","); + while (tokenizer.hasMoreTokens()) { + String[] param = tokenizer.nextToken().trim().split("=", 2); + if (param[1].charAt(0) == '"') { + param[1] = param[1].substring(1, param[1].length() - 1); + } + map.put(param[0], param[1]); + } + + if (username.equals((String)map.get("username"))) + authenticated = true; + } + + if (!authenticated) { + String realm = "realm=\"realm1\""; + String qop = "qop=\"auth,auth-int\""; + String nonce = "nonce=\"dcd98b7102dd2f0e8b11d0f600bfb0c093\""; + String opaque = "opaque=\"5ccc069c403ebaf9f0171e9517f40e41\""; + + response.setHeader("WWW-Authenticate", "Digest " + realm + ", " + + qop + ", " + nonce + ", " + opaque); + response.sendError(response.SC_UNAUTHORIZED); + } else { +%> +<html> +<head><title>Digest Authentication Test</title></head> +<body> +<p>Hi <%= username %>, you have been successfully authenticated.</p> +</body> +</html> +<% + } +%> Added: lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/noauth.jsp URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/noauth.jsp?rev=608972&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/noauth.jsp (added) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/noauth.jsp Fri Jan 4 11:48:32 2008 @@ -0,0 +1,38 @@ +<%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--%> +<%-- + This JSP tests whether the client is sending any pre-emptive + authentication headers. The client is expected not to send pre-emptive + authentication headers. If such authentication headers are found, this + JSP will return an HTTP 401 response; HTTP 200 response otherwise. + + Author: Susam Pal +--%> +<% + if (request.getHeader("Authorization") != null) { + response.sendError(response.SC_UNAUTHORIZED); + } else { +%> +<html> +<head><title>No authorization headers found</title></head> +<body> +<p>No authorization headers found.</p> +</body> +</html> +<% + } +%> Added: lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/ntlm.jsp URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/ntlm.jsp?rev=608972&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/ntlm.jsp (added) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/ntlm.jsp Fri Jan 4 11:48:32 2008 @@ -0,0 +1,92 @@ +<%-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--%> +<%-- + This JSP tests NTLM authentication. It generates an HTTP response + with authorization header for NTLM authentication and checks the + user-name supplied by the client. It does not check the other + parameters and hashes as controlled JUnit tests would be performed + against this and only the proper submission of credentials need to + be tested. + + Author: Susam Pal +--%> +<%@ page + import = "sun.misc.BASE64Decoder" + import = "sun.misc.BASE64Encoder" +%> +<% + String authHeader = request.getHeader("Authorization"); + String username = null; + String domain = null; + String host = null; + + boolean authenticated = false; + if (authHeader != null && authHeader.startsWith("NTLM")) { + byte[] msg = new BASE64Decoder().decodeBuffer( + authHeader.substring(5)); + if (msg[8] == 1) { + byte[] type2msg = { + 'N', 'T', 'L', 'M', 'S', 'S', 'P', 0, // NTLMSSP Signature + 2, 0, 0, 0, // Type 2 Indicator + 10, 0, 10, 0, 32, 0, 0, 0, // length, offset + 0x00, 0x02, (byte) 0x81, 0, // Flags + 1, 2, 3, 4, 5, 6, 7, 8, // Challenge + 'N', 'U', 'T', 'C', 'H' // NUTCH (Domain) + }; + response.setHeader("WWW-Authenticate", "NTLM " + + new BASE64Encoder().encodeBuffer(type2msg)); + response.sendError(response.SC_UNAUTHORIZED); + return; + } else if (msg[8] == 3) { + int length; + int offset; + + // Get domain name + length = msg[30] + msg[31] * 256; + offset = msg[32] + msg[33] * 256; + domain = new String(msg, 
offset, length); + + // Get user name + length = msg[38] + msg[39] * 256; + offset = msg[40] + msg[41] * 256; + username = new String(msg, offset, length); + + // Get host name + length = msg[46] + msg[47] * 256; + offset = msg[48] + msg[49] * 256; + host = new String(msg, offset, length); + + if ("ntlm_user".equalsIgnoreCase(username) + && "NUTCH".equalsIgnoreCase(domain)) + authenticated = true; + } + } + + if (!authenticated) { + response.setHeader("WWW-Authenticate", "NTLM"); + response.sendError(response.SC_UNAUTHORIZED); + } else { +%> +<html> +<head>NTLM Authentication Test</head> +<body> +<p>Hi <%= username %>, You have been successfully authenticated.</p> +</body> +</html> +<% + } +%> Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=608972&r1=608971&r2=608972&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Fri Jan 4 11:48:32 2008 @@ -17,16 +17,23 @@ package org.apache.nutch.protocol.httpclient; // JDK imports +import java.io.InputStream; import java.io.IOException; import java.net.URL; import java.util.ArrayList; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; +import org.xml.sax.SAXException; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.NodeList; +import org.w3c.dom.Node; // Commons Logging imports import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; // HTTP Client imports -import org.apache.commons.httpclient.Credentials; import 
org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HostConfiguration; import org.apache.commons.httpclient.HttpClient; @@ -37,6 +44,7 @@ import org.apache.commons.httpclient.protocol.Protocol; // Nutch imports +import org.apache.nutch.util.LogUtil; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.net.protocols.Response; import org.apache.nutch.protocol.ProtocolException; @@ -44,7 +52,14 @@ import org.apache.hadoop.conf.Configuration; import org.apache.nutch.util.NutchConfiguration; - +/** + * This class is a protocol plugin that configures an HTTP client for + * Basic, Digest and NTLM authentication schemes for web server as well + * as proxy server. It takes care of HTTPS protocol as well as cookies + * in a single fetch session. + * + * @author Susam Pal + */ public class Http extends HttpBase { public static final Log LOG = LogFactory.getLog(Http.class); @@ -55,49 +70,100 @@ // Since the Configuration has not yet been set, // then an unconfigured client is returned. private static HttpClient client = new HttpClient(connectionManager); + private static String defaultUsername; + private static String defaultPassword; + private static String defaultRealm; + private static String defaultScheme; + private static String authFile; + private static String agentHost; + private static boolean authRulesRead = false; + private static Configuration conf; + + int maxThreadsTotal = 10; + private String proxyUsername; + private String proxyPassword; + private String proxyRealm; + + + /** + * Returns the configured HTTP client. + * + * @return HTTP client + */ static synchronized HttpClient getClient() { return client; } - boolean verbose = false; - int maxThreadsTotal = 10; - String ntlmUsername = ""; - String ntlmPassword = ""; - String ntlmDomain = ""; - String ntlmHost = ""; - + /** + * Constructs this plugin. 
+ */ public Http() { super(LOG); } + /** + * Reads the configuration from the Nutch configuration files and sets + * the configuration. + * + * @param conf Configuration + */ public void setConf(Configuration conf) { super.setConf(conf); + this.conf = conf; this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10); - this.ntlmUsername = conf.get("http.auth.ntlm.username", ""); - this.ntlmPassword = conf.get("http.auth.ntlm.password", ""); - this.ntlmDomain = conf.get("http.auth.ntlm.domain", ""); - this.ntlmHost = conf.get("http.auth.ntlm.host", ""); + this.proxyUsername = conf.get("http.proxy.username", ""); + this.proxyPassword = conf.get("http.proxy.password", ""); + this.proxyRealm = conf.get("http.proxy.realm", ""); + agentHost = conf.get("http.agent.host", ""); + authFile = conf.get("http.auth.file", ""); configureClient(); + try { + setCredentials(); + } catch (Exception ex) { + if (LOG.isFatalEnabled()) { + LOG.fatal("Could not read " + authFile + " : " + ex.getMessage()); + ex.printStackTrace(LogUtil.getErrorStream(LOG)); + } + } } + /** + * Main method. + * + * @param args Command line arguments + */ public static void main(String[] args) throws Exception { Http http = new Http(); http.setConf(NutchConfiguration.create()); main(http, args); } + /** + * Fetches the <code>url</code> with a configured HTTP client and + * gets the response. + * + * @param url URL to be fetched + * @param datum Crawl data + * @param redirect Follow redirects if and only if true + * @return HTTP response + */ protected Response getResponse(URL url, CrawlDatum datum, boolean redirect) throws ProtocolException, IOException { + resolveCredentials(url); return new HttpResponse(this, url, datum, redirect); } - + + /** + * Configures the HTTP client + */ private void configureClient() { // Set up an HTTPS socket factory that accepts self-signed certs. 
- Protocol dummyhttps = new Protocol("https", new DummySSLProtocolSocketFactory(), 443); - Protocol.registerProtocol("https", dummyhttps); - + Protocol https = new Protocol("https", + new DummySSLProtocolSocketFactory(), 443); + Protocol.registerProtocol("https", https); + HttpConnectionManagerParams params = connectionManager.getParams(); params.setConnectionTimeout(timeout); params.setSoTimeout(timeout); @@ -112,6 +178,8 @@ HostConfiguration hostConf = client.getHostConfiguration(); ArrayList headers = new ArrayList(); + // Set the User Agent in the header + headers.add(new Header("User-Agent", userAgent)); // prefer English headers.add(new Header("Accept-Language", "en-us,en-gb,en;q=0.7,*;q=0.3")); // prefer UTF-8 @@ -122,17 +190,236 @@ // accept gzipped content headers.add(new Header("Accept-Encoding", "x-gzip, gzip")); hostConf.getParams().setParameter("http.default-headers", headers); + + // HTTP proxy server details if (useProxy) { hostConf.setProxy(proxyHost, proxyPort); + + if (proxyUsername.length() > 0) { + + AuthScope proxyAuthScope = getAuthScope( + this.proxyHost, this.proxyPort, this.proxyRealm); + + NTCredentials proxyCredentials = new NTCredentials( + this.proxyUsername, this.proxyPassword, + this.agentHost, this.proxyRealm); + + client.getState().setProxyCredentials( + proxyAuthScope, proxyCredentials); + } } - if (ntlmUsername.length() > 0) { - Credentials ntCreds = new NTCredentials(ntlmUsername, ntlmPassword, ntlmHost, ntlmDomain); - client.getState().setCredentials(new AuthScope(ntlmHost, AuthScope.ANY_PORT), ntCreds); - if (LOG.isInfoEnabled()) { - LOG.info("Added NTLM credentials for " + ntlmUsername); + } + + /** + * Reads authentication configuration file (defined as + * 'http.auth.file' in Nutch configuration file) and sets the + * credentials for the configured authentication scopes in the HTTP + * client object. + * + * @throws ParserConfigurationException If a document builder can not + * be created. 
+ * @throws SAXException If any parsing error occurs. + * @throws IOException If any I/O error occurs. + */ + private static synchronized void setCredentials() throws + ParserConfigurationException, SAXException, IOException { + + if (authRulesRead) + return; + + authRulesRead = true; // Avoid re-attempting to read + + InputStream is = conf.getConfResourceAsInputStream(authFile); + if (is != null) { + Document doc = DocumentBuilderFactory.newInstance() + .newDocumentBuilder().parse(is); + + Element rootElement = doc.getDocumentElement(); + if (!"auth-configuration".equals(rootElement.getTagName())) { + if (LOG.isWarnEnabled()) + LOG.warn("Bad auth conf file: root element <" + + rootElement.getTagName() + "> found in " + authFile + + " - must be <auth-configuration>"); + } + + // For each set of credentials + NodeList credList = rootElement.getChildNodes(); + for (int i = 0; i < credList.getLength(); i++) { + Node credNode = credList.item(i); + if (!(credNode instanceof Element)) + continue; + + Element credElement = (Element) credNode; + if (!"credentials".equals(credElement.getTagName())) { + if (LOG.isWarnEnabled()) + LOG.warn("Bad auth conf file: Element <" + + credElement.getTagName() + "> not recognized in " + + authFile + " - expected <credentials>"); + continue; + } + + String username = credElement.getAttribute("username"); + String password = credElement.getAttribute("password"); + + // For each authentication scope + NodeList scopeList = credElement.getChildNodes(); + for (int j = 0; j < scopeList.getLength(); j++) { + Node scopeNode = scopeList.item(j); + if (!(scopeNode instanceof Element)) + continue; + + Element scopeElement = (Element) scopeNode; + + if ("default".equals(scopeElement.getTagName())) { + + // Determine realm and scheme, if any + String realm = scopeElement.getAttribute("realm"); + String scheme = scopeElement.getAttribute("scheme"); + + // Set default credentials + defaultUsername = username; + defaultPassword = password; + 
defaultRealm = realm; + defaultScheme = scheme; + + if (LOG.isTraceEnabled()) { + LOG.trace("Credentials - username: " + username + + "; set as default" + + " for realm: " + realm + "; scheme: " + scheme); + } + + } else if ("authscope".equals(scopeElement.getTagName())) { + + // Determine authentication scope details + String host = scopeElement.getAttribute("host"); + int port = -1; // For setting port to AuthScope.ANY_PORT + try { + port = Integer.parseInt( + scopeElement.getAttribute("port")); + } catch (Exception ex) { + // do nothing, port is already set to any port + } + String realm = scopeElement.getAttribute("realm"); + String scheme = scopeElement.getAttribute("scheme"); + + // Set credentials for the determined scope + AuthScope authScope = getAuthScope(host, port, realm, scheme); + NTCredentials credentials = new NTCredentials( + username, password, agentHost, realm); + + client.getState().setCredentials(authScope, credentials); + + if (LOG.isTraceEnabled()) { + LOG.trace("Credentials - username: " + username + + "; set for AuthScope - " + "host: " + host + + "; port: " + port + "; realm: " + realm + + "; scheme: " + scheme); + } + + } else { + if (LOG.isWarnEnabled()) + LOG.warn("Bad auth conf file: Element <" + + scopeElement.getTagName() + "> not recognized in " + + authFile + " - expected <authscope>"); + } + } + is.close(); } } - if (LOG.isInfoEnabled()) { LOG.info("Configured Client"); } + } + + /** + * If credentials for the authentication scope determined from the + * specified <code>url</code> is not already set in the HTTP client, + * then this method sets the default credentials to fetch the + * specified <code>url</code>. If credentials are found for the + * authentication scope, the method returns without altering the + * client. 
+ * + * @param url URL to be fetched + */ + private void resolveCredentials(URL url) { + + if (defaultUsername != null && defaultUsername.length() > 0) { + + int port = url.getPort(); + if (port == -1) { + if ("https".equals(url.getProtocol())) + port = 443; + else + port = 80; + } + + AuthScope scope = new AuthScope(url.getHost(), port); + + if (client.getState().getCredentials(scope) != null) { + if (LOG.isTraceEnabled()) + LOG.trace("Pre-configured credentials with scope - host: " + + url.getHost() + "; port: " + port + + "; found for url: " + url); + + // Credentials are already configured, so do nothing and return + return; + } + + if (LOG.isTraceEnabled()) + LOG.trace("Pre-configured credentials with scope - host: " + + url.getHost() + "; port: " + port + + "; not found for url: " + url); + + AuthScope serverAuthScope = getAuthScope( + url.getHost(), port, defaultRealm, defaultScheme); + + NTCredentials serverCredentials = new NTCredentials( + defaultUsername, defaultPassword, + agentHost, defaultRealm); + + client.getState().setCredentials( + serverAuthScope, serverCredentials); + } + } + + /** + * Returns an authentication scope for the specified + * <code>host</code>, <code>port</code>, <code>realm</code> and + * <code>scheme</code>. + * + * @param host Host name or address. + * @param port Port number. + * @param realm Authentication realm. + * @param scheme Authentication scheme. + */ + private static AuthScope getAuthScope(String host, int port, + String realm, String scheme) { + + if (host.length() == 0) + host = null; + + if (port < 0) + port = -1; + + if (realm.length() == 0) + realm = null; + + if (scheme.length() == 0) + scheme = null; + + return new AuthScope(host, port, realm, scheme); + } + + /** + * Returns an authentication scope for the specified + * <code>host</code>, <code>port</code> and <code>realm</code>. + * + * @param host Host name or address. + * @param port Port number. + * @param realm Authentication realm. 
+ */ + private static AuthScope getAuthScope(String host, int port, + String realm) { + + return getAuthScope(host, port, realm, ""); } } + Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=608972&r1=608971&r2=608972&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Fri Jan 4 11:48:32 2008 @@ -21,11 +21,6 @@ import java.io.IOException; import java.io.InputStream; import java.net.URL; -import java.util.Date; - -// Commons Logging imports -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; // HTTP Client imports import org.apache.commons.httpclient.Header; @@ -33,6 +28,7 @@ import org.apache.commons.httpclient.cookie.CookiePolicy; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.params.HttpMethodParams; +import org.apache.commons.httpclient.HttpException; // Nutch imports import org.apache.nutch.crawl.CrawlDatum; @@ -41,46 +37,45 @@ import org.apache.nutch.net.protocols.HttpDateFormat; import org.apache.nutch.net.protocols.Response; import org.apache.nutch.protocol.http.api.HttpBase; -import org.apache.nutch.util.LogUtil; - /** * An HTTP response. 
+ * + * @author Susam Pal */ public class HttpResponse implements Response { - public final static Log LOG = LogFactory.getLog(HttpResponse.class); - private URL url; - - private String orig; - - private String base; - private byte[] content; - - private HttpBase http; - private int code; - private Metadata headers = new SpellCheckedMetadata(); - - public HttpResponse(HttpBase http, URL url, CrawlDatum datum) throws IOException { - this(http, url, datum, false); - } + /** + * Fetches the given <code>url</code> and prepares HTTP response. + * + * @param http An instance of the implementation class + * of this plugin + * @param url URL to be fetched + * @param datum Crawl data + * @param followRedirects Whether to follow redirects; follows + * redirect if and only if this is true + * @return HTTP response + * @throws IOException When an error occurs + */ + HttpResponse(Http http, URL url, CrawlDatum datum, + boolean followRedirects) throws IOException { - - HttpResponse(HttpBase http, URL url, CrawlDatum datum, boolean followRedirects) throws IOException { - this.http = http; + // Prepare GET method for HTTP request this.url = url; - this.base = url.toString(); - this.orig = url.toString(); - GetMethod get = new GetMethod(this.orig); + GetMethod get = new GetMethod(url.toString()); get.setFollowRedirects(followRedirects); - get.setRequestHeader("User-Agent", http.getUserAgent()); - if (datum.getModifiedTime() > 0) - get.setRequestHeader("If-Modified-Since", HttpDateFormat.toString(datum.getModifiedTime())); + get.setDoAuthentication(true); + if (datum.getModifiedTime() > 0) { + get.setRequestHeader("If-Modified-Since", + HttpDateFormat.toString(datum.getModifiedTime())); + } + + // Set HTTP parameters HttpMethodParams params = get.getParams(); if (http.getUseHttp11()) { params.setVersion(HttpVersion.HTTP_1_1); @@ -104,38 +99,75 @@ headers.set(heads[i].getName(), heads[i].getValue()); } + // Limit download size + int contentLength = Integer.MAX_VALUE; + String 
contentLengthString = headers.get(Response.CONTENT_LENGTH); + if (contentLengthString != null) { + try { + contentLength = Integer.parseInt(contentLengthString.trim()); + } catch (NumberFormatException ex) { + throw new HttpException("bad content length: " + + contentLengthString); + } + } + if (http.getMaxContent() >= 0 && + contentLength > http.getMaxContent()) { + contentLength = http.getMaxContent(); + } + // always read content. Sometimes content is useful to find a cause // for error. + InputStream in = get.getResponseBodyAsStream(); try { - InputStream in = get.getResponseBodyAsStream(); - byte[] buffer = new byte[http.BUFFER_SIZE]; + byte[] buffer = new byte[HttpBase.BUFFER_SIZE]; int bufferFilled = 0; int totalRead = 0; ByteArrayOutputStream out = new ByteArrayOutputStream(); - int tryAndRead = calculateTryToRead(totalRead); - while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1 && tryAndRead > 0) { + while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1 + && totalRead < contentLength) { totalRead += bufferFilled; out.write(buffer, 0, bufferFilled); - tryAndRead = calculateTryToRead(totalRead); } content = out.toByteArray(); - in.close(); } catch (Exception e) { if (code == 200) throw new IOException(e.toString()); // for codes other than 200 OK, we are fine with empty content + } finally { + in.close(); + get.abort(); } + + StringBuilder fetchTrace = null; + if (Http.LOG.isTraceEnabled()) { + // Trace message + fetchTrace = new StringBuilder("url: " + url + + "; status code: " + code + + "; bytes received: " + content.length); + if (getHeader(Response.CONTENT_LENGTH) != null) + fetchTrace.append("; Content-Length: " + + getHeader(Response.CONTENT_LENGTH)); + if (getHeader(Response.LOCATION) != null) + fetchTrace.append("; Location: " + getHeader(Response.LOCATION)); + } + // Extract gzip and x-gzip files if (content != null) { // check if we have to uncompress it String contentEncoding = headers.get(Response.CONTENT_ENCODING); - if 
("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) { + if (contentEncoding != null && Http.LOG.isTraceEnabled()) + fetchTrace.append("; Content-Encoding: " + contentEncoding); + if ("gzip".equals(contentEncoding) || + "x-gzip".equals(contentEncoding)) { content = http.processGzipEncoded(content, url); + if (Http.LOG.isTraceEnabled()) + fetchTrace.append("; extracted to " + content.length + " bytes"); } } - } catch (org.apache.commons.httpclient.ProtocolException pe) { - pe.printStackTrace(LogUtil.getErrorStream(LOG)); - get.releaseConnection(); - throw new IOException(pe.toString()); + + // Log trace message + if (Http.LOG.isTraceEnabled()) { + Http.LOG.trace(fetchTrace); + } } finally { get.releaseConnection(); } @@ -169,17 +201,5 @@ /* -------------------------- * * </implementation:Response> * * -------------------------- */ - - - - private int calculateTryToRead(int totalRead) { - int tryToRead = Http.BUFFER_SIZE; - if (http.getMaxContent() <= 0) { - return http.BUFFER_SIZE; - } else if (http.getMaxContent() - totalRead < http.BUFFER_SIZE) { - tryToRead = http.getMaxContent() - totalRead; - } - return tryToRead; - } - } + Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package.html?rev=608972&r1=608971&r2=608972&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package.html (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package.html Fri Jan 4 11:48:32 2008 @@ -1,7 +1,9 @@ <html> <body> -<p>Protocol plugin which supports retrieving documents via the HTTP protocol.</p> -<p>This plugin is based on Jakarta Commons HttpClient library, and handles -also 
HTTPS and cookies.</p> +<p>Protocol plugin which supports retrieving documents via the HTTP and +HTTPS protocols, optionally with Basic, Digest and NTLM authentication +schemes for web server as well as proxy server. It handles cookies +within a single fetch operation. This plugin is based on Jakarta +Commons HttpClient library.</p> </body> </html> Added: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/httpclient-auth-test.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/httpclient-auth-test.xml?rev=608972&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/httpclient-auth-test.xml (added) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/httpclient-auth-test.xml Fri Jan 4 11:48:32 2008 @@ -0,0 +1,42 @@ +<?xml version="1.0"?> + +<auth-configuration> + + <!-- Default credentials --> + <credentials username="userx" password="passx"> + <default/> + <authscope host="127.0.0.1" port="47500"/> + </credentials> + + <!-- Defined a realm for 127.0.0.1:47501 so that authentication for + other realms fail (except another realm for 127.0.0.1:47501 is + defined below for NTLM scheme). --> + <credentials username="userx" password="passx"> + <authscope host="127.0.0.1" port="47501" realm="realmx" + scheme="BASIC"/> + </credentials> + + <!-- Test case for NTLM authentication scheme. --> + <credentials username="ntlm_user" password="ntlm_pass"> + <authscope host="127.0.0.1" port="47501" realm="NUTCH" + scheme="NTLM"/> + </credentials> + + <!-- Test case for credentials selection based on scheme (realm1 is + present in basic.jsp as well as digest.jsp). + Also tests Digest authentication scheme. 
--> + <credentials username="digest_user" password="digest_pass"> + <authscope host="127.0.0.1" port="47500" realm="realm1" + scheme="DIGEST"/> + </credentials> + + <!-- Test case for Basic authentication scheme. --> + <credentials username="user1" password="pass1"> + <authscope host="127.0.0.1" port="47500" realm="realm1"/> + </credentials> + <credentials username="user2" password="pass2"> + <authscope host="127.0.0.1" port="47500" realm="realm2"/> + </credentials> + +</auth-configuration> + Added: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/nutch-site-test.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/nutch-site-test.xml?rev=608972&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/nutch-site-test.xml (added) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/nutch-site-test.xml Fri Jan 4 11:48:32 2008 @@ -0,0 +1,36 @@ +<?xml version="1.0"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> + +<configuration> + +<property> + <name>http.robots.agents</name> + <value>Nutch-Test,*</value> + <description></description> +</property> + +<property> + <name>http.agent.name</name> + <value>Nutch-Test</value> + <description></description> +</property> + +<property> + <name>http.agent.description</name> + <value>Nutch protocol-httpclient test</value> + <description></description> +</property> + +<property> + <name>http.auth.file</name> + <value>httpclient-auth-test.xml</value> + <description></description> +</property> + +<property> + <name>http.timeout</name> + <value>60000</value> + <description></description> +</property> + +</configuration> Added: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java?rev=608972&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java (added) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java Fri Jan 4 11:48:32 2008 @@ -0,0 +1,192 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.protocol.httpclient; + +import java.net.URL; +import java.net.MalformedURLException; +import junit.framework.TestCase; +import org.mortbay.jetty.Server; +import org.mortbay.jetty.servlet.ServletHttpContext; +import org.mortbay.jetty.servlet.ServletHandler; +import org.mortbay.http.SocketListener; +import org.mortbay.http.handler.ResourceHandler; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.net.protocols.Response; + +/** + * Test cases for protocol-httpclient. 
+ * + * @author Susam Pal + */ +public class TestProtocolHttpClient extends TestCase { + + private Server server; + private Configuration conf; + private static final String RES_DIR = System.getProperty("test.data", "."); + private int port; + private Http http = new Http(); + + protected void setUp() throws Exception { + + ServletHttpContext context = new ServletHttpContext(); + context.setContextPath("/"); + context.setResourceBase(RES_DIR); + context.addServlet("JSP", "*.jsp", "org.apache.jasper.servlet.JspServlet"); + context.addHandler(new ResourceHandler()); + + server = new Server(); + server.addContext(context); + + conf = new Configuration(); + conf.addDefaultResource("nutch-default.xml"); + conf.addResource("nutch-site-test.xml"); + + http = new Http(); + http.setConf(conf); + } + + protected void tearDown() throws Exception { + server.stop(); + } + + /** + * Tests whether the client can remember cookies. + * + * @throws Exception If an error occurs or the test case fails. + */ + public void testCookies() throws Exception { + startServer(47500); + fetchPage("/cookies.jsp", 200); + fetchPage("/cookies.jsp?cookie=yes", 200); + } + + /** + * Tests that no pre-emptive authorization headers are sent by the + * client. + * + * @throws Exception If an error occurs or the test case fails. + */ + public void testNoPreemptiveAuth() throws Exception { + startServer(47500); + fetchPage("/noauth.jsp", 200); + } + + /** + * Tests default credentials. + * + * @throws Exception If an error occurs or the test case fails. + */ + public void testDefaultCredentials() throws Exception { + startServer(47502); + fetchPage("/basic.jsp", 200); + } + + /** + * Tests basic authentication scheme for various realms. + * + * @throws Exception If an error occurs or the test case fails. 
+ */ + public void testBasicAuth() throws Exception { + startServer(47500); + fetchPage("/basic.jsp", 200); + fetchPage("/basic.jsp?case=1", 200); + fetchPage("/basic.jsp?case=2", 200); + server.start(); + } + + /** + * Tests that authentication happens for a defined realm and not for + * other realms for a host:port when an extra <code>authscope</code> + * tag is not defined to match all other realms. + * + * @throws Exception If an error occurs or the test case fails. + */ + public void testOtherRealmsNoAuth() throws Exception { + startServer(47501); + fetchPage("/basic.jsp", 200); + fetchPage("/basic.jsp?case=1", 401); + fetchPage("/basic.jsp?case=2", 401); + } + + /** + * Tests Digest authentication scheme. + * + * @throws Exception If an error occurs or the test case fails. + */ + public void testDigestAuth() throws Exception { + startServer(47500); + fetchPage("/digest.jsp", 200); + } + + /** + * Tests NTLM authentication scheme. + * + * @throws Exception If an error occurs or the test case fails. + */ + public void testNtlmAuth() throws Exception { + startServer(47501); + fetchPage("/ntlm.jsp", 200); + } + + /** + * Starts the Jetty server at a specified port. + * + * @param portno Port number. + * @throws Exception When an error occurs. + */ + private void startServer(int portno) throws Exception { + port = portno; + SocketListener listener = new SocketListener(); + listener.setHost("127.0.0.1"); + listener.setPort(port); + server.addListener(listener); + server.start(); + } + + /** + * Fetches the specified <code>page</code> from the local Jetty server + * and checks whether the HTTP response status code matches with the + * expected code. + * + * @param page Page to be fetched. + * @param expectedCode HTTP response status code expected while + * fetching the page. + * @throws Exception When an error occurs or test case fails. 
+ */ + private void fetchPage(String page, int expectedCode) + throws Exception { + URL url = new URL("http", "127.0.0.1", port, page); + Response response = null; + response = http.getResponse(url, new CrawlDatum(), true); + + int code = response.getCode(); + assertEquals("HTTP Status Code for " + url, expectedCode, code); + } + + /** + * Returns a URL to the specified page. + * + * @param page Page available in the local Jetty + * server. + * @throws MalformedURLException If a URL can not be formed. + */ + private URL getURL(String page) throws MalformedURLException { + return new URL("http", "127.0.0.1", port, page); + } +}