Author: dogacan
Date: Fri Jan  4 11:48:32 2008
New Revision: 608972

URL: http://svn.apache.org/viewvc?rev=608972&view=rev
Log:
NUTCH-559 - NTLM, Basic and Digest Authentication schemes for web/proxy. 
Contributed by Susam Pal.

Added:
    lucene/nutch/trunk/conf/httpclient-auth.xml.template
    lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/
    lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/basic.jsp
    lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/cookies.jsp
    lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/digest.jsp
    lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/noauth.jsp
    lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/ntlm.jsp
    lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/
    lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/
    
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/httpclient-auth-test.xml
    
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/nutch-site-test.xml
    lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/
    lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/
    lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/
    
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/
    
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/
    
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/conf/nutch-default.xml
    lucene/nutch/trunk/src/plugin/build.xml
    lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml
    
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
    
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
    
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package.html

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=608972&r1=608971&r2=608972&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Fri Jan  4 11:48:32 2008
@@ -179,6 +179,9 @@
 61. NUTCH-586 - Add option to run compiled classes without job file
     (enis via ab)
 
+62. NUTCH-559 - NTLM, Basic and Digest Authentication schemes for web/proxy
+    server. (Susam Pal via dogacan)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Added: lucene/nutch/trunk/conf/httpclient-auth.xml.template
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/httpclient-auth.xml.template?rev=608972&view=auto
==============================================================================
--- lucene/nutch/trunk/conf/httpclient-auth.xml.template (added)
+++ lucene/nutch/trunk/conf/httpclient-auth.xml.template Fri Jan  4 11:48:32 
2008
@@ -0,0 +1,61 @@
+<?xml version="1.0"?>
+<!--
+  This is the authentication configuration file for protocol-httpclient.
+  Different credentials for different authentication scopes can be
+  configured in this file. If a set of credentials is configured for a 
+  particular authentication scope (i.e. particular host, port number,
+  scheme and realm), then that set of credentials would be sent only to
+  servers falling under the specified authentication scope. Apart from
+  this at most one set of credentials can be configured as 'default'.
+  
+  When authentication is required to fetch a resource from a web-server,
+  the authentication-scope is determined from the host, port, scheme and
+  realm (if present) obtained from the URL of the page and the
+  authentication headers in the HTTP response. If it matches any
+  'authscope' in this configuration file, then the 'credentials' for
+  that 'authscope' are used for authentication. Otherwise, it would use
+  the 'default' set of credentials (with an exception which is described
+  in the next paragraph), if present. If any attribute is missing, it
+  would match all values for that attribute.
+
+  If there are several pages having different authentication realms and
+  schemes on the same web-server (same host and port, but different
+  realms and schemes), and credentials for one or more of the realms and
+  schemes for that web-server are specified, then the 'default'
+  credentials would be ignored completely for that web-server (for that
+  host and port). So, credentials to handle all realms and schemes for
+  that server may be specified explicitly by adding an extra 'authscope'
+  tag with the 'realm' and 'scheme' attributes missing for that server.
+  This is demonstrated by the last 'authscope' tag for 'example:8080' in
+  the following example.
+
+  Example:-
+    <credentials username="susam" password="masus">
+      <default realm="sso"/>
+      <authscope host="192.168.101.33" port="80" realm="login"/>
+      <authscope host="example" port="8080" realm="blogs"/>
+      <authscope host="example" port="8080" realm="wiki"/>
+      <authscope host="example" port="80" realm="quiz" scheme="NTLM"/>
+    </credentials>
+    <credentials username="admin" password="nimda">
+      <authscope host="example" port="8080"/>
+    </credentials>
+
+  In the above example, 'example:8080' server has pages with multiple
+  authentication realms. The first set of credentials would be used for
+  'blogs' and 'wiki' authentication realms. The second set of
+  credentials would be used for all other realms. For 'login' realm of
+  '192.168.101.33', the first set of credentials would be used. For any
+  other realm of '192.168.101.33' authentication would not be done. For
+  the NTLM authentication required by 'example:80', the first set of
+  credentials would be used. For 'sso' realms of all other servers, the
+  first set of credentials would be used, since it is configured as
+  'default'.
+
+  NTLM does not use the notion of realms. The domain name may be
+  specified as the value for 'realm' attribute in case of NTLM.
+-->
+
+<auth-configuration>
+
+</auth-configuration>

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=608972&r1=608971&r2=608972&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Fri Jan  4 11:48:32 2008
@@ -119,6 +119,15 @@
 </property>
 
 <property>
+  <name>http.agent.host</name>
+  <value></value>
+  <description>Name or IP address of the host on which the Nutch crawler
+  would be running. Currently this is used by 'protocol-httpclient'
+  plugin.
+  </description>
+</property>
+
+<property>
   <name>http.timeout</name>
   <value>10000</value>
   <description>The default network timeout, in milliseconds.</description>
@@ -152,6 +161,48 @@
   <name>http.proxy.port</name>
   <value></value>
   <description>The proxy port.</description>
+</property>
+
+<property>
+  <name>http.proxy.username</name>
+  <value></value>
+  <description>Username for proxy. This will be used by
+  'protocol-httpclient', if the proxy server requests basic, digest
+  and/or NTLM authentication. To use this, 'protocol-httpclient' must
+  be present in the value of 'plugin.includes' property.
+  NOTE: For NTLM authentication, do not prefix the username with the
+  domain, i.e. 'susam' is correct whereas 'DOMAIN\susam' is incorrect.
+  </description>
+</property>
+
+<property>
+  <name>http.proxy.password</name>
+  <value></value>
+  <description>Password for proxy. This will be used by
+  'protocol-httpclient', if the proxy server requests basic, digest
+  and/or NTLM authentication. To use this, 'protocol-httpclient' must
+  be present in the value of 'plugin.includes' property.
+  </description>
+</property>
+
+<property>
+  <name>http.proxy.realm</name>
+  <value></value>
+  <description>Authentication realm for proxy. Do not define a value
+  if realm is not required or authentication should take place for any
+  realm. NTLM does not use the notion of realms. Specify the domain name
+  of NTLM authentication as the value for this property. To use this,
+  'protocol-httpclient' must be present in the value of
+  'plugin.includes' property.
+  </description>
+</property>
+
+<property>
+  <name>http.auth.file</name>
+  <value>httpclient-auth.xml</value>
+  <description>Authentication configuration file for
+  'protocol-httpclient' plugin.
+  </description>
 </property>
 
 <property>

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=608972&r1=608971&r2=608972&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Fri Jan  4 11:48:32 2008
@@ -89,6 +89,7 @@
      <ant dir="languageidentifier" target="test"/>
      <ant dir="lib-http" target="test"/>
      <ant dir="ontology" target="test"/>
+     <ant dir="protocol-httpclient" target="test"/>
      <!--ant dir="parse-ext" target="test"/-->
      <ant dir="parse-html" target="test"/>
      <!-- <ant dir="parse-mp3" target="test"/> -->

Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml?rev=608972&r1=608971&r2=608972&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml Fri Jan  4 
11:48:32 2008
@@ -27,6 +27,23 @@
     <fileset dir="${nutch.root}/build">
       <include name="**/lib-http/*.jar" />
     </fileset>
+    <fileset dir="${nutch.root}/lib/jetty-ext">
+      <include name="*.jar"/>
+      <exclude name="ant.jar"/>
+    </fileset>
+    <pathelement location="${build.dir}/test/conf"/>
   </path>
+
+  <target name="deps-test">
+    <copy toDir="${build.test}">
+      <fileset dir="${src.test}" excludes="**/*.java"/>
+    </copy>
+  </target>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data" />
+  <copy todir="${build.test}/data">
+      <fileset dir="jsp"/>
+   </copy>
 
 </project>

Added: lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/basic.jsp
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/basic.jsp?rev=608972&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/basic.jsp (added)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/basic.jsp Fri Jan  4 
11:48:32 2008
@@ -0,0 +1,77 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%>
+<%--
+  This JSP demonstrates basic authentication. When this JSP page is
+  requested with no query parameters, then the user must enter the
+  username as 'userx' and password as 'passx' when prompted for
+  authentication. Apart from this there are a few other test cases,
+  which can be used by passing a test case number as query parameter in
+  the following manner: basic.jsp?case=1, basic.jsp?case=2, etc.
+  The credentials for each test case can be easily figured out from the
+  code below.
+
+  Author: Susam Pal
+--%>
+<%@ page
+    import = "sun.misc.BASE64Decoder"
+%>
+<%
+  // Credentials sent by the client, if any.
+  String authHeader = request.getHeader("Authorization");
+  String realm = null;
+  String username = null;
+  String password = null;
+  int testCase = 0;
+  try {
+    testCase = Integer.parseInt(request.getParameter("case"));
+  } catch (NumberFormatException ignored) {
+    // 'case' is missing or not a number: keep the default test case
+  }
+
+  // Pick the realm and the credentials expected for the test case.
+  switch (testCase) {
+    case 1:
+      realm = "realm1"; username = "user1"; password = "pass1";
+      break;
+
+    case 2:
+      realm = "realm2"; username = "user2"; password = "pass2";
+      break;
+
+    default:
+      realm = "realmx"; username = "userx"; password = "passx";
+      break;
+  }
+
+  // The scheme is matched case-insensitively and independently of the
+  // default locale. Requiring the space after "Basic" guarantees that
+  // substring(6) stays within bounds.
+  boolean authenticated = false;
+  if (authHeader != null
+      && authHeader.regionMatches(true, 0, "Basic ", 0, 6)) {
+    String creds[] = new String(new BASE64Decoder().decodeBuffer(
+        authHeader.substring(6))).split(":", 2);
+    // A payload without ':' yields a single element and must not match.
+    authenticated = creds.length == 2
+        && creds[0].equals(username) && creds[1].equals(password);
+  }
+
+  if (!authenticated) {
+    // Challenge the client for the realm of the selected test case.
+    response.setHeader("WWW-Authenticate", "Basic realm=\"" + realm + "\"");
+    response.sendError(response.SC_UNAUTHORIZED);
+  } else {
+%>
+<html>
+<head><title>Basic Authentication Test</title></head>
+<body>
+<p>Hi <%= username %>, you have been successfully authenticated.</p>
+</body>
+</html>
+<%
+  }
+%>

Added: lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/cookies.jsp
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/cookies.jsp?rev=608972&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/cookies.jsp (added)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/cookies.jsp Fri Jan  
4 11:48:32 2008
@@ -0,0 +1,65 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%>
+<%--
+  This JSP tests whether the client can remember cookies. When the JSP
+  is fetched for the first time without any query parameters, it sets
+  a few cookies in the client. On a second request, with the query
+  parameter, 'cookie=yes', it checks whether the client has sent all
+  the cookies. If the cookies are found, HTTP 200 response is returned.
+  If the cookies are not found, HTTP 403 response is returned.
+
+  Author: Susam Pal
+--%>
+<%
+  // Any request other than 'cookie=yes' hands out the two test cookies.
+  boolean verifying = "yes".equals(request.getParameter("cookie"));
+  if (!verifying) {
+    response.addCookie(new Cookie("var1", "val1"));
+    response.addCookie(new Cookie("var2", "val2"));
+%>
+<html>
+<head><title>Cookies Set</title></head>
+<body><p>Cookies have been set.</p></body>
+</html>
+<%
+  } else {
+    // Count how many of the expected name/value pairs came back.
+    Cookie[] received = request.getCookies();
+    int total = (received == null) ? 0 : received.length;
+    int matches = 0;
+    for (int idx = 0; idx < total; idx++) {
+      Cookie current = received[idx];
+      if (current.getName().equals("var1")
+          && current.getValue().equals("val1")) {
+        matches++;
+      }
+      if (current.getName().equals("var2")
+          && current.getValue().equals("val2")) {
+        matches++;
+      }
+    }
+
+    // Exactly two matches are required; anything else is answered with 403.
+    if (matches == 2) {
+%>
+<html>
+<head><title>Cookies Found</title></head>
+<body><p>Cookies found!</p></body>
+</html>
+<%
+    } else {
+      response.sendError(response.SC_FORBIDDEN);
+    }
+  }
+%>

Added: lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/digest.jsp
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/digest.jsp?rev=608972&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/digest.jsp (added)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/digest.jsp Fri Jan  4 
11:48:32 2008
@@ -0,0 +1,71 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%>
+<%--
+  This JSP tests digest authentication. It generates an HTTP response
+  with a WWW-Authenticate header for digest authentication and checks
+  the user-name supplied by the client. It does not check the other
+  parameters and hashes, as controlled JUnit tests would be performed
+  against this and only the proper submission of credentials needs to
+  be tested.
+
+  Author: Susam Pal
+--%>
+<%@ page
+    import = "java.util.StringTokenizer"
+    import = "java.util.HashMap"
+%>
+<%
+  // Only the user name is verified; see the description above.
+  String username = "digest_user";
+  String authHeader = request.getHeader("Authorization");
+
+  // The scheme is matched case-insensitively and independently of the
+  // default locale. Requiring the space after "Digest" guarantees that
+  // substring(7) stays within bounds.
+  boolean authenticated = false;
+  if (authHeader != null
+      && authHeader.regionMatches(true, 0, "Digest ", 0, 7)) {
+    HashMap map = new HashMap();
+    StringTokenizer tokenizer = new StringTokenizer(
+        authHeader.substring(7).trim(), ",");
+    while (tokenizer.hasMoreTokens()) {
+      String[] param = tokenizer.nextToken().trim().split("=", 2);
+      // Splitting on ',' can cut a quoted value in two; fragments
+      // without '=' are skipped instead of failing on param[1].
+      if (param.length == 2) {
+        String value = param[1];
+        // Strip the surrounding quotes; empty and one-character values
+        // are stored as they are.
+        if (value.length() >= 2 && value.charAt(0) == '"') {
+          value = value.substring(1, value.length() - 1);
+        }
+        map.put(param[0], value);
+      }
+    }
+
+    authenticated = username.equals(map.get("username"));
+  }
+
+  if (!authenticated) {
+    // Fixed challenge parameters; the hashes are never verified here.
+    String realm = "realm=\"realm1\"";
+    String qop   = "qop=\"auth,auth-int\"";
+    String nonce = "nonce=\"dcd98b7102dd2f0e8b11d0f600bfb0c093\"";
+    String opaque = "opaque=\"5ccc069c403ebaf9f0171e9517f40e41\"";
+
+    response.setHeader("WWW-Authenticate", "Digest " + realm + ", "
+        + qop + ", " + nonce + ", " + opaque);
+    response.sendError(response.SC_UNAUTHORIZED);
+  } else {
+%>
+<html>
+<head><title>Digest Authentication Test</title></head>
+<body>
+<p>Hi <%= username %>, you have been successfully authenticated.</p>
+</body>
+</html>
+<%
+  }
+%>

Added: lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/noauth.jsp
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/noauth.jsp?rev=608972&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/noauth.jsp (added)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/noauth.jsp Fri Jan  4 
11:48:32 2008
@@ -0,0 +1,38 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%>
+<%--
+  This JSP tests whether the client is sending any pre-emptive
+  authentication headers. The client is expected not to send pre-emptive
+  authentication headers. If such authentication headers are found, this
+  JSP will return an HTTP 403 response; HTTP 200 response otherwise.
+
+  Author: Susam Pal
+--%>
+<%
+  // Pre-emptive credentials are a failure. They are rejected with 403
+  // rather than 401, because a 401 response has to carry a
+  // WWW-Authenticate challenge and this page never asks for credentials.
+  if (request.getHeader("Authorization") != null) {
+    response.sendError(response.SC_FORBIDDEN);
+  } else {
+%>
+<html>
+<head><title>No authorization headers found</title></head>
+<body>
+<p>No authorization headers found.</p>
+</body>
+</html>
+<%
+  }
+%>

Added: lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/ntlm.jsp
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/ntlm.jsp?rev=608972&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/ntlm.jsp (added)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/jsp/ntlm.jsp Fri Jan  4 
11:48:32 2008
@@ -0,0 +1,92 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%>
+<%--
+  This JSP tests NTLM authentication. It generates an HTTP response
+  with a WWW-Authenticate header for NTLM authentication and checks the
+  user-name supplied by the client. It does not check the other
+  parameters and hashes, as controlled JUnit tests would be performed
+  against this and only the proper submission of credentials needs to
+  be tested.
+
+  Author: Susam Pal
+--%>
+<%@ page
+    import = "sun.misc.BASE64Decoder"
+    import = "sun.misc.BASE64Encoder"
+%>
+<%
+  String authHeader = request.getHeader("Authorization");
+  String username = null;
+  String domain = null;
+  String host = null;
+
+  boolean authenticated = false;
+  // Requiring the space after "NTLM" keeps substring(5) within bounds.
+  if (authHeader != null && authHeader.startsWith("NTLM ")) {
+    byte[] msg = new BASE64Decoder().decodeBuffer(
+        authHeader.substring(5));
+
+    // Byte 8 selects the message type handled below; a message too
+    // short to contain it matches neither branch and is challenged again.
+    int msgType = (msg.length > 8) ? msg[8] : 0;
+    if (msgType == 1) {
+      byte[] type2msg = {
+          'N', 'T', 'L', 'M', 'S', 'S', 'P', 0, // NTLMSSP Signature
+          2, 0, 0, 0,                           // Type 2 Indicator
+          10, 0, 10, 0, 32, 0, 0, 0,            // length, offset
+          0x00, 0x02, (byte) 0x81, 0,           // Flags
+          1, 2, 3, 4, 5, 6, 7, 8,               // Challenge
+          'N', 'U', 'T', 'C', 'H' // NUTCH (Domain)
+      };
+      // encode() is used instead of encodeBuffer() because the latter
+      // appends a line terminator, which must not end up in a header value.
+      response.setHeader("WWW-Authenticate", "NTLM "
+          + new BASE64Encoder().encode(type2msg));
+      response.sendError(response.SC_UNAUTHORIZED);
+      return;
+    } else if (msgType == 3 && msg.length >= 50) {
+      // The highest index read below is 49, hence the length check.
+      // Java bytes are signed, so each byte is masked before the
+      // low/high pair is combined into a length or an offset.
+      int length;
+      int offset;
+
+      // Get domain name
+      length = (msg[30] & 0xff) | ((msg[31] & 0xff) << 8);
+      offset = (msg[32] & 0xff) | ((msg[33] & 0xff) << 8);
+      domain = new String(msg, offset, length);
+
+      // Get user name
+      length = (msg[38] & 0xff) | ((msg[39] & 0xff) << 8);
+      offset = (msg[40] & 0xff) | ((msg[41] & 0xff) << 8);
+      username = new String(msg, offset, length);
+
+      // Get host name
+      length = (msg[46] & 0xff) | ((msg[47] & 0xff) << 8);
+      offset = (msg[48] & 0xff) | ((msg[49] & 0xff) << 8);
+      host = new String(msg, offset, length);
+
+      // Only the user name and the domain are verified.
+      if ("ntlm_user".equalsIgnoreCase(username)
+          && "NUTCH".equalsIgnoreCase(domain)) {
+        authenticated = true;
+      }
+    }
+  }
+
+  if (!authenticated) {
+    // Start (or restart) the handshake with a bare NTLM challenge.
+    response.setHeader("WWW-Authenticate", "NTLM");
+    response.sendError(response.SC_UNAUTHORIZED);
+  } else {
+%>
+<html>
+<head><title>NTLM Authentication Test</title></head>
+<body>
+<p>Hi <%= username %>, You have been successfully authenticated.</p>
+</body>
+</html>
+<%
+  }
+%>

Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=608972&r1=608971&r2=608972&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
 Fri Jan  4 11:48:32 2008
@@ -17,16 +17,23 @@
 package org.apache.nutch.protocol.httpclient;
 
 // JDK imports
+import java.io.InputStream;
 import java.io.IOException;
 import java.net.URL;
 import java.util.ArrayList;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import org.xml.sax.SAXException;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.w3c.dom.Node;
 
 // Commons Logging imports
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
 // HTTP Client imports
-import org.apache.commons.httpclient.Credentials;
 import org.apache.commons.httpclient.Header;
 import org.apache.commons.httpclient.HostConfiguration;
 import org.apache.commons.httpclient.HttpClient;
@@ -37,6 +44,7 @@
 import org.apache.commons.httpclient.protocol.Protocol;
 
 // Nutch imports
+import org.apache.nutch.util.LogUtil;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.ProtocolException;
@@ -44,7 +52,14 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.util.NutchConfiguration;
 
-
+/**
+ * This class is a protocol plugin that configures an HTTP client for
+ * Basic, Digest and NTLM authentication schemes for web server as well
+ * as proxy server. It takes care of HTTPS protocol as well as cookies
+ * in a single fetch session.
+ *
+ * @author Susam Pal
+ */
 public class Http extends HttpBase {
 
   public static final Log LOG = LogFactory.getLog(Http.class);
@@ -55,49 +70,100 @@
   // Since the Configuration has not yet been set,
   // then an unconfigured client is returned.
   private static HttpClient client = new HttpClient(connectionManager);
+  private static String defaultUsername;
+  private static String defaultPassword;
+  private static String defaultRealm;
+  private static String defaultScheme;
+  private static String authFile;
+  private static String agentHost;
+  private static boolean authRulesRead = false;
+  private static Configuration conf;
+
+  int maxThreadsTotal = 10;
 
+  private String proxyUsername;
+  private String proxyPassword;
+  private String proxyRealm;
+
+
+  /**
+   * Returns the configured HTTP client.
+   *
+   * @return HTTP client
+   */
   static synchronized HttpClient getClient() {
     return client;
   }
 
-  boolean verbose = false;
-  int maxThreadsTotal = 10;
-  String ntlmUsername = "";
-  String ntlmPassword = "";
-  String ntlmDomain = "";
-  String ntlmHost = "";
-
+  /**
+   * Constructs this plugin.
+   */
   public Http() {
     super(LOG);
   }
 
+  /**
+   * Reads the configuration from the Nutch configuration files and sets
+   * the configuration.
+   *
+   * @param conf Configuration
+   */
   public void setConf(Configuration conf) {
     super.setConf(conf);
+    this.conf = conf;
     this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10);
-    this.ntlmUsername = conf.get("http.auth.ntlm.username", "");
-    this.ntlmPassword = conf.get("http.auth.ntlm.password", "");
-    this.ntlmDomain = conf.get("http.auth.ntlm.domain", "");
-    this.ntlmHost = conf.get("http.auth.ntlm.host", "");
+    this.proxyUsername = conf.get("http.proxy.username", "");
+    this.proxyPassword = conf.get("http.proxy.password", "");
+    this.proxyRealm = conf.get("http.proxy.realm", "");
+    agentHost = conf.get("http.agent.host", "");
+    authFile = conf.get("http.auth.file", "");
     configureClient();
+    try {
+      setCredentials();
+    } catch (Exception ex) {
+      if (LOG.isFatalEnabled()) {
+        LOG.fatal("Could not read " + authFile + " : " + ex.getMessage());
+        ex.printStackTrace(LogUtil.getErrorStream(LOG));
+      }
+    }
   }
 
+  /**
+   * Main method.
+   *
+   * @param args Command line arguments
+   */
   public static void main(String[] args) throws Exception {
     Http http = new Http();
     http.setConf(NutchConfiguration.create());
     main(http, args);
   }
 
+  /**
+   * Fetches the <code>url</code> with a configured HTTP client and
+   * gets the response.
+   *
+   * @param url       URL to be fetched
+   * @param datum     Crawl data
+   * @param redirect  Follow redirects if and only if true
+   * @return          HTTP response
+   */
   protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
     throws ProtocolException, IOException {
+    resolveCredentials(url);
     return new HttpResponse(this, url, datum, redirect);
   }
-  
+
+  /**
+   * Configures the HTTP client
+   */
   private void configureClient() {
 
     // Set up an HTTPS socket factory that accepts self-signed certs.
-    Protocol dummyhttps = new Protocol("https", new 
DummySSLProtocolSocketFactory(), 443);
-    Protocol.registerProtocol("https", dummyhttps);
-    
+    Protocol https = new Protocol("https",
+        new DummySSLProtocolSocketFactory(), 443);
+    Protocol.registerProtocol("https", https);
+
     HttpConnectionManagerParams params = connectionManager.getParams();
     params.setConnectionTimeout(timeout);
     params.setSoTimeout(timeout);
@@ -112,6 +178,8 @@
 
     HostConfiguration hostConf = client.getHostConfiguration();
     ArrayList headers = new ArrayList();
+    // Set the User Agent in the header
+    headers.add(new Header("User-Agent", userAgent));
     // prefer English
     headers.add(new Header("Accept-Language", "en-us,en-gb,en;q=0.7,*;q=0.3"));
     // prefer UTF-8
@@ -122,17 +190,236 @@
     // accept gzipped content
     headers.add(new Header("Accept-Encoding", "x-gzip, gzip"));
     hostConf.getParams().setParameter("http.default-headers", headers);
+
+    // HTTP proxy server details
     if (useProxy) {
       hostConf.setProxy(proxyHost, proxyPort);
+
+      if (proxyUsername.length() > 0) {
+
+        AuthScope proxyAuthScope = getAuthScope(
+            this.proxyHost, this.proxyPort, this.proxyRealm);
+
+        NTCredentials proxyCredentials = new NTCredentials(
+            this.proxyUsername, this.proxyPassword,
+            this.agentHost, this.proxyRealm);
+
+        client.getState().setProxyCredentials(
+            proxyAuthScope, proxyCredentials);
+      }
     }
-    if (ntlmUsername.length() > 0) {
-      Credentials ntCreds = new NTCredentials(ntlmUsername, ntlmPassword, 
ntlmHost, ntlmDomain);
-      client.getState().setCredentials(new AuthScope(ntlmHost, 
AuthScope.ANY_PORT), ntCreds);
 
-      if (LOG.isInfoEnabled()) {
-        LOG.info("Added NTLM credentials for " + ntlmUsername);
+  }
+
+  /**
+   * Reads authentication configuration file (defined as
+   * 'http.auth.file' in Nutch configuration file) and sets the
+   * credentials for the configured authentication scopes in the HTTP
+   * client object.
+   *
+   * @throws ParserConfigurationException  If a document builder can not
+   *                                       be created.
+   * @throws SAXException                  If any parsing error occurs.
+   * @throws IOException                   If any I/O error occurs.
+   */
+  private static synchronized void setCredentials() throws
+      ParserConfigurationException, SAXException, IOException {
+
+    // The rules are kept in static state, so they are loaded at most once.
+    if (authRulesRead)
+      return;
+
+    authRulesRead = true; // Avoid re-attempting to read
+
+    InputStream is = conf.getConfResourceAsInputStream(authFile);
+    if (is != null) {
+      // Build the DOM first and release the stream right away. Closing in
+      // a finally block also covers parse failures and files that contain
+      // no child elements at all.
+      Document doc;
+      try {
+        doc = DocumentBuilderFactory.newInstance()
+              .newDocumentBuilder().parse(is);
+      } finally {
+        is.close();
+      }
+
+      // An unexpected root element is only reported; its children are
+      // still processed below.
+      Element rootElement = doc.getDocumentElement();
+      if (!"auth-configuration".equals(rootElement.getTagName())) {
+        if (LOG.isWarnEnabled())
+          LOG.warn("Bad auth conf file: root element <"
+              + rootElement.getTagName() + "> found in " + authFile
+              + " - must be <auth-configuration>");
+      }
+
+      // For each set of credentials
+      NodeList credList = rootElement.getChildNodes();
+      for (int i = 0; i < credList.getLength(); i++) {
+        Node credNode = credList.item(i);
+        if (!(credNode instanceof Element))
+          continue;
+
+        Element credElement = (Element) credNode;
+        if (!"credentials".equals(credElement.getTagName())) {
+          if (LOG.isWarnEnabled())
+            LOG.warn("Bad auth conf file: Element <"
+            + credElement.getTagName() + "> not recognized in "
+            + authFile + " - expected <credentials>");
+          continue;
+        }
+
+        String username = credElement.getAttribute("username");
+        String password = credElement.getAttribute("password");
+
+        // For each authentication scope
+        NodeList scopeList = credElement.getChildNodes();
+        for (int j = 0; j < scopeList.getLength(); j++) {
+          Node scopeNode = scopeList.item(j);
+          if (!(scopeNode instanceof Element))
+            continue;
+
+          Element scopeElement = (Element) scopeNode;
+
+          if ("default".equals(scopeElement.getTagName())) {
+
+            // Determine realm and scheme, if any
+            String realm = scopeElement.getAttribute("realm");
+            String scheme = scopeElement.getAttribute("scheme");
+
+            // Set default credentials
+            defaultUsername = username;
+            defaultPassword = password;
+            defaultRealm = realm;
+            defaultScheme = scheme;
+
+            if (LOG.isTraceEnabled()) {
+              LOG.trace("Credentials - username: " + username 
+                  + "; set as default"
+                  + " for realm: " + realm + "; scheme: " + scheme);
+            }
+
+          } else if ("authscope".equals(scopeElement.getTagName())) {
+
+            // Determine authentication scope details
+            String host = scopeElement.getAttribute("host");
+            int port = -1; // For setting port to AuthScope.ANY_PORT
+            try {
+              port = Integer.parseInt(
+                  scopeElement.getAttribute("port"));
+            } catch (NumberFormatException ex) {
+              // Missing or non-numeric port attribute: keep "any port"
+            }
+            String realm = scopeElement.getAttribute("realm");
+            String scheme = scopeElement.getAttribute("scheme");
+
+            // Set credentials for the determined scope
+            AuthScope authScope = getAuthScope(host, port, realm, scheme);
+            NTCredentials credentials = new NTCredentials(
+                username, password, agentHost, realm);
+
+            client.getState().setCredentials(authScope, credentials);
+
+            if (LOG.isTraceEnabled()) {
+              LOG.trace("Credentials - username: " + username
+                  + "; set for AuthScope - " + "host: " + host
+                  + "; port: " + port + "; realm: " + realm
+                  + "; scheme: " + scheme);
+            }
+
+          } else {
+            if (LOG.isWarnEnabled())
+              LOG.warn("Bad auth conf file: Element <"
+                  + scopeElement.getTagName() + "> not recognized in "
+                  + authFile + " - expected <authscope>");
+          }
+        }
       }
     }
-    if (LOG.isInfoEnabled()) { LOG.info("Configured Client"); }
+  }
+
+  /**
+   * If credentials for the authentication scope determined from the
+   * specified <code>url</code> are not already set in the HTTP client,
+   * then this method sets the default credentials to fetch the
+   * specified <code>url</code>. If credentials are found for the
+   * authentication scope, the method returns without altering the
+   * client.
+   *
+   * @param url URL to be fetched
+   */
+  private void resolveCredentials(URL url) {
+
+    // Without a default user name there is nothing to fall back to.
+    if (defaultUsername == null || defaultUsername.length() == 0)
+      return;
+
+    final String host = url.getHost();
+
+    // URL.getPort() yields -1 when the URL carries no explicit port; use
+    // the well-known port of the protocol in that case.
+    int port = url.getPort();
+    if (port == -1)
+      port = "https".equals(url.getProtocol()) ? 443 : 80;
+
+    // Look up credentials by host and port only.
+    final boolean preConfigured =
+        client.getState().getCredentials(new AuthScope(host, port)) != null;
+
+    if (preConfigured) {
+      if (LOG.isTraceEnabled())
+        LOG.trace("Pre-configured credentials with scope - host: "
+            + host + "; port: " + port
+            + "; found for url: " + url);
+
+      // Leave the client untouched: credentials already exist.
+      return;
+    }
+
+    if (LOG.isTraceEnabled())
+      LOG.trace("Pre-configured credentials with scope -  host: "
+          + host + "; port: " + port
+          + "; not found for url: " + url);
+
+    // Register the default credentials for this host and port, limited to
+    // the default realm and scheme when those are configured.
+    client.getState().setCredentials(
+        getAuthScope(host, port, defaultRealm, defaultScheme),
+        new NTCredentials(defaultUsername, defaultPassword,
+            agentHost, defaultRealm));
+  }
+
+  /**
+   * Returns an authentication scope for the specified
+   * <code>host</code>, <code>port</code>, <code>realm</code> and
+   * <code>scheme</code>.
+   *
+   * @param host    Host name or address.
+   * @param port    Port number.
+   * @param realm   Authentication realm.
+   * @param scheme  Authentication scheme.
+   */
+  private static AuthScope getAuthScope(String host, int port,
+      String realm, String scheme) {
+
+    // AuthScope uses null for "any host/realm/scheme" and -1 for "any
+    // port". Blank values are normalised to those wildcards; values that
+    // are already null are passed through instead of causing a
+    // NullPointerException.
+    if (host != null && host.length() == 0)
+      host = null;
+
+    if (port < 0)
+      port = -1;
+
+    if (realm != null && realm.length() == 0)
+      realm = null;
+
+    if (scheme != null && scheme.length() == 0)
+      scheme = null;
+
+    return new AuthScope(host, port, realm, scheme);
+  }
+
+  /**
+   * Returns an authentication scope for the specified
+   * <code>host</code>, <code>port</code> and <code>realm</code>.
+   *
+   * @param host    Host name or address.
+   * @param port    Port number.
+   * @param realm   Authentication realm.
+   */
+  private static AuthScope getAuthScope(String host, int port,
+      String realm) {
+
+      // The empty scheme string is converted to null by the four-argument
+      // overload, so no specific scheme is passed on to AuthScope.
+      return getAuthScope(host, port, realm, "");
   }
 }
+

Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=608972&r1=608971&r2=608972&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 Fri Jan  4 11:48:32 2008
@@ -21,11 +21,6 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
-import java.util.Date;
-
-// Commons Logging imports
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 
 // HTTP Client imports
 import org.apache.commons.httpclient.Header;
@@ -33,6 +28,7 @@
 import org.apache.commons.httpclient.cookie.CookiePolicy;
 import org.apache.commons.httpclient.methods.GetMethod;
 import org.apache.commons.httpclient.params.HttpMethodParams;
+import org.apache.commons.httpclient.HttpException;
 
 // Nutch imports
 import org.apache.nutch.crawl.CrawlDatum;
@@ -41,46 +37,45 @@
 import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.http.api.HttpBase;
-import org.apache.nutch.util.LogUtil;
-
 
 /**
  * An HTTP response.
+ *
+ * @author Susam Pal
  */
 public class HttpResponse implements Response {
 
-  public final static Log LOG = LogFactory.getLog(HttpResponse.class);
-
   private URL url;
-  
-  private String orig;
-
-  private String base;
-
   private byte[] content;
-
-  private HttpBase http;
-
   private int code;
-
   private Metadata headers = new SpellCheckedMetadata();
 
-  
-  public HttpResponse(HttpBase http, URL url, CrawlDatum datum) throws 
IOException {
-    this(http, url, datum, false);
-  }
+  /**
+   * Fetches the given <code>url</code> and prepares HTTP response.
+   *
+   * @param http                An instance of the implementation class
+   *                            of this plugin
+   * @param url                 URL to be fetched
+   * @param datum               Crawl data
+   * @param followRedirects     Whether to follow redirects; follows
+   *                            redirect if and only if this is true
+   * @return                    HTTP response
+   * @throws IOException        When an error occurs
+   */
+  HttpResponse(Http http, URL url, CrawlDatum datum,
+      boolean followRedirects) throws IOException {
 
-  
-  HttpResponse(HttpBase http, URL url, CrawlDatum datum, boolean 
followRedirects) throws IOException {
-    this.http = http;
+    // Prepare GET method for HTTP request
     this.url = url;
-    this.base = url.toString();
-    this.orig = url.toString();
-    GetMethod get = new GetMethod(this.orig);
+    GetMethod get = new GetMethod(url.toString());
     get.setFollowRedirects(followRedirects);
-    get.setRequestHeader("User-Agent", http.getUserAgent());
-    if (datum.getModifiedTime() > 0)
-      get.setRequestHeader("If-Modified-Since", 
HttpDateFormat.toString(datum.getModifiedTime()));
+    get.setDoAuthentication(true);
+    if (datum.getModifiedTime() > 0) {
+      get.setRequestHeader("If-Modified-Since",
+          HttpDateFormat.toString(datum.getModifiedTime()));
+    }
+
+    // Set HTTP parameters
     HttpMethodParams params = get.getParams();
     if (http.getUseHttp11()) {
       params.setVersion(HttpVersion.HTTP_1_1);
@@ -104,38 +99,75 @@
         headers.set(heads[i].getName(), heads[i].getValue());
       }
       
+      // Limit download size
+      int contentLength = Integer.MAX_VALUE;
+      String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+      if (contentLengthString != null) {
+        try {
+          contentLength = Integer.parseInt(contentLengthString.trim());
+        } catch (NumberFormatException ex) {
+          throw new HttpException("bad content length: " +
+              contentLengthString);
+        }
+      }
+      if (http.getMaxContent() >= 0 &&
+          contentLength > http.getMaxContent()) {
+        contentLength = http.getMaxContent();
+      }
+
       // always read content. Sometimes content is useful to find a cause
       // for error.
+      InputStream in = get.getResponseBodyAsStream();
       try {
-        InputStream in = get.getResponseBodyAsStream();
-        byte[] buffer = new byte[http.BUFFER_SIZE];
+        byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
         int bufferFilled = 0;
         int totalRead = 0;
         ByteArrayOutputStream out = new ByteArrayOutputStream();
-        int tryAndRead = calculateTryToRead(totalRead);
-        while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1 && 
tryAndRead > 0) {
+        while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1
+            && totalRead < contentLength) {
           totalRead += bufferFilled;
           out.write(buffer, 0, bufferFilled);
-          tryAndRead = calculateTryToRead(totalRead);
         }
 
         content = out.toByteArray();
-        in.close();
       } catch (Exception e) {
         if (code == 200) throw new IOException(e.toString());
         // for codes other than 200 OK, we are fine with empty content
+      } finally {
+        in.close();
+        get.abort();
       }
+      
+      StringBuilder fetchTrace = null;
+      if (Http.LOG.isTraceEnabled()) {
+        // Trace message
+        fetchTrace = new StringBuilder("url: " + url +
+            "; status code: " + code +
+            "; bytes received: " + content.length);
+        if (getHeader(Response.CONTENT_LENGTH) != null)
+          fetchTrace.append("; Content-Length: " +
+              getHeader(Response.CONTENT_LENGTH));
+        if (getHeader(Response.LOCATION) != null)
+          fetchTrace.append("; Location: " + getHeader(Response.LOCATION));
+      }
+      // Extract gzip and x-gzip files
       if (content != null) {
         // check if we have to uncompress it
         String contentEncoding = headers.get(Response.CONTENT_ENCODING);
-        if ("gzip".equals(contentEncoding) || 
"x-gzip".equals(contentEncoding)) {
+        if (contentEncoding != null && Http.LOG.isTraceEnabled())
+          fetchTrace.append("; Content-Encoding: " + contentEncoding);
+        if ("gzip".equals(contentEncoding) ||
+            "x-gzip".equals(contentEncoding)) {
           content = http.processGzipEncoded(content, url);
+          if (Http.LOG.isTraceEnabled())
+            fetchTrace.append("; extracted to " + content.length + " bytes");
         }
       }
-    } catch (org.apache.commons.httpclient.ProtocolException pe) {
-      pe.printStackTrace(LogUtil.getErrorStream(LOG));
-      get.releaseConnection();
-      throw new IOException(pe.toString());
+
+      // Log trace message
+      if (Http.LOG.isTraceEnabled()) {
+        Http.LOG.trace(fetchTrace);
+      }
     } finally {
       get.releaseConnection();
     }
@@ -169,17 +201,5 @@
   /* -------------------------- *
    * </implementation:Response> *
    * -------------------------- */
-
-  
-
-  private int calculateTryToRead(int totalRead) {
-    int tryToRead = Http.BUFFER_SIZE;
-    if (http.getMaxContent() <= 0) {
-      return http.BUFFER_SIZE;
-    } else if (http.getMaxContent() - totalRead < http.BUFFER_SIZE) {
-      tryToRead = http.getMaxContent() - totalRead;
-    }
-    return tryToRead;
-  }
-
 }
+

Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package.html
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package.html?rev=608972&r1=608971&r2=608972&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package.html
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package.html
 Fri Jan  4 11:48:32 2008
@@ -1,7 +1,9 @@
 <html>
 <body>
-<p>Protocol plugin which supports retrieving documents via the HTTP 
protocol.</p>
-<p>This plugin is based on Jakarta Commons HttpClient library, and handles
-also HTTPS and cookies.</p>
+<p>Protocol plugin which supports retrieving documents via the HTTP and
+HTTPS protocols, optionally with Basic, Digest and NTLM authentication
+schemes for web server as well as proxy server. It handles cookies
+within a single fetch operation. This plugin is based on Jakarta
+Commons HttpClient library.</p>
 </body>
 </html>

Added: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/httpclient-auth-test.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/httpclient-auth-test.xml?rev=608972&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/httpclient-auth-test.xml
 (added)
+++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/httpclient-auth-test.xml
 Fri Jan  4 11:48:32 2008
@@ -0,0 +1,42 @@
+<?xml version="1.0"?>
+
+<auth-configuration>
+
+  <!-- Default credentials -->
+  <credentials username="userx" password="passx">
+    <default/>
+    <authscope host="127.0.0.1" port="47500"/>
+  </credentials>
+
+  <!-- Defined a realm for 127.0.0.1:47501 so that authentication for
+       other realms fails (except that another realm for 127.0.0.1:47501
+       is defined below for the NTLM scheme). -->
+  <credentials username="userx" password="passx">
+    <authscope host="127.0.0.1" port="47501" realm="realmx"
+    scheme="BASIC"/>
+  </credentials>
+
+  <!-- Test case for NTLM authentication scheme. -->
+  <credentials username="ntlm_user" password="ntlm_pass">
+    <authscope host="127.0.0.1" port="47501" realm="NUTCH"
+    scheme="NTLM"/>
+  </credentials>
+
+  <!-- Test case for credentials selection based on scheme (realm1 is
+       present in basic.jsp as well as digest.jsp).
+       Also tests Digest authentication scheme. -->
+  <credentials username="digest_user" password="digest_pass">
+    <authscope host="127.0.0.1" port="47500" realm="realm1"
+    scheme="DIGEST"/>
+  </credentials>
+
+  <!-- Test case for Basic authentication scheme. -->
+  <credentials username="user1" password="pass1">
+    <authscope host="127.0.0.1" port="47500" realm="realm1"/>
+  </credentials>
+  <credentials username="user2" password="pass2">
+    <authscope host="127.0.0.1" port="47500" realm="realm2"/>
+  </credentials>
+
+</auth-configuration>
+

Added: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/nutch-site-test.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/nutch-site-test.xml?rev=608972&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/nutch-site-test.xml
 (added)
+++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/conf/nutch-site-test.xml
 Fri Jan  4 11:48:32 2008
@@ -0,0 +1,36 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<configuration>
+
+<property>
+  <name>http.robots.agents</name>
+  <value>Nutch-Test,*</value>
+  <description></description>
+</property>
+
+<property>
+  <name>http.agent.name</name>
+  <value>Nutch-Test</value>
+  <description></description>
+</property>
+
+<property>
+  <name>http.agent.description</name>
+  <value>Nutch protocol-httpclient test</value>
+  <description></description>
+</property>
+
+<property>
+  <name>http.auth.file</name>
+  <value>httpclient-auth-test.xml</value>
+  <description></description>
+</property>
+
+<property>
+  <name>http.timeout</name>
+  <value>60000</value>
+  <description></description>
+</property>
+
+</configuration>

Added: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java?rev=608972&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java
 Fri Jan  4 11:48:32 2008
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.httpclient;
+
+import java.net.URL;
+import java.net.MalformedURLException;
+import junit.framework.TestCase;
+import org.mortbay.jetty.Server;
+import org.mortbay.jetty.servlet.ServletHttpContext;
+import org.mortbay.jetty.servlet.ServletHandler;
+import org.mortbay.http.SocketListener;
+import org.mortbay.http.handler.ResourceHandler;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+
+/**
+ * Test cases for protocol-httpclient.
+ *
+ * @author Susam Pal
+ */
+public class TestProtocolHttpClient extends TestCase {
+
+  private Server server;
+  private Configuration conf;
+  private static final String RES_DIR = System.getProperty("test.data", ".");
+  private int port;
+  // Assigned in setUp(), so every test starts with a fresh instance.
+  private Http http;
+
+  /**
+   * Prepares (but does not start) a Jetty server whose resource base is
+   * <code>RES_DIR</code> and which handles <code>*.jsp</code> requests,
+   * and configures a new {@link Http} instance from the test
+   * configuration files.
+   *
+   * @throws Exception If the fixture can not be created.
+   */
+  protected void setUp() throws Exception {
+
+    ServletHttpContext context = new ServletHttpContext();
+    context.setContextPath("/");
+    context.setResourceBase(RES_DIR);
+    context.addServlet("JSP", "*.jsp", "org.apache.jasper.servlet.JspServlet");
+    context.addHandler(new ResourceHandler());
+
+    server = new Server();
+    server.addContext(context);
+
+    conf = new Configuration();
+    conf.addDefaultResource("nutch-default.xml");
+    conf.addResource("nutch-site-test.xml");
+
+    http = new Http();
+    http.setConf(conf);
+  }
+
+  /**
+   * Stops the Jetty server.
+   *
+   * @throws Exception If the server can not be stopped.
+   */
+  protected void tearDown() throws Exception {
+    server.stop();
+  }
+
+  /**
+   * Tests whether the client can remember cookies.
+   *
+   * @throws Exception If an error occurs or the test case fails.
+   */
+  public void testCookies() throws Exception {
+    startServer(47500);
+    fetchPage("/cookies.jsp", 200);
+    fetchPage("/cookies.jsp?cookie=yes", 200);
+  }
+
+  /**
+   * Tests that no pre-emptive authorization headers are sent by the
+   * client.
+   *
+   * @throws Exception If an error occurs or the test case fails.
+   */
+  public void testNoPreemptiveAuth() throws Exception {
+    startServer(47500);
+    fetchPage("/noauth.jsp", 200);
+  }
+
+  /**
+   * Tests default credentials.
+   *
+   * @throws Exception If an error occurs or the test case fails.
+   */
+  public void testDefaultCredentials() throws Exception {
+    startServer(47502);
+    fetchPage("/basic.jsp", 200);
+  }
+
+  /**
+   * Tests basic authentication scheme for various realms.
+   *
+   * @throws Exception If an error occurs or the test case fails.
+   */
+  public void testBasicAuth() throws Exception {
+    startServer(47500);
+    fetchPage("/basic.jsp", 200);
+    fetchPage("/basic.jsp?case=1", 200);
+    fetchPage("/basic.jsp?case=2", 200);
+  }
+
+  /**
+   * Tests that authentication happens for a defined realm and not for
+   * other realms for a host:port when an extra <code>authscope</code>
+   * tag is not defined to match all other realms.
+   *
+   * @throws Exception If an error occurs or the test case fails.
+   */
+  public void testOtherRealmsNoAuth() throws Exception {
+    startServer(47501);
+    fetchPage("/basic.jsp", 200);
+    fetchPage("/basic.jsp?case=1", 401);
+    fetchPage("/basic.jsp?case=2", 401);
+  }
+
+  /**
+   * Tests Digest authentication scheme.
+   *
+   * @throws Exception If an error occurs or the test case fails.
+   */
+  public void testDigestAuth() throws Exception {
+    startServer(47500);
+    fetchPage("/digest.jsp", 200);
+  }
+
+  /**
+   * Tests NTLM authentication scheme.
+   *
+   * @throws Exception If an error occurs or the test case fails.
+   */
+  public void testNtlmAuth() throws Exception {
+    startServer(47501);
+    fetchPage("/ntlm.jsp", 200);
+  }
+
+  /**
+   * Starts the Jetty server at a specified port.
+   *
+   * @param  portno     Port number.
+   * @throws Exception  When an error occurs.
+   */
+  private void startServer(int portno) throws Exception {
+    port = portno;
+    SocketListener listener = new SocketListener();
+    listener.setHost("127.0.0.1");
+    listener.setPort(port);
+    server.addListener(listener);
+    server.start();
+  }
+
+  /**
+   * Fetches the specified <code>page</code> from the local Jetty server
+   * and checks whether the HTTP response status code matches with the
+   * expected code.
+   *
+   * @param page          Page to be fetched.
+   * @param expectedCode  HTTP response status code expected while
+   *                      fetching the page.
+   * @throws Exception    When an error occurs or test case fails.
+   */
+  private void fetchPage(String page, int expectedCode)
+      throws Exception {
+    URL url = getURL(page);
+    Response response = http.getResponse(url, new CrawlDatum(), true);
+
+    int code = response.getCode();
+    assertEquals("HTTP Status Code for " + url, expectedCode, code);
+  }
+
+  /**
+   * Returns a URL to the specified page on the local Jetty server,
+   * using the port most recently passed to <code>startServer</code>.
+   *
+   * @param  page                  Page available in the local Jetty
+   *                               server.
+   * @throws MalformedURLException If a URL can not be formed.
+   */
+  private URL getURL(String page) throws MalformedURLException {
+    return new URL("http", "127.0.0.1", port, page);
+  }
+}


Reply via email to