Author: mattmann
Date: Sun Jul  5 02:47:23 2015
New Revision: 1689209

URL: http://svn.apache.org/r1689209
Log:
Fix for NUTCH-2059: protocol-httpclient, protocol-http unit test errors on
Jenkins. Contributed by Peter Ciuffetti <[email protected]>; this closes
#45.

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/protocol-httpclient/jsp/basic.jsp
    nutch/trunk/src/plugin/protocol-httpclient/jsp/cookies.jsp
    nutch/trunk/src/plugin/protocol-httpclient/jsp/digest.jsp
    nutch/trunk/src/plugin/protocol-httpclient/jsp/noauth.jsp
    nutch/trunk/src/plugin/protocol-httpclient/jsp/ntlm.jsp
    
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1689209&r1=1689208&r2=1689209&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sun Jul  5 02:47:23 2015
@@ -2,6 +2,8 @@ Nutch Change Log
   
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-2059 protocol-httpclient, protocol-http unit test errors on Jenkins (Peter Ciuffetti via mattmann)
+
 * NUTCH-1980 Jexl expressions for CrawlDbReader (markus)
 
 * NUTCH-1692 SegmentReader was broken in distributed mode (markus, tejasp)

Modified: nutch/trunk/src/plugin/protocol-httpclient/jsp/basic.jsp
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/jsp/basic.jsp?rev=1689209&r1=1689208&r2=1689209&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-httpclient/jsp/basic.jsp (original)
+++ nutch/trunk/src/plugin/protocol-httpclient/jsp/basic.jsp Sun Jul  5 02:47:23 2015
@@ -13,8 +13,7 @@
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
---%>
-<%--
+--%><%--
   This JSP demonstrates basic authentication. When this JSP page is
   requested with no query parameters, then the user must enter the
   username as 'userx' and password as 'passx' when prompted for
@@ -25,11 +24,9 @@
   code below.
 
   Author: Susam Pal
---%>
-<%@ page
+--%><%@ page
     import = "sun.misc.BASE64Decoder"
-%>
-<%
+%><%
   String authHeader = request.getHeader("Authorization");
   String realm = null;
   String username = null;
@@ -74,4 +71,4 @@
 </html>
 <%
   }
-%>
+%>
\ No newline at end of file

Modified: nutch/trunk/src/plugin/protocol-httpclient/jsp/cookies.jsp
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/jsp/cookies.jsp?rev=1689209&r1=1689208&r2=1689209&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-httpclient/jsp/cookies.jsp (original)
+++ nutch/trunk/src/plugin/protocol-httpclient/jsp/cookies.jsp Sun Jul  5 02:47:23 2015
@@ -13,8 +13,7 @@
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
---%>
-<%--
+--%><%--
   This JSP tests whether the client can remember cookies. When the JSP
   is fetched for the first time without any query parameters, it sets
   a few cookies in the client. On a second request, with the query
@@ -23,8 +22,7 @@
   If the cookies are not found, HTTP 403 response is returned.
 
   Author: Susam Pal
---%>
-<%
+--%><%
   String cookieParam = request.getParameter("cookie");
   if (!"yes".equals(cookieParam)) { // Send cookies
     response.addCookie(new Cookie("var1", "val1"));
@@ -62,4 +60,4 @@
 <%
     }
   }
-%>
+%>
\ No newline at end of file

Modified: nutch/trunk/src/plugin/protocol-httpclient/jsp/digest.jsp
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/jsp/digest.jsp?rev=1689209&r1=1689208&r2=1689209&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-httpclient/jsp/digest.jsp (original)
+++ nutch/trunk/src/plugin/protocol-httpclient/jsp/digest.jsp Sun Jul  5 02:47:23 2015
@@ -13,8 +13,7 @@
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
---%>
-<%--
+--%><%--
   This JSP tests digest authentication. It generates an HTTP response
   with authorization header for digest authentication and checks the
   user-name supplied by the client. It does not check the other
@@ -23,12 +22,10 @@
   be tested.
 
   Author: Susam Pal
---%>
-<%@ page
+--%><%@ page
     import = "java.util.StringTokenizer"
     import = "java.util.HashMap"
-%>
-<%
+%><%
   String username = "digest_user";
   String authHeader = request.getHeader("Authorization");
   
@@ -68,4 +65,4 @@
 </html>
 <%
   }
-%>
+%>
\ No newline at end of file

Modified: nutch/trunk/src/plugin/protocol-httpclient/jsp/noauth.jsp
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/jsp/noauth.jsp?rev=1689209&r1=1689208&r2=1689209&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-httpclient/jsp/noauth.jsp (original)
+++ nutch/trunk/src/plugin/protocol-httpclient/jsp/noauth.jsp Sun Jul  5 02:47:23 2015
@@ -13,16 +13,14 @@
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
---%>
-<%--
+--%><%--
   This JSP tests whether the client is sending any pre-emptive
   authentication headers. The client is expected not to send pre-emptive
   authentication headers. If such authentication headers are found, this
   JSP will return an HTTP 403 response; HTTP 200 response otherwise.
 
   Author: Susam Pal
---%>
-<%
+--%><%
   if (request.getHeader("Authorization") != null) {
     response.sendError(response.SC_UNAUTHORIZED);
   } else {
@@ -35,4 +33,4 @@
 </html>
 <%
   }
-%>
+%>
\ No newline at end of file

Modified: nutch/trunk/src/plugin/protocol-httpclient/jsp/ntlm.jsp
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/jsp/ntlm.jsp?rev=1689209&r1=1689208&r2=1689209&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-httpclient/jsp/ntlm.jsp (original)
+++ nutch/trunk/src/plugin/protocol-httpclient/jsp/ntlm.jsp Sun Jul  5 02:47:23 2015
@@ -13,8 +13,7 @@
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
---%>
-<%--
+--%><%--
   This JSP tests NTLM authentication. It generates an HTTP response
   with authorization header for NTLM authentication and checks the
   user-name supplied by the client. It does not check the other
@@ -23,12 +22,10 @@
   be tested.
 
   Author: Susam Pal
---%>
-<%@ page
+--%><%@ page
     import = "sun.misc.BASE64Decoder"
     import = "sun.misc.BASE64Encoder"
-%>
-<%
+%><%
   String authHeader = request.getHeader("Authorization");
   String username = null;
   String domain = null;
@@ -89,4 +86,4 @@
 </html>
 <%
   }
-%>
+%>
\ No newline at end of file

Modified: 
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=1689209&r1=1689208&r2=1689209&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original)
+++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Sun Jul  5 02:47:23 2015
@@ -62,15 +62,24 @@ import org.apache.hadoop.conf.Configurat
 import org.apache.nutch.util.NutchConfiguration;
 
 /**
- * <p>This class is a protocol plugin that configures an HTTP client for Basic,
+ * <p>
+ * This class is a protocol plugin that configures an HTTP client for Basic,
  * Digest and NTLM authentication schemes for web server as well as proxy
  * server. It takes care of HTTPS protocol as well as cookies in a single fetch
- * session.</p>
- * <p>Documentation can be found on the Nutch <a href="https://wiki.apache.org/nutch/HttpAuthenticationSchemes">HttpAuthenticationSchemes</a>
- * wiki page.</p>
- * <p>The original description of the motivation to support <a href="https://wiki.apache.org/nutch/HttpPostAuthentication">HttpPostAuthentication</a>
- * is also included on the Nutch wiki. Additionally HttpPostAuthentication development is documented
- * at the <a href="https://issues.apache.org/jira/browse/NUTCH-827">NUTCH-827</a> Jira issue.
+ * session.
+ * </p>
+ * <p>
+ * Documentation can be found on the Nutch <a
+ * href="https://wiki.apache.org/nutch/HttpAuthenticationSchemes"
+ * >HttpAuthenticationSchemes</a> wiki page.
+ * </p>
+ * <p>
+ * The original description of the motivation to support <a
+ * href="https://wiki.apache.org/nutch/HttpPostAuthentication"
+ * >HttpPostAuthentication</a> is also included on the Nutch wiki. Additionally
+ * HttpPostAuthentication development is documented at the <a
+ * href="https://issues.apache.org/jira/browse/NUTCH-827">NUTCH-827</a> Jira
+ * issue.
  * 
  * @author Susam Pal
  */
@@ -178,7 +187,7 @@ public class Http extends HttpBase {
   private void configureClient() {
 
     // Set up an HTTPS socket factory that accepts self-signed certs.
-    //ProtocolSocketFactory factory = new SSLProtocolSocketFactory();
+    // ProtocolSocketFactory factory = new SSLProtocolSocketFactory();
     ProtocolSocketFactory factory = new DummySSLProtocolSocketFactory();
     Protocol https = new Protocol("https", factory, 443);
     Protocol.registerProtocol("https", https);
@@ -188,12 +197,20 @@ public class Http extends HttpBase {
     params.setSoTimeout(timeout);
     params.setSendBufferSize(BUFFER_SIZE);
     params.setReceiveBufferSize(BUFFER_SIZE);
-    params.setMaxTotalConnections(maxThreadsTotal);
+
+    // --------------------------------------------------------------------------------
+    // NUTCH-1836: Modification to increase the number of available connections
+    // for multi-threaded crawls.
+    // --------------------------------------------------------------------------------
+    params.setMaxTotalConnections(conf.getInt(
+        "mapred.tasktracker.map.tasks.maximum", 5)
+        * conf.getInt("fetcher.threads.fetch", maxThreadsTotal));
 
     // Also set max connections per host to maxThreadsTotal since all threads
     // might be used to fetch from the same host - otherwise timeout errors can
     // occur
-    params.setDefaultMaxConnectionsPerHost(maxThreadsTotal);
+    params.setDefaultMaxConnectionsPerHost(conf.getInt(
+        "fetcher.threads.fetch", maxThreadsTotal));
 
     // executeMethod(HttpMethod) seems to ignore the connection timeout on the
     // connection manager.
@@ -203,16 +220,16 @@ public class Http extends HttpBase {
     HostConfiguration hostConf = client.getHostConfiguration();
     ArrayList<Header> headers = new ArrayList<Header>();
     // Set the User Agent in the header
-    //headers.add(new Header("User-Agent", userAgent)); //NUTCH-1941
+    // headers.add(new Header("User-Agent", userAgent)); //NUTCH-1941
     // prefer English
     headers.add(new Header("Accept-Language", acceptLanguage));
     // prefer UTF-8
     headers.add(new Header("Accept-Charset", "utf-8,ISO-8859-1;q=0.7,*;q=0.7"));
     // prefer understandable formats
     headers
-    .add(new Header(
-        "Accept",
-        "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
+        .add(new Header(
+            "Accept",
+            "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
     // accept gzipped content
     headers.add(new Header("Accept-Encoding", "x-gzip, gzip, deflate"));
     hostConf.getParams().setParameter("http.default-headers", headers);
@@ -287,8 +304,7 @@ public class Http extends HttpBase {
         String authMethod = credElement.getAttribute("authMethod");
         // read http form post auth info
         if (StringUtils.isNotBlank(authMethod)) {
-          formConfigurer = readFormAuthConfigurer(credElement,
-              authMethod);
+          formConfigurer = readFormAuthConfigurer(credElement, authMethod);
           continue;
         }
 
@@ -361,12 +377,12 @@ public class Http extends HttpBase {
   }
 
   /**
-   * <auth-configuration> <credentials authMethod="formAuth"
-   * loginUrl="loginUrl" loginFormId="loginFormId" loginRedirect="true">
-   * <loginPostData> <field name="username" value="user1"/> </loginPostData>
-   * <additionalPostHeaders> <field name="header1" value="vaule1"/>
-   * </additionalPostHeaders> <removedFormFields> <field name="header1"/>
-   * </removedFormFields> </credentials> </auth-configuration>
+   * <auth-configuration> <credentials authMethod="formAuth" loginUrl="loginUrl"
+   * loginFormId="loginFormId" loginRedirect="true"> <loginPostData> <field
+   * name="username" value="user1"/> </loginPostData> <additionalPostHeaders>
+   * <field name="header1" value="vaule1"/> </additionalPostHeaders>
+   * <removedFormFields> <field name="header1"/> </removedFormFields>
+   * </credentials> </auth-configuration>
    */
   private static HttpFormAuthConfigurer readFormAuthConfigurer(
       Element credElement, String authMethod) {
@@ -424,8 +440,7 @@ public class Http extends HttpBase {
             String value = fieldElement.getAttribute("value");
             additionalPostHeaders.put(name, value);
           }
-          formConfigurer
-          .setAdditionalPostHeaders(additionalPostHeaders);
+          formConfigurer.setAdditionalPostHeaders(additionalPostHeaders);
         } else if ("removedFormFields".equals(element.getTagName())) {
           Set<String> removedFormFields = new HashSet<String>();
           NodeList childNodes = element.getChildNodes();


Reply via email to