Author: ab
Date: Thu Jul 21 04:07:10 2005
New Revision: 220034

URL: http://svn.apache.org/viewcvs?rev=220034&view=rev
Log:
Fixes for NUTCH-66, and some cleanup:

* apply the more lenient CookiePolicy. This fixes the problem with poorly
  formatted cookies being rejected.

* remove the while() loop, which was required only to handle retries related
  to authentication. commons-httpclient can handle this internally, and the
  present code was broken anyway. This is not a complete solution yet - the
  process of loading and presenting credentials needs to be developed further.

* clean up deprecation warnings by setting parameters at the correct level
  and in the new preferred way.

* correctly set the maximum number of threads.

* add MIME magic handling.


Modified:
    
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
    
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
    
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java

Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java?rev=220034&r1=220033&r2=220034&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
 Thu Jul 21 04:07:10 2005
@@ -18,14 +18,14 @@
 import org.apache.commons.httpclient.HttpClientError;
 import org.apache.commons.httpclient.params.HttpConnectionParams;
 import org.apache.commons.httpclient.protocol.ControllerThreadSocketFactory;
-import org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory;
+import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
 import com.sun.net.ssl.SSLContext;
 import com.sun.net.ssl.TrustManager;
 
-public class DummySSLProtocolSocketFactory implements 
SecureProtocolSocketFactory {
+public class DummySSLProtocolSocketFactory implements ProtocolSocketFactory {
 
   /** Log object for this class. */
   private static final Log LOG = 
LogFactory.getLog(DummySSLProtocolSocketFactory.class);

Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=220034&r1=220033&r2=220034&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
 Thu Jul 21 04:07:10 2005
@@ -9,18 +9,16 @@
 import java.net.UnknownHostException;
 import java.util.HashMap;
 import java.util.LinkedList;
-import java.util.Properties;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
-import org.apache.commons.httpclient.Cookie;
 import org.apache.commons.httpclient.Credentials;
 import org.apache.commons.httpclient.HostConfiguration;
 import org.apache.commons.httpclient.HttpClient;
-import org.apache.commons.httpclient.HttpState;
 import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
 import org.apache.commons.httpclient.NTCredentials;
-import org.apache.commons.httpclient.params.HttpConnectionParams;
+import org.apache.commons.httpclient.auth.AuthScope;
+import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
 import org.apache.commons.httpclient.protocol.Protocol;
 import org.apache.nutch.db.Page;
 import org.apache.nutch.pagedb.FetchListEntry;
@@ -47,7 +45,6 @@
   }
 
   static final int BUFFER_SIZE = 8 * 1024;
-  private static final int MAX_REDIRECTS = 
NutchConf.get().getInt("http.redirect.max", 3);
   private static MultiThreadedHttpConnectionManager connectionManager = new 
MultiThreadedHttpConnectionManager();
   private static HttpClient client;
 
@@ -102,8 +99,6 @@
    */
   private static LinkedList BLOCKED_ADDR_QUEUE = new LinkedList();
 
-  private RobotRulesParser robotRules = new RobotRulesParser();
-
   private static InetAddress blockAddr(URL url) throws ProtocolException {
     InetAddress addr;
     try {
@@ -183,7 +178,6 @@
   }
 
   public ProtocolOutput getProtocolOutput(String urlString) {
-    ProtocolOutput output = null;
     try {
       return getProtocolOutput(new FetchListEntry(true, new Page(urlString, 
1.0f), new String[0]));
     } catch (MalformedURLException mue) {
@@ -196,9 +190,6 @@
     try {
       URL url = new URL(urlString);
 
-      int redirects = 0;
-      HttpAuthentication auth = null;
-      while (true) {
         try {
           if (!RobotRulesParser.isAllowed(url))
                   return new ProtocolOutput(null, new 
ProtocolStatus(ProtocolStatus.ROBOTS_DENIED, url));
@@ -210,7 +201,7 @@
         InetAddress addr = blockAddr(url);
         HttpResponse response;
         try {
-          response = new HttpResponse(urlString, url); // make a request
+          response = new HttpResponse(url); // make a request
         } finally {
           unblockAddr(addr);
         }
@@ -255,19 +246,10 @@
         } else if (code == 400) { // bad request, mark as GONE
           LOG.fine("400 Bad request: " + url);
           return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, 
url));
-        } else if (code == 401) { // requires authorization
+        } else if (code == 401) { // requires authorization, but no valid auth 
provided.
           LOG.fine("401 Authentication Required");
-          if (redirects == MAX_REDIRECTS)
-                  return new ProtocolOutput(c, new 
ProtocolStatus(ProtocolStatus.REDIR_EXCEEDED,
-                          "Too many redirects: " + urlString));
-          Properties p = c.getMetadata();
-          if (p instanceof MultiProperties) {
-            auth = 
HttpAuthenticationFactory.findAuthentication((MultiProperties) p);
-          } else {
-            return new ProtocolOutput(c, new 
ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authorization required: "
-                    + urlString));
-          }
-          redirects++;
+          return new ProtocolOutput(c, new 
ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: "
+                  + urlString));
         } else if (code == 404) {
           return new ProtocolOutput(c, new 
ProtocolStatus(ProtocolStatus.NOTFOUND, url));
         } else if (code == 410) { // permanently GONE
@@ -276,7 +258,6 @@
           return new ProtocolOutput(c, new 
ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url="
                   + url));
         }
-      }
     } catch (Throwable e) {
       e.printStackTrace();
       return new ProtocolOutput(null, new ProtocolStatus(e));
@@ -371,55 +352,33 @@
     // get a client isntance -- we just need one.
 
     client = new HttpClient(connectionManager);
-    // this is just to add logging, whenever cookies are added.
-    client.setState(new NutchHttpState());
 
     // Set up an HTTPS socket factory that accepts self-signed certs.
     Protocol dummyhttps = new Protocol("https", new 
DummySSLProtocolSocketFactory(), 443);
     Protocol.registerProtocol("https", dummyhttps);
     
-    // set up the connection manager
-    // hardcoded for now
-
-    connectionManager.setMaxTotalConnections(MAX_THREADS_TOTAL);
-    //if (MAX_THREADS_TOTAL > MAX_THREADS_PER_HOST) {
-    //  connectionManager.setMaxConnectionsPerHost(MAX_THREADS_PER_HOST);
-    //} else {
-    //  connectionManager.setMaxConnectionsPerHost(MAX_THREADS_TOTAL);
-    //}
-
-    HttpConnectionParams params = connectionManager.getParams();
+    HttpConnectionManagerParams params = connectionManager.getParams();
     params.setConnectionTimeout(TIMEOUT);
     params.setSoTimeout(TIMEOUT);
     params.setSendBufferSize(BUFFER_SIZE);
     params.setReceiveBufferSize(BUFFER_SIZE);
+    params.setMaxTotalConnections(MAX_THREADS_TOTAL);
+    if (MAX_THREADS_TOTAL > MAX_THREADS_PER_HOST) {
+      params.setDefaultMaxConnectionsPerHost(MAX_THREADS_PER_HOST);
+    } else {
+      params.setDefaultMaxConnectionsPerHost(MAX_THREADS_TOTAL);
+    }
+
     HostConfiguration hostConf = client.getHostConfiguration();
     if (PROXY) {
       hostConf.setProxy(PROXY_HOST, PROXY_PORT);
     }
     if (NTLM_USERNAME.length() > 0) {
       Credentials ntCreds = new NTCredentials(NTLM_USERNAME, NTLM_PASSWORD, 
NTLM_HOST, NTLM_DOMAIN);
-      client.getState().setCredentials(null, null, ntCreds);
+      client.getState().setCredentials(new AuthScope(NTLM_HOST, 
AuthScope.ANY_PORT), ntCreds);
 
       LOG.info("Added NTLM credentials for " + NTLM_USERNAME);
     }
     LOG.info("Configured Client");
   }
-}
-
-class NutchHttpState extends HttpState {
-  public static final Logger LOG = 
LogFormatter.getLogger("org.apache.nutch.net.Http.NutchHttpState");
-  
-  public void addCookie(Cookie cookie) {
-    LOG.fine(" - setting cookie: " + cookie);
-    super.addCookie(cookie);
-  }
-  
-  public void addCookies(Cookie[] cookies) {
-    LOG.fine(" - setting cookies: ");
-    for (int i = 0; i < cookies.length; i++)
-      LOG.fine("   cookie: " + cookies[i]);
-    
-    super.addCookies(cookies);
-  }
-}
+}
\ No newline at end of file

Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=220034&r1=220033&r2=220034&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 Thu Jul 21 04:07:10 2005
@@ -4,24 +4,34 @@
 package org.apache.nutch.protocol.httpclient;
 
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.util.mime.MimeType;
+import org.apache.nutch.util.mime.MimeTypes;
 
 import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpVersion;
 
+import org.apache.commons.httpclient.cookie.CookiePolicy;
 import org.apache.commons.httpclient.methods.GetMethod;
+import org.apache.commons.httpclient.params.HttpMethodParams;
 
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
-import java.util.Properties;
-import java.util.List;
-import java.util.ListIterator;
 
 /**
  * An HTTP response.
  */
 public class HttpResponse {
+  /** A flag that tells if magic resolution must be performed */
+  private final static boolean MAGIC =
+        NutchConf.get().getBoolean("mime.type.magic", true);
+
+  /** Get the MimeTypes resolver instance. */
+  private final static MimeTypes MIME = 
+        MimeTypes.get(NutchConf.get().get("mime.types.file"));
+
   private String orig;
 
   private String base;
@@ -54,24 +64,40 @@
 
   public Content toContent() {
     String contentType = getHeader("Content-Type");
-    if (contentType == null) contentType = "";
+    if (contentType == null) {
+      MimeType type = null;
+      if (MAGIC) {
+        type = MIME.getMimeType(orig, content);
+      } else {
+        type = MIME.getMimeType(orig);
+      }
+      if (type != null) {
+          contentType = type.getName();
+      } else {
+          contentType = "";
+      }
+    }
     if (content == null) content = EMPTY_CONTENT;
     return new Content(orig, base, content, contentType, headers);
   }
 
-  public HttpResponse(URL url) throws ProtocolException, IOException {
-    this(url.toString(), url);
-  }
-
-  public HttpResponse(String orig, URL url) throws IOException {
-    this.orig = orig;
+  public HttpResponse(URL url) throws IOException {
     this.base = url.toString();
-    GetMethod get = new GetMethod(url.toString());
+    this.orig = url.toString();
+    GetMethod get = new GetMethod(this.orig);
     get.setFollowRedirects(false);
-    get.setStrictMode(false);
     get.setRequestHeader("User-Agent", Http.AGENT_STRING);
-    get.setHttp11(false);
-    get.setMethodRetryHandler(null);
+    HttpMethodParams params = get.getParams();
+    // some servers cannot digest the new protocol
+    params.setVersion(HttpVersion.HTTP_1_0);
+    params.makeLenient();
+    params.setContentCharset("UTF-8");
+    params.setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
+    params.setBooleanParameter(HttpMethodParams.SINGLE_COOKIE_HEADER, true);
+    // XXX (ab) not sure about this... the default is to retry 3 times; if
+    // XXX the request body was sent the method is not retried, so there is
+    // XXX little danger in retrying...
+    // params.setParameter(HttpMethodParams.RETRY_HANDLER, null);
     try {
       code = Http.getClient().executeMethod(get);
 
@@ -103,6 +129,7 @@
       }
     } catch (org.apache.commons.httpclient.ProtocolException pe) {
       pe.printStackTrace();
+      get.releaseConnection();
       throw new IOException(pe.toString());
     } finally {
       get.releaseConnection();


Reply via email to