Author: ab
Date: Thu Jul 21 04:07:10 2005
New Revision: 220034

URL: http://svn.apache.org/viewcvs?rev=220034&view=rev
Log:
Fixes for NUTCH-66, and some cleanup:

* apply the more lenient CookiePolicy. This fixes the problem with poorly
  formatted cookies being rejected.
* remove the while() loop, which was required only to handle retries related
  to authentication. commons-httpclient can handle this internally; and the
  present code was broken anyway. This is not a solution yet - the process of
  loading and presenting credentials needs to be developed further.
* cleaned up deprecation warnings by setting parameters at the correct level
  and in the new preferred way.
* correctly set the maximum number of threads.
* add MIME magic handling.

Modified:
    lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
    lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
    lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java

Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java?rev=220034&r1=220033&r2=220034&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java Thu Jul 21 04:07:10 2005 @@ -18,14 +18,14 @@ import org.apache.commons.httpclient.HttpClientError; import org.apache.commons.httpclient.params.HttpConnectionParams; import org.apache.commons.httpclient.protocol.ControllerThreadSocketFactory; -import org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory; +import 
org.apache.commons.httpclient.protocol.ProtocolSocketFactory; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import com.sun.net.ssl.SSLContext; import com.sun.net.ssl.TrustManager; -public class DummySSLProtocolSocketFactory implements SecureProtocolSocketFactory { +public class DummySSLProtocolSocketFactory implements ProtocolSocketFactory { /** Log object for this class. */ private static final Log LOG = LogFactory.getLog(DummySSLProtocolSocketFactory.class); Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=220034&r1=220033&r2=220034&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Thu Jul 21 04:07:10 2005 @@ -9,18 +9,16 @@ import java.net.UnknownHostException; import java.util.HashMap; import java.util.LinkedList; -import java.util.Properties; import java.util.logging.Level; import java.util.logging.Logger; -import org.apache.commons.httpclient.Cookie; import org.apache.commons.httpclient.Credentials; import org.apache.commons.httpclient.HostConfiguration; import org.apache.commons.httpclient.HttpClient; -import org.apache.commons.httpclient.HttpState; import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; import org.apache.commons.httpclient.NTCredentials; -import org.apache.commons.httpclient.params.HttpConnectionParams; +import org.apache.commons.httpclient.auth.AuthScope; +import org.apache.commons.httpclient.params.HttpConnectionManagerParams; import org.apache.commons.httpclient.protocol.Protocol; import 
org.apache.nutch.db.Page; import org.apache.nutch.pagedb.FetchListEntry; @@ -47,7 +45,6 @@ } static final int BUFFER_SIZE = 8 * 1024; - private static final int MAX_REDIRECTS = NutchConf.get().getInt("http.redirect.max", 3); private static MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager(); private static HttpClient client; @@ -102,8 +99,6 @@ */ private static LinkedList BLOCKED_ADDR_QUEUE = new LinkedList(); - private RobotRulesParser robotRules = new RobotRulesParser(); - private static InetAddress blockAddr(URL url) throws ProtocolException { InetAddress addr; try { @@ -183,7 +178,6 @@ } public ProtocolOutput getProtocolOutput(String urlString) { - ProtocolOutput output = null; try { return getProtocolOutput(new FetchListEntry(true, new Page(urlString, 1.0f), new String[0])); } catch (MalformedURLException mue) { @@ -196,9 +190,6 @@ try { URL url = new URL(urlString); - int redirects = 0; - HttpAuthentication auth = null; - while (true) { try { if (!RobotRulesParser.isAllowed(url)) return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.ROBOTS_DENIED, url)); @@ -210,7 +201,7 @@ InetAddress addr = blockAddr(url); HttpResponse response; try { - response = new HttpResponse(urlString, url); // make a request + response = new HttpResponse(url); // make a request } finally { unblockAddr(addr); } @@ -255,19 +246,10 @@ } else if (code == 400) { // bad request, mark as GONE LOG.fine("400 Bad request: " + url); return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, url)); - } else if (code == 401) { // requires authorization + } else if (code == 401) { // requires authorization, but no valid auth provided. 
LOG.fine("401 Authentication Required"); - if (redirects == MAX_REDIRECTS) - return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.REDIR_EXCEEDED, - "Too many redirects: " + urlString)); - Properties p = c.getMetadata(); - if (p instanceof MultiProperties) { - auth = HttpAuthenticationFactory.findAuthentication((MultiProperties) p); - } else { - return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authorization required: " - + urlString)); - } - redirects++; + return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: " + + urlString)); } else if (code == 404) { return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, url)); } else if (code == 410) { // permanently GONE @@ -276,7 +258,6 @@ return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" + url)); } - } } catch (Throwable e) { e.printStackTrace(); return new ProtocolOutput(null, new ProtocolStatus(e)); @@ -371,55 +352,33 @@ // get a client isntance -- we just need one. client = new HttpClient(connectionManager); - // this is just to add logging, whenever cookies are added. - client.setState(new NutchHttpState()); // Set up an HTTPS socket factory that accepts self-signed certs. 
Protocol dummyhttps = new Protocol("https", new DummySSLProtocolSocketFactory(), 443); Protocol.registerProtocol("https", dummyhttps); - // set up the connection manager - // hardcoded for now - - connectionManager.setMaxTotalConnections(MAX_THREADS_TOTAL); - //if (MAX_THREADS_TOTAL > MAX_THREADS_PER_HOST) { - // connectionManager.setMaxConnectionsPerHost(MAX_THREADS_PER_HOST); - //} else { - // connectionManager.setMaxConnectionsPerHost(MAX_THREADS_TOTAL); - //} - - HttpConnectionParams params = connectionManager.getParams(); + HttpConnectionManagerParams params = connectionManager.getParams(); params.setConnectionTimeout(TIMEOUT); params.setSoTimeout(TIMEOUT); params.setSendBufferSize(BUFFER_SIZE); params.setReceiveBufferSize(BUFFER_SIZE); + params.setMaxTotalConnections(MAX_THREADS_TOTAL); + if (MAX_THREADS_TOTAL > MAX_THREADS_PER_HOST) { + params.setDefaultMaxConnectionsPerHost(MAX_THREADS_PER_HOST); + } else { + params.setDefaultMaxConnectionsPerHost(MAX_THREADS_TOTAL); + } + HostConfiguration hostConf = client.getHostConfiguration(); if (PROXY) { hostConf.setProxy(PROXY_HOST, PROXY_PORT); } if (NTLM_USERNAME.length() > 0) { Credentials ntCreds = new NTCredentials(NTLM_USERNAME, NTLM_PASSWORD, NTLM_HOST, NTLM_DOMAIN); - client.getState().setCredentials(null, null, ntCreds); + client.getState().setCredentials(new AuthScope(NTLM_HOST, AuthScope.ANY_PORT), ntCreds); LOG.info("Added NTLM credentials for " + NTLM_USERNAME); } LOG.info("Configured Client"); } -} - -class NutchHttpState extends HttpState { - public static final Logger LOG = LogFormatter.getLogger("org.apache.nutch.net.Http.NutchHttpState"); - - public void addCookie(Cookie cookie) { - LOG.fine(" - setting cookie: " + cookie); - super.addCookie(cookie); - } - - public void addCookies(Cookie[] cookies) { - LOG.fine(" - setting cookies: "); - for (int i = 0; i < cookies.length; i++) - LOG.fine(" cookie: " + cookies[i]); - - super.addCookies(cookies); - } -} +} \ No newline at end of file Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=220034&r1=220033&r2=220034&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Thu Jul 21 04:07:10 2005 @@ -4,24 +4,34 @@ package org.apache.nutch.protocol.httpclient; import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.util.NutchConf; +import org.apache.nutch.util.mime.MimeType; +import org.apache.nutch.util.mime.MimeTypes; import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpVersion; +import org.apache.commons.httpclient.cookie.CookiePolicy; import org.apache.commons.httpclient.methods.GetMethod; +import org.apache.commons.httpclient.params.HttpMethodParams; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.net.URL; -import java.util.Properties; -import java.util.List; -import java.util.ListIterator; /** * An HTTP response. */ public class HttpResponse { + /** A flag that tells if magic resolution must be performed */ + private final static boolean MAGIC = + NutchConf.get().getBoolean("mime.type.magic", true); + + /** Get the MimeTypes resolver instance. 
*/ + private final static MimeTypes MIME = + MimeTypes.get(NutchConf.get().get("mime.types.file")); + private String orig; private String base; @@ -54,24 +64,40 @@ public Content toContent() { String contentType = getHeader("Content-Type"); - if (contentType == null) contentType = ""; + if (contentType == null) { + MimeType type = null; + if (MAGIC) { + type = MIME.getMimeType(orig, content); + } else { + type = MIME.getMimeType(orig); + } + if (type != null) { + contentType = type.getName(); + } else { + contentType = ""; + } + } if (content == null) content = EMPTY_CONTENT; return new Content(orig, base, content, contentType, headers); } - public HttpResponse(URL url) throws ProtocolException, IOException { - this(url.toString(), url); - } - - public HttpResponse(String orig, URL url) throws IOException { - this.orig = orig; + public HttpResponse(URL url) throws IOException { this.base = url.toString(); - GetMethod get = new GetMethod(url.toString()); + this.orig = url.toString(); + GetMethod get = new GetMethod(this.orig); get.setFollowRedirects(false); - get.setStrictMode(false); get.setRequestHeader("User-Agent", Http.AGENT_STRING); - get.setHttp11(false); - get.setMethodRetryHandler(null); + HttpMethodParams params = get.getParams(); + // some servers cannot digest the new protocol + params.setVersion(HttpVersion.HTTP_1_0); + params.makeLenient(); + params.setContentCharset("UTF-8"); + params.setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY); + params.setBooleanParameter(HttpMethodParams.SINGLE_COOKIE_HEADER, true); + // XXX (ab) not sure about this... the default is to retry 3 times; if + // XXX the request body was sent the method is not retried, so there is + // XXX little danger in retrying... 
+ // params.setParameter(HttpMethodParams.RETRY_HANDLER, null); try { code = Http.getClient().executeMethod(get); @@ -103,6 +129,7 @@ } } catch (org.apache.commons.httpclient.ProtocolException pe) { pe.printStackTrace(); + get.releaseConnection(); throw new IOException(pe.toString()); } finally { get.releaseConnection();