Author: ab
Date: Thu Jul 21 04:07:10 2005
New Revision: 220034
URL: http://svn.apache.org/viewcvs?rev=220034&view=rev
Log:
Fixes for NUTCH-66, and some cleanup:
* apply the more lenient CookiePolicy (BROWSER_COMPATIBILITY). This fixes the
problem with poorly formatted cookies being rejected.
* remove the while() loop, which was required only to handle retries related
to authentication. commons-httpclient can handle this internally, and the
existing code was broken anyway. This is not a complete solution yet - the
process of loading and presenting credentials needs to be developed further.
* clean up deprecation warnings by setting parameters at the correct level
and in the new preferred way.
* correctly set the maximum number of threads.
* add MIME magic handling.
Modified:
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
Modified:
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java?rev=220034&r1=220033&r2=220034&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
Thu Jul 21 04:07:10 2005
@@ -18,14 +18,14 @@
import org.apache.commons.httpclient.HttpClientError;
import org.apache.commons.httpclient.params.HttpConnectionParams;
import org.apache.commons.httpclient.protocol.ControllerThreadSocketFactory;
-import org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory;
+import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.sun.net.ssl.SSLContext;
import com.sun.net.ssl.TrustManager;
-public class DummySSLProtocolSocketFactory implements
SecureProtocolSocketFactory {
+public class DummySSLProtocolSocketFactory implements ProtocolSocketFactory {
/** Log object for this class. */
private static final Log LOG =
LogFactory.getLog(DummySSLProtocolSocketFactory.class);
Modified:
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=220034&r1=220033&r2=220034&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
Thu Jul 21 04:07:10 2005
@@ -9,18 +9,16 @@
import java.net.UnknownHostException;
import java.util.HashMap;
import java.util.LinkedList;
-import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;
-import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.Credentials;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
-import org.apache.commons.httpclient.HttpState;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.NTCredentials;
-import org.apache.commons.httpclient.params.HttpConnectionParams;
+import org.apache.commons.httpclient.auth.AuthScope;
+import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.protocol.Protocol;
import org.apache.nutch.db.Page;
import org.apache.nutch.pagedb.FetchListEntry;
@@ -47,7 +45,6 @@
}
static final int BUFFER_SIZE = 8 * 1024;
- private static final int MAX_REDIRECTS =
NutchConf.get().getInt("http.redirect.max", 3);
private static MultiThreadedHttpConnectionManager connectionManager = new
MultiThreadedHttpConnectionManager();
private static HttpClient client;
@@ -102,8 +99,6 @@
*/
private static LinkedList BLOCKED_ADDR_QUEUE = new LinkedList();
- private RobotRulesParser robotRules = new RobotRulesParser();
-
private static InetAddress blockAddr(URL url) throws ProtocolException {
InetAddress addr;
try {
@@ -183,7 +178,6 @@
}
public ProtocolOutput getProtocolOutput(String urlString) {
- ProtocolOutput output = null;
try {
return getProtocolOutput(new FetchListEntry(true, new Page(urlString,
1.0f), new String[0]));
} catch (MalformedURLException mue) {
@@ -196,9 +190,6 @@
try {
URL url = new URL(urlString);
- int redirects = 0;
- HttpAuthentication auth = null;
- while (true) {
try {
if (!RobotRulesParser.isAllowed(url))
return new ProtocolOutput(null, new
ProtocolStatus(ProtocolStatus.ROBOTS_DENIED, url));
@@ -210,7 +201,7 @@
InetAddress addr = blockAddr(url);
HttpResponse response;
try {
- response = new HttpResponse(urlString, url); // make a request
+ response = new HttpResponse(url); // make a request
} finally {
unblockAddr(addr);
}
@@ -255,19 +246,10 @@
} else if (code == 400) { // bad request, mark as GONE
LOG.fine("400 Bad request: " + url);
return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE,
url));
- } else if (code == 401) { // requires authorization
+ } else if (code == 401) { // requires authorization, but no valid auth
provided.
LOG.fine("401 Authentication Required");
- if (redirects == MAX_REDIRECTS)
- return new ProtocolOutput(c, new
ProtocolStatus(ProtocolStatus.REDIR_EXCEEDED,
- "Too many redirects: " + urlString));
- Properties p = c.getMetadata();
- if (p instanceof MultiProperties) {
- auth =
HttpAuthenticationFactory.findAuthentication((MultiProperties) p);
- } else {
- return new ProtocolOutput(c, new
ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authorization required: "
- + urlString));
- }
- redirects++;
+ return new ProtocolOutput(c, new
ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: "
+ + urlString));
} else if (code == 404) {
return new ProtocolOutput(c, new
ProtocolStatus(ProtocolStatus.NOTFOUND, url));
} else if (code == 410) { // permanently GONE
@@ -276,7 +258,6 @@
return new ProtocolOutput(c, new
ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url="
+ url));
}
- }
} catch (Throwable e) {
e.printStackTrace();
return new ProtocolOutput(null, new ProtocolStatus(e));
@@ -371,55 +352,33 @@
// get a client isntance -- we just need one.
client = new HttpClient(connectionManager);
- // this is just to add logging, whenever cookies are added.
- client.setState(new NutchHttpState());
// Set up an HTTPS socket factory that accepts self-signed certs.
Protocol dummyhttps = new Protocol("https", new
DummySSLProtocolSocketFactory(), 443);
Protocol.registerProtocol("https", dummyhttps);
- // set up the connection manager
- // hardcoded for now
-
- connectionManager.setMaxTotalConnections(MAX_THREADS_TOTAL);
- //if (MAX_THREADS_TOTAL > MAX_THREADS_PER_HOST) {
- // connectionManager.setMaxConnectionsPerHost(MAX_THREADS_PER_HOST);
- //} else {
- // connectionManager.setMaxConnectionsPerHost(MAX_THREADS_TOTAL);
- //}
-
- HttpConnectionParams params = connectionManager.getParams();
+ HttpConnectionManagerParams params = connectionManager.getParams();
params.setConnectionTimeout(TIMEOUT);
params.setSoTimeout(TIMEOUT);
params.setSendBufferSize(BUFFER_SIZE);
params.setReceiveBufferSize(BUFFER_SIZE);
+ params.setMaxTotalConnections(MAX_THREADS_TOTAL);
+ if (MAX_THREADS_TOTAL > MAX_THREADS_PER_HOST) {
+ params.setDefaultMaxConnectionsPerHost(MAX_THREADS_PER_HOST);
+ } else {
+ params.setDefaultMaxConnectionsPerHost(MAX_THREADS_TOTAL);
+ }
+
HostConfiguration hostConf = client.getHostConfiguration();
if (PROXY) {
hostConf.setProxy(PROXY_HOST, PROXY_PORT);
}
if (NTLM_USERNAME.length() > 0) {
Credentials ntCreds = new NTCredentials(NTLM_USERNAME, NTLM_PASSWORD,
NTLM_HOST, NTLM_DOMAIN);
- client.getState().setCredentials(null, null, ntCreds);
+ client.getState().setCredentials(new AuthScope(NTLM_HOST,
AuthScope.ANY_PORT), ntCreds);
LOG.info("Added NTLM credentials for " + NTLM_USERNAME);
}
LOG.info("Configured Client");
}
-}
-
-class NutchHttpState extends HttpState {
- public static final Logger LOG =
LogFormatter.getLogger("org.apache.nutch.net.Http.NutchHttpState");
-
- public void addCookie(Cookie cookie) {
- LOG.fine(" - setting cookie: " + cookie);
- super.addCookie(cookie);
- }
-
- public void addCookies(Cookie[] cookies) {
- LOG.fine(" - setting cookies: ");
- for (int i = 0; i < cookies.length; i++)
- LOG.fine(" cookie: " + cookies[i]);
-
- super.addCookies(cookies);
- }
-}
+}
\ No newline at end of file
Modified:
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=220034&r1=220033&r2=220034&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
(original)
+++
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
Thu Jul 21 04:07:10 2005
@@ -4,24 +4,34 @@
package org.apache.nutch.protocol.httpclient;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.util.mime.MimeType;
+import org.apache.nutch.util.mime.MimeTypes;
import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpVersion;
+import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
+import org.apache.commons.httpclient.params.HttpMethodParams;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
-import java.util.Properties;
-import java.util.List;
-import java.util.ListIterator;
/**
* An HTTP response.
*/
public class HttpResponse {
+ /** A flag that tells if magic resolution must be performed */
+ private final static boolean MAGIC =
+ NutchConf.get().getBoolean("mime.type.magic", true);
+
+ /** Get the MimeTypes resolver instance. */
+ private final static MimeTypes MIME =
+ MimeTypes.get(NutchConf.get().get("mime.types.file"));
+
private String orig;
private String base;
@@ -54,24 +64,40 @@
public Content toContent() {
String contentType = getHeader("Content-Type");
- if (contentType == null) contentType = "";
+ if (contentType == null) {
+ MimeType type = null;
+ if (MAGIC) {
+ type = MIME.getMimeType(orig, content);
+ } else {
+ type = MIME.getMimeType(orig);
+ }
+ if (type != null) {
+ contentType = type.getName();
+ } else {
+ contentType = "";
+ }
+ }
if (content == null) content = EMPTY_CONTENT;
return new Content(orig, base, content, contentType, headers);
}
- public HttpResponse(URL url) throws ProtocolException, IOException {
- this(url.toString(), url);
- }
-
- public HttpResponse(String orig, URL url) throws IOException {
- this.orig = orig;
+ public HttpResponse(URL url) throws IOException {
this.base = url.toString();
- GetMethod get = new GetMethod(url.toString());
+ this.orig = url.toString();
+ GetMethod get = new GetMethod(this.orig);
get.setFollowRedirects(false);
- get.setStrictMode(false);
get.setRequestHeader("User-Agent", Http.AGENT_STRING);
- get.setHttp11(false);
- get.setMethodRetryHandler(null);
+ HttpMethodParams params = get.getParams();
+ // some servers cannot digest the new protocol
+ params.setVersion(HttpVersion.HTTP_1_0);
+ params.makeLenient();
+ params.setContentCharset("UTF-8");
+ params.setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
+ params.setBooleanParameter(HttpMethodParams.SINGLE_COOKIE_HEADER, true);
+ // XXX (ab) not sure about this... the default is to retry 3 times; if
+ // XXX the request body was sent the method is not retried, so there is
+ // XXX little danger in retrying...
+ // params.setParameter(HttpMethodParams.RETRY_HANDLER, null);
try {
code = Http.getClient().executeMethod(get);
@@ -103,6 +129,7 @@
}
} catch (org.apache.commons.httpclient.ProtocolException pe) {
pe.printStackTrace();
+ get.releaseConnection();
throw new IOException(pe.toString());
} finally {
get.releaseConnection();