Author: ab
Date: Thu May 18 08:26:06 2006
New Revision: 407567

URL: http://svn.apache.org/viewvc?rev=407567&view=rev
Log:
Refactor HTTP plugins so that both support gzip encoding. Add appropriate headers in protocol-httpclient so that it prefers this encoding.
Add an option to use HTTP 1.1 (at the moment only protocol-httpclient supports it). Modified: lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=407567&r1=407566&r2=407567&view=diff ============================================================================== --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Thu May 18 08:26:06 2006 @@ -132,6 +132,14 @@ trying to fetch a page.</description> </property> +<property> + <name>http.useHttp11</name> + <value>false</value> + <description>NOTE: at the moment this works only for protocol-httpclient. + If true, use HTTP 1.1, if false use HTTP 1.0 . 
+ </description> +</property> + <!-- FTP properties --> <property> Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=407567&r1=407566&r2=407567&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Thu May 18 08:26:06 2006 @@ -27,13 +27,13 @@ // Nutch imports import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.metadata.Metadata; import org.apache.nutch.net.protocols.Response; import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolException; import org.apache.nutch.protocol.ProtocolOutput; import org.apache.nutch.protocol.ProtocolStatus; +import org.apache.nutch.util.GZIPUtils; // Hadoop imports import org.apache.hadoop.conf.Configuration; @@ -120,6 +120,8 @@ /** Do we block by IP addresses or by hostnames? */ private boolean byIP = true; + /** Do we use HTTP/1.1? 
*/ + protected boolean useHttp11 = false; /** Creates a new instance of HttpBase */ public HttpBase() { @@ -149,6 +151,7 @@ this.serverDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000); // backward-compatible default setting this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", true); + this.useHttp11 = conf.getBoolean("http.http11", false); this.robots.setConf(conf); logConf(); } @@ -285,8 +288,11 @@ public String getUserAgent() { return userAgent; } - - + + public boolean getUseHttp11() { + return useHttp11; + } + private String blockAddr(URL url) throws ProtocolException { String host; @@ -428,6 +434,21 @@ logger.info("http.max.delays = " + maxDelays); } + public byte[] processGzipEncoded(byte[] compressed, URL url) throws IOException { + LOGGER.fine("uncompressing...."); + + byte[] content = GZIPUtils.unzipBestEffort(compressed, getMaxContent()); + + if (content == null) + throw new IOException("unzipBestEffort returned null"); + + if (LOGGER.isLoggable(Level.FINE)) + LOGGER.fine("fetched " + compressed.length + + " bytes of compressed content (expanded to " + + content.length + " bytes) from " + url); + return content; + } + protected static void main(HttpBase http, String[] args) throws Exception { boolean verbose = false; String url = null; @@ -475,5 +496,5 @@ CrawlDatum datum, boolean followRedirects) throws ProtocolException, IOException; - + } Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=407567&r1=407566&r2=407567&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Thu 
May 18 08:26:06 2006 @@ -26,9 +26,6 @@ import java.net.InetSocketAddress; import java.net.Socket; import java.net.URL; -import java.util.Map; -import java.util.TreeMap; -import java.util.Date; import java.util.logging.Level; // Nutch imports @@ -38,7 +35,6 @@ import org.apache.nutch.protocol.ProtocolException; import org.apache.nutch.protocol.http.api.HttpBase; import org.apache.nutch.protocol.http.api.HttpException; -import org.apache.nutch.util.GZIPUtils; /** An HTTP response. */ @@ -150,18 +146,7 @@ String contentEncoding = getHeader(Response.CONTENT_ENCODING); if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) { - Http.LOG.fine("uncompressing...."); - byte[] compressed = content; - - content = GZIPUtils.unzipBestEffort(compressed, http.getMaxContent()); - - if (content == null) - throw new HttpException("unzipBestEffort returned null"); - - if (Http.LOG.isLoggable(Level.FINE)) - Http.LOG.fine("fetched " + compressed.length - + " bytes of compressed content (expanded to " - + content.length + " bytes) from " + url); + content = http.processGzipEncoded(content, url); } else { if (Http.LOG.isLoggable(Level.FINE)) Http.LOG.fine("fetched " + content.length + " bytes from " + url); Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=407567&r1=407566&r2=407567&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Thu May 18 08:26:06 2006 @@ -124,6 +124,8 @@ // prefer understandable formats headers.add(new Header("Accept", 
"text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5")); + // accept gzipped content + headers.add(new Header("Accept-Encoding", "x-gzip, gzip")); hostConf.getParams().setParameter("http.default-headers", headers); if (useProxy) { hostConf.setProxy(proxyHost, proxyPort); Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=407567&r1=407566&r2=407567&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Thu May 18 08:26:06 2006 @@ -70,8 +70,11 @@ get.setFollowRedirects(followRedirects); get.setRequestHeader("User-Agent", http.getUserAgent()); HttpMethodParams params = get.getParams(); - // some servers cannot digest the new protocol - params.setVersion(HttpVersion.HTTP_1_0); + if (http.getUseHttp11()) { + params.setVersion(HttpVersion.HTTP_1_1); + } else { + params.setVersion(HttpVersion.HTTP_1_0); + } params.makeLenient(); params.setContentCharset("UTF-8"); params.setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY); @@ -109,6 +112,13 @@ } catch (Exception e) { if (code == 200) throw new IOException(e.toString()); // for codes other than 200 OK, we are fine with empty content + } + if (content != null) { + // check if we have to uncompress it + String contentEncoding = headers.get(Response.CONTENT_ENCODING); + if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) { + content = http.processGzipEncoded(content, url); + } } } catch (org.apache.commons.httpclient.ProtocolException pe) { 
pe.printStackTrace();

-------------------------------------------------------
Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs