Author: ab
Date: Thu May 18 08:26:06 2006
New Revision: 407567

URL: http://svn.apache.org/viewvc?rev=407567&view=rev
Log:
Refactor HTTP plugins so that both support gzip encoding. Add
appropriate headers in protocol-httpclient so that it prefers this
encoding.

Add an option to use HTTP 1.1 (at the moment only protocol-httpclient
supports it).

Modified:
    lucene/nutch/trunk/conf/nutch-default.xml
    lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
    lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
    lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
    lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=407567&r1=407566&r2=407567&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Thu May 18 08:26:06 2006
@@ -132,6 +132,14 @@
     trying to fetch a page.</description>
 </property>
 
+<property>
+  <name>http.useHttp11</name>
+  <value>false</value>
+  <description>NOTE: at the moment this works only for protocol-httpclient.
+  If true, use HTTP 1.1, if false use HTTP 1.0 .
+  </description>
+</property>
+
 <!-- FTP properties -->
 
 <property>

Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=407567&r1=407566&r2=407567&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Thu May 18 08:26:06 2006
@@ -27,13 +27,13 @@
 
 // Nutch imports
 import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolException;
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.util.GZIPUtils;
 
 // Hadoop imports
 import org.apache.hadoop.conf.Configuration;
@@ -120,6 +120,8 @@
   /** Do we block by IP addresses or by hostnames? */
   private boolean byIP = true;
  
+  /** Do we use HTTP/1.1? */
+  protected boolean useHttp11 = false;
 
   /** Creates a new instance of HttpBase */
   public HttpBase() {
@@ -149,6 +151,7 @@
         this.serverDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000);
         // backward-compatible default setting
         this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", true);
+        this.useHttp11 = conf.getBoolean("http.http11", false);
         this.robots.setConf(conf);
         logConf();
     }
@@ -285,8 +288,11 @@
   public String getUserAgent() {
     return userAgent;
   }
-
-
+  
+  public boolean getUseHttp11() {
+    return useHttp11;
+  }
+  
   private String blockAddr(URL url) throws ProtocolException {
     
     String host;
@@ -428,6 +434,21 @@
     logger.info("http.max.delays = " + maxDelays);
   }
   
+  public byte[] processGzipEncoded(byte[] compressed, URL url) throws IOException {
+    LOGGER.fine("uncompressing....");
+
+    byte[] content = GZIPUtils.unzipBestEffort(compressed, getMaxContent());
+
+    if (content == null)
+      throw new IOException("unzipBestEffort returned null");
+
+    if (LOGGER.isLoggable(Level.FINE))
+      LOGGER.fine("fetched " + compressed.length
+                    + " bytes of compressed content (expanded to "
+                    + content.length + " bytes) from " + url);
+    return content;
+  }
+  
   protected static void main(HttpBase http, String[] args) throws Exception {
     boolean verbose = false;
     String url = null;
@@ -475,5 +496,5 @@
                                           CrawlDatum datum,
                                           boolean followRedirects)
     throws ProtocolException, IOException;
-  
+
 }

Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=407567&r1=407566&r2=407567&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Thu May 18 08:26:06 2006
@@ -26,9 +26,6 @@
 import java.net.InetSocketAddress;
 import java.net.Socket;
 import java.net.URL;
-import java.util.Map;
-import java.util.TreeMap;
-import java.util.Date;
 import java.util.logging.Level;
 
 // Nutch imports
@@ -38,7 +35,6 @@
 import org.apache.nutch.protocol.ProtocolException;
 import org.apache.nutch.protocol.http.api.HttpBase;
 import org.apache.nutch.protocol.http.api.HttpException;
-import org.apache.nutch.util.GZIPUtils;
 
 
 /** An HTTP response. */
@@ -150,18 +146,7 @@
 
       String contentEncoding = getHeader(Response.CONTENT_ENCODING);
       if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
-        Http.LOG.fine("uncompressing....");
-        byte[] compressed = content;
-
-        content = GZIPUtils.unzipBestEffort(compressed, http.getMaxContent());
-
-        if (content == null)
-          throw new HttpException("unzipBestEffort returned null");
-
-        if (Http.LOG.isLoggable(Level.FINE))
-          Http.LOG.fine("fetched " + compressed.length
-                        + " bytes of compressed content (expanded to "
-                        + content.length + " bytes) from " + url);
+        content = http.processGzipEncoded(content, url);
       } else {
         if (Http.LOG.isLoggable(Level.FINE))
           Http.LOG.fine("fetched " + content.length + " bytes from " + url);

Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=407567&r1=407566&r2=407567&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Thu May 18 08:26:06 2006
@@ -124,6 +124,8 @@
     // prefer understandable formats
     headers.add(new Header("Accept",
             "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
+    // accept gzipped content
+    headers.add(new Header("Accept-Encoding", "x-gzip, gzip"));
     hostConf.getParams().setParameter("http.default-headers", headers);
     if (useProxy) {
       hostConf.setProxy(proxyHost, proxyPort);

Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=407567&r1=407566&r2=407567&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Thu May 18 08:26:06 2006
@@ -70,8 +70,11 @@
     get.setFollowRedirects(followRedirects);
     get.setRequestHeader("User-Agent", http.getUserAgent());
     HttpMethodParams params = get.getParams();
-    // some servers cannot digest the new protocol
-    params.setVersion(HttpVersion.HTTP_1_0);
+    if (http.getUseHttp11()) {
+      params.setVersion(HttpVersion.HTTP_1_1);
+    } else {
+      params.setVersion(HttpVersion.HTTP_1_0);
+    }
     params.makeLenient();
     params.setContentCharset("UTF-8");
     params.setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
@@ -109,6 +112,13 @@
       } catch (Exception e) {
         if (code == 200) throw new IOException(e.toString());
         // for codes other than 200 OK, we are fine with empty content
+      }
+      if (content != null) {
+        // check if we have to uncompress it
+        String contentEncoding = headers.get(Response.CONTENT_ENCODING);
+        if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
+          content = http.processGzipEncoded(content, url);
+        }
       }
     } catch (org.apache.commons.httpclient.ProtocolException pe) {
       pe.printStackTrace();




-------------------------------------------------------
Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to