Author: snagel
Date: Thu Nov  6 21:51:46 2014
New Revision: 1637236

URL: http://svn.apache.org/r1637236
Log:
NUTCH-1825 protocol-http may hang for certain web pages

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1637236&r1=1637235&r2=1637236&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Thu Nov  6 21:51:46 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development 2.3-SNAPSHOT
 
+* NUTCH-1825 protocol-http may hang for certain web pages (Phu Kieu via snagel)
+
 * NUTCH-1483 Can't crawl filesystem with protocol-file plugin (Rogério Pereira Araújo, Mengying Wang, snagel)
 
 * NUTCH-1885 Protocol-file should treat symbolic links as redirects (Mengying Wang, snagel)

Modified: nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1637236&r1=1637235&r2=1637236&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Thu Nov  6 21:51:46 2014
@@ -278,11 +278,22 @@ public class HttpResponse implements Res
 
     ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
     byte[] bytes = new byte[Http.BUFFER_SIZE];
-    int length = 0;                           // read content
-    for (int i = in.read(bytes); i != -1 && length + i <= contentLength; i = in.read(bytes)) {
-
+    int length = 0;
+    // read content
+    int i = in.read(bytes);
+    while (i != -1) {
       out.write(bytes, 0, i);
       length += i;
+      if (length >= contentLength) {
+        break;
+      }
+      if ((length + Http.BUFFER_SIZE) > contentLength) {
+        // reading next chunk may hit contentLength,
+        // must limit number of bytes read
+        i = in.read(bytes, 0, (contentLength - length));
+      } else {
+        i = in.read(bytes);
+      }
     }
     content = out.toByteArray();
   }

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1637236&r1=1637235&r2=1637236&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Nov  6 21:51:46 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1825 protocol-http may hang for certain web pages (Phu Kieu via snagel)
+
 * NUTCH-1483 Can't crawl filesystem with protocol-file plugin (Rogério Pereira Araújo, Mengying Wang, snagel)
 
 * NUTCH-1885 Protocol-file should treat symbolic links as redirects (Mengying Wang, snagel)

Modified: nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1637236&r1=1637235&r2=1637236&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Thu Nov  6 21:51:46 2014
@@ -26,16 +26,14 @@ import java.io.PushbackInputStream;
 import java.net.InetSocketAddress;
 import java.net.Socket;
 import java.net.URL;
-
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Set;
- 
+
 import javax.net.ssl.SSLSocket;
 import javax.net.ssl.SSLSocketFactory;
 
 import org.apache.hadoop.conf.Configuration;
-
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.SpellCheckedMetadata;
@@ -289,11 +287,22 @@ public class HttpResponse implements Res
 
     ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
     byte[] bytes = new byte[Http.BUFFER_SIZE];
-    int length = 0;                           // read content
-    for (int i = in.read(bytes); i != -1 && length + i <= contentLength; i = in.read(bytes)) {
-
+    int length = 0;
+    // read content
+    int i = in.read(bytes);
+    while (i != -1) {
       out.write(bytes, 0, i);
       length += i;
+      if (length >= contentLength) {
+        break;
+      }
+      if ((length + Http.BUFFER_SIZE) > contentLength) {
+        // reading next chunk may hit contentLength,
+        // must limit number of bytes read
+        i = in.read(bytes, 0, (contentLength - length));
+      } else {
+        i = in.read(bytes);
+      }
     }
     content = out.toByteArray();
   }


Reply via email to