Author: snagel
Date: Thu Nov 6 21:51:46 2014
New Revision: 1637236
URL: http://svn.apache.org/r1637236
Log:
NUTCH-1825 protocol-http may hang for certain web pages
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1637236&r1=1637235&r2=1637236&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Thu Nov 6 21:51:46 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development 2.3-SNAPSHOT
+* NUTCH-1825 protocol-http may hang for certain web pages (Phu Kieu via snagel)
+
* NUTCH-1483 Can't crawl filesystem with protocol-file plugin (Rogério
Pereira Araújo, Mengying Wang, snagel)
* NUTCH-1885 Protocol-file should treat symbolic links as redirects (Mengying
Wang, snagel)
Modified:
nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1637236&r1=1637235&r2=1637236&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Thu Nov 6 21:51:46 2014
@@ -278,11 +278,22 @@ public class HttpResponse implements Res
ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
byte[] bytes = new byte[Http.BUFFER_SIZE];
- int length = 0; // read content
- for (int i = in.read(bytes); i != -1 && length + i <= contentLength; i = in.read(bytes)) {
-
+ int length = 0;
+ // read content
+ int i = in.read(bytes);
+ while (i != -1) {
out.write(bytes, 0, i);
length += i;
+ if (length >= contentLength) {
+ break;
+ }
+ if ((length + Http.BUFFER_SIZE) > contentLength) {
+ // reading next chunk may hit contentLength,
+ // must limit number of bytes read
+ i = in.read(bytes, 0, (contentLength - length));
+ } else {
+ i = in.read(bytes);
+ }
}
content = out.toByteArray();
}
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1637236&r1=1637235&r2=1637236&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Nov 6 21:51:46 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1825 protocol-http may hang for certain web pages (Phu Kieu via snagel)
+
* NUTCH-1483 Can't crawl filesystem with protocol-file plugin (Rogério
Pereira Araújo, Mengying Wang, snagel)
* NUTCH-1885 Protocol-file should treat symbolic links as redirects (Mengying
Wang, snagel)
Modified:
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1637236&r1=1637235&r2=1637236&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Thu Nov 6 21:51:46 2014
@@ -26,16 +26,14 @@ import java.io.PushbackInputStream;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URL;
-
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
-
+
import javax.net.ssl.SSLSocket;
import javax.net.ssl.SSLSocketFactory;
import org.apache.hadoop.conf.Configuration;
-
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.SpellCheckedMetadata;
@@ -289,11 +287,22 @@ public class HttpResponse implements Res
ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
byte[] bytes = new byte[Http.BUFFER_SIZE];
- int length = 0; // read content
- for (int i = in.read(bytes); i != -1 && length + i <= contentLength; i = in.read(bytes)) {
-
+ int length = 0;
+ // read content
+ int i = in.read(bytes);
+ while (i != -1) {
out.write(bytes, 0, i);
length += i;
+ if (length >= contentLength) {
+ break;
+ }
+ if ((length + Http.BUFFER_SIZE) > contentLength) {
+ // reading next chunk may hit contentLength,
+ // must limit number of bytes read
+ i = in.read(bytes, 0, (contentLength - length));
+ } else {
+ i = in.read(bytes);
+ }
}
content = out.toByteArray();
}