Thanks Seb! ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Chris Mattmann, Ph.D. Chief Architect Instrument Software and Science Data Systems Section (398) NASA Jet Propulsion Laboratory Pasadena, CA 91109 USA Office: 168-519, Mailstop: 168-527 Email: [email protected] WWW: http://sunset.usc.edu/~mattmann/ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Adjunct Associate Professor, Computer Science Department University of Southern California, Los Angeles, CA 90089 USA ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-----Original Message----- From: "[email protected]" <[email protected]> Reply-To: "[email protected]" <[email protected]> Date: Thursday, November 6, 2014 at 1:51 PM To: "[email protected]" <[email protected]> Subject: svn commit: r1637236 - in /nutch: branches/2.x/ branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/ trunk/ trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/ >Author: snagel >Date: Thu Nov 6 21:51:46 2014 >New Revision: 1637236 > >URL: http://svn.apache.org/r1637236 >Log: >NUTCH-1825 protocol-http may hang for certain web pages > >Modified: > nutch/branches/2.x/CHANGES.txt > >nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/prot >ocol/http/HttpResponse.java > nutch/trunk/CHANGES.txt > >nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/ht >tp/HttpResponse.java > >Modified: nutch/branches/2.x/CHANGES.txt >URL: >http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1637236&r1 >=1637235&r2=1637236&view=diff >========================================================================== >==== >--- nutch/branches/2.x/CHANGES.txt (original) >+++ nutch/branches/2.x/CHANGES.txt Thu Nov 6 21:51:46 2014 >@@ -2,6 +2,8 @@ Nutch Change Log > > Current Development 2.3-SNAPSHOT > >+* NUTCH-1825 protocol-http may hang for certain web pages (Phu Kieu via >snagel) >+ > * NUTCH-1483 Can't crawl filesystem with protocol-file plugin (Rogério >Pereira Araújo, Mengying Wang, snagel) > > * NUTCH-1885 Protocol-file should treat symbolic links as redirects >(Mengying Wang, snagel) > >Modified: >nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/prot >ocol/http/HttpResponse.java >URL: >http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-http/s >rc/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1637236&r1=16 >37235&r2=1637236&view=diff >========================================================================== >==== >--- 
>nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/prot >ocol/http/HttpResponse.java (original) >+++ >nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/prot >ocol/http/HttpResponse.java Thu Nov 6 21:51:46 2014 >@@ -278,11 +278,22 @@ public class HttpResponse implements Res > > ByteArrayOutputStream out = new >ByteArrayOutputStream(Http.BUFFER_SIZE); > byte[] bytes = new byte[Http.BUFFER_SIZE]; >- int length = 0; // read content >- for (int i = in.read(bytes); i != -1 && length + i <= contentLength; >i = in.read(bytes)) { >- >+ int length = 0; >+ // read content >+ int i = in.read(bytes); >+ while (i != -1) { > out.write(bytes, 0, i); > length += i; >+ if (length >= contentLength) { >+ break; >+ } >+ if ((length + Http.BUFFER_SIZE) > contentLength) { >+ // reading next chunk may hit contentLength, >+ // must limit number of bytes read >+ i = in.read(bytes, 0, (contentLength - length)); >+ } else { >+ i = in.read(bytes); >+ } > } > content = out.toByteArray(); > } > >Modified: nutch/trunk/CHANGES.txt >URL: >http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1637236&r1=163723 >5&r2=1637236&view=diff >========================================================================== >==== >--- nutch/trunk/CHANGES.txt (original) >+++ nutch/trunk/CHANGES.txt Thu Nov 6 21:51:46 2014 >@@ -2,6 +2,8 @@ Nutch Change Log > > Nutch Current Development 1.10-SNAPSHOT > >+* NUTCH-1825 protocol-http may hang for certain web pages (Phu Kieu via >snagel) >+ > * NUTCH-1483 Can't crawl filesystem with protocol-file plugin (Rogério >Pereira Araújo, Mengying Wang, snagel) > > * NUTCH-1885 Protocol-file should treat symbolic links as redirects >(Mengying Wang, snagel) > >Modified: >nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/ht >tp/HttpResponse.java >URL: >http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java >/org/apache/nutch/protocol/http/HttpResponse.java?rev=1637236&r1=1637235&r 
>2=1637236&view=diff >========================================================================== >==== >--- >nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/ht >tp/HttpResponse.java (original) >+++ >nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/ht >tp/HttpResponse.java Thu Nov 6 21:51:46 2014 >@@ -26,16 +26,14 @@ import java.io.PushbackInputStream; > import java.net.InetSocketAddress; > import java.net.Socket; > import java.net.URL; >- > import java.util.Arrays; > import java.util.HashSet; > import java.util.Set; >- >+ > import javax.net.ssl.SSLSocket; > import javax.net.ssl.SSLSocketFactory; > > import org.apache.hadoop.conf.Configuration; >- > import org.apache.nutch.crawl.CrawlDatum; > import org.apache.nutch.metadata.Metadata; > import org.apache.nutch.metadata.SpellCheckedMetadata; >@@ -289,11 +287,22 @@ public class HttpResponse implements Res > > ByteArrayOutputStream out = new >ByteArrayOutputStream(Http.BUFFER_SIZE); > byte[] bytes = new byte[Http.BUFFER_SIZE]; >- int length = 0; // read content >- for (int i = in.read(bytes); i != -1 && length + i <= contentLength; >i = in.read(bytes)) { >- >+ int length = 0; >+ // read content >+ int i = in.read(bytes); >+ while (i != -1) { > out.write(bytes, 0, i); > length += i; >+ if (length >= contentLength) { >+ break; >+ } >+ if ((length + Http.BUFFER_SIZE) > contentLength) { >+ // reading next chunk may hit contentLength, >+ // must limit number of bytes read >+ i = in.read(bytes, 0, (contentLength - length)); >+ } else { >+ i = in.read(bytes); >+ } > } > content = out.toByteArray(); > } > >

