[
https://issues.apache.org/jira/browse/NUTCH-1736?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
ysc updated NUTCH-1736:
-----------------------
Comment: was deleted
(was: 1. For Nutch 1.x, the patch below can be used:
#P nutch1.7
Index: src/java/org/apache/nutch/metadata/HttpHeaders.java
===================================================================
--- src/java/org/apache/nutch/metadata/HttpHeaders.java (revision 1573324)
+++ src/java/org/apache/nutch/metadata/HttpHeaders.java (working copy)
@@ -26,6 +26,8 @@
*/
public interface HttpHeaders {
+ public final static String TRANSFER_ENCODING = "Transfer-Encoding";
+
public final static String CONTENT_ENCODING = "Content-Encoding";
public final static String CONTENT_LANGUAGE = "Content-Language";
Index: src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
===================================================================
--- src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (revision 1573324)
+++ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (working copy)
@@ -156,9 +156,13 @@
parseHeaders(in, line);
haveSeenNonContinueStatus= code != 100; // 100 is "Continue"
}
+ String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
+ if(transferEncoding != null && "chunked".equalsIgnoreCase(transferEncoding.trim())){
+ readChunkedContent(in, line);
+ }else{
+ readPlainContent(in);
+ }
- readPlainContent(in);
-
String contentEncoding = getHeader(Response.CONTENT_ENCODING);
if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
content = http.processGzipEncoded(content, url);
@@ -432,5 +436,4 @@
in.unread(value);
return value;
}
-
}
2. For Nutch 2.x, the patch below can be used:
#P nutch-2.2.1
Index: src/java/org/apache/nutch/metadata/HttpHeaders.java
===================================================================
--- src/java/org/apache/nutch/metadata/HttpHeaders.java (revision 1523958)
+++ src/java/org/apache/nutch/metadata/HttpHeaders.java (working copy)
@@ -28,6 +28,7 @@
* @author Jérôme Charron
*/
public interface HttpHeaders {
+ public final static String TRANSFER_ENCODING = "Transfer-Encoding";
public final static String CONTENT_ENCODING = "Content-Encoding";
Index: src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
===================================================================
--- src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (revision 1523958)
+++ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (working copy)
@@ -150,7 +150,12 @@
haveSeenNonContinueStatus= code != 100; // 100 is "Continue"
}
- readPlainContent(in);
+ String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
+ if(transferEncoding != null && "chunked".equalsIgnoreCase(transferEncoding.trim())){
+ readChunkedContent(in, line);
+ }else{
+ readPlainContent(in);
+ }
String contentEncoding = getHeader(Response.CONTENT_ENCODING);
if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
@@ -234,7 +239,92 @@
}
content = out.toByteArray();
}
+ /**
+ *
+ * @param in
+ * @param line
+ * @throws HttpException
+ * @throws IOException
+ */
+ @SuppressWarnings("unused")
+ private void readChunkedContent(PushbackInputStream in,
+ StringBuffer line)
+ throws HttpException, IOException {
+ boolean doneChunks= false;
+ int contentBytesRead= 0;
+ byte[] bytes = new byte[Http.BUFFER_SIZE];
+ ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
+ while (!doneChunks) {
+ if (Http.LOG.isTraceEnabled()) {
+ Http.LOG.trace("Http: starting chunk");
+ }
+
+ readLine(in, line, false);
+
+ String chunkLenStr;
+ // if (LOG.isTraceEnabled()) { LOG.trace("chunk-header: '" + line + "'"); }
+
+ int pos= line.indexOf(";");
+ if (pos < 0) {
+ chunkLenStr= line.toString();
+ } else {
+ chunkLenStr= line.substring(0, pos);
+ // if (LOG.isTraceEnabled()) { LOG.trace("got chunk-ext: " + line.substring(pos+1)); }
+ }
+ chunkLenStr= chunkLenStr.trim();
+ int chunkLen;
+ try {
+ chunkLen= Integer.parseInt(chunkLenStr, 16);
+ } catch (NumberFormatException e){
+ throw new HttpException("bad chunk length: "+line.toString());
+ }
+
+ if (chunkLen == 0) {
+ doneChunks= true;
+ break;
+ }
+
+ if ( (contentBytesRead + chunkLen) > http.getMaxContent() )
+ chunkLen= http.getMaxContent() - contentBytesRead;
+
+ // read one chunk
+ int chunkBytesRead= 0;
+ while (chunkBytesRead < chunkLen) {
+
+ int toRead= (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ?
+ (chunkLen - chunkBytesRead) : Http.BUFFER_SIZE;
+ int len= in.read(bytes, 0, toRead);
+
+ if (len == -1)
+ throw new HttpException("chunk eof after " + contentBytesRead
+ + " bytes in successful chunks"
+ + " and " + chunkBytesRead
+ + " in current chunk");
+
+ // DANGER!!! Will printed GZIPed stuff right to your
+ // terminal!
+ // if (LOG.isTraceEnabled()) { LOG.trace("read: " + new String(bytes, 0, len)); }
+
+ out.write(bytes, 0, len);
+ chunkBytesRead+= len;
+ }
+
+ readLine(in, line, false);
+
+ }
+
+ if (!doneChunks) {
+ if (contentBytesRead != http.getMaxContent())
+ throw new HttpException("chunk eof: !doneChunk && didn't max out");
+ return;
+ }
+
+ content = out.toByteArray();
+ parseHeaders(in, line);
+
+ }
+
private int parseStatusLine(PushbackInputStream in, StringBuffer line)
throws IOException, HttpException {
readLine(in, line, false);
)
> can't fetch page if http response header contains Transfer-Encoding:chunked
> ---------------------------------------------------------------------------
>
> Key: NUTCH-1736
> URL: https://issues.apache.org/jira/browse/NUTCH-1736
> Project: Nutch
> Issue Type: Bug
> Components: protocol
> Affects Versions: 1.6, 2.1, 1.7, 2.2, 2.3, 1.8, 2.4, 1.9, 2.2.1
> Reporter: ysc
> Priority: Critical
> Original Estimate: 24h
> Remaining Estimate: 24h
>
> fetching:
> http://szs.mof.gov.cn/zhengwuxinxi/zhengcefabu/201402/t20140224_1046354.html
> Fetch failed with protocol status: EXCEPTION: java.io.IOException:
> unzipBestEffort returned null
--
This message was sent by Atlassian JIRA
(v6.2#6252)