This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 2fce5cdb41f6b838ab2e4e29932792672290bf5b Author: Yossi Tamari <yossi.tam...@pipl.com> AuthorDate: Tue May 7 19:22:52 2019 +0300 NUTCH-2716 Response headers are not stored for a compressed response Even when store.http.headers=true, the HTTP headers are not saved for a gzipped or deflated response, because they may contain an incorrect content-length header. This causes WARCExporter to generate "resource" (header-less) entries instead of "response" entries. The correct behaviour is to store all the headers, and code that uses them should be aware and careful that they represent the original headers, not the stored content. This fixes protocol-http, protocol-selenium, and protocol-htmlunit to write the raw response headers, and adds logic to WARCExporter and CommonCrawlDataDumper to fix these headers. It also fixes NUTCH-2715 (WARCExporter fails on large records), and upgrades lib-htmlunit to use version 3.141.5 of Selenium, since Eclipse fails to compile otherwise (conflicts with lib-selenium). 
--- .../apache/nutch/tools/CommonCrawlFormatWARC.java | 1 + src/java/org/apache/nutch/tools/WARCUtils.java | 109 +++++++++++++++++++++ .../org/apache/nutch/tools/warc/WARCExporter.java | 4 +- src/plugin/lib-htmlunit/ivy.xml | 3 +- .../nutch/protocol/htmlunit/HttpResponse.java | 8 +- .../apache/nutch/protocol/http/HttpResponse.java | 10 +- .../nutch/protocol/selenium/HttpResponse.java | 3 + 7 files changed, 125 insertions(+), 13 deletions(-) diff --git a/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java b/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java index 02c2415..f401041 100644 --- a/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java +++ b/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java @@ -192,6 +192,7 @@ public class CommonCrawlFormatWARC extends AbstractCommonCrawlFormat { ByteArrayOutputStream output = new ByteArrayOutputStream(); String httpHeaders = metadata.get("_response.headers_"); + httpHeaders = WARCUtils.fixHttpHeaders(httpHeaders, content.getContent().length); if (StringUtils.isNotBlank(httpHeaders)) { output.write(httpHeaders.getBytes()); diff --git a/src/java/org/apache/nutch/tools/WARCUtils.java b/src/java/org/apache/nutch/tools/WARCUtils.java index a880783..34365a8 100644 --- a/src/java/org/apache/nutch/tools/WARCUtils.java +++ b/src/java/org/apache/nutch/tools/WARCUtils.java @@ -24,6 +24,7 @@ import java.net.InetAddress; import java.net.UnknownHostException; import java.util.Date; import java.util.List; +import java.util.regex.Pattern; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.indexer.NutchDocument; @@ -46,6 +47,11 @@ public class WARCUtils { public final static String CONFORMS_TO = "conformsTo"; public final static String IP = "ip"; public final static UUIDGenerator generator = new UUIDGenerator(); + public static final String CRLF = "\r\n"; + public static final String COLONSP = ": "; + protected static final Pattern PROBLEMATIC_HEADERS = Pattern + 
.compile("(?i)(?:Content-(?:Encoding|Length)|Transfer-Encoding)"); + protected static final String X_HIDE_HEADER = "X-Crawler-"; public static final ANVLRecord getWARCInfoContent(Configuration conf) { ANVLRecord record = new ANVLRecord(); @@ -167,4 +173,107 @@ public class WARCUtils { return record; } + + /** + * Modify verbatim HTTP response headers: fix, remove or replace headers + * <code>Content-Length</code>, <code>Content-Encoding</code> and + * <code>Transfer-Encoding</code> which may confuse WARC readers. Ensure that + * returned header end with a single empty line (<code>\r\n\r\n</code>). + * + * @param headers + * HTTP 1.1 or 1.0 response header string, CR-LF-separated lines, + * first line is status line + * @return safe HTTP response header + */ + public static final String fixHttpHeaders(String headers, int contentLength) { + int start = 0, lineEnd = 0, last = 0, trailingCrLf= 0; + StringBuilder replace = new StringBuilder(); + while (start < headers.length()) { + lineEnd = headers.indexOf(CRLF, start); + trailingCrLf = 1; + if (lineEnd == -1) { + lineEnd = headers.length(); + trailingCrLf = 0; + } + int colonPos = -1; + for (int i = start; i < lineEnd; i++) { + if (headers.charAt(i) == ':') { + colonPos = i; + break; + } + } + if (colonPos == -1) { + boolean valid = true; + if (start == 0) { + // status line (without colon) + // TODO: http/2 + } else if ((lineEnd + 4) == headers.length() + && headers.endsWith(CRLF + CRLF)) { + // ok, trailing empty line + trailingCrLf = 2; + } else { + valid = false; + } + if (!valid) { + if (last < start) { + replace.append(headers.substring(last, start)); + } + last = lineEnd + 2 * trailingCrLf; + } + start = lineEnd + 2 * trailingCrLf; + /* + * skip over invalid header line, no further check for problematic + * headers required + */ + continue; + } + String name = headers.substring(start, colonPos); + if (PROBLEMATIC_HEADERS.matcher(name).matches()) { + boolean needsFix = true; + if 
(name.equalsIgnoreCase("content-length")) { + String value = headers.substring(colonPos + 1, lineEnd).trim(); + try { + int l = Integer.parseInt(value); + if (l == contentLength) { + needsFix = false; + } + } catch (NumberFormatException e) { + // needs to be fixed + } + } + if (needsFix) { + if (last < start) { + replace.append(headers.substring(last, start)); + } + last = lineEnd + 2 * trailingCrLf; + replace.append(X_HIDE_HEADER) + .append(headers.substring(start, lineEnd + 2 * trailingCrLf)); + if (trailingCrLf == 0) { + replace.append(CRLF); + trailingCrLf = 1; + } + if (name.equalsIgnoreCase("content-length")) { + // add effective uncompressed and unchunked length of content + replace.append("Content-Length").append(COLONSP) + .append(contentLength).append(CRLF); + } + } + } + start = lineEnd + 2 * trailingCrLf; + } + if (last > 0 || trailingCrLf != 2) { + if (last < headers.length()) { + // append trailing headers + replace.append(headers.substring(last)); + } + while (trailingCrLf < 2) { + replace.append(CRLF); + trailingCrLf++; + } + return replace.toString(); + } + return headers; + } + + } diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java b/src/java/org/apache/nutch/tools/warc/WARCExporter.java index 0b0b4c2..d307000 100644 --- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java +++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java @@ -51,6 +51,7 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.NutchWritable; import org.apache.nutch.parse.ParseSegment; import org.apache.nutch.protocol.Content; +import org.apache.nutch.tools.WARCUtils; import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; @@ -144,6 +145,7 @@ public class WARCExporter extends Configured implements Tool { // were the headers stored as is? 
Can write a response element then String headersVerbatim = content.getMetadata().get("_response.headers_"); + headersVerbatim = WARCUtils.fixHttpHeaders(headersVerbatim, content.getContent().length); byte[] httpheaders = new byte[0]; if (StringUtils.isNotBlank(headersVerbatim)) { // check that ends with an empty line @@ -241,7 +243,7 @@ public class WARCExporter extends Configured implements Tool { WARCRecord record = new WARCRecord(in); context.write(NullWritable.get(), new WARCWritable(record)); context.getCounter("WARCExporter", "records generated").increment(1); - } catch (IOException exception) { + } catch (IOException | IllegalStateException exception) { LOG.error("Exception when generating WARC record for {} : {}", key, exception.getMessage()); context.getCounter("WARCExporter", "exception").increment(1); diff --git a/src/plugin/lib-htmlunit/ivy.xml b/src/plugin/lib-htmlunit/ivy.xml index 6430535..f3b57b1 100644 --- a/src/plugin/lib-htmlunit/ivy.xml +++ b/src/plugin/lib-htmlunit/ivy.xml @@ -37,7 +37,8 @@ <dependencies> <!-- begin selenium dependencies --> - <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.44.0" /> + <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="3.141.5" /> + <dependency org="org.seleniumhq.selenium" name="htmlunit-driver" rev="2.35.1" /> <dependency org="com.opera" name="operadriver" rev="1.5"> <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" /> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java index 6cc0c4b..e76bc04 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java +++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java @@ -262,16 +262,14 @@ public class HttpResponse implements Response { } else if ("deflate".equals(contentEncoding)) { 
content = http.processDeflateEncoded(content, url); } else { - // store the headers verbatim only if the response was not compressed - // as the content length reported with not match otherwise - if (httpHeaders != null) { - headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString()); - } if (Http.LOG.isTraceEnabled()) { Http.LOG.trace("fetched " + content.length + " bytes from " + url); } } } + if (httpHeaders != null) { + headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString()); + } } } finally { diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java index 25efb5e..5a4b1ef 100644 --- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java +++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java @@ -294,16 +294,14 @@ public class HttpResponse implements Response { } else if ("deflate".equals(contentEncoding)) { content = http.processDeflateEncoded(content, url); } else { - // store the headers verbatim only if the response was not compressed - // as the content length reported does not match otherwise - if (httpHeaders != null) { - httpHeaders.append("\r\n"); - headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString()); - } if (Http.LOG.isTraceEnabled()) { Http.LOG.trace("fetched " + content.length + " bytes from " + url); } } + if (httpHeaders != null) { + httpHeaders.append("\r\n"); + headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString()); + } } catch (IOException | HttpException e) { // Headers parsing went fine, but an error occurred while trying to read // the body of the request (the body may be malformed) diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java index 516b2ec..4a20b04 100644 --- 
a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java +++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java @@ -295,6 +295,9 @@ public class HttpResponse implements Response { } } } + if (httpHeaders != null) { + headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString()); + } } } finally {