[nutch] 01/04: NUTCH-2716 Response headers are not stored for a compressed response

snagel Fri, 24 May 2019 06:23:38 -0700

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


commit 2fce5cdb41f6b838ab2e4e29932792672290bf5b
Author: Yossi Tamari <[email protected]>
AuthorDate: Tue May 7 19:22:52 2019 +0300

    NUTCH-2716 Response headers are not stored for a compressed response
    
    Even when store.http.headers=true, the HTTP headers are not saved for a
    gzipped or deflated response, because they may contain an incorrect
    content-length header.
    This causes WARCExporter to generate "resource" (header-less) entries
    instead of "response" entries.
    The correct behaviour is to store all the headers, and code that uses
    them should be aware and careful that they represent the original
    headers, not the stored content.
    
    This fixes protocol-http, protocol-selenium, and protocol-htmlunit to
    write the raw response headers, and adds logic to WARCExporter and
    CommonCrawlDataDumper to fix these headers.
    
    It also fixes NUTCH-2715 (WARCExporter fails on large records), and
    upgrades lib-htmlunit to use version 3.141.5 of Selenium, since Eclipse
    fails to compile otherwise (conflicts with lib-selenium).
---
 .../apache/nutch/tools/CommonCrawlFormatWARC.java  |   1 +
 src/java/org/apache/nutch/tools/WARCUtils.java     | 109 +++++++++++++++++++++
 .../org/apache/nutch/tools/warc/WARCExporter.java  |   4 +-
 src/plugin/lib-htmlunit/ivy.xml                    |   3 +-
 .../nutch/protocol/htmlunit/HttpResponse.java      |   8 +-
 .../apache/nutch/protocol/http/HttpResponse.java   |  10 +-
 .../nutch/protocol/selenium/HttpResponse.java      |   3 +
 7 files changed, 125 insertions(+), 13 deletions(-)

diff --git a/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java 
b/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java
index 02c2415..f401041 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java
@@ -192,6 +192,7 @@ public class CommonCrawlFormatWARC extends 
AbstractCommonCrawlFormat {
     ByteArrayOutputStream output = new ByteArrayOutputStream();
 
     String httpHeaders = metadata.get("_response.headers_");
+    httpHeaders = WARCUtils.fixHttpHeaders(httpHeaders, 
content.getContent().length);
 
     if (StringUtils.isNotBlank(httpHeaders)) {
       output.write(httpHeaders.getBytes());
diff --git a/src/java/org/apache/nutch/tools/WARCUtils.java 
b/src/java/org/apache/nutch/tools/WARCUtils.java
index a880783..34365a8 100644
--- a/src/java/org/apache/nutch/tools/WARCUtils.java
+++ b/src/java/org/apache/nutch/tools/WARCUtils.java
@@ -24,6 +24,7 @@ import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.util.Date;
 import java.util.List;
+import java.util.regex.Pattern;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.indexer.NutchDocument;
@@ -46,6 +47,11 @@ public class WARCUtils {
   public final static String CONFORMS_TO = "conformsTo";
   public final static String IP = "ip";
   public final static UUIDGenerator generator = new UUIDGenerator();
+  public static final String CRLF = "\r\n";
+  public static final String COLONSP = ": ";
+  protected static final Pattern PROBLEMATIC_HEADERS = Pattern
+      .compile("(?i)(?:Content-(?:Encoding|Length)|Transfer-Encoding)");
+  protected static final String X_HIDE_HEADER = "X-Crawler-";
 
   public static final ANVLRecord getWARCInfoContent(Configuration conf) {
     ANVLRecord record = new ANVLRecord();
@@ -167,4 +173,107 @@ public class WARCUtils {
 
     return record;
   }
+  
+  /**
+   * Modify verbatim HTTP response headers: fix, remove or replace headers
+   * <code>Content-Length</code>, <code>Content-Encoding</code> and
+   * <code>Transfer-Encoding</code> which may confuse WARC readers. Ensure that
+   * the returned header ends with a single empty line
+   * (<code>\r\n\r\n</code>).
+   * <p>
+   * A <code>Content-Length</code> header whose value already equals
+   * <code>contentLength</code> is left untouched. Every other matching header
+   * is kept under a new name prefixed with <code>X-Crawler-</code>; for
+   * <code>Content-Length</code> a corrected header holding
+   * <code>contentLength</code> is added right after the renamed one. Lines
+   * after the status line which do not contain a colon are removed.
+   * </p>
+   * 
+   * @param headers
+   *          HTTP 1.1 or 1.0 response header string, CR-LF-separated lines,
+   *          first line is status line; may be <code>null</code> if no
+   *          headers have been stored
+   * @param contentLength
+   *          effective (uncompressed and unchunked) length of the stored
+   *          content in bytes, used to verify or replace the
+   *          <code>Content-Length</code> header
+   * @return safe HTTP response header, or <code>null</code> if
+   *         <code>headers</code> is <code>null</code>
+   */
+  public static final String fixHttpHeaders(String headers,
+      int contentLength) {
+    if (headers == null) {
+      // no headers stored: nothing to fix, callers check the result for
+      // null/blank before using it
+      return null;
+    }
+    // start, lineEnd: bounds of the current line (lineEnd excludes CR-LF)
+    // last: input position up to which text was already copied or dropped
+    // trailingCrLf: number of CR-LF sequences terminating the current line
+    int start = 0, lineEnd = 0, last = 0, trailingCrLf = 0;
+    StringBuilder replace = new StringBuilder();
+    while (start < headers.length()) {
+      lineEnd = headers.indexOf(CRLF, start);
+      trailingCrLf = 1;
+      if (lineEnd == -1) {
+        // last line is not terminated by CR-LF
+        lineEnd = headers.length();
+        trailingCrLf = 0;
+      }
+      // position of the first colon separating header name and value
+      int colonPos = -1;
+      for (int i = start; i < lineEnd; i++) {
+        if (headers.charAt(i) == ':') {
+          colonPos = i;
+          break;
+        }
+      }
+      if (colonPos == -1) {
+        boolean valid = true;
+        if (start == 0) {
+          // status line (without colon)
+          // TODO: http/2
+        } else if ((lineEnd + 4) == headers.length()
+            && headers.endsWith(CRLF + CRLF)) {
+          // ok, trailing empty line
+          trailingCrLf = 2;
+        } else {
+          valid = false;
+        }
+        if (!valid) {
+          // drop the line: copy the pending text in front of it and move
+          // the copy position behind it
+          if (last < start) {
+            replace.append(headers.substring(last, start));
+          }
+          last = lineEnd + 2 * trailingCrLf;
+        }
+        start = lineEnd + 2 * trailingCrLf;
+        /*
+         * skip over invalid header line, no further check for problematic
+         * headers required
+         */
+        continue;
+      }
+      String name = headers.substring(start, colonPos);
+      if (PROBLEMATIC_HEADERS.matcher(name).matches()) {
+        boolean needsFix = true;
+        if (name.equalsIgnoreCase("content-length")) {
+          String value = headers.substring(colonPos + 1, lineEnd).trim();
+          try {
+            int l = Integer.parseInt(value);
+            if (l == contentLength) {
+              // header already states the length of the stored content
+              needsFix = false;
+            }
+          } catch (NumberFormatException e) {
+            // needs to be fixed
+          }
+        }
+        if (needsFix) {
+          if (last < start) {
+            replace.append(headers.substring(last, start));
+          }
+          last = lineEnd + 2 * trailingCrLf;
+          // keep the original header line under a prefixed name
+          replace.append(X_HIDE_HEADER)
+              .append(headers.substring(start, lineEnd + 2 * trailingCrLf));
+          if (trailingCrLf == 0) {
+            // terminate the renamed line if the input line was unterminated
+            replace.append(CRLF);
+            trailingCrLf = 1;
+          }
+          if (name.equalsIgnoreCase("content-length")) {
+            // add effective uncompressed and unchunked length of content
+            replace.append("Content-Length").append(COLONSP)
+                .append(contentLength).append(CRLF);
+          }
+        }
+      }
+      start = lineEnd + 2 * trailingCrLf;
+    }
+    if (last > 0 || trailingCrLf != 2) {
+      // something was changed or the terminating empty line is missing
+      if (last < headers.length()) {
+        // append trailing headers
+        replace.append(headers.substring(last));
+      }
+      while (trailingCrLf < 2) {
+        replace.append(CRLF);
+        trailingCrLf++;
+      }
+      return replace.toString();
+    }
+    // nothing to fix: return the input unchanged
+    return headers;
+  }
+
+  
 }
diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java 
b/src/java/org/apache/nutch/tools/warc/WARCExporter.java
index 0b0b4c2..d307000 100644
--- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java
+++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java
@@ -51,6 +51,7 @@ import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.NutchWritable;
 import org.apache.nutch.parse.ParseSegment;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.tools.WARCUtils;
 import org.apache.nutch.util.HadoopFSUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
@@ -144,6 +145,7 @@ public class WARCExporter extends Configured implements 
Tool {
 
         // were the headers stored as is? Can write a response element then
         String headersVerbatim = 
content.getMetadata().get("_response.headers_");
+        headersVerbatim = WARCUtils.fixHttpHeaders(headersVerbatim, 
content.getContent().length);
         byte[] httpheaders = new byte[0];
         if (StringUtils.isNotBlank(headersVerbatim)) {
           // check that ends with an empty line
@@ -241,7 +243,7 @@ public class WARCExporter extends Configured implements 
Tool {
           WARCRecord record = new WARCRecord(in);
           context.write(NullWritable.get(), new WARCWritable(record));
           context.getCounter("WARCExporter", "records generated").increment(1);
-        } catch (IOException exception) {
+        } catch (IOException | IllegalStateException exception) {
           LOG.error("Exception when generating WARC record for {} : {}", key,
               exception.getMessage());
           context.getCounter("WARCExporter", "exception").increment(1);
diff --git a/src/plugin/lib-htmlunit/ivy.xml b/src/plugin/lib-htmlunit/ivy.xml
index 6430535..f3b57b1 100644
--- a/src/plugin/lib-htmlunit/ivy.xml
+++ b/src/plugin/lib-htmlunit/ivy.xml
@@ -37,7 +37,8 @@
 
   <dependencies>
     <!-- begin selenium dependencies -->
-    <dependency org="org.seleniumhq.selenium" name="selenium-java" 
rev="2.44.0" />
+    <dependency org="org.seleniumhq.selenium" name="selenium-java" 
rev="3.141.5" />
+    <dependency org="org.seleniumhq.selenium" name="htmlunit-driver" 
rev="2.35.1" />
     
     <dependency org="com.opera" name="operadriver" rev="1.5">
       <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
diff --git 
a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
 
b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
index 6cc0c4b..e76bc04 100644
--- 
a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
+++ 
b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -262,16 +262,14 @@ public class HttpResponse implements Response {
           } else if ("deflate".equals(contentEncoding)) {
             content = http.processDeflateEncoded(content, url);
           } else {
-            // store the headers verbatim only if the response was not 
compressed
-            // as the content length reported with not match otherwise
-            if (httpHeaders != null) {
-              headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
-            }
             if (Http.LOG.isTraceEnabled()) {
               Http.LOG.trace("fetched " + content.length + " bytes from " + 
url);
             }
           }
         }
+        if (httpHeaders != null) {
+          headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
+        }
       }
 
     } finally {
diff --git 
a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
 
b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
index 25efb5e..5a4b1ef 100644
--- 
a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
+++ 
b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
@@ -294,16 +294,14 @@ public class HttpResponse implements Response {
         } else if ("deflate".equals(contentEncoding)) {
           content = http.processDeflateEncoded(content, url);
         } else {
-          // store the headers verbatim only if the response was not compressed
-          // as the content length reported does not match otherwise
-          if (httpHeaders != null) {
-            httpHeaders.append("\r\n");
-            headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
-          }
           if (Http.LOG.isTraceEnabled()) {
             Http.LOG.trace("fetched " + content.length + " bytes from " + url);
           }
         }
+        if (httpHeaders != null) {
+          httpHeaders.append("\r\n");
+          headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
+        }
       } catch (IOException | HttpException e) {
         // Headers parsing went fine, but an error occurred while trying to 
read
         // the body of the request (the body may be malformed)
diff --git 
a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
 
b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
index 516b2ec..4a20b04 100644
--- 
a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
+++ 
b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
@@ -295,6 +295,9 @@ public class HttpResponse implements Response {
             }
           }
         }
+        if (httpHeaders != null) {
+          headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
+        }
       } 
 
     } finally {

[nutch] 01/04: NUTCH-2716 Response headers are not stored for a compressed response

Reply via email to