[ 
https://issues.apache.org/jira/browse/NUTCH-2549?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16509804#comment-16509804
 ] 

ASF GitHub Bot commented on NUTCH-2549:
---------------------------------------

sebastian-nagel closed pull request #347: NUTCH-2549  protocol-http does not 
behave the same as browsers
URL: https://github.com/apache/nutch/pull/347
 
 
   

This PR was merged from a forked repository. Because GitHub hides the
original diff of a foreign pull request (one from a fork) once it is
merged, the diff is reproduced below for the sake of provenance:

diff --git a/build.xml b/build.xml
index 1d680d0bd..d4836a4f2 100644
--- a/build.xml
+++ b/build.xml
@@ -215,6 +215,7 @@
       <packageset dir="${plugins.dir}/protocol-http/src/java"/>
       <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
       <packageset dir="${plugins.dir}/protocol-interactiveselenium/src/java"/>
+      <packageset dir="${plugins.dir}/protocol-okhttp/src/java"/>
       <packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
       <packageset dir="${plugins.dir}/publish-rabbitmq/src/java"/>
       <packageset dir="${plugins.dir}/scoring-depth/src/java"/>
@@ -673,6 +674,7 @@
       <packageset dir="${plugins.dir}/protocol-http/src/java"/>
       <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
       <packageset dir="${plugins.dir}/protocol-interactiveselenium/src/java"/>
+      <packageset dir="${plugins.dir}/protocol-okhttp/src/java"/>
       <packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
       <packageset dir="${plugins.dir}/publish-rabbitmq/src/java"/>
       <packageset dir="${plugins.dir}/scoring-depth/src/java"/>
@@ -1107,6 +1109,8 @@
         <source path="${plugins.dir}/protocol-httpclient/src/java/" />
         <source path="${plugins.dir}/protocol-httpclient/src/test/" />
         <source path="${plugins.dir}/protocol-interactiveselenium/src/java/" />
+        <source path="${plugins.dir}/protocol-okhttp/src/java/" />
+        <source path="${plugins.dir}/protocol-okhttp/src/test/" />
         <source path="${plugins.dir}/protocol-selenium/src/java"/>
         <source path="${plugins.dir}/publish-rabbitmq/src/java"/>
         <source path="${plugins.dir}/scoring-depth/src/java/" />
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 37f73b8cd..cb2d2df50 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -277,6 +277,15 @@
   </description>
 </property>
 
+<property>
+  <name>http.proxy.type</name>
+  <value>HTTP</value>
+  <description>
+    Proxy type: HTTP or SOCKS (cf. java.net.Proxy.Type).
+    Note: supported by protocol-okhttp.
+  </description>
+</property>
+
 <property>
   <name>http.proxy.exception.list</name>
   <value></value>
@@ -301,9 +310,22 @@
 
 <property>
   <name>http.useHttp11</name>
+  <value>true</value>
+  <description>
+    If true, use HTTP/1.1; if false, use HTTP/1.0.
+  </description>
+</property>
+
+<property>
+  <name>http.useHttp2</name>
   <value>false</value>
-  <description>NOTE: at the moment this works only for protocol-httpclient.
-  If true, use HTTP 1.1, if false use HTTP 1.0 .
+  <description>
+    If true, try HTTP/2 and fall back to HTTP/1.1 if HTTP/2 is not
+    supported; if false, always use HTTP/1.1.
+
+    NOTE: HTTP/2 is currently only supported by protocol-okhttp and
+    requires at runtime Java 9 or a modified Java 8 with support for
+    ALPN (Application Layer Protocol Negotiation).
   </description>
 </property>
 
diff --git a/src/java/org/apache/nutch/metadata/HttpHeaders.java 
b/src/java/org/apache/nutch/metadata/HttpHeaders.java
index 71a66f66c..b7700e5d3 100644
--- a/src/java/org/apache/nutch/metadata/HttpHeaders.java
+++ b/src/java/org/apache/nutch/metadata/HttpHeaders.java
@@ -28,6 +28,8 @@
 
   public static final String TRANSFER_ENCODING = "Transfer-Encoding";
 
+  public static final String CLIENT_TRANSFER_ENCODING = 
"Client-Transfer-Encoding";
+
   public static final String CONTENT_ENCODING = "Content-Encoding";
 
   public static final String CONTENT_LANGUAGE = "Content-Language";
@@ -48,4 +50,8 @@
 
   public static final String LOCATION = "Location";
 
+  public static final String IF_MODIFIED_SINCE = "If-Modified-Since";
+
+  public static final String USER_AGENT = "User-Agent";
+
 }
diff --git a/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java 
b/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
index 9434cab60..fdbf1b62c 100644
--- a/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
+++ b/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
@@ -32,9 +32,10 @@
 public class SpellCheckedMetadata extends Metadata {
 
   /**
-   * Treshold divider.
+   * Threshold divider to calculate max. Levenshtein distance for misspelled
+   * header field names:
    * 
-   * <code>threshold = searched.length() / TRESHOLD_DIVIDER;</code>
+   * <code>threshold = Math.min(3, searched.length() / 
TRESHOLD_DIVIDER);</code>
    */
   private static final int TRESHOLD_DIVIDER = 3;
 
@@ -112,7 +113,7 @@ public static String getNormalizedName(final String name) {
     String value = NAMES_IDX.get(searched);
 
     if ((value == null) && (normalized != null)) {
-      int threshold = searched.length() / TRESHOLD_DIVIDER;
+      int threshold = Math.min(3, searched.length() / TRESHOLD_DIVIDER);
       for (int i = 0; i < normalized.length && value == null; i++) {
         if (StringUtils.getLevenshteinDistance(searched, normalized[i]) < 
threshold) {
           value = NAMES_IDX.get(normalized[i]);
diff --git a/src/java/org/apache/nutch/net/protocols/Response.java 
b/src/java/org/apache/nutch/net/protocols/Response.java
index c9139bd6c..7096c934d 100644
--- a/src/java/org/apache/nutch/net/protocols/Response.java
+++ b/src/java/org/apache/nutch/net/protocols/Response.java
@@ -26,6 +26,32 @@
  */
 public interface Response extends HttpHeaders {
 
+  /** Key to hold the HTTP request if <code>store.http.request</code> is true 
*/
+  public static final String REQUEST = "_request_";
+
+  /**
+   * Key to hold the HTTP response header if <code>store.http.headers</code> is
+   * true
+   */
+  public static final String RESPONSE_HEADERS = "_response.headers_";
+
+  /**
+   * Key to hold the IP address the request is sent to if
+   * <code>store.ip.address</code> is true
+   */
+  public static final String IP_ADDRESS = "_ip_";
+
+  /**
+   * Key to hold the time when the page has been fetched
+   */
+  public static final String FETCH_TIME = "nutch.fetch.time";
+
+  /**
+   * Key to hold boolean whether content has been trimmed because it exceeds
+   * <code>http.content.limit</code>
+   */
+  public static final String TRIMMED_CONTENT = "http.content.trimmed";
+
   /** Returns the URL used to retrieve this response. */
   public URL getUrl();
 
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 5a3a8c910..a9cb912cc 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -73,6 +73,7 @@
     <ant dir="protocol-http" target="deploy"/>
     <ant dir="protocol-httpclient" target="deploy"/>
     <ant dir="protocol-interactiveselenium" target="deploy" />
+    <ant dir="protocol-okhttp" target="deploy"/>
     <ant dir="protocol-selenium" target="deploy" />
     <ant dir="publish-rabbitmq" target="deploy"/>
     <ant dir="scoring-depth" target="deploy"/>
@@ -132,6 +133,7 @@
      <ant dir="protocol-file" target="test"/>
      <ant dir="protocol-http" target="test"/>
      <ant dir="protocol-httpclient" target="test"/>
+     <ant dir="protocol-okhttp" target="test"/>
      <ant dir="scoring-orphan" target="test"/>
      <ant dir="subcollection" target="test"/>
      <ant dir="urlfilter-automaton" target="test"/>
@@ -209,6 +211,7 @@
     <ant dir="protocol-http" target="clean"/>
     <ant dir="protocol-httpclient" target="clean"/>
     <ant dir="protocol-interactiveselenium" target="clean" />
+    <ant dir="protocol-okhttp" target="clean"/>
     <ant dir="protocol-selenium" target="clean" />
     <ant dir="publish-rabbitmq" target="clean"/>
     <ant dir="scoring-depth" target="clean"/>
diff --git 
a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java 
b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index d9284c9aa..1cb2bb151 100644
--- 
a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ 
b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -20,6 +20,8 @@
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.Reader;
+import java.net.Proxy;
+import java.net.URI;
 import java.net.URL;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -70,8 +72,11 @@
   /** The proxy port. */
   protected int proxyPort = 8080;
   
+  /** The proxy type (HTTP or SOCKS, see java.net.Proxy.Type). */
+  protected Proxy.Type proxyType = Proxy.Type.HTTP;
+
   /** The proxy exception list. */
-  protected HashMap proxyException = new HashMap(); 
+  protected HashMap<String,String> proxyException = new HashMap<>();
 
   /** Indicates if a proxy is used */
   protected boolean useProxy = false;
@@ -89,7 +94,7 @@
   /** The "Accept-Language" request header value. */
   protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3";
 
-  /** The "Accept-Language" request header value. */
+  /** The "Accept-Charset" request header value. */
   protected String acceptCharset = "utf-8,iso-8859-1;q=0.7,*;q=0.7";
 
   /** The "Accept" request header value. */
@@ -108,12 +113,33 @@
   /** Do we use HTTP/1.1? */
   protected boolean useHttp11 = false;
 
+  /** Whether to use HTTP/2 */
+  protected boolean useHttp2 = false;
+
   /**
    * Record response time in CrawlDatum's meta data, see property
    * http.store.responsetime.
    */
   protected boolean responseTime = true;
 
+  /**
+   * Record the IP address of the responding server, see property
+   * <code>store.ip.address</code>.
+   */
+  protected boolean storeIPAddress = false;
+
+  /**
+   * Record the HTTP request in the metadata, see property
+   * <code>store.http.request</code>.
+   */
+  protected boolean storeHttpRequest = false;
+
+  /**
+   * Record the HTTP response header in the metadata, see property
+   * <code>store.http.headers</code>.
+   */
+  protected boolean storeHttpHeaders = false;
+
   /** Skip page if Crawl-Delay longer than this value. */
   protected long maxCrawlDelay = -1L;
 
@@ -147,6 +173,7 @@ public void setConf(Configuration conf) {
     this.conf = conf;
     this.proxyHost = conf.get("http.proxy.host");
     this.proxyPort = conf.getInt("http.proxy.port", 8080);
+    this.proxyType = Proxy.Type.valueOf(conf.get("http.proxy.type", "HTTP"));
     this.proxyException = 
arrayToMap(conf.getStrings("http.proxy.exception.list"));
     this.useProxy = (proxyHost != null && proxyHost.length() > 0);
     this.timeout = conf.getInt("http.timeout", 10000);
@@ -160,7 +187,11 @@ public void setConf(Configuration conf) {
     this.accept = conf.get("http.accept", accept).trim();
     // backward-compatible default setting
     this.useHttp11 = conf.getBoolean("http.useHttp11", false);
+    this.useHttp2 = conf.getBoolean("http.useHttp2", false);
     this.responseTime = conf.getBoolean("http.store.responsetime", true);
+    this.storeIPAddress = conf.getBoolean("store.ip.address", false);
+    this.storeHttpRequest = conf.getBoolean("store.http.request", false);
+    this.storeHttpHeaders = conf.getBoolean("store.http.headers", false);
     this.enableIfModifiedsinceHeader = 
conf.getBoolean("http.enable.if.modified.since.header", true);
     this.enableCookieHeader = conf.getBoolean("http.enable.cookie.header", 
true);
     this.robots.setConf(conf);
@@ -360,9 +391,15 @@ public int getProxyPort() {
   }
 
   public boolean useProxy(URL url) {
-    if (!useProxy){
-      return false;
-    } else if (proxyException.get(url.getHost())!=null){
+    return useProxy(url.getHost());
+  }
+
+  public boolean useProxy(URI uri) {
+    return useProxy(uri.getHost());
+  }
+
+  public boolean useProxy(String host) {
+    if (useProxy && proxyException.containsKey(host)) {
       return false;
     }
     return useProxy;
@@ -380,13 +417,26 @@ public boolean isCookieEnabled() {
     return enableCookieHeader;
   }
 
+  public boolean isStoreIPAddress() {
+    return storeIPAddress;
+  }
+
+  public boolean isStoreHttpRequest() {
+    return storeHttpRequest;
+  }
+
+  public boolean isStoreHttpHeaders() {
+    return storeHttpHeaders;
+  }
+
   public int getMaxContent() {
     return maxContent;
   }
 
   public String getUserAgent() {
-    if (userAgentNames!=null) {
-      return 
userAgentNames.get(ThreadLocalRandom.current().nextInt(userAgentNames.size()));
+    if (userAgentNames != null) {
+      return userAgentNames
+          .get(ThreadLocalRandom.current().nextInt(userAgentNames.size()));
     }
     return userAgent;
   }
diff --git 
a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
 
b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
index 8b1a03183..19c00fde5 100644
--- 
a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
+++ 
b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -205,7 +205,7 @@ public HttpResponse(HttpBase http, URL url, CrawlDatum 
datum)
 
       // store the request in the metadata?
       if (conf.getBoolean("store.http.request", false) == true) {
-        headers.add("_request_", reqStr.toString());
+        headers.add(Response.REQUEST, reqStr.toString());
       }
 
       byte[] reqBytes = reqStr.toString().getBytes();
@@ -263,7 +263,7 @@ public HttpResponse(HttpBase http, URL url, CrawlDatum 
datum)
             // store the headers verbatim only if the response was not 
compressed
             // as the content length reported with not match otherwise
             if (httpHeaders != null) {
-              headers.add("_response.headers_", httpHeaders.toString());
+              headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
             }
             if (Http.LOG.isTraceEnabled()) {
               Http.LOG.trace("fetched " + content.length + " bytes from " + 
url);
diff --git 
a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
 
b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
index 56ae789f3..4b5544e90 100644
--- 
a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
+++ 
b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
@@ -18,7 +18,6 @@
 
 import java.io.BufferedInputStream;
 import java.io.ByteArrayOutputStream;
-import java.io.EOFException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
@@ -26,6 +25,7 @@
 import java.net.InetSocketAddress;
 import java.net.Socket;
 import java.net.URL;
+import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Set;
@@ -33,9 +33,9 @@
 import javax.net.ssl.SSLSocket;
 import javax.net.ssl.SSLSocketFactory;
 
-import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.HttpHeaders;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.SpellCheckedMetadata;
 import org.apache.nutch.net.protocols.HttpDateFormat;
@@ -49,7 +49,6 @@
  */
 public class HttpResponse implements Response {
 
-  private Configuration conf;
   private HttpBase http;
   private URL url;
   private byte[] content;
@@ -91,7 +90,10 @@ public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
       Http.LOG.trace("fetching " + url);
     }
 
-    String path = "".equals(url.getFile()) ? "/" : url.getFile();
+    String path = url.getFile();
+    if (!path.startsWith("/")) {
+      path = "/" + path;
+    }
 
     // some servers will redirect a request with a host line like
     // "Host: <hostname>:80" to "http://<hpstname>/<orig_path>"- they
@@ -150,9 +152,7 @@ public HttpResponse(HttpBase http, URL url, CrawlDatum 
datum)
         socket = sslsocket;
       }
 
-      this.conf = http.getConf();
-      if (sockAddr != null
-          && conf.getBoolean("store.ip.address", false) == true) {
+      if (sockAddr != null && http.isStoreIPAddress()) {
         headers.add("_ip_", sockAddr.getAddress().getHostAddress());
       }
 
@@ -166,7 +166,11 @@ public HttpResponse(HttpBase http, URL url, CrawlDatum 
datum)
         reqStr.append(path);
       }
 
-      reqStr.append(" HTTP/1.0\r\n");
+      if (http.getUseHttp11()) {
+        reqStr.append(" HTTP/1.1\r\n");
+      } else {
+        reqStr.append(" HTTP/1.0\r\n");
+      }
 
       reqStr.append("Host: ");
       reqStr.append(host);
@@ -217,15 +221,19 @@ public HttpResponse(HttpBase http, URL url, CrawlDatum 
datum)
       }
 
       if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
-        reqStr.append("If-Modified-Since: " + HttpDateFormat
-            .toString(datum.getModifiedTime()));
+        reqStr.append(HttpHeaders.IF_MODIFIED_SINCE + ": "
+            + HttpDateFormat.toString(datum.getModifiedTime()));
         reqStr.append("\r\n");
       }
+
+      // "signal that this connection will be closed after completion of the
+      // response", see https://tools.ietf.org/html/rfc7230#section-6.1
+      reqStr.append("Connection: close\r\n");
       reqStr.append("\r\n");
 
       // store the request in the metadata?
-      if (conf.getBoolean("store.http.request", false) == true) {
-        headers.add("_request_", reqStr.toString());
+      if (http.isStoreHttpRequest()) {
+        headers.add(Response.REQUEST, reqStr.toString());
       }
 
       byte[] reqBytes = reqStr.toString().getBytes();
@@ -239,18 +247,28 @@ public HttpResponse(HttpBase http, URL url, CrawlDatum 
datum)
                   Http.BUFFER_SIZE), Http.BUFFER_SIZE);
 
       StringBuffer line = new StringBuffer();
+      StringBuffer lineSeparator = new StringBuffer();
 
       // store the http headers verbatim
-      if (conf.getBoolean("store.http.headers", false) == true) {
+      if (http.isStoreHttpHeaders()) {
         httpHeaders = new StringBuffer();
       }
 
-      headers.add("nutch.fetch.time", 
Long.toString(System.currentTimeMillis()));
+      headers.add(FETCH_TIME, Long.toString(System.currentTimeMillis()));
 
       boolean haveSeenNonContinueStatus = false;
       while (!haveSeenNonContinueStatus) {
         // parse status code line
-        this.code = parseStatusLine(in, line);
+        try {
+          this.code = parseStatusLine(in, line, lineSeparator);
+        } catch(HttpException e) {
+          Http.LOG.warn("Missing or invalid HTTP status line", e);
+          Http.LOG.warn("No HTTP header, assuming HTTP/0.9 for {}", getUrl());
+          this.code = 200;
+          
in.unread(lineSeparator.toString().getBytes(StandardCharsets.ISO_8859_1));
+          in.unread(line.toString().getBytes(StandardCharsets.ISO_8859_1));
+          break;
+        }
         if (httpHeaders != null)
           httpHeaders.append(line).append("\n");
         // parse headers
@@ -258,30 +276,46 @@ public HttpResponse(HttpBase http, URL url, CrawlDatum 
datum)
         haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
       }
 
-      String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
-      if (transferEncoding != null && "chunked"
-          .equalsIgnoreCase(transferEncoding.trim())) {
-        readChunkedContent(in, line);
-      } else {
-        readPlainContent(in);
-      }
+      try {
+        String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
+        if (transferEncoding != null
+            && "chunked".equalsIgnoreCase(transferEncoding.trim())) {
+          readChunkedContent(in, line);
+        } else {
+          readPlainContent(in);
+        }
 
-      String contentEncoding = getHeader(Response.CONTENT_ENCODING);
-      if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
-        content = http.processGzipEncoded(content, url);
-      } else if ("deflate".equals(contentEncoding)) {
-        content = http.processDeflateEncoded(content, url);
-      } else {
-        // store the headers verbatim only if the response was not compressed
-        // as the content length reported with not match otherwise
-        if (httpHeaders != null) {
-          headers.add("_response.headers_", httpHeaders.toString());
+        String contentEncoding = getHeader(Response.CONTENT_ENCODING);
+        if ("gzip".equals(contentEncoding)
+            || "x-gzip".equals(contentEncoding)) {
+          content = http.processGzipEncoded(content, url);
+        } else if ("deflate".equals(contentEncoding)) {
+          content = http.processDeflateEncoded(content, url);
+        } else {
+          // store the headers verbatim only if the response was not compressed
+          // as the content length reported does not match otherwise
+          if (httpHeaders != null) {
+            headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
+          }
+          if (Http.LOG.isTraceEnabled()) {
+            Http.LOG.trace("fetched " + content.length + " bytes from " + url);
+          }
         }
-        if (Http.LOG.isTraceEnabled()) {
-          Http.LOG.trace("fetched " + content.length + " bytes from " + url);
+      } catch (IOException | HttpException e) {
+        // Headers parsing went fine, but an error occurred while trying to 
read
+        // the body of the response (the body may be malformed)
+        if (code != 200) {
+          Http.LOG.warn(
+              "Ignored exception while reading payload of response with status 
code "
+                  + code + ":",
+              e);
+          content = null;
+        } else {
+          // If the page is a "200 OK" response, we do not want to go further
+          // with processing the invalid payload.
+          throw e;
         }
       }
-
     } finally {
       if (socket != null)
         socket.close();
@@ -352,10 +386,11 @@ private void readPlainContent(InputStream in)
     if (contentLengthString != null) {
       contentLengthString = contentLengthString.trim();
       try {
-        if (!contentLengthString.isEmpty())
+        if (!contentLengthString.isEmpty()) {
           contentLength = Integer.parseInt(contentLengthString);
+        }
       } catch (NumberFormatException e) {
-        throw new HttpException("bad content length: " + contentLengthString);
+        Http.LOG.warn("bad content length: {}", contentLengthString);
       }
     }
     if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
@@ -491,41 +526,34 @@ private void readChunkedContent(PushbackInputStream in, 
StringBuffer line)
 
   }
 
-  private int parseStatusLine(PushbackInputStream in, StringBuffer line)
-      throws IOException, HttpException {
-    readLine(in, line, false);
+  private int parseStatusLine(PushbackInputStream in, StringBuffer line,
+      StringBuffer lineSeparator) throws IOException, HttpException {
+    readLine(in, line, false, 2048, lineSeparator);
 
     int codeStart = line.indexOf(" ");
-    int codeEnd = line.indexOf(" ", codeStart + 1);
-
-    // handle lines with no plaintext result code, ie:
-    // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
-    if (codeEnd == -1)
-      codeEnd = line.length();
+    int codeEnd;
+    int lineLength = line.length();
+
+    // We want to handle lines like "HTTP/1.1 200", "HTTP/1.1 200 OK", or 
"HTTP/1.1 404: Not Found"
+    for (codeEnd = codeStart + 1; codeEnd < lineLength; codeEnd++) {
+      if (!Character.isDigit(line.charAt(codeEnd))) break;
+      // Note: input is plain ASCII and may not contain Arabic etc. digits
+      // covered by Character.isDigit()
+    }
 
-    int code;
     try {
-      code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
+      return Integer.parseInt(line.substring(codeStart + 1, codeEnd));
     } catch (NumberFormatException e) {
-      throw new HttpException(
-          "bad status line '" + line + "': " + e.getMessage(), e);
+      throw new HttpException("Bad status line, no HTTP response code: " + 
line, e);
     }
-
-    return code;
   }
 
-  private void processHeaderLine(StringBuffer line)
-      throws IOException, HttpException {
+  private void processHeaderLine(StringBuffer line) {
 
     int colonIndex = line.indexOf(":"); // key is up to colon
     if (colonIndex == -1) {
-      int i;
-      for (i = 0; i < line.length(); i++)
-        if (!Character.isWhitespace(line.charAt(i)))
-          break;
-      if (i == line.length())
-        return;
-      throw new HttpException("No colon in header:" + line);
+      Http.LOG.info("Ignoring a header line without a colon: '{}'", line);
+      return;
     }
     String key = line.substring(0, colonIndex);
 
@@ -555,7 +583,7 @@ private void parseHeaders(PushbackInputStream in, 
StringBuffer line,
           (pos = line.indexOf("<HTML")) != -1) || ((pos = 
line.indexOf("<html"))
           != -1)) {
 
-        in.unread(line.substring(pos).getBytes("UTF-8"));
+        in.unread(line.substring(pos).getBytes(StandardCharsets.ISO_8859_1));
         line.setLength(pos);
 
         try {
@@ -578,14 +606,31 @@ private void parseHeaders(PushbackInputStream in, 
StringBuffer line,
 
   private static int readLine(PushbackInputStream in, StringBuffer line,
       boolean allowContinuedLine) throws IOException {
+    return readLine(in, line, allowContinuedLine, Http.BUFFER_SIZE, null);
+  }
+
+  private static int readLine(PushbackInputStream in, StringBuffer line,
+      boolean allowContinuedLine, int maxBytes, StringBuffer lineSeparator) 
throws IOException {
     line.setLength(0);
-    for (int c = in.read(); c != -1; c = in.read()) {
+    int bytesRead = 0;
+    for (int c = in.read(); c != -1
+        && bytesRead < maxBytes; c = in.read(), bytesRead++) {
       switch (c) {
       case '\r':
+        if (lineSeparator != null) {
+          lineSeparator.append((char) c);
+        }
         if (peek(in) == '\n') {
           in.read();
+          if (lineSeparator != null) {
+            lineSeparator.append((char) c);
+          }
         }
+        // fall-through
       case '\n':
+        if (lineSeparator != null) {
+          lineSeparator.append((char) c);
+        }
         if (line.length() > 0) {
           // at EOL -- check for continued line if the current
           // (possibly continued) line wasn't blank
@@ -594,6 +639,9 @@ private static int readLine(PushbackInputStream in, 
StringBuffer line,
             case ' ':
             case '\t': // line is continued
               in.read();
+              if (lineSeparator != null) {
+                lineSeparator.replace(0, lineSeparator.length(), "");
+              }
               continue;
             }
         }
@@ -602,7 +650,11 @@ private static int readLine(PushbackInputStream in, 
StringBuffer line,
         line.append((char) c);
       }
     }
-    throw new EOFException();
+    if (bytesRead >= maxBytes) {
+      throw new IOException("Line exceeds max. buffer size: "
+          + line.substring(0, Math.min(32, line.length())));
+    }
+    return line.length();
   }
 
   private static int peek(PushbackInputStream in) throws IOException {
diff --git a/src/plugin/protocol-http/src/test/conf/nutch-site-test.xml 
b/src/plugin/protocol-http/src/test/conf/nutch-site-test.xml
index a9afd7816..c419e59b2 100644
--- a/src/plugin/protocol-http/src/test/conf/nutch-site-test.xml
+++ b/src/plugin/protocol-http/src/test/conf/nutch-site-test.xml
@@ -49,4 +49,10 @@
   <description></description>
 </property>
 
-</configuration>
\ No newline at end of file
+<property>
+  <name>http.content.limit</name>
+  <value>65536</value>
+  <description></description>
+</property>
+
+</configuration>
diff --git 
a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestBadServerResponses.java
 
b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestBadServerResponses.java
new file mode 100644
index 000000000..51c7930b6
--- /dev/null
+++ 
b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestBadServerResponses.java
@@ -0,0 +1,313 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.http;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.lang.invoke.MethodHandles;
+import java.net.InetSocketAddress;
+import java.net.ServerSocket;
+import java.net.Socket;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.junit.After;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Test cases for protocol-http - robustness regarding bad server responses:
+ * malformed HTTP header lines, etc. See, NUTCH-2549.
+ */
+public class TestBadServerResponses {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  private Http http;
+  private ServerSocket server;
+  private Configuration conf;
+  private int port = 47505;
+
+  private static final String responseHeader = "HTTP/1.1 200 OK\r\n";
+  private static final String simpleContent = "Content-Type: text/html\r\n\r\nThis is a text.";
+
+  public void setUp() throws Exception {
+    conf = new Configuration();
+    conf.addResource("nutch-default.xml");
+    conf.addResource("nutch-site-test.xml");
+    conf.setBoolean("store.http.headers", true);
+
+    http = new Http();
+    http.setConf(conf);
+  }
+
+  @After
+  public void tearDown() throws Exception {
+    server.close();
+  }
+
+  /**
+   * Starts the test server at a specified port and constant response.
+   * 
+   * @param port
+   *          Port number.
+   * @param response
+   *          response sent on every request
+   */
+  private void runServer(int port, String response) throws Exception {
+    server = new ServerSocket();
+    server.bind(new InetSocketAddress("127.0.0.1", port));
+    Pattern requestPattern = Pattern.compile("(?i)^GET\\s+(\\S+)");
+    while (true) {
+      LOG.info("Listening on port {}", port);
+      Socket socket = server.accept();
+      LOG.info("Connection received");
+      try (
+          BufferedReader in = new BufferedReader(new InputStreamReader(
+              socket.getInputStream(), StandardCharsets.UTF_8));
+          PrintWriter out = new PrintWriter(new OutputStreamWriter(
+              socket.getOutputStream(), StandardCharsets.UTF_8), true)) {
+
+        String line;
+        while ((line = in.readLine()) != null) {
+          LOG.info("Request: {}", line);
+          if (line.trim().isEmpty()) {
+            break;
+          }
+          Matcher m = requestPattern.matcher(line);
+          if (m.find()) {
+            LOG.info("Requested {}", m.group(1));
+            if (!m.group(1).startsWith("/")) {
+              response = "HTTP/1.1 400 Bad request\r\n\r\n";
+            }
+          }
+        }
+        LOG.info("Response: {}",
+            response.substring(0, Math.min(1024, response.length())));
+        out.print(response);
+      } catch (Exception e) {
+        LOG.warn("Exception in test server:", e);
+      }
+    }
+  }
+
+  private void launchServer(String response) throws InterruptedException {
+    Thread serverThread = new Thread(() -> {
+      try {
+        runServer(port, response);
+      } catch (Exception e) {
+        LOG.warn("Test server died:", e);
+      }
+    });
+    serverThread.start();
+    Thread.sleep(50);
+  }
+
+  /**
+   * Fetches the specified <code>page</code> from the local test server and
+   * checks whether the HTTP response status code matches with the expected
+   * code.
+   * 
+   * @param page
+   *          Page to be fetched.
+   * @param expectedCode
+   *          HTTP response status code expected while fetching the page.
+   */
+  private Response fetchPage(String page, int expectedCode) throws Exception {
+    URL url = new URL("http", "127.0.0.1", port, page);
+    LOG.info("Fetching {}", url);
+    CrawlDatum crawlDatum = new CrawlDatum();
+    Response response = http.getResponse(url, crawlDatum, true);
+    assertEquals("HTTP Status Code for " + url, expectedCode,
+        response.getCode());
+    return response;
+  }
+
+  @Test
+  public void testBadHttpServer() throws Exception {
+    setUp();
+    // test with trivial well-formed content, to make sure the server is
+    // responding
+    launchServer(responseHeader + simpleContent);
+    fetchPage("/", 200);
+  }
+
+  /**
+   * NUTCH-2555 URL normalization problem: path not starting with a '/'
+   */
+  @Test
+  public void testRequestNotStartingWithSlash() throws Exception {
+    setUp();
+    launchServer(responseHeader + simpleContent);
+    fetchPage("?171", 200);
+  }
+
+  /**
+   * NUTCH-2564 protocol-http throws an error when the content-length header is
+   * not a number
+   */
+  @Test
+  public void testContentLengthNotANumber() throws Exception {
+    setUp();
+    launchServer(
+        responseHeader + "Content-Length: thousand\r\n" + simpleContent);
+    fetchPage("/", 200);
+  }
+
+  /**
+   * NUTCH-2559 protocol-http cannot handle colons after the HTTP status code
+   */
+  @Test
+  public void testHeaderWithColon() throws Exception {
+    setUp();
+    launchServer("HTTP/1.1 200: OK\r\n" + simpleContent);
+    fetchPage("/", 200);
+  }
+
+  /**
+   * NUTCH-2563 HTTP header spellchecking issues
+   */
+  @Test
+  public void testHeaderSpellChecking() throws Exception {
+    setUp();
+    launchServer(responseHeader + "Client-Transfer-Encoding: chunked\r\n"
+        + simpleContent);
+    fetchPage("/", 200);
+  }
+
+  /**
+   * NUTCH-2557 protocol-http fails to follow redirections when an HTTP response
+   * body is invalid
+   */
+  @Test
+  public void testIgnoreErrorInRedirectPayload() throws Exception {
+    setUp();
+    launchServer("HTTP/1.1 302 Found\r\nLocation: http://example.com/\r\n"
+        + "Transfer-Encoding: chunked\r\n\r\nNot a valid chunk.");
+    Response fetched = fetchPage("/", 302);
+    assertNotNull("No redirect Location.", fetched.getHeader("Location"));
+    assertEquals("Wrong redirect Location.", "http://example.com/",
+        fetched.getHeader("Location"));
+  }
+
+  /**
+   * NUTCH-2558 protocol-http cannot handle a missing HTTP status line
+   */
+  @Test
+  public void testNoStatusLine() throws Exception {
+    setUp();
+    String text = "This is a text containing non-ASCII characters: \u00e4\u00f6\u00fc\u00df";
+    launchServer(text);
+    Response fetched = fetchPage("/", 200);
+    assertEquals("Wrong text returned for response with no status line.", text,
+        new String(fetched.getContent(), StandardCharsets.UTF_8));
+    server.close();
+    text = "<!DOCTYPE html>\n<html>\n<head>\n"
+        + "<title>Testing no HTTP header èéâ</title>\n"
+        + "<meta charset=\"utf-8\">\n"
+        + "</head>\n<body>This is a text containing non-ASCII characters:"
+        + "\u00e4\u00f6\u00fc\u00df</body>\n</html";
+    launchServer(text);
+    fetched = fetchPage("/", 200);
+    assertEquals("Wrong text returned for response with no status line.", text,
+        new String(fetched.getContent(), StandardCharsets.UTF_8));
+  }
+
+  /**
+   * NUTCH-2560 protocol-http throws an error when an http header spans over
+   * multiple lines
+   */
+  @Test
+  public void testMultiLineHeader() throws Exception {
+    setUp();
+    launchServer(responseHeader
+        + "Set-Cookie: UserID=JohnDoe;\r\n  Max-Age=3600;\r\n  Version=1\r\n"
+        + simpleContent);
+    Response fetched = fetchPage("/", 200);
+    LOG.info("Headers: {}", fetched.getHeaders());
+    assertNotNull("Failed to set multi-line \"Set-Cookie\" header.", fetched.getHeader("Set-Cookie"));
+    assertTrue("Failed to set multi-line \"Set-Cookie\" header.",
+        fetched.getHeader("Set-Cookie").contains("Version=1"));
+  }
+
+  /**
+   * NUTCH-2561 protocol-http can be made to read arbitrarily large HTTP
+   * responses
+   */
+  @Test(expected = Exception.class)
+  public void testOverlongHeader() throws Exception {
+    setUp();
+    StringBuilder response = new StringBuilder();
+    response.append(responseHeader);
+    for (int i = 0; i < 80; i++) {
+      response.append("X-Custom-Header-");
+      for (int j = 0; j < 10000; j++) {
+        response.append('x');
+      }
+      response.append(": hello\r\n");
+    }
+    response.append("\r\n" + simpleContent);
+    launchServer(response.toString());
+    // should throw exception because of overlong header
+    fetchPage("/", 200);
+  }
+
+  /**
+   * NUTCH-2562 protocol-http fails to read large chunked HTTP responses,
+   * NUTCH-2575 protocol-http does not respect the maximum content-size for
+   * chunked responses
+   */
+  @Test
+  public void testChunkedContent() throws Exception {
+    setUp();
+    StringBuilder response = new StringBuilder();
+    response.append(responseHeader);
+    response.append("Content-Type: text/html\r\n");
+    response.append("Transfer-Encoding: chunked\r\n");
+    // 81920 bytes (80 chunks, 1024 bytes each)
+    // > 65536 (http.content.limit defined in nutch-site-test.xml)
+    for (int i = 0; i < 80; i++) {
+      response.append(String.format("\r\n400\r\n%02x\r\n", i));
+      for (int j = 0; j < 1012; j++) {
+        response.append('x');
+      }
+      response.append(String.format("\r\n%02x\r\n", i));
+      response.append("\r\n");
+    }
+    response.append("\r\n0\r\n\r\n");
+    launchServer(response.toString());
+    Response fetched = fetchPage("/", 200);
+    assertEquals(
+        "Chunked content not truncated according to http.content.limit", 65536,
+        fetched.getContent().length);
+  }
+
+}
diff --git a/src/plugin/protocol-okhttp/build.xml b/src/plugin/protocol-okhttp/build.xml
new file mode 100755
index 000000000..644eeb0ea
--- /dev/null
+++ b/src/plugin/protocol-okhttp/build.xml
@@ -0,0 +1,50 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-okhttp" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-http"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+    </fileset>
+    <pathelement location="${build.dir}/test/conf"/>
+  </path>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-http"/>
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+    <copy toDir="${build.test}">
+      <fileset dir="${src.test}" excludes="**/*.java"/>
+    </copy>
+  </target>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data" />
+  <copy todir="${build.test}/data">
+      <fileset dir="jsp"/>
+   </copy>
+
+</project>
diff --git a/src/plugin/protocol-okhttp/ivy.xml b/src/plugin/protocol-okhttp/ivy.xml
new file mode 100644
index 000000000..4c9035138
--- /dev/null
+++ b/src/plugin/protocol-okhttp/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="com.squareup.okhttp3" name="okhttp" rev="3.10.0"/>
+  </dependencies>
+  
+</ivy-module>
diff --git a/src/plugin/protocol-okhttp/jsp/basic-http.jsp b/src/plugin/protocol-okhttp/jsp/basic-http.jsp
new file mode 100644
index 000000000..bf1f8bd30
--- /dev/null
+++ b/src/plugin/protocol-okhttp/jsp/basic-http.jsp
@@ -0,0 +1,44 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  Example JSP Page to Test Protocol-Http Plugin  
+--%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
+String path = request.getContextPath();
+String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
+%>
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+  <head>
+    <base href="<%=basePath%>">
+    
+    <title>HelloWorld</title>
+    <meta http-equiv="content-type" content="text/html;charset=utf-8" />
+    <meta name="Language" content="en" />
+       <meta http-equiv="pragma" content="no-cache">
+       <meta http-equiv="cache-control" content="no-cache">
+       <meta http-equiv="expires" content="0">    
+       <meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
+       <meta http-equiv="description" content="This is my page">
+       <!--
+       <link rel="stylesheet" type="text/css" href="styles.css">
+       -->
+  </head>
+  
+  <body>
+    Hello World!!! <br>
+  </body>
+</html>
diff --git a/src/plugin/protocol-okhttp/jsp/brokenpage.jsp b/src/plugin/protocol-okhttp/jsp/brokenpage.jsp
new file mode 100644
index 000000000..f3f7c4aba
--- /dev/null
+++ b/src/plugin/protocol-okhttp/jsp/brokenpage.jsp
@@ -0,0 +1,47 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  Example JSP Page to Test Protocol-Http Plugin
+--%>
+
+@ page language="java" import="java.util.*" pageEncoding="UTF-8"
+
+String path = request.getContextPath();
+String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
+
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+  <head>
+    <base href="<%=basePath%>">
+    
+    <title>HelloWorld</title>
+    <meta http-equiv="content-type" content="text/html;charset=utf-8" />
+    <meta name="Language" content="en" />
+       <meta http-equiv="pragma" content="no-cache">
+       <meta http-equiv="cache-control" content="no-cache">
+       <meta http-equiv="expires" content="0">    
+       <meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
+       <meta http-equiv="description" content="This is my page">
+       <!--
+       <link rel="stylesheet" type="text/css" href="styles.css">
+       -->
+  </head>
+  
+  <body>
+    Hello World!!! <br>
+  </body>
+</html>
diff --git a/src/plugin/protocol-okhttp/jsp/redirect301.jsp b/src/plugin/protocol-okhttp/jsp/redirect301.jsp
new file mode 100644
index 000000000..1100b891e
--- /dev/null
+++ b/src/plugin/protocol-okhttp/jsp/redirect301.jsp
@@ -0,0 +1,49 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  Example JSP Page to Test Protocol-Http Plugin
+--%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
+String path = request.getContextPath();
+String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
+%>
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+  <head>
+    <base href="<%=basePath%>">
+    
+    <title>My JSP page</title>
+    
+       <meta http-equiv="pragma" content="no-cache">
+       <meta http-equiv="cache-control" content="no-cache">
+       <meta http-equiv="expires" content="0">    
+       <meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
+       <meta http-equiv="description" content="This is my page">
+       <!--
+       <link rel="stylesheet" type="text/css" href="styles.css">
+       -->
+
+  </head>
+  
+  <body>
+       <%
+       response.setStatus(301);
+       response.setHeader( "Location", "http://nutch.apache.org");
+       response.setHeader( "Connection", "close" );
+               %> 
+    You are redirected by JSP<br>
+  </body>
+</html>
diff --git a/src/plugin/protocol-okhttp/jsp/redirect302.jsp b/src/plugin/protocol-okhttp/jsp/redirect302.jsp
new file mode 100644
index 000000000..8a250d9aa
--- /dev/null
+++ b/src/plugin/protocol-okhttp/jsp/redirect302.jsp
@@ -0,0 +1,49 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  Example JSP Page to Test Protocol-Http Plugin 
+--%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
+String path = request.getContextPath();
+String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
+%>
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+  <head>
+    <base href="<%=basePath%>">
+    
+    <title>My JSP page</title>
+    
+       <meta http-equiv="pragma" content="no-cache">
+       <meta http-equiv="cache-control" content="no-cache">
+       <meta http-equiv="expires" content="0">    
+       <meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
+       <meta http-equiv="description" content="This is my page">
+       <!--
+       <link rel="stylesheet" type="text/css" href="styles.css">
+       -->
+
+  </head>
+  
+  <body>
+       <%
+       response.setStatus(302);
+       response.setHeader( "Location", "http://nutch.apache.org");
+       response.setHeader( "Connection", "close" );
+               %> 
+    You are sucessfully redirected by JSP<br>
+  </body>
+</html>
diff --git a/src/plugin/protocol-okhttp/plugin.xml b/src/plugin/protocol-okhttp/plugin.xml
new file mode 100755
index 000000000..0152fb057
--- /dev/null
+++ b/src/plugin/protocol-okhttp/plugin.xml
@@ -0,0 +1,53 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="protocol-okhttp"
+   name="OKHttp Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="protocol-okhttp.jar">
+         <export name="*"/>
+      </library>
+      <library name="okhttp-3.10.0.jar"/>
+      <library name="okio-1.14.0.jar"/>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-http"/>
+   </requires>
+
+   <extension id="org.apache.nutch.protocol.okhttp"
+              name="OkHttpProtocol"
+              point="org.apache.nutch.protocol.Protocol">
+
+      <implementation id="org.apache.nutch.protocol.okhttp.OkHttp"
+                      class="org.apache.nutch.protocol.okhttp.OkHttp">
+        <parameter name="protocolName" value="http"/>
+      </implementation>
+
+      <implementation id="org.apache.nutch.protocol.okhttp.OkHttp"
+                       class="org.apache.nutch.protocol.okhttp.OkHttp">
+           <parameter name="protocolName" value="https"/>
+      </implementation>
+
+   </extension>
+
+</plugin>
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
new file mode 100755
index 000000000..9206f81fc
--- /dev/null
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
@@ -0,0 +1,248 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.okhttp;
+
+import java.lang.invoke.MethodHandles;
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.InetSocketAddress;
+import java.net.Proxy;
+import java.net.ProxySelector;
+import java.net.SocketAddress;
+import java.net.URI;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Base64;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Locale;
+import java.util.concurrent.TimeUnit;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.util.NutchConfiguration;
+
+import okhttp3.Connection;
+import okhttp3.Headers;
+import okhttp3.Interceptor;
+import okhttp3.OkHttpClient;
+import okhttp3.Request;
+
+public class OkHttp extends HttpBase {
+
+  protected static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  private final List<String[]> customRequestHeaders = new LinkedList<>();
+
+  private OkHttpClient client;
+
+  public OkHttp() {
+    super(LOG);
+  }
+
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+
+    // protocols in order of preference
+    List<okhttp3.Protocol> protocols = new ArrayList<>();
+    if (useHttp2) {
+      protocols.add(okhttp3.Protocol.HTTP_2);
+    }
+    protocols.add(okhttp3.Protocol.HTTP_1_1);
+
+    okhttp3.OkHttpClient.Builder builder = new OkHttpClient.Builder()
+        .protocols(protocols) //
+        .retryOnConnectionFailure(true) //
+        .followRedirects(false) //
+        .connectTimeout(timeout, TimeUnit.MILLISECONDS)
+        .writeTimeout(timeout, TimeUnit.MILLISECONDS)
+        .readTimeout(timeout, TimeUnit.MILLISECONDS);
+
+    if (!accept.isEmpty()) {
+      getCustomRequestHeaders().add(new String[] { "Accept", accept });
+    }
+
+    if (!acceptLanguage.isEmpty()) {
+      getCustomRequestHeaders()
+          .add(new String[] { "Accept-Language", acceptLanguage });
+    }
+
+    if (!acceptCharset.isEmpty()) {
+      getCustomRequestHeaders()
+          .add(new String[] { "Accept-Charset", acceptCharset });
+    }
+
+    if (useProxy) {
+      ProxySelector selector = new ProxySelector() {
+        @SuppressWarnings("serial")
+        private final List<Proxy> noProxy = new ArrayList<Proxy>() {
+          {
+            add(Proxy.NO_PROXY);
+          }
+        };
+        @SuppressWarnings("serial")
+        private final List<Proxy> proxy = new ArrayList<Proxy>() {
+          {
+            add(new Proxy(proxyType,
+                new InetSocketAddress(proxyHost, proxyPort)));
+          }
+        };
+        @Override
+        public List<Proxy> select(URI uri) {
+          if (useProxy(uri)) {
+            return proxy;
+          }
+          return noProxy;
+        }
+        @Override
+        public void connectFailed(URI uri, SocketAddress sa, IOException ioe) {
+          LOG.error("Connection to proxy failed for {}: {}", uri, ioe);
+        }
+      };
+      builder.proxySelector(selector);
+    }
+
+    if (storeIPAddress || storeHttpHeaders || storeHttpRequest) {
+        builder.addNetworkInterceptor(new HTTPHeadersInterceptor());
+    }
+
+    client = builder.build();
+  }
+
+  class HTTPHeadersInterceptor implements Interceptor {
+
+    @Override
+    public okhttp3.Response intercept(Interceptor.Chain chain)
+        throws IOException {
+
+      Connection connection = chain.connection();
+      String ipAddress = null;
+      if (storeIPAddress) {
+        InetAddress address = connection.socket().getInetAddress();
+        ipAddress = address.getHostAddress();
+      }
+
+      Request request = chain.request();
+      okhttp3.Response response = chain.proceed(request);
+      String httpProtocol = response.protocol().toString()
+          .toUpperCase(Locale.ROOT);
+      if (useHttp2 && "H2".equals(httpProtocol)) {
+        // back-warc compatible protocol name
+        httpProtocol = "HTTP/2";
+      }
+
+      StringBuilder resquestverbatim = null;
+      StringBuilder responseverbatim = null;
+
+      if (storeHttpRequest) {
+        resquestverbatim = new StringBuilder();
+
+        resquestverbatim.append(request.method()).append(' ');
+        resquestverbatim.append(request.url().encodedPath());
+        String query = request.url().encodedQuery();
+        if (query != null) {
+          resquestverbatim.append('?').append(query);
+        }
+        resquestverbatim.append(' ').append(httpProtocol).append("\r\n");
+
+        Headers headers = request.headers();
+
+        for (int i = 0, size = headers.size(); i < size; i++) {
+          String key = headers.name(i);
+          String value = headers.value(i);
+          resquestverbatim.append(key).append(": ").append(value)
+              .append("\r\n");
+        }
+
+        resquestverbatim.append("\r\n");
+      }
+
+      if (storeHttpHeaders) {
+        responseverbatim = new StringBuilder();
+
+        responseverbatim.append(httpProtocol).append(' ')
+            .append(response.code());
+        if (!response.message().isEmpty()) {
+          responseverbatim.append(' ').append(response.message());
+        }
+        responseverbatim.append("\r\n");
+
+        Headers headers = response.headers();
+
+        for (int i = 0, size = headers.size(); i < size; i++) {
+          String key = headers.name(i);
+          String value = headers.value(i);
+          responseverbatim.append(key).append(": ").append(value)
+              .append("\r\n");
+        }
+
+        responseverbatim.append("\r\n");
+      }
+
+      okhttp3.Response.Builder builder = response.newBuilder();
+
+      if (ipAddress != null) {
+        builder = builder.header(Response.IP_ADDRESS, ipAddress);
+      }
+
+      if (resquestverbatim != null) {
+        byte[] encodedBytesRequest = Base64.getEncoder()
+            .encode(resquestverbatim.toString().getBytes());
+        builder = builder.header(Response.REQUEST,
+            new String(encodedBytesRequest));
+      }
+
+      if (responseverbatim != null) {
+        byte[] encodedBytesResponse = Base64.getEncoder()
+            .encode(responseverbatim.toString().getBytes());
+        builder = builder.header(Response.RESPONSE_HEADERS,
+            new String(encodedBytesResponse));
+      }
+
+      // returns a modified version of the response
+      return builder.build();
+    }
+  }
+
+  protected List<String[]> getCustomRequestHeaders() {
+    return customRequestHeaders;
+  }
+
+  protected OkHttpClient getClient() {
+    return client;
+  }
+
+  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+      throws ProtocolException, IOException {
+    return new OkHttpResponse(this, url, datum);
+  }
+
+  public static void main(String[] args) throws Exception {
+    OkHttp okhttp = new OkHttp();
+    okhttp.setConf(NutchConfiguration.create());
+    main(okhttp, args);
+  }
+
+}
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
new file mode 100644
index 000000000..2278928b0
--- /dev/null
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
@@ -0,0 +1,182 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.okhttp;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.net.URL;
+import java.util.Base64;
+
+import org.apache.commons.lang.mutable.MutableBoolean;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import okhttp3.Request;
+import okhttp3.ResponseBody;
+import okio.BufferedSource;
+
+public class OkHttpResponse implements Response {
+
+  protected static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  private URL url;
+  private byte[] content;
+  private int code;
+  private Metadata headers = new Metadata();
+
+  public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum)
+      throws ProtocolException, IOException {
+
+    this.url = url;
+
+    Request.Builder rb = new Request.Builder().url(url);
+
+    rb.header(USER_AGENT, okhttp.getUserAgent());
+    okhttp.getCustomRequestHeaders().forEach((k) -> {
+        rb.header(k[0], k[1]);
+    });
+
+    if (okhttp.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
+      rb.header(IF_MODIFIED_SINCE,
+          HttpDateFormat.toString(datum.getModifiedTime()));
+    }
+
+    Request request = rb.build();
+    okhttp3.Call call = okhttp.getClient().newCall(request);
+
+    try (okhttp3.Response response = call.execute()) {
+
+      Metadata responsemetadata = new Metadata();
+      okhttp3.Headers httpHeaders = response.headers();
+
+      for (int i = 0, size = httpHeaders.size(); i < size; i++) {
+        String key = httpHeaders.name(i);
+        String value = httpHeaders.value(i);
+
+        if (key.equals(REQUEST)
+            || key.equals(RESPONSE_HEADERS)) {
+          value = new String(Base64.getDecoder().decode(value));
+        }
+
+        responsemetadata.add(key, value);
+      }
+      LOG.debug("{} - {} {} {}", url, response.protocol(), response.code(),
+          response.message());
+
+      MutableBoolean trimmed = new MutableBoolean();
+      content = toByteArray(response.body(), trimmed, okhttp.getMaxContent(),
+          okhttp.getTimeout());
+      responsemetadata.add(FETCH_TIME,
+          Long.toString(System.currentTimeMillis()));
+      if (trimmed.booleanValue()) {
+        if (!call.isCanceled()) {
+          call.cancel();
+        }
+        responsemetadata.set(TRIMMED_CONTENT, "true");
+        LOG.debug("HTTP content trimmed to {} bytes", content.length);
+      }
+
+      code = response.code();
+      headers = responsemetadata;
+
+    } catch (IOException e) {
+      LOG.warn("Fetch of URL {} failed: {}", url, e);
+    }
+
+  }
+
+  private final byte[] toByteArray(final ResponseBody responseBody,
+      MutableBoolean trimmed, int maxContent, int timeout) throws IOException {
+
+    if (responseBody == null) {
+      return new byte[] {};
+    }
+
+    long endDueFor = -1;
+    if (timeout != -1) {
+      endDueFor = System.currentTimeMillis() + timeout;
+    }
+
+    int maxContentBytes = Integer.MAX_VALUE;
+    if (maxContent != -1) {
+      maxContentBytes = Math.min(maxContentBytes, maxContent);
+    }
+
+    BufferedSource source = responseBody.source();
+    int contentBytesBuffered = 0;
+    int contentBytesRequested = 0;
+    int bufferGrowStepBytes = 8192;
+    while (contentBytesBuffered < maxContentBytes) {
+      contentBytesRequested += Math.min(bufferGrowStepBytes,
+          (maxContentBytes - contentBytesBuffered));
+      boolean success = source.request(contentBytesRequested);
+      contentBytesBuffered = (int) source.buffer().size();
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("total bytes requested = {}, buffered = {}",
+            contentBytesRequested, contentBytesBuffered);
+      }
+      if (!success) {
+        LOG.debug("source exhausted, no more data to read");
+        break;
+      }
+      if (endDueFor != -1 && endDueFor <= System.currentTimeMillis()) {
+        LOG.debug("timeout reached");
+        trimmed.setValue(true);
+        break;
+      }
+      if (contentBytesBuffered > maxContentBytes) {
+        LOG.debug("content limit reached");
+        trimmed.setValue(true);
+      }
+    }
+    if (maxContent != -1 && contentBytesBuffered > maxContent) {
+      // okhttp's internal buffer is larger than maxContent
+      trimmed.setValue(true);
+      contentBytesBuffered = maxContentBytes;
+    }
+    byte[] arr = new byte[contentBytesBuffered];
+    source.buffer().read(arr);
+    return arr;
+  }
+
+  public URL getUrl() {
+    return url;
+  }
+
+  public int getCode() {
+    return code;
+  }
+
+  public String getHeader(String name) {
+    return headers.get(name);
+  }
+
+  public Metadata getHeaders() {
+    return headers;
+  }
+
+  public byte[] getContent() {
+    return content;
+  }
+
+}
diff --git 
a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/package-info.java
 
b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/package-info.java
new file mode 100644
index 000000000..7bdf14a75
--- /dev/null
+++ 
b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/package-info.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Protocol plugin based on <a 
href="https://github.com/square/okhttp">okhttp</a>, supports http, https, 
http/2.
+ */
+package org.apache.nutch.protocol.okhttp;
diff --git a/src/plugin/protocol-okhttp/src/test/conf/nutch-site-test.xml 
b/src/plugin/protocol-okhttp/src/test/conf/nutch-site-test.xml
new file mode 100644
index 000000000..dd74c3284
--- /dev/null
+++ b/src/plugin/protocol-okhttp/src/test/conf/nutch-site-test.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<configuration>
+
+<property>
+  <name>http.agent.name</name>
+  <value>Nutch-Test</value>
+</property>
+
+<property>
+  <name>http.timeout</name>
+  <value>60000</value>
+</property>
+
+<property>
+  <name>store.http.headers</name>
+  <value>true</value>
+</property>
+
+</configuration>
\ No newline at end of file
diff --git 
a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java
 
b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java
new file mode 100644
index 000000000..d276f1c12
--- /dev/null
+++ 
b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.okhttp;
+
+import static org.junit.Assert.assertEquals;
+
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.okhttp.OkHttp;
+import org.junit.After;
+import org.junit.Test;
+import org.mortbay.jetty.Server;
+import org.mortbay.jetty.nio.SelectChannelConnector;
+import org.mortbay.jetty.servlet.Context;
+import org.mortbay.jetty.servlet.ServletHolder;
+
+/**
+ * Test cases for protocol-okhttp
+ */
+public class TestProtocolOkHttp {
+  private static final String RES_DIR = System.getProperty("test.data", ".");
+
+  private OkHttp http;
+  private Server server;
+  private Context root;
+  private Configuration conf;
+  private int port;
+
+  public void setUp(boolean redirection) throws Exception {
+    conf = new Configuration();
+    conf.addResource("nutch-default.xml");
+    conf.addResource("nutch-site-test.xml");
+
+    http = new OkHttp();
+    http.setConf(conf);
+
+    server = new Server();
+
+    if (redirection) {
+      root = new Context(server, "/redirection", Context.SESSIONS);
+      root.setAttribute("newContextURL", "/redirect");
+    } else {
+      root = new Context(server, "/", Context.SESSIONS);
+    }
+
+    ServletHolder sh = new ServletHolder(
+        org.apache.jasper.servlet.JspServlet.class);
+    root.addServlet(sh, "*.jsp");
+    root.setResourceBase(RES_DIR);
+  }
+
+  @After
+  public void tearDown() throws Exception {
+    server.stop();
+  }
+
+  @Test
+  public void testStatusCode() throws Exception {
+    startServer(47504, false);
+    fetchPage("/basic-http.jsp", 200);
+    fetchPage("/redirect301.jsp", 301);
+    fetchPage("/redirect302.jsp", 302);
+    fetchPage("/nonexists.html", 404);
+    fetchPage("/brokenpage.jsp", 500);
+  }
+
+  @Test
+  public void testRedirectionJetty() throws Exception {
+    // Redirection via Jetty
+    startServer(47503, true);
+    fetchPage("/redirection", 302);
+  }
+
+  /**
+   * Starts the Jetty server at a specified port and redirection parameter.
+   * 
+   * @param portno
+   *          Port number.
+   * @param redirection
+   *          whether redirection
+   */
+  private void startServer(int portno, boolean redirection) throws Exception {
+    port = portno;
+    setUp(redirection);
+    SelectChannelConnector connector = new SelectChannelConnector();
+    connector.setHost("127.0.0.1");
+    connector.setPort(port);
+
+    server.addConnector(connector);
+    server.start();
+  }
+
+  /**
+   * Fetches the specified <code>page</code> from the local Jetty server and
+   * checks whether the HTTP response status code matches with the expected
+   * code. Also use jsp pages for redirection.
+   * 
+   * @param page
+   *          Page to be fetched.
+   * @param expectedCode
+   *          HTTP response status code expected while fetching the page.
+   */
+  private void fetchPage(String page, int expectedCode) throws Exception {
+    URL url = new URL("http", "127.0.0.1", port, page);
+    CrawlDatum crawlDatum = new CrawlDatum();
+    Response response = http.getResponse(url, crawlDatum, true);
+    ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()),
+        crawlDatum);
+    Content content = out.getContent();
+    assertEquals("HTTP Status Code for " + url, expectedCode,
+        response.getCode());
+
+    if (page.compareTo("/nonexists.html") != 0
+        && page.compareTo("/brokenpage.jsp") != 0
+        && page.compareTo("/redirection") != 0) {
+      assertEquals("ContentType " + url, "text/html",
+          content.getContentType());
+    }
+  }
+}
diff --git 
a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
 
b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index b6033aef4..d2cd0cec6 100644
--- 
a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ 
b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -132,21 +132,25 @@ public String normalize(String urlString, String scope)
         changed = true;
       }
 
-      if (file == null || "".equals(file)) { // add a slash
+      if (file == null || "".equals(file)) {
         file = "/";
         changed = true;
+      } else if (!file.startsWith("/")) {
+        file = "/" + file;
+        changed = true;
+      } else {
+        // check for unnecessary use of "/../", "/./", and "//"
+        String file2 = getFileWithNormalizedPath(url);
+        if (!file.equals(file2)) {
+          changed = true;
+          file = file2;
+        }
       }
 
       if (url.getRef() != null) { // remove the ref
         changed = true;
       }
 
-      // check for unnecessary use of "/../", "/./", and "//"
-      String file2 = getFileWithNormalizedPath(url);
-      if (!file.equals(file2)) {
-        changed = true;
-        file = file2;
-      }
     }
 
     // properly encode characters in path/file using percent-encoding
@@ -191,6 +195,8 @@ private String getFileWithNormalizedPath(URL url)
     // if path is empty return a single slash
     if (file.isEmpty()) {
       file = "/";
+    } else if (!file.startsWith("/")) {
+      file = "/" + file;
     }
 
     return file;
diff --git 
a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
 
b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index 5cefbf384..a656a7aa4 100644
--- 
a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ 
b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -170,6 +170,8 @@ public void testNormalizer() throws Exception {
     normalizeTest("http:", "http:/");
     normalizeTest("http:////", "http:/");
     normalizeTest("http:///////", "http:/");
+    // NUTCH-2555 path must start with '/'
+    normalizeTest("http://example.com?a=1", "http://example.com/?a=1");
   }
   
   @Test


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> protocol-http does not behave the same as browsers
> --------------------------------------------------
>
>                 Key: NUTCH-2549
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2549
>             Project: Nutch
>          Issue Type: Bug
>    Affects Versions: 1.14
>            Reporter: Gerard Bouchar
>            Assignee: Sebastian Nagel
>            Priority: Major
>             Fix For: 1.15
>
>         Attachments: NUTCH-2549.patch
>
>
> We identified the following issues in protocol-http (a plugin implementing 
> the HTTP protocol):
>  * It fails if a URL's path does not start with '/'
>  ** Example: [http://news.fx678.com?171|http://news.fx678.com/?171] (browsers 
> correctly rewrite the URL as [http://news.fx678.com/?171], while Nutch tries 
> to send an invalid HTTP request starting with *GET ?171 HTTP/1.0*).
>  * It advertises its requests as being HTTP/1.0, but sends an 
> _Accept-Encoding_ request header, which is defined only in HTTP/1.1. This 
> confuses some web servers.
>  ** Example: 
> [http://www.hansamanuals.com/main/english/none/theconf___987/manuals/version___82/hwconvindex.htm]
>  * If a server sends a redirection (3XX status code, with a Location header), 
> protocol-http tries to parse the HTTP response body anyway. Thus, if an error 
> occurs while decoding the body, the redirection is not followed and the 
> information is lost. Browsers follow the redirection and close the socket 
> as soon as they can.
>  ** Example: [http://www.webarcelona.net/es/blog?page=2]
>  * Some servers invalidly send an HTTP body directly without a status line or 
> headers. Browsers handle that, protocol-http doesn't:
>  ** Example: [https://app.unitymedia.de/]
>  * Some servers invalidly add colons after the HTTP status code in the status 
> line (they can send _HTTP/1.1 404: Not found_ instead of _HTTP/1.1 404 Not 
> found_ for instance). Browsers can handle that.
>  * Some servers invalidly send headers that span over multiple lines. In that 
> case, browsers simply ignore the subsequent lines, but protocol-http throws 
> an error, thus preventing us from fetching the contents of the page.
>  * There is no limit on the size of the HTTP headers it reads. A bogus 
> server could send an infinite stream of different HTTP headers and cause the 
> fetcher to go out of memory, or send the same HTTP header repeatedly and 
> cause the fetcher to time out.
>  * The same goes for the HTTP status line: no check is made concerning its 
> size.
>  * While reading chunked content, if the content size becomes larger than 
> {color:#9876aa}http{color}.getMaxContent(), instead of just stopping, it 
> tries to read a new chunk before having read the previous one completely, 
> resulting in a '{color:#333333}bad chunk length' error.{color}
> {color:#333333}Additionally (and that concerns protocol-httpclient as well), 
> when reading HTTP headers, for each header, the SpellCheckedMetadata class 
> computes a Levenshtein distance between it and every known header in the 
> HttpHeaders interface. Not only is that slow, non-standard, and inconsistent 
> with browsers' behavior, but it also causes bugs and prevents us from accessing 
> the real headers sent by the HTTP server.{color}
>  * {color:#333333}Example: [http://www.taz.de/!443358/] . The server sends a 
> *Client-Transfer-Encoding: chunked* header, but SpellCheckedMetadata corrects 
> it to *Transfer-Encoding: chunked*. Then, HttpResponse (in protocol-http) 
> tries to read the HTTP body as chunked, whereas it is not.{color}
>  



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to