[ 
https://issues.apache.org/jira/browse/NUTCH-2576?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16509766#comment-16509766
 ] 

ASF GitHub Bot commented on NUTCH-2576:
---------------------------------------

sebastian-nagel closed pull request #328: NUTCH-2576 HTTP protocol 
implementation based on okhttp
URL: https://github.com/apache/nutch/pull/328
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/build.xml b/build.xml
index 1d680d0bd..d4836a4f2 100644
--- a/build.xml
+++ b/build.xml
@@ -215,6 +215,7 @@
       <packageset dir="${plugins.dir}/protocol-http/src/java"/>
       <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
       <packageset dir="${plugins.dir}/protocol-interactiveselenium/src/java"/>
+      <packageset dir="${plugins.dir}/protocol-okhttp/src/java"/>
       <packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
       <packageset dir="${plugins.dir}/publish-rabbitmq/src/java"/>
       <packageset dir="${plugins.dir}/scoring-depth/src/java"/>
@@ -673,6 +674,7 @@
       <packageset dir="${plugins.dir}/protocol-http/src/java"/>
       <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
       <packageset dir="${plugins.dir}/protocol-interactiveselenium/src/java"/>
+      <packageset dir="${plugins.dir}/protocol-okhttp/src/java"/>
       <packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
       <packageset dir="${plugins.dir}/publish-rabbitmq/src/java"/>
       <packageset dir="${plugins.dir}/scoring-depth/src/java"/>
@@ -1107,6 +1109,8 @@
         <source path="${plugins.dir}/protocol-httpclient/src/java/" />
         <source path="${plugins.dir}/protocol-httpclient/src/test/" />
         <source path="${plugins.dir}/protocol-interactiveselenium/src/java/" />
+        <source path="${plugins.dir}/protocol-okhttp/src/java/" />
+        <source path="${plugins.dir}/protocol-okhttp/src/test/" />
         <source path="${plugins.dir}/protocol-selenium/src/java"/>
         <source path="${plugins.dir}/publish-rabbitmq/src/java"/>
         <source path="${plugins.dir}/scoring-depth/src/java/" />
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 37f73b8cd..fcedc6df0 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -277,6 +277,15 @@
   </description>
 </property>
 
+<property>
+  <name>http.proxy.type</name>
+  <value>HTTP</value>
+  <description>
+    Proxy type: HTTP or SOCKS (cf. java.net.Proxy.Type).
+    Note: supported by protocol-okhttp.
+  </description>
+</property>
+
 <property>
   <name>http.proxy.exception.list</name>
   <value></value>
@@ -307,6 +316,19 @@
   </description>
 </property>
 
+<property>
+  <name>http.useHttp2</name>
+  <value>false</value>
+  <description>
+    If true, try HTTP/2 and fall back to HTTP/1.1 if HTTP/2 is not
+    supported; if false, always use HTTP/1.1.
+
+    NOTE: HTTP/2 is currently only supported by protocol-okhttp and
+    requires at runtime Java 9 or a modified Java 8 with support for
+    ALPN (Application Layer Protocol Negotiation).
+  </description>
+</property>
+
 <property>
   <name>http.accept.language</name>
   <value>en-us,en-gb,en;q=0.7,*;q=0.3</value>
diff --git a/src/java/org/apache/nutch/metadata/HttpHeaders.java b/src/java/org/apache/nutch/metadata/HttpHeaders.java
index 71a66f66c..a3aec1dbb 100644
--- a/src/java/org/apache/nutch/metadata/HttpHeaders.java
+++ b/src/java/org/apache/nutch/metadata/HttpHeaders.java
@@ -48,4 +48,8 @@
 
   public static final String LOCATION = "Location";
 
+  public static final String IF_MODIFIED_SINCE = "If-Modified-Since";
+
+  public static final String USER_AGENT = "User-Agent";
+
 }
diff --git a/src/java/org/apache/nutch/net/protocols/Response.java b/src/java/org/apache/nutch/net/protocols/Response.java
index c9139bd6c..7096c934d 100644
--- a/src/java/org/apache/nutch/net/protocols/Response.java
+++ b/src/java/org/apache/nutch/net/protocols/Response.java
@@ -26,6 +26,32 @@
  */
 public interface Response extends HttpHeaders {
 
+  /** Key to hold the HTTP request if <code>store.http.request</code> is true */
+  public static final String REQUEST = "_request_";
+
+  /**
+   * Key to hold the HTTP response header if <code>store.http.headers</code> is
+   * true
+   */
+  public static final String RESPONSE_HEADERS = "_response.headers_";
+
+  /**
+   * Key to hold the IP address the request is sent to if
+   * <code>store.ip.address</code> is true
+   */
+  public static final String IP_ADDRESS = "_ip_";
+
+  /**
+   * Key to hold the time when the page has been fetched
+   */
+  public static final String FETCH_TIME = "nutch.fetch.time";
+
+  /**
+   * Key to hold boolean whether content has been trimmed because it exceeds
+   * <code>http.content.limit</code>
+   */
+  public static final String TRIMMED_CONTENT = "http.content.trimmed";
+
   /** Returns the URL used to retrieve this response. */
   public URL getUrl();
 
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 5a3a8c910..a9cb912cc 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -73,6 +73,7 @@
     <ant dir="protocol-http" target="deploy"/>
     <ant dir="protocol-httpclient" target="deploy"/>
     <ant dir="protocol-interactiveselenium" target="deploy" />
+    <ant dir="protocol-okhttp" target="deploy"/>
     <ant dir="protocol-selenium" target="deploy" />
     <ant dir="publish-rabbitmq" target="deploy"/>
     <ant dir="scoring-depth" target="deploy"/>
@@ -132,6 +133,7 @@
      <ant dir="protocol-file" target="test"/>
      <ant dir="protocol-http" target="test"/>
      <ant dir="protocol-httpclient" target="test"/>
+     <ant dir="protocol-okhttp" target="test"/>
      <ant dir="scoring-orphan" target="test"/>
      <ant dir="subcollection" target="test"/>
      <ant dir="urlfilter-automaton" target="test"/>
@@ -209,6 +211,7 @@
     <ant dir="protocol-http" target="clean"/>
     <ant dir="protocol-httpclient" target="clean"/>
     <ant dir="protocol-interactiveselenium" target="clean" />
+    <ant dir="protocol-okhttp" target="clean"/>
     <ant dir="protocol-selenium" target="clean" />
     <ant dir="publish-rabbitmq" target="clean"/>
     <ant dir="scoring-depth" target="clean"/>
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index d9284c9aa..1cb2bb151 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -20,6 +20,8 @@
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.Reader;
+import java.net.Proxy;
+import java.net.URI;
 import java.net.URL;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -70,8 +72,11 @@
   /** The proxy port. */
   protected int proxyPort = 8080;
   
+  /** The proxy type. */
+  protected Proxy.Type proxyType = Proxy.Type.HTTP;
+
   /** The proxy exception list. */
-  protected HashMap proxyException = new HashMap(); 
+  protected HashMap<String,String> proxyException = new HashMap<>();
 
   /** Indicates if a proxy is used */
   protected boolean useProxy = false;
@@ -89,7 +94,7 @@
   /** The "Accept-Language" request header value. */
   protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3";
 
-  /** The "Accept-Language" request header value. */
+  /** The "Accept-Charset" request header value. */
   protected String acceptCharset = "utf-8,iso-8859-1;q=0.7,*;q=0.7";
 
   /** The "Accept" request header value. */
@@ -108,12 +113,33 @@
   /** Do we use HTTP/1.1? */
   protected boolean useHttp11 = false;
 
+  /** Whether to use HTTP/2 */
+  protected boolean useHttp2 = false;
+
   /**
    * Record response time in CrawlDatum's meta data, see property
    * http.store.responsetime.
    */
   protected boolean responseTime = true;
 
+  /**
+   * Record the IP address of the responding server, see property
+   * <code>store.ip.address</code>.
+   */
+  protected boolean storeIPAddress = false;
+
+  /**
+   * Record the HTTP request in the metadata, see property
+   * <code>store.http.request</code>.
+   */
+  protected boolean storeHttpRequest = false;
+
+  /**
+   * Record the HTTP response header in the metadata, see property
+   * <code>store.http.headers</code>.
+   */
+  protected boolean storeHttpHeaders = false;
+
   /** Skip page if Crawl-Delay longer than this value. */
   protected long maxCrawlDelay = -1L;
 
@@ -147,6 +173,7 @@ public void setConf(Configuration conf) {
     this.conf = conf;
     this.proxyHost = conf.get("http.proxy.host");
     this.proxyPort = conf.getInt("http.proxy.port", 8080);
+    this.proxyType = Proxy.Type.valueOf(conf.get("http.proxy.type", "HTTP"));
     this.proxyException = arrayToMap(conf.getStrings("http.proxy.exception.list"));
     this.useProxy = (proxyHost != null && proxyHost.length() > 0);
     this.timeout = conf.getInt("http.timeout", 10000);
@@ -160,7 +187,11 @@ public void setConf(Configuration conf) {
     this.accept = conf.get("http.accept", accept).trim();
     // backward-compatible default setting
     this.useHttp11 = conf.getBoolean("http.useHttp11", false);
+    this.useHttp2 = conf.getBoolean("http.useHttp2", false);
     this.responseTime = conf.getBoolean("http.store.responsetime", true);
+    this.storeIPAddress = conf.getBoolean("store.ip.address", false);
+    this.storeHttpRequest = conf.getBoolean("store.http.request", false);
+    this.storeHttpHeaders = conf.getBoolean("store.http.headers", false);
     this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true);
     this.enableCookieHeader = conf.getBoolean("http.enable.cookie.header", true);
     this.robots.setConf(conf);
@@ -360,9 +391,15 @@ public int getProxyPort() {
   }
 
   public boolean useProxy(URL url) {
-    if (!useProxy){
-      return false;
-    } else if (proxyException.get(url.getHost())!=null){
+    return useProxy(url.getHost());
+  }
+
+  public boolean useProxy(URI uri) {
+    return useProxy(uri.getHost());
+  }
+
+  public boolean useProxy(String host) {
+    if (useProxy && proxyException.containsKey(host)) {
       return false;
     }
     return useProxy;
@@ -380,13 +417,26 @@ public boolean isCookieEnabled() {
     return enableCookieHeader;
   }
 
+  public boolean isStoreIPAddress() {
+    return storeIPAddress;
+  }
+
+  public boolean isStoreHttpRequest() {
+    return storeHttpRequest;
+  }
+
+  public boolean isStoreHttpHeaders() {
+    return storeHttpHeaders;
+  }
+
   public int getMaxContent() {
     return maxContent;
   }
 
   public String getUserAgent() {
-    if (userAgentNames!=null) {
-      return userAgentNames.get(ThreadLocalRandom.current().nextInt(userAgentNames.size()));
+    if (userAgentNames != null) {
+      return userAgentNames
+          .get(ThreadLocalRandom.current().nextInt(userAgentNames.size()));
     }
     return userAgent;
   }
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
index 8b1a03183..19c00fde5 100644
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -205,7 +205,7 @@ public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
 
       // store the request in the metadata?
       if (conf.getBoolean("store.http.request", false) == true) {
-        headers.add("_request_", reqStr.toString());
+        headers.add(Response.REQUEST, reqStr.toString());
       }
 
       byte[] reqBytes = reqStr.toString().getBytes();
@@ -263,7 +263,7 @@ public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
             // store the headers verbatim only if the response was not compressed
             // as the content length reported with not match otherwise
             if (httpHeaders != null) {
-              headers.add("_response.headers_", httpHeaders.toString());
+              headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
             }
             if (Http.LOG.isTraceEnabled()) {
               Http.LOG.trace("fetched " + content.length + " bytes from " + url);
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
index 56ae789f3..c0a3c01b0 100644
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
+++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
@@ -33,9 +33,9 @@
 import javax.net.ssl.SSLSocket;
 import javax.net.ssl.SSLSocketFactory;
 
-import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.HttpHeaders;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.SpellCheckedMetadata;
 import org.apache.nutch.net.protocols.HttpDateFormat;
@@ -49,7 +49,6 @@
  */
 public class HttpResponse implements Response {
 
-  private Configuration conf;
   private HttpBase http;
   private URL url;
   private byte[] content;
@@ -150,9 +149,7 @@ public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
         socket = sslsocket;
       }
 
-      this.conf = http.getConf();
-      if (sockAddr != null
-          && conf.getBoolean("store.ip.address", false) == true) {
+      if (sockAddr != null && http.isStoreIPAddress()) {
         headers.add("_ip_", sockAddr.getAddress().getHostAddress());
       }
 
@@ -217,15 +214,15 @@ public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
       }
 
       if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
-        reqStr.append("If-Modified-Since: " + HttpDateFormat
-            .toString(datum.getModifiedTime()));
+        reqStr.append(HttpHeaders.IF_MODIFIED_SINCE + ": "
+            + HttpDateFormat.toString(datum.getModifiedTime()));
         reqStr.append("\r\n");
       }
       reqStr.append("\r\n");
 
       // store the request in the metadata?
-      if (conf.getBoolean("store.http.request", false) == true) {
-        headers.add("_request_", reqStr.toString());
+      if (http.isStoreHttpRequest()) {
+        headers.add(Response.REQUEST, reqStr.toString());
       }
 
       byte[] reqBytes = reqStr.toString().getBytes();
@@ -241,11 +238,11 @@ public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
       StringBuffer line = new StringBuffer();
 
       // store the http headers verbatim
-      if (conf.getBoolean("store.http.headers", false) == true) {
+      if (http.isStoreHttpHeaders()) {
         httpHeaders = new StringBuffer();
       }
 
-      headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis()));
+      headers.add(FETCH_TIME, Long.toString(System.currentTimeMillis()));
 
       boolean haveSeenNonContinueStatus = false;
       while (!haveSeenNonContinueStatus) {
@@ -273,9 +270,9 @@ public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
         content = http.processDeflateEncoded(content, url);
       } else {
         // store the headers verbatim only if the response was not compressed
-        // as the content length reported with not match otherwise
+        // as the content length reported does not match otherwise
         if (httpHeaders != null) {
-          headers.add("_response.headers_", httpHeaders.toString());
+          headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
         }
         if (Http.LOG.isTraceEnabled()) {
           Http.LOG.trace("fetched " + content.length + " bytes from " + url);
diff --git a/src/plugin/protocol-okhttp/build.xml b/src/plugin/protocol-okhttp/build.xml
new file mode 100755
index 000000000..644eeb0ea
--- /dev/null
+++ b/src/plugin/protocol-okhttp/build.xml
@@ -0,0 +1,50 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-okhttp" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-http"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+    </fileset>
+    <pathelement location="${build.dir}/test/conf"/>
+  </path>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-http"/>
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+    <copy toDir="${build.test}">
+      <fileset dir="${src.test}" excludes="**/*.java"/>
+    </copy>
+  </target>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data" />
+  <copy todir="${build.test}/data">
+      <fileset dir="jsp"/>
+   </copy>
+
+</project>
diff --git a/src/plugin/protocol-okhttp/ivy.xml b/src/plugin/protocol-okhttp/ivy.xml
new file mode 100644
index 000000000..4c9035138
--- /dev/null
+++ b/src/plugin/protocol-okhttp/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="com.squareup.okhttp3" name="okhttp" rev="3.10.0"/>
+  </dependencies>
+  
+</ivy-module>
diff --git a/src/plugin/protocol-okhttp/jsp/basic-http.jsp b/src/plugin/protocol-okhttp/jsp/basic-http.jsp
new file mode 100644
index 000000000..bf1f8bd30
--- /dev/null
+++ b/src/plugin/protocol-okhttp/jsp/basic-http.jsp
@@ -0,0 +1,44 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  Example JSP Page to Test Protocol-Http Plugin  
+--%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
+String path = request.getContextPath();
+String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
+%>
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+  <head>
+    <base href="<%=basePath%>">
+    
+    <title>HelloWorld</title>
+    <meta http-equiv="content-type" content="text/html;charset=utf-8" />
+    <meta name="Language" content="en" />
+       <meta http-equiv="pragma" content="no-cache">
+       <meta http-equiv="cache-control" content="no-cache">
+       <meta http-equiv="expires" content="0">    
+       <meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
+       <meta http-equiv="description" content="This is my page">
+       <!--
+       <link rel="stylesheet" type="text/css" href="styles.css">
+       -->
+  </head>
+  
+  <body>
+    Hello World!!! <br>
+  </body>
+</html>
diff --git a/src/plugin/protocol-okhttp/jsp/brokenpage.jsp b/src/plugin/protocol-okhttp/jsp/brokenpage.jsp
new file mode 100644
index 000000000..f3f7c4aba
--- /dev/null
+++ b/src/plugin/protocol-okhttp/jsp/brokenpage.jsp
@@ -0,0 +1,47 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  Example JSP Page to Test Protocol-Http Plugin
+--%>
+
+@ page language="java" import="java.util.*" pageEncoding="UTF-8"
+
+String path = request.getContextPath();
+String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
+
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+  <head>
+    <base href="<%=basePath%>">
+    
+    <title>HelloWorld</title>
+    <meta http-equiv="content-type" content="text/html;charset=utf-8" />
+    <meta name="Language" content="en" />
+       <meta http-equiv="pragma" content="no-cache">
+       <meta http-equiv="cache-control" content="no-cache">
+       <meta http-equiv="expires" content="0">    
+       <meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
+       <meta http-equiv="description" content="This is my page">
+       <!--
+       <link rel="stylesheet" type="text/css" href="styles.css">
+       -->
+  </head>
+  
+  <body>
+    Hello World!!! <br>
+  </body>
+</html>
diff --git a/src/plugin/protocol-okhttp/jsp/redirect301.jsp b/src/plugin/protocol-okhttp/jsp/redirect301.jsp
new file mode 100644
index 000000000..1100b891e
--- /dev/null
+++ b/src/plugin/protocol-okhttp/jsp/redirect301.jsp
@@ -0,0 +1,49 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  Example JSP Page to Test Protocol-Http Plugin
+--%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
+String path = request.getContextPath();
+String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
+%>
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+  <head>
+    <base href="<%=basePath%>">
+    
+    <title>My JSP page</title>
+    
+       <meta http-equiv="pragma" content="no-cache">
+       <meta http-equiv="cache-control" content="no-cache">
+       <meta http-equiv="expires" content="0">    
+       <meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
+       <meta http-equiv="description" content="This is my page">
+       <!--
+       <link rel="stylesheet" type="text/css" href="styles.css">
+       -->
+
+  </head>
+  
+  <body>
+       <%
+       response.setStatus(301);
+       response.setHeader( "Location", "http://nutch.apache.org");
+       response.setHeader( "Connection", "close" );
+               %> 
+    You are redirected by JSP<br>
+  </body>
+</html>
diff --git a/src/plugin/protocol-okhttp/jsp/redirect302.jsp b/src/plugin/protocol-okhttp/jsp/redirect302.jsp
new file mode 100644
index 000000000..8a250d9aa
--- /dev/null
+++ b/src/plugin/protocol-okhttp/jsp/redirect302.jsp
@@ -0,0 +1,49 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  Example JSP Page to Test Protocol-Http Plugin 
+--%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
+String path = request.getContextPath();
+String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
+%>
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+  <head>
+    <base href="<%=basePath%>">
+    
+    <title>My JSP page</title>
+    
+       <meta http-equiv="pragma" content="no-cache">
+       <meta http-equiv="cache-control" content="no-cache">
+       <meta http-equiv="expires" content="0">    
+       <meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
+       <meta http-equiv="description" content="This is my page">
+       <!--
+       <link rel="stylesheet" type="text/css" href="styles.css">
+       -->
+
+  </head>
+  
+  <body>
+       <%
+       response.setStatus(302);
+       response.setHeader( "Location", "http://nutch.apache.org");
+       response.setHeader( "Connection", "close" );
+               %> 
+    You are sucessfully redirected by JSP<br>
+  </body>
+</html>
diff --git a/src/plugin/protocol-okhttp/plugin.xml b/src/plugin/protocol-okhttp/plugin.xml
new file mode 100755
index 000000000..0152fb057
--- /dev/null
+++ b/src/plugin/protocol-okhttp/plugin.xml
@@ -0,0 +1,53 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="protocol-okhttp"
+   name="OKHttp Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="protocol-okhttp.jar">
+         <export name="*"/>
+      </library>
+      <library name="okhttp-3.10.0.jar"/>
+      <library name="okio-1.14.0.jar"/>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-http"/>
+   </requires>
+
+   <extension id="org.apache.nutch.protocol.okhttp"
+              name="OkHttpProtocol"
+              point="org.apache.nutch.protocol.Protocol">
+
+      <implementation id="org.apache.nutch.protocol.okhttp.OkHttp"
+                      class="org.apache.nutch.protocol.okhttp.OkHttp">
+        <parameter name="protocolName" value="http"/>
+      </implementation>
+
+      <implementation id="org.apache.nutch.protocol.okhttp.OkHttp"
+                       class="org.apache.nutch.protocol.okhttp.OkHttp">
+           <parameter name="protocolName" value="https"/>
+      </implementation>
+
+   </extension>
+
+</plugin>
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
new file mode 100755
index 000000000..9206f81fc
--- /dev/null
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
@@ -0,0 +1,248 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.okhttp;
+
+import java.lang.invoke.MethodHandles;
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.InetSocketAddress;
+import java.net.Proxy;
+import java.net.ProxySelector;
+import java.net.SocketAddress;
+import java.net.URI;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Base64;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Locale;
+import java.util.concurrent.TimeUnit;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.util.NutchConfiguration;
+
+import okhttp3.Connection;
+import okhttp3.Headers;
+import okhttp3.Interceptor;
+import okhttp3.OkHttpClient;
+import okhttp3.Request;
+
+public class OkHttp extends HttpBase {
+
+  protected static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  private final List<String[]> customRequestHeaders = new LinkedList<>();
+
+  private OkHttpClient client;
+
+  public OkHttp() {
+    super(LOG);
+  }
+
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+
+    // protocols in order of preference
+    List<okhttp3.Protocol> protocols = new ArrayList<>();
+    if (useHttp2) {
+      protocols.add(okhttp3.Protocol.HTTP_2);
+    }
+    protocols.add(okhttp3.Protocol.HTTP_1_1);
+
+    okhttp3.OkHttpClient.Builder builder = new OkHttpClient.Builder()
+        .protocols(protocols) //
+        .retryOnConnectionFailure(true) //
+        .followRedirects(false) //
+        .connectTimeout(timeout, TimeUnit.MILLISECONDS)
+        .writeTimeout(timeout, TimeUnit.MILLISECONDS)
+        .readTimeout(timeout, TimeUnit.MILLISECONDS);
+
+    if (!accept.isEmpty()) {
+      getCustomRequestHeaders().add(new String[] { "Accept", accept });
+    }
+
+    if (!acceptLanguage.isEmpty()) {
+      getCustomRequestHeaders()
+          .add(new String[] { "Accept-Language", acceptLanguage });
+    }
+
+    if (!acceptCharset.isEmpty()) {
+      getCustomRequestHeaders()
+          .add(new String[] { "Accept-Charset", acceptCharset });
+    }
+
+    if (useProxy) {
+      ProxySelector selector = new ProxySelector() {
+        @SuppressWarnings("serial")
+        private final List<Proxy> noProxy = new ArrayList<Proxy>() {
+          {
+            add(Proxy.NO_PROXY);
+          }
+        };
+        @SuppressWarnings("serial")
+        private final List<Proxy> proxy = new ArrayList<Proxy>() {
+          {
+            add(new Proxy(proxyType,
+                new InetSocketAddress(proxyHost, proxyPort)));
+          }
+        };
+        @Override
+        public List<Proxy> select(URI uri) {
+          if (useProxy(uri)) {
+            return proxy;
+          }
+          return noProxy;
+        }
+        @Override
+        public void connectFailed(URI uri, SocketAddress sa, IOException ioe) {
+          LOG.error("Connection to proxy failed for {}: {}", uri, ioe);
+        }
+      };
+      builder.proxySelector(selector);
+    }
+
+    if (storeIPAddress || storeHttpHeaders || storeHttpRequest) {
+        builder.addNetworkInterceptor(new HTTPHeadersInterceptor());
+    }
+
+    client = builder.build();
+  }
+
+  class HTTPHeadersInterceptor implements Interceptor {
+
+    @Override
+    public okhttp3.Response intercept(Interceptor.Chain chain)
+        throws IOException {
+
+      Connection connection = chain.connection();
+      String ipAddress = null;
+      if (storeIPAddress) {
+        InetAddress address = connection.socket().getInetAddress();
+        ipAddress = address.getHostAddress();
+      }
+
+      Request request = chain.request();
+      okhttp3.Response response = chain.proceed(request);
+      String httpProtocol = response.protocol().toString()
+          .toUpperCase(Locale.ROOT);
+      if (useHttp2 && "H2".equals(httpProtocol)) {
+        // back-warc compatible protocol name
+        httpProtocol = "HTTP/2";
+      }
+
+      StringBuilder resquestverbatim = null;
+      StringBuilder responseverbatim = null;
+
+      if (storeHttpRequest) {
+        resquestverbatim = new StringBuilder();
+
+        resquestverbatim.append(request.method()).append(' ');
+        resquestverbatim.append(request.url().encodedPath());
+        String query = request.url().encodedQuery();
+        if (query != null) {
+          resquestverbatim.append('?').append(query);
+        }
+        resquestverbatim.append(' ').append(httpProtocol).append("\r\n");
+
+        Headers headers = request.headers();
+
+        for (int i = 0, size = headers.size(); i < size; i++) {
+          String key = headers.name(i);
+          String value = headers.value(i);
+          resquestverbatim.append(key).append(": ").append(value)
+              .append("\r\n");
+        }
+
+        resquestverbatim.append("\r\n");
+      }
+
+      if (storeHttpHeaders) {
+        responseverbatim = new StringBuilder();
+
+        responseverbatim.append(httpProtocol).append(' ')
+            .append(response.code());
+        if (!response.message().isEmpty()) {
+          responseverbatim.append(' ').append(response.message());
+        }
+        responseverbatim.append("\r\n");
+
+        Headers headers = response.headers();
+
+        for (int i = 0, size = headers.size(); i < size; i++) {
+          String key = headers.name(i);
+          String value = headers.value(i);
+          responseverbatim.append(key).append(": ").append(value)
+              .append("\r\n");
+        }
+
+        responseverbatim.append("\r\n");
+      }
+
+      okhttp3.Response.Builder builder = response.newBuilder();
+
+      if (ipAddress != null) {
+        builder = builder.header(Response.IP_ADDRESS, ipAddress);
+      }
+
+      if (resquestverbatim != null) {
+        byte[] encodedBytesRequest = Base64.getEncoder()
+            .encode(resquestverbatim.toString().getBytes());
+        builder = builder.header(Response.REQUEST,
+            new String(encodedBytesRequest));
+      }
+
+      if (responseverbatim != null) {
+        byte[] encodedBytesResponse = Base64.getEncoder()
+            .encode(responseverbatim.toString().getBytes());
+        builder = builder.header(Response.RESPONSE_HEADERS,
+            new String(encodedBytesResponse));
+      }
+
+      // returns a modified version of the response
+      return builder.build();
+    }
+  }
+
+  protected List<String[]> getCustomRequestHeaders() {
+    return customRequestHeaders;
+  }
+
+  protected OkHttpClient getClient() {
+    return client;
+  }
+
+  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+      throws ProtocolException, IOException {
+    return new OkHttpResponse(this, url, datum);
+  }
+
+  public static void main(String[] args) throws Exception {
+    OkHttp okhttp = new OkHttp();
+    okhttp.setConf(NutchConfiguration.create());
+    main(okhttp, args);
+  }
+
+}
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
new file mode 100644
index 000000000..da24d7ca2
--- /dev/null
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
@@ -0,0 +1,191 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.okhttp;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.net.URL;
+import java.util.Base64;
+
+import org.apache.commons.lang.mutable.MutableBoolean;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import okhttp3.Request;
+import okhttp3.ResponseBody;
+import okio.BufferedSource;
+
+public class OkHttpResponse implements Response {
+
+  protected static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  private URL url;
+  private byte[] content;
+  private int code;
+  private Metadata headers = new Metadata();
+
+  public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum)
+      throws ProtocolException, IOException {
+
+    this.url = url;
+
+    Request.Builder rb = new Request.Builder().url(url);
+
+    rb.header(USER_AGENT, okhttp.getUserAgent());
+    okhttp.getCustomRequestHeaders().forEach((k) -> {
+        rb.header(k[0], k[1]);
+    });
+
+    if (okhttp.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
+      rb.header(IF_MODIFIED_SINCE,
+          HttpDateFormat.toString(datum.getModifiedTime()));
+    }
+
+    if (okhttp.isCookieEnabled()
+        && datum.getMetaData().containsKey(HttpBase.COOKIE)) {
+      String cookie = ((Text) datum.getMetaData().get(HttpBase.COOKIE))
+          .toString();
+      rb.header("Cookie", cookie);
+    }
+
+    Request request = rb.build();
+    okhttp3.Call call = okhttp.getClient().newCall(request);
+
+    okhttp3.Response response = call.execute();
+
+    Metadata responsemetadata = new Metadata();
+    okhttp3.Headers httpHeaders = response.headers();
+
+    for (int i = 0, size = httpHeaders.size(); i < size; i++) {
+      String key = httpHeaders.name(i);
+      String value = httpHeaders.value(i);
+
+      if (key.equals(REQUEST) || key.equals(RESPONSE_HEADERS)) {
+        value = new String(Base64.getDecoder().decode(value));
+      }
+
+      responsemetadata.add(key, value);
+    }
+    LOG.debug("{} - {} {} {}", url, response.protocol(), response.code(),
+        response.message());
+
+    MutableBoolean trimmed = new MutableBoolean();
+    content = toByteArray(response.body(), trimmed, okhttp.getMaxContent(),
+        okhttp.getTimeout());
+    responsemetadata.add(FETCH_TIME, Long.toString(System.currentTimeMillis()));
+    if (trimmed.booleanValue()) {
+      if (!call.isCanceled()) {
+        call.cancel();
+      }
+      responsemetadata.set(TRIMMED_CONTENT, "true");
+      LOG.debug("HTTP content trimmed to {} bytes", content.length);
+    }
+
+    code = response.code();
+    headers = responsemetadata;
+
+  }
+
+  private final byte[] toByteArray(final ResponseBody responseBody,
+      MutableBoolean trimmed, int maxContent, int timeout) throws IOException {
+
+    if (responseBody == null) {
+      return new byte[] {};
+    }
+
+    long endDueFor = -1;
+    if (timeout != -1) {
+      endDueFor = System.currentTimeMillis() + timeout;
+    }
+
+    int maxContentBytes = Integer.MAX_VALUE;
+    if (maxContent != -1) {
+      maxContentBytes = Math.min(maxContentBytes, maxContent);
+    }
+
+    BufferedSource source = responseBody.source();
+    int contentBytesBuffered = 0;
+    int contentBytesRequested = 0;
+    int bufferGrowStepBytes = 8192;
+    while (contentBytesBuffered < maxContentBytes) {
+      contentBytesRequested += Math.min(bufferGrowStepBytes,
+          (maxContentBytes - contentBytesBuffered));
+      boolean success = source.request(contentBytesRequested);
+      contentBytesBuffered = (int) source.buffer().size();
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("total bytes requested = {}, buffered = {}",
+            contentBytesRequested, contentBytesBuffered);
+      }
+      if (!success) {
+        LOG.debug("source exhausted, no more data to read");
+        break;
+      }
+      if (endDueFor != -1 && endDueFor <= System.currentTimeMillis()) {
+        LOG.debug("timeout reached");
+        trimmed.setValue(true);
+        break;
+      }
+      if (contentBytesBuffered > maxContentBytes) {
+        LOG.debug("content limit reached");
+        trimmed.setValue(true);
+      }
+    }
+    int bytesToCopy = contentBytesBuffered;
+    if (maxContent != -1 && contentBytesBuffered > maxContent) {
+      // okhttp's internal buffer is larger than maxContent
+      trimmed.setValue(true);
+      bytesToCopy = maxContentBytes;
+    }
+    byte[] arr = new byte[bytesToCopy];
+    source.buffer().readFully(arr);
+    if (LOG.isDebugEnabled()) {
+      LOG.debug(
+          "copied {} bytes out of {} buffered, remaining buffer contains {} bytes",
+          bytesToCopy, contentBytesBuffered, source.buffer().size());
+    }
+    return arr;
+  }
+
+  public URL getUrl() {
+    return url;
+  }
+
+  public int getCode() {
+    return code;
+  }
+
+  public String getHeader(String name) {
+    return headers.get(name);
+  }
+
+  public Metadata getHeaders() {
+    return headers;
+  }
+
+  public byte[] getContent() {
+    return content;
+  }
+
+}
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/package-info.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/package-info.java
new file mode 100644
index 000000000..7bdf14a75
--- /dev/null
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/package-info.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Protocol plugin based on <a href="https://github.com/square/okhttp">okhttp</a>, supports http, https, http/2.
+ */
+package org.apache.nutch.protocol.okhttp;
diff --git a/src/plugin/protocol-okhttp/src/test/conf/nutch-site-test.xml b/src/plugin/protocol-okhttp/src/test/conf/nutch-site-test.xml
new file mode 100644
index 000000000..72776c3cb
--- /dev/null
+++ b/src/plugin/protocol-okhttp/src/test/conf/nutch-site-test.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<configuration>
+
+<property>
+  <name>http.agent.name</name>
+  <value>Nutch-Test</value>
+</property>
+
+<property>
+  <name>http.timeout</name>
+  <value>60000</value>
+</property>
+
+<property>
+  <name>store.http.headers</name>
+  <value>true</value>
+</property>
+
+<property>
+  <name>http.content.limit</name>
+  <value>65536</value>
+  <description></description>
+</property>
+
+</configuration>
diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java
new file mode 100644
index 000000000..d8d2654a1
--- /dev/null
+++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java
@@ -0,0 +1,318 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.okhttp;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.lang.invoke.MethodHandles;
+import java.net.InetSocketAddress;
+import java.net.ServerSocket;
+import java.net.Socket;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.junit.After;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Test cases for protocol-okhttp - robustness regarding bad server responses:
+ * malformed HTTP header lines, etc. See NUTCH-2549.
+ */
+public class TestBadServerResponses {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  private OkHttp http;
+  private ServerSocket server;
+  private Configuration conf;
+  private int port = 47506;
+
+  private static final String responseHeader = "HTTP/1.1 200 OK\r\n";
+  private static final String simpleContent = "Content-Type: text/html\r\n\r\nThis is a text.";
+
+  public void setUp() throws Exception {
+    conf = new Configuration();
+    conf.addResource("nutch-default.xml");
+    conf.addResource("nutch-site-test.xml");
+    conf.setBoolean("store.http.headers", true);
+
+    http = new OkHttp();
+    http.setConf(conf);
+  }
+
+  @After
+  public void tearDown() throws Exception {
+    server.close();
+  }
+
+  /**
+   * Starts the test server at a specified port and constant response.
+   * 
+   * @param port
+   *          Port number.
+   * @param response
+   *          response sent on every request
+   */
+  private void runServer(int port, String response) throws Exception {
+    server = new ServerSocket();
+    server.bind(new InetSocketAddress("127.0.0.1", port));
+    Pattern requestPattern = Pattern.compile("(?i)^GET\\s+(\\S+)");
+    while (true) {
+      LOG.info("Listening on port {}", port);
+      Socket socket = server.accept();
+      LOG.info("Connection received");
+      try (
+          BufferedReader in = new BufferedReader(new InputStreamReader(
+              socket.getInputStream(), StandardCharsets.UTF_8));
+          PrintWriter out = new PrintWriter(new OutputStreamWriter(
+              socket.getOutputStream(), StandardCharsets.UTF_8), true)) {
+
+        String line;
+        while ((line = in.readLine()) != null) {
+          LOG.info("Request: {}", line);
+          if (line.trim().isEmpty()) {
+            break;
+          }
+          Matcher m = requestPattern.matcher(line);
+          if (m.find()) {
+            LOG.info("Requested {}", m.group(1));
+            if (!m.group(1).startsWith("/")) {
+              response = "HTTP/1.1 400 Bad request\r\n\r\n";
+            }
+          }
+        }
+        LOG.info("Response: {}",
+            response.substring(0, Math.min(1024, response.length())));
+        out.print(response);
+      } catch (Exception e) {
+        LOG.warn("Exception in test server:", e);
+      }
+    }
+  }
+
+  private void launchServer(String response) throws InterruptedException {
+    Thread serverThread = new Thread(() -> {
+      try {
+        runServer(port, response);
+      } catch (Exception e) {
+        LOG.warn("Test server died:", e);
+      }
+    });
+    serverThread.start();
+    Thread.sleep(50);
+  }
+
+  /**
+   * Fetches the specified <code>page</code> from the local test server and
+   * checks whether the HTTP response status code matches with the expected
+   * code.
+   * 
+   * @param page
+   *          Page to be fetched.
+   * @param expectedCode
+   *          HTTP response status code expected while fetching the page.
+   */
+  private Response fetchPage(String page, int expectedCode) throws Exception {
+    URL url = new URL("http", "127.0.0.1", port, page);
+    LOG.info("Fetching {}", url);
+    CrawlDatum crawlDatum = new CrawlDatum();
+    Response response = http.getResponse(url, crawlDatum, true);
+    assertEquals("HTTP Status Code for " + url, expectedCode,
+        response.getCode());
+    return response;
+  }
+
+  @Test
+  public void testBadHttpServer() throws Exception {
+    setUp();
+    // test with trivial well-formed content, to make sure the server is
+    // responding
+    launchServer(responseHeader + simpleContent);
+    fetchPage("/", 200);
+  }
+
+  /**
+   * NUTCH-2555 URL normalization problem: path not starting with a '/'
+   */
+  @Test
+  public void testRequestNotStartingWithSlash() throws Exception {
+    setUp();
+    launchServer(responseHeader + simpleContent);
+    fetchPage("?171", 200);
+  }
+
+  /**
+   * NUTCH-2564 protocol-http throws an error when the content-length header is
+   * not a number
+   */
+  @Test
+  public void testContentLengthNotANumber() throws Exception {
+    setUp();
+    launchServer(
+        responseHeader + "Content-Length: thousand\r\n" + simpleContent);
+    fetchPage("/", 200);
+  }
+
+  /**
+   * NUTCH-2559 protocol-http cannot handle colons after the HTTP status code
+   */
+  @Ignore("Fails with okhttp 3.10.0")
+  @Test
+  public void testHeaderWithColon() throws Exception {
+    setUp();
+    launchServer("HTTP/1.1 200: OK\r\n" + simpleContent);
+    fetchPage("/", 200);
+  }
+
+  /**
+   * NUTCH-2563 HTTP header spellchecking issues
+   */
+  @Test
+  public void testHeaderSpellChecking() throws Exception {
+    setUp();
+    launchServer(responseHeader + "Client-Transfer-Encoding: chunked\r\n"
+        + simpleContent);
+    fetchPage("/", 200);
+  }
+
+  /**
+   * NUTCH-2557 protocol-http fails to follow redirections when an HTTP response
+   * body is invalid
+   */
+  @Ignore("Fails with okhttp 3.10.0")
+  @Test
+  public void testIgnoreErrorInRedirectPayload() throws Exception {
+    setUp();
+    launchServer("HTTP/1.1 302 Found\r\nLocation: http://example.com/\r\n"
+        + "Transfer-Encoding: chunked\r\n\r\nNot a valid chunk.");
+    Response fetched = fetchPage("/", 302);
+    assertNotNull("No redirect Location.", fetched.getHeader("Location"));
+    assertEquals("Wrong redirect Location.", "http://example.com/",
+        fetched.getHeader("Location"));
+  }
+
+  /**
+   * NUTCH-2558 protocol-http cannot handle a missing HTTP status line
+   */
+  @Ignore("Fails with okhttp 3.10.0")
+  @Test
+  public void testNoStatusLine() throws Exception {
+    setUp();
+    String text = "This is a text containing non-ASCII characters: \u00e4\u00f6\u00fc\u00df";
+    launchServer(text);
+    Response fetched = fetchPage("/", 200);
+    assertEquals("Wrong text returned for response with no status line.", text,
+        new String(fetched.getContent(), StandardCharsets.UTF_8));
+    server.close();
+    text = "<!DOCTYPE html>\n<html>\n<head>\n"
+        + "<title>Testing no HTTP header èéâ</title>\n"
+        + "<meta charset=\"utf-8\">\n"
+        + "</head>\n<body>This is a text containing non-ASCII characters:"
+        + "\u00e4\u00f6\u00fc\u00df</body>\n</html";
+    launchServer(text);
+    fetched = fetchPage("/", 200);
+    assertEquals("Wrong text returned for response with no status line.", text,
+        new String(fetched.getContent(), StandardCharsets.UTF_8));
+  }
+
+  /**
+   * NUTCH-2560 protocol-http throws an error when an http header spans over
+   * multiple lines
+   */
+  @Ignore("Fails with okhttp 3.10.0")
+  @Test
+  public void testMultiLineHeader() throws Exception {
+    setUp();
+    launchServer(responseHeader
+        + "Set-Cookie: UserID=JohnDoe;\r\n  Max-Age=3600;\r\n  Version=1\r\n"
+        + simpleContent);
+    Response fetched = fetchPage("/", 200);
+    LOG.info("Headers: {}", fetched.getHeaders());
+    assertNotNull("Failed to set multi-line \"Set-Cookie\" header.", fetched.getHeader("Set-Cookie"));
+    assertTrue("Failed to set multi-line \"Set-Cookie\" header.",
+        fetched.getHeader("Set-Cookie").contains("Version=1"));
+  }
+
+  /**
+   * NUTCH-2561 protocol-http can be made to read arbitrarily large HTTP
+   * responses
+   */
+  @Test(expected = Exception.class)
+  public void testOverlongHeader() throws Exception {
+    setUp();
+    StringBuilder response = new StringBuilder();
+    response.append(responseHeader);
+    for (int i = 0; i < 80; i++) {
+      response.append("X-Custom-Header-");
+      for (int j = 0; j < 10000; j++) {
+        response.append('x');
+      }
+      response.append(": hello\r\n");
+    }
+    response.append("\r\n" + simpleContent);
+    launchServer(response.toString());
+    // should throw exception because of overlong header
+    fetchPage("/", 200);
+  }
+
+  /**
+   * NUTCH-2562 protocol-http fails to read large chunked HTTP responses,
+   * NUTCH-2575 protocol-http does not respect the maximum content-size for
+   * chunked responses
+   */
+  @Test
+  public void testChunkedContent() throws Exception {
+    setUp();
+    StringBuilder response = new StringBuilder();
+    response.append(responseHeader);
+    response.append("Content-Type: text/html\r\n");
+    response.append("Transfer-Encoding: chunked\r\n");
+    // 81920 bytes (80 chunks, 1024 bytes each)
+    // > 65536 (http.content.limit defined in nutch-site-test.xml)
+    for (int i = 0; i < 80; i++) {
+      response.append(String.format("\r\n400\r\n%02x\r\n", i));
+      for (int j = 0; j < 1012; j++) {
+        response.append('x');
+      }
+      response.append(String.format("\r\n%02x\r\n", i));
+      response.append("\r\n");
+    }
+    response.append("\r\n0\r\n\r\n");
+    launchServer(response.toString());
+    Response fetched = fetchPage("/", 200);
+    assertEquals(
+        "Chunked content not truncated according to http.content.limit", 65536,
+        fetched.getContent().length);
+  }
+
+}
diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java
new file mode 100644
index 000000000..d276f1c12
--- /dev/null
+++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.okhttp;
+
+import static org.junit.Assert.assertEquals;
+
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.okhttp.OkHttp;
+import org.junit.After;
+import org.junit.Test;
+import org.mortbay.jetty.Server;
+import org.mortbay.jetty.nio.SelectChannelConnector;
+import org.mortbay.jetty.servlet.Context;
+import org.mortbay.jetty.servlet.ServletHolder;
+
+/**
+ * Test cases for protocol-okhttp, fetching pages from a local Jetty server.
+ */
+public class TestProtocolOkHttp {
+  private static final String RES_DIR = System.getProperty("test.data", ".");
+
+  private OkHttp http;
+  private Server server;
+  private Context root;
+  private Configuration conf;
+  private int port;
+
+  /**
+   * Creates the protocol plugin and a Jetty server (not yet started) serving
+   * JSP pages from RES_DIR, below "/redirection" if <code>redirection</code>.
+   */
+  public void setUp(boolean redirection) throws Exception {
+    conf = new Configuration();
+    conf.addResource("nutch-default.xml");
+    conf.addResource("nutch-site-test.xml");
+    http = new OkHttp();
+    http.setConf(conf);
+
+    server = new Server();
+    if (redirection) {
+      root = new Context(server, "/redirection", Context.SESSIONS);
+      root.setAttribute("newContextURL", "/redirect");
+    } else {
+      root = new Context(server, "/", Context.SESSIONS);
+    }
+    ServletHolder sh = new ServletHolder(
+        org.apache.jasper.servlet.JspServlet.class);
+    root.addServlet(sh, "*.jsp");
+    root.setResourceBase(RES_DIR);
+  }
+
+  /** Stops the Jetty server; a no-op if setUp never created one. */
+  @After
+  public void tearDown() throws Exception {
+    if (server != null) {
+      server.stop();
+    }
+  }
+
+  /** Checks status codes of regular, redirecting, missing and broken pages. */
+  @Test
+  public void testStatusCode() throws Exception {
+    startServer(47504, false);
+    fetchPage("/basic-http.jsp", 200);
+    fetchPage("/redirect301.jsp", 301);
+    fetchPage("/redirect302.jsp", 302);
+    fetchPage("/nonexists.html", 404);
+    fetchPage("/brokenpage.jsp", 500);
+  }
+
+  /** Redirection via Jetty (context mounted at "/redirection"). */
+  @Test
+  public void testRedirectionJetty() throws Exception {
+    startServer(47503, true);
+    fetchPage("/redirection", 302);
+  }
+
+  /**
+   * Sets up the test (see {@link #setUp(boolean)}) and starts the Jetty
+   * server listening on 127.0.0.1 at the given port.
+   *
+   * @param portno port number to listen on
+   * @param redirection passed on to {@link #setUp(boolean)}
+   */
+  private void startServer(int portno, boolean redirection) throws Exception {
+    port = portno;
+    setUp(redirection);
+    SelectChannelConnector connector = new SelectChannelConnector();
+    connector.setHost("127.0.0.1");
+    connector.setPort(port);
+    server.addConnector(connector);
+    server.start();
+  }
+
+  /**
+   * Fetches the specified <code>page</code> from the local Jetty server and
+   * checks whether the HTTP response status code matches the expected code.
+   * The content type is verified as well, except for the missing page, the
+   * broken page and the Jetty redirection.
+   *
+   * @param page page to be fetched
+   * @param expectedCode HTTP status code expected while fetching the page
+   */
+  private void fetchPage(String page, int expectedCode) throws Exception {
+    URL url = new URL("http", "127.0.0.1", port, page);
+    CrawlDatum crawlDatum = new CrawlDatum();
+    Response response = http.getResponse(url, crawlDatum, true);
+    ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()),
+        crawlDatum);
+    Content content = out.getContent();
+    assertEquals("HTTP Status Code for " + url, expectedCode,
+        response.getCode());
+    if (!page.equals("/nonexists.html") && !page.equals("/brokenpage.jsp")
+        && !page.equals("/redirection")) {
+      assertEquals("ContentType " + url, "text/html",
+          content.getContentType());
+    }
+  }
+}


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> HTTP protocol plugin based on okhttp
> ------------------------------------
>
>                 Key: NUTCH-2576
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2576
>             Project: Nutch
>          Issue Type: Improvement
>          Components: plugin, protocol
>            Reporter: Sebastian Nagel
>            Priority: Major
>             Fix For: 1.15
>
>
> [Okhttp|http://square.github.io/okhttp/] is an Apache2-licensed http library 
> which supports HTTP/2. [~jnioche]'s implementation 
> [storm-crawler#443|https://github.com/DigitalPebble/storm-crawler/issues/443] 
> proves that it should be straightforward to implement a Nutch protocol plugin 
> using okhttp. A recent HTTP protocol implementation should also fix (most of) 
> the issues reported in NUTCH-2549.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to