This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit c433f497168c3a8242930c1dd1609ef3a99770ff
Author: Sebastian Nagel <[email protected]>
AuthorDate: Wed May 23 18:06:53 2018 +0200

    NUTCH-2577 protocol-selenium can't handle https
    - port solution contributed by hussein-alahmad to plugin
      protocol-interactiveselenium to address NUTCH-2273
---
 src/plugin/protocol-interactiveselenium/plugin.xml |   5 +
 .../protocol/interactiveselenium/HttpResponse.java | 117 ++++++++++++++++++---
 2 files changed, 109 insertions(+), 13 deletions(-)

diff --git a/src/plugin/protocol-interactiveselenium/plugin.xml b/src/plugin/protocol-interactiveselenium/plugin.xml
index a69a1e5..9f35930 100644
--- a/src/plugin/protocol-interactiveselenium/plugin.xml
+++ b/src/plugin/protocol-interactiveselenium/plugin.xml
@@ -42,6 +42,11 @@
         <parameter name="protocolName" value="http"/>
       </implementation>
 
+      <implementation id="org.apache.nutch.protocol.interactiveselenium.Http"
+                      class="org.apache.nutch.protocol.interactiveselenium.Http">
+         <parameter name="protocolName" value="https"/>
+      </implementation>
+
    </extension>
 
 </plugin>
diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
index 71707de..7f961d9 100644
--- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
+++ b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
@@ -25,8 +25,15 @@ import java.io.PushbackInputStream;
 import java.net.InetSocketAddress;
 import java.net.Socket;
 import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import javax.net.ssl.SSLSocket;
+import javax.net.ssl.SSLSocketFactory;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.SpellCheckedMetadata;
@@ -50,8 +57,13 @@ public class HttpResponse implements Response {
   private byte[] content;
   private int code;
   private Metadata headers = new SpellCheckedMetadata();
-  private static InteractiveSeleniumHandler[] handlers;
+  private InteractiveSeleniumHandler[] handlers;
+  // used for storing the http headers verbatim
+  private StringBuffer httpHeaders;
 
+  protected enum Scheme {
+    HTTP, HTTPS,
+  }
   /** The nutch configuration */
   private Configuration conf = null;
 
@@ -62,9 +74,15 @@ public class HttpResponse implements Response {
     this.url = url;
     this.orig = url.toString();
     this.base = url.toString();
+    Scheme scheme = null;
 
-    if (!"http".equals(url.getProtocol()))
-      throw new HttpException("Not an HTTP url:" + url);
+    if ("http".equals(url.getProtocol())) {
+      scheme = Scheme.HTTP;
+    } else if ("https".equals(url.getProtocol())) {
+      scheme = Scheme.HTTPS;
+    } else {
+      throw new HttpException("Unknown scheme (not http/https) for url:" + url);
+    }
 
     if (Http.LOG.isTraceEnabled()) {
       Http.LOG.trace("fetching " + url);
@@ -80,7 +98,11 @@ public class HttpResponse implements Response {
     int port;
     String portString;
     if (url.getPort() == -1) {
-      port = 80;
+      if (scheme == Scheme.HTTP) {
+        port = 80;
+      } else {
+        port = 443;
+      }
       portString = "";
     } else {
       port = url.getPort();
@@ -98,6 +120,36 @@ public class HttpResponse implements Response {
       InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
       socket.connect(sockAddr, http.getTimeout());
 
+      if (scheme == Scheme.HTTPS) {
+        SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory
+                .getDefault();
+        SSLSocket sslsocket = (SSLSocket) factory
+                .createSocket(socket, sockHost, sockPort, true);
+        sslsocket.setUseClientMode(true);
+
+        // Get the protocols and ciphers supported by this JVM
+        Set<String> protocols = new HashSet<String>(
+                Arrays.asList(sslsocket.getSupportedProtocols()));
+        Set<String> ciphers = new HashSet<String>(
+                Arrays.asList(sslsocket.getSupportedCipherSuites()));
+
+        // Intersect with preferred protocols and ciphers
+        protocols.retainAll(http.getTlsPreferredProtocols());
+        ciphers.retainAll(http.getTlsPreferredCipherSuites());
+
+        sslsocket.setEnabledProtocols(
+                protocols.toArray(new String[protocols.size()]));
+        sslsocket.setEnabledCipherSuites(
+                ciphers.toArray(new String[ciphers.size()]));
+
+        sslsocket.startHandshake();
+        socket = sslsocket;
+      }
+
+      if (sockAddr != null
+              && conf.getBoolean("store.ip.address", false) == true) {
+        headers.add("_ip_", sockAddr.getAddress().getHostAddress());
+      }
       // make request
       OutputStream req = socket.getOutputStream();
 
@@ -128,20 +180,49 @@ public class HttpResponse implements Response {
         reqStr.append("\r\n");
       }
 
-      reqStr.append("Accept-Language: ");
-      reqStr.append(this.http.getAcceptLanguage());
-      reqStr.append("\r\n");
+      String acceptLanguage = http.getAcceptLanguage();
+      if (!acceptLanguage.isEmpty()) {
+        reqStr.append("Accept-Language: ");
+        reqStr.append(acceptLanguage);
+        reqStr.append("\r\n");
+      }
 
-      reqStr.append("Accept: ");
-      reqStr.append(this.http.getAccept());
-      reqStr.append("\r\n");
+      String acceptCharset = http.getAcceptCharset();
+      if (!acceptCharset.isEmpty()) {
+        reqStr.append("Accept-Charset: ");
+        reqStr.append(acceptCharset);
+        reqStr.append("\r\n");
+      }
+
+      String accept = http.getAccept();
+      if (!accept.isEmpty()) {
+        reqStr.append("Accept: ");
+        reqStr.append(accept);
+        reqStr.append("\r\n");
+      }
+
+      if (http.isCookieEnabled()
+              && datum.getMetaData().containsKey(HttpBase.COOKIE)) {
+        String cookie = ((Text) datum.getMetaData().get(HttpBase.COOKIE))
+                .toString();
+        reqStr.append("Cookie: ");
+        reqStr.append(cookie);
+        reqStr.append("\r\n");
+      }
 
-      if (datum.getModifiedTime() > 0) {
-        reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime()));
+      if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
+        reqStr.append("If-Modified-Since: " + HttpDateFormat
+                .toString(datum.getModifiedTime()));
         reqStr.append("\r\n");
       }
       reqStr.append("\r\n");
 
+      // store the request in the metadata?
+      if (conf.getBoolean("store.http.request", false) == true) {
+        headers.add("_request_", reqStr.toString());
+      }
+
+
       byte[] reqBytes = reqStr.toString().getBytes();
 
       req.write(reqBytes);
@@ -153,10 +234,20 @@ public class HttpResponse implements Response {
 
       StringBuffer line = new StringBuffer();
 
+
+      // store the http headers verbatim
+      if (conf.getBoolean("store.http.headers", false) == true) {
+        httpHeaders = new StringBuffer();
+      }
+
+      headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis()));
+
       boolean haveSeenNonContinueStatus = false;
       while (!haveSeenNonContinueStatus) {
         // parse status code line
         this.code = parseStatusLine(in, line);
+        if (httpHeaders != null)
+          httpHeaders.append(line).append("\n");
         // parse headers
         parseHeaders(in, line);
         haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
@@ -250,7 +341,7 @@ public class HttpResponse implements Response {
     handlers = new InteractiveSeleniumHandler[handlerNames.length];
     for (int i = 0; i < handlerNames.length; i++) {
         try {
-            String classToLoad = this.getClass().getPackage().getName() + "." + handlerNames[i];
+            String classToLoad = this.getClass().getPackage().getName() + ".handlers." + handlerNames[i];
             handlers[i] = InteractiveSeleniumHandler.class.cast(Class.forName(classToLoad).newInstance());
             Http.LOG.info("Successfully loaded " + classToLoad);
         } catch (ClassNotFoundException e) {

-- 
To stop receiving notification emails like this one, please contact
[email protected].

Reply via email to