This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit c433f497168c3a8242930c1dd1609ef3a99770ff Author: Sebastian Nagel <[email protected]> AuthorDate: Wed May 23 18:06:53 2018 +0200 NUTCH-2577 protocol-selenium can't handle https - port solution contributed by hussein-alahmad to plugin protocol-interactiveselenium to address NUTCH-2273 --- src/plugin/protocol-interactiveselenium/plugin.xml | 5 + .../protocol/interactiveselenium/HttpResponse.java | 117 ++++++++++++++++++--- 2 files changed, 109 insertions(+), 13 deletions(-) diff --git a/src/plugin/protocol-interactiveselenium/plugin.xml b/src/plugin/protocol-interactiveselenium/plugin.xml index a69a1e5..9f35930 100644 --- a/src/plugin/protocol-interactiveselenium/plugin.xml +++ b/src/plugin/protocol-interactiveselenium/plugin.xml @@ -42,6 +42,11 @@ <parameter name="protocolName" value="http"/> </implementation> + <implementation id="org.apache.nutch.protocol.interactiveselenium.Http" + class="org.apache.nutch.protocol.interactiveselenium.Http"> + <parameter name="protocolName" value="https"/> + </implementation> + </extension> </plugin> diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java index 71707de..7f961d9 100644 --- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java +++ b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java @@ -25,8 +25,15 @@ import java.io.PushbackInputStream; import java.net.InetSocketAddress; import java.net.Socket; import java.net.URL; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import javax.net.ssl.SSLSocket; +import javax.net.ssl.SSLSocketFactory; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.SpellCheckedMetadata; @@ -50,8 +57,13 @@ public class HttpResponse implements Response { private byte[] content; private int code; private Metadata headers = new SpellCheckedMetadata(); - private static InteractiveSeleniumHandler[] handlers; + private InteractiveSeleniumHandler[] handlers; + // used for storing the http headers verbatim + private StringBuffer httpHeaders; + protected enum Scheme { + HTTP, HTTPS, + } /** The nutch configuration */ private Configuration conf = null; @@ -62,9 +74,15 @@ public class HttpResponse implements Response { this.url = url; this.orig = url.toString(); this.base = url.toString(); + Scheme scheme = null; - if (!"http".equals(url.getProtocol())) - throw new HttpException("Not an HTTP url:" + url); + if ("http".equals(url.getProtocol())) { + scheme = Scheme.HTTP; + } else if ("https".equals(url.getProtocol())) { + scheme = Scheme.HTTPS; + } else { + throw new HttpException("Unknown scheme (not http/https) for url:" + url); + } if (Http.LOG.isTraceEnabled()) { Http.LOG.trace("fetching " + url); @@ -80,7 +98,11 @@ public class HttpResponse implements Response { int port; String portString; if (url.getPort() == -1) { - port = 80; + if (scheme == Scheme.HTTP) { + port = 80; + } else { + port = 443; + } portString = ""; } else { port = url.getPort(); @@ -98,6 +120,36 @@ public class HttpResponse implements Response { InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort); socket.connect(sockAddr, http.getTimeout()); + if (scheme == Scheme.HTTPS) { + SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory + .getDefault(); + SSLSocket sslsocket = (SSLSocket) factory + .createSocket(socket, sockHost, sockPort, true); + sslsocket.setUseClientMode(true); + + // Get the protocols and ciphers supported by this JVM + Set<String> protocols = new HashSet<String>( + Arrays.asList(sslsocket.getSupportedProtocols())); + Set<String> ciphers = new HashSet<String>( + Arrays.asList(sslsocket.getSupportedCipherSuites())); + + // Intersect with preferred protocols and ciphers + protocols.retainAll(http.getTlsPreferredProtocols()); + ciphers.retainAll(http.getTlsPreferredCipherSuites()); + + sslsocket.setEnabledProtocols( + protocols.toArray(new String[protocols.size()])); + sslsocket.setEnabledCipherSuites( + ciphers.toArray(new String[ciphers.size()])); + + sslsocket.startHandshake(); + socket = sslsocket; + } + + if (sockAddr != null + && conf.getBoolean("store.ip.address", false) == true) { + headers.add("_ip_", sockAddr.getAddress().getHostAddress()); + } // make request OutputStream req = socket.getOutputStream(); @@ -128,20 +180,49 @@ public class HttpResponse implements Response { reqStr.append("\r\n"); } - reqStr.append("Accept-Language: "); - reqStr.append(this.http.getAcceptLanguage()); - reqStr.append("\r\n"); + String acceptLanguage = http.getAcceptLanguage(); + if (!acceptLanguage.isEmpty()) { + reqStr.append("Accept-Language: "); + reqStr.append(acceptLanguage); + reqStr.append("\r\n"); + } - reqStr.append("Accept: "); - reqStr.append(this.http.getAccept()); - reqStr.append("\r\n"); + String acceptCharset = http.getAcceptCharset(); + if (!acceptCharset.isEmpty()) { + reqStr.append("Accept-Charset: "); + reqStr.append(acceptCharset); + reqStr.append("\r\n"); + } + + String accept = http.getAccept(); + if (!accept.isEmpty()) { + reqStr.append("Accept: "); + reqStr.append(accept); + reqStr.append("\r\n"); + } + + if (http.isCookieEnabled() + && datum.getMetaData().containsKey(HttpBase.COOKIE)) { + String cookie = ((Text) datum.getMetaData().get(HttpBase.COOKIE)) + .toString(); + reqStr.append("Cookie: "); + reqStr.append(cookie); + reqStr.append("\r\n"); + } - if (datum.getModifiedTime() > 0) { - reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime())); + if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) { + reqStr.append("If-Modified-Since: " + HttpDateFormat + .toString(datum.getModifiedTime())); reqStr.append("\r\n"); } reqStr.append("\r\n"); + // store the request in the metadata? + if (conf.getBoolean("store.http.request", false) == true) { + headers.add("_request_", reqStr.toString()); + } + + byte[] reqBytes = reqStr.toString().getBytes(); req.write(reqBytes); @@ -153,10 +234,20 @@ public class HttpResponse implements Response { StringBuffer line = new StringBuffer(); + + // store the http headers verbatim + if (conf.getBoolean("store.http.headers", false) == true) { + httpHeaders = new StringBuffer(); + } + + headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis())); + boolean haveSeenNonContinueStatus = false; while (!haveSeenNonContinueStatus) { // parse status code line this.code = parseStatusLine(in, line); + if (httpHeaders != null) + httpHeaders.append(line).append("\n"); // parse headers parseHeaders(in, line); haveSeenNonContinueStatus = code != 100; // 100 is "Continue" @@ -250,7 +341,7 @@ public class HttpResponse implements Response { handlers = new InteractiveSeleniumHandler[handlerNames.length]; for (int i = 0; i < handlerNames.length; i++) { try { - String classToLoad = this.getClass().getPackage().getName() + "." + handlerNames[i]; + String classToLoad = this.getClass().getPackage().getName() + ".handlers." + handlerNames[i]; handlers[i] = InteractiveSeleniumHandler.class.cast(Class.forName(classToLoad).newInstance()); Http.LOG.info("Successfully loaded " + classToLoad); } catch (ClassNotFoundException e) { -- To stop receiving notification emails like this one, please contact [email protected].
