[
https://issues.apache.org/jira/browse/NUTCH-2577?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16487571#comment-16487571
]
ASF GitHub Bot commented on NUTCH-2577:
---------------------------------------
sebastian-nagel closed pull request #330: fix for NUTCH-2577 contributed by
hussein-alahmad
URL: https://github.com/apache/nutch/pull/330
This is a PR merged from a forked repository.
Because GitHub hides the original diff of a foreign pull request (one made
from a fork) once it has been merged, the diff is reproduced below for
the sake of provenance:
diff --git a/src/plugin/protocol-selenium/plugin.xml
b/src/plugin/protocol-selenium/plugin.xml
index 1454c1bd1..2f50606ae 100644
--- a/src/plugin/protocol-selenium/plugin.xml
+++ b/src/plugin/protocol-selenium/plugin.xml
@@ -42,6 +42,11 @@
<parameter name="protocolName" value="http"/>
</implementation>
+ <implementation id="org.apache.nutch.protocol.http.Http"
+ class="org.apache.nutch.protocol.selenium.Http">
+ <parameter name="protocolName" value="https"/>
+ </implementation>
+
</extension>
</plugin>
diff --git
a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
index 681e838a2..6b39a74b7 100644
---
a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
+++
b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
@@ -26,8 +26,16 @@
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+
+import javax.net.ssl.SSLSocket;
+import javax.net.ssl.SSLSocketFactory;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.SpellCheckedMetadata;
@@ -48,7 +56,12 @@
private byte[] content;
private int code;
private Metadata headers = new SpellCheckedMetadata();
+ // used for storing the http headers verbatim
+ private StringBuffer httpHeaders;
+ protected enum Scheme {
+ HTTP, HTTPS,
+ }
/** The nutch configuration */
private Configuration conf = null;
@@ -59,9 +72,15 @@ public HttpResponse(Http http, URL url, CrawlDatum datum)
throws ProtocolExcepti
this.url = url;
this.orig = url.toString();
this.base = url.toString();
+ Scheme scheme = null;
- if (!"http".equals(url.getProtocol()))
- throw new HttpException("Not an HTTP url:" + url);
+ if ("http".equals(url.getProtocol())) {
+ scheme = Scheme.HTTP;
+ } else if ("https".equals(url.getProtocol())) {
+ scheme = Scheme.HTTPS;
+ } else {
+ throw new HttpException("Unknown scheme (not http/https) for url:" +
url);
+ }
if (Http.LOG.isTraceEnabled()) {
Http.LOG.trace("fetching " + url);
@@ -77,7 +96,11 @@ public HttpResponse(Http http, URL url, CrawlDatum datum)
throws ProtocolExcepti
int port;
String portString;
if (url.getPort() == -1) {
- port = 80;
+ if (scheme == Scheme.HTTP) {
+ port = 80;
+ } else {
+ port = 443;
+ }
portString = "";
} else {
port = url.getPort();
@@ -95,6 +118,36 @@ public HttpResponse(Http http, URL url, CrawlDatum datum)
throws ProtocolExcepti
InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
socket.connect(sockAddr, http.getTimeout());
+ if (scheme == Scheme.HTTPS) {
+ SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory
+ .getDefault();
+ SSLSocket sslsocket = (SSLSocket) factory
+ .createSocket(socket, sockHost, sockPort, true);
+ sslsocket.setUseClientMode(true);
+
+ // Get the protocols and ciphers supported by this JVM
+ Set<String> protocols = new HashSet<String>(
+ Arrays.asList(sslsocket.getSupportedProtocols()));
+ Set<String> ciphers = new HashSet<String>(
+ Arrays.asList(sslsocket.getSupportedCipherSuites()));
+
+ // Intersect with preferred protocols and ciphers
+ protocols.retainAll(http.getTlsPreferredProtocols());
+ ciphers.retainAll(http.getTlsPreferredCipherSuites());
+
+ sslsocket.setEnabledProtocols(
+ protocols.toArray(new String[protocols.size()]));
+ sslsocket.setEnabledCipherSuites(
+ ciphers.toArray(new String[ciphers.size()]));
+
+ sslsocket.startHandshake();
+ socket = sslsocket;
+ }
+
+ if (sockAddr != null
+ && conf.getBoolean("store.ip.address", false) == true) {
+ headers.add("_ip_", sockAddr.getAddress().getHostAddress());
+ }
// make request
OutputStream req = socket.getOutputStream();
@@ -125,20 +178,49 @@ public HttpResponse(Http http, URL url, CrawlDatum datum)
throws ProtocolExcepti
reqStr.append("\r\n");
}
- reqStr.append("Accept-Language: ");
- reqStr.append(this.http.getAcceptLanguage());
- reqStr.append("\r\n");
+ String acceptLanguage = http.getAcceptLanguage();
+ if (!acceptLanguage.isEmpty()) {
+ reqStr.append("Accept-Language: ");
+ reqStr.append(acceptLanguage);
+ reqStr.append("\r\n");
+ }
- reqStr.append("Accept: ");
- reqStr.append(this.http.getAccept());
- reqStr.append("\r\n");
+ String acceptCharset = http.getAcceptCharset();
+ if (!acceptCharset.isEmpty()) {
+ reqStr.append("Accept-Charset: ");
+ reqStr.append(acceptCharset);
+ reqStr.append("\r\n");
+ }
+
+ String accept = http.getAccept();
+ if (!accept.isEmpty()) {
+ reqStr.append("Accept: ");
+ reqStr.append(accept);
+ reqStr.append("\r\n");
+ }
+
+ if (http.isCookieEnabled()
+ && datum.getMetaData().containsKey(HttpBase.COOKIE)) {
+ String cookie = ((Text) datum.getMetaData().get(HttpBase.COOKIE))
+ .toString();
+ reqStr.append("Cookie: ");
+ reqStr.append(cookie);
+ reqStr.append("\r\n");
+ }
- if (datum.getModifiedTime() > 0) {
- reqStr.append("If-Modified-Since: " +
HttpDateFormat.toString(datum.getModifiedTime()));
+ if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
+ reqStr.append("If-Modified-Since: " + HttpDateFormat
+ .toString(datum.getModifiedTime()));
reqStr.append("\r\n");
}
reqStr.append("\r\n");
+ // store the request in the metadata?
+ if (conf.getBoolean("store.http.request", false) == true) {
+ headers.add("_request_", reqStr.toString());
+ }
+
+
byte[] reqBytes = reqStr.toString().getBytes();
req.write(reqBytes);
@@ -150,10 +232,20 @@ public HttpResponse(Http http, URL url, CrawlDatum datum)
throws ProtocolExcepti
StringBuffer line = new StringBuffer();
+
+ // store the http headers verbatim
+ if (conf.getBoolean("store.http.headers", false) == true) {
+ httpHeaders = new StringBuffer();
+ }
+
+ headers.add("nutch.fetch.time",
Long.toString(System.currentTimeMillis()));
+
boolean haveSeenNonContinueStatus = false;
while (!haveSeenNonContinueStatus) {
// parse status code line
this.code = parseStatusLine(in, line);
+ if (httpHeaders != null)
+ httpHeaders.append(line).append("\n");
// parse headers
parseHeaders(in, line);
haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
> protocol-selenium can't handle https
> ------------------------------------
>
> Key: NUTCH-2577
> URL: https://issues.apache.org/jira/browse/NUTCH-2577
> Project: Nutch
> Issue Type: Improvement
> Components: protocol
> Affects Versions: 1.14
> Reporter: hussein Al_Ahmad
> Priority: Major
>
> Fetching any https page fails with:
> org.apache.nutch.protocol.ProtocolNotFound: protocol not found for url=https
> at org.apache.nutch.protocol.ProtocolFactory.getProtocol(ProtocolFactory.java:83)
> at org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:687)
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)