This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 42002477b NUTCH-3062 protocol-okhttp: optionally record HTTP and
SSL/TLS versions (#822)
42002477b is described below
commit 42002477b93f9e78be65ba7d2ecd617d7bbf424a
Author: Sebastian Nagel <[email protected]>
AuthorDate: Fri Sep 13 17:23:19 2024 +0200
NUTCH-3062 protocol-okhttp: optionally record HTTP and SSL/TLS versions
(#822)
* NUTCH-3062 protocol-okhttp: optionally record HTTP and SSL/TLS versions
Adds property store.protocol.versions - if true, protocol-okhttp records
protocol versions and related information in response metadata.
* NUTCH-3062 protocol-okhttp: optionally record HTTP and SSL/TLS versions
Activate HTTPHeadersInterceptor also if only store.protocol.versions is
true and no other headers or connection information are intercepted.
---
conf/nutch-default.xml | 10 +++++++
.../org/apache/nutch/net/protocols/Response.java | 12 +++++++++
.../apache/nutch/protocol/http/api/HttpBase.java | 7 +++++
.../org/apache/nutch/protocol/okhttp/OkHttp.java | 31 +++++++++++++++++-----
4 files changed, 53 insertions(+), 7 deletions(-)
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index c00d9776b..fe6eeccf7 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -2681,6 +2681,16 @@ CAUTION: Set the parser.timeout to -1 or a bigger value
than 30, when using this
</description>
</property>
+<property>
+ <name>store.protocol.versions</name>
+ <value>false</value>
+ <description>
+ Store protocol versions in response metadata: HTTP and SSL/TLS
+ versions, SSL/TLS cipher suites and related information depending
+ on the protocol implementation. Supported by: protocol-okhttp.
+ </description>
+</property>
+
<!-- index-links plugin -->
<property>
diff --git a/src/java/org/apache/nutch/net/protocols/Response.java
b/src/java/org/apache/nutch/net/protocols/Response.java
index 514ce8561..3fbe93266 100644
--- a/src/java/org/apache/nutch/net/protocols/Response.java
+++ b/src/java/org/apache/nutch/net/protocols/Response.java
@@ -41,6 +41,18 @@ public interface Response extends HttpHeaders {
*/
public static final String IP_ADDRESS = "_ip_";
+ /**
+ * Key to hold the HTTP and SSL/TLS protocol versions if
+ * <code>store.protocol.versions</code> is true.
+ */
+ public static final String PROTOCOL_VERSIONS = "_protocol_versions_";
+
+ /**
+ * Key to hold the SSL/TLS cipher suites if
+ * <code>store.protocol.versions</code> is true.
+ */
+ public static final String CIPHER_SUITES = "_cipher_suites_";
+
/**
* Key to hold the time when the page has been fetched
*/
diff --git
a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 150f1ad82..7e337f844 100644
---
a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++
b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -167,6 +167,12 @@ public abstract class HttpBase implements Protocol {
*/
protected boolean storeHttpHeaders = false;
+ /**
+ * Record the HTTP and SSL/TLS protocol versions and the SSL/TLS cipher
+ * suites, see property <code>store.protocol.versions</code>.
+ */
+ protected boolean storeProtocolVersions = false;
+
/** Skip page if Crawl-Delay longer than this value. */
protected long maxCrawlDelay = -1L;
@@ -235,6 +241,7 @@ public abstract class HttpBase implements Protocol {
this.storeIPAddress = conf.getBoolean("store.ip.address", false);
this.storeHttpRequest = conf.getBoolean("store.http.request", false);
this.storeHttpHeaders = conf.getBoolean("store.http.headers", false);
+ this.storeProtocolVersions = conf.getBoolean("store.protocol.versions",
false);
this.enableIfModifiedsinceHeader = conf
.getBoolean("http.enable.if.modified.since.header", true);
this.enableCookieHeader = conf.getBoolean("http.enable.cookie.header",
diff --git
a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
index 876c4ef24..954c3f6df 100644
---
a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
+++
b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
@@ -25,6 +25,7 @@ import java.net.ProxySelector;
import java.net.SocketAddress;
import java.net.URI;
import java.net.URL;
+import java.nio.charset.StandardCharsets;
import java.security.cert.CertificateException;
import java.util.ArrayList;
import java.util.Base64;
@@ -53,6 +54,7 @@ import org.slf4j.LoggerFactory;
import okhttp3.Authenticator;
import okhttp3.Connection;
import okhttp3.ConnectionPool;
+import okhttp3.Handshake;
import okhttp3.Headers;
import okhttp3.Interceptor;
import okhttp3.OkHttpClient;
@@ -217,7 +219,8 @@ public class OkHttp extends HttpBase {
builder.addNetworkInterceptor(new
HTTPFilterIPAddressInterceptor(ipFilterRules));
}
- if (this.storeIPAddress || this.storeHttpHeaders || this.storeHttpRequest)
{
+ if (this.storeIPAddress || this.storeHttpHeaders || this.storeHttpRequest
+ || this.storeProtocolVersions) {
builder.addNetworkInterceptor(new HTTPHeadersInterceptor());
}
@@ -373,17 +376,31 @@ public class OkHttp extends HttpBase {
}
if (requestverbatim != null) {
- byte[] encodedBytesRequest = Base64.getEncoder()
- .encode(requestverbatim.toString().getBytes());
+ byte[] encodedBytesRequest = Base64.getEncoder().encode(
+ requestverbatim.toString().getBytes(StandardCharsets.ISO_8859_1));
builder = builder.header(Response.REQUEST,
- new String(encodedBytesRequest));
+ new String(encodedBytesRequest, StandardCharsets.ISO_8859_1));
}
if (responseverbatim != null) {
- byte[] encodedBytesResponse = Base64.getEncoder()
- .encode(responseverbatim.toString().getBytes());
+ byte[] encodedBytesResponse = Base64.getEncoder().encode(
+ responseverbatim.toString().getBytes(StandardCharsets.ISO_8859_1));
builder = builder.header(Response.RESPONSE_HEADERS,
- new String(encodedBytesResponse));
+ new String(encodedBytesResponse, StandardCharsets.ISO_8859_1));
+ }
+
+ // store the HTTP and SSL/TLS protocol versions and SSL/TLS cipher suites
+ if (storeProtocolVersions) {
+ final StringBuilder protocols = new StringBuilder(
+ response.protocol().toString());
+ final Handshake handshake = connection.handshake();
+ if (handshake != null) {
+ protocols.append(',').append(handshake.tlsVersion().javaName());
+ builder = builder.header(Response.CIPHER_SUITES,
+ handshake.cipherSuite().toString());
+ }
+ builder = builder.header(Response.PROTOCOL_VERSIONS,
+ protocols.toString());
}
// returns a modified version of the response