This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 42002477b NUTCH-3062 protocol-okhttp: optionally record HTTP and 
SSL/TLS versions (#822)
42002477b is described below

commit 42002477b93f9e78be65ba7d2ecd617d7bbf424a
Author: Sebastian Nagel <[email protected]>
AuthorDate: Fri Sep 13 17:23:19 2024 +0200

    NUTCH-3062 protocol-okhttp: optionally record HTTP and SSL/TLS versions 
(#822)
    
    * NUTCH-3062 protocol-okhttp: optionally record HTTP and SSL/TLS versions
    
    Adds property store.protocol.versions - if true, protocol-okhttp records
    protocol versions and related information in response metadata.
    
    * NUTCH-3062 protocol-okhttp: optionally record HTTP and SSL/TLS versions
    
    Activate HTTPHeadersInterceptor also if only store.protocol.versions is
    true, but no other headers and connection information is intercepted.
---
 conf/nutch-default.xml                             | 10 +++++++
 .../org/apache/nutch/net/protocols/Response.java   | 12 +++++++++
 .../apache/nutch/protocol/http/api/HttpBase.java   |  7 +++++
 .../org/apache/nutch/protocol/okhttp/OkHttp.java   | 31 +++++++++++++++++-----
 4 files changed, 53 insertions(+), 7 deletions(-)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index c00d9776b..fe6eeccf7 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -2681,6 +2681,16 @@ CAUTION: Set the parser.timeout to -1 or a bigger value 
than 30, when using this
   </description>
 </property>
 
+<property>
+  <name>store.protocol.versions</name>
+  <value>false</value>
+  <description>
+    Store protocol versions in response metadata: HTTP and SSL/TLS
+    versions, SSL/TLS cipher suites and related information depending
+    on the protocol implementation. Supported by: protocol-okhttp.
+  </description>
+</property>
+
 <!-- index-links plugin -->
 
 <property>
diff --git a/src/java/org/apache/nutch/net/protocols/Response.java 
b/src/java/org/apache/nutch/net/protocols/Response.java
index 514ce8561..3fbe93266 100644
--- a/src/java/org/apache/nutch/net/protocols/Response.java
+++ b/src/java/org/apache/nutch/net/protocols/Response.java
@@ -41,6 +41,18 @@ public interface Response extends HttpHeaders {
    */
   public static final String IP_ADDRESS = "_ip_";
 
+  /**
+   * Key to hold the HTTP and SSL/TLS protocol versions if
+   * <code>store.protocol.versions</code> is true.
+   */
+  public static final String PROTOCOL_VERSIONS = "_protocol_versions_";
+
+  /**
+   * Key to hold the SSL/TLS cipher suites if
+   * <code>store.protocol.versions</code> is true.
+   */
+  public static final String CIPHER_SUITES = "_cipher_suites_";
+
   /**
    * Key to hold the time when the page has been fetched
    */
diff --git 
a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java 
b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 150f1ad82..7e337f844 100644
--- 
a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ 
b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -167,6 +167,12 @@ public abstract class HttpBase implements Protocol {
    */
   protected boolean storeHttpHeaders = false;
 
+  /**
+   * Record the HTTP and SSL/TLS protocol versions and the SSL/TLS cipher
+   * suites, see property <code>store.protocol.versions</code>.
+   */
+  protected boolean storeProtocolVersions = false;
+
   /** Skip page if Crawl-Delay longer than this value. */
   protected long maxCrawlDelay = -1L;
 
@@ -235,6 +241,7 @@ public abstract class HttpBase implements Protocol {
     this.storeIPAddress = conf.getBoolean("store.ip.address", false);
     this.storeHttpRequest = conf.getBoolean("store.http.request", false);
     this.storeHttpHeaders = conf.getBoolean("store.http.headers", false);
+    this.storeProtocolVersions = conf.getBoolean("store.protocol.versions", 
false);
     this.enableIfModifiedsinceHeader = conf
         .getBoolean("http.enable.if.modified.since.header", true);
     this.enableCookieHeader = conf.getBoolean("http.enable.cookie.header",
diff --git 
a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
 
b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
index 876c4ef24..954c3f6df 100644
--- 
a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
+++ 
b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
@@ -25,6 +25,7 @@ import java.net.ProxySelector;
 import java.net.SocketAddress;
 import java.net.URI;
 import java.net.URL;
+import java.nio.charset.StandardCharsets;
 import java.security.cert.CertificateException;
 import java.util.ArrayList;
 import java.util.Base64;
@@ -53,6 +54,7 @@ import org.slf4j.LoggerFactory;
 import okhttp3.Authenticator;
 import okhttp3.Connection;
 import okhttp3.ConnectionPool;
+import okhttp3.Handshake;
 import okhttp3.Headers;
 import okhttp3.Interceptor;
 import okhttp3.OkHttpClient;
@@ -217,7 +219,8 @@ public class OkHttp extends HttpBase {
       builder.addNetworkInterceptor(new 
HTTPFilterIPAddressInterceptor(ipFilterRules));
     }
 
-    if (this.storeIPAddress || this.storeHttpHeaders || this.storeHttpRequest) 
{
+    if (this.storeIPAddress || this.storeHttpHeaders || this.storeHttpRequest
+        || this.storeProtocolVersions) {
       builder.addNetworkInterceptor(new HTTPHeadersInterceptor());
     }
 
@@ -373,17 +376,31 @@ public class OkHttp extends HttpBase {
       }
 
       if (requestverbatim != null) {
-        byte[] encodedBytesRequest = Base64.getEncoder()
-            .encode(requestverbatim.toString().getBytes());
+        byte[] encodedBytesRequest = Base64.getEncoder().encode(
+            requestverbatim.toString().getBytes(StandardCharsets.ISO_8859_1));
         builder = builder.header(Response.REQUEST,
-            new String(encodedBytesRequest));
+            new String(encodedBytesRequest, StandardCharsets.ISO_8859_1));
       }
 
       if (responseverbatim != null) {
-        byte[] encodedBytesResponse = Base64.getEncoder()
-            .encode(responseverbatim.toString().getBytes());
+        byte[] encodedBytesResponse = Base64.getEncoder().encode(
+            responseverbatim.toString().getBytes(StandardCharsets.ISO_8859_1));
         builder = builder.header(Response.RESPONSE_HEADERS,
-            new String(encodedBytesResponse));
+            new String(encodedBytesResponse, StandardCharsets.ISO_8859_1));
+      }
+
+      // store the HTTP and SSL/TLS protocol versions and SSL/TLS cipher suites
+      if (storeProtocolVersions) {
+        final StringBuilder protocols = new StringBuilder(
+            response.protocol().toString());
+        final Handshake handshake = connection.handshake();
+        if (handshake != null) {
+          protocols.append(',').append(handshake.tlsVersion().javaName());
+          builder = builder.header(Response.CIPHER_SUITES,
+              handshake.cipherSuite().toString());
+        }
+        builder = builder.header(Response.PROTOCOL_VERSIONS,
+            protocols.toString());
       }
 
       // returns a modified version of the response

Reply via email to