This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new e96cfc56e NUTCH-3002 Protocol-okhttp HttpResponse: HTTP header 
metadata lookup should be case-insensitive - implement class 
CaseInsensitiveMetadata providing case-insensitive metadata look-ups (but no 
spell-checking) - use CaseInsensitiveMetadata to hold HTTP header metadata in 
the class OkHttpResponse of protocol-okhttp - add unit tests to prove the 
fix (and also case-insensitive look-ups and spell-checking in protocol-http)
e96cfc56e is described below

commit e96cfc56ee04c8e7e07e11d4eef521b4674a9ec6
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Tue Sep 19 08:10:14 2023 +0200

    NUTCH-3002 Protocol-okhttp HttpResponse: HTTP header metadata lookup should 
be case-insensitive
    - implement class CaseInsensitiveMetadata providing case-insensitive
      metadata look-ups (but no spell-checking)
    - use CaseInsensitiveMetadata to hold HTTP header metadata in
      the class OkHttpResponse of protocol-okhttp
    - add unit tests to prove the fix (and also case-insensitive look-ups
      and spell-checking in protocol-http)
---
 .../nutch/metadata/CaseInsensitiveMetadata.java    |  33 +++++
 src/java/org/apache/nutch/metadata/Metadata.java   |   4 +-
 .../nutch/metadata/SpellCheckedMetadata.java       |   8 +-
 .../org/apache/nutch/net/protocols/Response.java   |   2 +-
 .../apache/nutch/protocol/http/TestResponse.java   | 152 ++++++++++++++++++++
 .../nutch/protocol/okhttp/OkHttpResponse.java      |   3 +-
 .../apache/nutch/protocol/okhttp/TestResponse.java | 154 +++++++++++++++++++++
 7 files changed, 348 insertions(+), 8 deletions(-)

diff --git a/src/java/org/apache/nutch/metadata/CaseInsensitiveMetadata.java 
b/src/java/org/apache/nutch/metadata/CaseInsensitiveMetadata.java
new file mode 100644
index 000000000..92e848ca2
--- /dev/null
+++ b/src/java/org/apache/nutch/metadata/CaseInsensitiveMetadata.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import java.util.TreeMap;
+
+/**
+ * A {@link Metadata} subclass whose keys are looked up case-insensitively.
+ * Unlike {@link SpellCheckedMetadata} it does no spell-checking of names.
+ */
+public class CaseInsensitiveMetadata extends Metadata {
+
+  /**
+   * Constructs a new, empty metadata. The attribute map is replaced by a
+   * {@link TreeMap} ordered by {@link String#CASE_INSENSITIVE_ORDER}, so
+   * that names differing only in letter case address the same entry.
+   */
+  public CaseInsensitiveMetadata() {
+    super();
+    this.metadata = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
+  }
+
+}
diff --git a/src/java/org/apache/nutch/metadata/Metadata.java 
b/src/java/org/apache/nutch/metadata/Metadata.java
index 5c37911fb..7fa0bb12c 100644
--- a/src/java/org/apache/nutch/metadata/Metadata.java
+++ b/src/java/org/apache/nutch/metadata/Metadata.java
@@ -36,7 +36,7 @@ public class Metadata implements Writable, CreativeCommons, 
DublinCore,
   /**
    * A map of all metadata attributes.
    */
-  private Map<String, String[]> metadata = null;
+  protected Map<String, String[]> metadata = null;
 
   /**
    * Constructs a new, empty metadata.
@@ -66,7 +66,7 @@ public class Metadata implements Writable, CreativeCommons, 
DublinCore,
   }
 
   /**
-   * Get the value associated to a metadata name. If many values are 
assiociated
+   * Get the value associated to a metadata name. If many values are associated
    * to the specified name, then the first one is returned.
    * 
    * @param name
diff --git a/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java 
b/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
index fdbf1b62c..be161440e 100644
--- a/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
+++ b/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
@@ -25,7 +25,7 @@ import org.apache.commons.lang.StringUtils;
 
 /**
  * A decorator to Metadata that adds spellchecking capabilities to property
- * names. Currently used spelling vocabulary contains just the httpheaders from
+ * names. Currently used spelling vocabulary contains just the HTTP headers 
from
  * {@link HttpHeaders} class.
  * 
  */
@@ -94,7 +94,7 @@ public class SpellCheckedMetadata extends Metadata {
   /**
    * Get the normalized name of metadata attribute name. This method tries to
    * find a well-known metadata name (one of the metadata names defined in this
-   * class) that matches the specified name. The matching is error tolerent. 
For
+   * class) that matches the specified name. The matching is error tolerant. 
For
    * instance,
    * <ul>
    * <li>content-type gives Content-Type</li>
@@ -105,8 +105,8 @@ public class SpellCheckedMetadata extends Metadata {
    * name is returned.
    * 
    * @param name
-   *          Name to normalize
-   * @return normalized name
+   *          HTTP header name to normalize
+   * @return normalized HTTP header name
    */
   public static String getNormalizedName(final String name) {
     String searched = normalize(name);
diff --git a/src/java/org/apache/nutch/net/protocols/Response.java 
b/src/java/org/apache/nutch/net/protocols/Response.java
index 0159358ec..514ce8561 100644
--- a/src/java/org/apache/nutch/net/protocols/Response.java
+++ b/src/java/org/apache/nutch/net/protocols/Response.java
@@ -86,7 +86,7 @@ public interface Response extends HttpHeaders {
 
   /**
    * Get the value of a named header.
-   * @param name key of the header you wish to retreive
+   * @param name key of the header you wish to retrieve
    * @return header value
    */
   public String getHeader(String name);
diff --git 
a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestResponse.java
 
b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestResponse.java
new file mode 100644
index 000000000..9d65b6df8
--- /dev/null
+++ 
b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestResponse.java
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.http;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Map;
+import java.util.TreeMap;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.AbstractHttpProtocolPluginTest;
+import org.apache.nutch.protocol.ProtocolException;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Tests header look-ups on {@link HttpResponse} (protocol-http): names of
+ * standard HTTP headers are expected to match case-insensitively and
+ * error-tolerantly, names of non-standard headers only exactly.
+ */
+public class TestResponse extends AbstractHttpProtocolPluginTest {
+
+  /** Status line and headers shared by all emulated redirect responses. */
+  protected static final String redirectHeader = //
+      "HTTP/1.1 301 Moved Permanently\r\n" //
+          + "Content-Type: text/html; charset=UTF-8\r\n" //
+          + "Content-Length: 0\r\n";
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  @Override
+  protected String getPluginClassName() {
+    // this is the protocol-http test: name the plugin class instantiated in
+    // setUp(), not the one of protocol-okhttp
+    return "org.apache.nutch.protocol.http.Http";
+  }
+
+  @Override
+  @Before
+  public void setUp() throws Exception {
+    conf = new Configuration();
+    conf.addResource("nutch-default.xml");
+    /*
+     * plugin tests specific config file - needs to add the tested plugin to
+     * plugin.includes
+     */
+    conf.addResource("nutch-site-test.xml");
+    conf.setBoolean("store.http.headers", true);
+
+    http = new Http();
+    http.setConf(conf);
+  }
+
+  /**
+   * Fetch the test page registered under the URL path
+   * {@code "/" + headerName}.
+   *
+   * @param statusCode
+   *          first argument handed to the {@link CrawlDatum} constructor
+   * @param headerName
+   *          header name, also used as URL path of the page to fetch
+   * @return the response, never null
+   * @throws AssertionError
+   *           if the fetch fails, the original exception is attached as cause
+   */
+  protected HttpResponse getResponse(int statusCode, String headerName) {
+    try {
+      URL url = new URL(protocol, localHost, defaultPort, "/" + headerName);
+      LOG.info("Emulating fetch of {}", url);
+      return new HttpResponse((Http) http, url,
+          new CrawlDatum(statusCode, 1000));
+    } catch (ProtocolException | IOException e) {
+      // fail with the real cause instead of returning null, which would only
+      // surface as a NullPointerException in the caller
+      throw new AssertionError("Failed to fetch /" + headerName, e);
+    }
+  }
+
+  /**
+   * Fetch the page for {@code headerName} and assert that looking up
+   * {@code lookupName} in the response returns {@code value}.
+   *
+   * @param statusCode
+   *          passed on to {@link #getResponse(int, String)}
+   * @param headerName
+   *          header name as sent in the emulated response
+   * @param value
+   *          expected header value
+   * @param lookupName
+   *          header name used for the look-up
+   */
+  protected void headerTest(int statusCode, String headerName, String value,
+      String lookupName) {
+    HttpResponse response = getResponse(statusCode, headerName);
+    LOG.info("Response headers:");
+    LOG.info(response.getHeaders().get(Response.RESPONSE_HEADERS));
+    assertEquals(
+        "No or unexpected value of header \"" + headerName
+            + "\" returned when retrieving header \"" + lookupName + "\"",
+        value, response.getHeader(lookupName));
+  }
+
+  /**
+   * Build the raw responses handed to {@code launchServer}, keyed by URL
+   * path: one redirect per spelling of "Location" and one page carrying the
+   * non-standard header "MyCustomHeader".
+   *
+   * @param headerValue
+   *          value sent with each of the headers
+   * @return map of URL path to raw response bytes
+   */
+  protected Map<String, byte[]> getResponses(String headerValue) {
+    String[] headerNames = { "Location", "location", "LOCATION", "Loction" };
+    Map<String, byte[]> responses = new TreeMap<>();
+    for (String headerName : headerNames) {
+      responses.put("/" + headerName,
+          (redirectHeader + headerName + ": " + headerValue + "\r\n"
+              + "Content-Length: 0\r\n\r\n").getBytes(UTF_8));
+    }
+    responses.put("/MyCustomHeader", (responseHeader + "MyCustomHeader" + ": "
+        + headerValue + "\r\n" + simpleContent).getBytes(UTF_8));
+    return responses;
+  }
+
+  @Test
+  public void testGetHeader() throws Exception {
+    String value = "headervalue";
+    launchServer(getResponses(value));
+
+    LOG.info("Testing standard HTTP header \"Location\": "
+        + "expected case-insensitive and error-tolerant matching");
+    headerTest(301, "Location", value, "Location");
+    headerTest(301, "Location", value, "location");
+    headerTest(301, "location", value, "Location");
+    headerTest(301, "LOCATION", value, "Location");
+    headerTest(301, "Loction", value, "Location");
+
+    LOG.info("Testing non-standard HTTP header \"MyCustomHeader\": "
+        + "only exact matching");
+    headerTest(200, "MyCustomHeader", value, "MyCustomHeader");
+    /*
+     * The following case-insensitive or approximate look-ups are not supported
+     * for non-standard headers by SpellCheckedMetadata:
+     */
+    // headerTest(200, "MyCustomHeader", value, "mycustomheader");
+    // headerTest(200, "mycustomheader", value, "MyCustomHeader");
+    // headerTest(200, "MYCUSTOMHEADER", value, "MyCustomHeader");
+  }
+
+  /**
+   * Manual benchmark of the header look-ups, disabled by default. Sleeps
+   * before starting so that a profiler can be attached.
+   */
+  @Ignore("Only for benchmarking")
+  @Test
+  public void testMetadataBenchmark() throws MalformedURLException,
+      ProtocolException, IOException, InterruptedException {
+    String value = "headervalue";
+    launchServer(getResponses(value));
+    Thread.sleep(30000); // time to attach a profiler
+    int iterations = 4000;
+    LOG.info("Starting benchmark with {} iterations ({} calls)", iterations,
+        (iterations * 5));
+    long start = System.currentTimeMillis();
+    for (int i = 0; i < iterations; i++) {
+      headerTest(301, "Location", value, "Location");
+      headerTest(301, "Location", value, "location");
+      headerTest(301, "location", value, "Location");
+      headerTest(301, "LOCATION", value, "Location");
+      headerTest(301, "Loction", value, "Location");
+    }
+    long elapsed = System.currentTimeMillis() - start;
+    LOG.info("Benchmark finished, elapsed: {}, {}ms per call", elapsed,
+        (elapsed / (5.0 * iterations)));
+  }
+
+}
diff --git 
a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
 
b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
index 67bc45b03..605c03390 100644
--- 
a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
+++ 
b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
@@ -24,6 +24,7 @@ import java.util.Locale;
 
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.CaseInsensitiveMetadata;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.net.protocols.Response;
@@ -106,7 +107,7 @@ public class OkHttpResponse implements Response {
     // ensure that Response and underlying ResponseBody are closed
     try (okhttp3.Response response = call.execute()) {
 
-      Metadata responsemetadata = new Metadata();
+      Metadata responsemetadata = new CaseInsensitiveMetadata();
       okhttp3.Headers httpHeaders = response.headers();
 
       for (int i = 0, size = httpHeaders.size(); i < size; i++) {
diff --git 
a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestResponse.java
 
b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestResponse.java
new file mode 100644
index 000000000..695a6c539
--- /dev/null
+++ 
b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestResponse.java
@@ -0,0 +1,154 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.okhttp;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Map;
+import java.util.TreeMap;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.AbstractHttpProtocolPluginTest;
+import org.apache.nutch.protocol.ProtocolException;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Tests header look-ups on {@link OkHttpResponse} (protocol-okhttp): header
+ * names are expected to match case-insensitively. Error-tolerant matching of
+ * misspelled names is not tested for this plugin.
+ */
+public class TestResponse extends AbstractHttpProtocolPluginTest {
+
+  /** Status line and headers shared by all emulated redirect responses. */
+  protected static final String redirectHeader = //
+      "HTTP/1.1 301 Moved Permanently\r\n" //
+          + "Content-Type: text/html; charset=UTF-8\r\n" //
+          + "Content-Length: 0\r\n";
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  @Override
+  protected String getPluginClassName() {
+    return "org.apache.nutch.protocol.okhttp.OkHttp";
+  }
+
+  @Override
+  @Before
+  public void setUp() throws Exception {
+    conf = new Configuration();
+    conf.addResource("nutch-default.xml");
+    /*
+     * plugin tests specific config file - needs to add the tested plugin to
+     * plugin.includes
+     */
+    conf.addResource("nutch-site-test.xml");
+    conf.setBoolean("store.http.headers", true);
+
+    http = new OkHttp();
+    http.setConf(conf);
+  }
+
+  /**
+   * Fetch the test page registered under the URL path
+   * {@code "/" + headerName}.
+   *
+   * @param statusCode
+   *          first argument handed to the {@link CrawlDatum} constructor
+   * @param headerName
+   *          header name, also used as URL path of the page to fetch
+   * @return the response, never null
+   * @throws AssertionError
+   *           if the fetch fails, the original exception is attached as cause
+   */
+  protected OkHttpResponse getResponse(int statusCode, String headerName) {
+    try {
+      URL url = new URL(protocol, localHost, defaultPort, "/" + headerName);
+      LOG.info("Emulating fetch of {}", url);
+      return new OkHttpResponse((OkHttp) http, url,
+          new CrawlDatum(statusCode, 1000));
+    } catch (ProtocolException | IOException e) {
+      // fail with the real cause instead of returning null, which would only
+      // surface as a NullPointerException in the caller
+      throw new AssertionError("Failed to fetch /" + headerName, e);
+    }
+  }
+
+  /**
+   * Fetch the page for {@code headerName} and assert that looking up
+   * {@code lookupName} in the response returns {@code value}.
+   *
+   * @param statusCode
+   *          passed on to {@link #getResponse(int, String)}
+   * @param headerName
+   *          header name as sent in the emulated response
+   * @param value
+   *          expected header value
+   * @param lookupName
+   *          header name used for the look-up
+   */
+  protected void headerTest(int statusCode, String headerName, String value,
+      String lookupName) {
+    OkHttpResponse response = getResponse(statusCode, headerName);
+    LOG.info("Response headers:");
+    LOG.info(response.getHeaders().get(Response.RESPONSE_HEADERS));
+    assertEquals(
+        "No or unexpected value of header \"" + headerName
+            + "\" returned when retrieving header \"" + lookupName + "\"",
+        value, response.getHeader(lookupName));
+  }
+
+  /**
+   * Build the raw responses handed to {@code launchServer}, keyed by URL
+   * path: one redirect per spelling of "Location" and one page carrying the
+   * non-standard header "MyCustomHeader".
+   *
+   * @param headerValue
+   *          value sent with each of the headers
+   * @return map of URL path to raw response bytes
+   */
+  protected Map<String, byte[]> getResponses(String headerValue) {
+    String[] headerNames = { "Location", "location", "LOCATION", "Loction" };
+    Map<String, byte[]> responses = new TreeMap<>();
+    for (String headerName : headerNames) {
+      responses.put("/" + headerName,
+          (redirectHeader + headerName + ": " + headerValue + "\r\n"
+              + "Content-Length: 0\r\n\r\n").getBytes(UTF_8));
+    }
+    responses.put("/MyCustomHeader", (responseHeader + "MyCustomHeader" + ": "
+        + headerValue + "\r\n" + simpleContent).getBytes(UTF_8));
+    return responses;
+  }
+
+  @Test
+  public void testGetHeader() throws Exception {
+    String value = "headervalue";
+    launchServer(getResponses(value));
+
+    LOG.info("Testing standard HTTP header \"Location\": "
+        + "expected case-insensitive and error-tolerant matching");
+    headerTest(301, "Location", value, "Location");
+    headerTest(301, "Location", value, "location");
+    headerTest(301, "location", value, "Location");
+    headerTest(301, "LOCATION", value, "Location");
+    // only with SpellCheckedMetadata:
+    // headerTest(301, "Loction", value, "Location");
+
+    LOG.info("Testing non-standard HTTP header \"MyCustomHeader\": "
+        + "only exact matching");
+    headerTest(200, "MyCustomHeader", value, "MyCustomHeader");
+    /*
+     * NOTE(review): these look-ups were disabled because SpellCheckedMetadata
+     * does not support them for non-standard headers. OkHttpResponse now holds
+     * its headers in CaseInsensitiveMetadata, so they presumably succeed here
+     * - confirm and enable:
+     */
+    // headerTest(200, "MyCustomHeader", value, "mycustomheader");
+    // headerTest(200, "mycustomheader", value, "MyCustomHeader");
+    // headerTest(200, "MYCUSTOMHEADER", value, "MyCustomHeader");
+  }
+
+  /**
+   * Manual benchmark of the header look-ups, disabled by default. Sleeps
+   * before starting so that a profiler can be attached.
+   */
+  @Ignore("Only for benchmarking")
+  @Test
+  public void testMetadataBenchmark() throws MalformedURLException,
+      ProtocolException, IOException, InterruptedException {
+    String value = "headervalue";
+    launchServer(getResponses(value));
+    Thread.sleep(30000); // time to attach a profiler
+    int iterations = 5000;
+    LOG.info("Starting benchmark with {} iterations ({} calls)", iterations,
+        (iterations * 4));
+    long start = System.currentTimeMillis();
+    for (int i = 0; i < iterations; i++) {
+      headerTest(301, "Location", value, "Location");
+      headerTest(301, "Location", value, "location");
+      headerTest(301, "location", value, "Location");
+      headerTest(301, "LOCATION", value, "Location");
+      // only with SpellCheckedMetadata:
+      // headerTest(301, "Loction", value, "Location");
+    }
+    long elapsed = System.currentTimeMillis() - start;
+    LOG.info("Benchmark finished, elapsed: {}, {}ms per call", elapsed,
+        (elapsed / (4.0 * iterations)));
+  }
+
+}

Reply via email to