This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new e96cfc56e NUTCH-3002 Protocol-okhttp HttpResponse: HTTP header
metadata lookup should be case-insensitive - implement class
CaseInsensitiveMetadata providing case-insensitive metadata look-ups (but no
spell-checking) - use CaseInsensitiveMetadata to hold HTTP header metadata in
the class OkHttpResponse of protocol-okhttp - add unit tests to prove the
fix (and also case-insensitive look-ups and spell-checking in protocol-http)
e96cfc56e is described below
commit e96cfc56ee04c8e7e07e11d4eef521b4674a9ec6
Author: Sebastian Nagel <[email protected]>
AuthorDate: Tue Sep 19 08:10:14 2023 +0200
NUTCH-3002 Protocol-okhttp HttpResponse: HTTP header metadata lookup should
be case-insensitive
- implement class CaseInsensitiveMetadata providing case-insensitive
metadata look-ups (but no spell-checking)
- use CaseInsensitiveMetadata to hold HTTP header metadata in
the class OkHttpResponse of protocol-okhttp
- add unit tests to prove the fix (and also case-insensitive look-ups
and spell-checking in protocol-http)
---
.../nutch/metadata/CaseInsensitiveMetadata.java | 33 +++++
src/java/org/apache/nutch/metadata/Metadata.java | 4 +-
.../nutch/metadata/SpellCheckedMetadata.java | 8 +-
.../org/apache/nutch/net/protocols/Response.java | 2 +-
.../apache/nutch/protocol/http/TestResponse.java | 152 ++++++++++++++++++++
.../nutch/protocol/okhttp/OkHttpResponse.java | 3 +-
.../apache/nutch/protocol/okhttp/TestResponse.java | 154 +++++++++++++++++++++
7 files changed, 348 insertions(+), 8 deletions(-)
diff --git a/src/java/org/apache/nutch/metadata/CaseInsensitiveMetadata.java
b/src/java/org/apache/nutch/metadata/CaseInsensitiveMetadata.java
new file mode 100644
index 000000000..92e848ca2
--- /dev/null
+++ b/src/java/org/apache/nutch/metadata/CaseInsensitiveMetadata.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import java.util.TreeMap;
+
+/**
+ * A {@link Metadata} variant whose keys are looked up case-insensitively.
+ * Unlike {@link SpellCheckedMetadata} it does no spell-checking of names.
+ */
+public class CaseInsensitiveMetadata extends Metadata {
+
+  /**
+   * Constructs a new, empty metadata backed by a {@link TreeMap} that orders
+   * (and therefore matches) its keys with
+   * {@link String#CASE_INSENSITIVE_ORDER}.
+   */
+  public CaseInsensitiveMetadata() {
+    super();
+    this.metadata = new TreeMap<String, String[]>(String.CASE_INSENSITIVE_ORDER);
+  }
+
+}
diff --git a/src/java/org/apache/nutch/metadata/Metadata.java
b/src/java/org/apache/nutch/metadata/Metadata.java
index 5c37911fb..7fa0bb12c 100644
--- a/src/java/org/apache/nutch/metadata/Metadata.java
+++ b/src/java/org/apache/nutch/metadata/Metadata.java
@@ -36,7 +36,7 @@ public class Metadata implements Writable, CreativeCommons,
DublinCore,
/**
* A map of all metadata attributes.
*/
- private Map<String, String[]> metadata = null;
+ protected Map<String, String[]> metadata = null;
/**
* Constructs a new, empty metadata.
@@ -66,7 +66,7 @@ public class Metadata implements Writable, CreativeCommons,
DublinCore,
}
/**
- * Get the value associated to a metadata name. If many values are
assiociated
+ * Get the value associated to a metadata name. If many values are associated
* to the specified name, then the first one is returned.
*
* @param name
diff --git a/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
b/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
index fdbf1b62c..be161440e 100644
--- a/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
+++ b/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
@@ -25,7 +25,7 @@ import org.apache.commons.lang.StringUtils;
/**
* A decorator to Metadata that adds spellchecking capabilities to property
- * names. Currently used spelling vocabulary contains just the httpheaders from
+ * names. Currently used spelling vocabulary contains just the HTTP headers
from
* {@link HttpHeaders} class.
*
*/
@@ -94,7 +94,7 @@ public class SpellCheckedMetadata extends Metadata {
/**
* Get the normalized name of metadata attribute name. This method tries to
* find a well-known metadata name (one of the metadata names defined in this
- * class) that matches the specified name. The matching is error tolerent.
For
+ * class) that matches the specified name. The matching is error tolerant.
For
* instance,
* <ul>
* <li>content-type gives Content-Type</li>
@@ -105,8 +105,8 @@ public class SpellCheckedMetadata extends Metadata {
* name is returned.
*
* @param name
- * Name to normalize
- * @return normalized name
+ * HTTP header name to normalize
+ * @return normalized HTTP header name
*/
public static String getNormalizedName(final String name) {
String searched = normalize(name);
diff --git a/src/java/org/apache/nutch/net/protocols/Response.java
b/src/java/org/apache/nutch/net/protocols/Response.java
index 0159358ec..514ce8561 100644
--- a/src/java/org/apache/nutch/net/protocols/Response.java
+++ b/src/java/org/apache/nutch/net/protocols/Response.java
@@ -86,7 +86,7 @@ public interface Response extends HttpHeaders {
/**
* Get the value of a named header.
- * @param name key of the header you wish to retreive
+ * @param name key of the header you wish to retrieve
* @return header value
*/
public String getHeader(String name);
diff --git
a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestResponse.java
b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestResponse.java
new file mode 100644
index 000000000..9d65b6df8
--- /dev/null
+++
b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestResponse.java
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.http;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Map;
+import java.util.TreeMap;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.AbstractHttpProtocolPluginTest;
+import org.apache.nutch.protocol.ProtocolException;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Tests header look-ups on {@link HttpResponse} (protocol-http): standard
+ * HTTP headers are expected to be found regardless of case and despite small
+ * misspellings, non-standard headers only by their exact name.
+ */
+public class TestResponse extends AbstractHttpProtocolPluginTest {
+
+  /** Status line and common headers of the canned redirect responses. */
+  protected static final String redirectHeader = "HTTP/1.1 301 Moved Permanently\r\n" //
+      + "Content-Type: text/html; charset=UTF-8\r\n" //
+      + "Content-Length: 0\r\n";
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  /**
+   * @return class name of the plugin under test, i.e. the protocol-http
+   *         implementation instantiated in {@link #setUp()}
+   */
+  @Override
+  protected String getPluginClassName() {
+    return "org.apache.nutch.protocol.http.Http";
+  }
+
+  /**
+   * Builds the test configuration and instantiates the {@link Http} protocol
+   * implementation directly.
+   */
+  @Override
+  @Before
+  public void setUp() throws Exception {
+    conf = new Configuration();
+    conf.addResource("nutch-default.xml");
+    /*
+     * plugin tests specific config file - needs to add the tested plugin to
+     * plugin.includes
+     */
+    conf.addResource("nutch-site-test.xml");
+    conf.setBoolean("store.http.headers", true);
+
+    http = new Http();
+    http.setConf(conf);
+  }
+
+  /**
+   * Fetches the path <code>/&lt;headerName&gt;</code> from the local test
+   * server.
+   *
+   * @param statusCode
+   *          status passed to the {@link CrawlDatum} used for the fetch
+   * @param headerName
+   *          name of the header, also used as the path of the fetched URL
+   * @return the response, or null if the fetch failed (the cause is logged)
+   */
+  protected HttpResponse getResponse(int statusCode, String headerName) {
+    try {
+      URL url = new URL(protocol, localHost, defaultPort, "/" + headerName);
+      LOG.info("Emulating fetch of {}", url);
+      return new HttpResponse((Http) http, url, new CrawlDatum(statusCode, 1000));
+    } catch (ProtocolException | IOException e) {
+      // keep the "null on failure" contract, but do not hide the cause
+      LOG.error("Failed to fetch /{}", headerName, e);
+      return null;
+    }
+  }
+
+  /**
+   * Fetches the response carrying the header <code>headerName</code> and
+   * asserts that looking up <code>lookupName</code> yields <code>value</code>.
+   *
+   * @param statusCode
+   *          status passed on to {@link #getResponse(int, String)}
+   * @param headerName
+   *          header name as sent by the test server
+   * @param value
+   *          expected header value
+   * @param lookupName
+   *          name used to look up the header in the response
+   */
+  protected void headerTest(int statusCode, String headerName, String value, String lookupName) {
+    HttpResponse response = getResponse(statusCode, headerName);
+    if (response == null) {
+      // fail with a clear message instead of a NullPointerException below
+      throw new AssertionError(
+          "No response received when fetching /" + headerName);
+    }
+    LOG.info("Response headers:");
+    LOG.info(response.getHeaders().get(Response.RESPONSE_HEADERS));
+    assertEquals(
+        "No or unexpected value of header \"" + headerName
+            + "\" returned when retrieving header \"" + lookupName + "\"",
+        value, response.getHeader(lookupName));
+  }
+
+  /**
+   * Builds the canned responses served by the test server: one redirect per
+   * spelling variant of "Location" plus one response with a custom header.
+   *
+   * @param headerValue
+   *          value sent with every tested header
+   * @return map from request path to raw response bytes
+   */
+  protected Map<String, byte[]> getResponses(String headerValue) {
+    String[] headerNames = { "Location", "location", "LOCATION", "Loction" };
+    Map<String, byte[]> responses = new TreeMap<>();
+    for (String headerName : headerNames) {
+      // NOTE(review): redirectHeader already contains "Content-Length: 0",
+      // so the header is sent twice - confirm whether this is intended
+      responses.put("/" + headerName,
+          (redirectHeader + headerName + ": " + headerValue + "\r\n"
+              + "Content-Length: 0\r\n\r\n").getBytes(UTF_8));
+    }
+    responses.put("/MyCustomHeader", (responseHeader + "MyCustomHeader" + ": "
+        + headerValue + "\r\n" + simpleContent).getBytes(UTF_8));
+    return responses;
+  }
+
+  /** Verifies header look-ups for a standard and a non-standard header. */
+  @Test
+  public void testGetHeader() throws Exception {
+    String value = "headervalue";
+    launchServer(getResponses(value));
+
+    LOG.info(
+        "Testing standard HTTP header \"Location\": expected case-insensitive and error-tolerant matching");
+    headerTest(301, "Location", value, "Location");
+    headerTest(301, "Location", value, "location");
+    headerTest(301, "location", value, "Location");
+    headerTest(301, "LOCATION", value, "Location");
+    headerTest(301, "Loction", value, "Location");
+
+    LOG.info(
+        "Testing non-standard HTTP header \"MyCustomHeader\": only exact matching");
+    headerTest(200, "MyCustomHeader", value, "MyCustomHeader");
+    /*
+     * The following case-insensitive or approximate look-ups are not supported
+     * for non-standard headers by SpellCheckedMetadata:
+     */
+    // headerTest(200, "MyCustomHeader", value, "mycustomheader");
+    // headerTest(200, "mycustomheader", value, "MyCustomHeader");
+    // headerTest(200, "MYCUSTOMHEADER", value, "MyCustomHeader");
+  }
+
+  /**
+   * Manual benchmark of the header look-ups; waits 30 seconds before starting
+   * so that a profiler can be attached.
+   */
+  @Ignore("Only for benchmarking")
+  @Test
+  public void testMetadataBenchmark() throws MalformedURLException, ProtocolException,
+      IOException, InterruptedException {
+    String value = "headervalue";
+    launchServer(getResponses(value));
+    Thread.sleep(30000); // time to attach a profiler
+    int iterations = 4000;
+    // five headerTest calls per iteration
+    LOG.info("Starting benchmark with {} iterations ({} calls)", iterations,
+        (iterations * 5));
+    long start = System.currentTimeMillis();
+    for (int i = 0; i < iterations; i++) {
+      headerTest(301, "Location", value, "Location");
+      headerTest(301, "Location", value, "location");
+      headerTest(301, "location", value, "Location");
+      headerTest(301, "LOCATION", value, "Location");
+      headerTest(301, "Loction", value, "Location");
+    }
+    long elapsed = System.currentTimeMillis() - start;
+    LOG.info("Benchmark finished, elapsed: {}, {}ms per call", elapsed,
+        (elapsed / (5.0 * iterations)));
+  }
+
+}
diff --git
a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
index 67bc45b03..605c03390 100644
---
a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
+++
b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
@@ -24,6 +24,7 @@ import java.util.Locale;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.CaseInsensitiveMetadata;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
@@ -106,7 +107,7 @@ public class OkHttpResponse implements Response {
// ensure that Response and underlying ResponseBody are closed
try (okhttp3.Response response = call.execute()) {
- Metadata responsemetadata = new Metadata();
+ Metadata responsemetadata = new CaseInsensitiveMetadata();
okhttp3.Headers httpHeaders = response.headers();
for (int i = 0, size = httpHeaders.size(); i < size; i++) {
diff --git
a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestResponse.java
b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestResponse.java
new file mode 100644
index 000000000..695a6c539
--- /dev/null
+++
b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestResponse.java
@@ -0,0 +1,154 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.okhttp;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Map;
+import java.util.TreeMap;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.AbstractHttpProtocolPluginTest;
+import org.apache.nutch.protocol.ProtocolException;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Tests header look-ups on {@link OkHttpResponse} (protocol-okhttp): HTTP
+ * headers are expected to be found regardless of the case of the name used
+ * for the look-up.
+ */
+public class TestResponse extends AbstractHttpProtocolPluginTest {
+
+  /** Status line and common headers of the canned redirect responses. */
+  protected static final String redirectHeader = "HTTP/1.1 301 Moved Permanently\r\n" //
+      + "Content-Type: text/html; charset=UTF-8\r\n" //
+      + "Content-Length: 0\r\n";
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  /** @return class name of the plugin under test */
+  @Override
+  protected String getPluginClassName() {
+    return "org.apache.nutch.protocol.okhttp.OkHttp";
+  }
+
+  /**
+   * Builds the test configuration and instantiates the {@link OkHttp}
+   * protocol implementation directly.
+   */
+  @Override
+  @Before
+  public void setUp() throws Exception {
+    conf = new Configuration();
+    conf.addResource("nutch-default.xml");
+    /*
+     * plugin tests specific config file - needs to add the tested plugin to
+     * plugin.includes
+     */
+    conf.addResource("nutch-site-test.xml");
+    conf.setBoolean("store.http.headers", true);
+
+    http = new OkHttp();
+    http.setConf(conf);
+  }
+
+  /**
+   * Fetches the path <code>/&lt;headerName&gt;</code> from the local test
+   * server.
+   *
+   * @param statusCode
+   *          status passed to the {@link CrawlDatum} used for the fetch
+   * @param headerName
+   *          name of the header, also used as the path of the fetched URL
+   * @return the response, or null if the fetch failed (the cause is logged)
+   */
+  protected OkHttpResponse getResponse(int statusCode, String headerName) {
+    try {
+      URL url = new URL(protocol, localHost, defaultPort, "/" + headerName);
+      LOG.info("Emulating fetch of {}", url);
+      return new OkHttpResponse((OkHttp) http, url, new CrawlDatum(statusCode, 1000));
+    } catch (ProtocolException | IOException e) {
+      // keep the "null on failure" contract, but do not hide the cause
+      LOG.error("Failed to fetch /{}", headerName, e);
+      return null;
+    }
+  }
+
+  /**
+   * Fetches the response carrying the header <code>headerName</code> and
+   * asserts that looking up <code>lookupName</code> yields <code>value</code>.
+   *
+   * @param statusCode
+   *          status passed on to {@link #getResponse(int, String)}
+   * @param headerName
+   *          header name as sent by the test server
+   * @param value
+   *          expected header value
+   * @param lookupName
+   *          name used to look up the header in the response
+   */
+  protected void headerTest(int statusCode, String headerName, String value, String lookupName) {
+    OkHttpResponse response = getResponse(statusCode, headerName);
+    if (response == null) {
+      // fail with a clear message instead of a NullPointerException below
+      throw new AssertionError(
+          "No response received when fetching /" + headerName);
+    }
+    LOG.info("Response headers:");
+    LOG.info(response.getHeaders().get(Response.RESPONSE_HEADERS));
+    assertEquals(
+        "No or unexpected value of header \"" + headerName
+            + "\" returned when retrieving header \"" + lookupName + "\"",
+        value, response.getHeader(lookupName));
+  }
+
+  /**
+   * Builds the canned responses served by the test server: one redirect per
+   * spelling variant of "Location" plus one response with a custom header.
+   *
+   * @param headerValue
+   *          value sent with every tested header
+   * @return map from request path to raw response bytes
+   */
+  protected Map<String, byte[]> getResponses(String headerValue) {
+    String[] headerNames = { "Location", "location", "LOCATION", "Loction" };
+    Map<String, byte[]> responses = new TreeMap<>();
+    for (String headerName : headerNames) {
+      // NOTE(review): redirectHeader already contains "Content-Length: 0",
+      // so the header is sent twice - confirm whether this is intended
+      responses.put("/" + headerName,
+          (redirectHeader + headerName + ": " + headerValue + "\r\n"
+              + "Content-Length: 0\r\n\r\n").getBytes(UTF_8));
+    }
+    responses.put("/MyCustomHeader", (responseHeader + "MyCustomHeader" + ": "
+        + headerValue + "\r\n" + simpleContent).getBytes(UTF_8));
+    return responses;
+  }
+
+  /** Verifies header look-ups for a standard and a non-standard header. */
+  @Test
+  public void testGetHeader() throws Exception {
+    String value = "headervalue";
+    launchServer(getResponses(value));
+
+    LOG.info(
+        "Testing standard HTTP header \"Location\": expected case-insensitive and error-tolerant matching");
+    headerTest(301, "Location", value, "Location");
+    headerTest(301, "Location", value, "location");
+    headerTest(301, "location", value, "Location");
+    headerTest(301, "LOCATION", value, "Location");
+    // only with SpellCheckedMetadata:
+    // headerTest(301, "Loction", value, "Location");
+
+    LOG.info(
+        "Testing non-standard HTTP header \"MyCustomHeader\": only exact matching");
+    headerTest(200, "MyCustomHeader", value, "MyCustomHeader");
+    /*
+     * The following case-insensitive look-ups are not supported for
+     * non-standard headers by SpellCheckedMetadata.
+     * NOTE(review): OkHttpResponse keeps its headers in a
+     * CaseInsensitiveMetadata, so they presumably succeed here - confirm and
+     * enable.
+     */
+    // headerTest(200, "MyCustomHeader", value, "mycustomheader");
+    // headerTest(200, "mycustomheader", value, "MyCustomHeader");
+    // headerTest(200, "MYCUSTOMHEADER", value, "MyCustomHeader");
+  }
+
+  /**
+   * Manual benchmark of the header look-ups; waits 30 seconds before starting
+   * so that a profiler can be attached.
+   */
+  @Ignore("Only for benchmarking")
+  @Test
+  public void testMetadataBenchmark() throws MalformedURLException, ProtocolException,
+      IOException, InterruptedException {
+    String value = "headervalue";
+    launchServer(getResponses(value));
+    Thread.sleep(30000); // time to attach a profiler
+    int iterations = 5000;
+    // four headerTest calls per iteration
+    LOG.info("Starting benchmark with {} iterations ({} calls)", iterations,
+        (iterations * 4));
+    long start = System.currentTimeMillis();
+    for (int i = 0; i < iterations; i++) {
+      headerTest(301, "Location", value, "Location");
+      headerTest(301, "Location", value, "location");
+      headerTest(301, "location", value, "Location");
+      headerTest(301, "LOCATION", value, "Location");
+      // only with SpellCheckedMetadata:
+      // headerTest(301, "Loction", value, "Location");
+    }
+    long elapsed = System.currentTimeMillis() - start;
+    LOG.info("Benchmark finished, elapsed: {}, {}ms per call", elapsed,
+        (elapsed / (4.0 * iterations)));
+  }
+
+}