This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new e96cfc56e NUTCH-3002 Protocol-okhttp HttpResponse: HTTP header metadata lookup should be case-insensitive - implement class CaseInsensitiveMetadata providing case-insensitive metadata look-ups (but no spell-checking) - use CaseInsensitiveMetadata to hold HTTP header metadata in the class OkHttpResponse of protocol-okhttp - add unit tests to prove the fix (and also case-insensitive look-ups and spell-checking in protocol-http) e96cfc56e is described below commit e96cfc56ee04c8e7e07e11d4eef521b4674a9ec6 Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Tue Sep 19 08:10:14 2023 +0200 NUTCH-3002 Protocol-okhttp HttpResponse: HTTP header metadata lookup should be case-insensitive - implement class CaseInsensitiveMetadata providing case-insensitive metadata look-ups (but no spell-checking) - use CaseInsensitiveMetadata to hold HTTP header metadata in the class OkHttpResponse of protocol-okhttp - add unit tests to prove the fix (and also case-insensitive look-ups and spell-checking in protocol-http) --- .../nutch/metadata/CaseInsensitiveMetadata.java | 33 +++++ src/java/org/apache/nutch/metadata/Metadata.java | 4 +- .../nutch/metadata/SpellCheckedMetadata.java | 8 +- .../org/apache/nutch/net/protocols/Response.java | 2 +- .../apache/nutch/protocol/http/TestResponse.java | 152 ++++++++++++++++++++ .../nutch/protocol/okhttp/OkHttpResponse.java | 3 +- .../apache/nutch/protocol/okhttp/TestResponse.java | 154 +++++++++++++++++++++ 7 files changed, 348 insertions(+), 8 deletions(-) diff --git a/src/java/org/apache/nutch/metadata/CaseInsensitiveMetadata.java b/src/java/org/apache/nutch/metadata/CaseInsensitiveMetadata.java new file mode 100644 index 000000000..92e848ca2 --- /dev/null +++ b/src/java/org/apache/nutch/metadata/CaseInsensitiveMetadata.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.metadata; + +import java.util.TreeMap; + +/** + * A decorator to Metadata that adds support for case-insensitive lookup of keys. + */ +public class CaseInsensitiveMetadata extends Metadata { + + /** + * Constructs a new, empty metadata. + */ + public CaseInsensitiveMetadata() { + metadata = new TreeMap<>(String.CASE_INSENSITIVE_ORDER); + } + +} diff --git a/src/java/org/apache/nutch/metadata/Metadata.java b/src/java/org/apache/nutch/metadata/Metadata.java index 5c37911fb..7fa0bb12c 100644 --- a/src/java/org/apache/nutch/metadata/Metadata.java +++ b/src/java/org/apache/nutch/metadata/Metadata.java @@ -36,7 +36,7 @@ public class Metadata implements Writable, CreativeCommons, DublinCore, /** * A map of all metadata attributes. */ - private Map<String, String[]> metadata = null; + protected Map<String, String[]> metadata = null; /** * Constructs a new, empty metadata. @@ -66,7 +66,7 @@ public class Metadata implements Writable, CreativeCommons, DublinCore, } /** - * Get the value associated to a metadata name. If many values are assiociated + * Get the value associated to a metadata name. If many values are associated * to the specified name, then the first one is returned. 
* * @param name diff --git a/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java b/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java index fdbf1b62c..be161440e 100644 --- a/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java +++ b/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java @@ -25,7 +25,7 @@ import org.apache.commons.lang.StringUtils; /** * A decorator to Metadata that adds spellchecking capabilities to property - * names. Currently used spelling vocabulary contains just the httpheaders from + * names. Currently used spelling vocabulary contains just the HTTP headers from * {@link HttpHeaders} class. * */ @@ -94,7 +94,7 @@ public class SpellCheckedMetadata extends Metadata { /** * Get the normalized name of metadata attribute name. This method tries to * find a well-known metadata name (one of the metadata names defined in this - * class) that matches the specified name. The matching is error tolerent. For + * class) that matches the specified name. The matching is error tolerant. For * instance, * <ul> * <li>content-type gives Content-Type</li> @@ -105,8 +105,8 @@ public class SpellCheckedMetadata extends Metadata { * name is returned. * * @param name - * Name to normalize - * @return normalized name + * HTTP header name to normalize + * @return normalized HTTP header name */ public static String getNormalizedName(final String name) { String searched = normalize(name); diff --git a/src/java/org/apache/nutch/net/protocols/Response.java b/src/java/org/apache/nutch/net/protocols/Response.java index 0159358ec..514ce8561 100644 --- a/src/java/org/apache/nutch/net/protocols/Response.java +++ b/src/java/org/apache/nutch/net/protocols/Response.java @@ -86,7 +86,7 @@ public interface Response extends HttpHeaders { /** * Get the value of a named header. 
- * @param name key of the header you wish to retreive + * @param name key of the header you wish to retrieve * @return header value */ public String getHeader(String name); diff --git a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestResponse.java b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestResponse.java new file mode 100644 index 000000000..9d65b6df8 --- /dev/null +++ b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestResponse.java @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.protocol.http; + +import static java.nio.charset.StandardCharsets.UTF_8; + +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.AbstractHttpProtocolPluginTest; +import org.apache.nutch.protocol.ProtocolException; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; + +public class TestResponse extends AbstractHttpProtocolPluginTest { + + protected static final String redirectHeader = "HTTP/1.1 301 Moved Permanently\r\n" // + + "Content-Type: text/html; charset=UTF-8\r\n" // + + "Content-Length: 0\r\n"; + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + @Override + protected String getPluginClassName() { + return "org.apache.nutch.protocol.okhttp.OkHttp"; + } + + @Override + @Before + public void setUp() throws Exception { + conf = new Configuration(); + conf.addResource("nutch-default.xml"); + /* + * plugin tests specific config file - needs to add the tested plugin to + * plugin.includes + */ + conf.addResource("nutch-site-test.xml"); + conf.setBoolean("store.http.headers", true); + + http = new Http(); + http.setConf(conf); + } + + protected HttpResponse getResponse(int statusCode, String headerName) { + try { + URL url = new URL(protocol, localHost, defaultPort, "/" + headerName); + LOG.info("Emulating fetch of {}", url); + return new HttpResponse((Http) http, url, new CrawlDatum(statusCode, 1000)); + } catch (ProtocolException | IOException e) { + return null; + } + } + + protected void headerTest(int statusCode, String headerName, String value, 
String lookupName) { + HttpResponse response = getResponse(statusCode, headerName); + LOG.info("Response headers:"); + LOG.info(response.getHeaders().get(Response.RESPONSE_HEADERS)); + assertEquals( + "No or unexpected value of header \"" + headerName + + "\" returned when retrieving header \"" + lookupName + "\"", + value, response.getHeader(lookupName)); + } + + protected Map<String, byte[]> getResponses(String headerValue) { + String[] headerNames = { "Location", "location", "LOCATION", "Loction" }; + Map<String, byte[]> responses = new TreeMap<>(); + for (String headerName : headerNames) { + responses.put("/" + headerName, + (redirectHeader + headerName + ": " + headerValue + "\r\n" + + "Content-Length: 0\r\n\r\n").getBytes(UTF_8)); + } + responses.put("/MyCustomHeader", (responseHeader + "MyCustomHeader" + ": " + + headerValue + "\r\n" + simpleContent).getBytes(UTF_8)); + return responses; + } + + @Test + public void testGetHeader() throws Exception { + String value = "headervalue"; + launchServer(getResponses(value)); + + LOG.info( + "Testing standard HTTP header \"Location\": expected case-insensitive and error-tolerant matching"); + headerTest(301, "Location", value, "Location"); + headerTest(301, "Location", value, "location"); + headerTest(301, "location", value, "Location"); + headerTest(301, "LOCATION", value, "Location"); + headerTest(301, "Loction", value, "Location"); + + LOG.info( + "Testing non-standard HTTP header \"MyCustomHeader\": only exact matching"); + headerTest(200, "MyCustomHeader", value, "MyCustomHeader"); + /* + * The following case-insensitive or approximate look-ups are not supported + * for non-standard headers by SpellCheckedMetadata: + */ + // testHeader(200, "MyCustomHeader", value, "mycustomheader"); + // testHeader(200, "mycustomheader", value, "MyCustomHeader"); + // testHeader(200, "MYCUSTOMHEADER", value, "MyCustomHeader"); + } + + @Ignore("Only for benchmarking") + @Test + public void testMetadataBenchmark() throws 
MalformedURLException, ProtocolException, + IOException, InterruptedException { + String value = "headervalue"; + launchServer(getResponses(value)); + Thread.sleep(30000); // time to attach a profiler + int iterations = 4000; + LOG.info("Starting benchmark with {} iterations ({} calls)", iterations, + (iterations * 5)); + long start = System.currentTimeMillis(); + for (int i = 0; i < iterations; i++) { + headerTest(301, "Location", value, "Location"); + headerTest(301, "Location", value, "location"); + headerTest(301, "location", value, "Location"); + headerTest(301, "LOCATION", value, "Location"); + headerTest(301, "Loction", value, "Location"); + } + long elapsed = System.currentTimeMillis() - start; + LOG.info("Benchmark finished, elapsed: {}, {}ms per call", elapsed, + (elapsed / (5.0 * iterations))); + } + +} diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java index 67bc45b03..605c03390 100644 --- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java +++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java @@ -24,6 +24,7 @@ import java.util.Locale; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.CaseInsensitiveMetadata; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.net.protocols.HttpDateFormat; import org.apache.nutch.net.protocols.Response; @@ -106,7 +107,7 @@ public class OkHttpResponse implements Response { // ensure that Response and underlying ResponseBody are closed try (okhttp3.Response response = call.execute()) { - Metadata responsemetadata = new Metadata(); + Metadata responsemetadata = new CaseInsensitiveMetadata(); okhttp3.Headers httpHeaders = response.headers(); for (int i = 0, size = httpHeaders.size(); i < size; i++) { diff --git 
a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestResponse.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestResponse.java new file mode 100644 index 000000000..695a6c539 --- /dev/null +++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestResponse.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.protocol.okhttp; + +import static java.nio.charset.StandardCharsets.UTF_8; + +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.AbstractHttpProtocolPluginTest; +import org.apache.nutch.protocol.ProtocolException; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; + +public class TestResponse extends AbstractHttpProtocolPluginTest { + + protected static final String redirectHeader = "HTTP/1.1 301 Moved Permanently\r\n" // + + "Content-Type: text/html; charset=UTF-8\r\n" // + + "Content-Length: 0\r\n"; + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + @Override + protected String getPluginClassName() { + return "org.apache.nutch.protocol.okhttp.OkHttp"; + } + + @Override + @Before + public void setUp() throws Exception { + conf = new Configuration(); + conf.addResource("nutch-default.xml"); + /* + * plugin tests specific config file - needs to add the tested plugin to + * plugin.includes + */ + conf.addResource("nutch-site-test.xml"); + conf.setBoolean("store.http.headers", true); + + http = new OkHttp(); + http.setConf(conf); + } + + protected OkHttpResponse getResponse(int statusCode, String headerName) { + try { + URL url = new URL(protocol, localHost, defaultPort, "/" + headerName); + LOG.info("Emulating fetch of {}", url); + return new OkHttpResponse((OkHttp) http, url, new CrawlDatum(statusCode, 1000)); + } catch (ProtocolException | IOException e) { + return null; + } + } + + protected void headerTest(int statusCode, String headerName, String 
value, String lookupName) { + OkHttpResponse response = getResponse(statusCode, headerName); + LOG.info("Response headers:"); + LOG.info(response.getHeaders().get(Response.RESPONSE_HEADERS)); + assertEquals( + "No or unexpected value of header \"" + headerName + + "\" returned when retrieving header \"" + lookupName + "\"", + value, response.getHeader(lookupName)); + } + + protected Map<String, byte[]> getResponses(String headerValue) { + String[] headerNames = { "Location", "location", "LOCATION", "Loction" }; + Map<String, byte[]> responses = new TreeMap<>(); + for (String headerName : headerNames) { + responses.put("/" + headerName, + (redirectHeader + headerName + ": " + headerValue + "\r\n" + + "Content-Length: 0\r\n\r\n").getBytes(UTF_8)); + } + responses.put("/MyCustomHeader", (responseHeader + "MyCustomHeader" + ": " + + headerValue + "\r\n" + simpleContent).getBytes(UTF_8)); + return responses; + } + + @Test + public void testGetHeader() throws Exception { + String value = "headervalue"; + launchServer(getResponses(value)); + + LOG.info( + "Testing standard HTTP header \"Location\": expected case-insensitive and error-tolerant matching"); + headerTest(301, "Location", value, "Location"); + headerTest(301, "Location", value, "location"); + headerTest(301, "location", value, "Location"); + headerTest(301, "LOCATION", value, "Location"); + // only with SpellCheckedMetadata: + // headerTest(301, "Loction", value, "Location"); + + LOG.info( + "Testing non-standard HTTP header \"MyCustomHeader\": only exact matching"); + headerTest(200, "MyCustomHeader", value, "MyCustomHeader"); + /* + * The following case-insensitive or approximate look-ups are not supported + * for non-standard headers by SpellCheckedMetadata: + */ + // testHeader(200, "MyCustomHeader", value, "mycustomheader"); + // testHeader(200, "mycustomheader", value, "MyCustomHeader"); + // testHeader(200, "MYCUSTOMHEADER", value, "MyCustomHeader"); + } + + @Ignore("Only for benchmarking") + @Test + 
public void testMetadataBenchmark() throws MalformedURLException, ProtocolException, + IOException, InterruptedException { + String value = "headervalue"; + launchServer(getResponses(value)); + Thread.sleep(30000); // time to attach a profiler + int iterations = 5000; + LOG.info("Starting benchmark with {} iterations ({} calls)", iterations, + (iterations * 4)); + long start = System.currentTimeMillis(); + for (int i = 0; i < iterations; i++) { + headerTest(301, "Location", value, "Location"); + headerTest(301, "Location", value, "location"); + headerTest(301, "location", value, "Location"); + headerTest(301, "LOCATION", value, "Location"); + // only with SpellCheckedMetadata: + // headerTest(301, "Loction", value, "Location"); + } + long elapsed = System.currentTimeMillis() - start; + LOG.info("Benchmark finished, elapsed: {}, {}ms per call", elapsed, + (elapsed / (4.0 * iterations))); + } + +}