This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new 7e969eaec NUTCH-2930 Protocol-okhttp: implement IP filter (#736) 7e969eaec is described below commit 7e969eaec1ab8e9e21667faf6cf1881fb10cfb31 Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Fri Aug 19 15:26:07 2022 +0200 NUTCH-2930 Protocol-okhttp: implement IP filter (#736) - add include/exclude rules as list of IP address, CIDR notation or predefined IP ranges (localhost, loopback, sitelocal) --- conf/nutch-default.xml | 25 +++ .../org/apache/nutch/protocol/okhttp/CIDR.java | 79 ++++++++ .../nutch/protocol/okhttp/IPFilterRules.java | 129 +++++++++++++ .../org/apache/nutch/protocol/okhttp/OkHttp.java | 35 ++++ .../protocol/okhttp/TestBadServerResponses.java | 2 +- .../protocol/okhttp/TestIPAddressFiltering.java | 207 +++++++++++++++++++++ .../nutch/protocol/okhttp/TestProtocolOkHttp.java | 2 +- .../protocol/AbstractHttpProtocolPluginTest.java | 22 ++- 8 files changed, 494 insertions(+), 7 deletions(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 1ad02a021..2a6325884 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -449,6 +449,31 @@ </description> </property> +<property> + <name>http.filter.ipaddress.include</name> + <value></value> + <description> + If not empty: only fetch content from these IP addresses defined + as a comma-separated list of a single IP address, a CIDR notation, + or one of the following pre-defined IP address types: localhost, + loopback, sitelocal. The property http.filter.ipaddress.exclude + can be used to block subranges in the included list of ranges. + Note: supported only by protocol-okhttp. 
+ </description> +</property> + +<property> + <name>http.filter.ipaddress.exclude</name> + <value></value> + <description> + If not empty: do not fetch content from these IP addresses defined + as a comma-separated list of a single IP address, a CIDR notation, + or one of the following pre-defined IP address types: localhost, + loopback, sitelocal. Note: supported only by protocol-okhttp. + </description> +</property> + + <!-- FTP properties --> <property> diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/CIDR.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/CIDR.java new file mode 100644 index 000000000..3add082a8 --- /dev/null +++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/CIDR.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.okhttp; + +import java.net.InetAddress; + +import com.google.common.net.InetAddresses; + +/** + * Parse a <a href= + * "https://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing">CIDR</a> block + * notation and test whether an IP address is contained in the subnet range + * defined by the CIDR. 
+ */
+public class CIDR {
+  /** Network address of the block (IPv4-mapped IPv6 input is held as IPv4) */
+  InetAddress addr;
+  /** Prefix length: number of leading bits an address must share with addr */
+  int mask;
+
+  /**
+   * @param address
+   *          network address of the block
+   * @param mask
+   *          prefix length in bits
+   */
+  public CIDR(InetAddress address, int mask) {
+    this.addr = address;
+    this.mask = mask;
+  }
+
+  /**
+   * Parse a CIDR block (<code>address/prefix-length</code>) or a single IP
+   * address. A single IP address gets the full prefix length of its address
+   * family and therefore matches only itself.
+   *
+   * @param cidr
+   *          CIDR notation or IP address literal
+   * @throws IllegalArgumentException
+   *           if the address or the prefix length cannot be parsed
+   */
+  public CIDR(String cidr) throws IllegalArgumentException {
+    String ipStr = cidr;
+    int sep = cidr.indexOf('/');
+    if (sep > -1) {
+      ipStr = cidr.substring(0, sep);
+    }
+    addr = InetAddresses.forString(ipStr);
+    if (sep > -1) {
+      mask = Integer.parseInt(cidr.substring(sep + 1));
+      if (cidr.indexOf(':') > -1 && addr.getAddress().length == 4) {
+        // IPv4-mapped IPv6 addresses are automatically converted to IPv4,
+        // need to shift the explicitly given (IPv6) mask
+        mask = Math.max(0, mask - 96);
+      }
+    } else {
+      // no mask given: derive it from the (possibly converted) address. It
+      // must not be shifted, otherwise a single IPv4-mapped address would
+      // end up with mask 0 and match every IPv4 address.
+      mask = addr.getAddress().length * 8;
+    }
+  }
+
+  /**
+   * Test whether an IP address lies in the subnet range of this CIDR block.
+   *
+   * @param address
+   *          IP address to test
+   * @return true if the first <code>mask</code> bits of the address are equal
+   *         to those of the network address, false otherwise or if the
+   *         address lengths (IPv4 vs. IPv6) differ
+   */
+  public boolean contains(InetAddress address) {
+    byte[] addr0 = addr.getAddress();
+    byte[] addr1 = address.getAddress();
+    if (addr0.length != addr1.length) {
+      // not comparing IPv4 and IPv6 addresses
+      return false;
+    }
+    for (int i = 0; i < addr0.length; i++) {
+      int remainingMaskBits = mask - (i * 8);
+      if (remainingMaskBits <= 0)
+        return true;
+      // mask for byte under cursor. The shift distance must be capped at 8:
+      // Java uses only the 5 low-order bits of an int shift distance, so that
+      // a distance of 32, 64, 96 or 128 would shift by 0 and almost the
+      // entire byte would be ignored in the comparison.
+      int m = ~(0xff >> Math.min(8, remainingMaskBits));
+      if ((addr0[i] & m) != (addr1[i] & m))
+        return false;
+    }
+    return true;
+  }
+
+  @Override
+  public String toString() {
+    return addr + "/" + mask;
+  }
+}
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/IPFilterRules.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/IPFilterRules.java
new file mode 100644
index 000000000..868732fe5
--- /dev/null
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/IPFilterRules.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.okhttp;
+
+import java.lang.invoke.MethodHandles;
+import java.net.InetAddress;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.function.Predicate;
+
+import org.apache.hadoop.conf.Configuration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Optionally limit or block connections to IP address ranges
+ * (localhost/loopback or site-local addresses, subnet ranges given in CIDR
+ * notation, or single IP addresses).
+ *
+ * IP filter rules are built from two Nutch properties:
+ * <ul>
+ * <li><code>http.filter.ipaddress.include</code> defines all allowed IP ranges.
+ * If not defined or empty, all IP addresses are allowed unless they are
+ * explicitly excluded.</li>
+ * <li><code>http.filter.ipaddress.exclude</code> defines excluded IP address
+ * ranges.</li>
+ * </ul>
+ *
+ * IP ranges can be defined as
+ * <ul>
+ * <li>IP address, e.g. <code>127.0.0.1</code> or <code>::1</code> (IPv6)</li>
+ * <li>CIDR notation, e.g. <code>192.168.0.0/16</code> or
+ * <code>fd00::/8</code></li>
+ * <li><code>localhost</code> or <code>loopback</code> applies to all IP
+ * addresses for which {@link InetAddress#isLoopbackAddress()} is true</li>
+ * <li><code>sitelocal</code> applies to all IP
+ * addresses for which {@link InetAddress#isSiteLocalAddress()} is true</li>
+ * </ul>
+ *
+ * Multiple IP ranges are separated by a comma, e.g.
+ * <code>loopback,sitelocal,fd00::/8</code>. Whitespace around the
+ * comma-separated items is ignored.
+ */
+public class IPFilterRules {
+
+  protected static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  /** Rules built from <code>http.filter.ipaddress.include</code> */
+  List<Predicate<InetAddress>> includeRules;
+  /** Rules built from <code>http.filter.ipaddress.exclude</code> */
+  List<Predicate<InetAddress>> excludeRules;
+
+  /**
+   * Build include and exclude rules from the configuration.
+   *
+   * @param conf
+   *          configuration holding the IP filter properties
+   */
+  public IPFilterRules(Configuration conf) {
+    includeRules = parseIPRules(conf, "http.filter.ipaddress.include");
+    excludeRules = parseIPRules(conf, "http.filter.ipaddress.exclude");
+  }
+
+  /**
+   * @return true if neither include nor exclude rules are defined
+   */
+  public boolean isEmpty() {
+    return includeRules.isEmpty() && excludeRules.isEmpty();
+  }
+
+  /**
+   * Apply the filter rules to an IP address.
+   *
+   * @param address
+   *          IP address to check
+   * @return true if the address matches at least one include rule (or no
+   *         include rules are defined) and does not match any exclude rule
+   */
+  public boolean accept(InetAddress address) {
+    if (!includeRules.isEmpty()
+        && includeRules.stream().noneMatch(rule -> rule.test(address))) {
+      return false;
+    }
+    return excludeRules.stream().noneMatch(rule -> rule.test(address));
+  }
+
+  /**
+   * Parse the comma-separated list of IP rules defined by a property. Items
+   * which cannot be parsed are logged and ignored.
+   */
+  private static List<Predicate<InetAddress>> parseIPRules(Configuration conf,
+      String ipRuleProperty) {
+    List<Predicate<InetAddress>> rules = new ArrayList<>();
+    // trim the items: "loopback, sitelocal" must not result in the item
+    // " sitelocal" which would fail to parse and be silently dropped
+    for (String ipRule : conf.getTrimmedStrings(ipRuleProperty)) {
+      if (ipRule.isEmpty()) {
+        // consecutive commas
+        continue;
+      }
+      // Locale.ROOT: lowercasing must not depend on the default locale
+      switch (ipRule.toLowerCase(Locale.ROOT)) {
+      case "localhost":
+      case "loopback":
+        rules.add((InetAddress a) -> a.isLoopbackAddress());
+        break;
+      case "sitelocal":
+        rules.add((InetAddress a) -> a.isSiteLocalAddress());
+        break;
+      default:
+        try {
+          CIDR cidr = new CIDR(ipRule);
+          rules.add((InetAddress a) -> cidr.contains(a));
+        } catch (IllegalArgumentException e) {
+          LOG.error(
+              "Failed to parse {} as CIDR, ignoring to configure IP rules ({})",
+              ipRule, ipRuleProperty);
+        }
+      }
+    }
+    if (rules.size() > 0) {
+      LOG.info("Found {} IP filter rules for {}", rules.size(), ipRuleProperty);
+    }
+    return rules;
+  }
+
+}
diff --git
a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
index 63fa32837..876c4ef24 100644
--- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
@@ -212,6 +212,11 @@ public class OkHttp extends HttpBase {
       }
     }
 
+    IPFilterRules ipFilterRules = new IPFilterRules(conf);
+    if (!ipFilterRules.isEmpty()) {
+      builder.addNetworkInterceptor(new HTTPFilterIPAddressInterceptor(ipFilterRules));
+    }
+
     if (this.storeIPAddress || this.storeHttpHeaders || this.storeHttpRequest) {
       builder.addNetworkInterceptor(new HTTPHeadersInterceptor());
     }
@@ -259,6 +264,36 @@ public class OkHttp extends HttpBase {
     }
   }
 
+  /**
+   * Network interceptor which blocks connections to IP addresses rejected by the IP filter rules.
+   */
+  class HTTPFilterIPAddressInterceptor implements Interceptor {
+
+    IPFilterRules rules;
+
+    public HTTPFilterIPAddressInterceptor(IPFilterRules rules) {
+      this.rules = rules;
+    }
+
+    /**
+     * Proceeds with the request only if the IP address the connection's
+     * socket is connected to is accepted by the filter rules.
+     * @throws IOException if the IP address is rejected by the filter rules
+     */
+    @Override
+    public okhttp3.Response intercept(Interceptor.Chain chain)
+        throws IOException {
+      InetAddress address = chain.connection().socket().getInetAddress();
+      Request request = chain.request();
+      if (!rules.accept(address)) {
+        String ip = address.getHostAddress();
+        LOG.warn("Blocked connection to IP address {}: {}", ip, request.url());
+        throw new IOException("Forbidden connection to IP address " + ip);
+      }
+      return chain.proceed(request);
+    }
+  }
+
   class HTTPHeadersInterceptor implements Interceptor {
 
     private String getNormalizedProtocolName(Protocol protocol) {
diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java
index 5a587fea2..7c5d0f15c 100644
---
a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java +++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java @@ -34,7 +34,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * Test cases for protocol-http - robustness regarding bad server responses: + * Test cases for protocol-okhttp - robustness regarding bad server responses: * malformed HTTP header lines, etc. See, NUTCH-2549. */ public class TestBadServerResponses extends AbstractHttpProtocolPluginTest { diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestIPAddressFiltering.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestIPAddressFiltering.java new file mode 100644 index 000000000..dbd1b846d --- /dev/null +++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestIPAddressFiltering.java @@ -0,0 +1,207 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.nutch.protocol.okhttp;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.net.InetAddress;
+import java.util.function.Function;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.protocol.AbstractHttpProtocolPluginTest;
+import org.junit.Test;
+
+import com.google.common.net.InetAddresses;
+
+/**
+ * Test cases for protocol-okhttp IP address filtering
+ */
+public class TestIPAddressFiltering extends AbstractHttpProtocolPluginTest {
+
+  @Override
+  protected String getPluginClassName() {
+    return "org.apache.nutch.protocol.okhttp.OkHttp";
+  }
+
+  /** Parse an IP address literal without performing a DNS lookup */
+  public InetAddress parseIP(String ip) {
+    // the Java built-in may perform DNS lookup (and throw UnknownHostException)
+    // if not a well-formed IP address:
+    // InetAddress.getByName(ip);
+
+    // use Guava because it does not perform DNS lookups, may throw
+    // IllegalArgumentException if IP address is not well-formed
+    return InetAddresses.forString(ip);
+  }
+
+  /** Assert that the IP address is contained in the CIDR block */
+  public void testCIDRcontains(String cidr, String ip) {
+    CIDR c = new CIDR(cidr);
+    InetAddress i = parseIP(ip);
+    assertTrue(i + " should be in " + c, c.contains(i));
+  }
+
+  /** Assert that the IP address is not contained in the CIDR block */
+  public void testCIDRnotContains(String cidr, String ip) {
+    CIDR c = new CIDR(cidr);
+    InetAddress i = parseIP(ip);
+    assertFalse(i + " should not be in " + c, c.contains(i));
+  }
+
+  /** Tests for {@link CIDR} */
+  @Test
+  public void testCIDRs() {
+    // private subnets IPv4
+    testCIDRcontains("127.0.0.0/8", "127.0.0.1");
+    testCIDRcontains("10.0.0.0/8", "10.0.0.13");
+    testCIDRcontains("172.16.0.0/12", "172.17.0.0");
+    testCIDRcontains("192.168.0.0/16", "192.168.0.1");
+
+    // private subnets IPv6
+    testCIDRcontains("::1/128", "::1");
+    testCIDRcontains("127.0.0.0/8", "::ffff:127.0.0.1");
+    testCIDRcontains("::ffff:7f00:0/104", "::ffff:127.0.0.1");
+    testCIDRcontains("fd00::/8", "fd12:3456:789a:1::1");
+    testCIDRcontains("fe80::/10", "fe80::2f29:b6f0:a4c:32ae");
+
+    // test single IP address (with and without mask)
+    testCIDRcontains("127.0.0.1", "127.0.0.1");
+    testCIDRcontains("127.0.0.1/24", "127.0.0.1");
+
+    // test off-by-one boundaries
+    testCIDRnotContains("127.0.0.0/8", "128.0.0.0");
+    testCIDRnotContains("10.0.0.0/8", "11.0.0.0");
+    testCIDRnotContains("10.0.0.0/8", "9.255.255.255");
+    testCIDRnotContains("172.16.0.0/12", "172.32.0.0");
+    testCIDRnotContains("172.16.0.0/12", "171.255.255.255");
+  }
+
+  /**
+   * Build the IP filter rules from the configuration and assert that all
+   * addresses in <code>included</code> are accepted and all addresses in
+   * <code>excluded</code> are rejected.
+   */
+  public void testFilter(Configuration conf, String[] included, String[] excluded) {
+    IPFilterRules ipFilterRules = new IPFilterRules(conf);
+    for (String address : included) {
+      assertTrue("Address " + address + " should be included",
+          ipFilterRules.accept(parseIP(address)));
+    }
+    for (String address : excluded) {
+      assertFalse("Address " + address + " should be excluded",
+          ipFilterRules.accept(parseIP(address)));
+    }
+  }
+
+  /** Tests for {@link IPFilterRules} */
+  @Test
+  public void testIPAddressFilterRules() {
+    String[] publicAddresses = {"93.184.216.34", "93.184.216.43"};
+    String[] loopbackAddresses = {"127.0.0.1", "127.0.0.2", "::1"};
+    String[] sitelocalAddresses = {"10.0.0.13", "172.17.0.0", "192.168.0.1"};
+
+    conf.set("http.filter.ipaddress.include", "");
+    conf.set("http.filter.ipaddress.exclude", "localhost");
+    testFilter(conf, new String[0], loopbackAddresses);
+
+    conf.set("http.filter.ipaddress.exclude", "loopback,sitelocal");
+    testFilter(conf, publicAddresses, loopbackAddresses);
+    testFilter(conf, publicAddresses, sitelocalAddresses);
+
+    // the prefix length matches the network address given (was /8 although
+    // three octets of the network address are set)
+    conf.set("http.filter.ipaddress.include", "93.184.216.0/24");
+    conf.set("http.filter.ipaddress.exclude", "");
+    testFilter(conf, publicAddresses, loopbackAddresses);
+
+    conf.set("http.filter.ipaddress.include", "localhost");
+    conf.set("http.filter.ipaddress.exclude", "");
+    testFilter(conf, loopbackAddresses, publicAddresses);
+  }
+
+  /**
+   * Assert that the IP address is matched by the {@link InetAddress}
+   * predicate behind the predefined address type (localhost, loopback or
+   * sitelocal).
+   */
+  public void testPredefinedAddressRange(String ipAddress, String type) {
+    try {
+      InetAddress addr = InetAddresses.forString(ipAddress);
+      Function<InetAddress,Boolean> pred = null;
+      switch (type.toLowerCase()) {
+      case "localhost":
+      case "loopback":
+        pred = InetAddress::isLoopbackAddress;
+        break;
+      case "sitelocal":
+        pred = InetAddress::isSiteLocalAddress;
+        break;
+      default:
+        fail("Unknown IP address type " + type);
+      }
+      assertTrue(ipAddress + " is not recognized as " + type + " address", pred.apply(addr));
+    } catch (IllegalArgumentException e) {
+      fail("Not a valid IP address string: " + ipAddress);
+    }
+  }
+
+  /**
+   * Verify that certain IP addresses are matched by predefined IP classes:
+   * localhost, loopback, sitelocal. This verifies that the predefined classes
+   * are properly mapped to the underlying predicates of the class
+   * {@link InetAddress}.
+   */
+  @Test
+  public void testPredefinedRanges() throws Exception {
+    testPredefinedAddressRange("127.0.0.1", "localhost");
+    testPredefinedAddressRange("127.0.0.1", "loopback");
+    testPredefinedAddressRange("10.0.0.13", "sitelocal");
+    testPredefinedAddressRange("172.17.0.0", "sitelocal");
+    testPredefinedAddressRange("192.168.0.1", "sitelocal");
+
+    testPredefinedAddressRange("::1", "loopback");
+    testPredefinedAddressRange("::ffff:127.0.0.1", "loopback");
+    // fec0::/10 - Java follows the "old" standard to define private IPv6 addresses
+    testPredefinedAddressRange("fec0::", "sitelocal");
+    // fd::/8 - not (yet?) recognized as site-local address by InetAddress::isSiteLocalAddress
+    //testPredefinedAddressRange("fd12:3456:789a:1::1", "sitelocal");
+  }
+
+  /**
+   * Test whether connections are blocked according to the IP filter
+   * configuration
+   */
+  @Test
+  public void testConnectionBlocking() throws Exception {
+    localHost = "127.0.0.1";
+    launchServer("/", (responseHeader + simpleContent).getBytes(UTF_8));
+
+    // without filter configured
+    conf.set("http.filter.ipaddress.exclude", "");
+    http.setConf(conf);
+    fetchPage("/", 200, "text/html");
+
+    // filter localhost
+    conf.set("http.filter.ipaddress.exclude", "localhost");
+    http.setConf(conf);
+    fetchPage("/", -1, "text/html");
+
+    // filter loopback (was a second "localhost" case by mistake)
+    conf.set("http.filter.ipaddress.exclude", "loopback");
+    http.setConf(conf);
+    fetchPage("/", -1, "text/html");
+
+    // filter by IP
+    conf.set("http.filter.ipaddress.exclude", "127.0.0.1");
+    http.setConf(conf);
+    fetchPage("/", -1, "text/html");
+
+    // filter by CIDR
+    conf.set("http.filter.ipaddress.exclude", "127.0.0.0/8");
+    http.setConf(conf);
+    fetchPage("/", -1, "text/html");
+  }
+
+}
diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java
index 289e75672..e740ed288 100644
--- a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java
+++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java
@@ -25,7 +25,7 @@ import org.apache.nutch.protocol.AbstractHttpProtocolPluginTest;
 import org.junit.Test;
 
 /**
- * Test cases for protocol-http
+ * Test cases for protocol-okhttp
  */
 public class TestProtocolOkHttp extends AbstractHttpProtocolPluginTest {
 
diff --git a/src/test/org/apache/nutch/protocol/AbstractHttpProtocolPluginTest.java b/src/test/org/apache/nutch/protocol/AbstractHttpProtocolPluginTest.java
index 3a90e21a9..322b34e99 100644
---
a/src/test/org/apache/nutch/protocol/AbstractHttpProtocolPluginTest.java +++ b/src/test/org/apache/nutch/protocol/AbstractHttpProtocolPluginTest.java @@ -28,7 +28,6 @@ import java.net.Socket; import java.net.SocketException; import java.net.URL; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.TreeMap; @@ -60,6 +59,17 @@ public abstract class AbstractHttpProtocolPluginTest { protected Protocol http; protected ServerSocket server; protected Configuration conf; + + /** Protocol / URL scheme used to send/receive test requests */ + protected String protocol = "http"; + + /** + * URL host name used to represent localhost when sending/receiving test + * requests + */ + protected String localHost = "127.0.0.1"; + + /** Port used to send/receive test requests */ protected int defaultPort = 47505; protected static final String responseHeader = "HTTP/1.1 200 OK\r\n"; @@ -103,7 +113,9 @@ public abstract class AbstractHttpProtocolPluginTest { @After public void tearDown() throws Exception { - server.close(); + if (server != null) { + server.close(); + } } /** @@ -123,13 +135,13 @@ public abstract class AbstractHttpProtocolPluginTest { BiFunction<String, String[], byte[]> responder, Predicate<List<String>> requestChecker) throws Exception { server = new ServerSocket(); - server.bind(new InetSocketAddress("127.0.0.1", port)); + server.bind(new InetSocketAddress(localHost, port)); Pattern requestPattern = Pattern.compile("(?i)^GET\\s+(\\S+)"); while (true) { LOG.info("Listening on port {}", port); if (server.isClosed()) { server = new ServerSocket(); - server.bind(new InetSocketAddress("127.0.0.1", port)); + server.bind(new InetSocketAddress(localHost, port)); } Socket socket = server.accept(); LOG.info("Connection received"); @@ -259,7 +271,7 @@ public abstract class AbstractHttpProtocolPluginTest { */ protected ProtocolOutput fetchPage(int port, String page, int expectedCode, String expectedContentType) throws 
Exception { - URL url = new URL("http", "127.0.0.1", port, page); + URL url = new URL(protocol, localHost, port, page); LOG.info("Fetching {}", url); CrawlDatum crawlDatum = new CrawlDatum(); ProtocolOutput protocolOutput = http