This is an automated email from the ASF dual-hosted git repository.
lewismc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new cefb48a75 NUTCH-3099 Allow wildcard '*' in http.proxy.exception.list
(via Isabelle Giguere) (#865)
cefb48a75 is described below
commit cefb48a7502d3d57046674bd958fc5a78cc1378f
Author: Lewis John McGibbney <[email protected]>
AuthorDate: Sun Oct 12 15:55:29 2025 -0700
NUTCH-3099 Allow wildcard '*' in http.proxy.exception.list (via Isabelle
Giguere) (#865)
---
.github/workflows/junit-report.yml | 4 +-
conf/nutch-default.xml | 7 ++-
.../apache/nutch/protocol/http/api/HttpBase.java | 8 ++-
.../nutch/protocol/http/api/TestHttpBase.java | 73 ++++++++++++++++++++++
4 files changed, 87 insertions(+), 5 deletions(-)
diff --git a/.github/workflows/junit-report.yml
b/.github/workflows/junit-report.yml
index e93a602a2..46373d399 100644
--- a/.github/workflows/junit-report.yml
+++ b/.github/workflows/junit-report.yml
@@ -17,7 +17,7 @@ name: junit report
on:
workflow_run:
workflows: [master pull request ci]
- types: [success]
+ types: [completed]
permissions:
checks: write
pull-requests: write
@@ -49,4 +49,4 @@ jobs:
include_time_in_summary: true
include_passed: true
job_name: tests
- check_name: 'JUnit Test Report Core, JUnit Test Report Plugins'
\ No newline at end of file
+ check_name: 'JUnit Test Report Core, JUnit Test Report Plugins'
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 5a1e7bfce..b41f7ecda 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -365,8 +365,11 @@
<property>
<name>http.proxy.exception.list</name>
<value></value>
- <description>A comma separated list of hosts that don't use the proxy
- (e.g. intranets). Example: www.apache.org</description>
+ <description>A comma separated list of hosts that don't use the proxy
+ (e.g. intranets). Each entry is either i) an exact host name, e.g.
+ domain1.org,www.domain2.com, or ii) a pattern with a wildcard '*' as
+ prefix, e.g. "*.domain.com", or as suffix, e.g. "some.domain.*".
+ </description>
</property>
<property>
diff --git
a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
old mode 100644
new mode 100755
index e24488234..79b45882e
---
a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++
b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -455,12 +455,18 @@ public abstract class HttpBase implements Protocol {
}
public boolean useProxy(String host) {
- if (this.useProxy && this.proxyException.containsKey(host)) {
+ if (this.useProxy && isProxyException(host)) {
return false;
}
return this.useProxy;
}
+ protected boolean isProxyException(String host) {
+ return this.proxyException.keySet().stream().anyMatch(h -> h.equals(host)
+ || (h.endsWith("*") && host.startsWith(h.substring(0, h.length() - 1)))
+ || (h.startsWith("*") && host.endsWith(h.substring(1))));
+ }
+
public int getTimeout() {
return this.timeout;
}
diff --git
a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestHttpBase.java
b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestHttpBase.java
new file mode 100755
index 000000000..e56a843f1
--- /dev/null
+++
b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestHttpBase.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.http.api;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Test {@link HttpBase}
+ */
+public class TestHttpBase {
+
+ /**
+ * Test non-proxy hosts
+ */
+ @Test
+ public void testIsProxyException() {
+ final HttpBase base = new HttpBase() {
+
+ @Override
+ protected Response getResponse(URL url, CrawlDatum datum,
+ boolean followRedirects) throws ProtocolException, IOException {
+ return null;
+ }
+
+ };
+ base.proxyException = new HashMap<>();
+
+ // test exact match
+ base.proxyException.put("some.domain.com", "some.domain.com");
+ assertFalse(base.isProxyException("other.domain.com"));
+ assertTrue(base.isProxyException("some.domain.com"));
+
+ // test '*' prefix
+ base.proxyException.clear();
+ base.proxyException.put("*.domain.com", "*.domain.com");
+ assertTrue(base.isProxyException("some.domain.com"));
+ assertFalse(base.isProxyException("somedomain.com"));
+
+ // test '*' suffix
+ base.proxyException.clear();
+ base.proxyException.put("some.domain.*", "some.domain.*");
+ assertTrue(base.isProxyException("some.domain.with.long.name.com"));
+ assertFalse(base.isProxyException("my.domain.com"));
+
+ }
+
+}