This is an automated email from the ASF dual-hosted git repository.

lewismc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new cefb48a75 NUTCH-3099 Allow wildcard '*' in http.proxy.exception.list (via Isabelle Giguere) (#865)
cefb48a75 is described below

commit cefb48a7502d3d57046674bd958fc5a78cc1378f
Author: Lewis John McGibbney <[email protected]>
AuthorDate: Sun Oct 12 15:55:29 2025 -0700

    NUTCH-3099 Allow wildcard '*' in http.proxy.exception.list (via Isabelle Giguere) (#865)
---
 .github/workflows/junit-report.yml                 |  4 +-
 conf/nutch-default.xml                             |  7 ++-
 .../apache/nutch/protocol/http/api/HttpBase.java   |  8 ++-
 .../nutch/protocol/http/api/TestHttpBase.java      | 73 ++++++++++++++++++++++
 4 files changed, 87 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/junit-report.yml b/.github/workflows/junit-report.yml
index e93a602a2..46373d399 100644
--- a/.github/workflows/junit-report.yml
+++ b/.github/workflows/junit-report.yml
@@ -17,7 +17,7 @@ name: junit report
 on:
   workflow_run:
     workflows: [master pull request ci]
-    types: [success]
+    types: [completed]
 permissions:
   checks: write
   pull-requests: write
@@ -49,4 +49,4 @@ jobs:
           include_time_in_summary: true
           include_passed: true
           job_name: tests
-          check_name: 'JUnit Test Report Core, JUnit Test Report Plugins'
\ No newline at end of file
+          check_name: 'JUnit Test Report Core, JUnit Test Report Plugins'
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 5a1e7bfce..b41f7ecda 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -365,8 +365,11 @@
 <property>
   <name>http.proxy.exception.list</name>
   <value></value>
-  <description>A comma separated list of hosts that don't use the proxy
-  (e.g. intranets). Example: www.apache.org</description>
+  <description>A comma separated list of hosts that don't use the proxy
+    (e.g. intranets). Each entry is either i) an exact host name, e.g.
+    domain1.org,www.domain2.com, or ii) a pattern with a wildcard '*' as
+    prefix, e.g. "*.domain.com", or as suffix, e.g. "some.domain.*".
+  </description>
 </property>
 
 <property>
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
old mode 100644
new mode 100755
index e24488234..79b45882e
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -455,12 +455,18 @@ public abstract class HttpBase implements Protocol {
   }
 
   public boolean useProxy(String host) {
-    if (this.useProxy && this.proxyException.containsKey(host)) {
+    if (this.useProxy && isProxyException(host)) {
       return false;
     }
     return this.useProxy;
   }
 
+  protected boolean isProxyException(String host) {
+    return this.proxyException.keySet().stream().anyMatch(h -> h.equals(host)
+        || (h.endsWith("*") && host.startsWith(h.substring(0, h.length() - 1)))
+        || (h.startsWith("*") && host.endsWith(h.substring(1))));
+  }
+
   public int getTimeout() {
     return this.timeout;
   }
diff --git a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestHttpBase.java b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestHttpBase.java
new file mode 100755
index 000000000..e56a843f1
--- /dev/null
+++ b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestHttpBase.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.http.api;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Test {@link HttpBase}
+ */
+public class TestHttpBase {
+
+  /**
+   * Test non-proxy hosts
+   */
+  @Test
+  public void testIsProxyException() {
+    final HttpBase base = new HttpBase() {
+
+      @Override
+      protected Response getResponse(URL url, CrawlDatum datum,
+          boolean followRedirects) throws ProtocolException, IOException {
+        return null;
+      }
+
+    };
+    base.proxyException = new HashMap<>();
+
+    // test exact match
+    base.proxyException.put("some.domain.com", "some.domain.com");
+    assertFalse(base.isProxyException("other.domain.com"));
+    assertTrue(base.isProxyException("some.domain.com"));
+
+    // test '*' prefix
+    base.proxyException.clear();
+    base.proxyException.put("*.domain.com", "*.domain.com");
+    assertTrue(base.isProxyException("some.domain.com"));
+    assertFalse(base.isProxyException("somedomain.com"));
+
+    // test '*' suffix
+    base.proxyException.clear();
+    base.proxyException.put("some.domain.*", "some.domain.*");
+    assertTrue(base.isProxyException("some.domain.with.long.name.com"));
+    assertFalse(base.isProxyException("my.domain.com"));
+
+  }
+
+}

Reply via email to