This is an automated email from the ASF dual-hosted git repository.

rzo1 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/stormcrawler.git


The following commit(s) were added to refs/heads/main by this push:
     new 179ab420 #1597 replace deprecated use of URL constructor with 
URI#toURL via URLUtil class
179ab420 is described below

commit 179ab420de5b687f582a4733488bc5d9d1a50632
Author: TamimEhsan <54908501+tamimeh...@users.noreply.github.com>
AuthorDate: Thu Aug 14 19:54:51 2025 +0800

    #1597 replace deprecated use of URL constructor with URI#toURL via 
URLUtil class
---
 .../protocol/HttpRobotRulesParser.java             |  5 +++--
 .../java/org/apache/stormcrawler/util/URLUtil.java | 24 ++++++++++++++++++----
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git 
a/core/src/main/java/org/apache/stormcrawler/protocol/HttpRobotRulesParser.java 
b/core/src/main/java/org/apache/stormcrawler/protocol/HttpRobotRulesParser.java
index 0c15030a..ff62eea8 100644
--- 
a/core/src/main/java/org/apache/stormcrawler/protocol/HttpRobotRulesParser.java
+++ 
b/core/src/main/java/org/apache/stormcrawler/protocol/HttpRobotRulesParser.java
@@ -30,6 +30,7 @@ import org.apache.http.HttpHeaders;
 import org.apache.storm.Config;
 import org.apache.stormcrawler.Metadata;
 import org.apache.stormcrawler.util.ConfUtils;
+import org.apache.stormcrawler.util.URLUtil;
 
 /**
  * This class is used for parsing robots for urls belonging to HTTP protocol. 
It extends the generic
@@ -128,7 +129,7 @@ public class HttpRobotRulesParser extends RobotRulesParser {
         LOG.debug("Cache miss {} for {}", cacheKey, url);
         List<Integer> bytesFetched = new LinkedList<>();
         try {
-            robotsUrl = new URL(url, "/robots.txt");
+            robotsUrl = URLUtil.resolveURL(url, "/robots.txt");
             ProtocolResponse response = 
http.getProtocolOutput(robotsUrl.toString(), fetchRobotsMd);
             int code = response.getStatusCode();
             bytesFetched.add(response.getContent() != null ? 
response.getContent().length : 0);
@@ -146,7 +147,7 @@ public class HttpRobotRulesParser extends RobotRulesParser {
                 String redirection = 
response.getMetadata().getFirstValue(HttpHeaders.LOCATION);
                 LOG.debug("Redirected from {} to {}", redir, redirection);
                 if (StringUtils.isNotBlank(redirection)) {
-                    redir = new URL(redir, redirection);
+                    redir = URLUtil.resolveURL(redir, redirection);
                     if (redir.getPath().equals("/robots.txt") && 
redir.getQuery() == null) {
                         // only if the path (including the query part) of the 
redirect target is
                         // `/robots.txt` we can get/put the rules from/to the 
cache under the host
diff --git a/core/src/main/java/org/apache/stormcrawler/util/URLUtil.java 
b/core/src/main/java/org/apache/stormcrawler/util/URLUtil.java
index e30a1c9e..7066c637 100644
--- a/core/src/main/java/org/apache/stormcrawler/util/URLUtil.java
+++ b/core/src/main/java/org/apache/stormcrawler/util/URLUtil.java
@@ -44,7 +44,23 @@ public class URLUtil {
             return fixPureQueryTargets(base, target);
         }
 
-        return new URL(base, target);
+        return resolveURLInternal(base, target);
+    }
+
+    /**
+     * Refactor deprecated URL constructor to use the URI class for resolving 
relative URLs
+     *
+     * @param base the base URL
+     * @param target the target URL (may be relative)
+     * @return resolved absolute URL.
+     * @throws MalformedURLException if the URL is not well formed
+     */
+    private static URL resolveURLInternal(URL base, String target) throws 
MalformedURLException {
+        try {
+            return base.toURI().resolve(target).toURL();
+        } catch (Exception e) {
+            throw (MalformedURLException) new 
MalformedURLException(e.getMessage()).initCause(e);
+        }
     }
 
     /** Handle the case in RFC3986 section 5.4.1 example 7, and similar. */
@@ -55,7 +71,7 @@ public class URLUtil {
             final String baseRightMost = basePath.substring(baseRightMostIdx + 
1);
             target = baseRightMost + target;
         }
-        return new URL(base, target);
+        return resolveURLInternal(base, target);
     }
 
     /**
@@ -77,7 +93,7 @@ public class URLUtil {
         // the target contains params information or the base doesn't then no
         // conversion necessary, return regular URL
         if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
-            return new URL(base, target);
+            return resolveURLInternal(base, target);
         }
 
         // get the base url and it params information
@@ -96,7 +112,7 @@ public class URLUtil {
             target += params;
         }
 
-        return new URL(base, target);
+        return resolveURLInternal(base, target);
     }
 
     private static Pattern IP_PATTERN = 
Pattern.compile("(\\d{1,3}\\.){3}(\\d{1,3})");

Reply via email to