This is an automated email from the ASF dual-hosted git repository. rzo1 pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/stormcrawler.git
The following commit(s) were added to refs/heads/main by this push: new 179ab420 #1597 replace deprecated use of URL constructor using URI#toURL using URLUtil class 179ab420 is described below commit 179ab420de5b687f582a4733488bc5d9d1a50632 Author: TamimEhsan <54908501+tamimeh...@users.noreply.github.com> AuthorDate: Thu Aug 14 19:54:51 2025 +0800 #1597 replace deprecated use of URL constructor using URI#toURL using URLUtil class --- .../protocol/HttpRobotRulesParser.java | 5 +++-- .../java/org/apache/stormcrawler/util/URLUtil.java | 24 ++++++++++++++++++---- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/HttpRobotRulesParser.java b/core/src/main/java/org/apache/stormcrawler/protocol/HttpRobotRulesParser.java index 0c15030a..ff62eea8 100644 --- a/core/src/main/java/org/apache/stormcrawler/protocol/HttpRobotRulesParser.java +++ b/core/src/main/java/org/apache/stormcrawler/protocol/HttpRobotRulesParser.java @@ -30,6 +30,7 @@ import org.apache.http.HttpHeaders; import org.apache.storm.Config; import org.apache.stormcrawler.Metadata; import org.apache.stormcrawler.util.ConfUtils; +import org.apache.stormcrawler.util.URLUtil; /** * This class is used for parsing robots for urls belonging to HTTP protocol. It extends the generic @@ -128,7 +129,7 @@ public class HttpRobotRulesParser extends RobotRulesParser { LOG.debug("Cache miss {} for {}", cacheKey, url); List<Integer> bytesFetched = new LinkedList<>(); try { - robotsUrl = new URL(url, "/robots.txt"); + robotsUrl = URLUtil.resolveURL(url, "/robots.txt"); ProtocolResponse response = http.getProtocolOutput(robotsUrl.toString(), fetchRobotsMd); int code = response.getStatusCode(); bytesFetched.add(response.getContent() != null ? response.getContent().length : 0); @@ -146,7 +147,7 @@ public class HttpRobotRulesParser extends RobotRulesParser { String redirection = response.getMetadata().getFirstValue(HttpHeaders.LOCATION); LOG.debug("Redirected from {} to {}", redir, redirection); if (StringUtils.isNotBlank(redirection)) { - redir = new URL(redir, redirection); + redir = URLUtil.resolveURL(redir, redirection); if (redir.getPath().equals("/robots.txt") && redir.getQuery() == null) { // only if the path (including the query part) of the redirect target is // `/robots.txt` we can get/put the rules from/to the cache under the host diff --git a/core/src/main/java/org/apache/stormcrawler/util/URLUtil.java b/core/src/main/java/org/apache/stormcrawler/util/URLUtil.java index e30a1c9e..7066c637 100644 --- a/core/src/main/java/org/apache/stormcrawler/util/URLUtil.java +++ b/core/src/main/java/org/apache/stormcrawler/util/URLUtil.java @@ -44,7 +44,23 @@ public class URLUtil { return fixPureQueryTargets(base, target); } - return new URL(base, target); + return resolveURLInternal(base, target); + } + + /** + * Refactor deprecated URL constructor to use the URI class for resolving relative URLs + * + * @param base the base URL + * @param target the target URL (may be relative) + * @return resolved absolute URL. + * @throws MalformedURLException if the URL is not well formed + */ + private static URL resolveURLInternal(URL base, String target) throws MalformedURLException { + try { + return base.toURI().resolve(target).toURL(); + } catch (Exception e) { + throw (MalformedURLException) new MalformedURLException(e.getMessage()).initCause(e); + } } /** Handle the case in RFC3986 section 5.4.1 example 7, and similar. */ @@ -55,7 +71,7 @@ public class URLUtil { final String baseRightMost = basePath.substring(baseRightMostIdx + 1); target = baseRightMost + target; } - return new URL(base, target); + return resolveURLInternal(base, target); } /** @@ -77,7 +93,7 @@ public class URLUtil { // the target contains params information or the base doesn't then no // conversion necessary, return regular URL if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) { - return new URL(base, target); + return resolveURLInternal(base, target); } // get the base url and it params information @@ -96,7 +112,7 @@ public class URLUtil { target += params; } - return new URL(base, target); + return resolveURLInternal(base, target); } private static Pattern IP_PATTERN = Pattern.compile("(\\d{1,3}\\.){3}(\\d{1,3})");