This is an automated email from the ASF dual-hosted git repository. rzo1 pushed a commit to branch 1597 in repository https://gitbox.apache.org/repos/asf/stormcrawler.git
commit e9d0e404a5a646634af4a49b42e6684a50973782 Author: Richard Zowalla <[email protected]> AuthorDate: Tue Dec 23 10:45:23 2025 +0100 First steps in migration from URL to URI. Leads to some corner cases to discuss in normalizer / architecture of normalization. --- .../apache/stormcrawler/bolt/FeedParserBolt.java | 3 +- .../org/apache/stormcrawler/bolt/FetcherBolt.java | 7 +- .../apache/stormcrawler/bolt/JSoupParserBolt.java | 16 +++-- .../stormcrawler/bolt/SimpleFetcherBolt.java | 5 +- .../stormcrawler/bolt/SiteMapParserBolt.java | 6 +- .../stormcrawler/bolt/URLPartitionerBolt.java | 5 +- .../apache/stormcrawler/filtering/URLFilters.java | 3 +- .../filtering/basic/BasicURLNormalizer.java | 22 ++++-- .../stormcrawler/filtering/host/HostURLFilter.java | 6 +- .../filtering/regex/FastURLFilter.java | 8 ++- .../filtering/robots/RobotsFilter.java | 6 +- .../stormcrawler/indexing/AbstractIndexerBolt.java | 6 +- .../apache/stormcrawler/jsoup/LinkParseFilter.java | 8 ++- .../apache/stormcrawler/parse/JSoupFilters.java | 6 +- .../apache/stormcrawler/parse/ParseFilters.java | 6 +- .../stormcrawler/parse/filter/LinkParseFilter.java | 8 ++- .../stormcrawler/protocol/RobotRulesParser.java | 4 +- .../stormcrawler/protocol/file/FileResponse.java | 6 +- .../stormcrawler/protocol/okhttp/HttpProtocol.java | 6 +- .../apache/stormcrawler/util/URLPartitioner.java | 6 +- .../java/org/apache/stormcrawler/util/URLUtil.java | 17 ++--- .../stormcrawler/filtering/BasicURLFilterTest.java | 14 ++-- .../filtering/BasicURLNormalizerTest.java | 84 +++++++++++----------- .../stormcrawler/filtering/FastURLFilterTest.java | 16 +++-- .../stormcrawler/filtering/HostURLFilterTest.java | 18 ++--- .../stormcrawler/filtering/MaxDepthFilterTest.java | 18 ++--- .../filtering/MetadataFilterFromJsonTest.java | 38 +++++----- .../stormcrawler/filtering/MetadataFilterTest.java | 58 +++++++-------- .../stormcrawler/filtering/RegexFilterTest.java | 20 +++--- .../stormcrawler/util/CookieConverterTest.java | 6 +- .../protocol/selenium/RemoteDriverProtocol.java | 3 +- .../org/apache/stormcrawler/tika/ParserBolt.java | 6 +- .../org/apache/stormcrawler/warc/WARCSpout.java | 6 +- 33 files changed, 257 insertions(+), 190 deletions(-) diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/FeedParserBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/FeedParserBolt.java index 3abe00ff..7c451167 100644 --- a/core/src/main/java/org/apache/stormcrawler/bolt/FeedParserBolt.java +++ b/core/src/main/java/org/apache/stormcrawler/bolt/FeedParserBolt.java @@ -25,6 +25,7 @@ import com.rometools.rome.feed.synd.SyndEntry; import com.rometools.rome.feed.synd.SyndFeed; import com.rometools.rome.io.SyndFeedInput; import java.io.ByteArrayInputStream; +import java.net.URI; import java.net.URL; import java.nio.charset.StandardCharsets; import java.util.ArrayList; @@ -175,7 +176,7 @@ public class FeedParserBolt extends StatusEmitterBolt { feed = input.build(new InputSource(is)); } - URL url1 = new URL(url); + URL url1 = new URI(url).toURL(); List<SyndEntry> entries = feed.getEntries(); for (SyndEntry entry : entries) { diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java index 295631b1..c07474a2 100644 --- a/core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java +++ b/core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java @@ -22,6 +22,7 @@ import crawlercommons.robots.BaseRobotRules; import java.io.File; import java.net.InetAddress; import java.net.MalformedURLException; +import java.net.URI; import java.net.URL; import java.net.UnknownHostException; import java.time.Instant; @@ -529,7 +530,7 @@ public class FetcherBolt extends StatusEmitterBolt { boolean asap = false; try { - URL url = new URL(fit.url); + URL url = new URI(fit.url).toURL(); Protocol protocol = protocolFactory.getProtocol(url); if (protocol == null) { @@ -982,8 +983,8 @@ public class FetcherBolt extends StatusEmitterBolt { URL url; try { - url = new URL(urlString); - } catch (MalformedURLException e) { + url = new URI(urlString).toURL(); + } catch (Exception e) { LOG.error("{} is a malformed URL", urlString); Metadata metadata = (Metadata) input.getValueByField("metadata"); diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java index 933f41bd..9970bb8f 100644 --- a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java +++ b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java @@ -24,6 +24,8 @@ import java.io.IOException; import java.io.InputStream; import java.lang.reflect.InvocationTargetException; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.nio.ByteBuffer; import java.nio.charset.Charset; @@ -293,7 +295,7 @@ public class JSoupParserBolt extends StatusEmitterBolt { } else { final Elements links = jsoupDoc.select("a[href]"); slinks = new HashMap<>(links.size()); - final URL baseUrl = new URL(url); + final URL baseUrl = new URI(url).toURL(); for (Element link : links) { // nofollow String[] relkeywords = link.attr("rel").split(" "); @@ -375,7 +377,7 @@ public class JSoupParserBolt extends StatusEmitterBolt { // https://github.com/apache/stormcrawler/issues/954 if (allowRedirs() && StringUtils.isNotBlank(redirection)) { - emitOutlink(tuple, new URL(url), redirection, metadata); + emitOutlink(tuple, new URI(url).toURL(), redirection, metadata); } // Mark URL as redirected @@ -387,8 +389,8 @@ public class JSoupParserBolt extends StatusEmitterBolt { eventCounter.scope("tuple_success").incr(); return; } - } catch (MalformedURLException e) { - LOG.error("MalformedURLException on {}", url); + } catch (MalformedURLException | URISyntaxException e) { + LOG.error("Exception on {}", url, e); } } @@ -516,11 +518,11 @@ public class JSoupParserBolt extends StatusEmitterBolt { URL sourceUrl; try { - sourceUrl = new URL(url); - } catch (MalformedURLException e) { + sourceUrl = new URI(url).toURL(); + } catch (Exception e) { // we would have known by now as previous components check whether // the URL is valid - LOG.error("MalformedURLException on {}", url); + LOG.error("Exception on {}", url, e); eventCounter.scope("error_invalid_source_url").incrBy(1); return new LinkedList<>(); } diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/SimpleFetcherBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/SimpleFetcherBolt.java index e5eb16f5..c79d7449 100644 --- a/core/src/main/java/org/apache/stormcrawler/bolt/SimpleFetcherBolt.java +++ b/core/src/main/java/org/apache/stormcrawler/bolt/SimpleFetcherBolt.java @@ -23,6 +23,7 @@ import crawlercommons.domains.PaidLevelDomain; import crawlercommons.robots.BaseRobotRules; import java.net.InetAddress; import java.net.MalformedURLException; +import java.net.URI; import java.net.URL; import java.net.UnknownHostException; import java.text.SimpleDateFormat; @@ -265,8 +266,8 @@ public class SimpleFetcherBolt extends StatusEmitterBolt { URL url; try { - url = new URL(urlString); - } catch (MalformedURLException e) { + url = new URI(urlString).toURL(); + } catch (Exception e) { LOG.error("{} is a malformed URL", urlString); // Report to status stream and ack metadata.setValue(Constants.STATUS_ERROR_CAUSE, "malformed URL"); diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/SiteMapParserBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/SiteMapParserBolt.java index 6736b41f..acbebf41 100644 --- a/core/src/main/java/org/apache/stormcrawler/bolt/SiteMapParserBolt.java +++ b/core/src/main/java/org/apache/stormcrawler/bolt/SiteMapParserBolt.java @@ -31,6 +31,8 @@ import crawlercommons.sitemaps.UnknownFormatException; import crawlercommons.sitemaps.extension.Extension; import crawlercommons.sitemaps.extension.ExtensionMetadata; import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.nio.charset.StandardCharsets; import java.util.ArrayList; @@ -181,9 +183,9 @@ public class SiteMapParserBolt extends StatusEmitterBolt { private List<Outlink> parseSiteMap( String url, byte[] content, String contentType, Metadata parentMetadata) - throws UnknownFormatException, IOException { + throws UnknownFormatException, IOException, URISyntaxException { - URL url1 = new URL(url); + URL url1 = new URI(url).toURL(); long start = System.currentTimeMillis(); AbstractSiteMap siteMap; // let the parser guess what the mimetype is diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/URLPartitionerBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/URLPartitionerBolt.java index 8f44779f..f11f2a8e 100644 --- a/core/src/main/java/org/apache/stormcrawler/bolt/URLPartitionerBolt.java +++ b/core/src/main/java/org/apache/stormcrawler/bolt/URLPartitionerBolt.java @@ -20,6 +20,7 @@ package org.apache.stormcrawler.bolt; import crawlercommons.domains.PaidLevelDomain; import java.net.InetAddress; import java.net.MalformedURLException; +import java.net.URI; import java.net.URL; import java.util.Collections; import java.util.LinkedHashMap; @@ -82,9 +83,9 @@ public class URLPartitionerBolt extends BaseRichBolt { if (partitionKey == null) { URL u; try { - u = new URL(url); + u = new URI(url).toURL(); host = u.getHost(); - } catch (MalformedURLException e1) { + } catch (Exception e1) { eventCounter.scope("Invalid URL").incrBy(1); LOG.warn("Invalid URL: {}", url); // ack it so that it doesn't get replayed diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/URLFilters.java b/core/src/main/java/org/apache/stormcrawler/filtering/URLFilters.java index 59f36b6f..ce85ec3e 100644 --- a/core/src/main/java/org/apache/stormcrawler/filtering/URLFilters.java +++ b/core/src/main/java/org/apache/stormcrawler/filtering/URLFilters.java @@ -23,6 +23,7 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; import java.io.InputStream; +import java.net.URI; import java.net.URL; import java.util.List; import java.util.Map; @@ -185,7 +186,7 @@ public class URLFilters extends URLFilter implements JSONResource { for (URLFilter filter : filters.filters) { long start = System.currentTimeMillis(); normalizedUrl = - filter.filter(new URL(sourceUrl), new Metadata(), normalizedUrl); + filter.filter(new URI(sourceUrl).toURL(), new Metadata(), normalizedUrl); long end = System.currentTimeMillis(); System.out.println( "\t[" diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/basic/BasicURLNormalizer.java b/core/src/main/java/org/apache/stormcrawler/filtering/basic/BasicURLNormalizer.java index 9a5692ef..ca029223 100644 --- a/core/src/main/java/org/apache/stormcrawler/filtering/basic/BasicURLNormalizer.java +++ b/core/src/main/java/org/apache/stormcrawler/filtering/basic/BasicURLNormalizer.java @@ -22,6 +22,7 @@ import com.fasterxml.jackson.databind.node.ArrayNode; import java.net.IDN; import java.net.MalformedURLException; import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; @@ -120,7 +121,7 @@ public class BasicURLNormalizer extends URLFilter { } try { - URL theUrl = new URL(urlToFilter); + URL theUrl = new URI(urlToFilter).toURL(); String file = theUrl.getFile(); String protocol = theUrl.getProtocol(); String host = theUrl.getHost(); @@ -152,9 +153,18 @@ public class BasicURLNormalizer extends URLFilter { hasChanged = true; } if (hasChanged) { - urlToFilter = new URL(protocol, host, port, file2).toString(); + URI uri = new URI( + protocol, + null, // userInfo + host, + port, + file2, // path + null, // query + null // fragment + ); + urlToFilter = uri.toString(); } - } catch (MalformedURLException e) { + } catch (MalformedURLException | URISyntaxException e) { return null; } @@ -223,7 +233,7 @@ public class BasicURLNormalizer extends URLFilter { try { // Handle illegal characters by making a url first // this will clean illegal characters like | - final URL url = new URL(urlToFilter); + final URL url = new URI(urlToFilter).toURL(); String query = url.getQuery(); String path = url.getPath(); @@ -287,8 +297,8 @@ public class BasicURLNormalizer extends URLFilter { + ((s = newQueryString) != null ? '?' + s : "") + ((s = url.getRef()) != null ? '#' + s : ""); - } catch (MalformedURLException e) { - LOG.warn("Invalid urlToFilter {}. {}", urlToFilter, e); + } catch (MalformedURLException | URISyntaxException e) { + LOG.warn("Invalid urlToFilter {}.", urlToFilter, e); return null; } } diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/host/HostURLFilter.java b/core/src/main/java/org/apache/stormcrawler/filtering/host/HostURLFilter.java index f7e5055b..ab104a66 100644 --- a/core/src/main/java/org/apache/stormcrawler/filtering/host/HostURLFilter.java +++ b/core/src/main/java/org/apache/stormcrawler/filtering/host/HostURLFilter.java @@ -20,6 +20,8 @@ package org.apache.stormcrawler.filtering.host; import com.fasterxml.jackson.databind.JsonNode; import crawlercommons.domains.PaidLevelDomain; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.Map; import org.apache.stormcrawler.Metadata; @@ -82,8 +84,8 @@ public class HostURLFilter extends URLFilter { URL url; try { - url = new URL(urlToFilter); - } catch (MalformedURLException e1) { + url = new URI(urlToFilter).toURL(); + } catch (MalformedURLException | URISyntaxException e1) { return null; } diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java index 854b464a..9f2c9b3a 100644 --- a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java +++ b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java @@ -24,6 +24,8 @@ import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; @@ -177,7 +179,7 @@ public class FastURLFilter extends URLFilter implements JSONResource { if (rules.filter(urlToFilter, sourceMetadata)) { return null; } - } catch (MalformedURLException e) { + } catch (MalformedURLException | URISyntaxException e) { return null; } return urlToFilter; @@ -210,8 +212,8 @@ class Rules { * * @throws MalformedURLException */ - public boolean filter(String url, Metadata metadata) throws MalformedURLException { - URL u = new URL(url); + public boolean filter(String url, Metadata metadata) throws MalformedURLException, URISyntaxException { + URL u = new URI(url).toURL(); // first try the full hostname String hostname = u.getHost(); diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/robots/RobotsFilter.java b/core/src/main/java/org/apache/stormcrawler/filtering/robots/RobotsFilter.java index 07140c2f..42e775db 100644 --- a/core/src/main/java/org/apache/stormcrawler/filtering/robots/RobotsFilter.java +++ b/core/src/main/java/org/apache/stormcrawler/filtering/robots/RobotsFilter.java @@ -20,6 +20,8 @@ package org.apache.stormcrawler.filtering.robots; import com.fasterxml.jackson.databind.JsonNode; import crawlercommons.robots.BaseRobotRules; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.Map; import org.apache.storm.Config; @@ -61,8 +63,8 @@ public class RobotsFilter extends URLFilter { @NotNull String urlToFilter) { URL target; try { - target = new URL(urlToFilter); - } catch (MalformedURLException e) { + target = new URI(urlToFilter).toURL(); + } catch (MalformedURLException | URISyntaxException e) { return null; } diff --git a/core/src/main/java/org/apache/stormcrawler/indexing/AbstractIndexerBolt.java b/core/src/main/java/org/apache/stormcrawler/indexing/AbstractIndexerBolt.java index 91175846..4ece0b75 100644 --- a/core/src/main/java/org/apache/stormcrawler/indexing/AbstractIndexerBolt.java +++ b/core/src/main/java/org/apache/stormcrawler/indexing/AbstractIndexerBolt.java @@ -19,6 +19,8 @@ package org.apache.stormcrawler.indexing; import crawlercommons.domains.PaidLevelDomain; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; @@ -286,7 +288,7 @@ public abstract class AbstractIndexerBolt extends BaseRichBolt { } try { - URL url1 = new URL(url); + URL url1 = new URI(url).toURL(); URL canonical = URLUtil.resolveUrl(url1, canonicalValue); String domain = PaidLevelDomain.getPLD(url1.getHost()); @@ -298,7 +300,7 @@ public abstract class AbstractIndexerBolt extends BaseRichBolt { } else { log.info("Canonical URL references a different domain, ignoring in {} ", url); } - } catch (MalformedURLException e) { + } catch (MalformedURLException | URISyntaxException e) { log.error("Malformed canonical URL {} was found in {} ", canonicalValue, url); } diff --git a/core/src/main/java/org/apache/stormcrawler/jsoup/LinkParseFilter.java b/core/src/main/java/org/apache/stormcrawler/jsoup/LinkParseFilter.java index 1125c374..87d66751 100644 --- a/core/src/main/java/org/apache/stormcrawler/jsoup/LinkParseFilter.java +++ b/core/src/main/java/org/apache/stormcrawler/jsoup/LinkParseFilter.java @@ -19,6 +19,8 @@ package org.apache.stormcrawler.jsoup; import com.fasterxml.jackson.databind.JsonNode; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; @@ -73,11 +75,11 @@ public class LinkParseFilter extends XPathFilter { java.net.URL sourceUrl; try { - sourceUrl = new URL(url); - } catch (MalformedURLException e1) { + sourceUrl = new URI(url).toURL(); + } catch (MalformedURLException | URISyntaxException e1) { // we would have known by now as previous components check whether // the URL is valid - LOG.error("MalformedURLException on {}", url); + LOG.error("Malformed URL on {}", url); return; } diff --git a/core/src/main/java/org/apache/stormcrawler/parse/JSoupFilters.java b/core/src/main/java/org/apache/stormcrawler/parse/JSoupFilters.java index 8fd8adc3..9a9ccb36 100644 --- a/core/src/main/java/org/apache/stormcrawler/parse/JSoupFilters.java +++ b/core/src/main/java/org/apache/stormcrawler/parse/JSoupFilters.java @@ -21,6 +21,8 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; import java.io.InputStream; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.nio.charset.StandardCharsets; import java.util.List; @@ -126,7 +128,7 @@ public class JSoupFilters extends AbstractConfigurable implements JSoupFilter, J } /** * Used for quick testing + debugging */ - public static void main(String[] args) throws IOException, ParseException { + public static void main(String[] args) throws IOException, ParseException, URISyntaxException { Config conf = new Config(); @@ -154,7 +156,7 @@ public class JSoupFilters extends AbstractConfigurable implements JSoupFilter, J String url = cmd.getArgs()[0]; - byte[] content = IOUtils.toByteArray((new URL(url)).openStream()); + byte[] content = IOUtils.toByteArray((new URI(url).toURL()).openStream()); Document doc = Jsoup.parse(new String(content, StandardCharsets.UTF_8), url); diff --git a/core/src/main/java/org/apache/stormcrawler/parse/ParseFilters.java b/core/src/main/java/org/apache/stormcrawler/parse/ParseFilters.java index 6852d590..9f2ae827 100644 --- a/core/src/main/java/org/apache/stormcrawler/parse/ParseFilters.java +++ b/core/src/main/java/org/apache/stormcrawler/parse/ParseFilters.java @@ -23,6 +23,8 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; import java.io.InputStream; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.nio.charset.StandardCharsets; import java.util.List; @@ -159,7 +161,7 @@ public class ParseFilters extends ParseFilter implements JSONResource { * * @since 1.17 */ - public static void main(String[] args) throws IOException, ParseException { + public static void main(String[] args) throws IOException, ParseException, URISyntaxException { Config conf = new Config(); @@ -187,7 +189,7 @@ public class ParseFilters extends ParseFilter implements JSONResource { String url = cmd.getArgs()[0]; - byte[] content = IOUtils.toByteArray((new URL(url)).openStream()); + byte[] content = IOUtils.toByteArray((new URI(url).toURL()).openStream()); Document doc = Jsoup.parse(new String(content, StandardCharsets.UTF_8), url); diff --git a/core/src/main/java/org/apache/stormcrawler/parse/filter/LinkParseFilter.java b/core/src/main/java/org/apache/stormcrawler/parse/filter/LinkParseFilter.java index d379e014..eb23c795 100644 --- a/core/src/main/java/org/apache/stormcrawler/parse/filter/LinkParseFilter.java +++ b/core/src/main/java/org/apache/stormcrawler/parse/filter/LinkParseFilter.java @@ -19,6 +19,8 @@ package org.apache.stormcrawler.parse.filter; import com.fasterxml.jackson.databind.JsonNode; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; @@ -73,11 +75,11 @@ public class LinkParseFilter extends XPathFilter { java.net.URL sourceUrl; try { - sourceUrl = new URL(url); - } catch (MalformedURLException e1) { + sourceUrl = new URI(url).toURL(); + } catch (MalformedURLException | URISyntaxException e1) { // we would have known by now as previous components check whether // the URL is valid - LOG.error("MalformedURLException on {}", url); + LOG.error("Malformed URL on {}", url); return; } diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/RobotRulesParser.java b/core/src/main/java/org/apache/stormcrawler/protocol/RobotRulesParser.java index 63849eec..00198119 100644 --- a/core/src/main/java/org/apache/stormcrawler/protocol/RobotRulesParser.java +++ b/core/src/main/java/org/apache/stormcrawler/protocol/RobotRulesParser.java @@ -23,6 +23,8 @@ import crawlercommons.robots.BaseRobotRules; import crawlercommons.robots.SimpleRobotRules; import crawlercommons.robots.SimpleRobotRules.RobotRulesMode; import crawlercommons.robots.SimpleRobotRulesParser; + +import java.net.URI; import java.net.URL; import java.util.ArrayList; import java.util.Collection; @@ -200,7 +202,7 @@ public abstract class RobotRulesParser { public BaseRobotRules getRobotRulesSet(Protocol protocol, String url) { URL u; try { - u = new URL(url); + u = new URI(url).toURL(); } catch (Exception e) { return EMPTY_RULES; } diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/file/FileResponse.java b/core/src/main/java/org/apache/stormcrawler/protocol/file/FileResponse.java index e5858f35..88d1de16 100644 --- a/core/src/main/java/org/apache/stormcrawler/protocol/file/FileResponse.java +++ b/core/src/main/java/org/apache/stormcrawler/protocol/file/FileResponse.java @@ -21,6 +21,8 @@ import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.lang.invoke.MethodHandles; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.net.URLDecoder; import java.nio.charset.StandardCharsets; @@ -45,13 +47,13 @@ public class FileResponse { private int statusCode; private final Metadata metadata; - public FileResponse(String u, Metadata md, FileProtocol fileProtocol) throws IOException { + public FileResponse(String u, Metadata md, FileProtocol fileProtocol) throws IOException, URISyntaxException { metadata = new Metadata(); content = new byte[0]; statusCode = HttpStatus.SC_INTERNAL_SERVER_ERROR; - URL url = new URL(u); + URL url = new URI(u).toURL(); if (!url.getPath().equals(url.getFile())) { LOG.warn("url.getPath() != url.getFile(): {}.", url); diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java b/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java index e8a14eb9..89d99934 100644 --- a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java +++ b/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java @@ -21,6 +21,8 @@ import java.io.IOException; import java.net.InetSocketAddress; import java.net.MalformedURLException; import java.net.Proxy; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.nio.charset.StandardCharsets; import java.security.cert.CertificateException; @@ -271,11 +273,11 @@ public class HttpProtocol extends AbstractHttpProtocol { return; } try { - final List<Cookie> cookies = CookieConverter.getCookies(cookieStrings, new URL(url)); + final List<Cookie> cookies = CookieConverter.getCookies(cookieStrings, new URI(url).toURL()); for (Cookie c : cookies) { rb.addHeader("Cookie", c.getName() + "=" + c.getValue()); } - } catch (MalformedURLException e) { // Bad url , nothing to do + } catch (MalformedURLException | URISyntaxException e) { // Bad url , nothing to do } } diff --git a/core/src/main/java/org/apache/stormcrawler/util/URLPartitioner.java b/core/src/main/java/org/apache/stormcrawler/util/URLPartitioner.java index 384c3eeb..b8f11ec0 100644 --- a/core/src/main/java/org/apache/stormcrawler/util/URLPartitioner.java +++ b/core/src/main/java/org/apache/stormcrawler/util/URLPartitioner.java @@ -20,6 +20,8 @@ package org.apache.stormcrawler.util; import crawlercommons.domains.PaidLevelDomain; import java.net.InetAddress; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.Map; import org.apache.commons.lang3.StringUtils; @@ -58,9 +60,9 @@ public class URLPartitioner { if (partitionKey == null) { URL u; try { - u = new URL(url); + u = new URI(url).toURL(); host = u.getHost(); - } catch (MalformedURLException e1) { + } catch (MalformedURLException | URISyntaxException e) { LOG.warn("Invalid URL: {}", url); return null; } diff --git a/core/src/main/java/org/apache/stormcrawler/util/URLUtil.java b/core/src/main/java/org/apache/stormcrawler/util/URLUtil.java index fe3c72f2..1350ece0 100644 --- a/core/src/main/java/org/apache/stormcrawler/util/URLUtil.java +++ b/core/src/main/java/org/apache/stormcrawler/util/URLUtil.java @@ -20,6 +20,7 @@ package org.apache.stormcrawler.util; import java.net.IDN; import java.net.MalformedURLException; import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.Locale; import java.util.regex.Pattern; @@ -137,8 +138,8 @@ public class URLUtil { * * @throws MalformedURLException */ - public static String[] getHostSegments(String url) throws MalformedURLException { - return getHostSegments(new URL(url)); + public static String[] getHostSegments(String url) throws MalformedURLException, URISyntaxException { + return getHostSegments(new URI(url).toURL()); } /** @@ -149,8 +150,8 @@ public class URLUtil { */ public static String getHost(String url) { try { - return new URL(url).getHost().toLowerCase(Locale.ROOT); - } catch (MalformedURLException e) { + return new URI(url).toURL().getHost().toLowerCase(Locale.ROOT); + } catch (MalformedURLException | URISyntaxException e) { return null; } } @@ -167,16 +168,16 @@ public class URLUtil { // get the full url, and replace the query string with and empty // string url = url.toLowerCase(Locale.ROOT); - String queryStr = new URL(url).getQuery(); + String queryStr = new URI(url).toURL().getQuery(); return (queryStr != null) ? url.replace("?" + queryStr, "") : url; - } catch (MalformedURLException e) { + } catch (MalformedURLException | URISyntaxException e) { return null; } } public static String toASCII(String url) { try { - URL u = new URL(url); + URL u = new URI(url).toURL(); URI p = new URI( u.getProtocol(), @@ -195,7 +196,7 @@ public class URLUtil { public static String toUNICODE(String url) { try { - URL u = new URL(url); + URL u = new URI(url).toURL(); URI p = new URI( u.getProtocol(), diff --git a/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLFilterTest.java b/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLFilterTest.java index 017b60b8..e59f8cc4 100644 --- a/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLFilterTest.java +++ b/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLFilterTest.java @@ -19,6 +19,8 @@ package org.apache.stormcrawler.filtering; import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.HashMap; import java.util.Map; @@ -40,25 +42,25 @@ class BasicURLFilterTest { } @Test - void testRepetition() throws MalformedURLException { + void testRepetition() throws MalformedURLException, URISyntaxException { URLFilter filter = createFilter(-1, 3); Metadata metadata = new Metadata(); - URL targetURL = new URL("http://www.sourcedomain.com/a/a/a/index.html"); + URL targetURL = new URI("http://www.sourcedomain.com/a/a/a/index.html").toURL(); String filterResult = filter.filter(targetURL, metadata, targetURL.toExternalForm()); Assertions.assertNull(filterResult); - targetURL = new URL("http://www.sourcedomain.com/a/b/a/index.html"); + targetURL = new URI("http://www.sourcedomain.com/a/b/a/index.html").toURL(); filterResult = filter.filter(targetURL, metadata, targetURL.toExternalForm()); Assertions.assertEquals(targetURL.toExternalForm(), filterResult); } @Test - void testLength() throws MalformedURLException { + void testLength() throws MalformedURLException, URISyntaxException { URLFilter filter = createFilter(32, -1); Metadata metadata = new Metadata(); - URL targetURL = new URL("http://www.sourcedomain.com/a/a/a/index.html"); + URL targetURL = new URI("http://www.sourcedomain.com/a/a/a/index.html").toURL(); String filterResult = filter.filter(targetURL, metadata, targetURL.toExternalForm()); Assertions.assertNull(filterResult); - targetURL = new URL("http://www.sourcedomain.com/"); + targetURL = new URI("http://www.sourcedomain.com/").toURL(); filterResult = filter.filter(targetURL, metadata, targetURL.toExternalForm()); Assertions.assertEquals(targetURL.toExternalForm(), filterResult); } diff --git a/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLNormalizerTest.java b/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLNormalizerTest.java index 65da7630..65676040 100644 --- a/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLNormalizerTest.java +++ b/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLNormalizerTest.java @@ -23,6 +23,8 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.Arrays; import java.util.HashMap; @@ -79,9 +81,9 @@ class BasicURLNormalizerTest { } @Test - void testAnchorFilter() throws MalformedURLException { + void testAnchorFilter() throws MalformedURLException, URISyntaxException { URLFilter allAllowed = createFilter(true, false); - URL url = new URL("http://www.sourcedomain.com/#0"); + URL url = new URI("http://www.sourcedomain.com/#0").toURL(); Metadata metadata = new Metadata(); String filterResult = allAllowed.filter(url, metadata, url.toExternalForm()); String expected = "http://www.sourcedomain.com/"; @@ -89,18 +91,18 @@ class BasicURLNormalizerTest { } @Test - void testAnchorFilterFalse() throws MalformedURLException { + void testAnchorFilterFalse() throws MalformedURLException, URISyntaxException { URLFilter allAllowed = createFilter(false, false); - URL url = new URL("http://www.sourcedomain.com/#0"); + URL url = new URI("http://www.sourcedomain.com/#0").toURL(); Metadata metadata = new Metadata(); String filterResult = allAllowed.filter(url, metadata, url.toExternalForm()); Assertions.assertEquals(url.toExternalForm(), filterResult); } @Test - void testRemoveSomeOfManyQueryParams() throws MalformedURLException { + void testRemoveSomeOfManyQueryParams() throws MalformedURLException, URISyntaxException { URLFilter urlFilter = createFilter(queryParamsToFilter); - URL testSourceUrl = new URL("http://google.com"); + URL testSourceUrl = new URI("http://google.com").toURL(); String testUrl = "http://google.com?keep1=true&a=c&foo=baz&keep2=true"; String expectedResult = "http://google.com?keep1=true&keep2=true"; String normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(), testUrl); @@ -108,9 +110,9 @@ class BasicURLNormalizerTest { } @Test - void testRemoveAllQueryParams() throws MalformedURLException { + void testRemoveAllQueryParams() throws MalformedURLException, URISyntaxException { URLFilter urlFilter = createFilter(queryParamsToFilter); - URL testSourceUrl = new URL("http://google.com"); + URL testSourceUrl = new URI("http://google.com").toURL(); String testUrl = "http://google.com?a=c&foo=baz"; String expectedResult = "http://google.com"; String normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(), testUrl); @@ -118,9 +120,9 @@ class BasicURLNormalizerTest { } @Test - void testRemoveDupeQueryParams() throws MalformedURLException { + void testRemoveDupeQueryParams() throws MalformedURLException, URISyntaxException { URLFilter urlFilter = createFilter(queryParamsToFilter); - URL testSourceUrl = new URL("http://google.com"); + URL testSourceUrl = new URI("http://google.com").toURL(); String testUrl = "http://google.com?a=c&foo=baz&foo=bar&test=true"; String expectedResult = "http://google.com?test=true"; String normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(), testUrl); @@ -128,9 +130,9 @@ class BasicURLNormalizerTest { } @Test - void testPipeInUrlAndFilterStillWorks() throws MalformedURLException { + void testPipeInUrlAndFilterStillWorks() throws MalformedURLException, URISyntaxException { URLFilter urlFilter = createFilter(queryParamsToFilter); - URL testSourceUrl = new URL("http://google.com"); + URL testSourceUrl = new URI("http://google.com").toURL(); String testUrl = "http://google.com?a=c|d&foo=baz&foo=bar&test=true"; String expectedResult = "http://google.com?test=true"; String normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(), testUrl); @@ -138,9 +140,9 @@ class BasicURLNormalizerTest { } @Test - void testBothAnchorAndQueryFilter() throws MalformedURLException { + void testBothAnchorAndQueryFilter() throws MalformedURLException, URISyntaxException { URLFilter urlFilter = createFilter(true, queryParamsToFilter); - URL testSourceUrl = new URL("http://google.com"); + URL testSourceUrl = new URI("http://google.com").toURL(); String testUrl = "http://google.com?a=c|d&foo=baz&foo=bar&test=true#fragment=ohYeah"; String expectedResult = "http://google.com?test=true"; String normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(), testUrl); @@ -148,9 +150,9 @@ class BasicURLNormalizerTest { } @Test - void testQuerySort() throws MalformedURLException { + void testQuerySort() throws MalformedURLException, URISyntaxException { URLFilter urlFilter = createFilter(queryParamsToFilter); - URL testSourceUrl = new URL("http://google.com"); + URL testSourceUrl = new URI("http://google.com").toURL(); String testUrl = "http://google.com?a=c|d&foo=baz&foo=bar&test=true&z=2&d=4"; String expectedResult = "http://google.com?d=4&test=true&z=2"; String normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(), testUrl); @@ -158,9 +160,9 @@ class BasicURLNormalizerTest { } @Test - void testMangledQueryString() throws MalformedURLException { + void testMangledQueryString() throws MalformedURLException, URISyntaxException { URLFilter urlFilter = createFilter(queryParamsToFilter); - URL testSourceUrl = new URL("http://google.com"); + URL testSourceUrl = new URI("http://google.com").toURL(); String testUrl = "http://google.com&d=4&good=true"; String expectedResult = "http://google.com?d=4&good=true"; String normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(), testUrl); @@ -168,11 +170,11 @@ class BasicURLNormalizerTest { } @Test - void testHashes() throws MalformedURLException { + void testHashes() throws MalformedURLException, URISyntaxException { ObjectNode filterParams = new ObjectNode(JsonNodeFactory.instance); filterParams.put("removeHashes", true); URLFilter urlFilter = createFilter(filterParams); - URL testSourceUrl = new URL("http://florida-chemical.com"); + URL testSourceUrl = new URI("http://florida-chemical.com").toURL(); String in = "http://www.florida-chemical.com/Diacetone-Alcohol-DAA-99.html?xid_0b629=12854b827878df26423d933a5baf86d5"; String out = "http://www.florida-chemical.com/Diacetone-Alcohol-DAA-99.html"; @@ -186,9 +188,9 @@ class BasicURLNormalizerTest { } @Test - void testDontFixMangledQueryString() throws MalformedURLException { + void testDontFixMangledQueryString() throws MalformedURLException, URISyntaxException { URLFilter urlFilter = createFilter(true, false, queryParamsToFilter); - URL testSourceUrl = new URL("http://google.com"); + URL testSourceUrl = new URI("http://google.com").toURL(); String testUrl = "http://google.com&d=4&good=true"; String expectedResult = "http://google.com&d=4&good=true"; String normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(), testUrl); @@ -196,14 +198,14 @@ class BasicURLNormalizerTest { } @Test - void testFixMangledQueryString() throws MalformedURLException { + void testFixMangledQueryString() throws MalformedURLException, URISyntaxException { URLFilter urlFilter = createFilter(false, true, queryParamsToFilter); - URL testSourceUrl = new URL("http://google.com"); + URL testSourceUrl = new URI("http://google.com").toURL(); String testUrl = "http://google.com&d=4&good=true"; String expectedResult = "http://google.com?d=4&good=true"; String normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(), testUrl); assertEquals(expectedResult, normalizedUrl, "Failed to filter query string"); - testSourceUrl = new URL("http://dev.com"); + testSourceUrl = new URI("http://dev.com").toURL(); testUrl = "http://dev.com/s&utax/NEWSRLSEfy18.pdf"; normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(), testUrl); expectedResult = "http://dev.com/s&utax/NEWSRLSEfy18.pdf"; @@ -211,11 +213,11 @@ class BasicURLNormalizerTest { } @Test - void testProperURLEncodingWithoutQueryParameter() throws MalformedURLException { + void testProperURLEncodingWithoutQueryParameter() throws MalformedURLException, URISyntaxException { URLFilter urlFilter = createFilter(queryParamsToFilter); String urlWithEscapedCharacters = "http://www.dillards.com/product/ASICS-Womens-GT2000-3-LiteShow%E2%84%A2-Running-Shoes_301_-1_301_504736989"; - URL testSourceUrl = new URL(urlWithEscapedCharacters); + URL testSourceUrl = new URI(urlWithEscapedCharacters).toURL(); String testUrl = urlWithEscapedCharacters; String expectedResult = urlWithEscapedCharacters; String normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(), testUrl); @@ -223,11 +225,11 @@ class BasicURLNormalizerTest { } @Test - void testProperURLEncodingWithQueryParameters() throws MalformedURLException { + void testProperURLEncodingWithQueryParameters() throws MalformedURLException, URISyntaxException { URLFilter urlFilter = createFilter(queryParamsToFilter); String urlWithEscapedCharacters = "http://www.dillards.com/product/ASICS-Womens-GT2000-3-LiteShow%E2%84%A2-Running-Shoes_301_-1_301_504736989?how=are&you=doing"; - URL testSourceUrl = new URL(urlWithEscapedCharacters); + URL testSourceUrl = new URI(urlWithEscapedCharacters).toURL(); String testUrl = urlWithEscapedCharacters; String expectedResult = urlWithEscapedCharacters; String normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(), testUrl); @@ -235,24 +237,24 @@ class BasicURLNormalizerTest { } @Test - void testProperURLEncodingWithBackSlash() throws MalformedURLException { + void testProperURLEncodingWithBackSlash() throws MalformedURLException, URISyntaxException { URLFilter urlFilter = createFilter(queryParamsToFilter); String urlWithEscapedCharacters = "http://www.voltaix.com/\\SDS\\Silicon\\Trisilane\\Trisilane_SI050_USENG.pdf"; String expectedResult = "http://www.voltaix.com/%5CSDS%5CSilicon%5CTrisilane%5CTrisilane_SI050_USENG.pdf"; - URL testSourceUrl = new URL(urlWithEscapedCharacters); + URL testSourceUrl = new URI(urlWithEscapedCharacters).toURL(); String testUrl = urlWithEscapedCharacters; String normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(), testUrl); assertEquals(expectedResult, normalizedUrl, "Failed to filter query string"); } @Test - void testInvalidURI() throws MalformedURLException { + void testInvalidURI() throws MalformedURLException, URISyntaxException { URLFilter urlFilter = createFilter(true, true); // this one is now handled by the normaliser String nonURI = "http://www.quanjing.com/search.aspx?q=top-651451||1|60|1|2||||&Fr=4"; - URL testSourceUrl = new URL(nonURI); + URL testSourceUrl = new URI(nonURI).toURL(); String expectedResult = "http://www.quanjing.com/search.aspx?q=top-651451%7C%7C1%7C60%7C1%7C2%7C%7C%7C%7C&Fr=4"; String normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(), nonURI); @@ -260,7 +262,7 @@ class BasicURLNormalizerTest { // this one is nonURI = "http://vins.lemonde.fr?utm_source=LeMonde_partenaire_hp&utm_medium=EMPLACEMENT PARTENAIRE&utm_term=&utm_content=&utm_campaign=LeMonde_partenaire_hp"; - testSourceUrl = new URL(nonURI); + testSourceUrl = new URI(nonURI).toURL(); expectedResult = "http://vins.lemonde.fr?utm_source=LeMonde_partenaire_hp&utm_medium=EMPLACEMENT%20PARTENAIRE&utm_term=&utm_content=&utm_campaign=LeMonde_partenaire_hp"; normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(), nonURI); @@ -269,16 +271,16 @@ class BasicURLNormalizerTest { // http://docs.oracle.com/javase/7/docs/api/java/net/URI.html#normalize() String nonNormURL = "http://docs.oracle.com/javase/7/docs/api/java/net/../net/./URI.html#normalize()"; - testSourceUrl = new URL(nonNormURL); + testSourceUrl = new URI(nonNormURL).toURL(); expectedResult = "http://docs.oracle.com/javase/7/docs/api/java/net/URI.html"; normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(), nonNormURL); assertEquals(expectedResult, normalizedUrl, "Failed to filter query string"); } @Test - void testLowerCasing() throws MalformedURLException { + void testLowerCasing() throws MalformedURLException, URISyntaxException { URLFilter urlFilter = createFilter(false, false); - URL testSourceUrl = new URL("http://blablabla.org/"); + URL testSourceUrl = new URI("http://blablabla.org/").toURL(); String inputURL = "HTTP://www.quanjing.com/"; String expectedResult = inputURL.toLowerCase(Locale.ROOT); String normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(), inputURL); @@ -291,9 +293,9 @@ class BasicURLNormalizerTest { // https://github.com/apache/stormcrawler/issues/401 @Test - void testNonStandardPercentEncoding() throws MalformedURLException { + void testNonStandardPercentEncoding() throws MalformedURLException, URISyntaxException { URLFilter urlFilter = createFilter(false, false); - URL testSourceUrl = new URL("http://www.hurriyet.com.tr/index/?d=20160328&p=13"); + URL testSourceUrl = new URI("http://www.hurriyet.com.tr/index/?d=20160328&p=13").toURL(); String inputURL = "http://www.hurriyet.com.tr/index/?d=20160328&p=13&s=ni%u011fde"; String expectedURL = "http://www.hurriyet.com.tr/index/?d=20160328&p=13&s=ni%C4%9Fde"; String normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(), inputURL); @@ -301,11 +303,11 @@ class BasicURLNormalizerTest { } @Test - void testHostIDNtoASCII() throws MalformedURLException { + void testHostIDNtoASCII() throws MalformedURLException, URISyntaxException { ObjectNode filterParams = new ObjectNode(JsonNodeFactory.instance); filterParams.put("hostIDNtoASCII", true); URLFilter urlFilter = createFilter(filterParams); - URL testSourceUrl = new URL("http://www.example.com/"); + URL testSourceUrl = new URI("http://www.example.com/").toURL(); String inputURL = "http://señal6.com.ar/"; String expectedURL = "http://xn--seal6-pta.com.ar/"; String normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(), inputURL); diff --git a/core/src/test/java/org/apache/stormcrawler/filtering/FastURLFilterTest.java b/core/src/test/java/org/apache/stormcrawler/filtering/FastURLFilterTest.java index 6ae14691..09bd8c31 100644 --- a/core/src/test/java/org/apache/stormcrawler/filtering/FastURLFilterTest.java +++ b/core/src/test/java/org/apache/stormcrawler/filtering/FastURLFilterTest.java @@ -19,6 +19,8 @@ package org.apache.stormcrawler.filtering; import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.HashMap; import java.util.Map; @@ -39,28 +41,28 @@ class FastURLFilterTest { } @Test - void testImagesFilter() throws MalformedURLException { - URL url = new URL("http://www.somedomain.com/image.jpg"); + void testImagesFilter() throws MalformedURLException, URISyntaxException { + URL url = new URI("http://www.somedomain.com/image.jpg").toURL(); Metadata metadata = new Metadata(); String filterResult = createFilter().filter(url, metadata, url.toExternalForm()); Assertions.assertNull(filterResult); } @Test - void testDomainNotAllowed() throws MalformedURLException { - URL url = new URL("http://stormcrawler.net/"); + void testDomainNotAllowed() throws MalformedURLException, URISyntaxException { + URL url = new URI("http://stormcrawler.net/").toURL(); Metadata metadata = new Metadata(); String filterResult = createFilter().filter(url, metadata, url.toExternalForm()); Assertions.assertNull(filterResult); // allowed - url = new URL("http://stormcrawler.net/bla/"); + url = new URI("http://stormcrawler.net/bla/").toURL(); filterResult = createFilter().filter(url, metadata, url.toExternalForm()); Assertions.assertEquals(url.toString(), filterResult); } @Test - void testMD() throws MalformedURLException { - URL url = new URL("http://somedomain.net/"); + void testMD() throws MalformedURLException, URISyntaxException { + URL url = new URI("http://somedomain.net/").toURL(); Metadata metadata = new Metadata(); metadata.addValue("key", "value"); String filterResult = createFilter().filter(url, metadata, url.toExternalForm()); diff --git a/core/src/test/java/org/apache/stormcrawler/filtering/HostURLFilterTest.java b/core/src/test/java/org/apache/stormcrawler/filtering/HostURLFilterTest.java index 5b183919..59b81c64 100644 --- a/core/src/test/java/org/apache/stormcrawler/filtering/HostURLFilterTest.java +++ b/core/src/test/java/org/apache/stormcrawler/filtering/HostURLFilterTest.java @@ -19,6 +19,8 @@ package org.apache.stormcrawler.filtering; import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.HashMap; import java.util.Map; @@ -44,9 +46,9 @@ class HostURLFilterTest { } @Test - void testAllAllowed() throws MalformedURLException { + void testAllAllowed() throws MalformedURLException, URISyntaxException { HostURLFilter allAllowed = createFilter(false, false); - URL sourceURL = new URL("http://www.sourcedomain.com/index.html"); + URL sourceURL = new URI("http://www.sourcedomain.com/index.html").toURL(); Metadata metadata = new Metadata(); String filterResult = allAllowed.filter(sourceURL, metadata, "http://www.sourcedomain.com/index.html"); @@ -60,9 +62,9 @@ class HostURLFilterTest { } @Test - void testAllForbidden() throws MalformedURLException { + void testAllForbidden() throws MalformedURLException, URISyntaxException { HostURLFilter allAllowed = createFilter(true, true); - URL sourceURL = new URL("http://www.sourcedomain.com/index.html"); + URL sourceURL = new URI("http://www.sourcedomain.com/index.html").toURL(); Metadata metadata = new Metadata(); String filterResult = allAllowed.filter(sourceURL, metadata, "http://www.sourcedomain.com/index.html"); @@ -76,9 +78,9 @@ class HostURLFilterTest { } @Test - void testWithinHostOnly() throws MalformedURLException { + void testWithinHostOnly() throws MalformedURLException, URISyntaxException { HostURLFilter allAllowed = createFilter(true, false); - URL sourceURL = new URL("http://www.sourcedomain.com/index.html"); + URL sourceURL = new URI("http://www.sourcedomain.com/index.html").toURL(); Metadata metadata = new Metadata(); String filterResult = allAllowed.filter(sourceURL, metadata, "http://www.sourcedomain.com/index.html"); @@ -92,9 +94,9 @@ class HostURLFilterTest { } @Test - void testWithinDomain() throws MalformedURLException { + void testWithinDomain() throws MalformedURLException, URISyntaxException { HostURLFilter allAllowed = createFilter(false, true); - URL sourceURL = new URL("http://www.sourcedomain.com/index.html"); + URL sourceURL = new URI("http://www.sourcedomain.com/index.html").toURL(); Metadata metadata = new Metadata(); String filterResult = allAllowed.filter(sourceURL, metadata, "http://www.sourcedomain.com/index.html"); diff --git a/core/src/test/java/org/apache/stormcrawler/filtering/MaxDepthFilterTest.java b/core/src/test/java/org/apache/stormcrawler/filtering/MaxDepthFilterTest.java index a76a5c74..24fa6563 100644 --- a/core/src/test/java/org/apache/stormcrawler/filtering/MaxDepthFilterTest.java +++ b/core/src/test/java/org/apache/stormcrawler/filtering/MaxDepthFilterTest.java @@ -19,6 +19,8 @@ package org.apache.stormcrawler.filtering; import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.HashMap; import java.util.Map; @@ -40,18 +42,18 @@ class MaxDepthFilterTest { } @Test - void testDepthZero() throws MalformedURLException { + void testDepthZero() throws MalformedURLException, URISyntaxException { URLFilter filter = createFilter("maxDepth", 0); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); String filterResult = filter.filter(url, metadata, url.toExternalForm()); Assertions.assertNull(filterResult); } @Test - void testDepth() throws MalformedURLException { + void testDepth() throws MalformedURLException, URISyntaxException { URLFilter filter = createFilter("maxDepth", 2); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); metadata.setValue(MetadataTransfer.depthKeyName, "2"); String filterResult = filter.filter(url, metadata, url.toExternalForm()); @@ -59,9 +61,9 @@ class MaxDepthFilterTest { } @Test - void testCustomDepthZero() throws MalformedURLException { + void testCustomDepthZero() throws MalformedURLException, URISyntaxException { URLFilter filter = createFilter("maxDepth", 3); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); metadata.setValue(MetadataTransfer.maxDepthKeyName, "0"); String filterResult = filter.filter(url, metadata, url.toExternalForm()); @@ -69,9 +71,9 @@ class MaxDepthFilterTest { } @Test - void testCustomDepth() throws MalformedURLException { + void testCustomDepth() throws MalformedURLException, URISyntaxException { URLFilter filter = createFilter("maxDepth", 1); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); metadata.setValue(MetadataTransfer.maxDepthKeyName, "2"); metadata.setValue(MetadataTransfer.depthKeyName, "1"); diff --git a/core/src/test/java/org/apache/stormcrawler/filtering/MetadataFilterFromJsonTest.java b/core/src/test/java/org/apache/stormcrawler/filtering/MetadataFilterFromJsonTest.java index 85a624b9..14ae326b 100644 --- a/core/src/test/java/org/apache/stormcrawler/filtering/MetadataFilterFromJsonTest.java +++ b/core/src/test/java/org/apache/stormcrawler/filtering/MetadataFilterFromJsonTest.java @@ -17,6 +17,8 @@ package org.apache.stormcrawler.filtering; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.Map; import org.apache.stormcrawler.Metadata; @@ -31,18 +33,18 @@ class MetadataFilterFromJsonTest { // old filter mechanism (backward compatible) @Test - void testFilterNoMD() throws MalformedURLException { + void testFilterNoMD() throws MalformedURLException, URISyntaxException { URLFilters filter = createURLFilters("test.metadata.1.urlfilters.json"); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); String filterResult = filter.filter(url, metadata, url.toExternalForm()); Assertions.assertEquals(url.toExternalForm(), filterResult); } @Test - void testFilterHit() throws MalformedURLException { + void testFilterHit() throws MalformedURLException, URISyntaxException { URLFilters filter = createURLFilters("test.metadata.1.urlfilters.json"); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); metadata.addValue("key", "val"); String filterResult = filter.filter(url, metadata, url.toExternalForm()); @@ -50,9 +52,9 @@ class MetadataFilterFromJsonTest { } @Test - void testFilterNoHit() throws MalformedURLException { + void testFilterNoHit() throws MalformedURLException, URISyntaxException { URLFilters filter = createURLFilters("test.metadata.1.urlfilters.json"); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); metadata.addValue("key", "val2"); metadata.addValue("key", "val3"); @@ -62,36 +64,36 @@ class MetadataFilterFromJsonTest { // new filter mechanism @Test - void testNewFilterWithEmptyFilterAndNullMetadata() throws MalformedURLException { + void testNewFilterWithEmptyFilterAndNullMetadata() throws MalformedURLException, URISyntaxException { URLFilters filter = createURLFilters("test.metadata.2.urlfilters.json"); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); String filterResult = filter.filter(url, null, url.toExternalForm()); Assertions.assertEquals(url.toExternalForm(), filterResult); } @Test - void testNewFilterWithEmptyFilterAndEmptyMetadata() throws MalformedURLException { + void testNewFilterWithEmptyFilterAndEmptyMetadata() throws MalformedURLException, URISyntaxException { URLFilters filter = createURLFilters("test.metadata.2.urlfilters.json"); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); String filterResult = filter.filter(url, metadata, url.toExternalForm()); Assertions.assertEquals(url.toExternalForm(), filterResult); } @Test - void testNewFilterWithEmptyMetadata() throws MalformedURLException { + void testNewFilterWithEmptyMetadata() throws MalformedURLException, URISyntaxException { URLFilters filter = createURLFilters("test.metadata.2.urlfilters.json"); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); String filterResult = filter.filter(url, metadata, url.toExternalForm()); Assertions.assertEquals(url.toExternalForm(), filterResult); } @Test - void testNewFilterWithOnlyOneMatchingANDFilter() throws MalformedURLException { + void testNewFilterWithOnlyOneMatchingANDFilter() throws MalformedURLException, URISyntaxException { // Filter if key=>val AND key2=>val2 match URLFilters filter = createURLFilters("test.metadata.2.urlfilters.json"); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); metadata.addValue("key", "val"); String filterResult = filter.filter(url, metadata, url.toExternalForm()); @@ -99,10 +101,10 @@ class MetadataFilterFromJsonTest { } @Test - void testNewFilterWithAllMatchingANDFilter() throws MalformedURLException { + void testNewFilterWithAllMatchingANDFilter() throws MalformedURLException, URISyntaxException { // Filter if key=>val AND key2=>val2 match URLFilters filter = createURLFilters("test.metadata.2.urlfilters.json"); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); metadata.addValue("key", "val"); metadata.addValue("key2", "val2"); @@ -111,10 +113,10 @@ class MetadataFilterFromJsonTest { } @Test - void testNewFilterWithComplexFilter() throws MalformedURLException { + void testNewFilterWithComplexFilter() throws MalformedURLException, URISyntaxException { // Filter if key=>val AND (key2=>val2 OR key3=>val3) match URLFilters filter = createURLFilters("test.metadata.3.urlfilters.json"); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); metadata.addValue("key", "val"); diff --git a/core/src/test/java/org/apache/stormcrawler/filtering/MetadataFilterTest.java b/core/src/test/java/org/apache/stormcrawler/filtering/MetadataFilterTest.java index 6a39568b..1a84e9ef 100644 --- a/core/src/test/java/org/apache/stormcrawler/filtering/MetadataFilterTest.java +++ b/core/src/test/java/org/apache/stormcrawler/filtering/MetadataFilterTest.java @@ -19,6 +19,8 @@ package org.apache.stormcrawler.filtering; import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.HashMap; import java.util.Map; @@ -40,18 +42,18 @@ class MetadataFilterTest { // old filter mechanism (backward compatible) @Test - void testFilterNoMD() throws MalformedURLException { + void testFilterNoMD() throws MalformedURLException, URISyntaxException { URLFilter filter = createFilter("key", "val"); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); String filterResult = filter.filter(url, metadata, url.toExternalForm()); Assertions.assertEquals(url.toExternalForm(), filterResult); } @Test - void testFilterHit() throws MalformedURLException { + void testFilterHit() throws MalformedURLException, URISyntaxException { URLFilter filter = createFilter("key", "val"); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); metadata.addValue("key", "val"); String filterResult = filter.filter(url, metadata, url.toExternalForm()); @@ -59,9 +61,9 @@ class MetadataFilterTest { } @Test - void testFilterNoHit() throws MalformedURLException { + void testFilterNoHit() throws MalformedURLException, URISyntaxException { URLFilter filter = createFilter("key", "val"); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); metadata.addValue("key", "val2"); metadata.addValue("key", "val3"); @@ -71,26 +73,26 @@ class MetadataFilterTest { // new filter mechanism @Test - void testNewFilterWithEmptyFilterAndNullMetadata() throws MalformedURLException { + void testNewFilterWithEmptyFilterAndNullMetadata() throws MalformedURLException, URISyntaxException { MetadataFilter filter = new MetadataFilter(); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); String filterResult = filter.filter(url, null, url.toExternalForm()); Assertions.assertEquals(url.toExternalForm(), filterResult); } @Test - void testNewFilterWithEmptyFilterAndEmptyMetadata() throws MalformedURLException { + void testNewFilterWithEmptyFilterAndEmptyMetadata() throws MalformedURLException, URISyntaxException { MetadataFilter filter = new MetadataFilter(); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); String filterResult = filter.filter(url, metadata, url.toExternalForm()); Assertions.assertEquals(url.toExternalForm(), filterResult); } @Test - void testNewFilterWithEmptyFilter() throws MalformedURLException { + void testNewFilterWithEmptyFilter() throws MalformedURLException, URISyntaxException { MetadataFilter filter = new MetadataFilter(); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); metadata.addValue("key", "val"); String filterResult = filter.filter(url, metadata, url.toExternalForm()); @@ -98,21 +100,21 @@ class MetadataFilterTest { } @Test - void testNewFilterWithEmptyMetadata() throws MalformedURLException { + void testNewFilterWithEmptyMetadata() throws MalformedURLException, URISyntaxException { MetadataFilter filter = new MetadataFilter(); filter.addFilter("key", "val"); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); String filterResult = filter.filter(url, metadata, url.toExternalForm()); Assertions.assertEquals(url.toExternalForm(), filterResult); } @Test - void testNewFilterWithSingleMatchingORFilter() throws MalformedURLException { + void testNewFilterWithSingleMatchingORFilter() throws MalformedURLException, URISyntaxException { // Filter if key=>val match (OR operation) MetadataFilter filter = new MetadataFilter(); filter.addFilter("key", "val"); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); metadata.addValue("key", "val"); String filterResult = filter.filter(url, metadata, url.toExternalForm()); @@ -120,12 +122,12 @@ class MetadataFilterTest { } @Test - void testNewFilterWithSingleMatchingANDFilter() throws MalformedURLException { + void testNewFilterWithSingleMatchingANDFilter() throws MalformedURLException, URISyntaxException { // Filter if key=>val match (AND operation) MetadataFilter filter = new MetadataFilter(); filter.addFilter("key", "val"); filter.setOperation(MetadataFilter.FilterOperation.AND); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); metadata.addValue("key", "val"); String filterResult = filter.filter(url, metadata, url.toExternalForm()); @@ -133,12 +135,12 @@ class MetadataFilterTest { } @Test - void testNewFilterWithOnlyOneMatchingORFilter() throws MalformedURLException { + void testNewFilterWithOnlyOneMatchingORFilter() throws MalformedURLException, URISyntaxException { // Filter if key=>val OR key2=>val2 match MetadataFilter filter = new MetadataFilter(); filter.addFilter("key", "val"); filter.addFilter("key2", "val2"); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); metadata.addValue("key", "val"); String filterResult = filter.filter(url, metadata, url.toExternalForm()); @@ -146,13 +148,13 @@ class MetadataFilterTest { } @Test - void testNewFilterWithOnlyOneMatchingANDFilter() throws MalformedURLException { + void testNewFilterWithOnlyOneMatchingANDFilter() throws MalformedURLException, URISyntaxException { // Filter if key=>val AND key2=>val2 match MetadataFilter filter = new MetadataFilter(); filter.addFilter("key", "val"); filter.addFilter("key2", "val2"); filter.setOperation(MetadataFilter.FilterOperation.AND); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); metadata.addValue("key", "val"); String filterResult = filter.filter(url, metadata, url.toExternalForm()); @@ -160,13 +162,13 @@ class MetadataFilterTest { } @Test - void testNewFilterWithAllMatchingANDFilter() throws MalformedURLException { + void testNewFilterWithAllMatchingANDFilter() throws MalformedURLException, URISyntaxException { // Filter if key=>val AND key2=>val2 match MetadataFilter filter = new MetadataFilter(); filter.addFilter("key", "val"); filter.addFilter("key2", "val2"); filter.setOperation(MetadataFilter.FilterOperation.AND); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); metadata.addValue("key", "val"); metadata.addValue("key2", "val2"); @@ -175,7 +177,7 @@ class MetadataFilterTest { } @Test - void testNewFilterWithComplexFilter() throws MalformedURLException { + void testNewFilterWithComplexFilter() throws MalformedURLException, URISyntaxException { // Filter if key=>val AND (key2=>val2 OR key3=>val3) match MetadataFilter filter = new MetadataFilter(); filter.addFilter("key", "val"); @@ -184,7 +186,7 @@ class MetadataFilterTest { filter2.addFilter("key2", "val2"); filter2.addFilter("key3", "val3"); filter.addFilter(filter2); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); metadata.addValue("key", "val"); @@ -221,7 +223,7 @@ class MetadataFilterTest { } @Test - void testNewFilterWithOtherComplexFilter() throws MalformedURLException { + void testNewFilterWithOtherComplexFilter() throws MalformedURLException, URISyntaxException { // Filter if key=>val OR (key2=>val2 AND key3=>val3) match MetadataFilter filter = new MetadataFilter(); filter.addFilter("key", "val"); @@ -230,7 +232,7 @@ class MetadataFilterTest { filter2.addFilter("key3", "val3"); filter2.setOperation(MetadataFilter.FilterOperation.AND); filter.addFilter(filter2); - URL url = new URL("http://www.sourcedomain.com/"); + URL url = new URI("http://www.sourcedomain.com/").toURL(); Metadata metadata = new Metadata(); metadata.addValue("key", "val"); diff --git a/core/src/test/java/org/apache/stormcrawler/filtering/RegexFilterTest.java b/core/src/test/java/org/apache/stormcrawler/filtering/RegexFilterTest.java index 60482433..183495ae 100644 --- a/core/src/test/java/org/apache/stormcrawler/filtering/RegexFilterTest.java +++ b/core/src/test/java/org/apache/stormcrawler/filtering/RegexFilterTest.java @@ -19,6 +19,8 @@ package org.apache.stormcrawler.filtering; import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.HashMap; import java.util.Map; @@ -43,35 +45,35 @@ class RegexFilterTest { } @Test - void testProtocolFilter() throws MalformedURLException { + void testProtocolFilter() throws MalformedURLException, URISyntaxException { URLFilter allAllowed = createFilter(); - URL url = new URL("ftp://www.someFTP.com/#0"); + URL url = new URI("ftp://www.someFTP.com/#0").toURL(); Metadata metadata = new Metadata(); String filterResult = allAllowed.filter(url, metadata, url.toExternalForm()); Assertions.assertNull(filterResult); } @Test - void testImagesFilter() throws MalformedURLException { + void testImagesFilter() throws MalformedURLException, URISyntaxException { URLFilter allAllowed = createFilter(); - URL url = new URL("http://www.someFTP.com/bla.gif"); + URL url = new URI("http://www.someFTP.com/bla.gif").toURL(); Metadata metadata = new Metadata(); String filterResult = allAllowed.filter(url, metadata, url.toExternalForm()); Assertions.assertNull(filterResult); - url = new URL("http://www.someFTP.com/bla.GIF"); + url = new URI("http://www.someFTP.com/bla.GIF").toURL(); filterResult = allAllowed.filter(url, metadata, url.toExternalForm()); Assertions.assertNull(filterResult); - url = new URL("http://www.someFTP.com/bla.GIF&somearg=0"); + url = new URI("http://www.someFTP.com/bla.GIF&somearg=0").toURL(); filterResult = allAllowed.filter(url, metadata, url.toExternalForm()); Assertions.assertNull(filterResult); - url = new URL("http://www.someFTP.com/bla.GIF?somearg=0"); + url = new URI("http://www.someFTP.com/bla.GIF?somearg=0").toURL(); filterResult = allAllowed.filter(url, metadata, url.toExternalForm()); Assertions.assertNull(filterResult); // not this one : the gif is within the path - url = new URL("http://www.someFTP.com/bla.GIF.orNot"); + url = new URI("http://www.someFTP.com/bla.GIF.orNot").toURL(); filterResult = allAllowed.filter(url, metadata, url.toExternalForm()); Assertions.assertEquals(url.toExternalForm(), filterResult); - url = new URL("http://www.someFTP.com/bla.mp4"); + url = new URI("http://www.someFTP.com/bla.mp4").toURL(); filterResult = allAllowed.filter(url, metadata, url.toExternalForm()); Assertions.assertNull(filterResult); } diff --git a/core/src/test/java/org/apache/stormcrawler/util/CookieConverterTest.java b/core/src/test/java/org/apache/stormcrawler/util/CookieConverterTest.java index 1b9cca8d..a434a825 100644 --- a/core/src/test/java/org/apache/stormcrawler/util/CookieConverterTest.java +++ b/core/src/test/java/org/apache/stormcrawler/util/CookieConverterTest.java @@ -17,6 +17,8 @@ package org.apache.stormcrawler.util; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.List; import org.apache.http.cookie.Cookie; @@ -307,8 +309,8 @@ class CookieConverterTest { private URL getUrl(String urlString) { try { - return new URL(urlString); - } catch (MalformedURLException e) { + return new URI(urlString).toURL(); + } catch (MalformedURLException | URISyntaxException e) { return null; } } diff --git a/external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/RemoteDriverProtocol.java b/external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/RemoteDriverProtocol.java index 219f4b0a..35ef66b1 100644 --- a/external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/RemoteDriverProtocol.java +++ b/external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/RemoteDriverProtocol.java @@ -16,6 +16,7 @@ */ package org.apache.stormcrawler.protocol.selenium; +import java.net.URI; import java.net.URL; import java.time.Duration; import java.util.ArrayList; @@ -98,7 +99,7 @@ public class RemoteDriverProtocol extends SeleniumProtocol { for (String cdaddress : addresses) { try { RemoteWebDriver driver = - new RemoteWebDriver(new URL(cdaddress), capabilities, tracing); + new RemoteWebDriver(new URI(cdaddress).toURL(), capabilities, tracing); // setting timouts // see https://www.browserstack.com/guide/understanding-selenium-timeouts Timeouts touts = driver.manage().timeouts(); diff --git a/external/tika/src/main/java/org/apache/stormcrawler/tika/ParserBolt.java b/external/tika/src/main/java/org/apache/stormcrawler/tika/ParserBolt.java index 09dea259..7e229848 100644 --- a/external/tika/src/main/java/org/apache/stormcrawler/tika/ParserBolt.java +++ b/external/tika/src/main/java/org/apache/stormcrawler/tika/ParserBolt.java @@ -21,6 +21,8 @@ import static org.apache.stormcrawler.Constants.StatusStreamName; import java.io.ByteArrayInputStream; import java.io.IOException; import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; @@ -204,9 +206,9 @@ public class ParserBolt extends BaseRichBolt { // as well as the filename try { - URL _url = new URL(url); + URL _url = new URI(url).toURL(); md.set(TikaCoreProperties.RESOURCE_NAME_KEY, _url.getFile()); - } catch (MalformedURLException e1) { + } catch (MalformedURLException | URISyntaxException e1) { throw new IllegalStateException("Malformed URL", e1); } diff --git a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCSpout.java b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCSpout.java index fb2e5a1c..df127f58 100644 --- a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCSpout.java +++ b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCSpout.java @@ -17,6 +17,8 @@ package org.apache.stormcrawler.warc; import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.nio.ByteBuffer; import java.nio.channels.Channels; @@ -145,9 +147,9 @@ public class WARCSpout extends FileSpout { } } - private ReadableByteChannel openChannel(String path) throws IOException { + private ReadableByteChannel openChannel(String path) throws IOException, URISyntaxException { if (path.matches("^https?://.*")) { - URL warcUrl = new URL(path); + URL warcUrl = new URI(path).toURL(); return Channels.newChannel(warcUrl.openStream()); } org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(path);
