ANY23-356 Updated xercesImpl, crawler4j
Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/a5c3f1c4 Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/a5c3f1c4 Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/a5c3f1c4 Branch: refs/heads/master Commit: a5c3f1c4a225e8165e97d35cf9eb6502087526e7 Parents: 6b76f34 Author: Hans <[email protected]> Authored: Mon Jul 2 09:52:09 2018 -0500 Committer: Hans <[email protected]> Committed: Mon Jul 2 18:24:38 2018 -0500 ---------------------------------------------------------------------- core/src/main/java/org/apache/any23/rdf/RDFUtils.java | 4 ++-- plugins/basic-crawler/pom.xml | 2 +- .../apache/any23/plugin/crawler/DefaultWebCrawler.java | 13 +++++++++---- plugins/html-scraper/pom.xml | 2 +- 4 files changed, 13 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/a5c3f1c4/core/src/main/java/org/apache/any23/rdf/RDFUtils.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/rdf/RDFUtils.java b/core/src/main/java/org/apache/any23/rdf/RDFUtils.java index d323fb3..242984b 100644 --- a/core/src/main/java/org/apache/any23/rdf/RDFUtils.java +++ b/core/src/main/java/org/apache/any23/rdf/RDFUtils.java @@ -528,10 +528,10 @@ public class RDFUtils { new java.net.URI(href.trim()); return true; } catch (IllegalArgumentException e) { - LOG.debug("Error processing href: {}", href, e); + LOG.trace("Error processing href: {}", href, e); return false; } catch (URISyntaxException e) { - LOG.debug("Error interpreting href: {} as URI.", href, e); + LOG.trace("Error interpreting href: {} as URI.", href, e); return false; } } http://git-wip-us.apache.org/repos/asf/any23/blob/a5c3f1c4/plugins/basic-crawler/pom.xml ---------------------------------------------------------------------- diff --git a/plugins/basic-crawler/pom.xml b/plugins/basic-crawler/pom.xml index 58063ee..b864b92 100644 --- a/plugins/basic-crawler/pom.xml +++ b/plugins/basic-crawler/pom.xml @@ -74,7 +74,7 @@ <dependency> <groupId>edu.uci.ics</groupId> <artifactId>crawler4j</artifactId> - <version>3.4</version> + <version>4.4.0</version> <type>jar</type> <scope>compile</scope> </dependency> http://git-wip-us.apache.org/repos/asf/any23/blob/a5c3f1c4/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/DefaultWebCrawler.java ---------------------------------------------------------------------- diff --git a/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/DefaultWebCrawler.java b/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/DefaultWebCrawler.java index 2451286..2e43445 100644 --- a/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/DefaultWebCrawler.java +++ b/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/DefaultWebCrawler.java @@ -47,12 +47,17 @@ public class DefaultWebCrawler extends WebCrawler { /** * Override this method to specify whether the given URL should be visited or not. */ + @Override - public boolean shouldVisit(WebURL url) { - if (url.getURL() == null) return false; + public boolean shouldVisit(Page referringPage, WebURL url) { + if (!super.shouldVisit(referringPage, url)) + return false; + if (url.getURL() == null) + return false; final String href = url.getURL().toLowerCase(); - if( ! href.startsWith( sharedData.getSeed() ) ) return false; - return pattern == null || ! pattern.matcher(href).matches(); + if (!href.startsWith(sharedData.getSeed())) + return false; + return pattern == null || !pattern.matcher(href).matches(); } /** http://git-wip-us.apache.org/repos/asf/any23/blob/a5c3f1c4/plugins/html-scraper/pom.xml ---------------------------------------------------------------------- diff --git a/plugins/html-scraper/pom.xml b/plugins/html-scraper/pom.xml index 5f47adb..e24f6b6 100644 --- a/plugins/html-scraper/pom.xml +++ b/plugins/html-scraper/pom.xml @@ -56,7 +56,7 @@ <dependency> <groupId>xerces</groupId> <artifactId>xercesImpl</artifactId> - <version>2.9.1</version> + <version>2.12.0</version> <scope>provided</scope> <exclusions> <exclusion>
