This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 7b163542e4d319f95a7d4d06db77d910250bceb0
Author: Sebastian Nagel <[email protected]>
AuthorDate: Mon Jul 6 14:03:33 2020 +0200

    [NUTCH-2730] SitemapProcessor to treat sitemap URLs as Set instead of List
    - sitemap links from robots.txt are treated as set by crawler-commons (since crawler-commons 1.1)
    - sitemaps referenced in sitemap index are deduplicated
---
 src/java/org/apache/nutch/util/SitemapProcessor.java | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java
index da5c7e7..76dcef9 100644
--- a/src/java/org/apache/nutch/util/SitemapProcessor.java
+++ b/src/java/org/apache/nutch/util/SitemapProcessor.java
@@ -51,7 +51,6 @@ import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.util.NutchJob;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -287,7 +286,7 @@ public class SitemapProcessor extends Configured implements Tool {
       } else if (asm instanceof SiteMapIndex) {
         SiteMapIndex index = (SiteMapIndex) asm;
-        Collection<AbstractSiteMap> sitemapUrls = index.getSitemaps();
+        Collection<AbstractSiteMap> sitemapUrls = index.getSitemaps(true);
         if (sitemapUrls.isEmpty()) {
           return;
