This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 7b163542e4d319f95a7d4d06db77d910250bceb0
Author: Sebastian Nagel <[email protected]>
AuthorDate: Mon Jul 6 14:03:33 2020 +0200

    [NUTCH-2730] SitemapProcessor to treat sitemap URLs as Set instead of List
    - sitemap links from robots.txt are treated as set by crawler-commons
      (since crawler-commons 1.1)
    - sitemaps referenced in sitemap index are deduplicated
---
 src/java/org/apache/nutch/util/SitemapProcessor.java | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java
index da5c7e7..76dcef9 100644
--- a/src/java/org/apache/nutch/util/SitemapProcessor.java
+++ b/src/java/org/apache/nutch/util/SitemapProcessor.java
@@ -51,7 +51,6 @@ import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.util.NutchJob;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -287,7 +286,7 @@ public class SitemapProcessor extends Configured implements Tool {
       }
       else if (asm instanceof SiteMapIndex) {
         SiteMapIndex index = (SiteMapIndex) asm;
-        Collection<AbstractSiteMap> sitemapUrls = index.getSitemaps();
+        Collection<AbstractSiteMap> sitemapUrls = index.getSitemaps(true);
 
         if (sitemapUrls.isEmpty()) {
           return;

Reply via email to