This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 3fe291b NUTCH-2807 SitemapProcessor to warn that ignoring robots.txt
affects detection of sitemaps (#710)
3fe291b is described below
commit 3fe291bd00fe4152c6e496cbac151c0f094856c8
Author: Sebastian Nagel <[email protected]>
AuthorDate: Fri Dec 17 09:59:19 2021 +0100
NUTCH-2807 SitemapProcessor to warn that ignoring robots.txt affects
detection of sitemaps (#710)
---
.../org/apache/nutch/util/SitemapProcessor.java | 39 ++++++++++++++--------
1 file changed, 26 insertions(+), 13 deletions(-)
diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java
index bcbacdd..b191f23 100644
--- a/src/java/org/apache/nutch/util/SitemapProcessor.java
+++ b/src/java/org/apache/nutch/util/SitemapProcessor.java
@@ -63,21 +63,29 @@ import crawlercommons.sitemaps.SiteMapParser;
import crawlercommons.sitemaps.SiteMapURL;
/**
- * <p>Performs Sitemap processing by fetching sitemap links, parsing the content and merging
- * the urls from Sitemap (with the metadata) with the existing crawldb.</p>
+ * <p>
+ * Performs <a href="https://sitemaps.org/">sitemap</a> processing by fetching
+ * sitemap links, parsing the content and merging the URLs from sitemaps (with
+ * the metadata) into the CrawlDb.
+ * </p>
*
- * <p>There are two use cases supported in Nutch's Sitemap processing:</p>
+ * <p>
+ * There are two use cases supported in Nutch's sitemap processing:
+ * </p>
* <ol>
- * <li>Sitemaps are considered as "remote seed lists". Crawl administrators can prepare a
- * list of sitemap links and get only those sitemap pages. This suits well for targeted
- * crawl of specific hosts.</li>
- * <li>For open web crawl, it is not possible to track each host and get the sitemap links
- * manually. Nutch would automatically get the sitemaps for all the hosts seen in the
- * crawls and inject the urls from sitemap to the crawldb.</li>
+ * <li>Sitemaps are considered as "remote seed lists". Crawl administrators can
+ * prepare a list of sitemap links and inject and fetch only the pages listed in
+ * the sitemaps. This suits well for targeted crawl of specific hosts.</li>
+ * <li>For an open web crawl, it is not possible to track each host and get the
+ * sitemap links manually. Nutch automatically detects the sitemaps for all
+ * hosts seen in the crawls and present in the HostDb and injects the URLs from
+ * the sitemaps into the CrawlDb.</li>
* </ol>
*
- * <p>For more details see:
- * https://cwiki.apache.org/confluence/display/NUTCH/SitemapFeature </p>
+ * <p>
+ * For more details see:
+ * https://cwiki.apache.org/confluence/display/NUTCH/SitemapFeature
+ * </p>
*/
public class SitemapProcessor extends Configured implements Tool {
   public static final Logger LOG = LoggerFactory.getLogger(SitemapProcessor.class);
@@ -181,7 +189,7 @@ public class SitemapProcessor extends Configured implements Tool {
   private void generateSitemapsFromHostname(String host, Context context) {
     try {
       // For entry from hostdb, get sitemap url(s) from robots.txt, fetch the sitemap,
-      // extract urls and emit those
+      // extract URLs and emit those
       // try different combinations of schemes one by one till we get rejection in all cases
       String url;
@@ -385,8 +393,13 @@ public class SitemapProcessor extends Configured implements Tool {
     if (sitemapUrlDir != null)
       MultipleInputs.addInputPath(job, sitemapUrlDir, KeyValueTextInputFormat.class);
-    if (hostdb != null)
+    if (hostdb != null) {
       MultipleInputs.addInputPath(job, new Path(hostdb, CURRENT_NAME), SequenceFileInputFormat.class);
+      if (conf.getStrings("http.robot.rules.allowlist") != null) {
+        LOG.warn("Non-empty property \"http.robot.rules.allowlist\":"
+            + " sitemap discovery via robots.txt is not possible for the listed hosts!");
+      }
+    }
     FileOutputFormat.setOutputPath(job, tempCrawlDb);