This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 312828602 NUTCH-2976 SitemapProcessor: verify sitemap values added from sitemap to CrawlDB (priority, modification time and change frequency)
 - use default priority if priority <= 0.0 (a CrawlDatum with score 0.0 is not eligible for fetch)
 - ensure that the fetch interval (from change frequency) is within db.fetch.schedule.adaptive.min_interval and db.fetch.schedule.adaptive.max_interval
 - ignore last-modified times in the future
312828602 is described below
commit 312828602e09de2b59103f880b492fb994226108
Author: Sebastian Nagel <[email protected]>
AuthorDate: Fri Jul 11 16:11:32 2025 +0200
NUTCH-2976 SitemapProcessor: verify sitemap values
added from sitemap to CrawlDB (priority,
modification time and change frequency)
- use default priority if priority <= 0.0
(a CrawlDatum with score 0.0 is not eligible for fetch)
- ensure that the fetch interval (from change frequency)
is within db.fetch.schedule.adaptive.min_interval
and db.fetch.schedule.adaptive.max_interval
- ignore last-modified times in the future
---
.../org/apache/nutch/util/SitemapProcessor.java | 31 +++++++++++++++++++---
1 file changed, 28 insertions(+), 3 deletions(-)
diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java
index 043e77f69..f20a3a956 100644
--- a/src/java/org/apache/nutch/util/SitemapProcessor.java
+++ b/src/java/org/apache/nutch/util/SitemapProcessor.java
@@ -106,6 +106,8 @@ public class SitemapProcessor extends Configured implements Tool {
private boolean normalize = true;
private boolean tryDefaultSitemapXml = true;
private int maxRedir = 3;
+ private float minFetchInterval = 60f;
+ private float maxFetchInterval = 31536000f; // one year
private URLFilters filters = null;
private URLNormalizers normalizers = null;
private CrawlDatum datum = new CrawlDatum();
@@ -124,6 +126,11 @@ public class SitemapProcessor extends Configured implements Tool {
this.tryDefaultSitemapXml =
conf.getBoolean(SITEMAP_ALWAYS_TRY_SITEMAPXML_ON_ROOT, true);
this.maxRedir = conf.getInt(SITEMAP_REDIR_MAX, 3);
this.parser = new SiteMapParser(strict);
+ this.minFetchInterval = conf
+ .getFloat("db.fetch.schedule.adaptive.min_interval", (float) 60.0);
+ this.maxFetchInterval = conf.getFloat(
+ "db.fetch.schedule.adaptive.max_interval",
+ (float) 31536000.0 /* one year */);
if (filter) {
filters = new URLFilters(conf);
@@ -251,7 +258,8 @@ public class SitemapProcessor extends Configured implements Tool {
return;
}
- AbstractSiteMap asm = parser.parseSiteMap(content.getContentType(), content.getContent(), new URL(url));
+ AbstractSiteMap asm = parser.parseSiteMap(content.getContentType(),
+ content.getContent(), new URL(url));
if(asm instanceof SiteMap) {
LOG.info("Parsing sitemap file: {}", asm.getUrl().toString());
@@ -265,7 +273,13 @@ public class SitemapProcessor extends Configured implements Tool {
if (key != null) {
CrawlDatum sitemapUrlDatum = new CrawlDatum();
sitemapUrlDatum.setStatus(CrawlDatum.STATUS_INJECTED);
- sitemapUrlDatum.setScore((float) sitemapUrl.getPriority());
+ float priority = (float) sitemapUrl.getPriority();
+ if (priority > .0f) {
+ sitemapUrlDatum.setScore(priority);
+ } else {
+ // score == 0 would mean not fetch, use default priority (0.5) instead
+ sitemapUrlDatum.setScore((float) SiteMapURL.DEFAULT_PRIORITY);
+ }
if(sitemapUrl.getChangeFrequency() != null) {
int fetchInterval = -1;
@@ -278,10 +292,21 @@ public class SitemapProcessor extends Configured implements Tool {
case YEARLY: fetchInterval = 31536000; break; // 60*60*24*365
case NEVER: fetchInterval = Integer.MAX_VALUE; break; // Loose "NEVER" contract
}
+ /*
+ * ensure that the fetch interval is within the min and max
+ * interval
+ */
+ if (fetchInterval > maxFetchInterval) {
+ fetchInterval = (int) maxFetchInterval;
+ } else if (fetchInterval < minFetchInterval) {
+ fetchInterval = (int) minFetchInterval;
+ }
sitemapUrlDatum.setFetchInterval(fetchInterval);
}
- if(sitemapUrl.getLastModified() != null) {
+ if (sitemapUrl.getLastModified() != null
+ && sitemapUrl.getLastModified().getTime() <= System.currentTimeMillis()) {
+ // set modified time if not in the future
sitemapUrlDatum.setModifiedTime(sitemapUrl.getLastModified().getTime());
}