This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 312828602 NUTCH-2976 SitemapProcessor: verify sitemap values
 added from sitemap to CrawlDB (priority, modification time and change
 frequency) - use default priority if priority <= 0.0 (a CrawlDatum with
 score 0.0 is not eligible for fetch) - ensure that the fetch interval (from
 change frequency) is within db.fetch.schedule.adaptive.min_interval and
 db.fetch.schedule.adaptive.max_interval - ignore last-modified times in the
 future
312828602 is described below

commit 312828602e09de2b59103f880b492fb994226108
Author: Sebastian Nagel <[email protected]>
AuthorDate: Fri Jul 11 16:11:32 2025 +0200

    NUTCH-2976 SitemapProcessor: verify sitemap values
               added from sitemap to CrawlDB (priority,
               modification time and change frequency)
    - use default priority if priority <= 0.0
      (a CrawlDatum with score 0.0 is not eligible for fetch)
    - ensure that the fetch interval (from change frequency)
      is within db.fetch.schedule.adaptive.min_interval
      and db.fetch.schedule.adaptive.max_interval
    - ignore last-modified times in the future
---
 .../org/apache/nutch/util/SitemapProcessor.java    | 31 +++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java 
b/src/java/org/apache/nutch/util/SitemapProcessor.java
index 043e77f69..f20a3a956 100644
--- a/src/java/org/apache/nutch/util/SitemapProcessor.java
+++ b/src/java/org/apache/nutch/util/SitemapProcessor.java
@@ -106,6 +106,8 @@ public class SitemapProcessor extends Configured implements 
Tool {
     private boolean normalize = true;
     private boolean tryDefaultSitemapXml = true;
     private int maxRedir = 3;
+    private float minFetchInterval = 60f;
+    private float maxFetchInterval = 31536000f; // one year
     private URLFilters filters = null;
     private URLNormalizers normalizers = null;
     private CrawlDatum datum = new CrawlDatum();
@@ -124,6 +126,11 @@ public class SitemapProcessor extends Configured 
implements Tool {
       this.tryDefaultSitemapXml = 
conf.getBoolean(SITEMAP_ALWAYS_TRY_SITEMAPXML_ON_ROOT, true);
       this.maxRedir = conf.getInt(SITEMAP_REDIR_MAX, 3);
       this.parser = new SiteMapParser(strict);
+      this.minFetchInterval = conf
+          .getFloat("db.fetch.schedule.adaptive.min_interval", (float) 60.0);
+      this.maxFetchInterval = conf.getFloat(
+          "db.fetch.schedule.adaptive.max_interval",
+          (float) 31536000.0 /* one year */);
 
       if (filter) {
         filters = new URLFilters(conf);
@@ -251,7 +258,8 @@ public class SitemapProcessor extends Configured implements 
Tool {
         return;
       }
 
-      AbstractSiteMap asm = parser.parseSiteMap(content.getContentType(), 
content.getContent(), new URL(url));
+      AbstractSiteMap asm = parser.parseSiteMap(content.getContentType(),
+          content.getContent(), new URL(url));
 
       if(asm instanceof SiteMap) {
         LOG.info("Parsing sitemap file: {}", asm.getUrl().toString());
@@ -265,7 +273,13 @@ public class SitemapProcessor extends Configured 
implements Tool {
             if (key != null) {
               CrawlDatum sitemapUrlDatum = new CrawlDatum();
               sitemapUrlDatum.setStatus(CrawlDatum.STATUS_INJECTED);
-              sitemapUrlDatum.setScore((float) sitemapUrl.getPriority());
+              float priority = (float) sitemapUrl.getPriority();
+              if (priority > .0f) {
+                sitemapUrlDatum.setScore(priority);
+              } else {
+                // score == 0 would mean not fetch, use default priority (0.5) 
instead
+                sitemapUrlDatum.setScore((float) SiteMapURL.DEFAULT_PRIORITY);
+              }
 
               if(sitemapUrl.getChangeFrequency() != null) {
                 int fetchInterval = -1;
@@ -278,10 +292,21 @@ public class SitemapProcessor extends Configured 
implements Tool {
                   case YEARLY:  fetchInterval = 31536000; break; // 
60*60*24*365
                   case NEVER:   fetchInterval = Integer.MAX_VALUE; break; // 
Loose "NEVER" contract
                 }
+                /*
+                 * ensure that the fetch interval is within the min and max
+                 * interval
+                 */
+                if (fetchInterval > maxFetchInterval) {
+                  fetchInterval = (int) maxFetchInterval;
+                } else if (fetchInterval < minFetchInterval) {
+                  fetchInterval = (int) minFetchInterval;
+                }
                 sitemapUrlDatum.setFetchInterval(fetchInterval);
               }
 
-              if(sitemapUrl.getLastModified() != null) {
+              if (sitemapUrl.getLastModified() != null
+                  && sitemapUrl.getLastModified().getTime() <= 
System.currentTimeMillis()) {
+                // set modified time if not in the future
                 
sitemapUrlDatum.setModifiedTime(sitemapUrl.getLastModified().getTime());
               }
 

Reply via email to