Author: markus Date: Fri Sep 9 11:13:54 2011 New Revision: 1167096 URL: http://svn.apache.org/viewvc?rev=1167096&view=rev Log: NUTCH-1101 Option to purge db_gone records from CrawlDB
Modified: nutch/branches/branch-1.4/CHANGES.txt nutch/branches/branch-1.4/conf/nutch-default.xml nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDb.java nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDbFilter.java Modified: nutch/branches/branch-1.4/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/CHANGES.txt?rev=1167096&r1=1167095&r2=1167096&view=diff ============================================================================== --- nutch/branches/branch-1.4/CHANGES.txt (original) +++ nutch/branches/branch-1.4/CHANGES.txt Fri Sep 9 11:13:54 2011 @@ -2,6 +2,8 @@ Nutch Change Log Release 1.4 - Current development +* NUTCH-1101 Option to purge db_gone records with updatedb (markus) + * NUTCH-1096 Empty (not null) ContentLength results in failure of fetch (Ferdy Galema via jnioche) * NUTCH-1073 Rename parameters 'fetcher.threads.per.host.by.ip' and 'fetcher.threads.per.host' (jnioche) Modified: nutch/branches/branch-1.4/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/conf/nutch-default.xml?rev=1167096&r1=1167095&r2=1167096&view=diff ============================================================================== --- nutch/branches/branch-1.4/conf/nutch-default.xml (original) +++ nutch/branches/branch-1.4/conf/nutch-default.xml Fri Sep 9 11:13:54 2011 @@ -404,6 +404,14 @@ </property> <property> + <name>db.update.purge.404</name> + <value>false</value> + <description>If true, updatedb will purge records with status DB_GONE + from the CrawlDB. 
+ </description> +</property> + +<property> <name>db.update.max.inlinks</name> <value>10000</value> <description>Maximum number of inlinks to take into account when updating Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDb.java URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=1167096&r1=1167095&r2=1167096&view=diff ============================================================================== --- nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDb.java (original) +++ nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDb.java Fri Sep 9 11:13:54 2011 @@ -46,6 +46,8 @@ public class CrawlDb extends Configured public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed"; + public static final String CRAWLDB_PURGE_404 = "db.update.purge.404"; + public static final String CURRENT_NAME = "current"; public static final String LOCK_NAME = ".locked"; @@ -57,7 +59,7 @@ public class CrawlDb extends Configured } public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter) throws IOException { - boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true); + boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true); update(crawlDb, segments, normalize, filter, additionsAllowed, false); } @@ -67,6 +69,14 @@ public class CrawlDb extends Configured LockUtil.createLockFile(fs, lock, force); SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); + + JobConf job = CrawlDb.createJob(getConf(), crawlDb); + job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed); + job.setBoolean(CrawlDbFilter.URL_FILTERING, filter); + job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize); + + boolean url404Purging = job.getBoolean(CRAWLDB_PURGE_404, false); + if (LOG.isInfoEnabled()) { LOG.info("CrawlDb update: starting at " + 
sdf.format(start)); LOG.info("CrawlDb update: db: " + crawlDb); @@ -74,12 +84,9 @@ public class CrawlDb extends Configured LOG.info("CrawlDb update: additions allowed: " + additionsAllowed); LOG.info("CrawlDb update: URL normalizing: " + normalize); LOG.info("CrawlDb update: URL filtering: " + filter); + LOG.info("CrawlDb update: 404 purging: " + url404Purging); } - JobConf job = CrawlDb.createJob(getConf(), crawlDb); - job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed); - job.setBoolean(CrawlDbFilter.URL_FILTERING, filter); - job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize); for (int i = 0; i < segments.length; i++) { Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME); Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME); @@ -166,11 +173,13 @@ public class CrawlDb extends Configured System.err.println("\t-normalize\tuse URLNormalizer on urls in CrawlDb and segment (usually not needed)"); System.err.println("\t-filter\tuse URLFilters on urls in CrawlDb and segment"); System.err.println("\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs"); + return -1; } boolean normalize = false; boolean filter = false; boolean force = false; + boolean url404Purging = false; final FileSystem fs = FileSystem.get(getConf()); boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true); HashSet<Path> dirs = new HashSet<Path>(); Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDbFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDbFilter.java?rev=1167096&r1=1167095&r2=1167096&view=diff ============================================================================== --- nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDbFilter.java (original) +++ nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDbFilter.java Fri Sep 9 11:13:54 2011 @@ -46,6 +46,8 @@ public class CrawlDbFilter implements 
Ma private boolean urlNormalizers; + private boolean url404Purging; + private URLFilters filters; private URLNormalizers normalizers; @@ -57,6 +59,8 @@ public class CrawlDbFilter implements Ma public void configure(JobConf job) { urlFiltering = job.getBoolean(URL_FILTERING, false); urlNormalizers = job.getBoolean(URL_NORMALIZING, false); + url404Purging = job.getBoolean(CrawlDb.CRAWLDB_PURGE_404, false); + if (urlFiltering) { filters = new URLFilters(job); } @@ -75,6 +79,11 @@ public class CrawlDbFilter implements Ma Reporter reporter) throws IOException { String url = key.toString(); + + // https://issues.apache.org/jira/browse/NUTCH-1101 check status first, cheaper than normalizing or filtering + if (url404Purging && CrawlDatum.STATUS_DB_GONE == value.getStatus()) { + url = null; + } if (urlNormalizers) { try { url = normalizers.normalize(url, scope); // normalize the url