Author: markus Date: Fri Sep 9 11:13:54 2011 New Revision: 1167096 URL: http://svn.apache.org/viewvc?rev=1167096&view=rev Log: NUTCH-1101 Option to purge db_gone records from CrawlDB
Modified: nutch/branches/branch-1.4/CHANGES.txt nutch/branches/branch-1.4/conf/nutch-default.xml nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDb.java nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDbFilter.java Modified: nutch/branches/branch-1.4/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/CHANGES.txt?rev=1167096&r1=1167095&r2=1167096&view=diff ============================================================================== --- nutch/branches/branch-1.4/CHANGES.txt (original) +++ nutch/branches/branch-1.4/CHANGES.txt Fri Sep 9 11:13:54 2011 @@ -2,6 +2,8 @@ Nutch Change Log Release 1.4 - Current development +* NUTCH-1101 Option to purge db_gone records with updatedb (markus) + * NUTCH-1096 Empty (not null) ContentLength results in failure of fetch (Ferdy Galema via jnioche) * NUTCH-1073 Rename parameters 'fetcher.threads.per.host.by.ip' and 'fetcher.threads.per.host' (jnioche) Modified: nutch/branches/branch-1.4/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/conf/nutch-default.xml?rev=1167096&r1=1167095&r2=1167096&view=diff ============================================================================== --- nutch/branches/branch-1.4/conf/nutch-default.xml (original) +++ nutch/branches/branch-1.4/conf/nutch-default.xml Fri Sep 9 11:13:54 2011 @@ -404,6 +404,14 @@ </property> <property> + <name>db.update.purge.404</name> + <value>false</value> + <description>If true, updatedb will purge records with status DB_GONE + from the CrawlDB. 
+ </description> +</property> + +<property> <name>db.update.max.inlinks</name> <value>10000</value> <description>Maximum number of inlinks to take into account when updating Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDb.java URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=1167096&r1=1167095&r2=1167096&view=diff ============================================================================== --- nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDb.java (original) +++ nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDb.java Fri Sep 9 11:13:54 2011 @@ -46,6 +46,8 @@ public class CrawlDb extends Configured public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed"; + public static final String CRAWLDB_PURGE_404 = "db.update.purge.404"; + public static final String CURRENT_NAME = "current"; public static final String LOCK_NAME = ".locked"; @@ -57,7 +59,7 @@ public class CrawlDb extends Configured } public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter) throws IOException { - boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true); + boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true); update(crawlDb, segments, normalize, filter, additionsAllowed, false); } @@ -67,6 +69,14 @@ public class CrawlDb extends Configured LockUtil.createLockFile(fs, lock, force); SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); + + JobConf job = CrawlDb.createJob(getConf(), crawlDb); + job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed); + job.setBoolean(CrawlDbFilter.URL_FILTERING, filter); + job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize); + + boolean url404Purging = job.getBoolean(CRAWLDB_PURGE_404, false); + if (LOG.isInfoEnabled()) { LOG.info("CrawlDb update: starting at " + 
sdf.format(start)); LOG.info("CrawlDb update: db: " + crawlDb); @@ -74,12 +84,9 @@ public class CrawlDb extends Configured LOG.info("CrawlDb update: additions allowed: " + additionsAllowed); LOG.info("CrawlDb update: URL normalizing: " + normalize); LOG.info("CrawlDb update: URL filtering: " + filter); + LOG.info("CrawlDb update: 404 purging: " + url404Purging); } - JobConf job = CrawlDb.createJob(getConf(), crawlDb); - job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed); - job.setBoolean(CrawlDbFilter.URL_FILTERING, filter); - job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize); for (int i = 0; i < segments.length; i++) { Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME); Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME); @@ -166,11 +173,13 @@ public class CrawlDb extends Configured System.err.println("\t-normalize\tuse URLNormalizer on urls in CrawlDb and segment (usually not needed)"); System.err.println("\t-filter\tuse URLFilters on urls in CrawlDb and segment"); System.err.println("\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs"); + return -1; } boolean normalize = false; boolean filter = false; boolean force = false; + boolean url404Purging = false; final FileSystem fs = FileSystem.get(getConf()); boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true); HashSet<Path> dirs = new HashSet<Path>(); Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDbFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDbFilter.java?rev=1167096&r1=1167095&r2=1167096&view=diff ============================================================================== --- nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDbFilter.java (original) +++ nutch/branches/branch-1.4/src/java/org/apache/nutch/crawl/CrawlDbFilter.java Fri Sep 9 11:13:54 2011 @@ -46,6 +46,8 @@ public class CrawlDbFilter implements 
Ma private boolean urlNormalizers; + private boolean url404Purging; + private URLFilters filters; private URLNormalizers normalizers; @@ -57,6 +59,8 @@ public class CrawlDbFilter implements Ma public void configure(JobConf job) { urlFiltering = job.getBoolean(URL_FILTERING, false); urlNormalizers = job.getBoolean(URL_NORMALIZING, false); + url404Purging = job.getBoolean(CrawlDb.CRAWLDB_PURGE_404, false); + if (urlFiltering) { filters = new URLFilters(job); } @@ -75,6 +79,11 @@ public class CrawlDbFilter implements Ma Reporter reporter) throws IOException { String url = key.toString(); + + // https://issues.apache.org/jira/browse/NUTCH-1101 check status first, cheaper than normalizing or filtering + if (url404Purging && CrawlDatum.STATUS_DB_GONE == value.getStatus()) { + url = null; + } if (urlNormalizers) { try { url = normalizers.normalize(url, scope); // normalize the url