Author: jnioche
Date: Tue Mar 30 08:30:28 2010
New Revision: 929038

URL: http://svn.apache.org/viewvc?rev=929038&view=rev
Log:
NUTCH-779 Mechanism for passing metadata from parse to crawldb
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=929038&r1=929037&r2=929038&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Mar 30 08:30:28 2010 @@ -2,6 +2,8 @@ Nutch Change Log Unreleased Changes +* NUTCH-779 Mechanism for passing metadata from parse to crawldb (jnioche) + * NUTCH-784 CrawlDBScanner (jnioche) * NUTCH-762 Generator can generate several segments in one parse of the crawlDB (jnioche) Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=929038&r1=929037&r2=929038&view=diff ============================================================================== --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Tue Mar 30 08:30:28 2010 @@ -479,6 +479,15 @@ </description> </property> + <property> + <name>db.parsemeta.to.crawldb</name> + <value></value> + <description>Comma-separated list of parse metadata keys to transfer to the crawldb (NUTCH-779). + Assuming for instance that the languageidentifier plugin is enabled, setting the value to 'lang' + will copy both the key 'lang' and its value to the corresponding entry in the crawldb. 
+ </description> +</property> + <property> <name>db.fetch.retry.max</name> <value>3</value> Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=929038&r1=929037&r2=929038&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Tue Mar 30 08:30:28 2010 @@ -82,6 +82,8 @@ public class CrawlDatum implements Writa public static final byte STATUS_INJECTED = 0x42; /** Page discovered through a link. */ public static final byte STATUS_LINKED = 0x43; + /** Page got metadata from a parser */ + public static final byte STATUS_PARSE_META = 0x44; public static final HashMap<Byte, String> statNames = new HashMap<Byte, String>(); @@ -101,6 +103,7 @@ public class CrawlDatum implements Writa statNames.put(STATUS_FETCH_REDIR_PERM, "fetch_redir_perm"); statNames.put(STATUS_FETCH_GONE, "fetch_gone"); statNames.put(STATUS_FETCH_NOTMODIFIED, "fetch_notmodified"); + statNames.put(STATUS_PARSE_META, "parse_metadata"); oldToNew.put(OLD_STATUS_DB_UNFETCHED, STATUS_DB_UNFETCHED); oldToNew.put(OLD_STATUS_DB_FETCHED, STATUS_DB_FETCHED); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=929038&r1=929037&r2=929038&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Tue Mar 30 08:30:28 2010 @@ -20,6 +20,7 @@ package org.apache.nutch.crawl; import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import java.util.Map.Entry; import 
java.io.IOException; // Commons Logging imports @@ -71,7 +72,8 @@ public class CrawlDbReducer implements R byte[] signature = null; boolean multiple = false; // avoid deep copy when only single value exists linked.clear(); - + org.apache.hadoop.io.MapWritable metaFromParse = null; + while (values.hasNext()) { CrawlDatum datum = (CrawlDatum)values.next(); if (!multiple && values.hasNext()) multiple = true; @@ -120,6 +122,9 @@ public class CrawlDbReducer implements R case CrawlDatum.STATUS_SIGNATURE: signature = datum.getSignature(); break; + case CrawlDatum.STATUS_PARSE_META: + metaFromParse = datum.getMetaData(); + break; default: LOG.warn("Unknown status, key: " + key + ", datum: " + datum); } @@ -233,6 +238,11 @@ public class CrawlDbReducer implements R else result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED); } result.setSignature(signature); + if (metaFromParse != null) { + for (Entry<Writable, Writable> e : metaFromParse.entrySet()) { + result.getMetaData().put(e.getKey(), e.getValue()); + } + } } // if fetchInterval is larger than the system-wide maximum, trigger // an unconditional recrawl. 
This prevents the page to be stuck at Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=929038&r1=929037&r2=929038&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Tue Mar 30 08:30:28 2010 @@ -88,7 +88,8 @@ implements Mapper<Text, Writable, Text, if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) fetchDatum = datum; } else if (CrawlDatum.STATUS_LINKED == datum.getStatus() || - CrawlDatum.STATUS_SIGNATURE == datum.getStatus()) { + CrawlDatum.STATUS_SIGNATURE == datum.getStatus() || + CrawlDatum.STATUS_PARSE_META == datum.getStatus()) { continue; } else { throw new RuntimeException("Unexpected status: "+datum.getStatus()); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=929038&r1=929037&r2=929038&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Tue Mar 30 08:30:28 2010 @@ -98,6 +98,8 @@ public class ParseOutputFormat implement Path data = new Path(new Path(out, ParseData.DIR_NAME), name); Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name); + final String[] parseMDtoCrawlDB = job.get("db.parsemeta.to.crawldb","").split(" *, *"); + final MapFile.Writer textOut = new MapFile.Writer(job, fs, text.toString(), Text.class, ParseText.class, CompressionType.RECORD, progress); @@ -133,6 +135,20 @@ public class ParseOutputFormat implement 
crawlOut.append(key, d); } } + + // see if the parse metadata contain things that we'd like + // to pass to the metadata of the crawlDB entry + CrawlDatum parseMDCrawlDatum = null; + for (String mdname : parseMDtoCrawlDB) { + String mdvalue = parse.getData().getParseMeta().get(mdname); + if (mdvalue != null) { + if (parseMDCrawlDatum == null) parseMDCrawlDatum = new CrawlDatum( + CrawlDatum.STATUS_PARSE_META, 0); + parseMDCrawlDatum.getMetaData().put(new Text(mdname), + new Text(mdvalue)); + } + } + if (parseMDCrawlDatum != null) crawlOut.append(key, parseMDCrawlDatum); try { ParseStatus pstatus = parseData.getStatus();