Author: fenglu
Date: Thu Sep 5 14:40:25 2013
New Revision: 1520332
URL: http://svn.apache.org/r1520332
Log:
NUTCH-1556 enabling updatedb to accept batchId
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/bin/crawl
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1520332&r1=1520331&r2=1520332&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Thu Sep 5 14:40:25 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1556 enabling updatedb to accept batchId (kaveh minooie,Feng)
+
* NUTCH-1619 Writes Dmoz Description and Title information to db with snippet
argument ( Yasin Kılınç via feng)
* NUTCH-1631 Display Document Count Added To Solr Server (Furkan KAMACI via
lewismc)
Modified: nutch/branches/2.x/src/bin/crawl
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/crawl?rev=1520332&r1=1520331&r2=1520332&view=diff
==============================================================================
--- nutch/branches/2.x/src/bin/crawl (original)
+++ nutch/branches/2.x/src/bin/crawl Thu Sep 5 14:40:25 2013
@@ -141,7 +141,7 @@ do
# updatedb with this batch
echo "CrawlDB update for $CRAWL_ID"
- $bin/nutch updatedb $commonOptions -crawlId $CRAWL_ID
+ $bin/nutch updatedb $commonOptions -batchId $batchId -crawlId $CRAWL_ID
if [ $? -ne 0 ]
then exit $?
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java?rev=1520332&r1=1520331&r2=1520332&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java Thu Sep 5 14:40:25 2013
@@ -23,6 +23,9 @@ import java.util.Map;
import java.util.Map.Entry;
import org.apache.avro.util.Utf8;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.storage.Mark;
+import org.apache.nutch.util.NutchJob;
import org.slf4j.Logger;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.util.StringUtils;
@@ -41,6 +44,8 @@ extends GoraMapper<String, WebPage, UrlW
private ScoringFilters scoringFilters;
private final List<ScoreDatum> scoreData = new ArrayList<ScoreDatum>();
+
+ private Utf8 batchId;
//reuse writables
private UrlWithScore urlWithScore = new UrlWithScore();
@@ -51,6 +56,14 @@ extends GoraMapper<String, WebPage, UrlW
public void map(String key, WebPage page, Context context)
throws IOException, InterruptedException {
+ Utf8 mark = Mark.GENERATE_MARK.checkMark(page);
+ if(!NutchJob.shouldProcess(mark,batchId)) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Skipping " + TableUtil.unreverseUrl(key) + "; different batch id (" + mark + ")");
+ }
+ return;
+ }
+
String url = TableUtil.unreverseUrl(key);
scoreData.clear();
@@ -93,6 +106,7 @@ extends GoraMapper<String, WebPage, UrlW
public void setup(Context context) {
scoringFilters = new ScoringFilters(context.getConfiguration());
pageWritable = new WebPageWritable(context.getConfiguration(), null);
+ batchId = new Utf8(context.getConfiguration().get(Nutch.BATCH_NAME_KEY,Nutch.ALL_BATCH_ID_STR));
}
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java?rev=1520332&r1=1520331&r2=1520332&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java Thu Sep 5 14:40:25 2013
@@ -73,12 +73,17 @@ public class DbUpdaterJob extends NutchT
public Map<String,Object> run(Map<String,Object> args) throws Exception {
String crawlId = (String)args.get(Nutch.ARG_CRAWL);
+ String batchId = (String)args.get(Nutch.ARG_BATCH);
numJobs = 1;
currentJobNum = 0;
currentJob = new NutchJob(getConf(), "update-table");
if (crawlId != null) {
currentJob.getConfiguration().set(Nutch.CRAWL_ID_KEY, crawlId);
}
+ if (batchId == null) {
+ batchId = Nutch.ALL_BATCH_ID_STR;
+ }
+ getConf().set(Nutch.BATCH_NAME_KEY, batchId);
//job.setBoolean(ALL, updateAll);
ScoringFilters scoringFilters = new ScoringFilters(getConf());
HashSet<WebPage.Field> fields = new HashSet<WebPage.Field>(FIELDS);
@@ -100,23 +105,46 @@ public class DbUpdaterJob extends NutchT
return results;
}
- private int updateTable(String crawlId) throws Exception {
+ private int updateTable(String crawlId,String batchId) throws Exception {
LOG.info("DbUpdaterJob: starting");
- run(ToolUtil.toArgMap(Nutch.ARG_CRAWL, crawlId));
+ if (batchId.equals(Nutch.ALL_BATCH_ID_STR)) {
+ LOG.info("DbUpdaterJob: updatinging all");
+ } else {
+ LOG.info("DbUpdaterJob: batchId: " + batchId);
+ }
+ run(ToolUtil.toArgMap(Nutch.ARG_CRAWL, crawlId,
+ Nutch.ARG_BATCH, batchId));
LOG.info("DbUpdaterJob: done");
return 0;
}
public int run(String[] args) throws Exception {
String crawlId = null;
+ String batchId;
+
+ String usage = "Usage: DbUpdaterJob (<batchId> | -all) [-crawlId <id>] " +
+ " <batchId> - crawl identifier returned by Generator, or -all for all \n \t \t generated batchId-s\n" +
+ " -crawlId <id> - the id to prefix the schemas to operate on, \n \t \t (default: storage.crawl.id)\n";
+
if (args.length == 0) {
- //
- } else if (args.length == 2 && "-crawlId".equals(args[0])) {
- crawlId = args[1];
- } else {
- throw new IllegalArgumentException("usage: " + "(-crawlId <id>)");
+ System.err.println(usage);
+ return -1;
+ }
+
+ batchId = args[0];
+ if (!batchId.equals("-all") && batchId.startsWith("-")) {
+ System.err.println(usage);
+ return -1;
+ }
+
+ for (int i = 1; i < args.length; i++) {
+ if ("-crawlId".equals(args[i])) {
+ getConf().set(Nutch.CRAWL_ID_KEY, args[++i]);
+ } else {
+ throw new IllegalArgumentException("arg " +args[i]+ " not recognized");
+ }
}
- return updateTable(crawlId);
+ return updateTable(crawlId,batchId);
}
public static void main(String[] args) throws Exception {