Author: jnioche
Date: Mon Jul 18 09:20:02 2011
New Revision: 1147794
URL: http://svn.apache.org/viewvc?rev=1147794&view=rev
Log:
NUTCH-1054 : linkDB optional during indexing
Modified:
nutch/branches/branch-1.4/CHANGES.txt
nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
Modified: nutch/branches/branch-1.4/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/CHANGES.txt?rev=1147794&r1=1147793&r2=1147794&view=diff
==============================================================================
--- nutch/branches/branch-1.4/CHANGES.txt (original)
+++ nutch/branches/branch-1.4/CHANGES.txt Mon Jul 18 09:20:02 2011
@@ -2,6 +2,8 @@ Nutch Change Log
Release 1.4 - Current development
+* NUTCH-1054 LinkDB optional during indexing (jnioche)
+
* NUTCH-1029 Readdb throws EOFException (markus)
* NUTCH-1036 Solr jobs should increment counters in Reporter (markus)
Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1147794&r1=1147793&r2=1147794&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Mon Jul 18 09:20:02 2011
@@ -23,6 +23,7 @@ import java.util.Iterator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
@@ -172,7 +173,9 @@ implements Mapper<Text, Writable, Text,
JobConf job) {
LOG.info("IndexerMapReduce: crawldb: " + crawlDb);
- LOG.info("IndexerMapReduce: linkdb: " + linkDb);
+
+ if (linkDb!=null)
+ LOG.info("IndexerMapReduce: linkdb: " + linkDb);
for (final Path segment : segments) {
LOG.info("IndexerMapReduces: adding segment: " + segment);
@@ -183,7 +186,10 @@ implements Mapper<Text, Writable, Text,
}
FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
- FileInputFormat.addInputPath(job, new Path(linkDb, LinkDb.CURRENT_NAME));
+
+ if (linkDb!=null)
+ FileInputFormat.addInputPath(job, new Path(linkDb,
LinkDb.CURRENT_NAME));
+
job.setInputFormat(SequenceFileInputFormat.class);
job.setMapperClass(IndexerMapReduce.class);
Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java?rev=1147794&r1=1147793&r2=1147794&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java Mon Jul 18 09:20:02 2011
@@ -99,20 +99,23 @@ public class SolrIndexer extends Configu
}
public int run(String[] args) throws Exception {
- if (args.length < 4) {
- System.err.println("Usage: SolrIndexer <solr url> <crawldb> <linkdb>
(<segment> ... | -dir <segments>) [-noCommit]");
+ if (args.length < 3) {
+ System.err.println("Usage: SolrIndexer <solr url> <crawldb> [-linkdb
<linkdb>] (<segment> ... | -dir <segments>) [-noCommit]");
return -1;
}
final Path crawlDb = new Path(args[1]);
- final Path linkDb = new Path(args[2]);
+ Path linkDb = null;
final List<Path> segments = new ArrayList<Path>();
boolean noCommit = false;
- for (int i = 3; i < args.length; i++) {
- if (args[i].equals("-dir")) {
+ for (int i = 2; i < args.length; i++) {
+ if (args[i].equals("-linkdb")) {
+ linkDb = new Path(args[++i]);
+ }
+ else if (args[i].equals("-dir")) {
Path dir = new Path(args[++i]);
FileSystem fs = dir.getFileSystem(getConf());
FileStatus[] fstats = fs.listStatus(dir,