Author: markus
Date: Tue Jul 19 12:49:58 2011
New Revision: 1148301

URL: http://svn.apache.org/viewvc?rev=1148301&view=rev
Log:
NUTCH-1050 Add segmentDir to WebGraph

Modified:
    nutch/branches/branch-1.4/conf/log4j.properties
    nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
    nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java

Modified: nutch/branches/branch-1.4/conf/log4j.properties
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/conf/log4j.properties?rev=1148301&r1=1148300&r2=1148301&view=diff
==============================================================================
--- nutch/branches/branch-1.4/conf/log4j.properties (original)
+++ nutch/branches/branch-1.4/conf/log4j.properties Tue Jul 19 12:49:58 2011
@@ -27,6 +27,11 @@ log4j.logger.org.apache.nutch.indexer.so
 log4j.logger.org.apache.nutch.indexer.solr.SolrDeleteDuplicates=INFO,cmdstdout
 log4j.logger.org.apache.nutch.indexer.solr.SolrClean=INFO,cmdstdout
 log4j.logger.org.apache.nutch.scoring.webgraph.WebGraph=INFO,cmdstdout
+log4j.logger.org.apache.nutch.scoring.webgraph.LinkRank=INFO,cmdstdout
+log4j.logger.org.apache.nutch.scoring.webgraph.Loops=INFO,cmdstdout
+log4j.logger.org.apache.nutch.scoring.webgraph.ScoreUpdater=INFO,cmdstdout
+log4j.logger.org.apache.nutch.parse.ParserChecker=INFO,cmdstdout
+log4j.logger.org.apache.nutch.indexer.IndexingFiltersChecker=INFO,cmdstdout
 
 log4j.logger.org.apache.nutch=INFO
 log4j.logger.org.apache.hadoop=WARN

Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java?rev=1148301&r1=1148300&r2=1148301&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java Tue Jul 19 12:49:58 2011
@@ -538,7 +538,7 @@ public class LinkRank
         + (this.dampingFactor * totalInlinkScore);
 
       LOG.info(url + ": score: " + linkRankScore + " num inlinks: "
-        + numInlinks + " iteration: " + itNum + "\n");
+        + numInlinks + " iteration: " + itNum);
 
       // store the score in a temporary NodeDb
       Node outNode = (Node)WritableUtils.clone(node, conf);

Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java?rev=1148301&r1=1148300&r2=1148301&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java Tue Jul 19 12:49:58 2011
@@ -38,6 +38,7 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
@@ -61,6 +62,7 @@ import org.apache.nutch.net.URLNormalize
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.util.FSUtils;
+import org.apache.nutch.util.HadoopFSUtil;
 import org.apache.nutch.util.LockUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
@@ -617,26 +619,44 @@ public class WebGraph
       "the web graph database to use").create("webgraphdb");
     Option segOpts = OptionBuilder.withArgName("segment").hasArgs().withDescription(
       "the segment(s) to use").create("segment");
+    Option segDirOpts = OptionBuilder.withArgName("segmentDir").hasArgs().withDescription(
+      "the segment directory to use").create("segmentDir");
     options.addOption(helpOpts);
     options.addOption(webGraphDbOpts);
     options.addOption(segOpts);
+    options.addOption(segDirOpts);
 
     CommandLineParser parser = new GnuParser();
     try {
 
       CommandLine line = parser.parse(options, args);
       if (line.hasOption("help") || !line.hasOption("webgraphdb")
-        || !line.hasOption("segment")) {
+        || (!line.hasOption("segment") && !line.hasOption("segmentDir"))) {
         HelpFormatter formatter = new HelpFormatter();
         formatter.printHelp("WebGraph", options);
         return -1;
       }
 
       String webGraphDb = line.getOptionValue("webgraphdb");
-      String[] segments = line.getOptionValues("segment");
-      Path[] segPaths = new Path[segments.length];
-      for (int i = 0; i < segments.length; i++) {
-        segPaths[i] = new Path(segments[i]);
+
+      Path[] segPaths = null;
+
+      // Handle segment option
+      if (line.hasOption("segment")) {
+        String[] segments = line.getOptionValues("segment");
+        segPaths = new Path[segments.length];
+        for (int i = 0; i < segments.length; i++) {
+          segPaths[i] = new Path(segments[i]);
+        }
+      }
+
+      // Handle segmentDir option
+      if (line.hasOption("segmentDir")) {
+        Path dir = new Path(line.getOptionValue("segmentDir"));
+        FileSystem fs = dir.getFileSystem(getConf());
+        FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
+        segPaths = HadoopFSUtil.getPaths(fstats);
       }
 
       createWebGraph(new Path(webGraphDb), segPaths);


Reply via email to