Author: markus
Date: Tue Jul 19 12:49:58 2011
New Revision: 1148301
URL: http://svn.apache.org/viewvc?rev=1148301&view=rev
Log: NUTCH-1050 Add segmentDir to WebGraph
Modified: nutch/branches/branch-1.4/conf/log4j.properties nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java Modified: nutch/branches/branch-1.4/conf/log4j.properties URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/conf/log4j.properties?rev=1148301&r1=1148300&r2=1148301&view=diff ============================================================================== --- nutch/branches/branch-1.4/conf/log4j.properties (original) +++ nutch/branches/branch-1.4/conf/log4j.properties Tue Jul 19 12:49:58 2011 @@ -27,6 +27,11 @@ log4j.logger.org.apache.nutch.indexer.so log4j.logger.org.apache.nutch.indexer.solr.SolrDeleteDuplicates=INFO,cmdstdout log4j.logger.org.apache.nutch.indexer.solr.SolrClean=INFO,cmdstdout log4j.logger.org.apache.nutch.scoring.webgraph.WebGraph=INFO,cmdstdout +log4j.logger.org.apache.nutch.scoring.webgraph.LinkRank=INFO,cmdstdout +log4j.logger.org.apache.nutch.scoring.webgraph.Loops=INFO,cmdstdout +log4j.logger.org.apache.nutch.scoring.webgraph.ScoreUpdater=INFO,cmdstdout +log4j.logger.org.apache.nutch.parse.ParserChecker=INFO,cmdstdout +log4j.logger.org.apache.nutch.indexer.IndexingFiltersChecker=INFO,cmdstdout log4j.logger.org.apache.nutch=INFO log4j.logger.org.apache.hadoop=WARN Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java?rev=1148301&r1=1148300&r2=1148301&view=diff ============================================================================== --- nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java (original) +++ nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java Tue Jul 19 12:49:58 2011 @@ -538,7 +538,7 @@ public class LinkRank + (this.dampingFactor * totalInlinkScore); LOG.info(url + ": score: " + 
linkRankScore + " num inlinks: " - + numInlinks + " iteration: " + itNum + "\n"); + + numInlinks + " iteration: " + itNum); // store the score in a temporary NodeDb Node outNode = (Node)WritableUtils.clone(node, conf); Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java?rev=1148301&r1=1148300&r2=1148301&view=diff ============================================================================== --- nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java (original) +++ nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java Tue Jul 19 12:49:58 2011 @@ -38,6 +38,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; @@ -61,6 +62,7 @@ import org.apache.nutch.net.URLNormalize import org.apache.nutch.parse.Outlink; import org.apache.nutch.parse.ParseData; import org.apache.nutch.util.FSUtils; +import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.LockUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; @@ -617,26 +619,44 @@ public class WebGraph "the web graph database to use").create("webgraphdb"); Option segOpts = OptionBuilder.withArgName("segment").hasArgs().withDescription( "the segment(s) to use").create("segment"); + Option segDirOpts = OptionBuilder.withArgName("segmentDir").hasArgs().withDescription( + "the segment directory to use").create("segmentDir"); options.addOption(helpOpts); options.addOption(webGraphDbOpts); options.addOption(segOpts); + options.addOption(segDirOpts); CommandLineParser parser = new GnuParser(); try { 
CommandLine line = parser.parse(options, args); if (line.hasOption("help") || !line.hasOption("webgraphdb") - || !line.hasOption("segment")) { + || (!line.hasOption("segment") && !line.hasOption("segmentDir")) +) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("WebGraph", options); return -1; } String webGraphDb = line.getOptionValue("webgraphdb"); - String[] segments = line.getOptionValues("segment"); - Path[] segPaths = new Path[segments.length]; - for (int i = 0; i < segments.length; i++) { - segPaths[i] = new Path(segments[i]); + + Path[] segPaths = null; + + // Handle segment option + if (line.hasOption("segment")) { + String[] segments = line.getOptionValues("segment"); + segPaths = new Path[segments.length]; + for (int i = 0; i < segments.length; i++) { + segPaths[i] = new Path(segments[i]); + } + } + + // Handle segmentDir option + if (line.hasOption("segmentDir")) { + Path dir = new Path(line.getOptionValue("segmentDir")); + FileSystem fs = dir.getFileSystem(getConf()); + FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs)); + segPaths = HadoopFSUtil.getPaths(fstats); } createWebGraph(new Path(webGraphDb), segPaths);