Author: markus
Date: Tue Jul 19 12:49:58 2011
New Revision: 1148301
URL: http://svn.apache.org/viewvc?rev=1148301&view=rev
Log:
NUTCH-1050 Add segmentDir to WebGraph
Modified:
nutch/branches/branch-1.4/conf/log4j.properties
nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
Modified: nutch/branches/branch-1.4/conf/log4j.properties
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/conf/log4j.properties?rev=1148301&r1=1148300&r2=1148301&view=diff
==============================================================================
--- nutch/branches/branch-1.4/conf/log4j.properties (original)
+++ nutch/branches/branch-1.4/conf/log4j.properties Tue Jul 19 12:49:58 2011
@@ -27,6 +27,11 @@ log4j.logger.org.apache.nutch.indexer.so
log4j.logger.org.apache.nutch.indexer.solr.SolrDeleteDuplicates=INFO,cmdstdout
log4j.logger.org.apache.nutch.indexer.solr.SolrClean=INFO,cmdstdout
log4j.logger.org.apache.nutch.scoring.webgraph.WebGraph=INFO,cmdstdout
+log4j.logger.org.apache.nutch.scoring.webgraph.LinkRank=INFO,cmdstdout
+log4j.logger.org.apache.nutch.scoring.webgraph.Loops=INFO,cmdstdout
+log4j.logger.org.apache.nutch.scoring.webgraph.ScoreUpdater=INFO,cmdstdout
+log4j.logger.org.apache.nutch.parse.ParserChecker=INFO,cmdstdout
+log4j.logger.org.apache.nutch.indexer.IndexingFiltersChecker=INFO,cmdstdout
log4j.logger.org.apache.nutch=INFO
log4j.logger.org.apache.hadoop=WARN
Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java?rev=1148301&r1=1148300&r2=1148301&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java Tue Jul 19 12:49:58 2011
@@ -538,7 +538,7 @@ public class LinkRank
+ (this.dampingFactor * totalInlinkScore);
LOG.info(url + ": score: " + linkRankScore + " num inlinks: "
- + numInlinks + " iteration: " + itNum + "\n");
+ + numInlinks + " iteration: " + itNum);
// store the score in a temporary NodeDb
Node outNode = (Node)WritableUtils.clone(node, conf);
Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java?rev=1148301&r1=1148300&r2=1148301&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java Tue Jul 19 12:49:58 2011
@@ -38,6 +38,7 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
@@ -61,6 +62,7 @@ import org.apache.nutch.net.URLNormalize
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.util.FSUtils;
+import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
@@ -617,26 +619,44 @@ public class WebGraph
"the web graph database to use").create("webgraphdb");
Option segOpts = OptionBuilder.withArgName("segment").hasArgs().withDescription(
"the segment(s) to use").create("segment");
+ Option segDirOpts = OptionBuilder.withArgName("segmentDir").hasArgs().withDescription(
+ "the segment directory to use").create("segmentDir");
options.addOption(helpOpts);
options.addOption(webGraphDbOpts);
options.addOption(segOpts);
+ options.addOption(segDirOpts);
CommandLineParser parser = new GnuParser();
try {
CommandLine line = parser.parse(options, args);
if (line.hasOption("help") || !line.hasOption("webgraphdb")
- || !line.hasOption("segment")) {
+ || (!line.hasOption("segment") && !line.hasOption("segmentDir"))
+) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("WebGraph", options);
return -1;
}
String webGraphDb = line.getOptionValue("webgraphdb");
- String[] segments = line.getOptionValues("segment");
- Path[] segPaths = new Path[segments.length];
- for (int i = 0; i < segments.length; i++) {
- segPaths[i] = new Path(segments[i]);
+
+ Path[] segPaths = null;
+
+ // Handle segment option
+ if (line.hasOption("segment")) {
+ String[] segments = line.getOptionValues("segment");
+ segPaths = new Path[segments.length];
+ for (int i = 0; i < segments.length; i++) {
+ segPaths[i] = new Path(segments[i]);
+ }
+ }
+
+ // Handle segmentDir option
+ if (line.hasOption("segmentDir")) {
+ Path dir = new Path(line.getOptionValue("segmentDir"));
+ FileSystem fs = dir.getFileSystem(getConf());
+ FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
+ segPaths = HadoopFSUtil.getPaths(fstats);
}
createWebGraph(new Path(webGraphDb), segPaths);