Author: markus
Date: Thu Jan 21 15:18:07 2016
New Revision: 1725981
URL: http://svn.apache.org/viewvc?rev=1725981&view=rev
Log:
NUTCH-2201 Remove loops program from webgraph package
Removed:
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1725981&r1=1725980&r2=1725981&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jan 21 15:18:07 2016
@@ -1,5 +1,7 @@
Nutch Change Log
+* NUTCH-2201 Remove loops program from webgraph package (markus)
+
* NUTCH-1325 HostDB for Nutch (Gui Forget, markus, tejasp)
* NUTCH-2203 Suffix URL filter can't handle trailing/leading whitespaces
(Jurian Broertjes via markus)
Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java?rev=1725981&r1=1725980&r2=1725981&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java Thu Jan 21 15:18:07 2016
@@ -59,7 +59,6 @@ import org.apache.hadoop.mapred.lib.Hash
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
-import org.apache.nutch.scoring.webgraph.Loops.LoopSet;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
@@ -246,9 +245,8 @@ public class LinkDumper extends Configur
String fromUrl = key.toString();
List<LinkDatum> outlinks = new ArrayList<LinkDatum>();
Node node = null;
- LoopSet loops = null;
-
- // loop through all values aggregating outlinks, saving node and loopset
+
+ // loop through all values aggregating outlinks, saving node
while (values.hasNext()) {
ObjectWritable write = values.next();
Object obj = write.get();
@@ -256,25 +254,16 @@ public class LinkDumper extends Configur
node = (Node) obj;
} else if (obj instanceof LinkDatum) {
outlinks.add(WritableUtils.clone((LinkDatum) obj, conf));
- } else if (obj instanceof LoopSet) {
- loops = (LoopSet) obj;
}
}
// only collect if there are outlinks
int numOutlinks = node.getNumOutlinks();
if (numOutlinks > 0) {
-
- Set<String> loopSet = (loops != null) ? loops.getLoopSet() : null;
for (int i = 0; i < outlinks.size(); i++) {
LinkDatum outlink = outlinks.get(i);
String toUrl = outlink.getUrl();
- // remove any url that is in the loopset, same as LinkRank
- if (loopSet != null && loopSet.contains(toUrl)) {
- continue;
- }
-
// collect the outlink as an inlink with the node
output.collect(new Text(toUrl), new LinkNode(fromUrl, node));
}
@@ -343,8 +332,6 @@ public class LinkDumper extends Configur
Path linkdump = new Path(webGraphDb, DUMP_DIR);
Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
- Path loopSetDb = new Path(webGraphDb, Loops.LOOPS_DIR);
- boolean loopsExists = fs.exists(loopSetDb);
Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);
// run the inverter job
@@ -353,9 +340,6 @@ public class LinkDumper extends Configur
JobConf inverter = new NutchJob(conf);
inverter.setJobName("LinkDumper: inverter");
FileInputFormat.addInputPath(inverter, nodeDb);
- if (loopsExists) {
- FileInputFormat.addInputPath(inverter, loopSetDb);
- }
FileInputFormat.addInputPath(inverter, outlinkDb);
inverter.setInputFormat(SequenceFileInputFormat.class);
inverter.setMapperClass(Inverter.class);
Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java?rev=1725981&r1=1725980&r2=1725981&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java Thu Jan 21 15:18:07 2016
@@ -61,7 +61,6 @@ import org.apache.hadoop.mapred.TextOutp
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
-import org.apache.nutch.scoring.webgraph.Loops.LoopSet;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
@@ -183,24 +182,17 @@ public class LinkRank extends Configured
* Runs the inverter job. The inverter job flips outlinks to inlinks to be
* passed into the analysis job.
*
- * The inverter job takes a link loops database if it exists. It is an
- * optional componenet of link analysis due to its extreme computational and
- * space requirements but it can be very useful is weeding out and eliminating
- * link farms and other spam pages.
- *
* @param nodeDb
* The node database to use.
* @param outlinkDb
* The outlink database to use.
- * @param loopDb
- * The loop database to use if it exists.
* @param output
* The output directory.
*
* @throws IOException
* If an error occurs while running the inverter job.
*/
- private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output)
+ private void runInverter(Path nodeDb, Path outlinkDb, Path output)
throws IOException {
// configure the inverter
@@ -208,11 +200,6 @@ public class LinkRank extends Configured
inverter.setJobName("LinkAnalysis Inverter");
FileInputFormat.addInputPath(inverter, nodeDb);
FileInputFormat.addInputPath(inverter, outlinkDb);
-
- // add the loop database if it exists, isn't null
- if (loopDb != null) {
- FileInputFormat.addInputPath(inverter, loopDb);
- }
FileOutputFormat.setOutputPath(inverter, output);
inverter.setInputFormat(SequenceFileInputFormat.class);
inverter.setMapperClass(Inverter.class);
@@ -385,8 +372,7 @@ public class LinkRank extends Configured
/**
* Inverts outlinks to inlinks, attaches current score for the outlink from
- * the NodeDb of the WebGraph and removes any outlink that is contained
- * within the loopset.
+ * the NodeDb of the WebGraph.
*/
public void reduce(Text key, Iterator<ObjectWritable> values,
OutputCollector<Text, LinkDatum> output, Reporter reporter)
@@ -395,7 +381,6 @@ public class LinkRank extends Configured
String fromUrl = key.toString();
List<LinkDatum> outlinks = new ArrayList<LinkDatum>();
Node node = null;
- LoopSet loops = null;
// aggregate outlinks, assign other values
while (values.hasNext()) {
@@ -405,25 +390,9 @@ public class LinkRank extends Configured
node = (Node) obj;
} else if (obj instanceof LinkDatum) {
outlinks.add(WritableUtils.clone((LinkDatum) obj, conf));
- } else if (obj instanceof LoopSet) {
- loops = (LoopSet) obj;
}
}
- // Check for the possibility of a LoopSet object without Node and
- // LinkDatum objects. This can happen
- // with webgraphs that receive deletes (e.g. link.delete.gone and/or URL
- // filters or normalizers) but
- // without an updated Loops database.
- // See: https://issues.apache.org/jira/browse/NUTCH-1299
- if (node == null && loops != null) {
- // Nothing to do
- LOG.warn("LoopSet without Node object received for "
- + key.toString()
- + " . You should either not use Loops as input of the LinkRank program or rerun the Loops program over the WebGraph.");
- return;
- }
-
// get the number of outlinks and the current inlink and outlink scores
// from the node of the url
int numOutlinks = node.getNumOutlinks();
@@ -433,18 +402,10 @@ public class LinkRank extends Configured
// can't invert if no outlinks
if (numOutlinks > 0) {
-
- Set<String> loopSet = (loops != null) ? loops.getLoopSet() : null;
for (int i = 0; i < outlinks.size(); i++) {
LinkDatum outlink = outlinks.get(i);
String toUrl = outlink.getUrl();
- // remove any url that is contained in the loopset
- if (loopSet != null && loopSet.contains(toUrl)) {
- LOG.debug(fromUrl + ": Skipping inverting inlink from loop "
- + toUrl);
- continue;
- }
outlink.setUrl(fromUrl);
outlink.setScore(outlinkScore);
@@ -623,10 +584,6 @@ public class LinkRank extends Configured
Path wgOutlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);
Path wgNodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
Path nodeDb = new Path(linkRank, WebGraph.NODE_DIR);
- Path loopDb = new Path(webGraphDb, Loops.LOOPS_DIR);
- if (!fs.exists(loopDb)) {
- loopDb = null;
- }
// get the number of total nodes in the webgraph, used for rank one, then
// initialze all urls with a default score
@@ -654,7 +611,7 @@ public class LinkRank extends Configured
Path tempNodeDb = new Path(tempRank, WebGraph.NODE_DIR);
// run invert and analysis
- runInverter(nodeDb, wgOutlinkDb, loopDb, tempInverted);
+ runInverter(nodeDb, wgOutlinkDb, tempInverted);
runAnalysis(nodeDb, tempInverted, tempNodeDb, i, numIterations,
rankOneScore);