Author: markus Date: Thu Jan 21 15:18:07 2016 New Revision: 1725981 URL: http://svn.apache.org/viewvc?rev=1725981&view=rev Log: NUTCH-2201 Remove loops program from webgraph package
Removed: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1725981&r1=1725980&r2=1725981&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Jan 21 15:18:07 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2201 Remove loops program from webgraph package (markus) + * NUTCH-1325 HostDB for Nutch (Gui Forget, markus, tejasp) * NUTCH-2203 Suffix URL filter can't handle trailing/leading whitespaces (Jurian Broertjes via markus) Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java?rev=1725981&r1=1725980&r2=1725981&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java (original) +++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java Thu Jan 21 15:18:07 2016 @@ -59,7 +59,6 @@ import org.apache.hadoop.mapred.lib.Hash import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; -import org.apache.nutch.scoring.webgraph.Loops.LoopSet; import org.apache.nutch.util.FSUtils; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; @@ -246,9 +245,8 @@ public class LinkDumper extends Configur String fromUrl = key.toString(); List<LinkDatum> outlinks = new ArrayList<LinkDatum>(); Node node = null; - LoopSet loops = null; - - // loop through all values aggregating outlinks, saving node and loopset + + // loop through all values aggregating outlinks, saving node while (values.hasNext()) { ObjectWritable write = values.next(); Object obj = write.get(); @@ -256,25 +254,16 @@ public class LinkDumper extends Configur node = (Node) obj; } else if (obj instanceof LinkDatum) { outlinks.add(WritableUtils.clone((LinkDatum) obj, conf)); - } else if (obj instanceof LoopSet) { - loops = (LoopSet) obj; } } // only collect if there are outlinks int numOutlinks = node.getNumOutlinks(); if (numOutlinks > 0) { - - Set<String> loopSet = (loops != null) ? loops.getLoopSet() : null; for (int i = 0; i < outlinks.size(); i++) { LinkDatum outlink = outlinks.get(i); String toUrl = outlink.getUrl(); - // remove any url that is in the loopset, same as LinkRank - if (loopSet != null && loopSet.contains(toUrl)) { - continue; - } - // collect the outlink as an inlink with the node output.collect(new Text(toUrl), new LinkNode(fromUrl, node)); } @@ -343,8 +332,6 @@ public class LinkDumper extends Configur Path linkdump = new Path(webGraphDb, DUMP_DIR); Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR); - Path loopSetDb = new Path(webGraphDb, Loops.LOOPS_DIR); - boolean loopsExists = fs.exists(loopSetDb); Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR); // run the inverter job @@ -353,9 +340,6 @@ public class LinkDumper extends Configur JobConf inverter = new NutchJob(conf); inverter.setJobName("LinkDumper: inverter"); FileInputFormat.addInputPath(inverter, nodeDb); - if (loopsExists) { - FileInputFormat.addInputPath(inverter, loopSetDb); - } FileInputFormat.addInputPath(inverter, outlinkDb); inverter.setInputFormat(SequenceFileInputFormat.class); inverter.setMapperClass(Inverter.class); Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java?rev=1725981&r1=1725980&r2=1725981&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java (original) +++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java Thu Jan 21 15:18:07 2016 @@ -61,7 +61,6 @@ import org.apache.hadoop.mapred.TextOutp import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; -import org.apache.nutch.scoring.webgraph.Loops.LoopSet; import org.apache.nutch.util.FSUtils; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; @@ -183,24 +182,17 @@ public class LinkRank extends Configured * Runs the inverter job. The inverter job flips outlinks to inlinks to be * passed into the analysis job. * - * The inverter job takes a link loops database if it exists. It is an - * optional componenet of link analysis due to its extreme computational and - * space requirements but it can be very useful is weeding out and eliminating - * link farms and other spam pages. - * * @param nodeDb * The node database to use. * @param outlinkDb * The outlink database to use. - * @param loopDb - * The loop database to use if it exists. * @param output * The output directory. * * @throws IOException * If an error occurs while running the inverter job. */ - private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output) + private void runInverter(Path nodeDb, Path outlinkDb, Path output) throws IOException { // configure the inverter @@ -208,11 +200,6 @@ public class LinkRank extends Configured inverter.setJobName("LinkAnalysis Inverter"); FileInputFormat.addInputPath(inverter, nodeDb); FileInputFormat.addInputPath(inverter, outlinkDb); - - // add the loop database if it exists, isn't null - if (loopDb != null) { - FileInputFormat.addInputPath(inverter, loopDb); - } FileOutputFormat.setOutputPath(inverter, output); inverter.setInputFormat(SequenceFileInputFormat.class); inverter.setMapperClass(Inverter.class); @@ -385,8 +372,7 @@ public class LinkRank extends Configured /** * Inverts outlinks to inlinks, attaches current score for the outlink from - * the NodeDb of the WebGraph and removes any outlink that is contained - * within the loopset. + * the NodeDb of the WebGraph. */ public void reduce(Text key, Iterator<ObjectWritable> values, OutputCollector<Text, LinkDatum> output, Reporter reporter) @@ -395,7 +381,6 @@ public class LinkRank extends Configured String fromUrl = key.toString(); List<LinkDatum> outlinks = new ArrayList<LinkDatum>(); Node node = null; - LoopSet loops = null; // aggregate outlinks, assign other values while (values.hasNext()) { @@ -405,25 +390,9 @@ public class LinkRank extends Configured node = (Node) obj; } else if (obj instanceof LinkDatum) { outlinks.add(WritableUtils.clone((LinkDatum) obj, conf)); - } else if (obj instanceof LoopSet) { - loops = (LoopSet) obj; } } - // Check for the possibility of a LoopSet object without Node and - // LinkDatum objects. This can happen - // with webgraphs that receive deletes (e.g. link.delete.gone and/or URL - // filters or normalizers) but - // without an updated Loops database. - // See: https://issues.apache.org/jira/browse/NUTCH-1299 - if (node == null && loops != null) { - // Nothing to do - LOG.warn("LoopSet without Node object received for " - + key.toString() - + " . You should either not use Loops as input of the LinkRank program or rerun the Loops program over the WebGraph."); - return; - } - // get the number of outlinks and the current inlink and outlink scores // from the node of the url int numOutlinks = node.getNumOutlinks(); @@ -433,18 +402,10 @@ public class LinkRank extends Configured // can't invert if no outlinks if (numOutlinks > 0) { - - Set<String> loopSet = (loops != null) ? loops.getLoopSet() : null; for (int i = 0; i < outlinks.size(); i++) { LinkDatum outlink = outlinks.get(i); String toUrl = outlink.getUrl(); - // remove any url that is contained in the loopset - if (loopSet != null && loopSet.contains(toUrl)) { - LOG.debug(fromUrl + ": Skipping inverting inlink from loop " - + toUrl); - continue; - } outlink.setUrl(fromUrl); outlink.setScore(outlinkScore); @@ -623,10 +584,6 @@ public class LinkRank extends Configured Path wgOutlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR); Path wgNodeDb = new Path(webGraphDb, WebGraph.NODE_DIR); Path nodeDb = new Path(linkRank, WebGraph.NODE_DIR); - Path loopDb = new Path(webGraphDb, Loops.LOOPS_DIR); - if (!fs.exists(loopDb)) { - loopDb = null; - } // get the number of total nodes in the webgraph, used for rank one, then // initialze all urls with a default score @@ -654,7 +611,7 @@ public class LinkRank extends Configured Path tempNodeDb = new Path(tempRank, WebGraph.NODE_DIR); // run invert and analysis - runInverter(nodeDb, wgOutlinkDb, loopDb, tempInverted); + runInverter(nodeDb, wgOutlinkDb, tempInverted); runAnalysis(nodeDb, tempInverted, tempNodeDb, i, numIterations, rankOneScore);