Author: markus
Date: Thu Jan 21 15:18:07 2016
New Revision: 1725981

URL: http://svn.apache.org/viewvc?rev=1725981&view=rev
Log:
NUTCH-2201 Remove loops program from webgraph package

Removed:
    nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java
    nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
    nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1725981&r1=1725980&r2=1725981&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jan 21 15:18:07 2016
@@ -1,5 +1,7 @@
 Nutch Change Log
 
+* NUTCH-2201 Remove loops program from webgraph package (markus)
+
 * NUTCH-1325 HostDB for Nutch (Gui Forget, markus, tejasp)
 
 * NUTCH-2203 Suffix URL filter can't handle trailing/leading whitespaces (Jurian Broertjes via markus)

Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java?rev=1725981&r1=1725980&r2=1725981&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java Thu Jan 21 15:18:07 2016
@@ -59,7 +59,6 @@ import org.apache.hadoop.mapred.lib.Hash
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
-import org.apache.nutch.scoring.webgraph.Loops.LoopSet;
 import org.apache.nutch.util.FSUtils;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
@@ -246,9 +245,8 @@ public class LinkDumper extends Configur
       String fromUrl = key.toString();
       List<LinkDatum> outlinks = new ArrayList<LinkDatum>();
       Node node = null;
-      LoopSet loops = null;
-
-      // loop through all values aggregating outlinks, saving node and loopset
+      
+      // loop through all values aggregating outlinks, saving node
       while (values.hasNext()) {
         ObjectWritable write = values.next();
         Object obj = write.get();
@@ -256,25 +254,16 @@ public class LinkDumper extends Configur
           node = (Node) obj;
         } else if (obj instanceof LinkDatum) {
           outlinks.add(WritableUtils.clone((LinkDatum) obj, conf));
-        } else if (obj instanceof LoopSet) {
-          loops = (LoopSet) obj;
         }
       }
 
       // only collect if there are outlinks
       int numOutlinks = node.getNumOutlinks();
       if (numOutlinks > 0) {
-
-        Set<String> loopSet = (loops != null) ? loops.getLoopSet() : null;
         for (int i = 0; i < outlinks.size(); i++) {
           LinkDatum outlink = outlinks.get(i);
           String toUrl = outlink.getUrl();
 
-          // remove any url that is in the loopset, same as LinkRank
-          if (loopSet != null && loopSet.contains(toUrl)) {
-            continue;
-          }
-
           // collect the outlink as an inlink with the node
           output.collect(new Text(toUrl), new LinkNode(fromUrl, node));
         }
@@ -343,8 +332,6 @@ public class LinkDumper extends Configur
 
     Path linkdump = new Path(webGraphDb, DUMP_DIR);
     Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
-    Path loopSetDb = new Path(webGraphDb, Loops.LOOPS_DIR);
-    boolean loopsExists = fs.exists(loopSetDb);
     Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);
 
     // run the inverter job
@@ -353,9 +340,6 @@ public class LinkDumper extends Configur
     JobConf inverter = new NutchJob(conf);
     inverter.setJobName("LinkDumper: inverter");
     FileInputFormat.addInputPath(inverter, nodeDb);
-    if (loopsExists) {
-      FileInputFormat.addInputPath(inverter, loopSetDb);
-    }
     FileInputFormat.addInputPath(inverter, outlinkDb);
     inverter.setInputFormat(SequenceFileInputFormat.class);
     inverter.setMapperClass(Inverter.class);

Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java?rev=1725981&r1=1725980&r2=1725981&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java Thu Jan 21 15:18:07 2016
@@ -61,7 +61,6 @@ import org.apache.hadoop.mapred.TextOutp
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
-import org.apache.nutch.scoring.webgraph.Loops.LoopSet;
 import org.apache.nutch.util.FSUtils;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
@@ -183,24 +182,17 @@ public class LinkRank extends Configured
    * Runs the inverter job. The inverter job flips outlinks to inlinks to be
    * passed into the analysis job.
    * 
-   * The inverter job takes a link loops database if it exists. It is an
-   * optional componenet of link analysis due to its extreme computational and
-   * space requirements but it can be very useful is weeding out and eliminating
-   * link farms and other spam pages.
-   * 
    * @param nodeDb
    *          The node database to use.
    * @param outlinkDb
    *          The outlink database to use.
-   * @param loopDb
-   *          The loop database to use if it exists.
    * @param output
    *          The output directory.
    * 
    * @throws IOException
    *           If an error occurs while running the inverter job.
    */
-  private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output)
+  private void runInverter(Path nodeDb, Path outlinkDb, Path output)
       throws IOException {
 
     // configure the inverter
@@ -208,11 +200,6 @@ public class LinkRank extends Configured
     inverter.setJobName("LinkAnalysis Inverter");
     FileInputFormat.addInputPath(inverter, nodeDb);
     FileInputFormat.addInputPath(inverter, outlinkDb);
-
-    // add the loop database if it exists, isn't null
-    if (loopDb != null) {
-      FileInputFormat.addInputPath(inverter, loopDb);
-    }
     FileOutputFormat.setOutputPath(inverter, output);
     inverter.setInputFormat(SequenceFileInputFormat.class);
     inverter.setMapperClass(Inverter.class);
@@ -385,8 +372,7 @@ public class LinkRank extends Configured
 
     /**
      * Inverts outlinks to inlinks, attaches current score for the outlink from
-     * the NodeDb of the WebGraph and removes any outlink that is contained
-     * within the loopset.
+     * the NodeDb of the WebGraph.
      */
     public void reduce(Text key, Iterator<ObjectWritable> values,
         OutputCollector<Text, LinkDatum> output, Reporter reporter)
@@ -395,7 +381,6 @@ public class LinkRank extends Configured
       String fromUrl = key.toString();
       List<LinkDatum> outlinks = new ArrayList<LinkDatum>();
       Node node = null;
-      LoopSet loops = null;
 
       // aggregate outlinks, assign other values
       while (values.hasNext()) {
@@ -405,25 +390,9 @@ public class LinkRank extends Configured
           node = (Node) obj;
         } else if (obj instanceof LinkDatum) {
           outlinks.add(WritableUtils.clone((LinkDatum) obj, conf));
-        } else if (obj instanceof LoopSet) {
-          loops = (LoopSet) obj;
         }
       }
 
-      // Check for the possibility of a LoopSet object without Node and
-      // LinkDatum objects. This can happen
-      // with webgraphs that receive deletes (e.g. link.delete.gone and/or URL
-      // filters or normalizers) but
-      // without an updated Loops database.
-      // See: https://issues.apache.org/jira/browse/NUTCH-1299
-      if (node == null && loops != null) {
-        // Nothing to do
-        LOG.warn("LoopSet without Node object received for "
-            + key.toString()
-            + " . You should either not use Loops as input of the LinkRank program or rerun the Loops program over the WebGraph.");
-        return;
-      }
-
       // get the number of outlinks and the current inlink and outlink scores
       // from the node of the url
       int numOutlinks = node.getNumOutlinks();
@@ -433,18 +402,10 @@ public class LinkRank extends Configured
 
       // can't invert if no outlinks
       if (numOutlinks > 0) {
-
-        Set<String> loopSet = (loops != null) ? loops.getLoopSet() : null;
         for (int i = 0; i < outlinks.size(); i++) {
           LinkDatum outlink = outlinks.get(i);
           String toUrl = outlink.getUrl();
 
-          // remove any url that is contained in the loopset
-          if (loopSet != null && loopSet.contains(toUrl)) {
-            LOG.debug(fromUrl + ": Skipping inverting inlink from loop "
-                + toUrl);
-            continue;
-          }
           outlink.setUrl(fromUrl);
           outlink.setScore(outlinkScore);
 
@@ -623,10 +584,6 @@ public class LinkRank extends Configured
     Path wgOutlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);
     Path wgNodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
     Path nodeDb = new Path(linkRank, WebGraph.NODE_DIR);
-    Path loopDb = new Path(webGraphDb, Loops.LOOPS_DIR);
-    if (!fs.exists(loopDb)) {
-      loopDb = null;
-    }
 
     // get the number of total nodes in the webgraph, used for rank one, then
     // initialze all urls with a default score
@@ -654,7 +611,7 @@ public class LinkRank extends Configured
       Path tempNodeDb = new Path(tempRank, WebGraph.NODE_DIR);
 
       // run invert and analysis
-      runInverter(nodeDb, wgOutlinkDb, loopDb, tempInverted);
+      runInverter(nodeDb, wgOutlinkDb, tempInverted);
       runAnalysis(nodeDb, tempInverted, tempNodeDb, i, numIterations,
           rankOneScore);
 


Reply via email to