Author: ab
Date: Sat Mar 18 11:21:11 2006
New Revision: 386875

URL: http://svn.apache.org/viewcvs?rev=386875&view=rev
Log:
Apply patch in NUTCH-230, which provides additional control over which
outlinks are considered for OPIC "cash" value distribution.

Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=386875&r1=386874&r2=386875&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Sat Mar 18 11:21:11 2006
@@ -25,6 +25,7 @@
 import org.apache.nutch.net.*;
 
 import java.io.*;
+import java.util.ArrayList;
 
 /* Parse content in a segment. */
 public class ParseOutputFormat implements OutputFormat {
@@ -42,6 +43,7 @@
     this.filters = new URLFilters(job);
     final float interval = job.getFloat("db.default.fetch.interval", 30f);
     final float extscore = job.getFloat("db.score.link.external", 1.0f);
+    final boolean countFiltered = job.getBoolean("db.score.count.filtered", false);
     
     File text =
       new File(new File(job.getOutputDir(), ParseText.DIR_NAME), name);
@@ -92,9 +94,9 @@
                                     .getContentMeta().get(Fetcher.SCORE_KEY);
           float score = extscore;
           // this may happen if there was a fetch error.
-         if (scoreString != null) score = Float.parseFloat(scoreString);
-          score /= links.length;
-                          
+          if (scoreString != null) score = Float.parseFloat(scoreString);
+          String[] toUrls = new String[links.length];
+          int validCount = 0;
           for (int i = 0; i < links.length; i++) {
             String toUrl = links[i].getToUrl();
             try {
@@ -103,10 +105,18 @@
             } catch (Exception e) {
               toUrl = null;
             }
-            if (toUrl != null)
-              crawlOut.append(new UTF8(toUrl),
-                              new CrawlDatum(CrawlDatum.STATUS_LINKED,
-                                             interval, score));
+            if (toUrl != null) validCount++;
+            toUrls[i] = toUrl;
+          }
+          if (countFiltered) {
+            score = score / links.length;
+          } else {
+            score = score / validCount;
+          }
+          for (int i = 0; i < toUrls.length; i++) {
+            if (toUrls[i] == null) continue;
+            crawlOut.append(new UTF8(toUrls[i]),
+                    new CrawlDatum(CrawlDatum.STATUS_LINKED, interval, score));
           }
         }
         


Reply via email to