Author: ab
Date: Sat Mar 18 11:21:11 2006
New Revision: 386875

URL: http://svn.apache.org/viewcvs?rev=386875&view=rev
Log:
Apply patch in NUTCH-230, which provides additional control over which outlinks are considered for OPIC "cash" value distribution.

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=386875&r1=386874&r2=386875&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Sat Mar 18 11:21:11 2006 @@ -25,6 +25,7 @@ import org.apache.nutch.net.*; import java.io.*; +import java.util.ArrayList; /* Parse content in a segment. */ public class ParseOutputFormat implements OutputFormat { @@ -42,6 +43,7 @@ this.filters = new URLFilters(job); final float interval = job.getFloat("db.default.fetch.interval", 30f); final float extscore = job.getFloat("db.score.link.external", 1.0f); + final boolean countFiltered = job.getBoolean("db.score.count.filtered", false); File text = new File(new File(job.getOutputDir(), ParseText.DIR_NAME), name); @@ -92,9 +94,9 @@ .getContentMeta().get(Fetcher.SCORE_KEY); float score = extscore; // this may happen if there was a fetch error. 
- if (scoreString != null) score = Float.parseFloat(scoreString); - score /= links.length; - + if (scoreString != null) score = Float.parseFloat(scoreString); + String[] toUrls = new String[links.length]; + int validCount = 0; for (int i = 0; i < links.length; i++) { String toUrl = links[i].getToUrl(); try { @@ -103,10 +105,18 @@ } catch (Exception e) { toUrl = null; } - if (toUrl != null) - crawlOut.append(new UTF8(toUrl), - new CrawlDatum(CrawlDatum.STATUS_LINKED, - interval, score)); + if (toUrl != null) validCount++; + toUrls[i] = toUrl; + } + if (countFiltered) { + score = score / links.length; + } else { + score = score / validCount; + } + for (int i = 0; i < toUrls.length; i++) { + if (toUrls[i] == null) continue; + crawlOut.append(new UTF8(toUrls[i]), + new CrawlDatum(CrawlDatum.STATUS_LINKED, interval, score)); } }