Author: pkosiorowski
Date: Mon Aug 8 12:59:56 2005
New Revision: 230870
URL: http://svn.apache.org/viewcvs?rev=230870&view=rev
Log:
NUTCH-7. Relative links from identical (MD5) pages were treated incorrectly.
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java?rev=230870&r1=230869&r2=230870&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/DistributedAnalysisTool.java Mon Aug 8 12:59:56 2005
@@ -69,6 +69,9 @@
final private static float DECAY_VALUE = 0.85f;
public static final Logger LOG =
LogFormatter.getLogger("org.apache.nutch.tools.DistributedAnalysisTool");
+
+ public final static long OUTLINK_LIMIT = 10000;
+
/**
* The EditSet inner class represents all of the sorted edits
@@ -343,8 +346,10 @@
try {
// Iterate through all items in the webdb, sorted by URL
long curIndex = 0;
+ long linkCount = 0;
ScoreValue score = new ScoreValue();
IWebDBReader reader = new WebDBReader(nfs, dbDir);
+ MD5Hash lastHash = null;
try {
for (Enumeration e = reader.pagesByMD5(); e.hasMoreElements();
curIndex++) {
//
@@ -366,7 +371,25 @@
// OK, do some analysis!
//
Page curPage = (Page) e.nextElement();
+
+ // Process only one page from set of pages having the same
+ // MD5. Otherwise all links from these pages would be processed
+ // multiple times.
+ MD5Hash newHash = curPage.getMD5();
+ if (newHash.equals(lastHash)) {
+ continue;
+ }
+ lastHash = newHash;
+
Link outLinks[] = reader.getLinks(curPage.getMD5());
+ linkCount += outLinks.length;
+
+ if (outLinks.length > OUTLINK_LIMIT) {
+ LOG.info("Suspicious outlink count = "
+ + outLinks.length + " for ["
+ + curPage.getURL().toString() + "].");
+ }
+
int targetOutlinkers = 0;
for (int i = 0; i < outLinks.length; i++) {
if (outLinks[i].targetHasOutlink()) {
@@ -402,7 +425,9 @@
}
if (((curIndex - startIndex) % 5000) == 0) {
- LOG.info("Pages consumed: " + (curIndex - startIndex) + " (at index " + curIndex + ")");
+ LOG.info("Pages consumed: " + (curIndex - startIndex)
+ + " (at index " + curIndex
+ + "). Links fetched: " + linkCount + ".");
}
}
} finally {