Author: ab Date: Mon Mar 20 15:20:56 2006 New Revision: 387341 URL: http://svn.apache.org/viewcvs?rev=387341&view=rev Log: Don't allow Inlink duplicates (NUTCH-235).
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java?rev=387341&r1=387340&r2=387341&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java Mon Mar 20 15:20:56 2006 @@ -24,22 +24,23 @@ /** A list of {@link Inlink}s. */ public class Inlinks implements Writable { - private ArrayList inlinks = new ArrayList(1); + private HashSet inlinks = new HashSet(1); public void add(Inlink inlink) { inlinks.add(inlink); } public void add(Inlinks inlinks) { this.inlinks.addAll(inlinks.inlinks); } + public Iterator iterator() { + return this.inlinks.iterator(); + } + public int size() { return inlinks.size(); } - public Inlink get(int i) { return (Inlink)inlinks.get(i); } - public void clear() { inlinks.clear(); } public void readFields(DataInput in) throws IOException { int length = in.readInt(); inlinks.clear(); - inlinks.ensureCapacity(length); for (int i = 0; i < length; i++) { add(Inlink.read(in)); } @@ -47,17 +48,19 @@ public void write(DataOutput out) throws IOException { out.writeInt(inlinks.size()); - for (int i = 0; i < inlinks.size(); i++) { - ((Writable)inlinks.get(i)).write(out); + Iterator it = inlinks.iterator(); + while (it.hasNext()) { + ((Writable)it.next()).write(out); } } public String toString() { StringBuffer buffer = new StringBuffer(); buffer.append("Inlinks:\n"); - for (int i = 0; i < inlinks.size(); i++) { + Iterator it = inlinks.iterator(); + while (it.hasNext()) { buffer.append(" "); - buffer.append(inlinks.get(i)); + buffer.append(it.next()); 
buffer.append("\n"); } return buffer.toString(); @@ -68,8 +71,9 @@ public String[] getAnchors() throws IOException { HashMap domainToAnchors = new HashMap(); ArrayList results = new ArrayList(); - for (int i = 0; i < inlinks.size(); i++) { - Inlink inlink = (Inlink)inlinks.get(i); + Iterator it = inlinks.iterator(); + while (it.hasNext()) { + Inlink inlink = (Inlink)it.next(); String anchor = inlink.getAnchor(); if (anchor.length() == 0) // skip empty anchors Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=387341&r1=387340&r2=387341&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Mon Mar 20 15:20:56 2006 @@ -117,8 +117,10 @@ } int end = Math.min(maxInlinks - result.size(), inlinks.size()); - for (int i = 0; i < end; i++) { - result.add(inlinks.get(i)); + Iterator it = inlinks.iterator(); + int i = 0; + while(it.hasNext() && i++ < end) { + result.add((Inlink)it.next()); } } output.collect(key, result); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=387341&r1=387340&r2=387341&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Mon Mar 20 15:20:56 2006 @@ -29,6 +29,7 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; +import java.util.Iterator; import java.util.logging.Logger; /** . 
*/ @@ -112,8 +113,9 @@ if (links == null) { System.out.println(" - no link information."); } else { - for (int i = 0; i < links.size(); i++) { - System.out.println(links.get(i).toString()); + Iterator it = links.iterator(); + while (it.hasNext()) { + System.out.println(it.next().toString()); } } } else {