Author: ab
Date: Mon Mar 20 15:20:56 2006
New Revision: 387341

URL: http://svn.apache.org/viewcvs?rev=387341&view=rev
Log:
Don't allow Inlink duplicates (NUTCH-235).

Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java?rev=387341&r1=387340&r2=387341&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java Mon Mar 20 15:20:56 2006
@@ -24,22 +24,23 @@
 
 /** A list of {@link Inlink}s. */
 public class Inlinks implements Writable {
-  private ArrayList inlinks = new ArrayList(1);
+  private HashSet inlinks = new HashSet(1);
 
   public void add(Inlink inlink) { inlinks.add(inlink); }
 
   public void add(Inlinks inlinks) { this.inlinks.addAll(inlinks.inlinks); }
 
+  public Iterator iterator() {
+    return this.inlinks.iterator();
+  }
+  
   public int size() { return inlinks.size(); }
 
-  public Inlink get(int i) { return (Inlink)inlinks.get(i); }
-
   public void clear() { inlinks.clear(); }
 
   public void readFields(DataInput in) throws IOException {
     int length = in.readInt();
     inlinks.clear();
-    inlinks.ensureCapacity(length);
     for (int i = 0; i < length; i++) {
       add(Inlink.read(in));
     }
@@ -47,17 +48,19 @@
 
   public void write(DataOutput out) throws IOException {
     out.writeInt(inlinks.size());
-    for (int i = 0; i < inlinks.size(); i++) {
-      ((Writable)inlinks.get(i)).write(out);
+    Iterator it = inlinks.iterator();
+    while (it.hasNext()) {
+      ((Writable)it.next()).write(out);
     }
   }
 
   public String toString() {
     StringBuffer buffer = new StringBuffer();
     buffer.append("Inlinks:\n");
-    for (int i = 0; i < inlinks.size(); i++) {
+    Iterator it = inlinks.iterator();
+    while (it.hasNext()) {
       buffer.append(" ");
-      buffer.append(inlinks.get(i));
+      buffer.append(it.next());
       buffer.append("\n");
     }
     return buffer.toString();
@@ -68,8 +71,9 @@
   public String[] getAnchors() throws IOException {
     HashMap domainToAnchors = new HashMap();
     ArrayList results = new ArrayList();
-    for (int i = 0; i < inlinks.size(); i++) {
-      Inlink inlink = (Inlink)inlinks.get(i);
+    Iterator it = inlinks.iterator();
+    while (it.hasNext()) {
+      Inlink inlink = (Inlink)it.next();
       String anchor = inlink.getAnchor();
 
       if (anchor.length() == 0)                   // skip empty anchors

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=387341&r1=387340&r2=387341&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Mon Mar 20 15:20:56 2006
@@ -117,8 +117,10 @@
       }
 
       int end = Math.min(maxInlinks - result.size(), inlinks.size());
-      for (int i = 0; i < end; i++) {
-        result.add(inlinks.get(i));
+      Iterator it = inlinks.iterator();
+      int i = 0;
+      while(it.hasNext() && i++ < end) {
+        result.add((Inlink)it.next());
       }
     }
     output.collect(key, result);

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=387341&r1=387340&r2=387341&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Mon Mar 20 15:20:56 2006
@@ -29,6 +29,7 @@
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 
+import java.util.Iterator;
 import java.util.logging.Logger;
 
 /** . */
@@ -112,8 +113,9 @@
       if (links == null) {
         System.out.println(" - no link information.");
       } else {
-        for (int i = 0; i < links.size(); i++) {
-          System.out.println(links.get(i).toString());
+        Iterator it = links.iterator();
+        while (it.hasNext()) {
+          System.out.println(it.next().toString());
         }
       }
     } else {


Reply via email to