Author: cutting
Date: Fri Oct 21 14:04:54 2005
New Revision: 327581

URL: http://svn.apache.org/viewcvs?rev=327581&view=rev
Log:
Ignore rel=nofollow links.

Modified:
    
lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
    
lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java

Modified: 
lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=327581&r1=327580&r2=327581&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Fri Oct 21 14:04:54 2005
@@ -306,13 +306,21 @@
 
           NamedNodeMap attrs = node.getAttributes();
           String target = null;
+          boolean noFollow = false;
           for (int i= 0; i < attrs.getLength(); i++ ) {
-            if (params.attrName.equalsIgnoreCase(attrs.item(i).getNodeName())) {
-              target = attrs.item(i).getNodeValue();
-              break;
+            Node attr = attrs.item(i);
+            String attrName = attr.getNodeName();
+
+            if ("rel".equalsIgnoreCase(attrName) &&
+                "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+              noFollow = true;
+            }
+
+            if (params.attrName.equalsIgnoreCase(attrName)) {
+              target = attr.getNodeValue();
             }
           }
-          if (target != null)
+          if (target != null && !noFollow)
             try {
               URL url = new URL(base, target);
               outlinks.add(new Outlink(url.toString(),

Modified: 
lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=327581&r1=327580&r2=327581&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Fri Oct 21 14:04:54 2005
@@ -113,6 +113,12 @@
                + "<h2>End\tthis\rmadness\n!</h2>\r\n"
                + "         .        .        .         ."
                + "</body>  </html>"),
+
+    // test that <a rel=nofollow> links are not returned
+    new String("<html><head></head><body>"
+               + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+               + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+               + "</body></html>"),
   };
 
   private static String[] testBaseHrefs= {
@@ -123,6 +129,7 @@
     "http://www.nutch.org/frames/",
     "http://www.nutch.org/maps/",
     "http://www.nutch.org/whitespace/",
+    "http://www.nutch.org//",
   };
   
   private static final DocumentFragment testDOMs[]=
@@ -145,6 +152,7 @@
         + "one two three space here space there no space "
         + "one two two three three four put some text here and there. "
         + "End this madness ! . . . .",
+    "ignore ignore",
   };
 
   private static final String[] answerTitle= {
@@ -155,6 +163,7 @@
     "my title",
     "my title",
     "my title",
+    "",
   };
 
   // note: should be in page-order
@@ -214,6 +223,8 @@
          {
              new Outlink("http://www.nutch.org/index.html", "whitespace test"),
          },
+         {
+         }
       };
    
     } catch (MalformedURLException e) {


Reply via email to