The attached patch adds support for rel=nofollow. Links that specify this attribute are ignored, i.e. they are not extracted as outlinks. Any objections to committing this?

http://googleblog.blogspot.com/2005/01/preventing-comment-spam.html

Doug
Index: src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
===================================================================
--- src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java	(revision 326948)
+++ src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java	(working copy)
@@ -113,6 +113,12 @@
                + "<h2>End\tthis\rmadness\n!</h2>\r\n"
                + "         .        .        .         ."
                + "</body>  </html>"),
+
+    // test that <a rel=nofollow> links are not returned
+    new String("<html><head></head><body>"
+               + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+               + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+               + "</body></html>"),
   };
 
   private static String[] testBaseHrefs= {
@@ -123,6 +129,7 @@
     "http://www.nutch.org/frames/",     
     "http://www.nutch.org/maps/",
     "http://www.nutch.org/whitespace/",
+    "http://www.nutch.org//",
   };
   
   private static final DocumentFragment testDOMs[]=
@@ -145,6 +152,7 @@
         + "one two three space here space there no space "
         + "one two two three three four put some text here and there. "
         + "End this madness ! . . . .",
+    "ignore ignore",
   };
 
   private static final String[] answerTitle= {
@@ -155,6 +163,7 @@
     "my title",
     "my title",
     "my title",
+    "",
   };
 
   // note: should be in page-order
@@ -214,6 +223,8 @@
          {
              new Outlink("http://www.nutch.org/index.html", "whitespace test"),
          },
+         {
+         }
       };
    
     } catch (MalformedURLException e) {
Index: src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
===================================================================
--- src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java	(revision 326948)
+++ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java	(working copy)
@@ -306,13 +306,21 @@
 
           NamedNodeMap attrs = node.getAttributes();
           String target = null;
+          boolean noFollow = false;
           for (int i= 0; i < attrs.getLength(); i++ ) {
-            if (params.attrName.equalsIgnoreCase(attrs.item(i).getNodeName())) {
-              target = attrs.item(i).getNodeValue();
-              break;
+            Node attr = attrs.item(i);
+            String attrName = attr.getNodeName();
+
+            if ("rel".equalsIgnoreCase(attrName) &&
+                "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+              noFollow = true;
             }
+
+            if (params.attrName.equalsIgnoreCase(attrName)) {
+              target = attr.getNodeValue();
+            }
           }
-          if (target != null)
+          if (target != null && !noFollow)
             try {
               URL url = new URL(base, target);
               outlinks.add(new Outlink(url.toString(),

Reply via email to