Author: cutting
Date: Fri Oct 21 14:04:54 2005
New Revision: 327581

URL: http://svn.apache.org/viewcvs?rev=327581&view=rev
Log:
Ignore rel=nofollow links.

Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=327581&r1=327580&r2=327581&view=diff ============================================================================== --- lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original) +++ lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Fri Oct 21 14:04:54 2005 @@ -306,13 +306,21 @@ NamedNodeMap attrs = node.getAttributes(); String target = null; + boolean noFollow = false; for (int i= 0; i < attrs.getLength(); i++ ) { - if (params.attrName.equalsIgnoreCase(attrs.item(i).getNodeName())) { - target = attrs.item(i).getNodeValue(); - break; + Node attr = attrs.item(i); + String attrName = attr.getNodeName(); + + if ("rel".equalsIgnoreCase(attrName) && + "nofollow".equalsIgnoreCase(attr.getNodeValue())) { + noFollow = true; + } + + if (params.attrName.equalsIgnoreCase(attrName)) { + target = attr.getNodeValue(); } } - if (target != null) + if (target != null && !noFollow) try { URL url = new URL(base, target); outlinks.add(new Outlink(url.toString(), Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=327581&r1=327580&r2=327581&view=diff ============================================================================== --- 
lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original) +++ lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Fri Oct 21 14:04:54 2005 @@ -113,6 +113,12 @@ + "<h2>End\tthis\rmadness\n!</h2>\r\n" + " . . . ." + "</body> </html>"), + + // test that <a rel=nofollow> links are not returned + new String("<html><head></head><body>" + + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>" + + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>" + + "</body></html>"), }; private static String[] testBaseHrefs= { @@ -123,6 +129,7 @@ "http://www.nutch.org/frames/", "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/", + "http://www.nutch.org//", }; private static final DocumentFragment testDOMs[]= @@ -145,6 +152,7 @@ + "one two three space here space there no space " + "one two two three three four put some text here and there. " + "End this madness ! . . . .", + "ignore ignore", }; private static final String[] answerTitle= { @@ -155,6 +163,7 @@ "my title", "my title", "my title", + "", }; // note: should be in page-order @@ -214,6 +223,8 @@ { new Outlink("http://www.nutch.org/index.html", "whitespace test"), }, + { + } }; } catch (MalformedURLException e) {