Author: pkosiorowski
Date: Sat Mar 25 03:19:31 2006
New Revision: 388742
URL: http://svn.apache.org/viewcvs?rev=388742&view=rev
Log:
Skipping post and nofollow outlinks
Modified:
lucene/nutch/branches/branch-0.7/CHANGES.txt
lucene/nutch/branches/branch-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
Modified: lucene/nutch/branches/branch-0.7/CHANGES.txt
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=388742&r1=388741&r2=388742&view=diff
==============================================================================
--- lucene/nutch/branches/branch-0.7/CHANGES.txt (original)
+++ lucene/nutch/branches/branch-0.7/CHANGES.txt Sat Mar 25 03:19:31 2006
@@ -22,6 +22,8 @@
9. Commons HTTPClient upgraded to version 3.0.
+10. Skipping "post" and "nofollow" outlinks.
+
Release 0.7.1 - 2005-10-01
Modified:
lucene/nutch/branches/branch-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=388742&r1=388741&r2=388742&view=diff
==============================================================================
--- lucene/nutch/branches/branch-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ lucene/nutch/branches/branch-0.7/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Sat Mar 25 03:19:31 2006
@@ -296,23 +296,29 @@
if (node.getNodeType() == Node.ELEMENT_NODE) {
LinkParams params =
(LinkParams)linkParams.get(node.getNodeName().toLowerCase());
if (params != null) {
- if (shouldThrowAwayLink(node, children, childLen, params)) {
- // this has no inner structure or just a single nested
- // anchor-- toss it!
- } else {
+ if (!shouldThrowAwayLink(node, children, childLen, params)) {
StringBuffer linkText = new StringBuffer();
getText(linkText, node, true);
NamedNodeMap attrs = node.getAttributes();
String target = null;
+ boolean noFollow = false;
+ boolean post = false;
for (int i= 0; i < attrs.getLength(); i++ ) {
- if (params.attrName.equalsIgnoreCase(attrs.item(i).getNodeName())) {
- target = attrs.item(i).getNodeValue();
- break;
+ Node attr = attrs.item(i);
+ String attrName = attr.getNodeName();
+ if (params.attrName.equalsIgnoreCase(attrName)) {
+ target = attr.getNodeValue();
+ } else if ("rel".equalsIgnoreCase(attrName) &&
+ "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+ noFollow = true;
+ } else if ("method".equalsIgnoreCase(attrName) &&
+ "post".equalsIgnoreCase(attr.getNodeValue())) {
+ post = true;
}
}
- if (target != null)
+ if (target != null && !noFollow && !post)
try {
URL url = new URL(base, target);
outlinks.add(new Outlink(url.toString(),