This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new dfdd00f31 NUTCH-2634 Some links marked as "nofollow" are followed
anyway - fix detection of nofollow in multi-valued rel attributes
new 9a1ed4015 Merge pull request #751 from sebastian-nagel/NUTCH-2634
dfdd00f31 is described below
commit dfdd00f3189839b6ed7d60651e5daa33f0038265
Author: Sebastian Nagel <[email protected]>
AuthorDate: Thu Jan 5 22:53:00 2023 +0100
NUTCH-2634 Some links marked as "nofollow" are followed anyway
- fix detection of nofollow in multi-valued rel attributes
---
.../org/apache/nutch/parse/html/DOMContentUtils.java | 9 +++++++--
.../apache/nutch/parse/html/TestDOMContentUtils.java | 17 ++++++++++++-----
.../org/apache/nutch/parse/tika/DOMContentUtils.java | 6 +++++-
.../apache/nutch/parse/tika/TestDOMContentUtils.java | 18 ++++++++++++------
4 files changed, 36 insertions(+), 14 deletions(-)
diff --git
a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
index 2415e8568..76685675b 100644
---
a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
+++
b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
@@ -23,6 +23,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
+import java.util.regex.Pattern;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.util.NodeWalker;
@@ -30,6 +31,7 @@ import org.apache.nutch.util.URLUtil;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
+
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
@@ -42,7 +44,10 @@ import org.apache.hadoop.io.Text;
*
*/
public class DOMContentUtils {
-
+
+ private static Pattern NOFOLLOW_PATTERN = Pattern.compile("\\bnofollow\\b",
+ Pattern.CASE_INSENSITIVE);
+
private String srcTagMetaName;
private boolean keepNodenames;
private Set<String> blockNodes;
@@ -451,7 +456,7 @@ public class DOMContentUtils {
if (params.attrName.equalsIgnoreCase(attrName)) {
target = attr.getNodeValue();
} else if ("rel".equalsIgnoreCase(attrName)
- && "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+ && NOFOLLOW_PATTERN.matcher(attr.getNodeValue()).find()) {
noFollow = true;
} else if ("method".equalsIgnoreCase(attrName)
&& "post".equalsIgnoreCase(attr.getNodeValue())) {
diff --git
a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
index 0c1212a50..d50e9052d 100644
---
a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
+++
b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
@@ -103,6 +103,11 @@ public class TestDOMContentUtils {
+ "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+ "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+ "</body></html>"),
+ // multiple space-separated rel values (NUTCH-2634)
+ new String("<html><head></head><body>"
+ + "<a href=\"http://www.nutch.org\" rel=\"noreferrer nofollow\">
ignore </a>"
+ + "<a rel=\"nofollow noreferrer\" href=\"http://www.nutch.org\">
ignore </a>"
+ + "</body></html>"),
// test that POST form actions are skipped
new String("<html><head></head><body>"
+ "<form method='POST' action='/search.jsp'><input type=text>"
@@ -132,13 +137,13 @@ public class TestDOMContentUtils {
+ "<source src=\"movie.mp4\" type=\"video/mp4\">"
+ "</video>" + "</body></html>"), };
- private static int SKIP = 9;
+ private static int SKIP = 10;
private static String[] testBaseHrefs = { "http://www.nutch.org",
"http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
"http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
"http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
- "http://www.nutch.org//", "http://www.nutch.org/",
+ "http://www.nutch.org//", "http://www.nutch.org//",
"http://www.nutch.org/",
"http://www.nutch.org/", "http://www.nutch.org/",
"http://www.nutch.org/;something", "http://www.nutch.org/",
"http://www.nutch.org/" };
@@ -159,12 +164,13 @@ public class TestDOMContentUtils {
+ "Tabs are spaces too. This is a break -> and the line after break
. "
+ "one two three space here space there no space "
+ "one two two three three four put some text here and there. "
- + "End this madness ! . . . .", "ignore ignore", "test1 test2",
+ + "End this madness ! . . . .",
+ "ignore ignore", "ignore ignore", "test1 test2",
"test1 test2", "title anchor1 anchor2 anchor3",
"title anchor1 anchor2 anchor3 anchor4 anchor5", "title", "" };
private static final String[] answerTitle = { "title", "title", "",
- "my title", "my title", "my title", "my title", "", "", "", "title",
+ "my title", "my title", "my title", "my title", "", "", "", "", "title",
"title", "title", "" };
// note: should be in page-order
@@ -218,7 +224,8 @@ public class TestDOMContentUtils {
new Outlink("http://www.nutch.org/bot.html", ""),
new Outlink("http://www.nutch.org/docs/index.html", ""), },
{ new Outlink("http://www.nutch.org/index.html", "whitespace test"),
},
- {},
+ {}, // nofollow
+ {}, // nofollow, multi-valued rel attribute
{ new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
{},
{ new Outlink("http://www.nutch.org/;x", "anchor1"),
diff --git
a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
index a9890825d..ebe1919fa 100644
---
a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
+++
b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
@@ -24,6 +24,7 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
+import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.MapWritable;
@@ -45,6 +46,9 @@ import org.w3c.dom.NodeList;
*/
public class DOMContentUtils {
+ private static Pattern NOFOLLOW_PATTERN = Pattern.compile("\\bnofollow\\b",
+ Pattern.CASE_INSENSITIVE);
+
private String srcTagMetaName;
private boolean keepNodenames;
private Set<String> blockNodes;
@@ -419,7 +423,7 @@ public class DOMContentUtils {
if (params.attrName.equalsIgnoreCase(attrName)) {
target = attr.getNodeValue();
} else if ("rel".equalsIgnoreCase(attrName)
- && "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+ && NOFOLLOW_PATTERN.matcher(attr.getNodeValue()).find()) {
noFollow = true;
} else if ("method".equalsIgnoreCase(attrName)
&& "post".equalsIgnoreCase(attr.getNodeValue())) {
diff --git
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java
index 2f04d7ff1..b449e1e4d 100644
---
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java
+++
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java
@@ -109,7 +109,12 @@ public class TestDOMContentUtils {
+ "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+ "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+ "</body></html>"),
- // test that all form actions are skipped
+ // multiple space-separated rel values (NUTCH-2634)
+ new String("<html><head></head><body>"
+ + "<a href=\"http://www.nutch.org\" rel=\"noreferrer nofollow\">
ignore </a>"
+ + "<a rel=\"nofollow noreferrer\" href=\"http://www.nutch.org\">
ignore </a>"
+ + "</body></html>"),
+ // test that all form actions are skipped
new String("<html><head></head><body>"
+ "<form method='POST' action='/search.jsp'><input type=text>"
+ "<input type=submit><p>test1</p></form>"
@@ -133,13 +138,13 @@ public class TestDOMContentUtils {
+ "<source src=\"movie.mp4\" type=\"video/mp4\">"
+ "</video>" + "</body></html>"), };
- private static int SKIP = 9;
+ private static int SKIP = 10;
private static String[] testBaseHrefs = { "http://www.nutch.org",
"http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
"http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
"http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
- "http://www.nutch.org//", "http://www.nutch.org/",
+ "http://www.nutch.org//", "http://www.nutch.org//",
"http://www.nutch.org/",
"http://www.nutch.org/", "http://www.nutch.org/",
"http://www.nutch.org/;something", "http://www.nutch.org/" };
@@ -156,12 +161,12 @@ public class TestDOMContentUtils {
+ "one\n" + "two\n" + "three\n" + "space here\n" + "space there\n"
+ "no space\n" + "one two\n" + "two three\n" + "three four\n"
+ "put some text here and there. End this madness ! . . . .",
- "ignore ignore", "test1 test2", "test1 test2",
+ "ignore ignore", "ignore ignore", "test1 test2", "test1 test2",
"title anchor1 anchor2 anchor3",
"title anchor1 anchor2 anchor3 anchor4 anchor5", "" };
private static final String[] answerTitle = { "title", "title", "",
- "my title", "my title", "my title", "my title", "", "", "", "title",
+ "my title", "my title", "my title", "my title", "", "", "", "", "title",
"title", "" };
// note: should be in page-order
@@ -213,7 +218,8 @@ public class TestDOMContentUtils {
new Outlink("http://www.nutch.org/bot.html", ""),
new Outlink("http://www.nutch.org/docs/index.html", ""), },
{ new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
- {},
+ {}, // nofollow
+ {}, // nofollow, multi-valued rel attribute
{},
{},
{ new Outlink("http://www.nutch.org/;x", "anchor1"),