This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new dfdd00f31 NUTCH-2634 Some links marked as "nofollow" are followed 
anyway - fix detection of nofollow in multi-valued rel attributes
     new 9a1ed4015 Merge pull request #751 from sebastian-nagel/NUTCH-2634
dfdd00f31 is described below

commit dfdd00f3189839b6ed7d60651e5daa33f0038265
Author: Sebastian Nagel <[email protected]>
AuthorDate: Thu Jan 5 22:53:00 2023 +0100

    NUTCH-2634 Some links marked as "nofollow" are followed anyway
    - fix detection of nofollow in multi-valued rel attributes
---
 .../org/apache/nutch/parse/html/DOMContentUtils.java   |  9 +++++++--
 .../apache/nutch/parse/html/TestDOMContentUtils.java   | 17 ++++++++++++-----
 .../org/apache/nutch/parse/tika/DOMContentUtils.java   |  6 +++++-
 .../apache/nutch/parse/tika/TestDOMContentUtils.java   | 18 ++++++++++++------
 4 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
index 2415e8568..76685675b 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
@@ -23,6 +23,7 @@ import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Set;
+import java.util.regex.Pattern;
 
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.util.NodeWalker;
@@ -30,6 +31,7 @@ import org.apache.nutch.util.URLUtil;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.MapWritable;
 import org.apache.hadoop.io.Text;
@@ -42,7 +44,10 @@ import org.apache.hadoop.io.Text;
  * 
  */
 public class DOMContentUtils {
-  
+
+  private static Pattern NOFOLLOW_PATTERN = Pattern.compile("\\bnofollow\\b",
+      Pattern.CASE_INSENSITIVE);
+
   private String srcTagMetaName;
   private boolean keepNodenames;
   private Set<String> blockNodes;
@@ -451,7 +456,7 @@ public class DOMContentUtils {
               if (params.attrName.equalsIgnoreCase(attrName)) {
                 target = attr.getNodeValue();
               } else if ("rel".equalsIgnoreCase(attrName)
-                  && "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+                  && NOFOLLOW_PATTERN.matcher(attr.getNodeValue()).find()) {
                 noFollow = true;
               } else if ("method".equalsIgnoreCase(attrName)
                   && "post".equalsIgnoreCase(attr.getNodeValue())) {
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
index 0c1212a50..d50e9052d 100644
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
+++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
@@ -103,6 +103,11 @@ public class TestDOMContentUtils {
           + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
           + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
           + "</body></html>"),
+      // multiple space-separated rel values (NUTCH-2634)
+      new String("<html><head></head><body>"
+          + "<a href=\"http://www.nutch.org\" rel=\"noreferrer nofollow\"> ignore </a>"
+          + "<a rel=\"nofollow noreferrer\" href=\"http://www.nutch.org\"> ignore </a>"
+          + "</body></html>"),
       // test that POST form actions are skipped
       new String("<html><head></head><body>"
           + "<form method='POST' action='/search.jsp'><input type=text>"
@@ -132,13 +137,13 @@ public class TestDOMContentUtils {
           + "<source src=\"movie.mp4\" type=\"video/mp4\">"
           + "</video>" + "</body></html>"), };
 
-  private static int SKIP = 9;
+  private static int SKIP = 10;
 
   private static String[] testBaseHrefs = { "http://www.nutch.org",
       "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
       "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
       "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
-      "http://www.nutch.org//", "http://www.nutch.org/",
+      "http://www.nutch.org//", "http://www.nutch.org//", "http://www.nutch.org/",
       "http://www.nutch.org/", "http://www.nutch.org/",
       "http://www.nutch.org/;something", "http://www.nutch.org/",
       "http://www.nutch.org/" };
@@ -159,12 +164,13 @@ public class TestDOMContentUtils {
           + "Tabs are spaces too. This is a break -> and the line after break . "
           + "one two three space here space there no space "
           + "one two two three three four put some text here and there. "
-          + "End this madness ! . . . .", "ignore ignore", "test1 test2",
+          + "End this madness ! . . . .",
+      "ignore ignore", "ignore ignore", "test1 test2",
       "test1 test2", "title anchor1 anchor2 anchor3",
       "title anchor1 anchor2 anchor3 anchor4 anchor5", "title", "" };
 
   private static final String[] answerTitle = { "title", "title", "",
-      "my title", "my title", "my title", "my title", "", "", "", "title",
+      "my title", "my title", "my title", "my title", "", "", "", "", "title",
       "title", "title", "" };
 
   // note: should be in page-order
@@ -218,7 +224,8 @@ public class TestDOMContentUtils {
               new Outlink("http://www.nutch.org/bot.html", ""),
               new Outlink("http://www.nutch.org/docs/index.html", ""), },
           { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
-          {},
+          {}, // nofollow
+          {}, // nofollow, multi-valued rel attribute
           { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
           {},
           { new Outlink("http://www.nutch.org/;x", "anchor1"),
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
index a9890825d..ebe1919fa 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
@@ -24,6 +24,7 @@ import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
+import java.util.regex.Pattern;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.MapWritable;
@@ -45,6 +46,9 @@ import org.w3c.dom.NodeList;
  */
 public class DOMContentUtils {
 
+  private static Pattern NOFOLLOW_PATTERN = Pattern.compile("\\bnofollow\\b",
+      Pattern.CASE_INSENSITIVE);
+
   private String srcTagMetaName;
   private boolean keepNodenames;
   private Set<String> blockNodes;
@@ -419,7 +423,7 @@ public class DOMContentUtils {
               if (params.attrName.equalsIgnoreCase(attrName)) {
                 target = attr.getNodeValue();
               } else if ("rel".equalsIgnoreCase(attrName)
-                  && "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+                  && NOFOLLOW_PATTERN.matcher(attr.getNodeValue()).find()) {
                 noFollow = true;
               } else if ("method".equalsIgnoreCase(attrName)
                   && "post".equalsIgnoreCase(attr.getNodeValue())) {
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java
index 2f04d7ff1..b449e1e4d 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java
@@ -109,7 +109,12 @@ public class TestDOMContentUtils {
           + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
           + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
           + "</body></html>"),
-      // test that all form actions are skipped
+      // multiple space-separated rel values (NUTCH-2634)
+      new String("<html><head></head><body>"
+          + "<a href=\"http://www.nutch.org\" rel=\"noreferrer nofollow\"> ignore </a>"
+          + "<a rel=\"nofollow noreferrer\" href=\"http://www.nutch.org\"> ignore </a>"
+          + "</body></html>"),
+       // test that all form actions are skipped
       new String("<html><head></head><body>"
           + "<form method='POST' action='/search.jsp'><input type=text>"
           + "<input type=submit><p>test1</p></form>"
@@ -133,13 +138,13 @@ public class TestDOMContentUtils {
           + "<source src=\"movie.mp4\" type=\"video/mp4\">"
           + "</video>" + "</body></html>"), };
 
-  private static int SKIP = 9;
+  private static int SKIP = 10;
 
   private static String[] testBaseHrefs = { "http://www.nutch.org",
       "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
       "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
       "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
-      "http://www.nutch.org//", "http://www.nutch.org/",
+      "http://www.nutch.org//", "http://www.nutch.org//", "http://www.nutch.org/",
       "http://www.nutch.org/", "http://www.nutch.org/",
       "http://www.nutch.org/;something", "http://www.nutch.org/" };
 
@@ -156,12 +161,12 @@ public class TestDOMContentUtils {
           + "one\n" + "two\n" + "three\n" + "space here\n" + "space there\n"
           + "no space\n" + "one two\n" + "two three\n" + "three four\n"
           + "put some text here and there. End this madness ! . . . .",
-      "ignore ignore", "test1 test2", "test1 test2",
+      "ignore ignore", "ignore ignore", "test1 test2", "test1 test2",
       "title anchor1 anchor2 anchor3",
       "title anchor1 anchor2 anchor3 anchor4 anchor5", "" };
 
   private static final String[] answerTitle = { "title", "title", "",
-      "my title", "my title", "my title", "my title", "", "", "", "title",
+      "my title", "my title", "my title", "my title", "", "", "", "", "title",
       "title", "" };
 
   // note: should be in page-order
@@ -213,7 +218,8 @@ public class TestDOMContentUtils {
             new Outlink("http://www.nutch.org/bot.html", ""),
             new Outlink("http://www.nutch.org/docs/index.html", ""), },
         { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
-        {},
+        {}, // nofollow
+        {}, // nofollow, multi-valued rel attribute
         {},
         {},
         { new Outlink("http://www.nutch.org/;x", "anchor1"),

Reply via email to