This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit fa319a60f30dbb0efcd67e306c611d66b7b379f1 Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Sun May 17 14:37:47 2020 +0200 NUTCH-2720 ROBOTS metatag ignored when capitalized - move string "robots" to constant in metadata.Nutch - make string lowercase not depend on system locale --- .../org/apache/nutch/indexer/IndexerMapReduce.java | 8 +++++--- src/java/org/apache/nutch/metadata/Nutch.java | 6 ++++++ .../apache/nutch/parse/html/HTMLMetaProcessor.java | 3 ++- .../apache/nutch/parse/tika/HTMLMetaProcessor.java | 23 +++++++++++++--------- .../org/apache/nutch/parse/tika/TikaParser.java | 6 +++--- 5 files changed, 30 insertions(+), 16 deletions(-) diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java index 3e9bc15..42093b7 100644 --- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java +++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java @@ -19,6 +19,8 @@ package org.apache.nutch.indexer; import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.Collection; +import java.util.Locale; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.commons.codec.binary.Base64; @@ -274,11 +276,11 @@ public class IndexerMapReduce extends Configured { // Handle robots meta? https://issues.apache.org/jira/browse/NUTCH-1434 if (deleteRobotsNoIndex) { // Get the robots meta data - String robotsMeta = parseData.getMeta("robots"); + String robotsMeta = parseData.getMeta(Nutch.ROBOTS_METATAG); // Has it a noindex for this url? - if (robotsMeta != null - && robotsMeta.toLowerCase().indexOf("noindex") != -1) { + if (robotsMeta != null && robotsMeta.toLowerCase(Locale.ROOT) + .indexOf("noindex") != -1) { // Delete it! context.write(key, DELETE_ACTION); context.getCounter("IndexerStatus", "deleted (robots=noindex)").increment(1); diff --git a/src/java/org/apache/nutch/metadata/Nutch.java b/src/java/org/apache/nutch/metadata/Nutch.java index d28808d..0cfb263 100644 --- a/src/java/org/apache/nutch/metadata/Nutch.java +++ b/src/java/org/apache/nutch/metadata/Nutch.java @@ -52,6 +52,12 @@ public interface Nutch { public static final String FETCH_STATUS_KEY = "_fst_"; + /** + * Name to store the <a href="https://www.robotstxt.org/meta.html">robots + * metatag</a> in {@link org.apache.nutch.parse.ParseData}'s metadata. + */ + public static final String ROBOTS_METATAG = "robots"; + /** * Sites may request that search engines don't provide access to cached * documents. diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java index 4e7ef14..d655a96 100644 --- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java +++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java @@ -18,6 +18,7 @@ package org.apache.nutch.parse.html; import java.net.URL; +import org.apache.nutch.metadata.Nutch; import org.apache.nutch.parse.HTMLMetaTags; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; @@ -79,7 +80,7 @@ public class HTMLMetaProcessor { if (contentNode != null) { String name = nameNode.getNodeValue().toLowerCase(); metaTags.getGeneralTags().add(name, contentNode.getNodeValue()); - if ("robots".equals(name)) { + if (Nutch.ROBOTS_METATAG.equals(name)) { String directives = contentNode.getNodeValue().toLowerCase(); int index = directives.indexOf("none"); diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java index 58f93ac..8584df7 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java @@ -18,7 +18,9 @@ package org.apache.nutch.parse.tika; import java.net.MalformedURLException; import java.net.URL; +import java.util.Locale; +import org.apache.nutch.metadata.Nutch; import org.apache.nutch.parse.HTMLMetaTags; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; @@ -66,7 +68,7 @@ public class HTMLMetaProcessor { // Retrieves name, http-equiv and content attribues for (int i = 0; i < attrs.getLength(); i++) { Node attr = attrs.item(i); - String attrName = attr.getNodeName().toLowerCase(); + String attrName = attr.getNodeName().toLowerCase(Locale.ROOT); if (attrName.equals("name")) { nameNode = attr; } else if (attrName.equals("http-equiv")) { @@ -78,10 +80,11 @@ public class HTMLMetaProcessor { if (nameNode != null) { if (contentNode != null) { - String name = nameNode.getNodeValue().toLowerCase(); + String name = nameNode.getNodeValue().toLowerCase(Locale.ROOT); metaTags.getGeneralTags().add(name, contentNode.getNodeValue()); - if ("robots".equals(name)) { - String directives = contentNode.getNodeValue().toLowerCase(); + if (Nutch.ROBOTS_METATAG.equals(name)) { + String directives = contentNode.getNodeValue() + .toLowerCase(Locale.ROOT); int index = directives.indexOf("none"); if (index >= 0) { @@ -112,12 +115,14 @@ public class HTMLMetaProcessor { } // end if (name == robots) // meta names added/transformed by Tika else if (name.equals("pragma")) { - String content = contentNode.getNodeValue().toLowerCase(); + String content = contentNode.getNodeValue() + .toLowerCase(Locale.ROOT); if (content.contains("no-cache")) { metaTags.setNoCache(); } } else if (name.equals("refresh")) { - String content = contentNode.getNodeValue().toLowerCase(); + String content = contentNode.getNodeValue() + .toLowerCase(Locale.ROOT); setRefresh(metaTags, content, currURL); } else if (name.equals("content-location")) { String urlString = contentNode.getNodeValue(); @@ -138,11 +143,11 @@ public class HTMLMetaProcessor { if (equivNode != null) { if (contentNode != null) { - String name = equivNode.getNodeValue().toLowerCase(); + String name = equivNode.getNodeValue().toLowerCase(Locale.ROOT); String content = contentNode.getNodeValue(); metaTags.getHttpEquivTags().setProperty(name, content); if ("pragma".equals(name)) { - content = content.toLowerCase(); + content = content.toLowerCase(Locale.ROOT); int index = content.indexOf("no-cache"); if (index >= 0) metaTags.setNoCache(); @@ -203,7 +208,7 @@ public class HTMLMetaProcessor { } URL refreshUrl = null; if (metaTags.getRefresh() && idx != -1) { // set the URL - idx = content.toLowerCase().indexOf("url="); + idx = content.toLowerCase(Locale.ROOT).indexOf("url="); if (idx == -1) { // assume a mis-formatted entry with just the // url idx = content.indexOf(';') + 1; diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java index 4d9495c..d97e8b4 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java @@ -220,10 +220,10 @@ public class TikaParser implements org.apache.nutch.parse.Parser { String[] values = tikamd.getValues(tikaMDName); for (String v : values) { nutchMetadata.add(tikaMDName, v); - if (tikaMDName.equalsIgnoreCase("robots") - && nutchMetadata.get("robots") == null) { + if (tikaMDName.equalsIgnoreCase(Nutch.ROBOTS_METATAG) + && nutchMetadata.get(Nutch.ROBOTS_METATAG) == null) { // NUTCH-2720 force lowercase robots directive - nutchMetadata.add("robots", v); + nutchMetadata.add(Nutch.ROBOTS_METATAG, v); } } }