This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 508715175ad3a5cb7454f4734bb6dc870d80e7d1
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Fri May 15 23:11:08 2020 +0200

    NUTCH-2720 ROBOTS metatag ignored when capitalized
    
    - parse-tika: add lowercase "robots" to metadata
---
 .../src/java/org/apache/nutch/parse/tika/TikaParser.java          | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index f2461fe..4d9495c 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -218,8 +218,14 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
       if (tikaMDName.equalsIgnoreCase(Metadata.TITLE))
         continue;
       String[] values = tikamd.getValues(tikaMDName);
-      for (String v : values)
+      for (String v : values) {
         nutchMetadata.add(tikaMDName, v);
+        if (tikaMDName.equalsIgnoreCase("robots")
+            && nutchMetadata.get("robots") == null) {
+          // NUTCH-2720 force lowercase robots directive
+          nutchMetadata.add("robots", v);
+        }
+      }
     }
 
     // no outlinks? try OutlinkExtractor e.g works for mime types where no

Reply via email to