This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 508715175ad3a5cb7454f4734bb6dc870d80e7d1
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Fri May 15 23:11:08 2020 +0200

    NUTCH-2720 ROBOTS metatag ignored when capitalized
    - parse-tika: add lowercase "robots" to metadata
---
 .../src/java/org/apache/nutch/parse/tika/TikaParser.java | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index f2461fe..4d9495c 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -218,8 +218,14 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
         if (tikaMDName.equalsIgnoreCase(Metadata.TITLE))
           continue;
         String[] values = tikamd.getValues(tikaMDName);
-        for (String v : values)
+        for (String v : values) {
           nutchMetadata.add(tikaMDName, v);
+          if (tikaMDName.equalsIgnoreCase("robots")
+              && nutchMetadata.get("robots") == null) {
+            // NUTCH-2720 force lowercase robots directive
+            nutchMetadata.add("robots", v);
+          }
+        }
       }
 
       // no outlinks? try OutlinkExtractor e.g works for mime types where no