Repository: nutch
Updated Branches:
  refs/heads/master d96c936b6 -> cb6fbae51
NUTCH-1553 Property 'indexer.delete.robots.noindex' not working when using parser-html
- add general metadata to parse metadata where it can be checked by the indexer

Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/cb6fbae5
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/cb6fbae5
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/cb6fbae5

Branch: refs/heads/master
Commit: cb6fbae51a56587c30d15b8f170ebbf470851168
Parents: d96c936
Author: Sebastian Nagel <[email protected]>
Authored: Thu Jun 30 08:12:02 2016 +0200
Committer: Sebastian Nagel <[email protected]>
Committed: Thu Jun 30 08:40:33 2016 +0200

----------------------------------------------------------------------
 .../src/java/org/apache/nutch/parse/html/HtmlParser.java | 6 ++++++
 1 file changed, 6 insertions(+)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/nutch/blob/cb6fbae5/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
index ecf2f12..b6666aa 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
@@ -181,6 +181,12 @@ public class HtmlParser implements Parser {
     // get meta directives
     HTMLMetaProcessor.getMetaTags(metaTags, root, base);
+
+    // populate Nutch metadata with HTML meta directives
+    for (String name : metaTags.getGeneralTags().names()) {
+      metadata.add(name, metaTags.getGeneralTags().get(name));
+    }
+
     if (LOG.isTraceEnabled()) {
       LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
     }
