NUTCH-1553 Property 'indexer.delete.robots.noindex' not working when using parser-html - fix broken unit test (fix HTML markup, make test for meta data extraction obligatory) - add all values of general metadata to parse metadata
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/34050ada Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/34050ada Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/34050ada Branch: refs/heads/master Commit: 34050adae0896a6d7ddb254a1622a03af6e07175 Parents: c18e19b Author: Sebastian Nagel <[email protected]> Authored: Fri Jul 1 15:07:52 2016 +0200 Committer: Sebastian Nagel <[email protected]> Committed: Fri Jul 1 15:10:49 2016 +0200 ---------------------------------------------------------------------- .../org/apache/nutch/metadata/Metadata.java | 25 ++++++++++++++++++++ .../org/apache/nutch/parse/html/HtmlParser.java | 4 +--- .../apache/nutch/parse/html/TestHtmlParser.java | 11 ++++----- 3 files changed, 31 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/34050ada/src/java/org/apache/nutch/metadata/Metadata.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/metadata/Metadata.java b/src/java/org/apache/nutch/metadata/Metadata.java index f0bfcd3..8a57ee3 100644 --- a/src/java/org/apache/nutch/metadata/Metadata.java +++ b/src/java/org/apache/nutch/metadata/Metadata.java @@ -123,6 +123,31 @@ public class Metadata implements Writable, CreativeCommons, DublinCore, } /** + * Add all name/value mappings (merge two metadata mappings). If a name + * already exists in current metadata the values are added to existing values. 
+ * + * @param metadata + * other Metadata to be merged + */ + public void addAll(Metadata metadata) { + for (String name : metadata.names()) { + String[] addValues = metadata.getValues(name); + if (addValues == null) + continue; + String[] oldValues = this.metadata.get(name); + if (oldValues == null) { + this.metadata.put(name, addValues); + } else { + String[] newValues = new String[oldValues.length + addValues.length]; + System.arraycopy(oldValues, 0, newValues, 0, oldValues.length); + System.arraycopy(addValues, 0, newValues, oldValues.length, + addValues.length); + this.metadata.put(name, newValues); + } + } + } + + /** * Copy All key-value pairs from properties. * * @param properties http://git-wip-us.apache.org/repos/asf/nutch/blob/34050ada/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java index b6666aa..4d043ba 100644 --- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java +++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java @@ -183,9 +183,7 @@ public class HtmlParser implements Parser { HTMLMetaProcessor.getMetaTags(metaTags, root, base); // populate Nutch metadata with HTML meta directives - for (String name : metaTags.getGeneralTags().names()) { - metadata.add(name, metaTags.getGeneralTags().get(name)); - } + metadata.addAll(metaTags.getGeneralTags()); if (LOG.isTraceEnabled()) { LOG.trace("Meta tags for " + base + ": " + metaTags.toString()); http://git-wip-us.apache.org/repos/asf/nutch/blob/34050ada/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java 
b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java index bcfe9e4..7099f50 100644 --- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java +++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java @@ -40,8 +40,8 @@ public class TestHtmlParser { private static final String encodingTestBody = "<ul>\n <li>français\n <li>español\n <li>русский язык\n <li>čeština\n <li>ελληνικά\n</ul>"; private static final String encodingTestContent = "<title>" + encodingTestKeywords + "</title>\n" - + "<meta name=\"keywords\" content=\"" + encodingTestKeywords - + "</meta>\n" + "</head>\n<body>" + encodingTestBody + "</body>\n</html>"; + + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "\" />\n" + + "</head>\n<body>" + encodingTestBody + "</body>\n</html>"; private static String[][] encodingTestPages = { { @@ -113,10 +113,9 @@ public class TestHtmlParser { Assert.assertTrue(keyword + " not found in text (" + name + ")", text.contains(keyword)); } - if (keywords != null) { - Assert.assertEquals("Keywords not extracted properly (" + name + ")", - encodingTestKeywords, keywords); - } + Assert.assertNotNull("No keywords extracted", keywords); + Assert.assertEquals("Keywords not extracted properly (" + name + ")", + encodingTestKeywords, keywords); } }
