NUTCH-1553 Property 'indexer.delete.robots.noindex' not working when using
parser-html
- fix broken unit test (fix HTML markup, make test for meta data extraction
obligatory)
- add all values of general metadata to parse metadata
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/34050ada
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/34050ada
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/34050ada
Branch: refs/heads/master
Commit: 34050adae0896a6d7ddb254a1622a03af6e07175
Parents: c18e19b
Author: Sebastian Nagel
Authored: Fri Jul 1 15:07:52 2016 +0200
Committer: Sebastian Nagel
Committed: Fri Jul 1 15:10:49 2016 +0200
--
.../org/apache/nutch/metadata/Metadata.java | 25
.../org/apache/nutch/parse/html/HtmlParser.java | 4 +---
.../apache/nutch/parse/html/TestHtmlParser.java | 11 -
3 files changed, 31 insertions(+), 9 deletions(-)
--
http://git-wip-us.apache.org/repos/asf/nutch/blob/34050ada/src/java/org/apache/nutch/metadata/Metadata.java
--
diff --git a/src/java/org/apache/nutch/metadata/Metadata.java b/src/java/org/apache/nutch/metadata/Metadata.java
index f0bfcd3..8a57ee3 100644
--- a/src/java/org/apache/nutch/metadata/Metadata.java
+++ b/src/java/org/apache/nutch/metadata/Metadata.java
@@ -123,6 +123,31 @@ public class Metadata implements Writable, CreativeCommons, DublinCore,
}
/**
+ * Add all name/value mappings (merge two metadata mappings). If a name
+ * already exists in current metadata the values are added to existing values.
+ *
+ * @param metadata
+ * other Metadata to be merged
+ */
+ public void addAll(Metadata metadata) {
+for (String name : metadata.names()) {
+ String[] addValues = metadata.getValues(name);
+ if (addValues == null)
+continue;
+ String[] oldValues = this.metadata.get(name);
+ if (oldValues == null) {
+this.metadata.put(name, addValues);
+ } else {
+String[] newValues = new String[oldValues.length + addValues.length];
+System.arraycopy(oldValues, 0, newValues, 0, oldValues.length);
+System.arraycopy(addValues, 0, newValues, oldValues.length,
+addValues.length);
+this.metadata.put(name, newValues);
+ }
+}
+ }
+
+ /**
* Copy All key-value pairs from properties.
*
* @param properties
http://git-wip-us.apache.org/repos/asf/nutch/blob/34050ada/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
--
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
index baa..4d043ba 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
@@ -183,9 +183,7 @@ public class HtmlParser implements Parser {
HTMLMetaProcessor.getMetaTags(metaTags, root, base);
// populate Nutch metadata with HTML meta directives
-for (String name : metaTags.getGeneralTags().names()) {
- metadata.add(name, metaTags.getGeneralTags().get(name));
-}
+metadata.addAll(metaTags.getGeneralTags());
if (LOG.isTraceEnabled()) {
LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
http://git-wip-us.apache.org/repos/asf/nutch/blob/34050ada/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
--
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
index bcfe9e4..7099f50 100644
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
+++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
@@ -40,8 +40,8 @@ public class TestHtmlParser {
private static final String encodingTestBody = "\n français\n
español\n русский язык\n čeština\n
ελληνικά\n";
private static final String encodingTestContent = ""
+ encodingTestKeywords + "\n"
- + "\n" + "\n" + encodingTestBody +
"\n";
+ + "\n"
+ + "\n" + encodingTestBody + "\n";
private static String[][] encodingTestPages = {
{
@@ -113,10 +113,9 @@ public class TestHtmlParser {
Assert.assertTrue(keyword + " not found in text (" + name + ")",
text.contains(keyword));
}
- if (keywords != null) {
-Assert.assertEquals("Keywords not