NUTCH-1553 Property 'indexer.delete.robots.noindex' not working when using parser-html
- fix broken unit test (fix HTML markup, make test for meta data extraction obligatory)
- add all values of general metadata to parse metadata


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/34050ada
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/34050ada
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/34050ada

Branch: refs/heads/master
Commit: 34050adae0896a6d7ddb254a1622a03af6e07175
Parents: c18e19b
Author: Sebastian Nagel <[email protected]>
Authored: Fri Jul 1 15:07:52 2016 +0200
Committer: Sebastian Nagel <[email protected]>
Committed: Fri Jul 1 15:10:49 2016 +0200

----------------------------------------------------------------------
 .../org/apache/nutch/metadata/Metadata.java     | 25 ++++++++++++++++++++
 .../org/apache/nutch/parse/html/HtmlParser.java |  4 +---
 .../apache/nutch/parse/html/TestHtmlParser.java | 11 ++++-----
 3 files changed, 31 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/34050ada/src/java/org/apache/nutch/metadata/Metadata.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/metadata/Metadata.java b/src/java/org/apache/nutch/metadata/Metadata.java
index f0bfcd3..8a57ee3 100644
--- a/src/java/org/apache/nutch/metadata/Metadata.java
+++ b/src/java/org/apache/nutch/metadata/Metadata.java
@@ -123,6 +123,31 @@ public class Metadata implements Writable, CreativeCommons, DublinCore,
   }
 
   /**
+   * Add all name/value mappings (merge two metadata mappings). If a name
+   * already exists in current metadata the values are added to existing values.
+   *
+   * @param metadata
+   *          other Metadata to be merged
+   */
+  public void addAll(Metadata metadata) {
+    for (String name : metadata.names()) {
+      String[] addValues = metadata.getValues(name);
+      if (addValues == null)
+        continue;
+      String[] oldValues = this.metadata.get(name);
+      if (oldValues == null) {
+        this.metadata.put(name, addValues);
+      } else {
+        String[] newValues = new String[oldValues.length + addValues.length];
+        System.arraycopy(oldValues, 0, newValues, 0, oldValues.length);
+        System.arraycopy(addValues, 0, newValues, oldValues.length,
+            addValues.length);
+        this.metadata.put(name, newValues);
+      }
+    }
+  }
+
+  /**
    * Copy All key-value pairs from properties.
    * 
    * @param properties

http://git-wip-us.apache.org/repos/asf/nutch/blob/34050ada/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
index b6666aa..4d043ba 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
@@ -183,9 +183,7 @@ public class HtmlParser implements Parser {
     HTMLMetaProcessor.getMetaTags(metaTags, root, base);
 
     // populate Nutch metadata with HTML meta directives
-    for (String name : metaTags.getGeneralTags().names()) {
-      metadata.add(name, metaTags.getGeneralTags().get(name));
-    }
+    metadata.addAll(metaTags.getGeneralTags());
 
     if (LOG.isTraceEnabled()) {
       LOG.trace("Meta tags for " + base + ": " + metaTags.toString());

http://git-wip-us.apache.org/repos/asf/nutch/blob/34050ada/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
index bcfe9e4..7099f50 100644
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
+++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
@@ -40,8 +40,8 @@ public class TestHtmlParser {
   private static final String encodingTestBody = "<ul>\n  <li>français\n  <li>español\n  <li>русский язык\n  <li>čeština\n  <li>ελληνικά\n</ul>";
   private static final String encodingTestContent = "<title>"
       + encodingTestKeywords + "</title>\n"
-      + "<meta name=\"keywords\" content=\"" + encodingTestKeywords
-      + "</meta>\n" + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
+      + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "\" />\n"
+      + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
 
   private static String[][] encodingTestPages = {
       {
@@ -113,10 +113,9 @@ public class TestHtmlParser {
         Assert.assertTrue(keyword + " not found in text (" + name + ")",
             text.contains(keyword));
       }
-      if (keywords != null) {
-        Assert.assertEquals("Keywords not extracted properly (" + name + ")",
-            encodingTestKeywords, keywords);
-      }
+      Assert.assertNotNull("No keywords extracted", keywords);
+      Assert.assertEquals("Keywords not extracted properly (" + name + ")",
+          encodingTestKeywords, keywords);
     }
   }
 

Reply via email to