Author: nick
Date: Mon Jan 23 16:08:10 2012
New Revision: 1234873

URL: http://svn.apache.org/viewvc?rev=1234873&view=rev
Log:
TIKA-845 Correct the conversion of XML tags to multi-valued metadata values, 
and avoid duplicating existing values

Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java?rev=1234873&r1=1234872&r2=1234873&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java Mon Jan 23 16:08:10 2012
@@ -16,6 +16,9 @@
  */
 package org.apache.tika.parser.xml;
 
+import java.util.Arrays;
+import java.util.List;
+
 import org.apache.tika.metadata.Metadata;
 import org.xml.sax.helpers.DefaultHandler;
 
@@ -44,11 +47,23 @@ class AbstractMetadataHandler extends De
      */
     protected void addMetadata(String value) {
         if (value != null && value.length() > 0) {
-            String previous = metadata.get(name);
-            if (previous != null && previous.length() > 0) {
-                value = previous + ", " + value;
+            if (metadata.isMultiValued(name)) {
+                // Add the value, assuming it's not already there
+                List<String> previous = Arrays.asList(metadata.getValues(name));
+                if (!previous.contains(value)) {
+                    metadata.add(name, value);
+                }
+            } else {
+                // Set the value, assuming it's not already there
+                String previous = metadata.get(name);
+                if (previous != null && previous.length() > 0) {
+                    if (!previous.equals(value)) {
+                        metadata.add(name, value);
+                    }
+                } else {
+                    metadata.set(name, value);
+                }
             }
-            metadata.set(name, value);
         }
     }
 

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java?rev=1234873&r1=1234872&r2=1234873&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java Mon Jan 23 16:08:10 2012
@@ -40,9 +40,17 @@ public class DcXMLParserTest extends Tes
                     metadata.get(Metadata.CONTENT_TYPE));
             assertEquals("Tika test document", metadata.get(Metadata.TITLE));
             assertEquals("Rida Benjelloun", metadata.get(Metadata.CREATOR));
-            assertEquals(
-                    "Java, XML, XSLT, JDOM, Indexation",
-                    metadata.get(Metadata.SUBJECT));
+            
+            // The file contains 5 dc:subject tags, which come through as
+            //  a multi-valued Tika Metadata entry in file order
+            assertEquals(true, metadata.isMultiValued(Metadata.SUBJECT));
+            assertEquals(5,      metadata.getValues(Metadata.SUBJECT).length);
+            assertEquals("Java", metadata.getValues(Metadata.SUBJECT)[0]);
+            assertEquals("XML",  metadata.getValues(Metadata.SUBJECT)[1]);
+            assertEquals("XSLT", metadata.getValues(Metadata.SUBJECT)[2]);
+            assertEquals("JDOM", metadata.getValues(Metadata.SUBJECT)[3]);
+            assertEquals("Indexation", metadata.getValues(Metadata.SUBJECT)[4]);
+
             assertEquals(
                     "Framework d\'indexation des documents XML, HTML, PDF etc..",
                     metadata.get(Metadata.DESCRIPTION));


Reply via email to