Author: nick
Date: Mon Jan 23 16:08:10 2012
New Revision: 1234873
URL: http://svn.apache.org/viewvc?rev=1234873&view=rev
Log:
TIKA-845 Correct the conversion of XML tags to multi-valued metadata values, and avoid duplicating existing values
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java?rev=1234873&r1=1234872&r2=1234873&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java Mon Jan 23 16:08:10 2012 @@ -16,6 +16,9 @@ */ package org.apache.tika.parser.xml; +import java.util.Arrays; +import java.util.List; + import org.apache.tika.metadata.Metadata; import org.xml.sax.helpers.DefaultHandler; @@ -44,11 +47,23 @@ class AbstractMetadataHandler extends De */ protected void addMetadata(String value) { if (value != null && value.length() > 0) { - String previous = metadata.get(name); - if (previous != null && previous.length() > 0) { - value = previous + ", " + value; + if (metadata.isMultiValued(name)) { + // Add the value, assuming it's not already there + List<String> previous = Arrays.asList(metadata.getValues(name)); + if (!previous.contains(value)) { + metadata.add(name, value); + } + } else { + // Set the value, assuming it's not already there + String previous = metadata.get(name); + if (previous != null && previous.length() > 0) { + if (!previous.equals(value)) { + metadata.add(name, value); + } + } else { + metadata.set(name, value); + } } - metadata.set(name, value); } } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java?rev=1234873&r1=1234872&r2=1234873&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java Mon Jan 23 16:08:10 2012 @@ -40,9 +40,17 @@ public class DcXMLParserTest extends Tes metadata.get(Metadata.CONTENT_TYPE)); assertEquals("Tika test document", metadata.get(Metadata.TITLE)); assertEquals("Rida Benjelloun", metadata.get(Metadata.CREATOR)); - assertEquals( - "Java, XML, XSLT, JDOM, Indexation", - metadata.get(Metadata.SUBJECT)); + + // The file contains 5 dc:subject tags, which come through as + // a multi-valued Tika Metadata entry in file order + assertEquals(true, metadata.isMultiValued(Metadata.SUBJECT)); + assertEquals(5, metadata.getValues(Metadata.SUBJECT).length); + assertEquals("Java", metadata.getValues(Metadata.SUBJECT)[0]); + assertEquals("XML", metadata.getValues(Metadata.SUBJECT)[1]); + assertEquals("XSLT", metadata.getValues(Metadata.SUBJECT)[2]); + assertEquals("JDOM", metadata.getValues(Metadata.SUBJECT)[3]); + assertEquals("Indexation", metadata.getValues(Metadata.SUBJECT)[4]); + assertEquals( "Framework d\'indexation des documents XML, HTML, PDF etc..", metadata.get(Metadata.DESCRIPTION));