Author: tallison
Date: Thu Oct  8 01:31:12 2015
New Revision: 1707427

URL: http://svn.apache.org/viewvc?rev=1707427&view=rev
Log:
TIKA-1765

Added:
    
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_multi_authors.doc
   (with props)
    
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_multi_authors.docx
   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
    
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1707427&r1=1707426&r2=1707427&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Thu Oct  8 01:31:12 2015
@@ -1,4 +1,7 @@
 Release 1.11 - Current Development
+
+  * Parse multiple authors from MSOffice's semi-colon delimited
+    author field (TIKA-1765).
   
   * Include CTAKESConfig.properties within tika-parsers resources 
     by default (TIKA-1741)

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java?rev=1707427&r1=1707426&r2=1707427&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java 
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java 
Thu Oct  8 01:31:12 2015
@@ -334,7 +334,8 @@ public class Metadata implements Creativ
              if (property.isMultiValuePermitted()) {
                  set(property, appendedValues(values, value));
              } else {
-                 throw new PropertyTypeException(property.getPropertyType());
+                 throw new PropertyTypeException(property.getName() +
+                         " : " + property.getPropertyType());
              }
         }
     }

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java?rev=1707427&r1=1707426&r2=1707427&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
 (original)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
 Thu Oct  8 01:31:12 2015
@@ -38,11 +38,10 @@ public interface OfficeOpenXMLExtended
     Property TEMPLATE = Property.externalText(
                PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "Template");
     
-    Property MANAGER = Property.externalText(
+    Property MANAGER = Property.externalTextBag(
                PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "Manager");
     
-    Property COMPANY = Property.externalText(
-               PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "Company");
+    Property COMPANY = Property.externalText(                  PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "Company");
     
     Property PRESENTATION_FORMAT = Property.externalText(
                PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "PresentationFormat");

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java?rev=1707427&r1=1707426&r2=1707427&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
 Thu Oct  8 01:31:12 2015
@@ -18,6 +18,8 @@
 package org.apache.tika.parser.microsoft;
 
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.math.BigDecimal;
@@ -50,8 +52,6 @@ import org.apache.tika.sax.BodyContentHa
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
 
-import static java.nio.charset.StandardCharsets.UTF_8;
-
 /**
  * Internal class.  Needs to be instantiated for each parse because of
  * the lack of thread safety with the dateTimeFormatter
@@ -109,8 +109,9 @@ class JackcessExtractor extends Abstract
                 found.add(title.getName());
             }
             PropertyMap.Property author = summaryProperties.get(AUTHOR_PROP_KEY);
-            if (author != null) {
-                metadata.set(TikaCoreProperties.CREATOR, toString(author.getValue(), author.getType()));
+            if (author != null && author.getValue() != null) {
+                String authorString = toString(author.getValue(), author.getType());
+                SummaryExtractor.addMulti(metadata, TikaCoreProperties.CREATOR, authorString);
                 found.add(author.getName());
             }
             PropertyMap.Property company = summaryProperties.get(COMPANY_PROP_KEY);

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java?rev=1707427&r1=1707426&r2=1707427&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
 Thu Oct  8 01:31:12 2015
@@ -19,6 +19,8 @@ package org.apache.tika.parser.microsoft
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.util.Date;
+import java.util.HashSet;
+import java.util.Set;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -101,7 +103,7 @@ public class SummaryExtractor {
 
     private void parse(SummaryInformation summary) {
         set(TikaCoreProperties.TITLE, summary.getTitle());
-        set(TikaCoreProperties.CREATOR, summary.getAuthor());
+        addMulti(metadata, TikaCoreProperties.CREATOR, summary.getAuthor());
         set(TikaCoreProperties.KEYWORDS, summary.getKeywords());
         // TODO Move to OO subject in Tika 2.0
         set(TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, summary.getSubject());
@@ -137,7 +139,7 @@ public class SummaryExtractor {
 
     private void parse(DocumentSummaryInformation summary) {
         set(OfficeOpenXMLExtended.COMPANY, summary.getCompany());
-        set(OfficeOpenXMLExtended.MANAGER, summary.getManager());
+        addMulti(metadata, OfficeOpenXMLExtended.MANAGER, summary.getManager());
         set(TikaCoreProperties.LANGUAGE, getLanguage(summary));
         set(OfficeOpenXMLCore.CATEGORY, summary.getCategory());
 
@@ -231,4 +233,28 @@ public class SummaryExtractor {
             metadata.set(name, Long.toString(value));
         }
     }
+
+    //MS stores values that should be multiple values (e.g. dc:creator)
+    //as a semicolon-delimited list.  We need to split
+    //on semicolon to add each value.
+    public static void addMulti(Metadata metadata, Property property, String string) {
+        if (string == null) {
+            return;
+        }
+        String[] parts = string.split(";");
+        String[] current = metadata.getValues(property);
+        Set<String> seen = new HashSet<>();
+        if (current != null) {
+            for (String val : current) {
+                seen.add(val);
+            }
+        }
+        for (String part : parts) {
+            if (! seen.contains(part)) {
+                metadata.add(property, part);
+                seen.add(part);
+            }
+        }
+    }
+
 }

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java?rev=1707427&r1=1707426&r2=1707427&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
 Thu Oct  8 01:31:12 2015
@@ -35,6 +35,7 @@ import org.apache.tika.metadata.OfficeOp
 import org.apache.tika.metadata.PagedText;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.microsoft.SummaryExtractor;
 import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
 import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
 
@@ -72,7 +73,7 @@ public class MetadataExtractor {
                 .getContentStatusProperty());
         addProperty(metadata, TikaCoreProperties.CREATED, propsHolder
                 .getCreatedProperty());
-        addProperty(metadata, TikaCoreProperties.CREATOR, propsHolder
+        addMultiProperty(metadata, TikaCoreProperties.CREATOR, propsHolder
                 .getCreatorProperty());
         addProperty(metadata, TikaCoreProperties.DESCRIPTION, propsHolder
                 .getDescriptionProperty());
@@ -116,7 +117,7 @@ public class MetadataExtractor {
         addProperty(metadata, OfficeOpenXMLExtended.APP_VERSION, propsHolder.getAppVersion());
         addProperty(metadata, TikaCoreProperties.PUBLISHER, propsHolder.getCompany());
         addProperty(metadata, OfficeOpenXMLExtended.COMPANY, propsHolder.getCompany());
-        addProperty(metadata, OfficeOpenXMLExtended.MANAGER, propsHolder.getManager());
+        SummaryExtractor.addMulti(metadata, OfficeOpenXMLExtended.MANAGER, propsHolder.getManager());
         addProperty(metadata, OfficeOpenXMLExtended.NOTES, propsHolder.getNotes());
         addProperty(metadata, OfficeOpenXMLExtended.PRESENTATION_FORMAT, propsHolder.getPresentationFormat());
         addProperty(metadata, OfficeOpenXMLExtended.TEMPLATE, propsHolder.getTemplate());
@@ -283,4 +284,12 @@ public class MetadataExtractor {
             metadata.set(name, Integer.toString(value));
         }
     }
+
+    private void addMultiProperty(Metadata metadata, Property property, Nullable<String> value) {
+        if (value == null) {
+            return;
+        }
+        SummaryExtractor.addMulti(metadata, property, value.getValue());
+    }
+
 }

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1707427&r1=1707426&r2=1707427&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
 Thu Oct  8 01:31:12 2015
@@ -479,5 +479,18 @@ public class WordParserTest extends Tika
         assertContains(">01..1 01..1", xml);
         assertContains(">02 02", xml);
     }
+
+    @Test
+    public void testMultiAuthorsManagers() throws Exception {
+        XMLResult r = getXML("testWORD_multi_authors.doc");
+        String[] authors = r.metadata.getValues(TikaCoreProperties.CREATOR);
+        assertEquals(3, authors.length);
+        assertEquals("author2", authors[1]);
+
+        String[] managers = r.metadata.getValues(OfficeOpenXMLExtended.MANAGER);
+        assertEquals(2, managers.length);
+        assertEquals("manager1", managers[0]);
+        assertEquals("manager2", managers[1]);
+    }
 }
 

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1707427&r1=1707426&r2=1707427&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 Thu Oct  8 01:31:12 2015
@@ -1192,6 +1192,19 @@ public class OOXMLParserTest extends Tik
         assertContains("Footer - For Internal Use Only", content);
         assertContains("Footer - Author: John Smith", content);
     }
+
+    @Test
+    public void testMultiAuthorsManagers() throws Exception {
+        XMLResult r = getXML("testWORD_multi_authors.docx");
+        String[] authors = r.metadata.getValues(TikaCoreProperties.CREATOR);
+        assertEquals(3, authors.length);
+        assertEquals("author2", authors[1]);
+
+        String[] managers = r.metadata.getValues(OfficeOpenXMLExtended.MANAGER);
+        assertEquals(2, managers.length);
+        assertEquals("manager1", managers[0]);
+        assertEquals("manager2", managers[1]);
+    }
 }
 
 

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_multi_authors.doc
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_multi_authors.doc?rev=1707427&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_multi_authors.doc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_multi_authors.docx
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_multi_authors.docx?rev=1707427&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_multi_authors.docx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream


Reply via email to