Author: tallison
Date: Thu Oct 8 01:31:12 2015
New Revision: 1707427
URL: http://svn.apache.org/viewvc?rev=1707427&view=rev
Log:
TIKA-1765
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_multi_authors.doc
(with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_multi_authors.docx
(with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1707427&r1=1707426&r2=1707427&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Thu Oct 8 01:31:12 2015
@@ -1,4 +1,7 @@
Release 1.11 - Current Development
+
+ * Parse multiple authors from MSOffice's semi-colon delimited
+ author field (TIKA-1765).
* Include CTAKESConfig.properties within tika-parsers resources
by default (TIKA-1741)
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java?rev=1707427&r1=1707426&r2=1707427&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
Thu Oct 8 01:31:12 2015
@@ -334,7 +334,8 @@ public class Metadata implements Creativ
if (property.isMultiValuePermitted()) {
set(property, appendedValues(values, value));
} else {
- throw new PropertyTypeException(property.getPropertyType());
+ throw new PropertyTypeException(property.getName() +
+ " : " + property.getPropertyType());
}
}
}
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java?rev=1707427&r1=1707426&r2=1707427&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
(original)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
Thu Oct 8 01:31:12 2015
@@ -38,11 +38,10 @@ public interface OfficeOpenXMLExtended
Property TEMPLATE = Property.externalText(
PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "Template");
- Property MANAGER = Property.externalText(
+ Property MANAGER = Property.externalTextBag(
PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "Manager");
- Property COMPANY = Property.externalText(
- PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "Company");
+ Property COMPANY = Property.externalText( PREFIX +
Metadata.NAMESPACE_PREFIX_DELIMITER + "Company");
Property PRESENTATION_FORMAT = Property.externalText(
PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER +
"PresentationFormat");
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java?rev=1707427&r1=1707426&r2=1707427&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
Thu Oct 8 01:31:12 2015
@@ -18,6 +18,8 @@
package org.apache.tika.parser.microsoft;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.math.BigDecimal;
@@ -50,8 +52,6 @@ import org.apache.tika.sax.BodyContentHa
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
-import static java.nio.charset.StandardCharsets.UTF_8;
-
/**
* Internal class. Needs to be instantiated for each parse because of
* the lack of thread safety with the dateTimeFormatter
@@ -109,8 +109,9 @@ class JackcessExtractor extends Abstract
found.add(title.getName());
}
PropertyMap.Property author =
summaryProperties.get(AUTHOR_PROP_KEY);
- if (author != null) {
- metadata.set(TikaCoreProperties.CREATOR,
toString(author.getValue(), author.getType()));
+ if (author != null && author.getValue() != null) {
+ String authorString = toString(author.getValue(),
author.getType());
+ SummaryExtractor.addMulti(metadata,
TikaCoreProperties.CREATOR, authorString);
found.add(author.getName());
}
PropertyMap.Property company =
summaryProperties.get(COMPANY_PROP_KEY);
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java?rev=1707427&r1=1707426&r2=1707427&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
Thu Oct 8 01:31:12 2015
@@ -19,6 +19,8 @@ package org.apache.tika.parser.microsoft
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Date;
+import java.util.HashSet;
+import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -101,7 +103,7 @@ public class SummaryExtractor {
private void parse(SummaryInformation summary) {
set(TikaCoreProperties.TITLE, summary.getTitle());
- set(TikaCoreProperties.CREATOR, summary.getAuthor());
+ addMulti(metadata, TikaCoreProperties.CREATOR, summary.getAuthor());
set(TikaCoreProperties.KEYWORDS, summary.getKeywords());
// TODO Move to OO subject in Tika 2.0
set(TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT,
summary.getSubject());
@@ -137,7 +139,7 @@ public class SummaryExtractor {
private void parse(DocumentSummaryInformation summary) {
set(OfficeOpenXMLExtended.COMPANY, summary.getCompany());
- set(OfficeOpenXMLExtended.MANAGER, summary.getManager());
+ addMulti(metadata, OfficeOpenXMLExtended.MANAGER,
summary.getManager());
set(TikaCoreProperties.LANGUAGE, getLanguage(summary));
set(OfficeOpenXMLCore.CATEGORY, summary.getCategory());
@@ -231,4 +233,28 @@ public class SummaryExtractor {
metadata.set(name, Long.toString(value));
}
}
+
+ //MS stores values that should be multiple values (e.g. dc:creator)
+ //as a semicolon-delimited list. We need to split
+ //on semicolon to add each value.
+ public static void addMulti(Metadata metadata, Property property, String
string) {
+ if (string == null) {
+ return;
+ }
+ String[] parts = string.split(";");
+ String[] current = metadata.getValues(property);
+ Set<String> seen = new HashSet<>();
+ if (current != null) {
+ for (String val : current) {
+ seen.add(val);
+ }
+ }
+ for (String part : parts) {
+ if (! seen.contains(part)) {
+ metadata.add(property, part);
+ seen.add(part);
+ }
+ }
+ }
+
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java?rev=1707427&r1=1707426&r2=1707427&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
Thu Oct 8 01:31:12 2015
@@ -35,6 +35,7 @@ import org.apache.tika.metadata.OfficeOp
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.microsoft.SummaryExtractor;
import
org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
import
org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
@@ -72,7 +73,7 @@ public class MetadataExtractor {
.getContentStatusProperty());
addProperty(metadata, TikaCoreProperties.CREATED, propsHolder
.getCreatedProperty());
- addProperty(metadata, TikaCoreProperties.CREATOR, propsHolder
+ addMultiProperty(metadata, TikaCoreProperties.CREATOR, propsHolder
.getCreatorProperty());
addProperty(metadata, TikaCoreProperties.DESCRIPTION, propsHolder
.getDescriptionProperty());
@@ -116,7 +117,7 @@ public class MetadataExtractor {
addProperty(metadata, OfficeOpenXMLExtended.APP_VERSION,
propsHolder.getAppVersion());
addProperty(metadata, TikaCoreProperties.PUBLISHER,
propsHolder.getCompany());
addProperty(metadata, OfficeOpenXMLExtended.COMPANY,
propsHolder.getCompany());
- addProperty(metadata, OfficeOpenXMLExtended.MANAGER,
propsHolder.getManager());
+ SummaryExtractor.addMulti(metadata, OfficeOpenXMLExtended.MANAGER,
propsHolder.getManager());
addProperty(metadata, OfficeOpenXMLExtended.NOTES,
propsHolder.getNotes());
addProperty(metadata, OfficeOpenXMLExtended.PRESENTATION_FORMAT,
propsHolder.getPresentationFormat());
addProperty(metadata, OfficeOpenXMLExtended.TEMPLATE,
propsHolder.getTemplate());
@@ -283,4 +284,12 @@ public class MetadataExtractor {
metadata.set(name, Integer.toString(value));
}
}
+
+ private void addMultiProperty(Metadata metadata, Property property,
Nullable<String> value) {
+ if (value == null) {
+ return;
+ }
+ SummaryExtractor.addMulti(metadata, property, value.getValue());
+ }
+
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1707427&r1=1707426&r2=1707427&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
Thu Oct 8 01:31:12 2015
@@ -479,5 +479,18 @@ public class WordParserTest extends Tika
assertContains(">01..1 01..1", xml);
assertContains(">02 02", xml);
}
+
+ @Test
+ public void testMultiAuthorsManagers() throws Exception {
+ XMLResult r = getXML("testWORD_multi_authors.doc");
+ String[] authors = r.metadata.getValues(TikaCoreProperties.CREATOR);
+ assertEquals(3, authors.length);
+ assertEquals("author2", authors[1]);
+
+ String[] managers =
r.metadata.getValues(OfficeOpenXMLExtended.MANAGER);
+ assertEquals(2, managers.length);
+ assertEquals("manager1", managers[0]);
+ assertEquals("manager2", managers[1]);
+ }
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1707427&r1=1707426&r2=1707427&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Thu Oct 8 01:31:12 2015
@@ -1192,6 +1192,19 @@ public class OOXMLParserTest extends Tik
assertContains("Footer - For Internal Use Only", content);
assertContains("Footer - Author: John Smith", content);
}
+
+ @Test
+ public void testMultiAuthorsManagers() throws Exception {
+ XMLResult r = getXML("testWORD_multi_authors.docx");
+ String[] authors = r.metadata.getValues(TikaCoreProperties.CREATOR);
+ assertEquals(3, authors.length);
+ assertEquals("author2", authors[1]);
+
+ String[] managers =
r.metadata.getValues(OfficeOpenXMLExtended.MANAGER);
+ assertEquals(2, managers.length);
+ assertEquals("manager1", managers[0]);
+ assertEquals("manager2", managers[1]);
+ }
}
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_multi_authors.doc
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_multi_authors.doc?rev=1707427&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_multi_authors.doc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_multi_authors.docx
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_multi_authors.docx?rev=1707427&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_multi_authors.docx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream