Author: nick
Date: Thu Jul 15 10:37:20 2010
New Revision: 964373
URL: http://svn.apache.org/viewvc?rev=964373&view=rev
Log:
Update parsers to fix problems with new style Date properties, for TIKA-451
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java?rev=964373&r1=964372&r2=964373&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
Thu Jul 15 10:37:20 2010
@@ -45,6 +45,7 @@ public interface DublinCore {
/**
* Date on which the resource was changed.
+ * TODO Make me a Date Property
*/
String MODIFIED = "modified";
@@ -81,6 +82,7 @@ public interface DublinCore {
* the resource. Recommended best practice for encoding the date value is
* defined in a profile of ISO 8601 [W3CDTF] and follows the YYYY-MM-DD
* format.
+ * TODO Make me a Date Property
*/
String DATE = "date";
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java?rev=964373&r1=964372&r2=964373&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java
Thu Jul 15 10:37:20 2010
@@ -73,9 +73,10 @@ public interface MSOffice {
String SECURITY = "Security";
- Property EDIT_TIME =
- Property.internalDate("Edit-Time");
+ /** How long has been spent editing the document? */
+ String EDIT_TIME = "Edit-Time";
+ /** When was the document created? */
Property CREATION_DATE =
Property.internalDate("Creation-Date");
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java?rev=964373&r1=964372&r2=964373&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
Thu Jul 15 10:37:20 2010
@@ -16,6 +16,8 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
+import java.util.Date;
+
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.POIXMLProperties.CoreProperties;
import org.apache.poi.POIXMLProperties.ExtendedProperties;
@@ -23,6 +25,7 @@ import org.apache.poi.openxml4j.opc.inte
import org.apache.poi.openxml4j.util.Nullable;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
/**
@@ -58,6 +61,8 @@ public class MetadataExtractor {
.getContentStatusProperty());
addProperty(metadata, Metadata.DATE, propsHolder
.getCreatedPropertyString());
+ addProperty(metadata, Metadata.CREATION_DATE, propsHolder
+ .getCreatedProperty());
addProperty(metadata, Metadata.CREATOR, propsHolder
.getCreatorProperty());
addProperty(metadata, Metadata.AUTHOR, propsHolder
@@ -75,7 +80,7 @@ public class MetadataExtractor {
addProperty(metadata, Metadata.LAST_PRINTED, propsHolder
.getLastPrintedPropertyString());
addProperty(metadata, Metadata.LAST_MODIFIED, propsHolder
- .getModifiedPropertyString());
+ .getModifiedProperty());
addProperty(metadata, Metadata.REVISION_NUMBER, propsHolder
.getRevisionProperty());
addProperty(metadata, Metadata.SUBJECT, propsHolder
@@ -110,6 +115,12 @@ public class MetadataExtractor {
addProperty(metadata, Metadata.WORD_COUNT, propsHolder.getWords());
}
+ private void addProperty(Metadata metadata, Property property, Nullable<Date> value) {
+ if (value.getValue() != null) {
+ metadata.set(property, value.getValue());
+ }
+ }
+
private void addProperty(Metadata metadata, String name, Nullable<?> value) {
if (value.getValue() != null) {
addProperty(metadata, name, value.getValue().toString());
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java?rev=964373&r1=964372&r2=964373&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
Thu Jul 15 10:37:20 2010
@@ -71,7 +71,7 @@ public class OpenDocumentMetaParser exte
// Process the Dublin Core Attributes
ch = super.getContentHandler(ch, md);
// Process the OO Meta Attributes
- ch = getMeta(ch, md, Metadata.CREATION_DATE, "creation-date");
+ ch = getMeta(ch, md, Metadata.CREATION_DATE.getName(), "creation-date");
ch = getMeta(ch, md, Metadata.KEYWORDS, "keyword");
ch = getMeta(ch, md, Metadata.EDIT_TIME, "editing-duration");
ch = getMeta(ch, md, "editing-cycles", "editing-cycles");
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=964373&r1=964372&r2=964373&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
Thu Jul 15 10:37:20 2010
@@ -32,6 +32,7 @@ import org.apache.pdfbox.pdmodel.PDDocum
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -146,6 +147,12 @@ public class PDFParser implements Parser
}
}
+ private void addMetadata(Metadata metadata, Property property, Calendar value) {
+ if (value != null) {
+ metadata.set(property, value.getTime());
+ }
+ }
+
/**
* Used when processing custom metadata entries, as PDFBox won't do
* the conversion for us in the way it does for the standard ones