Author: tallison
Date: Thu Jul 16 00:55:40 2015
New Revision: 1691297
URL: http://svn.apache.org/r1691297
Log:
TIKA-1684
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java?rev=1691297&r1=1691296&r2=1691297&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java Thu Jul 16 00:55:40 2015
@@ -24,9 +24,11 @@ import java.math.BigDecimal;
import java.text.DateFormat;
import java.text.NumberFormat;
import java.util.Date;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
+import java.util.Set;
import com.healthmarketscience.jackcess.Column;
import com.healthmarketscience.jackcess.DataType;
@@ -41,6 +43,8 @@ import org.apache.tika.exception.TikaExc
import org.apache.tika.io.IOUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;
@@ -53,6 +57,10 @@ import org.xml.sax.SAXException;
*/
class JackcessExtractor extends AbstractPOIFSExtractor {
+ final static String TITLE_PROP_KEY = "Title";
+ final static String AUTHOR_PROP_KEY = "Author";
+ final static String COMPANY_PROP_KEY = "Company";
+
final static String TEXT_FORMAT_KEY = "TextFormat";
final static String CURRENCY_FORMAT_KEY = "Format";
final static byte TEXT_FORMAT = 0;
@@ -85,15 +93,38 @@ class JackcessExtractor extends Abstract
}
PropertyMap up = db.getUserDefinedProperties();
- for (PropertyMap.Property p : up
- ) {
+ for (PropertyMap.Property p : up) {
metadata.add(JackcessParser.USER_DEFINED_PROPERTY_PREFIX+ p.getName(),
toString(p.getValue(), p.getType()));
}
- for (PropertyMap.Property p : db.getSummaryProperties()) {
- metadata.add(JackcessParser.SUMMARY_PROPERTY_PREFIX+ p.getName(),
- toString(p.getValue(), p.getType()));
+ Set<String> found = new HashSet<>();
+ PropertyMap summaryProperties = db.getSummaryProperties();
+ if (summaryProperties != null) {
+ //try to get core properties
+ PropertyMap.Property title = summaryProperties.get(TITLE_PROP_KEY);
+ if (title != null) {
+ metadata.set(TikaCoreProperties.TITLE, toString(title.getValue(), title.getType()));
+ found.add(title.getName());
+ }
+ PropertyMap.Property author = summaryProperties.get(AUTHOR_PROP_KEY);
+ if (author != null) {
+ metadata.set(TikaCoreProperties.CREATOR, toString(author.getValue(), author.getType()));
+ found.add(author.getName());
+ }
+ PropertyMap.Property company = summaryProperties.get(COMPANY_PROP_KEY);
+ if (company != null) {
+ metadata.set(OfficeOpenXMLExtended.COMPANY, toString(company.getValue(), company.getType()));
+ found.add(company.getName());
+ }
+
+ for (PropertyMap.Property p : db.getSummaryProperties()) {
+ if (! found.contains(p.getName())) {
+ metadata.add(JackcessParser.SUMMARY_PROPERTY_PREFIX + p.getName(),
+ toString(p.getValue(), p.getType()));
+ }
+ }
+
}
Iterator<Table> it = db.newIterable().
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java?rev=1691297&r1=1691296&r2=1691297&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java Thu Jul 16 00:55:40 2015
@@ -47,7 +47,7 @@ import org.xml.sax.SAXException;
*/
public class JackcessParser extends AbstractParser {
- public static final String SUMMARY_PROPERTY_PREFIX = "MDB_SUMMARY_PROP";
+ public static final String SUMMARY_PROPERTY_PREFIX = "MDB_SUMMARY_PROP" + Metadata.NAMESPACE_PREFIX_DELIMITER;
public static String MDB_PROPERTY_PREFIX = "MDB_PROP" + Metadata.NAMESPACE_PREFIX_DELIMITER;
public static String USER_DEFINED_PROPERTY_PREFIX = "MDB_USER_PROP" + Metadata.NAMESPACE_PREFIX_DELIMITER;
public static Property MDB_PW = Property.externalText("Password");
@@ -102,4 +102,4 @@ public class JackcessParser extends Abst
throw new AssertionError("DO NOT ALLOW RESOLVING OF LINKS!!!");
}
}
-}
+}
\ No newline at end of file
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java?rev=1691297&r1=1691296&r2=1691297&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java Thu Jul 16 00:55:40 2015
@@ -25,6 +25,8 @@ import java.util.List;
import org.apache.tika.TikaTest;
import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -88,4 +90,13 @@ public class JackcessParserTest extends
XMLResult r = getXML("testAccess_V1997.mdb");
assertContains("hijklmnop", r.xml);
}
+
+ @Test
+ public void testMetadata() throws Exception {
+ //basic tests for normalized metadata
+ XMLResult r = getXML("testAccess_V1997.mdb");
+ assertEquals("tmccune", r.metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Health Market Science", r.metadata.get(OfficeOpenXMLExtended.COMPANY));
+ assertEquals("test", r.metadata.get(TikaCoreProperties.TITLE));
+ }
}