Author: tallison
Date: Tue Jun 30 00:48:03 2015
New Revision: 1688337
URL: http://svn.apache.org/r1688337
Log:
TIKA-1601: integrate Jackcess to parse MSAccess files
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
(with props)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java
(with props)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java
(with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testAccess2.accdb
(with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testAccess2_2000.mdb
(with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testAccess2_2002-2003.mdb
(with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-bundle/pom.xml
tika/trunk/tika-parsers/pom.xml
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
tika/trunk/tika-server/pom.xml
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1688337&r1=1688336&r2=1688337&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Tue Jun 30 00:48:03 2015
@@ -1,5 +1,9 @@
Release 1.10 - Current Development
+ * Added parser for MS Access files via Jackcess. Many thanks
+ to Health Market Science, Brian O'Neill and James Ahlborn
+ for relicensing Jackcess to Apache v2! (TIKA-1601)
+
* GDALParser now correctly sets "nitf" as a supported
MediaType (TIKA-1664).
Modified: tika/trunk/tika-bundle/pom.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-bundle/pom.xml?rev=1688337&r1=1688336&r2=1688337&view=diff
==============================================================================
--- tika/trunk/tika-bundle/pom.xml (original)
+++ tika/trunk/tika-bundle/pom.xml Tue Jun 30 00:48:03 2015
@@ -130,6 +130,8 @@
pdfbox,fontbox,jempbox,bcmail-jdk15on,bcprov-jdk15on,bcpkix-jdk15on,
poi,poi-scratchpad,poi-ooxml,poi-ooxml-schemas,
xmlbeans,
+ jackcess,
+ commons-lang,
tagsoup,
asm-debug-all,
juniversalchardet,
Modified: tika/trunk/tika-parsers/pom.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1688337&r1=1688336&r2=1688337&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Tue Jun 30 00:48:03 2015
@@ -77,6 +77,11 @@
<artifactId>vorbis-java-tika</artifactId>
<version>${vorbis.version}</version>
</dependency>
+<dependency>
+ <groupId>com.healthmarketscience.jackcess</groupId>
+ <artifactId>jackcess</artifactId>
+ <version>2.1.2</version>
+ </dependency>
<!-- Optional OSGi dependencies, used only when running within OSGi -->
<dependency>
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java?rev=1688337&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
Tue Jun 30 00:48:03 2015
@@ -0,0 +1,312 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft;
+
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.text.DateFormat;
+import java.text.NumberFormat;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Locale;
+
+import com.healthmarketscience.jackcess.Column;
+import com.healthmarketscience.jackcess.DataType;
+import com.healthmarketscience.jackcess.Database;
+import com.healthmarketscience.jackcess.PropertyMap;
+import com.healthmarketscience.jackcess.Row;
+import com.healthmarketscience.jackcess.Table;
+import com.healthmarketscience.jackcess.query.Query;
+import com.healthmarketscience.jackcess.util.OleBlob;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Internal class that walks an already opened Jackcess {@link Database} and
+ * writes its properties, tables and stored queries to an
+ * {@link XHTMLContentHandler} / {@link Metadata} pair.
+ * <p>
+ * Needs to be instantiated for each parse because of the lack of thread
+ * safety with the dateTimeFormatter (and the currency formatter).
+ */
+class JackcessExtractor extends AbstractPOIFSExtractor {
+
+    final static String TEXT_FORMAT_KEY = "TextFormat";
+    final static String CURRENCY_FORMAT_KEY = "Format";
+    final static byte TEXT_FORMAT = 0;
+    final static byte RICH_TEXT_FORMAT = 1;
+    final static ParseContext EMPTY_PARSE_CONTEXT = new ParseContext();
+
+    final NumberFormat currencyFormatter;
+    final DateFormat shortDateTimeFormatter;
+
+    final HtmlParser htmlParser = new HtmlParser();
+
+    /**
+     * @param context parse context handed to the embedded-document machinery
+     * @param locale  locale used to format currency and date cells
+     */
+    protected JackcessExtractor(ParseContext context, Locale locale) {
+        super(context);
+        currencyFormatter = NumberFormat.getCurrencyInstance(locale);
+        shortDateTimeFormatter = DateFormat.getDateInstance(DateFormat.SHORT, locale);
+    }
+
+    /**
+     * Extracts the database: the password (if any) and the database,
+     * user-defined and summary properties go to {@code metadata}; every
+     * non-linked, non-system table is written as an XHTML table and every
+     * query as a {@code div} holding its SQL.
+     *
+     * @param db       opened database to read
+     * @param xhtml    handler that receives the content
+     * @param metadata metadata object that receives the properties
+     * @throws IOException   if the database cannot be read
+     * @throws SAXException  if the handler fails
+     * @throws TikaException if an embedded document cannot be handled
+     */
+    public void parse(Database db, XHTMLContentHandler xhtml, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+
+        String pw = db.getDatabasePassword();
+        if (pw != null) {
+            metadata.set(JackcessParser.MDB_PW, pw);
+        }
+
+        PropertyMap dbp = db.getDatabaseProperties();
+        for (PropertyMap.Property p : dbp) {
+            metadata.add(JackcessParser.MDB_PROPERTY_PREFIX + p.getName(),
+                    toString(p.getValue(), p.getType()));
+        }
+
+        PropertyMap up = db.getUserDefinedProperties();
+        for (PropertyMap.Property p : up) {
+            metadata.add(JackcessParser.USER_DEFINED_PROPERTY_PREFIX + p.getName(),
+                    toString(p.getValue(), p.getType()));
+        }
+
+        for (PropertyMap.Property p : db.getSummaryProperties()) {
+            metadata.add(JackcessParser.SUMMARY_PROPERTY_PREFIX + p.getName(),
+                    toString(p.getValue(), p.getType()));
+        }
+
+        //linked and system tables are deliberately excluded
+        Iterator<Table> it = db.newIterable().
+                setIncludeLinkedTables(false).
+                setIncludeSystemTables(false).iterator();
+
+        while (it.hasNext()) {
+            Table table = it.next();
+            String tableName = table.getName();
+            List<? extends Column> columns = table.getColumns();
+            xhtml.startElement("table", "name", tableName);
+            addHeaders(columns, xhtml);
+            xhtml.startElement("tbody");
+
+            Row r = table.getNextRow();
+
+            while (r != null) {
+                xhtml.startElement("tr");
+                for (Column c : columns) {
+                    handleCell(r, c, xhtml);
+                }
+                xhtml.endElement("tr");
+                r = table.getNextRow();
+            }
+            xhtml.endElement("tbody");
+            xhtml.endElement("table");
+        }
+
+        for (Query q : db.getQueries()) {
+            xhtml.startElement("div", "type", "sqlQuery");
+            xhtml.characters(q.toSQLString());
+            xhtml.endElement("div");
+        }
+    }
+
+    /**
+     * Writes a {@code thead} with one {@code th} per column name.
+     */
+    private void addHeaders(List<? extends Column> columns, XHTMLContentHandler xhtml)
+            throws SAXException {
+        xhtml.startElement("thead");
+        xhtml.startElement("tr");
+        for (Column c : columns) {
+            xhtml.startElement("th");
+            xhtml.characters(c.getName());
+            xhtml.endElement("th");
+        }
+        xhtml.endElement("tr");
+        xhtml.endElement("thead");
+    }
+
+    /**
+     * Writes a single {@code td}. OLE and BINARY columns are handed to the
+     * embedded-document handling, rich-text MEMO columns are run through the
+     * HTML parser so only their text is emitted, everything else is written
+     * as the string form produced by {@link #toString(Object, DataType)}.
+     */
+    private void handleCell(Row r, Column c, XHTMLContentHandler handler)
+            throws SAXException, IOException, TikaException {
+
+        handler.startElement("td");
+        if (c.getType().equals(DataType.OLE)) {
+            handleOLE(r, c.getName(), handler);
+        } else if (c.getType().equals(DataType.BINARY)) {
+            Object obj = r.get(c.getName());
+            if (obj != null) {
+                byte[] bytes = (byte[]) obj;
+                TikaInputStream tis = TikaInputStream.get(bytes);
+                try {
+                    handleEmbeddedResource(
+                            tis,
+                            null,//filename
+                            null,//relationshipId
+                            null,//mediatype
+                            handler, false);
+                } finally {
+                    //make sure the wrapper is released even if the
+                    //embedded handling fails part way through
+                    IOUtils.closeQuietly(tis);
+                }
+            }
+        } else {
+            Object obj = r.get(c.getName());
+            String v = toString(obj, c.getType());
+            if (isRichText(c)) {
+                BodyContentHandler h = new BodyContentHandler();
+                Metadata m = new Metadata();
+                m.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
+                try {
+                    htmlParser.parse(new ByteArrayInputStream(v.getBytes(IOUtils.UTF_8)),
+                            h,
+                            m, EMPTY_PARSE_CONTEXT);
+                    handler.characters(h.toString());
+                } catch (SAXException e) {
+                    //if something went wrong in htmlparser, just append the characters
+                    handler.characters(v);
+                }
+            } else {
+                handler.characters(v);
+            }
+        }
+        handler.endElement("td");
+    }
+
+    /**
+     * @return true only for a MEMO column whose "TextFormat" property is
+     * the rich-text marker byte; false for null columns, columns without
+     * properties and every other case
+     */
+    private boolean isRichText(Column c) throws IOException {
+
+        if (c == null) {
+            return false;
+        }
+
+        PropertyMap m = c.getProperties();
+        if (m == null) {
+            return false;
+        }
+        if (c.getType() == null || !c.getType().equals(DataType.MEMO)) {
+            return false;
+        }
+        Object b = m.getValue(TEXT_FORMAT_KEY);
+        if (b instanceof Byte) {
+            if (((Byte) b).byteValue() == RICH_TEXT_FORMAT) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Converts a cell or property value to text according to its Jackcess
+     * data type. Null values and skipped/unsupported types yield "".
+     */
+    private String toString(Object value, DataType type) {
+        if (value == null) {
+            return "";
+        }
+        if (type == null) {
+            //this shouldn't happen
+            return value.toString();
+        }
+        switch (type) {
+            case LONG:
+                return Integer.toString((Integer) value);
+            case TEXT:
+                return (String) value;
+            case MONEY:
+                //TODO: consider getting parsing "Format" field from
+                //field properties.
+                //Format the BigDecimal directly: going through
+                //doubleValue() can lose precision on large amounts.
+                return currencyFormatter.format((BigDecimal) value);
+            case SHORT_DATE_TIME:
+                return formatShortDateTime((Date) value);
+            case BOOLEAN:
+                return Boolean.toString((Boolean) value);
+            case MEMO:
+                return (String) value;
+            case INT:
+                return Short.toString((Short) value);
+            case DOUBLE:
+                return Double.toString((Double) value);
+            case FLOAT:
+                return Float.toString((Float) value);
+            case NUMERIC:
+                return value.toString();
+            case BYTE:
+                return Byte.toString((Byte) value);
+            case GUID:
+                return value.toString();
+            case COMPLEX_TYPE: //skip all these
+            case UNKNOWN_0D:
+            case UNKNOWN_11:
+            case UNSUPPORTED_FIXEDLEN:
+            case UNSUPPORTED_VARLEN:
+            default:
+                return "";
+
+        }
+    }
+
+    /**
+     * Handles an OLE column: link paths are written as text, packaged and
+     * "other" content is handed to the embedded-resource handling and
+     * compound storage is opened as an embedded office document. The blob
+     * is always closed once its content has been processed.
+     */
+    private void handleOLE(Row row, String cName, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        OleBlob blob = row.getBlob(cName);
+        //lifted shamelessly from Jackcess's OleBlobTest
+        if (blob == null) {
+            return;
+        }
+        try {
+            OleBlob.Content content = blob.getContent();
+            if (content == null) {
+                return;
+            }
+
+            switch (content.getType()) {
+                case LINK:
+                    xhtml.characters(((OleBlob.LinkContent) content).getLinkPath());
+                    break;
+                case SIMPLE_PACKAGE:
+                    OleBlob.SimplePackageContent spc = (OleBlob.SimplePackageContent) content;
+                    TikaInputStream packageStream = TikaInputStream.get(spc.getStream());
+                    try {
+                        handleEmbeddedResource(
+                                packageStream,
+                                spc.getFileName(),//filename
+                                null,//relationshipId
+                                spc.getTypeName(),//mediatype
+                                xhtml, false);
+                    } finally {
+                        IOUtils.closeQuietly(packageStream);
+                    }
+                    break;
+                case OTHER:
+                    OleBlob.OtherContent oc = (OleBlob.OtherContent) content;
+                    TikaInputStream otherStream = TikaInputStream.get(oc.getStream());
+                    try {
+                        handleEmbeddedResource(
+                                otherStream,
+                                null,//filename
+                                null,//relationshipId
+                                oc.getTypeName(),//mediatype
+                                xhtml, false);
+                    } finally {
+                        IOUtils.closeQuietly(otherStream);
+                    }
+                    break;
+                case COMPOUND_STORAGE:
+                    OleBlob.CompoundContent cc = (OleBlob.CompoundContent) content;
+                    handleCompoundContent(cc, xhtml);
+                    break;
+                default:
+                    //nothing to extract for any other content type
+                    break;
+            }
+        } finally {
+            try {
+                blob.close();
+            } catch (IOException e) {
+                //swallow = silent close
+            }
+        }
+    }
+
+    /**
+     * Opens compound-storage OLE content as a POIFS file system, hands its
+     * root to the embedded office document handling and closes the file
+     * system afterwards.
+     */
+    private void handleCompoundContent(OleBlob.CompoundContent cc, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        NPOIFSFileSystem nfs = new NPOIFSFileSystem(cc.getStream());
+        try {
+            handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml);
+        } finally {
+            nfs.close();
+        }
+    }
+
+    /**
+     * @return the currency-formatted value, or "" if {@code d} is null;
+     * {@code type} is currently unused
+     */
+    String formatCurrency(Double d, DataType type) {
+        if (d == null) {
+            return "";
+        }
+        return currencyFormatter.format(d);
+    }
+
+    /**
+     * @return the date in the short date style of the configured locale,
+     * or "" if {@code d} is null
+     */
+    String formatShortDateTime(Date d) {
+        if (d == null) {
+            return "";
+        }
+        return shortDateTimeFormatter.format(d);
+    }
+}
+
Propchange:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
------------------------------------------------------------------------------
svn:eol-style = LF
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java?rev=1688337&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java
Tue Jun 30 00:48:03 2015
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Locale;
+import java.util.Set;
+
+import com.healthmarketscience.jackcess.Database;
+import com.healthmarketscience.jackcess.DatabaseBuilder;
+import com.healthmarketscience.jackcess.util.LinkResolver;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser that handles Microsoft Access files via
+ * <a href="http://jackcess.sourceforge.net/">Jackcess</a>
+ * <p>
+ * Many, many thanks to LexisNexis®/Health Market Science (HMS), Brian O'Neill,
+ * and James Ahlborn for relicensing Jackcess to Apache v2.0!
+ */
+public class JackcessParser extends AbstractParser {
+
+    /** Prefix of the metadata keys that hold the database's summary properties. */
+    public static final String SUMMARY_PROPERTY_PREFIX = "MDB_SUMMARY_PROP" +
+            Metadata.NAMESPACE_PREFIX_DELIMITER;
+    /** Prefix of the metadata keys that hold the database properties. */
+    public static final String MDB_PROPERTY_PREFIX = "MDB_PROP" +
+            Metadata.NAMESPACE_PREFIX_DELIMITER;
+    /** Prefix of the metadata keys that hold the user-defined properties. */
+    public static final String USER_DEFINED_PROPERTY_PREFIX = "MDB_USER_PROP" +
+            Metadata.NAMESPACE_PREFIX_DELIMITER;
+    /** Metadata property that receives the database password, if one is set. */
+    public static final Property MDB_PW = Property.externalText("Password");
+    private final static LinkResolver IGNORE_LINK_RESOLVER = new IgnoreLinkResolver();
+
+    //TODO: figure out how to get this info
+    // public static Property LINKED_DATABASES = Property.externalTextBag("LinkedDatabases");
+
+    private static final long serialVersionUID = -752276948656079347L;
+
+    private static final MediaType MEDIA_TYPE = MediaType.application("x-msaccess");
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE);
+
+    private Locale locale = Locale.ROOT;
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Opens the stream as a read-only Access database and delegates the
+     * extraction to a fresh {@link JackcessExtractor}.
+     * <p>
+     * Jackcess needs a file, so the stream is accessed through
+     * {@link TikaInputStream#getFile()}; any temporary resources created
+     * for that are registered with a local {@link TemporaryResources} and
+     * disposed of once the database has been closed.
+     *
+     * @throws IOException   if the stream or the database cannot be read
+     * @throws SAXException  if the handler fails
+     * @throws TikaException if extraction or temp resource cleanup fails
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
+        TemporaryResources tmp = new TemporaryResources();
+        Database db = null;
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        try {
+            TikaInputStream tis = TikaInputStream.get(stream, tmp);
+            //The file has to go into the builder's constructor:
+            //new DatabaseBuilder().setReadOnly(true).open(file) resolves to
+            //the static open(File), which ignores the read-only setting.
+            db = new DatabaseBuilder(tis.getFile()).setReadOnly(true).open();
+            db.setLinkResolver(IGNORE_LINK_RESOLVER);//just in case
+            JackcessExtractor ex = new JackcessExtractor(context, locale);
+            ex.parse(db, xhtml, metadata);
+        } finally {
+            //close the database before the temp resources backing it are released
+            if (db != null) {
+                try {
+                    db.close();
+                } catch (IOException e) {
+                    //swallow = silent close
+                }
+            }
+            tmp.dispose();
+        }
+        xhtml.endDocument();
+    }
+
+    private static final class IgnoreLinkResolver implements LinkResolver {
+        //If links are resolved, Jackcess might try to open and process
+        //any file on the current system that is specified as a linked db.
+        //This could be a nasty security issue.
+        @Override
+        public Database resolveLinkedDatabase(Database database, String s) throws IOException {
+            throw new AssertionError("DO NOT ALLOW RESOLVING OF LINKS!!!");
+        }
+    }
+}
\ No newline at end of file
Propchange:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java
------------------------------------------------------------------------------
svn:eol-style = LF
Modified:
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1688337&r1=1688336&r2=1688337&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
(original)
+++
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
Tue Jun 30 00:48:03 2015
@@ -35,6 +35,7 @@ org.apache.tika.parser.jpeg.JpegParser
org.apache.tika.parser.mail.RFC822Parser
org.apache.tika.parser.mbox.MboxParser
org.apache.tika.parser.mbox.OutlookPSTParser
+org.apache.tika.parser.microsoft.JackcessParser
org.apache.tika.parser.microsoft.OfficeParser
org.apache.tika.parser.microsoft.OldExcelParser
org.apache.tika.parser.microsoft.TNEFParser
Added:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java?rev=1688337&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java
(added)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java
Tue Jun 30 00:48:03 2015
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+import java.util.List;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class JackcessParserTest extends TikaTest {
+
+ @Test
+ public void testBasic() throws Exception {
+
+ Parser p = new AutoDetectParser();
+
+ RecursiveParserWrapper w = new RecursiveParserWrapper(p,
+ new
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+
+ for (String fName : new String[]{"testAccess2.accdb",
"testAccess2_2000.mdb",
+ "testAccess2_2002-2003.mdb"}) {
+ InputStream is = null;
+ try {
+ is = this.getResourceAsStream("/test-documents/" + fName);
+
+ Metadata meta = new Metadata();
+ ParseContext c = new ParseContext();
+ w.parse(is, new DefaultHandler(), meta, c);
+ } finally {
+ IOUtils.closeQuietly(is);
+ }
+ List<Metadata> list = w.getMetadata();
+ assertEquals(4, list.size());
+ String mainContent =
list.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+
+ //make sure there's a thead and tbody
+ assertContains("</thead><tbody>", mainContent);
+
+ //assert table header
+ assertContains("<th>ShortTextField</th>", mainContent);
+
+ //test date format
+ assertContains("6/24/15", mainContent);
+
+ //test that markup is stripped
+ assertContains("over the bold italic dog", mainContent);
+
+ //test unicode
+ assertContains("\u666E\u6797\u65AF\u987F\u5927\u5B66",
mainContent);
+
+ //test embedded document handling
+ assertContains("Test Document with embedded pdf",
+ list.get(3).get(RecursiveParserWrapper.TIKA_CONTENT));
+
+ w.reset();
+ }
+ }
+
+}
Propchange:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java
------------------------------------------------------------------------------
svn:eol-style = LF
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testAccess2.accdb
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testAccess2.accdb?rev=1688337&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testAccess2.accdb
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testAccess2_2000.mdb
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testAccess2_2000.mdb?rev=1688337&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testAccess2_2000.mdb
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testAccess2_2002-2003.mdb
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testAccess2_2002-2003.mdb?rev=1688337&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testAccess2_2002-2003.mdb
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: tika/trunk/tika-server/pom.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-server/pom.xml?rev=1688337&r1=1688336&r2=1688337&view=diff
==============================================================================
--- tika/trunk/tika-server/pom.xml (original)
+++ tika/trunk/tika-server/pom.xml Tue Jun 30 00:48:03 2015
@@ -100,7 +100,7 @@
<dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
- <version>2.5</version>
+ <version>2.6</version> <!-- should sync with Jackcess in tika-parsers -->
</dependency>
<dependency>
<groupId>commons-io</groupId>