This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit bb10dc205eee2a0f5e684153820bfb7babccc627 Author: TALLISON <[email protected]> AuthorDate: Thu Sep 6 11:11:02 2018 -0400 TIKA-2552 -- upgrade to POI 4.0.0 --- .../org/apache/tika/batch/fs/BatchDriverTest.java | 2 +- tika-bundle/pom.xml | 2 + tika-eval/pom.xml | 2 +- tika-parsers/pom.xml | 2 +- .../tika/parser/microsoft/ExcelExtractor.java | 6 +- .../tika/parser/microsoft/HSLFExtractor.java | 37 +- .../parser/microsoft/JackcessCompoundOleUtil.java | 268 +++++++ .../tika/parser/microsoft/JackcessExtractor.java | 28 +- .../tika/parser/microsoft/JackcessOleUtil.java | 813 +++++++++++++++++++++ .../apache/tika/parser/microsoft/OfficeParser.java | 21 +- .../tika/parser/microsoft/OutlookExtractor.java | 10 +- .../parser/microsoft/POIFSContainerDetector.java | 12 +- .../tika/parser/microsoft/SummaryExtractor.java | 6 +- .../tika/parser/microsoft/WordExtractor.java | 6 +- .../microsoft/ooxml/AbstractOOXMLExtractor.java | 17 +- .../parser/microsoft/ooxml/MetadataExtractor.java | 114 ++- .../parser/microsoft/ooxml/OOXMLExtractor.java | 9 +- .../microsoft/ooxml/OOXMLExtractorFactory.java | 28 +- .../ooxml/POIXMLTextExtractorDecorator.java | 2 +- .../ooxml/XSLFPowerPointExtractorDecorator.java | 22 +- .../ooxml/XSSFBExcelExtractorDecorator.java | 2 +- .../ooxml/XSSFExcelExtractorDecorator.java | 2 +- .../microsoft/ooxml/xps/XPSExtractorDecorator.java | 15 +- .../microsoft/ooxml/xps/XPSTextExtractor.java | 7 +- .../xslf/XSLFEventBasedPowerPointExtractor.java | 6 +- .../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 8 +- .../tika/parser/pkg/ZipContainerDetector.java | 2 +- .../apache/tika/parser/rtf/RTFObjDataParser.java | 11 +- .../tika/detect/TestContainerAwareDetector.java | 7 +- .../parser/microsoft/PowerPointParserTest.java | 6 +- .../apache/tika/server/resource/TikaResource.java | 2 +- 31 files changed, 1277 insertions(+), 198 deletions(-) diff --git a/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchDriverTest.java 
b/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchDriverTest.java index 13e35e6..643e7cb 100644 --- a/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchDriverTest.java +++ b/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchDriverTest.java @@ -115,7 +115,7 @@ public class BatchDriverTest extends FSBatchTestBase { readFileToString(outputDir.resolve("test2_ok.xml.xml"), UTF_8)); } - @Test(timeout = 30000) + @Test(timeout = 60000) public void allHeavyHangsTestWithStarvedCrawler() throws Exception { //this tests that if all consumers are hung and the crawler is //waiting to add to the queue, there isn't deadlock. The BatchProcess should diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml index c9d635d..e265636f 100644 --- a/tika-bundle/pom.xml +++ b/tika-bundle/pom.xml @@ -282,6 +282,8 @@ org.apache.commons.httpclient.params;resolution:=optional, org.apache.commons.httpclient.protocol;resolution:=optional, org.apache.commons.httpclient.util;resolution:=optional, + org.apache.commons.math3.exception;resolution:=optional, + org.apache.commons.math3.linear;resolution:=optional, org.apache.commons.vfs2;resolution:=optional, org.apache.commons.vfs2.provider;resolution:=optional, org.apache.commons.vfs2.util;resolution:=optional, diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml index c7d28fd..9289116 100644 --- a/tika-eval/pom.xml +++ b/tika-eval/pom.xml @@ -36,7 +36,7 @@ <properties> <cli.version>1.4</cli.version> <!--sync version with tika-server or move to parent? 
--> <lucene.version>7.4.0</lucene.version> - <poi.version>3.17</poi.version> + <poi.version>4.0.0</poi.version> </properties> <dependencies> diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml index 737129f..18cfba5 100644 --- a/tika-parsers/pom.xml +++ b/tika-parsers/pom.xml @@ -35,7 +35,7 @@ <url>http://tika.apache.org/</url> <properties> - <poi.version>3.17</poi.version> + <poi.version>4.0.0</poi.version> <!-- NOTE: sync codec version with POI --> <codec.version>1.11</codec.version> <!-- NOTE: sync tukaani version with commons-compress in tika-parent--> diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java index ff5971a..0dd86ba 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java @@ -64,7 +64,7 @@ import org.apache.poi.poifs.filesystem.DirectoryEntry; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DocumentInputStream; import org.apache.poi.poifs.filesystem.Entry; -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; @@ -139,7 +139,7 @@ public class ExcelExtractor extends AbstractPOIFSExtractor { * or writing the extracted content */ protected void parse( - NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml, + POIFSFileSystem filesystem, XHTMLContentHandler xhtml, Locale locale) throws IOException, SAXException, TikaException { parse(filesystem.getRoot(), xhtml, locale); } @@ -273,7 +273,7 @@ public class ExcelExtractor extends AbstractPOIFSExtractor { * @throws IOException on any IO errors. * @throws SAXException on any SAX parsing errors. 
*/ - public void processFile(NPOIFSFileSystem filesystem, boolean listenForAllRecords) + public void processFile(POIFSFileSystem filesystem, boolean listenForAllRecords) throws IOException, SAXException, TikaException { processFile(filesystem.getRoot(), listenForAllRecords); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java index 7057cbe..5095709 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java @@ -24,9 +24,7 @@ import java.util.List; import org.apache.poi.common.usermodel.Hyperlink; import org.apache.poi.hslf.exceptions.EncryptedPowerPointFileException; -import org.apache.poi.hslf.model.Comment; import org.apache.poi.hslf.model.HeadersFooters; -import org.apache.poi.hslf.model.OLEShape; import org.apache.poi.hslf.record.DocInfoListContainer; import org.apache.poi.hslf.record.RecordTypes; import org.apache.poi.hslf.record.VBAInfoAtom; @@ -35,6 +33,7 @@ import org.apache.poi.hslf.usermodel.HSLFGroupShape; import org.apache.poi.hslf.usermodel.HSLFMasterSheet; import org.apache.poi.hslf.usermodel.HSLFNotes; import org.apache.poi.hslf.usermodel.HSLFObjectData; +import org.apache.poi.hslf.usermodel.HSLFObjectShape; import org.apache.poi.hslf.usermodel.HSLFPictureData; import org.apache.poi.hslf.usermodel.HSLFShape; import org.apache.poi.hslf.usermodel.HSLFSlide; @@ -46,7 +45,9 @@ import org.apache.poi.hslf.usermodel.HSLFTextParagraph; import org.apache.poi.hslf.usermodel.HSLFTextRun; import org.apache.poi.hslf.usermodel.HSLFTextShape; import org.apache.poi.poifs.filesystem.DirectoryNode; -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.sl.usermodel.Comment; +import org.apache.poi.sl.usermodel.SimpleShape; import 
org.apache.tika.exception.TikaException; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.extractor.EmbeddedDocumentUtil; @@ -67,7 +68,7 @@ public class HSLFExtractor extends AbstractPOIFSExtractor { } protected void parse( - NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml) + POIFSFileSystem filesystem, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { parse(filesystem.getRoot(), xhtml); } @@ -269,9 +270,9 @@ public class HSLFExtractor extends AbstractPOIFSExtractor { long persistId = vbaAtom.getPersistIdRef(); for (HSLFObjectData objData : ppt.getEmbeddedObjects()) { if (objData.getExOleObjStg().getPersistId() == persistId) { - try (NPOIFSFileSystem npoifsFileSystem = new NPOIFSFileSystem(objData.getData())) { + try (POIFSFileSystem poifsFileSystem = new POIFSFileSystem(objData.getInputStream())) { try { - OfficeParser.extractMacros(npoifsFileSystem, xhtml, + OfficeParser.extractMacros(poifsFileSystem, xhtml, EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context)); } catch (IOException|SAXException inner) { EmbeddedDocumentUtil.recordException(inner, parentMetadata); @@ -295,7 +296,7 @@ public class HSLFExtractor extends AbstractPOIFSExtractor { xhtml.startElement("div", "class", "slide-master-content"); for (HSLFShape shape : shapes) { - if (shape != null && !HSLFMasterSheet.isPlaceholder(shape)) { + if (shape != null && ! 
isPlaceholder(shape)) { if (shape instanceof HSLFTextShape) { HSLFTextShape tsh = (HSLFTextShape) shape; String text = tsh.getText(); @@ -308,6 +309,10 @@ public class HSLFExtractor extends AbstractPOIFSExtractor { xhtml.endElement("div"); } + private boolean isPlaceholder(HSLFShape shape) { + return shape instanceof SimpleShape && ((SimpleShape)shape).isPlaceholder(); + } + private void extractTableText(XHTMLContentHandler xhtml, HSLFTable shape) throws SAXException { xhtml.startElement("table"); for (int row = 0; row < shape.getNumberOfRows(); row++) { @@ -449,8 +454,8 @@ public class HSLFExtractor extends AbstractPOIFSExtractor { } for (HSLFShape shape : shapes) { - if (shape instanceof OLEShape) { - OLEShape oleShape = (OLEShape) shape; + if (shape instanceof HSLFObjectShape) { + HSLFObjectShape oleShape = (HSLFObjectShape) shape; HSLFObjectData data = null; try { data = oleShape.getObjectData(); @@ -474,14 +479,14 @@ public class HSLFExtractor extends AbstractPOIFSExtractor { xhtml.endElement("div"); InputStream dataStream = null; try { - dataStream = data.getData(); + dataStream = data.getInputStream(); } catch (Exception e) { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata); continue; } try (TikaInputStream stream = TikaInputStream.get(dataStream)) { String mediaType = null; - if ("Excel.Chart.8".equals(oleShape.getProgID())) { + if ("Excel.Chart.8".equals(oleShape.getProgId())) { mediaType = "application/vnd.ms-excel"; } else { MediaType mt = getTikaConfig().getDetector().detect(stream, new Metadata()); @@ -489,18 +494,18 @@ public class HSLFExtractor extends AbstractPOIFSExtractor { } if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj") || mediaType.equals("application/x-tika-msoffice")) { - NPOIFSFileSystem npoifs = null; + POIFSFileSystem poifs = null; try { - npoifs = new NPOIFSFileSystem(new CloseShieldInputStream(stream)); + poifs = new POIFSFileSystem(new CloseShieldInputStream(stream)); } catch 
(RuntimeException e) { throw new IOExceptionWithCause(e); } try { - handleEmbeddedOfficeDoc(npoifs.getRoot(), objID, xhtml); + handleEmbeddedOfficeDoc(poifs.getRoot(), objID, xhtml); } finally { - if (npoifs != null) { - npoifs.close(); + if (poifs != null) { + poifs.close(); } } } else { diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessCompoundOleUtil.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessCompoundOleUtil.java new file mode 100644 index 0000000..b09f19d --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessCompoundOleUtil.java @@ -0,0 +1,268 @@ +/* +Copyright (c) 2013 James Ahlborn + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package org.apache.tika.parser.microsoft; + +import com.healthmarketscience.jackcess.RuntimeIOException; +import com.healthmarketscience.jackcess.impl.ByteUtil; +import com.healthmarketscience.jackcess.impl.CustomToStringStyle; +import com.healthmarketscience.jackcess.util.MemFileChannel; +import com.healthmarketscience.jackcess.util.OleBlob; +import org.apache.commons.lang.builder.ToStringBuilder; +import org.apache.poi.poifs.filesystem.DirectoryEntry; +import org.apache.poi.poifs.filesystem.DocumentEntry; +import org.apache.poi.poifs.filesystem.DocumentInputStream; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; +import java.net.URLEncoder; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +/** + * Temporary copy/paste from Jackcess to allow upgrade to POI 4.0.0. + * This class will be removed once POI 4.0.0 is released and jackcess + * updates to the most recent version of POI. + * @deprecated -- this class will be removed in Tika >= 1.20 + */ +@Deprecated +class JackcessCompoundOleUtil implements JackcessOleUtil.CompoundPackageFactory { + private static final String ENTRY_NAME_CHARSET = "UTF-8"; + private static final String ENTRY_SEPARATOR = "/"; + private static final String CONTENTS_ENTRY = "CONTENTS"; + + static { + // force a poi class to be loaded to ensure that when this class is + // loaded, we know that the poi classes are available + POIFSFileSystem.class.getName(); + } + + public JackcessCompoundOleUtil() { + } + + /** + * Creates a nes CompoundContent for the given blob information. 
+ */ + public JackcessOleUtil.ContentImpl createCompoundPackageContent( + JackcessOleUtil.OleBlobImpl blob, String prettyName, String className, String typeName, + ByteBuffer blobBb, int dataBlockLen) { + return new CompoundContentImpl(blob, prettyName, className, typeName, + blobBb.position(), dataBlockLen); + } + + /** + * Gets a DocumentEntry from compound storage based on a fully qualified, + * encoded entry name. + * + * @param entryName fully qualified, encoded entry name + * @param dir root directory of the compound storage + * @return the relevant DocumentEntry + * @throws FileNotFoundException if the entry does not exist + * @throws IOException if some other io error occurs + */ + public static DocumentEntry getDocumentEntry(String entryName, + DirectoryEntry dir) + throws IOException { + // split entry name into individual components and decode them + List<String> entryNames = new ArrayList<String>(); + for (String str : entryName.split(ENTRY_SEPARATOR)) { + if (str.length() == 0) { + continue; + } + entryNames.add(decodeEntryName(str)); + } + + DocumentEntry entry = null; + Iterator<String> iter = entryNames.iterator(); + while (iter.hasNext()) { + org.apache.poi.poifs.filesystem.Entry tmpEntry = dir.getEntry(iter.next()); + if (tmpEntry instanceof DirectoryEntry) { + dir = (DirectoryEntry) tmpEntry; + } else if (!iter.hasNext() && (tmpEntry instanceof DocumentEntry)) { + entry = (DocumentEntry) tmpEntry; + } else { + break; + } + } + + if (entry == null) { + throw new FileNotFoundException("Could not find document " + entryName); + } + + return entry; + } + + private static String encodeEntryName(String name) { + try { + return URLEncoder.encode(name, ENTRY_NAME_CHARSET); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + } + + private static String decodeEntryName(String name) { + try { + return URLDecoder.decode(name, ENTRY_NAME_CHARSET); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } 
+ } + + private static final class CompoundContentImpl + extends JackcessOleUtil.EmbeddedPackageContentImpl + implements OleBlob.CompoundContent { + private POIFSFileSystem _fs; + + private CompoundContentImpl( + JackcessOleUtil.OleBlobImpl blob, String prettyName, String className, + String typeName, int position, int length) { + super(blob, prettyName, className, typeName, position, length); + } + + public OleBlob.ContentType getType() { + return OleBlob.ContentType.COMPOUND_STORAGE; + } + + private POIFSFileSystem getFileSystem() throws IOException { + if (_fs == null) { + _fs = new POIFSFileSystem(MemFileChannel.newChannel(getStream(), "r")); + } + return _fs; + } + + public Iterator<Entry> iterator() { + try { + return getEntries(new ArrayList<Entry>(), getFileSystem().getRoot(), + ENTRY_SEPARATOR).iterator(); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + public EntryImpl getEntry(String entryName) throws IOException { + return new EntryImpl(entryName, + getDocumentEntry(entryName, getFileSystem().getRoot())); + } + + public boolean hasContentsEntry() throws IOException { + return getFileSystem().getRoot().hasEntry(CONTENTS_ENTRY); + } + + public EntryImpl getContentsEntry() throws IOException { + return getEntry(CONTENTS_ENTRY); + } + + private List<Entry> getEntries(List<Entry> entries, DirectoryEntry dir, + String prefix) { + for (org.apache.poi.poifs.filesystem.Entry entry : dir) { + if (entry instanceof DirectoryEntry) { + // .. 
recurse into this directory + getEntries(entries, (DirectoryEntry) entry, prefix + ENTRY_SEPARATOR); + } else if (entry instanceof DocumentEntry) { + // grab the entry name/detils + DocumentEntry de = (DocumentEntry) entry; + String entryName = prefix + encodeEntryName(entry.getName()); + entries.add(new EntryImpl(entryName, de)); + } + } + return entries; + } + + @Override + public void close() { + ByteUtil.closeQuietly(_fs); + _fs = null; + super.close(); + } + + @Override + public String toString() { + ToStringBuilder sb = toString(CustomToStringStyle.builder(this)); + + try { + sb.append("hasContentsEntry", hasContentsEntry()); + sb.append("entries", getEntries(new ArrayList<Entry>(), + getFileSystem().getRoot(), + ENTRY_SEPARATOR)); + } catch (IOException e) { + sb.append("entries", "<" + e + ">"); + } + + return sb.toString(); + } + + private final class EntryImpl implements OleBlob.CompoundContent.Entry { + private final String _name; + private final DocumentEntry _docEntry; + + private EntryImpl(String name, DocumentEntry docEntry) { + _name = name; + _docEntry = docEntry; + } + + public OleBlob.ContentType getType() { + return OleBlob.ContentType.UNKNOWN; + } + + public String getName() { + return _name; + } + + public CompoundContentImpl getParent() { + return CompoundContentImpl.this; + } + + public JackcessOleUtil.OleBlobImpl getBlob() { + return getParent().getBlob(); + } + + public long length() { + return _docEntry.getSize(); + } + + public InputStream getStream() throws IOException { + return new DocumentInputStream(_docEntry); + } + + public void writeTo(OutputStream out) throws IOException { + InputStream in = null; + try { + ByteUtil.copy(in = getStream(), out); + } finally { + ByteUtil.closeQuietly(in); + } + } + + @Override + public String toString() { + return CustomToStringStyle.valueBuilder(this) + .append("name", _name) + .append("length", length()) + .toString(); + } + } + } +} + + + diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java index bf5c5d0..3a10346 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java @@ -41,7 +41,7 @@ import com.healthmarketscience.jackcess.Row; import com.healthmarketscience.jackcess.Table; import com.healthmarketscience.jackcess.query.Query; import com.healthmarketscience.jackcess.util.OleBlob; -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.IOUtils; @@ -302,8 +302,9 @@ class JackcessExtractor extends AbstractPOIFSExtractor { } } + private void handleOLE(Row row, String cName, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { - OleBlob blob = row.getBlob(cName); + OleBlob blob = getBlob(row, cName); //lifted shamelessly from Jackcess's OleBlobTest if (blob == null) return; @@ -367,9 +368,21 @@ class JackcessExtractor extends AbstractPOIFSExtractor { } } + /* + Temporary work around until POI 4.0.0 is released and jackcess upgrades + This is copy/pasted from jackcess + */ + private OleBlob getBlob(Row row, String cName) { + byte[] bytes = row.getBytes(cName); + if (bytes == null) { + return null; + } + return JackcessOleUtil.parseBlob(bytes); + } + private void handleCompoundContent(OleBlob.CompoundContent cc, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { InputStream is = null; - NPOIFSFileSystem nfs = null; + POIFSFileSystem fileSystem = null; try { try { is = cc.getStream(); @@ -379,18 +392,18 @@ class JackcessExtractor extends AbstractPOIFSExtractor { } try { - nfs = new NPOIFSFileSystem(is); + fileSystem = new 
POIFSFileSystem(is); } catch (Exception e) { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata); return; } - handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml); + handleEmbeddedOfficeDoc(fileSystem.getRoot(), xhtml); } finally { - if (nfs != null) { + if (fileSystem != null) { try { - nfs.close(); + fileSystem.close(); } catch (IOException e) { //swallow } @@ -414,5 +427,6 @@ class JackcessExtractor extends AbstractPOIFSExtractor { } return shortDateTimeFormatter.format(d); } + } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessOleUtil.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessOleUtil.java new file mode 100644 index 0000000..a1432d6 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessOleUtil.java @@ -0,0 +1,813 @@ +/* +Copyright (c) 2013 James Ahlborn + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package org.apache.tika.parser.microsoft; + +import java.io.ByteArrayInputStream; +import java.io.Closeable; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.sql.Blob; +import java.sql.SQLException; +import java.sql.SQLFeatureNotSupportedException; +import java.text.Normalizer; +import java.util.EnumSet; +import java.util.Set; +import java.util.regex.Pattern; + +import com.healthmarketscience.jackcess.DataType; +import com.healthmarketscience.jackcess.util.OleBlob; +import static com.healthmarketscience.jackcess.util.OleBlob.*; +import org.apache.commons.lang.builder.ToStringBuilder; + +import com.healthmarketscience.jackcess.impl.ByteUtil; +import com.healthmarketscience.jackcess.impl.CustomToStringStyle; +import com.healthmarketscience.jackcess.impl.PageChannel; + +/** + * Utility code for working with OLE data. + * Temporary workaround until POI 4.0.0 is released and Jackcess is updated + * + * + * @author James Ahlborn + * @usage _advanced_class_ + * @deprecated this class will be removed in Tika >= 1.20 + */ +@Deprecated +class JackcessOleUtil { + + + /** + * Interface used to allow optional inclusion of the poi library for working + * with compound ole data. 
+ */ + interface CompoundPackageFactory + { + public ContentImpl createCompoundPackageContent( + OleBlobImpl blob, String prettyName, String className, String typeName, + ByteBuffer blobBb, int dataBlockLen); + } + + private static final int PACKAGE_SIGNATURE = 0x1C15; + private static final Charset OLE_CHARSET = Charset.forName("US-ASCII"); + private static final Charset OLE_UTF_CHARSET = Charset.forName("UTF-16LE"); + private static final byte[] COMPOUND_STORAGE_SIGNATURE = + {(byte)0xd0,(byte)0xcf,(byte)0x11,(byte)0xe0, + (byte)0xa1,(byte)0xb1,(byte)0x1a,(byte)0xe1}; + private static final String SIMPLE_PACKAGE_TYPE = "Package"; + private static final int PACKAGE_OBJECT_TYPE = 0x02; + private static final int OLE_VERSION = 0x0501; + private static final int OLE_FORMAT = 0x02; + private static final int PACKAGE_STREAM_SIGNATURE = 0x02; + private static final int PS_EMBEDDED_FILE = 0x030000; + private static final int PS_LINKED_FILE = 0x010000; + private static final Set<ContentType> WRITEABLE_TYPES = EnumSet.of( + ContentType.LINK, ContentType.SIMPLE_PACKAGE, ContentType.OTHER); + private static final byte[] NO_DATA = new byte[0]; + private static final int LINK_HEADER = 0x01; + private static final byte[] PACKAGE_FOOTER = { + 0x01, 0x05, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, (byte)0xAD, 0x05, (byte)0xFE + }; + + // regex pattern which matches all the crazy extra stuff in unicode + private static final Pattern UNICODE_ACCENT_PATTERN = + Pattern.compile("[\\p{InCombiningDiacriticalMarks}\\p{IsLm}\\p{IsSk}]+"); + + private static final CompoundPackageFactory COMPOUND_FACTORY; + + static { + CompoundPackageFactory compoundFactory = null; + try { + compoundFactory = (CompoundPackageFactory) + Class.forName("org.apache.tika.parser.microsoft.JackcessCompoundOleUtil") + .newInstance(); + } catch(Throwable t) { + // must not have poi, will load compound ole data as "other" + } + COMPOUND_FACTORY = compoundFactory; + } + + /** + * Parses an access database blob 
structure and returns an appropriate + * OleBlob instance. + */ + public static OleBlob parseBlob(byte[] bytes) { + return new OleBlobImpl(bytes); + } + + /** + * Creates a new OlBlob instance using the given information. + */ + public static OleBlob createBlob(Builder oleBuilder) + throws IOException + { + try { + + if(!WRITEABLE_TYPES.contains(oleBuilder.getType())) { + throw new IllegalArgumentException( + "Cannot currently create ole values of type " + + oleBuilder.getType()); + } + + long contentLen = oleBuilder.getContentLength(); + byte[] contentBytes = oleBuilder.getBytes(); + InputStream contentStream = oleBuilder.getStream(); + byte[] packageStreamHeader = NO_DATA; + byte[] packageStreamFooter = NO_DATA; + + switch(oleBuilder.getType()) { + case LINK: + packageStreamHeader = writePackageStreamHeader(oleBuilder); + + // link "content" is file path + contentBytes = getZeroTermStrBytes(oleBuilder.getFilePath()); + contentLen = contentBytes.length; + break; + + case SIMPLE_PACKAGE: + packageStreamHeader = writePackageStreamHeader(oleBuilder); + packageStreamFooter = writePackageStreamFooter(oleBuilder); + break; + + case OTHER: + // nothing more to do + break; + default: + throw new RuntimeException("unexpected type " + oleBuilder.getType()); + } + + long payloadLen = packageStreamHeader.length + packageStreamFooter.length + + contentLen; + byte[] packageHeader = writePackageHeader(oleBuilder, payloadLen); + + long totalOleLen = packageHeader.length + PACKAGE_FOOTER.length + + payloadLen; + if(totalOleLen > DataType.OLE.getMaxSize()) { + throw new IllegalArgumentException("Content size of " + totalOleLen + + " is too large for ole column"); + } + + byte[] oleBytes = new byte[(int)totalOleLen]; + ByteBuffer bb = PageChannel.wrap(oleBytes); + bb.put(packageHeader); + bb.put(packageStreamHeader); + + if(contentLen > 0L) { + if(contentBytes != null) { + bb.put(contentBytes); + } else { + byte[] buf = new byte[8192]; + int numBytes = 0; + while((numBytes = 
contentStream.read(buf)) >= 0) { + bb.put(buf, 0, numBytes); + } + } + } + + bb.put(packageStreamFooter); + bb.put(PACKAGE_FOOTER); + + return parseBlob(oleBytes); + + } finally { + ByteUtil.closeQuietly(oleBuilder.getStream()); + } + } + + private static byte[] writePackageHeader(Builder oleBuilder, + long contentLen) { + + byte[] prettyNameBytes = getZeroTermStrBytes(oleBuilder.getPrettyName()); + String className = oleBuilder.getClassName(); + String typeName = oleBuilder.getTypeName(); + if(className == null) { + className = typeName; + } else if(typeName == null) { + typeName = className; + } + byte[] classNameBytes = getZeroTermStrBytes(className); + byte[] typeNameBytes = getZeroTermStrBytes(typeName); + + int packageHeaderLen = 20 + prettyNameBytes.length + classNameBytes.length; + + int oleHeaderLen = 24 + typeNameBytes.length; + + byte[] headerBytes = new byte[packageHeaderLen + oleHeaderLen]; + + ByteBuffer bb = PageChannel.wrap(headerBytes); + + // write outer package header + bb.putShort((short)PACKAGE_SIGNATURE); + bb.putShort((short)packageHeaderLen); + bb.putInt(PACKAGE_OBJECT_TYPE); + bb.putShort((short)prettyNameBytes.length); + bb.putShort((short)classNameBytes.length); + int prettyNameOff = bb.position() + 8; + bb.putShort((short)prettyNameOff); + bb.putShort((short)(prettyNameOff + prettyNameBytes.length)); + bb.putInt(-1); + bb.put(prettyNameBytes); + bb.put(classNameBytes); + + // put ole header + bb.putInt(OLE_VERSION); + bb.putInt(OLE_FORMAT); + bb.putInt(typeNameBytes.length); + bb.put(typeNameBytes); + bb.putLong(0L); + bb.putInt((int)contentLen); + + return headerBytes; + } + + private static byte[] writePackageStreamHeader(Builder oleBuilder) { + + byte[] fileNameBytes = getZeroTermStrBytes(oleBuilder.getFileName()); + byte[] filePathBytes = getZeroTermStrBytes(oleBuilder.getFilePath()); + + int headerLen = 6 + fileNameBytes.length + filePathBytes.length; + + if(oleBuilder.getType() == ContentType.SIMPLE_PACKAGE) { + + headerLen += 8 + 
filePathBytes.length; + + } else { + + headerLen += 2; + } + + byte[] headerBytes = new byte[headerLen]; + ByteBuffer bb = PageChannel.wrap(headerBytes); + bb.putShort((short)PACKAGE_STREAM_SIGNATURE); + bb.put(fileNameBytes); + bb.put(filePathBytes); + + if(oleBuilder.getType() == ContentType.SIMPLE_PACKAGE) { + bb.putInt(PS_EMBEDDED_FILE); + bb.putInt(filePathBytes.length); + bb.put(filePathBytes, 0, filePathBytes.length); + bb.putInt((int) oleBuilder.getContentLength()); + } else { + bb.putInt(PS_LINKED_FILE); + bb.putShort((short)LINK_HEADER); + } + + return headerBytes; + } + + private static byte[] writePackageStreamFooter(Builder oleBuilder) { + + // note, these are _not_ zero terminated + byte[] fileNameBytes = oleBuilder.getFileName().getBytes(OLE_UTF_CHARSET); + byte[] filePathBytes = oleBuilder.getFilePath().getBytes(OLE_UTF_CHARSET); + + int footerLen = 12 + (filePathBytes.length * 2) + fileNameBytes.length; + + byte[] footerBytes = new byte[footerLen]; + ByteBuffer bb = PageChannel.wrap(footerBytes); + + bb.putInt(filePathBytes.length/2); + bb.put(filePathBytes); + bb.putInt(fileNameBytes.length/2); + bb.put(fileNameBytes); + bb.putInt(filePathBytes.length/2); + bb.put(filePathBytes); + + return footerBytes; + } + + /** + * creates the appropriate ContentImpl for the given blob. 
+ */ + private static ContentImpl parseContent(OleBlobImpl blob) + throws IOException + { + ByteBuffer bb = PageChannel.wrap(blob.getBytes()); + + if((bb.remaining() < 2) || (bb.getShort() != PACKAGE_SIGNATURE)) { + return new UnknownContentImpl(blob); + } + + // read outer package header + int headerSize = bb.getShort(); + /* int objType = */ bb.getInt(); + int prettyNameLen = bb.getShort(); + int classNameLen = bb.getShort(); + int prettyNameOff = bb.getShort(); + int classNameOff = bb.getShort(); + /* int objSize = */ bb.getInt(); + String prettyName = readStr(bb, prettyNameOff, prettyNameLen); + String className = readStr(bb, classNameOff, classNameLen); + bb.position(headerSize); + + // read ole header + int oleVer = bb.getInt(); + /* int format = */ bb.getInt(); + + if(oleVer != OLE_VERSION) { + return new UnknownContentImpl(blob); + } + + int typeNameLen = bb.getInt(); + String typeName = readStr(bb, bb.position(), typeNameLen); + bb.getLong(); // unused + int dataBlockLen = bb.getInt(); + int dataBlockPos = bb.position(); + + + if(SIMPLE_PACKAGE_TYPE.equalsIgnoreCase(typeName)) { + return createSimplePackageContent( + blob, prettyName, className, typeName, bb, dataBlockLen); + } + + // if COMPOUND_FACTORY is null, the poi library isn't available, so just + // load compound data as "other" + if((COMPOUND_FACTORY != null) && + (bb.remaining() >= COMPOUND_STORAGE_SIGNATURE.length) && + ByteUtil.matchesRange(bb, bb.position(), COMPOUND_STORAGE_SIGNATURE)) { + return COMPOUND_FACTORY.createCompoundPackageContent( + blob, prettyName, className, typeName, bb, dataBlockLen); + } + + // this is either some other "special" (as yet unhandled) format, or it is + // simply an embedded file (or it is compound data and poi isn't available) + return new OtherContentImpl(blob, prettyName, className, + typeName, dataBlockPos, dataBlockLen); + } + + private static ContentImpl createSimplePackageContent( + OleBlobImpl blob, String prettyName, String className, String typeName, 
+      ByteBuffer blobBb, int dataBlockLen) {
+
+    // narrow the buffer to the data block, which starts at the current
+    // position of the blob buffer
+    int dataBlockPos = blobBb.position();
+    int dataBlockEnd = dataBlockPos + dataBlockLen;
+    ByteBuffer bb = PageChannel.narrowBuffer(blobBb, dataBlockPos,
+                                             dataBlockEnd);
+
+    if(bb.getShort() != PACKAGE_STREAM_SIGNATURE) {
+      // no package stream signature, treat the data block as "other" content
+      return new OtherContentImpl(blob, prettyName, className,
+                                  typeName, dataBlockPos, dataBlockLen);
+    }
+
+    String fileName = readZeroTermStr(bb);
+    String filePath = readZeroTermStr(bb);
+    int packageType = bb.getInt();
+
+    if(packageType == PS_EMBEDDED_FILE) {
+
+      int localPathLen = bb.getInt();
+      String localFilePath = readStr(bb, bb.position(), localPathLen);
+      int dataLen = bb.getInt();
+      int dataPos = bb.position();
+      // move past the embedded data to the trailing strings
+      bb.position(dataPos + dataLen);
+
+      // the remaining strings are in "reverse" order (local file path, file
+      // name, file path).  these strings use a real utf charset, and
+      // therefore can "fix" problems with ascii based names (so we prefer
+      // them to the strings found above).  each one is read as a 4 byte
+      // length followed by (length * 2) bytes of text
+      for(int strNum = 0; bb.remaining() >= 4; ++strNum) {
+
+        int strLen = bb.getInt();
+        String utfStr = readStr(bb, bb.position(), strLen * 2, OLE_UTF_CHARSET);
+
+        if(strNum == 0) {
+          localFilePath = utfStr;
+        } else if(strNum == 1) {
+          fileName = utfStr;
+        } else if(strNum == 2) {
+          filePath = utfStr;
+        }
+        // any further strings are read but ignored
+      }
+
+      return new SimplePackageContentImpl(
+          blob, prettyName, className, typeName, dataPos, dataLen,
+          fileName, filePath, localFilePath);
+    }
+
+    if(packageType == PS_LINKED_FILE) {
+
+      bb.getShort(); //unknown
+      String linkStr = readZeroTermStr(bb);
+
+      return new LinkContentImpl(blob, prettyName, className, typeName,
+                                 fileName, linkStr, filePath);
+    }
+
+    // unrecognized package type, expose the raw data block
+    return new OtherContentImpl(blob, prettyName, className,
+                                typeName, dataBlockPos, dataBlockLen);
+  }
+
+  private static String readStr(ByteBuffer bb, int off, int len) {
+    return readStr(bb, off, len, OLE_CHARSET);
+  }
+
+  private static String
readZeroTermStr(ByteBuffer bb) {
+    int off = bb.position();
+    while(bb.hasRemaining()) {
+      byte b = bb.get();
+      if(b == 0) {
+        break;
+      }
+    }
+    int len = bb.position() - off;
+    return readStr(bb, off, len);
+  }
+
+  /**
+   * Decodes {@code len} bytes of the buffer's backing array, starting at
+   * buffer index {@code off}, and leaves the buffer positioned immediately
+   * after them.  A single trailing NUL character, if present, is removed.
+   * A range which decodes to nothing (e.g. {@code len == 0}) yields the
+   * empty string.
+   *
+   * @param bb array backed buffer to read from
+   * @param off buffer index of the first byte of the string
+   * @param len number of bytes to decode
+   * @param charset charset used to decode the bytes
+   * @return the decoded string, without its trailing NUL (if any)
+   */
+  private static String readStr(ByteBuffer bb, int off, int len,
+                                Charset charset) {
+    // buffer indexes are relative to arrayOffset() within the backing array
+    String str = new String(bb.array(), bb.arrayOffset() + off, len, charset);
+    bb.position(off + len);
+    // an empty string has no last char to inspect (charAt(-1) would throw)
+    if(!str.isEmpty() && (str.charAt(str.length() - 1) == '\0')) {
+      str = str.substring(0, str.length() - 1);
+    }
+    return str;
+  }
+
+  /**
+   * Converts the given string to NUL terminated bytes in OLE_CHARSET, after
+   * removing whatever UNICODE_ACCENT_PATTERN matches in its decomposed form.
+   */
+  private static byte[] getZeroTermStrBytes(String str) {
+    // since we are converting to ascii, try to make "nicer" versions of crazy
+    // chars (e.g. convert "u with an umlaut" to just "u").  this may not
+    // ultimately help anything but it is what ms access does.
+
+    // decompose complex chars into combos of char and accent
+    str = Normalizer.normalize(str, Normalizer.Form.NFD);
+    // strip the accents
+    str = UNICODE_ACCENT_PATTERN.matcher(str).replaceAll("");
+    // (re)normalize what is left
+    str = Normalizer.normalize(str, Normalizer.Form.NFC);
+
+    return (str + '\0').getBytes(OLE_CHARSET);
+  }
+
+
+  /**
+   * OleBlob implementation backed by an in-memory byte array.  The content
+   * is parsed on the first call to getContent() and cached afterwards.
+   */
+  static final class OleBlobImpl implements OleBlob
+  {
+    // NOTE(review): close() sets _bytes to null and only getBytes() checks
+    // for that; the other accessors below dereference _bytes directly
+    private byte[] _bytes;
+    private ContentImpl _content;
+
+    private OleBlobImpl(byte[] bytes) {
+      _bytes = bytes;
+    }
+
+    public void writeTo(OutputStream out) throws IOException {
+      out.write(_bytes);
+    }
+
+    public Content getContent() throws IOException {
+      if(_content == null) {
+        _content = parseContent(this);
+      }
+      return _content;
+    }
+
+    public InputStream getBinaryStream() throws SQLException {
+      return new ByteArrayInputStream(_bytes);
+    }
+
+    public InputStream getBinaryStream(long pos, long len)
+      throws SQLException
+    {
+      // pos is a 1-based jdbc offset
+      return new ByteArrayInputStream(_bytes, fromJdbcOffset(pos), (int)len);
+    }
+
+    public long length() throws SQLException {
+      return _bytes.length;
+    }
+
+    public byte[] getBytes() throws IOException {
+      if(_bytes == null) {
+        throw new IOException("blob is closed");
+      }
+      return _bytes;
+    }
+
+    public byte[] getBytes(long pos, int len) throws SQLException {
+      // pos is a 1-based jdbc offset
+      int start = fromJdbcOffset(pos);
+      return ByteUtil.copyOf(_bytes, start, len);
+    }
+
+    public long position(byte[] pattern, long start) throws SQLException {
+      ByteBuffer haystack = PageChannel.wrap(_bytes);
+      int idx = ByteUtil.findRange(haystack, fromJdbcOffset(start), pattern);
+      if(idx < 0) {
+        // negative results are passed through as is
+        return idx;
+      }
+      return toJdbcOffset(idx);
+    }
+
+    public long position(Blob pattern, long start) throws SQLException {
+      byte[] patternBytes = pattern.getBytes(1L, (int)pattern.length());
+      return position(patternBytes, start);
+    }
+
+    // none of the mutating operations are supported
+
+    public OutputStream setBinaryStream(long position) throws SQLException {
+      throw new SQLFeatureNotSupportedException();
+    }
+
+    public void truncate(long len) throws SQLException {
+      throw new SQLFeatureNotSupportedException();
+    }
+
+    public int setBytes(long pos, byte[] bytes) throws SQLException {
+      throw new SQLFeatureNotSupportedException();
+    }
+
+    public int setBytes(long pos, byte[] bytes, int offset, int lesn)
+      throws SQLException {
+      throw new SQLFeatureNotSupportedException();
+    }
+
+    public void free() {
+      close();
+    }
+
+    public void close() {
+      // drop the data, then close and drop any parsed content
+      _bytes = null;
+      ByteUtil.closeQuietly(_content);
+      _content = null;
+    }
+
+    /** Converts a 0-based array index to a 1-based jdbc offset. */
+    private static int toJdbcOffset(int off) {
+      return off + 1;
+    }
+
+    /** Converts a 1-based jdbc offset to a 0-based array index. */
+    private static int fromJdbcOffset(long off) {
+      return (int)off - 1;
+    }
+
+    @Override
+    public String toString() {
+      ToStringBuilder sb = CustomToStringStyle.builder(this);
+      if(_content == null) {
+        // no parsed content is held, show the raw bytes instead
+        sb.append("bytes", _bytes);
+        sb.append("content", "(uninitialized)");
+      } else {
+        sb.append("content", _content);
+      }
+      return sb.toString();
+    }
+  }
+
+  /**
+   * Base class of the content implementations, holds the owning blob.
+   */
+  static abstract class ContentImpl implements Content, Closeable
+  {
+    protected final OleBlobImpl _blob;
+
+    protected ContentImpl(OleBlobImpl blob) {
+      _blob = blob;
+    }
+
+    public OleBlobImpl getBlob() {
+      return _blob;
+    }
+
+    protected byte[] getBytes() throws IOException {
+      return getBlob().getBytes();
+    }
+
+    public void close() {
+      // base does nothing
+    }
+
+    protected ToStringBuilder toString(ToStringBuilder sb) {
sb.append("type", getType());
+      return sb;
+    }
+  }
+
+  /**
+   * Content whose data is the range of the blob's bytes described by a
+   * position and a length.
+   */
+  static abstract class EmbeddedContentImpl extends ContentImpl
+    implements EmbeddedContent
+  {
+    private final int _position;
+    private final int _length;
+
+    protected EmbeddedContentImpl(OleBlobImpl blob, int position, int length)
+    {
+      super(blob);
+      _position = position;
+      _length = length;
+    }
+
+    public long length() {
+      return _length;
+    }
+
+    public InputStream getStream() throws IOException {
+      byte[] blobBytes = getBytes();
+      return new ByteArrayInputStream(blobBytes, _position, _length);
+    }
+
+    public void writeTo(OutputStream out) throws IOException {
+      byte[] blobBytes = getBytes();
+      out.write(blobBytes, _position, _length);
+    }
+
+    @Override
+    protected ToStringBuilder toString(ToStringBuilder sb) {
+      super.toString(sb);
+      if(_position < 0) {
+        // a negative position means there is no data range to show
+        return sb;
+      }
+      sb.append("content", ByteBuffer.wrap(_blob._bytes, _position, _length));
+      return sb;
+    }
+  }
+
+  /**
+   * Embedded content which additionally carries the pretty name, class name
+   * and type name given to its constructor.
+   */
+  static abstract class EmbeddedPackageContentImpl
+    extends EmbeddedContentImpl
+    implements PackageContent
+  {
+    private final String _prettyName;
+    private final String _className;
+    private final String _typeName;
+
+    protected EmbeddedPackageContentImpl(
+        OleBlobImpl blob, String prettyName, String className,
+        String typeName, int position, int length)
+    {
+      super(blob, position, length);
+      _prettyName = prettyName;
+      _className = className;
+      _typeName = typeName;
+    }
+
+    public String getPrettyName() {
+      return _prettyName;
+    }
+
+    public String getClassName() {
+      return _className;
+    }
+
+    public String getTypeName() {
+      return _typeName;
+    }
+
+    @Override
+    protected ToStringBuilder toString(ToStringBuilder sb) {
+      // the names are written before the fields of the parent classes
+      sb.append("prettyName", _prettyName)
+        .append("className", _className)
+        .append("typeName", _typeName);
+      super.toString(sb);
+      return sb;
+    }
+  }
+
+  private static final class LinkContentImpl
+    extends EmbeddedPackageContentImpl
+    implements LinkContent
+  {
+    private final String _fileName;
+    private final String _linkPath;
+    private final String _filePath;
+
+    private
LinkContentImpl(OleBlobImpl blob, String prettyName,
+                            String className, String typeName,
+                            String fileName, String linkPath,
+                            String filePath)
+    {
+      // a link has no embedded data, hence the negative position and length
+      super(blob, prettyName, className, typeName, -1, -1);
+      _fileName = fileName;
+      _linkPath = linkPath;
+      _filePath = filePath;
+    }
+
+    public ContentType getType() {
+      return ContentType.LINK;
+    }
+
+    public String getFileName() {
+      return _fileName;
+    }
+
+    public String getLinkPath() {
+      return _linkPath;
+    }
+
+    public String getFilePath() {
+      return _filePath;
+    }
+
+    public InputStream getLinkStream() throws IOException {
+      // NOTE(review): the link path is read from the blob data, so this
+      // opens whatever local file the blob names - confirm that callers
+      // only use it with trusted input
+      return new FileInputStream(getLinkPath());
+    }
+
+    @Override
+    public String toString() {
+      ToStringBuilder sb = toString(CustomToStringStyle.builder(this));
+      return sb.append("fileName", _fileName)
+        .append("linkPath", _linkPath)
+        .append("filePath", _filePath)
+        .toString();
+    }
+  }
+
+  /**
+   * Package content which carries a file name, a file path and a local file
+   * path in addition to its data range.
+   */
+  private static final class SimplePackageContentImpl
+    extends EmbeddedPackageContentImpl
+    implements SimplePackageContent
+  {
+    private final String _fileName;
+    private final String _filePath;
+    private final String _localFilePath;
+
+    private SimplePackageContentImpl(OleBlobImpl blob, String prettyName,
+                                     String className, String typeName,
+                                     int position, int length,
+                                     String fileName, String filePath,
+                                     String localFilePath)
+    {
+      super(blob, prettyName, className, typeName, position, length);
+      _fileName = fileName;
+      _filePath = filePath;
+      _localFilePath = localFilePath;
+    }
+
+    public ContentType getType() {
+      return ContentType.SIMPLE_PACKAGE;
+    }
+
+    public String getFileName() {
+      return _fileName;
+    }
+
+    public String getFilePath() {
+      return _filePath;
+    }
+
+    public String getLocalFilePath() {
+      return _localFilePath;
+    }
+
+    @Override
+    public String toString() {
+      ToStringBuilder sb = toString(CustomToStringStyle.builder(this));
+      return sb.append("fileName", _fileName)
+        .append("filePath", _filePath)
+        .append("localFilePath", _localFilePath)
+        .toString();
+    }
+  }
+
+  private static final class OtherContentImpl
+    extends
EmbeddedPackageContentImpl
+    implements OtherContent
+  {
+    private OtherContentImpl(
+        OleBlobImpl blob, String prettyName, String className,
+        String typeName, int position, int length)
+    {
+      super(blob, prettyName, className, typeName, position, length);
+    }
+
+    public ContentType getType() {
+      return ContentType.OTHER;
+    }
+
+    @Override
+    public String toString() {
+      // nothing to add beyond the fields written by the parent classes
+      ToStringBuilder sb = toString(CustomToStringStyle.builder(this));
+      return sb.toString();
+    }
+  }
+
+  /**
+   * Content returned when the blob data is not recognized, it only holds
+   * the owning blob.
+   */
+  private static final class UnknownContentImpl extends ContentImpl
+  {
+    private UnknownContentImpl(OleBlobImpl blob) {
+      super(blob);
+    }
+
+    public ContentType getType() {
+      return ContentType.UNKNOWN;
+    }
+
+    @Override
+    public String toString() {
+      ToStringBuilder sb = toString(CustomToStringStyle.builder(this));
+      return sb.append("content", _blob._bytes)
+        .toString();
+    }
+  }
+
+ }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index 133d5e4..779d5ee 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -36,7 +36,6 @@ import org.apache.poi.poifs.crypt.EncryptionInfo;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.poifs.macros.VBAMacroReader;
 import org.apache.poi.util.IOUtils;
@@ -105,23 +104,23 @@ public class OfficeParser extends AbstractOfficeParser {
         final DirectoryNode root;
         TikaInputStream tstream = TikaInputStream.cast(stream);
-        NPOIFSFileSystem mustCloseFs = null;
+        POIFSFileSystem mustCloseFs = null;
         try {
             if (tstream == null) {
-                mustCloseFs = new NPOIFSFileSystem(new CloseShieldInputStream(stream));
+                mustCloseFs = new POIFSFileSystem(new
CloseShieldInputStream(stream)); root = mustCloseFs.getRoot(); } else { final Object container = tstream.getOpenContainer(); - if (container instanceof NPOIFSFileSystem) { - root = ((NPOIFSFileSystem) container).getRoot(); + if (container instanceof POIFSFileSystem) { + root = ((POIFSFileSystem) container).getRoot(); } else if (container instanceof DirectoryNode) { root = (DirectoryNode) container; } else { - NPOIFSFileSystem fs = null; + POIFSFileSystem fs = null; if (tstream.hasFile()) { - fs = new NPOIFSFileSystem(tstream.getFile(), true); + fs = new POIFSFileSystem(tstream.getFile(), true); } else { - fs = new NPOIFSFileSystem(new CloseShieldInputStream(tstream)); + fs = new POIFSFileSystem(new CloseShieldInputStream(tstream)); } //tstream will close the fs, no need to close this below tstream.setOpenContainer(fs); @@ -274,10 +273,6 @@ public class OfficeParser extends AbstractOfficeParser { return detectType(fs.getRoot()); } - public static POIFSDocumentType detectType(NPOIFSFileSystem fs) { - return detectType(fs.getRoot()); - } - public static POIFSDocumentType detectType(DirectoryEntry node) { Set<String> names = new HashSet<String>(); for (Entry entry : node) { @@ -313,7 +308,7 @@ public class OfficeParser extends AbstractOfficeParser { * @throws IOException on IOException if it occurs during the extraction of the embedded doc * @throws SAXException on SAXException for writing to xhtml */ - public static void extractMacros(NPOIFSFileSystem fs, ContentHandler xhtml, + public static void extractMacros(POIFSFileSystem fs, ContentHandler xhtml, EmbeddedDocumentExtractor embeddedDocumentExtractor) throws IOException, SAXException { VBAMacroReader reader = null; diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index dc355ae..5d13351 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -51,9 +51,8 @@ import org.apache.poi.hsmf.datatypes.StringChunk; import org.apache.poi.hsmf.datatypes.Types; import org.apache.poi.hsmf.exceptions.ChunkNotFoundException; import org.apache.poi.poifs.filesystem.DirectoryNode; -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.util.CodePageUtil; -import org.apache.tika.config.Field; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; @@ -74,7 +73,6 @@ import org.apache.tika.parser.txt.CharsetMatch; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; -import org.bouncycastle.cms.Recipient; import org.xml.sax.SAXException; /** @@ -128,7 +126,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { private final boolean extractAllAlternatives; - public OutlookExtractor(NPOIFSFileSystem filesystem, ParseContext context) throws TikaException { + public OutlookExtractor(POIFSFileSystem filesystem, ParseContext context) throws TikaException { this(filesystem.getRoot(), context); } @@ -149,7 +147,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { msg.setReturnNullOnMissingChunk(true); try { - metadata.set(Office.MAPI_MESSAGE_CLASS, getMessageClass(msg.getMessageClass())); + metadata.set(Office.MAPI_MESSAGE_CLASS, msg.getMessageClassEnum().name()); } catch (ChunkNotFoundException e){} // If the message contains strings that aren't stored @@ -485,7 +483,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { metadata.add(property, chunks.get(0).toString()); } - //TODO: replace this with getMessageClassEnum when we upgrade POI + //Still needed by PSTParser public static String getMessageClass(String messageClass){ if (messageClass == 
null || messageClass.trim().length() == 0) { return "UNSPECIFIED"; diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java index 1c98690..1b5a0a9 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java @@ -33,7 +33,7 @@ import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DocumentInputStream; import org.apache.poi.poifs.filesystem.DocumentNode; import org.apache.poi.poifs.filesystem.Entry; -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.tika.detect.Detector; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -385,7 +385,7 @@ public class POIFSContainerDetector implements Detector { File file = stream.getFile(); try { - NPOIFSFileSystem fs = new NPOIFSFileSystem(file, true); + POIFSFileSystem fs = new POIFSFileSystem(file, true); // Optimize a possible later parsing process by keeping // a reference to the already opened POI file system @@ -423,8 +423,8 @@ public class POIFSContainerDetector implements Detector { Set<String> names = null; if (tis != null) { Object container = tis.getOpenContainer(); - if (container instanceof NPOIFSFileSystem) { - names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot()); + if (container instanceof POIFSFileSystem) { + names = getTopLevelNames(((POIFSFileSystem) container).getRoot()); } else if (container instanceof DirectoryNode) { names = getTopLevelNames((DirectoryNode) container); } @@ -454,8 +454,8 @@ public class POIFSContainerDetector implements Detector { // Detect based on the names (as available) if (tis != null && tis.getOpenContainer() != null && - tis.getOpenContainer() instanceof 
NPOIFSFileSystem) { - return detect(names, ((NPOIFSFileSystem) tis.getOpenContainer()).getRoot()); + tis.getOpenContainer() instanceof POIFSFileSystem) { + return detect(names, ((POIFSFileSystem) tis.getOpenContainer()).getRoot()); } else { return detect(names, null); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java index 3e2ea26..8017184 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java @@ -32,7 +32,7 @@ import org.apache.poi.hpsf.UnexpectedPropertySetTypeException; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DocumentEntry; import org.apache.poi.poifs.filesystem.DocumentInputStream; -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.MSOffice; import org.apache.tika.metadata.Metadata; @@ -63,7 +63,7 @@ public class SummaryExtractor { this.metadata = metadata; } - public void parseSummaries(NPOIFSFileSystem filesystem) + public void parseSummaries(POIFSFileSystem filesystem) throws IOException, TikaException { parseSummaries(filesystem.getRoot()); } @@ -94,8 +94,6 @@ public class SummaryExtractor { // no property stream, just skip it } catch (UnexpectedPropertySetTypeException e) { throw new TikaException("Unexpected HPSF document", e); - } catch (MarkUnsupportedException e) { - throw new TikaException("Invalid DocumentInputStream", e); } catch (Exception e) { LOG.warn("Ignoring unexpected exception while parsing summary entry {}", entryName, e); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java index 4a80420..30bd4bb 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java @@ -49,7 +49,7 @@ import org.apache.poi.hwpf.usermodel.TableRow; import org.apache.poi.poifs.filesystem.DirectoryEntry; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.Entry; -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; @@ -145,7 +145,7 @@ public class WordExtractor extends AbstractPOIFSExtractor { } protected void parse( - NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml) + POIFSFileSystem filesystem, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { parse(filesystem.getRoot(), xhtml); } @@ -661,7 +661,7 @@ public class WordExtractor extends AbstractPOIFSExtractor { } protected void parseWord6( - NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml) + POIFSFileSystem filesystem, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { parseWord6(filesystem.getRoot(), xhtml); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java index c072723..57c38a6 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java @@ -25,13 +25,13 @@ import java.io.InputStream; import java.net.URI; import java.util.HashMap; import java.util.HashSet; -import java.util.Iterator; import java.util.List; 
import java.util.Map; import java.util.Set; -import org.apache.poi.POIXMLDocument; -import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.extractor.POITextExtractor; +import org.apache.poi.ooxml.POIXMLDocument; +import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.PackagePart; @@ -41,9 +41,6 @@ import org.apache.poi.openxml4j.opc.PackageRelationshipTypes; import org.apache.poi.openxml4j.opc.TargetMode; import org.apache.poi.openxml4j.opc.internal.FileHelper; import org.apache.poi.poifs.filesystem.DirectoryNode; -import org.apache.poi.poifs.filesystem.DocumentEntry; -import org.apache.poi.poifs.filesystem.Entry; -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.poi.poifs.filesystem.Ole10Native; import org.apache.poi.poifs.filesystem.Ole10NativeException; import org.apache.poi.poifs.filesystem.POIFSFileSystem; @@ -69,8 +66,6 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; -import javax.xml.parsers.SAXParser; - /** * Base class for all Tika OOXML extractors. 
* <p/> @@ -119,7 +114,7 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getDocument() */ public POIXMLDocument getDocument() { - return extractor.getDocument(); + return (POIXMLDocument)extractor.getDocument(); } /** @@ -422,9 +417,9 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { if (officeParserConfig.getExtractMacros()) { try (InputStream is = macroPart.getInputStream()) { - try (NPOIFSFileSystem npoifs = new NPOIFSFileSystem(is)) { + try (POIFSFileSystem poifs = new POIFSFileSystem(is)) { //Macro reading exceptions are already swallowed here - OfficeParser.extractMacros(npoifs, handler, embeddedExtractor); + OfficeParser.extractMacros(poifs, handler, embeddedExtractor); } } catch (IOException e) { throw new TikaException("Broken OOXML file", e); diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java index dbbb839..30f2975 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java @@ -27,7 +27,6 @@ import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart; import org.apache.poi.openxml4j.util.Nullable; import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.MSOffice; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; import org.apache.tika.metadata.OfficeOpenXMLCore; @@ -75,45 +74,35 @@ public class MetadataExtractor { PackagePropertiesPart propsHolder = properties .getUnderlyingProperties(); - addProperty(metadata, OfficeOpenXMLCore.CATEGORY, propsHolder.getCategoryProperty()); - addProperty(metadata, OfficeOpenXMLCore.CONTENT_STATUS, propsHolder 
+ setProperty(metadata, OfficeOpenXMLCore.CATEGORY, propsHolder.getCategoryProperty()); + setProperty(metadata, OfficeOpenXMLCore.CONTENT_STATUS, propsHolder .getContentStatusProperty()); - addProperty(metadata, TikaCoreProperties.CREATED, propsHolder + setProperty(metadata, TikaCoreProperties.CREATED, propsHolder .getCreatedProperty()); addMultiProperty(metadata, TikaCoreProperties.CREATOR, propsHolder .getCreatorProperty()); - addProperty(metadata, TikaCoreProperties.DESCRIPTION, propsHolder + setProperty(metadata, TikaCoreProperties.DESCRIPTION, propsHolder .getDescriptionProperty()); - addProperty(metadata, TikaCoreProperties.IDENTIFIER, propsHolder + setProperty(metadata, TikaCoreProperties.IDENTIFIER, propsHolder .getIdentifierProperty()); - addProperty(metadata, TikaCoreProperties.KEYWORDS, propsHolder + addProperty(metadata, OfficeOpenXMLCore.SUBJECT, + propsHolder.getSubjectProperty()); + addProperty(metadata, Office.KEYWORDS, propsHolder .getKeywordsProperty()); - addProperty(metadata, TikaCoreProperties.LANGUAGE, propsHolder + setProperty(metadata, TikaCoreProperties.LANGUAGE, propsHolder .getLanguageProperty()); - addProperty(metadata, TikaCoreProperties.MODIFIER, propsHolder + setProperty(metadata, TikaCoreProperties.MODIFIER, propsHolder .getLastModifiedByProperty()); - addProperty(metadata, TikaCoreProperties.PRINT_DATE, propsHolder + setProperty(metadata, TikaCoreProperties.PRINT_DATE, propsHolder .getLastPrintedProperty()); - addProperty(metadata, Metadata.LAST_MODIFIED, propsHolder - .getModifiedProperty()); - addProperty(metadata, TikaCoreProperties.MODIFIED, propsHolder + setProperty(metadata, TikaCoreProperties.MODIFIED, propsHolder .getModifiedProperty()); - addProperty(metadata, OfficeOpenXMLCore.REVISION, propsHolder + setProperty(metadata, OfficeOpenXMLCore.REVISION, propsHolder .getRevisionProperty()); - // TODO: Move to OO subject in Tika 2.0 - addProperty(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, - 
propsHolder.getSubjectProperty()); - addProperty(metadata, TikaCoreProperties.TITLE, propsHolder.getTitleProperty()); - addProperty(metadata, OfficeOpenXMLCore.VERSION, propsHolder.getVersionProperty()); - // Legacy Tika-1.0 style stats - // TODO Remove these in Tika 2.0 - addProperty(metadata, Metadata.CATEGORY, propsHolder.getCategoryProperty()); - addProperty(metadata, Metadata.CONTENT_STATUS, propsHolder - .getContentStatusProperty()); - addProperty(metadata, Metadata.REVISION_NUMBER, propsHolder - .getRevisionProperty()); - addProperty(metadata, Metadata.VERSION, propsHolder.getVersionProperty()); + setProperty(metadata, TikaCoreProperties.TITLE, propsHolder.getTitleProperty()); + setProperty(metadata, OfficeOpenXMLCore.VERSION, propsHolder.getVersionProperty()); + } private void extractMetadata(ExtendedProperties properties, @@ -130,15 +119,15 @@ public class MetadataExtractor { } catch (XmlValueOutOfRangeException e) { //swallow for now } - addProperty(metadata, OfficeOpenXMLExtended.APPLICATION, propsHolder.getApplication()); - addProperty(metadata, OfficeOpenXMLExtended.APP_VERSION, propsHolder.getAppVersion()); - addProperty(metadata, TikaCoreProperties.PUBLISHER, propsHolder.getCompany()); - addProperty(metadata, OfficeOpenXMLExtended.COMPANY, propsHolder.getCompany()); + setProperty(metadata, OfficeOpenXMLExtended.APPLICATION, propsHolder.getApplication()); + setProperty(metadata, OfficeOpenXMLExtended.APP_VERSION, propsHolder.getAppVersion()); + setProperty(metadata, TikaCoreProperties.PUBLISHER, propsHolder.getCompany()); + setProperty(metadata, OfficeOpenXMLExtended.COMPANY, propsHolder.getCompany()); SummaryExtractor.addMulti(metadata, OfficeOpenXMLExtended.MANAGER, propsHolder.getManager()); - addProperty(metadata, OfficeOpenXMLExtended.NOTES, propsHolder.getNotes()); - addProperty(metadata, OfficeOpenXMLExtended.PRESENTATION_FORMAT, propsHolder.getPresentationFormat()); - addProperty(metadata, OfficeOpenXMLExtended.TEMPLATE, 
propsHolder.getTemplate()); - addProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, totalTime); + setProperty(metadata, OfficeOpenXMLExtended.NOTES, propsHolder.getNotes()); + setProperty(metadata, OfficeOpenXMLExtended.PRESENTATION_FORMAT, propsHolder.getPresentationFormat()); + setProperty(metadata, OfficeOpenXMLExtended.TEMPLATE, propsHolder.getTemplate()); + setProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, totalTime); if (propsHolder.getPages() > 0) { metadata.set(PagedText.N_PAGES, propsHolder.getPages()); @@ -147,30 +136,13 @@ public class MetadataExtractor { } // Process the document statistics - addProperty(metadata, Office.PAGE_COUNT, propsHolder.getPages()); - addProperty(metadata, Office.SLIDE_COUNT, propsHolder.getSlides()); - addProperty(metadata, Office.PARAGRAPH_COUNT, propsHolder.getParagraphs()); - addProperty(metadata, Office.LINE_COUNT, propsHolder.getLines()); - addProperty(metadata, Office.WORD_COUNT, propsHolder.getWords()); - addProperty(metadata, Office.CHARACTER_COUNT, propsHolder.getCharacters()); - addProperty(metadata, Office.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces()); - - // Legacy Tika-1.0 style stats - // TODO Remove these in Tika 2.0 - addProperty(metadata, Metadata.APPLICATION_NAME, propsHolder.getApplication()); - addProperty(metadata, Metadata.APPLICATION_VERSION, propsHolder.getAppVersion()); - addProperty(metadata, Metadata.MANAGER, propsHolder.getManager()); - addProperty(metadata, Metadata.NOTES, propsHolder.getNotes()); - addProperty(metadata, Metadata.PRESENTATION_FORMAT, propsHolder.getPresentationFormat()); - addProperty(metadata, Metadata.TEMPLATE, propsHolder.getTemplate()); - addProperty(metadata, Metadata.TOTAL_TIME, totalTime); - addProperty(metadata, MSOffice.PAGE_COUNT, propsHolder.getPages()); - addProperty(metadata, MSOffice.SLIDE_COUNT, propsHolder.getSlides()); - addProperty(metadata, MSOffice.PARAGRAPH_COUNT, propsHolder.getParagraphs()); - addProperty(metadata, 
MSOffice.LINE_COUNT, propsHolder.getLines());
-        addProperty(metadata, MSOffice.WORD_COUNT, propsHolder.getWords());
-        addProperty(metadata, MSOffice.CHARACTER_COUNT, propsHolder.getCharacters());
-        addProperty(metadata, MSOffice.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces());
+        setProperty(metadata, Office.PAGE_COUNT, propsHolder.getPages());
+        setProperty(metadata, Office.SLIDE_COUNT, propsHolder.getSlides());
+        setProperty(metadata, Office.PARAGRAPH_COUNT, propsHolder.getParagraphs());
+        setProperty(metadata, Office.LINE_COUNT, propsHolder.getLines());
+        setProperty(metadata, Office.WORD_COUNT, propsHolder.getWords());
+        setProperty(metadata, Office.CHARACTER_COUNT, propsHolder.getCharacters());
+        setProperty(metadata, Office.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces());
     }
 
     private void extractMetadata(CustomProperties properties,
@@ -257,7 +229,7 @@ public class MetadataExtractor {
         }
     }
 
-    private <T> void addProperty(Metadata metadata, Property property, Nullable<T> nullableValue) {
+    private <T> void setProperty(Metadata metadata, Property property, Nullable<T> nullableValue) {
         T value = nullableValue.getValue();
         if (value != null) {
             if (value instanceof Date) {
@@ -272,31 +244,41 @@ public class MetadataExtractor {
         }
     }
 
-    private void addProperty(Metadata metadata, String name, Nullable<?> value) {
+    /**
+     * Adds (rather than sets) a String valued property to the metadata.
+     * A null wrapped value is ignored.
+     *
+     * @throws IllegalArgumentException if the wrapped value is not a String
+     */
+    private <T> void addProperty(Metadata metadata, Property property, Nullable<T> nullableValue) {
+        T value = nullableValue.getValue();
+        if (value != null) {
+            if (value instanceof String) {
+                metadata.add(property, (String) value);
+            } else {
+                // report the class of the offending value, not of its Nullable wrapper
+                throw new IllegalArgumentException("Can't add property of class: "+value.getClass());
+            }
+        }
+    }
+    private void setProperty(Metadata metadata, String name, Nullable<?> value) {
         if (value.getValue() != null) {
-            addProperty(metadata, name, value.getValue().toString());
+            setProperty(metadata, name, value.getValue().toString());
         }
     }
 
-    private void addProperty(Metadata
metadata, Property property, String value) { + private void setProperty(Metadata metadata, Property property, String value) { if (value != null) { metadata.set(property, value); } } - private void addProperty(Metadata metadata, String name, String value) { + private void setProperty(Metadata metadata, String name, String value) { if (value != null) { metadata.set(name, value); } } - private void addProperty(Metadata metadata, Property property, int value) { + private void setProperty(Metadata metadata, Property property, int value) { if (value > 0) { metadata.set(property, value); } } - private void addProperty(Metadata metadata, String name, int value) { + private void setProperty(Metadata metadata, String name, int value) { if (value > 0) { metadata.set(name, Integer.toString(value)); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java index f52e52d..4ef723e 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java @@ -18,8 +18,7 @@ package org.apache.tika.parser.microsoft.ooxml; import java.io.IOException; -import org.apache.poi.POIXMLDocument; -import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.ooxml.POIXMLDocument; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; @@ -30,19 +29,19 @@ import org.xml.sax.SAXException; /** * Interface implemented by all Tika OOXML extractors. * - * @see org.apache.poi.POIXMLTextExtractor + * @see org.apache.poi.ooxml.extractor.POIXMLTextExtractor */ public interface OOXMLExtractor { /** * Returns the opened document. 
* - * @see POIXMLTextExtractor#getDocument() + * @see org.apache.poi.ooxml.extractor.POIXMLTextExtractor#getDocument() */ POIXMLDocument getDocument(); /** - * {@link POIXMLTextExtractor#getMetadataTextExtractor()} not yet supported + * {@link org.apache.poi.ooxml.extractor.POIXMLTextExtractor#getMetadataTextExtractor()} not yet supported * for OOXML by POI. */ MetadataExtractor getMetadataExtractor(); diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java index 5230d65..a6e111a 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java @@ -21,9 +21,9 @@ import java.io.InputStream; import java.util.Locale; import org.apache.commons.io.input.CloseShieldInputStream; -import org.apache.poi.POIXMLDocument; -import org.apache.poi.POIXMLTextExtractor; -import org.apache.poi.extractor.ExtractorFactory; +import org.apache.poi.ooxml.POIXMLDocument; +import org.apache.poi.ooxml.extractor.ExtractorFactory; +import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; @@ -31,8 +31,10 @@ import org.apache.poi.openxml4j.opc.PackageAccess; import org.apache.poi.openxml4j.opc.PackagePart; import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; import org.apache.poi.util.LocaleUtil; +import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.poi.xslf.usermodel.XSLFRelation; +import org.apache.poi.xslf.usermodel.XSLFSlideShow; import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor; import 
org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; @@ -103,15 +105,15 @@ public class OOXMLExtractorFactory { if (config.getUseSAXDocxExtractor()) { poiExtractor = trySXWPF(pkg); } - if (poiExtractor == null && config.getUseSAXPptxExtractor()) { - poiExtractor = trySXSLF(pkg); + if (poiExtractor == null) { + poiExtractor = tryXSLF(pkg, config.getUseSAXPptxExtractor()); } if (type.equals(OOXMLParser.XPS)) { poiExtractor = new XPSTextExtractor(pkg); } if (poiExtractor == null) { - poiExtractor = ExtractorFactory.createExtractor(pkg); + poiExtractor = (POIXMLTextExtractor) ExtractorFactory.createExtractor(pkg); } POIXMLDocument document = poiExtractor.getDocument(); @@ -190,7 +192,7 @@ public class OOXMLExtractorFactory { return null; } - private static POIXMLTextExtractor trySXSLF(OPCPackage pkg) throws XmlException, OpenXML4JException, IOException { + private static POIXMLTextExtractor tryXSLF(OPCPackage pkg, boolean eventBased) throws XmlException, OpenXML4JException, IOException { PackageRelationshipCollection packageRelationshipCollection = pkg.getRelationshipsByType("http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"); if (packageRelationshipCollection.size() == 0) { @@ -208,12 +210,20 @@ public class OOXMLExtractorFactory { for (int i = 0; i < xslfRelations.length; i++) { XSLFRelation xslfRelation = xslfRelations[i]; if (xslfRelation.getContentType().equals(targetContentType)) { - return new XSLFEventBasedPowerPointExtractor(pkg); + if (eventBased) { + return new XSLFEventBasedPowerPointExtractor(pkg); + } else { + return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg)); + } } } if (XSLFRelation.THEME_MANAGER.getContentType().equals(targetContentType)) { - return new XSLFEventBasedPowerPointExtractor(pkg); + if (eventBased) { + return new XSLFEventBasedPowerPointExtractor(pkg); + } else { + return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg)); + } } return 
null; } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java index f6ec3bf..56d8a71 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java @@ -19,7 +19,7 @@ package org.apache.tika.parser.microsoft.ooxml; import java.util.ArrayList; import java.util.List; -import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; import org.apache.poi.openxml4j.opc.PackagePart; import org.apache.poi.xssf.extractor.XSSFExcelExtractor; import org.apache.tika.parser.ParseContext; diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java index 35dba6d..3d929ba 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java @@ -30,9 +30,11 @@ import org.apache.poi.openxml4j.opc.PackageRelationship; import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; import org.apache.poi.openxml4j.opc.PackagingURIHelper; import org.apache.poi.openxml4j.opc.TargetMode; +import org.apache.poi.sl.extractor.SlideShowExtractor; import org.apache.poi.sl.usermodel.Placeholder; import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; import org.apache.poi.xslf.usermodel.XMLSlideShow; +import org.apache.poi.xslf.usermodel.XSLFComment; import org.apache.poi.xslf.usermodel.XSLFCommentAuthors; import org.apache.poi.xslf.usermodel.XSLFComments; import org.apache.poi.xslf.usermodel.XSLFGraphicFrame; @@ -59,8 
+61,6 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlObject; -import org.openxmlformats.schemas.presentationml.x2006.main.CTComment; -import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentAuthor; import org.openxmlformats.schemas.presentationml.x2006.main.CTPicture; import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList; import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry; @@ -136,23 +136,21 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { } // comments (if present) - XSLFComments comments = slide.getComments(); + List<XSLFComment> comments = slide.getComments(); if (comments != null) { StringBuilder authorStringBuilder = new StringBuilder(); - for (int i = 0; i < comments.getNumberOfComments(); i++) { + for (int i = 0; i < comments.size(); i++) { authorStringBuilder.setLength(0); - CTComment comment = comments.getCommentAt(i); + XSLFComment comment = comments.get(i); xhtml.startElement("p", "class", "slide-comment"); - CTCommentAuthor cta = commentAuthors.getAuthorById(comment.getAuthorId()); - if (cta != null) { - if (cta.getName() != null) { - authorStringBuilder.append(cta.getName()); + if (comment.getAuthor() != null) { + authorStringBuilder.append(comment.getAuthor()); } - if (cta.getInitials() != null) { + if (comment.getAuthorInitials() != null) { if (authorStringBuilder.length() > 0) { authorStringBuilder.append(" "); } - authorStringBuilder.append("("+cta.getInitials()+")"); + authorStringBuilder.append("("+comment.getAuthorInitials()+")"); } if (comment.getText() != null && authorStringBuilder.length() > 0) { authorStringBuilder.append(" - "); @@ -162,7 +160,7 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { xhtml.characters(authorStringBuilder.toString()); xhtml.endElement("b"); } - } + 
xhtml.characters(comment.getText()); xhtml.endElement("p"); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java index db263b2..df2be9d 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java @@ -21,7 +21,7 @@ import java.io.InputStream; import java.util.List; import java.util.Locale; -import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java index 4f8dfbd..7832bc9 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java @@ -27,8 +27,8 @@ import java.util.Locale; import java.util.Map; import java.util.Set; -import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor; +import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java index 
50e1e9a..2643a3a 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java @@ -17,10 +17,11 @@ package org.apache.tika.parser.microsoft.ooxml.xps; +import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.CloseShieldInputStream; -import org.apache.poi.POIXMLDocument; -import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.ooxml.POIXMLDocument; +import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; import org.apache.poi.openxml4j.opc.PackagePart; import org.apache.poi.openxml4j.opc.PackageRelationship; import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; @@ -29,19 +30,16 @@ import org.apache.poi.openxml4j.util.ZipEntrySource; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor; import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.OfflineContentHandler; import org.apache.tika.sax.XHTMLContentHandler; -import org.apache.tika.utils.ExceptionUtils; import org.apache.tika.utils.XMLReaderUtils; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import javax.xml.parsers.SAXParser; import java.io.IOException; import java.io.InputStream; import java.util.Collections; @@ -49,7 +47,6 @@ import java.util.Enumeration; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.zip.ZipEntry; public class XPSExtractorDecorator extends AbstractOOXMLExtractor { @@ -249,10 +246,10 @@ public class XPSExtractorDecorator extends AbstractOOXMLExtractor { private 
static InputStream getZipStream(String zipPath, ZipPackage zipPackage) throws IOException, TikaException { String targPath = (zipPath.length() > 1 && zipPath.startsWith("/") ? zipPath.substring(1) : zipPath); ZipEntrySource zipEntrySource = zipPackage.getZipArchive(); - Enumeration<? extends ZipEntry> zipEntryEnumeration = zipEntrySource.getEntries(); - ZipEntry zipEntry = null; + Enumeration<? extends ZipArchiveEntry> zipEntryEnumeration = zipEntrySource.getEntries(); + ZipArchiveEntry zipEntry = null; while (zipEntryEnumeration.hasMoreElements()) { - ZipEntry ze = zipEntryEnumeration.nextElement(); + ZipArchiveEntry ze = zipEntryEnumeration.nextElement(); if (ze.getName().equals(targPath)) { zipEntry = ze; break; diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java index 30aaf0f..0212920 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java @@ -17,9 +17,10 @@ package org.apache.tika.parser.microsoft.ooxml.xps; -import org.apache.poi.POIXMLDocument; -import org.apache.poi.POIXMLProperties; -import org.apache.poi.POIXMLTextExtractor; + +import org.apache.poi.ooxml.POIXMLDocument; +import org.apache.poi.ooxml.POIXMLProperties; +import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.xmlbeans.XmlException; diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java index 3e98203..bd5615d 100644 --- 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java @@ -20,9 +20,9 @@ package org.apache.tika.parser.microsoft.ooxml.xslf; import java.io.IOException; import java.util.Date; -import org.apache.poi.POIXMLDocument; -import org.apache.poi.POIXMLProperties; -import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.ooxml.POIXMLDocument; +import org.apache.poi.ooxml.POIXMLProperties; +import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler; diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java index 7a5c0c7..ec63704 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java @@ -26,16 +26,16 @@ import java.util.List; import java.util.Map; import org.apache.commons.io.input.CloseShieldInputStream; -import org.apache.poi.POIXMLDocument; -import org.apache.poi.POIXMLProperties; -import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.ooxml.POIXMLDocument; +import org.apache.poi.ooxml.POIXMLProperties; +import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; +import org.apache.poi.ooxml.util.SAXHelper; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.PackagePart; import 
org.apache.poi.openxml4j.opc.PackageRelationship; import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; -import org.apache.poi.util.SAXHelper; import org.apache.poi.xwpf.usermodel.XWPFNumbering; import org.apache.poi.xwpf.usermodel.XWPFRelation; import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler; diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java index e5b0b44..08174d0 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java @@ -238,7 +238,7 @@ public class ZipContainerDetector implements Detector { ZipEntrySource zipEntrySource = null; try { - zipEntrySource = new ZipFileZipEntrySource(new java.util.zip.ZipFile(stream.getFile())); + zipEntrySource = new ZipFileZipEntrySource(new ZipFile(stream.getFile())); } catch (IOException e) { return null; } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java index a43c789..1f4e29e 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java @@ -32,7 +32,8 @@ import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DocumentEntry; import org.apache.poi.poifs.filesystem.DocumentInputStream; import org.apache.poi.poifs.filesystem.Entry; -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.FileMagic; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.Ole10Native; import org.apache.poi.poifs.filesystem.Ole10NativeException; import org.apache.poi.util.IOUtils; @@ -115,7 +116,7 @@ class RTFObjDataParser { 
ByteArrayInputStream embIs = new ByteArrayInputStream(embObjBytes); boolean hasPoifs = false; try { - hasPoifs = NPOIFSFileSystem.hasPOIFSHeader(embIs); + hasPoifs = hasPOIFSHeader(embIs); } catch (IOException e) { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); return embObjBytes; @@ -139,7 +140,7 @@ class RTFObjDataParser { throws IOException { byte[] ret = null; - try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) { + try (POIFSFileSystem fs = new POIFSFileSystem(is)) { DirectoryNode root = fs.getRoot(); @@ -328,5 +329,9 @@ class RTFObjDataParser { return new byte[(int) len]; } + + private static boolean hasPOIFSHeader(InputStream is) throws IOException { + return FileMagic.valueOf(is) == FileMagic.OLE2; + } } diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java index 57b91ca..51ee1ce 100644 --- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java +++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java @@ -25,10 +25,9 @@ import java.io.FileFilter; import java.io.FilenameFilter; import java.io.IOException; import java.io.InputStream; -import java.nio.file.Path; import java.util.Random; -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.tika.MultiThreadedTikaTest; import org.apache.tika.Tika; import org.apache.tika.config.TikaConfig; @@ -43,7 +42,7 @@ import org.junit.After; import org.junit.Test; /** - * Junit test class for {@link ContainerAwareDetector} + * Junit test class for {@link org.apache.tika.parser.microsoft.POIFSContainerDetector} */ public class TestContainerAwareDetector extends MultiThreadedTikaTest { private final TikaConfig tikaConfig = TikaConfig.getDefaultConfig(); @@ -184,7 +183,7 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest { 
assertEquals( MediaType.parse("application/vnd.ms-powerpoint"), detector.detect(stream, new Metadata())); - assertTrue(stream.getOpenContainer() instanceof NPOIFSFileSystem); + assertTrue(stream.getOpenContainer() instanceof POIFSFileSystem); } } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java index 8388c1f..dbba939 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java @@ -137,7 +137,7 @@ public class PowerPointParserTest extends TikaTest { // Make sure boilerplate text didn't come through: assertEquals(-1, content.indexOf("Click to edit Master")); - //TIKA-1171 + //TIKA-1171, POI-62591 assertEquals(-1, content.indexOf("*")); } @@ -161,7 +161,7 @@ public class PowerPointParserTest extends TikaTest { // Make sure boilerplate text didn't come through: assertEquals(-1, content.indexOf("Click to edit Master")); - //TIKA-1171 + //TIKA-1171, POI-62591 assertEquals(-1, content.indexOf("*")); } @@ -180,7 +180,7 @@ public class PowerPointParserTest extends TikaTest { // Make sure boilerplate text didn't come through: assertEquals(-1, content.indexOf("Click to edit Master")); - //TIKA-1171 + //TIKA-1171, POI-62591 assertEquals(-1, content.indexOf("*")); } diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java index b21a03c..4d8679c 100644 --- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java +++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java @@ -20,7 +20,7 @@ package org.apache.tika.server.resource; import org.apache.commons.lang.StringUtils; import org.apache.cxf.attachment.ContentDisposition; import 
org.apache.cxf.jaxrs.ext.multipart.Attachment; -import org.apache.poi.extractor.ExtractorFactory; +import org.apache.poi.ooxml.extractor.ExtractorFactory; import org.apache.tika.Tika; import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector;
