This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-2552 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 657046d0c1bb230aa956e916cbfe7ba2904c9e78 Author: TALLISON <[email protected]> AuthorDate: Wed Sep 5 10:05:36 2018 -0400 NPOIFS->POIFS and add jackcess shim --- tika-eval/pom.xml | 2 +- tika-parsers/pom.xml | 2 +- .../tika/parser/microsoft/ExcelExtractor.java | 6 +- .../tika/parser/microsoft/HSLFExtractor.java | 18 +- .../parser/microsoft/JackcessCompoundOleUtil.java | 268 +++++++ .../tika/parser/microsoft/JackcessExtractor.java | 28 +- .../tika/parser/microsoft/JackcessOleUtil.java | 813 +++++++++++++++++++++ .../apache/tika/parser/microsoft/OfficeParser.java | 21 +- .../tika/parser/microsoft/OutlookExtractor.java | 4 +- .../parser/microsoft/POIFSContainerDetector.java | 12 +- .../tika/parser/microsoft/SummaryExtractor.java | 6 +- .../tika/parser/microsoft/WordExtractor.java | 6 +- .../microsoft/ooxml/AbstractOOXMLExtractor.java | 5 +- .../apache/tika/parser/rtf/RTFObjDataParser.java | 4 +- .../tika/detect/TestContainerAwareDetector.java | 6 +- .../tika/parser/microsoft/JackcessParserTest.java | 1 + 16 files changed, 1145 insertions(+), 57 deletions(-) diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml index c7d28fd..9289116 100644 --- a/tika-eval/pom.xml +++ b/tika-eval/pom.xml @@ -36,7 +36,7 @@ <properties> <cli.version>1.4</cli.version> <!--sync version with tika-server or move to parent? --> <lucene.version>7.4.0</lucene.version> - <poi.version>3.17</poi.version> + <poi.version>4.0.0</poi.version> </properties> <dependencies> diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml index f279f32..9c6437e 100644 --- a/tika-parsers/pom.xml +++ b/tika-parsers/pom.xml @@ -35,7 +35,7 @@ <url>http://tika.apache.org/</url> <properties> - <poi.version>4.0.0-SNAPSHOT</poi.version> + <poi.version>4.0.0</poi.version> <!-- NOTE: sync codec version with POI --> <codec.version>1.11</codec.version> <!-- NOTE: sync tukaani version with commons-compress in tika-parent--> diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java index ff5971a..0dd86ba 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java @@ -64,7 +64,7 @@ import org.apache.poi.poifs.filesystem.DirectoryEntry; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DocumentInputStream; import org.apache.poi.poifs.filesystem.Entry; -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; @@ -139,7 +139,7 @@ public class ExcelExtractor extends AbstractPOIFSExtractor { * or writing the extracted content */ protected void parse( - NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml, + POIFSFileSystem filesystem, XHTMLContentHandler xhtml, Locale locale) throws IOException, SAXException, TikaException { parse(filesystem.getRoot(), xhtml, locale); } @@ -273,7 +273,7 @@ public class ExcelExtractor extends AbstractPOIFSExtractor { * @throws IOException on any IO errors. * @throws SAXException on any SAX parsing errors. */ - public void processFile(NPOIFSFileSystem filesystem, boolean listenForAllRecords) + public void processFile(POIFSFileSystem filesystem, boolean listenForAllRecords) throws IOException, SAXException, TikaException { processFile(filesystem.getRoot(), listenForAllRecords); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java index 9990f30..5095709 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java @@ -45,7 +45,7 @@ import org.apache.poi.hslf.usermodel.HSLFTextParagraph; import org.apache.poi.hslf.usermodel.HSLFTextRun; import org.apache.poi.hslf.usermodel.HSLFTextShape; import org.apache.poi.poifs.filesystem.DirectoryNode; -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.sl.usermodel.Comment; import org.apache.poi.sl.usermodel.SimpleShape; import org.apache.tika.exception.TikaException; @@ -68,7 +68,7 @@ public class HSLFExtractor extends AbstractPOIFSExtractor { } protected void parse( - NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml) + POIFSFileSystem filesystem, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { parse(filesystem.getRoot(), xhtml); } @@ -270,9 +270,9 @@ public class HSLFExtractor extends AbstractPOIFSExtractor { long persistId = vbaAtom.getPersistIdRef(); for (HSLFObjectData objData : ppt.getEmbeddedObjects()) { if (objData.getExOleObjStg().getPersistId() == persistId) { - try (NPOIFSFileSystem npoifsFileSystem = new NPOIFSFileSystem(objData.getInputStream())) { + try (POIFSFileSystem poifsFileSystem = new POIFSFileSystem(objData.getInputStream())) { try { - OfficeParser.extractMacros(npoifsFileSystem, xhtml, + OfficeParser.extractMacros(poifsFileSystem, xhtml, EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context)); } catch (IOException|SAXException inner) { EmbeddedDocumentUtil.recordException(inner, parentMetadata); @@ -494,18 +494,18 @@ public class HSLFExtractor extends AbstractPOIFSExtractor { } if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj") || mediaType.equals("application/x-tika-msoffice")) { - NPOIFSFileSystem npoifs = null; + POIFSFileSystem poifs = null; try { - npoifs = new NPOIFSFileSystem(new CloseShieldInputStream(stream)); + poifs = new POIFSFileSystem(new CloseShieldInputStream(stream)); } catch (RuntimeException e) { throw new IOExceptionWithCause(e); } try { - handleEmbeddedOfficeDoc(npoifs.getRoot(), objID, xhtml); + handleEmbeddedOfficeDoc(poifs.getRoot(), objID, xhtml); } finally { - if (npoifs != null) { - npoifs.close(); + if (poifs != null) { + poifs.close(); } } } else { diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessCompoundOleUtil.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessCompoundOleUtil.java new file mode 100644 index 0000000..b09f19d --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessCompoundOleUtil.java @@ -0,0 +1,268 @@ +/* +Copyright (c) 2013 James Ahlborn + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package org.apache.tika.parser.microsoft; + +import com.healthmarketscience.jackcess.RuntimeIOException; +import com.healthmarketscience.jackcess.impl.ByteUtil; +import com.healthmarketscience.jackcess.impl.CustomToStringStyle; +import com.healthmarketscience.jackcess.util.MemFileChannel; +import com.healthmarketscience.jackcess.util.OleBlob; +import org.apache.commons.lang.builder.ToStringBuilder; +import org.apache.poi.poifs.filesystem.DirectoryEntry; +import org.apache.poi.poifs.filesystem.DocumentEntry; +import org.apache.poi.poifs.filesystem.DocumentInputStream; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; +import java.net.URLEncoder; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +/** + * Temporary copy/paste from Jackcess to allow upgrade to POI 4.0.0. + * This class will be removed once POI 4.0.0 is released and jackcess + * updates to the most recent version of POI. + * @deprecated -- this class will be removed in Tika >= 1.20 + */ +@Deprecated +class JackcessCompoundOleUtil implements JackcessOleUtil.CompoundPackageFactory { + private static final String ENTRY_NAME_CHARSET = "UTF-8"; + private static final String ENTRY_SEPARATOR = "/"; + private static final String CONTENTS_ENTRY = "CONTENTS"; + + static { + // force a poi class to be loaded to ensure that when this class is + // loaded, we know that the poi classes are available + POIFSFileSystem.class.getName(); + } + + public JackcessCompoundOleUtil() { + } + + /** + * Creates a nes CompoundContent for the given blob information. + */ + public JackcessOleUtil.ContentImpl createCompoundPackageContent( + JackcessOleUtil.OleBlobImpl blob, String prettyName, String className, String typeName, + ByteBuffer blobBb, int dataBlockLen) { + return new CompoundContentImpl(blob, prettyName, className, typeName, + blobBb.position(), dataBlockLen); + } + + /** + * Gets a DocumentEntry from compound storage based on a fully qualified, + * encoded entry name. + * + * @param entryName fully qualified, encoded entry name + * @param dir root directory of the compound storage + * @return the relevant DocumentEntry + * @throws FileNotFoundException if the entry does not exist + * @throws IOException if some other io error occurs + */ + public static DocumentEntry getDocumentEntry(String entryName, + DirectoryEntry dir) + throws IOException { + // split entry name into individual components and decode them + List<String> entryNames = new ArrayList<String>(); + for (String str : entryName.split(ENTRY_SEPARATOR)) { + if (str.length() == 0) { + continue; + } + entryNames.add(decodeEntryName(str)); + } + + DocumentEntry entry = null; + Iterator<String> iter = entryNames.iterator(); + while (iter.hasNext()) { + org.apache.poi.poifs.filesystem.Entry tmpEntry = dir.getEntry(iter.next()); + if (tmpEntry instanceof DirectoryEntry) { + dir = (DirectoryEntry) tmpEntry; + } else if (!iter.hasNext() && (tmpEntry instanceof DocumentEntry)) { + entry = (DocumentEntry) tmpEntry; + } else { + break; + } + } + + if (entry == null) { + throw new FileNotFoundException("Could not find document " + entryName); + } + + return entry; + } + + private static String encodeEntryName(String name) { + try { + return URLEncoder.encode(name, ENTRY_NAME_CHARSET); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + } + + private static String decodeEntryName(String name) { + try { + return URLDecoder.decode(name, ENTRY_NAME_CHARSET); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + } + + private static final class CompoundContentImpl + extends JackcessOleUtil.EmbeddedPackageContentImpl + implements OleBlob.CompoundContent { + private POIFSFileSystem _fs; + + private CompoundContentImpl( + JackcessOleUtil.OleBlobImpl blob, String prettyName, String className, + String typeName, int position, int length) { + super(blob, prettyName, className, typeName, position, length); + } + + public OleBlob.ContentType getType() { + return OleBlob.ContentType.COMPOUND_STORAGE; + } + + private POIFSFileSystem getFileSystem() throws IOException { + if (_fs == null) { + _fs = new POIFSFileSystem(MemFileChannel.newChannel(getStream(), "r")); + } + return _fs; + } + + public Iterator<Entry> iterator() { + try { + return getEntries(new ArrayList<Entry>(), getFileSystem().getRoot(), + ENTRY_SEPARATOR).iterator(); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + public EntryImpl getEntry(String entryName) throws IOException { + return new EntryImpl(entryName, + getDocumentEntry(entryName, getFileSystem().getRoot())); + } + + public boolean hasContentsEntry() throws IOException { + return getFileSystem().getRoot().hasEntry(CONTENTS_ENTRY); + } + + public EntryImpl getContentsEntry() throws IOException { + return getEntry(CONTENTS_ENTRY); + } + + private List<Entry> getEntries(List<Entry> entries, DirectoryEntry dir, + String prefix) { + for (org.apache.poi.poifs.filesystem.Entry entry : dir) { + if (entry instanceof DirectoryEntry) { + // .. recurse into this directory + getEntries(entries, (DirectoryEntry) entry, prefix + ENTRY_SEPARATOR); + } else if (entry instanceof DocumentEntry) { + // grab the entry name/detils + DocumentEntry de = (DocumentEntry) entry; + String entryName = prefix + encodeEntryName(entry.getName()); + entries.add(new EntryImpl(entryName, de)); + } + } + return entries; + } + + @Override + public void close() { + ByteUtil.closeQuietly(_fs); + _fs = null; + super.close(); + } + + @Override + public String toString() { + ToStringBuilder sb = toString(CustomToStringStyle.builder(this)); + + try { + sb.append("hasContentsEntry", hasContentsEntry()); + sb.append("entries", getEntries(new ArrayList<Entry>(), + getFileSystem().getRoot(), + ENTRY_SEPARATOR)); + } catch (IOException e) { + sb.append("entries", "<" + e + ">"); + } + + return sb.toString(); + } + + private final class EntryImpl implements OleBlob.CompoundContent.Entry { + private final String _name; + private final DocumentEntry _docEntry; + + private EntryImpl(String name, DocumentEntry docEntry) { + _name = name; + _docEntry = docEntry; + } + + public OleBlob.ContentType getType() { + return OleBlob.ContentType.UNKNOWN; + } + + public String getName() { + return _name; + } + + public CompoundContentImpl getParent() { + return CompoundContentImpl.this; + } + + public JackcessOleUtil.OleBlobImpl getBlob() { + return getParent().getBlob(); + } + + public long length() { + return _docEntry.getSize(); + } + + public InputStream getStream() throws IOException { + return new DocumentInputStream(_docEntry); + } + + public void writeTo(OutputStream out) throws IOException { + InputStream in = null; + try { + ByteUtil.copy(in = getStream(), out); + } finally { + ByteUtil.closeQuietly(in); + } + } + + @Override + public String toString() { + return CustomToStringStyle.valueBuilder(this) + .append("name", _name) + .append("length", length()) + .toString(); + } + } + } +} + + + diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java index bf5c5d0..3a10346 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java @@ -41,7 +41,7 @@ import com.healthmarketscience.jackcess.Row; import com.healthmarketscience.jackcess.Table; import com.healthmarketscience.jackcess.query.Query; import com.healthmarketscience.jackcess.util.OleBlob; -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.IOUtils; @@ -302,8 +302,9 @@ class JackcessExtractor extends AbstractPOIFSExtractor { } } + private void handleOLE(Row row, String cName, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { - OleBlob blob = row.getBlob(cName); + OleBlob blob = getBlob(row, cName); //lifted shamelessly from Jackcess's OleBlobTest if (blob == null) return; @@ -367,9 +368,21 @@ class JackcessExtractor extends AbstractPOIFSExtractor { } } + /* + Temporary work around until POI 4.0.0 is released and jackcess upgrades + This is copy/pasted from jackcess + */ + private OleBlob getBlob(Row row, String cName) { + byte[] bytes = row.getBytes(cName); + if (bytes == null) { + return null; + } + return JackcessOleUtil.parseBlob(bytes); + } + private void handleCompoundContent(OleBlob.CompoundContent cc, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { InputStream is = null; - NPOIFSFileSystem nfs = null; + POIFSFileSystem fileSystem = null; try { try { is = cc.getStream(); @@ -379,18 +392,18 @@ class JackcessExtractor extends AbstractPOIFSExtractor { } try { - nfs = new NPOIFSFileSystem(is); + fileSystem = new POIFSFileSystem(is); } catch (Exception e) { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata); return; } - handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml); + handleEmbeddedOfficeDoc(fileSystem.getRoot(), xhtml); } finally { - if (nfs != null) { + if (fileSystem != null) { try { - nfs.close(); + fileSystem.close(); } catch (IOException e) { //swallow } @@ -414,5 +427,6 @@ class JackcessExtractor extends AbstractPOIFSExtractor { } return shortDateTimeFormatter.format(d); } + } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessOleUtil.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessOleUtil.java new file mode 100644 index 0000000..a1432d6 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessOleUtil.java @@ -0,0 +1,813 @@ +/* +Copyright (c) 2013 James Ahlborn + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package org.apache.tika.parser.microsoft; + +import java.io.ByteArrayInputStream; +import java.io.Closeable; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.sql.Blob; +import java.sql.SQLException; +import java.sql.SQLFeatureNotSupportedException; +import java.text.Normalizer; +import java.util.EnumSet; +import java.util.Set; +import java.util.regex.Pattern; + +import com.healthmarketscience.jackcess.DataType; +import com.healthmarketscience.jackcess.util.OleBlob; +import static com.healthmarketscience.jackcess.util.OleBlob.*; +import org.apache.commons.lang.builder.ToStringBuilder; + +import com.healthmarketscience.jackcess.impl.ByteUtil; +import com.healthmarketscience.jackcess.impl.CustomToStringStyle; +import com.healthmarketscience.jackcess.impl.PageChannel; + +/** + * Utility code for working with OLE data. + * Temporary workaround until POI 4.0.0 is released and Jackcess is updated + * + * + * @author James Ahlborn + * @usage _advanced_class_ + * @deprecated this class will be removed in Tika >= 1.20 + */ +@Deprecated +class JackcessOleUtil { + + + /** + * Interface used to allow optional inclusion of the poi library for working + * with compound ole data. + */ + interface CompoundPackageFactory + { + public ContentImpl createCompoundPackageContent( + OleBlobImpl blob, String prettyName, String className, String typeName, + ByteBuffer blobBb, int dataBlockLen); + } + + private static final int PACKAGE_SIGNATURE = 0x1C15; + private static final Charset OLE_CHARSET = Charset.forName("US-ASCII"); + private static final Charset OLE_UTF_CHARSET = Charset.forName("UTF-16LE"); + private static final byte[] COMPOUND_STORAGE_SIGNATURE = + {(byte)0xd0,(byte)0xcf,(byte)0x11,(byte)0xe0, + (byte)0xa1,(byte)0xb1,(byte)0x1a,(byte)0xe1}; + private static final String SIMPLE_PACKAGE_TYPE = "Package"; + private static final int PACKAGE_OBJECT_TYPE = 0x02; + private static final int OLE_VERSION = 0x0501; + private static final int OLE_FORMAT = 0x02; + private static final int PACKAGE_STREAM_SIGNATURE = 0x02; + private static final int PS_EMBEDDED_FILE = 0x030000; + private static final int PS_LINKED_FILE = 0x010000; + private static final Set<ContentType> WRITEABLE_TYPES = EnumSet.of( + ContentType.LINK, ContentType.SIMPLE_PACKAGE, ContentType.OTHER); + private static final byte[] NO_DATA = new byte[0]; + private static final int LINK_HEADER = 0x01; + private static final byte[] PACKAGE_FOOTER = { + 0x01, 0x05, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, (byte)0xAD, 0x05, (byte)0xFE + }; + + // regex pattern which matches all the crazy extra stuff in unicode + private static final Pattern UNICODE_ACCENT_PATTERN = + Pattern.compile("[\\p{InCombiningDiacriticalMarks}\\p{IsLm}\\p{IsSk}]+"); + + private static final CompoundPackageFactory COMPOUND_FACTORY; + + static { + CompoundPackageFactory compoundFactory = null; + try { + compoundFactory = (CompoundPackageFactory) + Class.forName("org.apache.tika.parser.microsoft.JackcessCompoundOleUtil") + .newInstance(); + } catch(Throwable t) { + // must not have poi, will load compound ole data as "other" + } + COMPOUND_FACTORY = compoundFactory; + } + + /** + * Parses an access database blob structure and returns an appropriate + * OleBlob instance. + */ + public static OleBlob parseBlob(byte[] bytes) { + return new OleBlobImpl(bytes); + } + + /** + * Creates a new OlBlob instance using the given information. + */ + public static OleBlob createBlob(Builder oleBuilder) + throws IOException + { + try { + + if(!WRITEABLE_TYPES.contains(oleBuilder.getType())) { + throw new IllegalArgumentException( + "Cannot currently create ole values of type " + + oleBuilder.getType()); + } + + long contentLen = oleBuilder.getContentLength(); + byte[] contentBytes = oleBuilder.getBytes(); + InputStream contentStream = oleBuilder.getStream(); + byte[] packageStreamHeader = NO_DATA; + byte[] packageStreamFooter = NO_DATA; + + switch(oleBuilder.getType()) { + case LINK: + packageStreamHeader = writePackageStreamHeader(oleBuilder); + + // link "content" is file path + contentBytes = getZeroTermStrBytes(oleBuilder.getFilePath()); + contentLen = contentBytes.length; + break; + + case SIMPLE_PACKAGE: + packageStreamHeader = writePackageStreamHeader(oleBuilder); + packageStreamFooter = writePackageStreamFooter(oleBuilder); + break; + + case OTHER: + // nothing more to do + break; + default: + throw new RuntimeException("unexpected type " + oleBuilder.getType()); + } + + long payloadLen = packageStreamHeader.length + packageStreamFooter.length + + contentLen; + byte[] packageHeader = writePackageHeader(oleBuilder, payloadLen); + + long totalOleLen = packageHeader.length + PACKAGE_FOOTER.length + + payloadLen; + if(totalOleLen > DataType.OLE.getMaxSize()) { + throw new IllegalArgumentException("Content size of " + totalOleLen + + " is too large for ole column"); + } + + byte[] oleBytes = new byte[(int)totalOleLen]; + ByteBuffer bb = PageChannel.wrap(oleBytes); + bb.put(packageHeader); + bb.put(packageStreamHeader); + + if(contentLen > 0L) { + if(contentBytes != null) { + bb.put(contentBytes); + } else { + byte[] buf = new byte[8192]; + int numBytes = 0; + while((numBytes = contentStream.read(buf)) >= 0) { + bb.put(buf, 0, numBytes); + } + } + } + + bb.put(packageStreamFooter); + bb.put(PACKAGE_FOOTER); + + return parseBlob(oleBytes); + + } finally { + ByteUtil.closeQuietly(oleBuilder.getStream()); + } + } + + private static byte[] writePackageHeader(Builder oleBuilder, + long contentLen) { + + byte[] prettyNameBytes = getZeroTermStrBytes(oleBuilder.getPrettyName()); + String className = oleBuilder.getClassName(); + String typeName = oleBuilder.getTypeName(); + if(className == null) { + className = typeName; + } else if(typeName == null) { + typeName = className; + } + byte[] classNameBytes = getZeroTermStrBytes(className); + byte[] typeNameBytes = getZeroTermStrBytes(typeName); + + int packageHeaderLen = 20 + prettyNameBytes.length + classNameBytes.length; + + int oleHeaderLen = 24 + typeNameBytes.length; + + byte[] headerBytes = new byte[packageHeaderLen + oleHeaderLen]; + + ByteBuffer bb = PageChannel.wrap(headerBytes); + + // write outer package header + bb.putShort((short)PACKAGE_SIGNATURE); + bb.putShort((short)packageHeaderLen); + bb.putInt(PACKAGE_OBJECT_TYPE); + bb.putShort((short)prettyNameBytes.length); + bb.putShort((short)classNameBytes.length); + int prettyNameOff = bb.position() + 8; + bb.putShort((short)prettyNameOff); + bb.putShort((short)(prettyNameOff + prettyNameBytes.length)); + bb.putInt(-1); + bb.put(prettyNameBytes); + bb.put(classNameBytes); + + // put ole header + bb.putInt(OLE_VERSION); + bb.putInt(OLE_FORMAT); + bb.putInt(typeNameBytes.length); + bb.put(typeNameBytes); + bb.putLong(0L); + bb.putInt((int)contentLen); + + return headerBytes; + } + + private static byte[] writePackageStreamHeader(Builder oleBuilder) { + + byte[] fileNameBytes = getZeroTermStrBytes(oleBuilder.getFileName()); + byte[] filePathBytes = getZeroTermStrBytes(oleBuilder.getFilePath()); + + int headerLen = 6 + fileNameBytes.length + filePathBytes.length; + + if(oleBuilder.getType() == ContentType.SIMPLE_PACKAGE) { + + headerLen += 8 + filePathBytes.length; + + } else { + + headerLen += 2; + } + + byte[] headerBytes = new byte[headerLen]; + ByteBuffer bb = PageChannel.wrap(headerBytes); + bb.putShort((short)PACKAGE_STREAM_SIGNATURE); + bb.put(fileNameBytes); + bb.put(filePathBytes); + + if(oleBuilder.getType() == ContentType.SIMPLE_PACKAGE) { + bb.putInt(PS_EMBEDDED_FILE); + bb.putInt(filePathBytes.length); + bb.put(filePathBytes, 0, filePathBytes.length); + bb.putInt((int) oleBuilder.getContentLength()); + } else { + bb.putInt(PS_LINKED_FILE); + bb.putShort((short)LINK_HEADER); + } + + return headerBytes; + } + + private static byte[] writePackageStreamFooter(Builder oleBuilder) { + + // note, these are _not_ zero terminated + byte[] fileNameBytes = oleBuilder.getFileName().getBytes(OLE_UTF_CHARSET); + byte[] filePathBytes = oleBuilder.getFilePath().getBytes(OLE_UTF_CHARSET); + + int footerLen = 12 + (filePathBytes.length * 2) + fileNameBytes.length; + + byte[] footerBytes = new byte[footerLen]; + ByteBuffer bb = PageChannel.wrap(footerBytes); + + bb.putInt(filePathBytes.length/2); + bb.put(filePathBytes); + bb.putInt(fileNameBytes.length/2); + bb.put(fileNameBytes); + bb.putInt(filePathBytes.length/2); + bb.put(filePathBytes); + + return footerBytes; + } + + /** + * creates the appropriate ContentImpl for the given blob. + */ + private static ContentImpl parseContent(OleBlobImpl blob) + throws IOException + { + ByteBuffer bb = PageChannel.wrap(blob.getBytes()); + + if((bb.remaining() < 2) || (bb.getShort() != PACKAGE_SIGNATURE)) { + return new UnknownContentImpl(blob); + } + + // read outer package header + int headerSize = bb.getShort(); + /* int objType = */ bb.getInt(); + int prettyNameLen = bb.getShort(); + int classNameLen = bb.getShort(); + int prettyNameOff = bb.getShort(); + int classNameOff = bb.getShort(); + /* int objSize = */ bb.getInt(); + String prettyName = readStr(bb, prettyNameOff, prettyNameLen); + String className = readStr(bb, classNameOff, classNameLen); + bb.position(headerSize); + + // read ole header + int oleVer = bb.getInt(); + /* int format = */ bb.getInt(); + + if(oleVer != OLE_VERSION) { + return new UnknownContentImpl(blob); + } + + int typeNameLen = bb.getInt(); + String typeName = readStr(bb, bb.position(), typeNameLen); + bb.getLong(); // unused + int dataBlockLen = bb.getInt(); + int dataBlockPos = bb.position(); + + + if(SIMPLE_PACKAGE_TYPE.equalsIgnoreCase(typeName)) { + return createSimplePackageContent( + blob, prettyName, className, typeName, bb, dataBlockLen); + } + + // if COMPOUND_FACTORY is null, the poi library isn't available, so just + // load compound data as "other" + if((COMPOUND_FACTORY != null) && + (bb.remaining() >= COMPOUND_STORAGE_SIGNATURE.length) && + ByteUtil.matchesRange(bb, bb.position(), COMPOUND_STORAGE_SIGNATURE)) { + return COMPOUND_FACTORY.createCompoundPackageContent( + blob, prettyName, className, typeName, bb, dataBlockLen); + } + + // this is either some other "special" (as yet unhandled) format, or it is + // simply an embedded file (or it is compound data and poi isn't available) + return new OtherContentImpl(blob, prettyName, className, + typeName, dataBlockPos, dataBlockLen); + } + + private static ContentImpl createSimplePackageContent( + OleBlobImpl blob, String prettyName, String className, String typeName, + ByteBuffer blobBb, int dataBlockLen) { + + int dataBlockPos = blobBb.position(); + ByteBuffer bb = PageChannel.narrowBuffer(blobBb, dataBlockPos, + dataBlockPos + dataBlockLen); + + int packageSig = bb.getShort(); + if(packageSig != PACKAGE_STREAM_SIGNATURE) { + return new OtherContentImpl(blob, prettyName, className, + typeName, dataBlockPos, dataBlockLen); + } + + String fileName = readZeroTermStr(bb); + String filePath = readZeroTermStr(bb); + int packageType = bb.getInt(); + + if(packageType == PS_EMBEDDED_FILE) { + + int localFilePathLen = bb.getInt(); + String localFilePath = readStr(bb, bb.position(), localFilePathLen); + int dataLen = bb.getInt(); + int dataPos = bb.position(); + bb.position(dataLen + dataPos); + + // remaining strings are in "reverse" order (local file path, file name, + // file path). these string usee a real utf charset, and therefore can + // "fix" problems with ascii based names (so we prefer these strings to + // the original strings we found) + int strNum = 0; + while(true) { + + int rem = bb.remaining(); + if(rem < 4) { + break; + } + + int strLen = bb.getInt(); + String remStr = readStr(bb, bb.position(), strLen * 2, OLE_UTF_CHARSET); + + switch(strNum) { + case 0: + localFilePath = remStr; + break; + case 1: + fileName = remStr; + break; + case 2: + filePath = remStr; + break; + default: + // ignore + } + + ++strNum; + } + + return new SimplePackageContentImpl( + blob, prettyName, className, typeName, dataPos, dataLen, + fileName, filePath, localFilePath); + } + + if(packageType == PS_LINKED_FILE) { + + bb.getShort(); //unknown + String linkStr = readZeroTermStr(bb); + + return new LinkContentImpl(blob, prettyName, className, typeName, + fileName, linkStr, filePath); + } + + return new OtherContentImpl(blob, prettyName, className, + typeName, dataBlockPos, dataBlockLen); + } + + private static String readStr(ByteBuffer bb, int off, int len) { + return readStr(bb, off, len, OLE_CHARSET); + } + + private static String readZeroTermStr(ByteBuffer bb) { + int off = bb.position(); + while(bb.hasRemaining()) { + byte b = bb.get(); + if(b == 0) { + break; + } + } + int len = bb.position() - off; + return readStr(bb, off, len); + } + + private static String readStr(ByteBuffer bb, int off, int len, + Charset charset) { + String str = new String(bb.array(), off, len, charset); + bb.position(off + len); + if(str.charAt(str.length() - 1) == '\0') { + str = str.substring(0, str.length() - 1); + } + return str; + } + + private static byte[] getZeroTermStrBytes(String str) { + // since we are converting to ascii, try to make "nicer" versions of crazy + // chars (e.g. convert "u with an umlaut" to just "u"). this may not + // ultimately help anything but it is what ms access does. + + // decompose complex chars into combos of char and accent + str = Normalizer.normalize(str, Normalizer.Form.NFD); + // strip the accents + str = UNICODE_ACCENT_PATTERN.matcher(str).replaceAll(""); + // (re)normalize what is left + str = Normalizer.normalize(str, Normalizer.Form.NFC); + + return (str + '\0').getBytes(OLE_CHARSET); + } + + + static final class OleBlobImpl implements OleBlob + { + private byte[] _bytes; + private ContentImpl _content; + + private OleBlobImpl(byte[] bytes) { + _bytes = bytes; + } + + public void writeTo(OutputStream out) throws IOException { + out.write(_bytes); + } + + public Content getContent() throws IOException { + if(_content == null) { + _content = parseContent(this); + } + return _content; + } + + public InputStream getBinaryStream() throws SQLException { + return new ByteArrayInputStream(_bytes); + } + + public InputStream getBinaryStream(long pos, long len) + throws SQLException + { + return new ByteArrayInputStream(_bytes, fromJdbcOffset(pos), (int)len); + } + + public long length() throws SQLException { + return _bytes.length; + } + + public byte[] getBytes() throws IOException { + if(_bytes == null) { + throw new IOException("blob is closed"); + } + return _bytes; + } + + public byte[] getBytes(long pos, int len) throws SQLException { + return ByteUtil.copyOf(_bytes, fromJdbcOffset(pos), len); + } + + public long position(byte[] pattern, long start) throws SQLException { + int pos = ByteUtil.findRange(PageChannel.wrap(_bytes), + fromJdbcOffset(start), pattern); + return((pos >= 0) ? toJdbcOffset(pos) : pos); + } + + public long position(Blob pattern, long start) throws SQLException { + return position(pattern.getBytes(1L, (int)pattern.length()), start); + } + + public OutputStream setBinaryStream(long position) throws SQLException { + throw new SQLFeatureNotSupportedException(); + } + + public void truncate(long len) throws SQLException { + throw new SQLFeatureNotSupportedException(); + } + + public int setBytes(long pos, byte[] bytes) throws SQLException { + throw new SQLFeatureNotSupportedException(); + } + + public int setBytes(long pos, byte[] bytes, int offset, int lesn) + throws SQLException { + throw new SQLFeatureNotSupportedException(); + } + + public void free() { + close(); + } + + public void close() { + _bytes = null; + ByteUtil.closeQuietly(_content); + _content = null; + } + + private static int toJdbcOffset(int off) { + return off + 1; + } + + private static int fromJdbcOffset(long off) { + return (int)off - 1; + } + + @Override + public String toString() { + ToStringBuilder sb = CustomToStringStyle.builder(this); + if(_content != null) { + sb.append("content", _content); + } else { + sb.append("bytes", _bytes); + sb.append("content", "(uninitialized)"); + } + return sb.toString(); + } + } + + static abstract class ContentImpl implements Content, Closeable + { + protected final OleBlobImpl _blob; + + protected ContentImpl(OleBlobImpl blob) { + _blob = blob; + } + + public OleBlobImpl getBlob() { + return _blob; + } + + protected byte[] getBytes() throws IOException { + return getBlob().getBytes(); + } + + public void close() { + // base does nothing + } + + protected ToStringBuilder toString(ToStringBuilder sb) { + sb.append("type", getType()); + return sb; + } + } + + static abstract class EmbeddedContentImpl extends ContentImpl + implements EmbeddedContent + { + private final int _position; + private final int _length; + + protected EmbeddedContentImpl(OleBlobImpl blob, int position, int length) + { + super(blob); + _position = position; + _length = length; + } + + public long length() { + return _length; + } + + public InputStream getStream() throws IOException { + return new ByteArrayInputStream(getBytes(), _position, _length); + } + + public void writeTo(OutputStream out) throws IOException { + out.write(getBytes(), _position, _length); + } + + @Override + protected ToStringBuilder toString(ToStringBuilder sb) { + super.toString(sb); + if(_position >= 0) { + sb.append("content", ByteBuffer.wrap(_blob._bytes, _position, _length)); + } + return sb; + } + } + + static abstract class EmbeddedPackageContentImpl + extends EmbeddedContentImpl + implements PackageContent + { + private final String _prettyName; + private final String _className; + private final String _typeName; + + protected EmbeddedPackageContentImpl( + OleBlobImpl blob, String prettyName, String className, + String typeName, int position, int length) + { + super(blob, position, length); + _prettyName = prettyName; + _className = className; + _typeName = typeName; + } + + public String getPrettyName() { + return _prettyName; + } + + public String getClassName() { + return _className; + } + + public String getTypeName() { + return _typeName; + } + + @Override + protected ToStringBuilder toString(ToStringBuilder sb) { + sb.append("prettyName", _prettyName) + .append("className", _className) + .append("typeName", _typeName); + super.toString(sb); + return sb; + } + } + + private static final class LinkContentImpl + extends EmbeddedPackageContentImpl + implements LinkContent + { + private final String _fileName; + private final String _linkPath; + private final String _filePath; + + private LinkContentImpl(OleBlobImpl blob, String prettyName, + String className, String typeName, + String fileName, String linkPath, + String filePath) + { + super(blob, prettyName, className, typeName, -1, -1); + _fileName = fileName; + _linkPath = linkPath; + _filePath = filePath; + } + + public ContentType getType() { + return ContentType.LINK; + } + + public String getFileName() { + return _fileName; + } + + public String getLinkPath() { + return _linkPath; + } + + public String getFilePath() { + return _filePath; + } + + public InputStream getLinkStream() throws IOException { + return new FileInputStream(getLinkPath()); + } + + @Override + public String toString() { + return toString(CustomToStringStyle.builder(this)) + .append("fileName", _fileName) + .append("linkPath", _linkPath) + .append("filePath", _filePath) + .toString(); + } + } + + private static final class SimplePackageContentImpl + extends EmbeddedPackageContentImpl + implements SimplePackageContent + { + private final String _fileName; + private final String _filePath; + private final String _localFilePath; + + private SimplePackageContentImpl(OleBlobImpl blob, String prettyName, + String className, String typeName, + int position, int length, + String fileName, String filePath, + String localFilePath) + { + super(blob, prettyName, className, typeName, position, length); + _fileName = fileName; + _filePath = filePath; + _localFilePath = localFilePath; + } + + public ContentType getType() { + return ContentType.SIMPLE_PACKAGE; + } + + public String getFileName() { + return _fileName; + } + + public String getFilePath() { + return _filePath; + } + + public String getLocalFilePath() { + return _localFilePath; + } + + @Override + public String toString() { + return toString(CustomToStringStyle.builder(this)) + .append("fileName", _fileName) + .append("filePath", _filePath) + .append("localFilePath", _localFilePath) + .toString(); + } + } + + private static final class OtherContentImpl + extends EmbeddedPackageContentImpl + implements OtherContent + { + private OtherContentImpl( + OleBlobImpl blob, String prettyName, String className, + String typeName, int position, int length) + { + super(blob, prettyName, className, typeName, position, length); + } + + public ContentType getType() { + return ContentType.OTHER; + } + + @Override + public String toString() { + return toString(CustomToStringStyle.builder(this)) + .toString(); + } + } + + private static final class UnknownContentImpl extends ContentImpl + { + private UnknownContentImpl(OleBlobImpl blob) { + super(blob); + } + + public ContentType getType() { + return ContentType.UNKNOWN; + } + + @Override + public String toString() { + return toString(CustomToStringStyle.builder(this)) + .append("content", _blob._bytes) + .toString(); + } + } + + } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java index 133d5e4..779d5ee 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java @@ -36,7 +36,6 @@ import org.apache.poi.poifs.crypt.EncryptionInfo; import org.apache.poi.poifs.filesystem.DirectoryEntry; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.Entry; -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.macros.VBAMacroReader; import org.apache.poi.util.IOUtils; @@ -105,23 +104,23 @@ public class OfficeParser extends AbstractOfficeParser { final DirectoryNode root; TikaInputStream tstream = TikaInputStream.cast(stream); - NPOIFSFileSystem mustCloseFs = null; + POIFSFileSystem mustCloseFs = null; try { if (tstream == null) { - mustCloseFs = new NPOIFSFileSystem(new CloseShieldInputStream(stream)); + mustCloseFs = new POIFSFileSystem(new CloseShieldInputStream(stream)); root = mustCloseFs.getRoot(); } else { final Object container = tstream.getOpenContainer(); - if (container instanceof NPOIFSFileSystem) { - root = ((NPOIFSFileSystem) container).getRoot(); + if (container instanceof POIFSFileSystem) { + root = ((POIFSFileSystem) container).getRoot(); } else if (container instanceof DirectoryNode) { root = (DirectoryNode) container; } else { - NPOIFSFileSystem fs = null; + POIFSFileSystem fs = null; if (tstream.hasFile()) { - fs = new NPOIFSFileSystem(tstream.getFile(), true); + fs = new POIFSFileSystem(tstream.getFile(), true); } else { - fs = new NPOIFSFileSystem(new CloseShieldInputStream(tstream)); + fs = new POIFSFileSystem(new CloseShieldInputStream(tstream)); } //tstream will close the fs, no need to close this below tstream.setOpenContainer(fs); @@ -274,10 +273,6 @@ public class OfficeParser extends AbstractOfficeParser { return detectType(fs.getRoot()); } - public static POIFSDocumentType detectType(NPOIFSFileSystem fs) { - return detectType(fs.getRoot()); - } - public static POIFSDocumentType detectType(DirectoryEntry node) { Set<String> names = new HashSet<String>(); for (Entry entry : node) { @@ -313,7 +308,7 @@ public class OfficeParser extends AbstractOfficeParser { * @throws IOException on IOException if it occurs during the extraction of the embedded doc * @throws SAXException on SAXException for writing to xhtml */ - public static void extractMacros(NPOIFSFileSystem fs, ContentHandler xhtml, + public static void extractMacros(POIFSFileSystem fs, ContentHandler xhtml, EmbeddedDocumentExtractor embeddedDocumentExtractor) throws IOException, SAXException { VBAMacroReader reader = null; diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index 0aed803..5d13351 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -51,7 +51,7 @@ import org.apache.poi.hsmf.datatypes.StringChunk; import org.apache.poi.hsmf.datatypes.Types; import org.apache.poi.hsmf.exceptions.ChunkNotFoundException; import org.apache.poi.poifs.filesystem.DirectoryNode; -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.util.CodePageUtil; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentUtil; @@ -126,7 +126,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { private final boolean extractAllAlternatives; - public OutlookExtractor(NPOIFSFileSystem filesystem, ParseContext context) throws TikaException { + public OutlookExtractor(POIFSFileSystem filesystem, ParseContext context) throws TikaException { this(filesystem.getRoot(), context); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java index 1c98690..1b5a0a9 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java @@ -33,7 +33,7 @@ import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DocumentInputStream; import org.apache.poi.poifs.filesystem.DocumentNode; import org.apache.poi.poifs.filesystem.Entry; -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.tika.detect.Detector; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -385,7 +385,7 @@ public class POIFSContainerDetector implements Detector { File file = stream.getFile(); try { - NPOIFSFileSystem fs = new NPOIFSFileSystem(file, true); + POIFSFileSystem fs = new POIFSFileSystem(file, true); // Optimize a possible later parsing process by keeping // a reference to the already opened POI file system @@ -423,8 +423,8 @@ public class POIFSContainerDetector implements Detector { Set<String> names = null; if (tis != null) { Object container = tis.getOpenContainer(); - if (container instanceof NPOIFSFileSystem) { - names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot()); + if (container instanceof POIFSFileSystem) { + names = getTopLevelNames(((POIFSFileSystem) container).getRoot()); } else if (container instanceof DirectoryNode) { names = getTopLevelNames((DirectoryNode) container); } @@ -454,8 +454,8 @@ public class POIFSContainerDetector implements Detector { // Detect based on the names (as available) if (tis != null && tis.getOpenContainer() != null && - tis.getOpenContainer() instanceof NPOIFSFileSystem) { - return detect(names, ((NPOIFSFileSystem) tis.getOpenContainer()).getRoot()); + tis.getOpenContainer() instanceof POIFSFileSystem) { + return detect(names, ((POIFSFileSystem) tis.getOpenContainer()).getRoot()); } else { return detect(names, null); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java index 3e2ea26..8017184 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java @@ -32,7 +32,7 @@ import org.apache.poi.hpsf.UnexpectedPropertySetTypeException; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DocumentEntry; import org.apache.poi.poifs.filesystem.DocumentInputStream; -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.MSOffice; import org.apache.tika.metadata.Metadata; @@ -63,7 +63,7 @@ public class SummaryExtractor { this.metadata = metadata; } - public void parseSummaries(NPOIFSFileSystem filesystem) + public void parseSummaries(POIFSFileSystem filesystem) throws IOException, TikaException { parseSummaries(filesystem.getRoot()); } @@ -94,8 +94,6 @@ public class SummaryExtractor { // no property stream, just skip it } catch (UnexpectedPropertySetTypeException e) { throw new TikaException("Unexpected HPSF document", e); - } catch (MarkUnsupportedException e) { - throw new TikaException("Invalid DocumentInputStream", e); } catch (Exception e) { LOG.warn("Ignoring unexpected exception while parsing summary entry {}", entryName, e); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java index 4a80420..30bd4bb 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java @@ -49,7 +49,7 @@ import org.apache.poi.hwpf.usermodel.TableRow; import org.apache.poi.poifs.filesystem.DirectoryEntry; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.Entry; -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; @@ -145,7 +145,7 @@ public class WordExtractor extends AbstractPOIFSExtractor { } protected void parse( - NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml) + POIFSFileSystem filesystem, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { parse(filesystem.getRoot(), xhtml); } @@ -661,7 +661,7 @@ public class WordExtractor extends AbstractPOIFSExtractor { } protected void parseWord6( - NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml) + POIFSFileSystem filesystem, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { parseWord6(filesystem.getRoot(), xhtml); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java index ac5abc9..57c38a6 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java @@ -41,7 +41,6 @@ import org.apache.poi.openxml4j.opc.PackageRelationshipTypes; import org.apache.poi.openxml4j.opc.TargetMode; import org.apache.poi.openxml4j.opc.internal.FileHelper; import org.apache.poi.poifs.filesystem.DirectoryNode; -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.poi.poifs.filesystem.Ole10Native; import org.apache.poi.poifs.filesystem.Ole10NativeException; import org.apache.poi.poifs.filesystem.POIFSFileSystem; @@ -418,9 +417,9 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { if (officeParserConfig.getExtractMacros()) { try (InputStream is = macroPart.getInputStream()) { - try (NPOIFSFileSystem npoifs = new NPOIFSFileSystem(is)) { + try (POIFSFileSystem poifs = new POIFSFileSystem(is)) { //Macro reading exceptions are already swallowed here - OfficeParser.extractMacros(npoifs, handler, embeddedExtractor); + OfficeParser.extractMacros(poifs, handler, embeddedExtractor); } } catch (IOException e) { throw new TikaException("Broken OOXML file", e); diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java index 4387ca4..90ea58b 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java @@ -33,7 +33,7 @@ import org.apache.poi.poifs.filesystem.DocumentEntry; import org.apache.poi.poifs.filesystem.DocumentInputStream; import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.poifs.filesystem.FileMagic; -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.Ole10Native; import org.apache.poi.poifs.filesystem.Ole10NativeException; import org.apache.poi.poifs.storage.HeaderBlock; @@ -141,7 +141,7 @@ class RTFObjDataParser { throws IOException { byte[] ret = null; - try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) { + try (POIFSFileSystem fs = new POIFSFileSystem(is)) { DirectoryNode root = fs.getRoot(); diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java index 57b91ca..ad12517 100644 --- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java +++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java @@ -28,7 +28,7 @@ import java.io.InputStream; import java.nio.file.Path; import java.util.Random; -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.tika.MultiThreadedTikaTest; import org.apache.tika.Tika; import org.apache.tika.config.TikaConfig; @@ -43,7 +43,7 @@ import org.junit.After; import org.junit.Test; /** - * Junit test class for {@link ContainerAwareDetector} + * Junit test class for {@link org.apache.tika.parser.microsoft.POIFSContainerDetector} */ public class TestContainerAwareDetector extends MultiThreadedTikaTest { private final TikaConfig tikaConfig = TikaConfig.getDefaultConfig(); @@ -184,7 +184,7 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest { assertEquals( MediaType.parse("application/vnd.ms-powerpoint"), detector.detect(stream, new Metadata())); - assertTrue(stream.getOpenContainer() instanceof NPOIFSFileSystem); + assertTrue(stream.getOpenContainer() instanceof POIFSFileSystem); } } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java index f39b961..2ec2a56 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java @@ -65,6 +65,7 @@ public class JackcessParserTest extends TikaTest { IOUtils.closeQuietly(is); } List<Metadata> list = handler.getMetadataList(); + debug(list); assertEquals(4, list.size()); String mainContent = list.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
