NPOIFS->POIFS and add jackcess shim

tallison Wed, 05 Sep 2018 07:06:18 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-2552
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 657046d0c1bb230aa956e916cbfe7ba2904c9e78
Author: TALLISON <[email protected]>
AuthorDate: Wed Sep 5 10:05:36 2018 -0400

    NPOIFS->POIFS and add jackcess shim
---
 tika-eval/pom.xml                                  |   2 +-
 tika-parsers/pom.xml                               |   2 +-
 .../tika/parser/microsoft/ExcelExtractor.java      |   6 +-
 .../tika/parser/microsoft/HSLFExtractor.java       |  18 +-
 .../parser/microsoft/JackcessCompoundOleUtil.java  | 268 +++++++
 .../tika/parser/microsoft/JackcessExtractor.java   |  28 +-
 .../tika/parser/microsoft/JackcessOleUtil.java     | 813 +++++++++++++++++++++
 .../apache/tika/parser/microsoft/OfficeParser.java |  21 +-
 .../tika/parser/microsoft/OutlookExtractor.java    |   4 +-
 .../parser/microsoft/POIFSContainerDetector.java   |  12 +-
 .../tika/parser/microsoft/SummaryExtractor.java    |   6 +-
 .../tika/parser/microsoft/WordExtractor.java       |   6 +-
 .../microsoft/ooxml/AbstractOOXMLExtractor.java    |   5 +-
 .../apache/tika/parser/rtf/RTFObjDataParser.java   |   4 +-
 .../tika/detect/TestContainerAwareDetector.java    |   6 +-
 .../tika/parser/microsoft/JackcessParserTest.java  |   1 +
 16 files changed, 1145 insertions(+), 57 deletions(-)

diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index c7d28fd..9289116 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -36,7 +36,7 @@
     <properties>
         <cli.version>1.4</cli.version> <!--sync version with tika-server or 
move to parent? -->
         <lucene.version>7.4.0</lucene.version>
-        <poi.version>3.17</poi.version>
+        <poi.version>4.0.0</poi.version>
     </properties>
 
     <dependencies>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index f279f32..9c6437e 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -35,7 +35,7 @@
   <url>http://tika.apache.org/</url>
 
   <properties>
-    <poi.version>4.0.0-SNAPSHOT</poi.version>
+    <poi.version>4.0.0</poi.version>
     <!-- NOTE: sync codec version with POI -->
     <codec.version>1.11</codec.version>
     <!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index ff5971a..0dd86ba 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -64,7 +64,7 @@ import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
@@ -139,7 +139,7 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
      *                     or writing the extracted content
      */
     protected void parse(
-            NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml,
+            POIFSFileSystem filesystem, XHTMLContentHandler xhtml,
             Locale locale) throws IOException, SAXException, TikaException {
         parse(filesystem.getRoot(), xhtml, locale);
     }
@@ -273,7 +273,7 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
          * @throws IOException  on any IO errors.
          * @throws SAXException on any SAX parsing errors.
          */
-        public void processFile(NPOIFSFileSystem filesystem, boolean 
listenForAllRecords)
+        public void processFile(POIFSFileSystem filesystem, boolean 
listenForAllRecords)
                 throws IOException, SAXException, TikaException {
             processFile(filesystem.getRoot(), listenForAllRecords);
         }
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index 9990f30..5095709 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -45,7 +45,7 @@ import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
 import org.apache.poi.hslf.usermodel.HSLFTextRun;
 import org.apache.poi.hslf.usermodel.HSLFTextShape;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.sl.usermodel.Comment;
 import org.apache.poi.sl.usermodel.SimpleShape;
 import org.apache.tika.exception.TikaException;
@@ -68,7 +68,7 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
     }
 
     protected void parse(
-            NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+            POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
             throws IOException, SAXException, TikaException {
         parse(filesystem.getRoot(), xhtml);
     }
@@ -270,9 +270,9 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
         long persistId = vbaAtom.getPersistIdRef();
         for (HSLFObjectData objData : ppt.getEmbeddedObjects()) {
             if (objData.getExOleObjStg().getPersistId() == persistId) {
-                try (NPOIFSFileSystem npoifsFileSystem = new 
NPOIFSFileSystem(objData.getInputStream())) {
+                try (POIFSFileSystem poifsFileSystem = new 
POIFSFileSystem(objData.getInputStream())) {
                     try {
-                        OfficeParser.extractMacros(npoifsFileSystem, xhtml,
+                        OfficeParser.extractMacros(poifsFileSystem, xhtml,
                                 
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
                     } catch (IOException|SAXException inner) {
                         EmbeddedDocumentUtil.recordException(inner, 
parentMetadata);
@@ -494,18 +494,18 @@ public class HSLFExtractor extends AbstractPOIFSExtractor 
{
                         }
                         if 
(mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")
                                 || 
mediaType.equals("application/x-tika-msoffice")) {
-                            NPOIFSFileSystem npoifs = null;
+                            POIFSFileSystem poifs = null;
 
                             try {
-                                npoifs = new NPOIFSFileSystem(new 
CloseShieldInputStream(stream));
+                                poifs = new POIFSFileSystem(new 
CloseShieldInputStream(stream));
                             } catch (RuntimeException e) {
                                 throw new IOExceptionWithCause(e);
                             }
                             try {
-                                handleEmbeddedOfficeDoc(npoifs.getRoot(), 
objID, xhtml);
+                                handleEmbeddedOfficeDoc(poifs.getRoot(), 
objID, xhtml);
                             } finally {
-                                if (npoifs != null) {
-                                    npoifs.close();
+                                if (poifs != null) {
+                                    poifs.close();
                                 }
                             }
                         } else {
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessCompoundOleUtil.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessCompoundOleUtil.java
new file mode 100644
index 0000000..b09f19d
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessCompoundOleUtil.java
@@ -0,0 +1,268 @@
+/*
+Copyright (c) 2013 James Ahlborn
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package org.apache.tika.parser.microsoft;
+
+import com.healthmarketscience.jackcess.RuntimeIOException;
+import com.healthmarketscience.jackcess.impl.ByteUtil;
+import com.healthmarketscience.jackcess.impl.CustomToStringStyle;
+import com.healthmarketscience.jackcess.util.MemFileChannel;
+import com.healthmarketscience.jackcess.util.OleBlob;
+import org.apache.commons.lang.builder.ToStringBuilder;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+import java.net.URLEncoder;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * Temporary copy/paste from Jackcess to allow upgrade to POI 4.0.0.
+ * This class will be removed once POI 4.0.0 is released and jackcess
+ * updates to the most recent version of POI.
+ * @deprecated -- this class will be removed in Tika >= 1.20
+ */
+@Deprecated
+class JackcessCompoundOleUtil implements 
JackcessOleUtil.CompoundPackageFactory {
+    private static final String ENTRY_NAME_CHARSET = "UTF-8";
+    private static final String ENTRY_SEPARATOR = "/";
+    private static final String CONTENTS_ENTRY = "CONTENTS";
+
+    static {
+        // force a poi class to be loaded to ensure that when this class is
+        // loaded, we know that the poi classes are available
+        POIFSFileSystem.class.getName();
+    }
+
+    public JackcessCompoundOleUtil() {
+    }
+
+    /**
+     * Creates a new CompoundContent for the given blob information.
+     */
+    public JackcessOleUtil.ContentImpl createCompoundPackageContent(
+            JackcessOleUtil.OleBlobImpl blob, String prettyName, String 
className, String typeName,
+            ByteBuffer blobBb, int dataBlockLen) {
+        return new CompoundContentImpl(blob, prettyName, className, typeName,
+                blobBb.position(), dataBlockLen);
+    }
+
+    /**
+     * Gets a DocumentEntry from compound storage based on a fully qualified,
+     * encoded entry name.
+     *
+     * @param entryName fully qualified, encoded entry name
+     * @param dir       root directory of the compound storage
+     * @return the relevant DocumentEntry
+     * @throws FileNotFoundException if the entry does not exist
+     * @throws IOException           if some other io error occurs
+     */
+    public static DocumentEntry getDocumentEntry(String entryName,
+                                                 DirectoryEntry dir)
+            throws IOException {
+        // split entry name into individual components and decode them
+        List<String> entryNames = new ArrayList<String>();
+        for (String str : entryName.split(ENTRY_SEPARATOR)) {
+            if (str.length() == 0) {
+                continue;
+            }
+            entryNames.add(decodeEntryName(str));
+        }
+
+        DocumentEntry entry = null;
+        Iterator<String> iter = entryNames.iterator();
+        while (iter.hasNext()) {
+            org.apache.poi.poifs.filesystem.Entry tmpEntry = 
dir.getEntry(iter.next());
+            if (tmpEntry instanceof DirectoryEntry) {
+                dir = (DirectoryEntry) tmpEntry;
+            } else if (!iter.hasNext() && (tmpEntry instanceof DocumentEntry)) 
{
+                entry = (DocumentEntry) tmpEntry;
+            } else {
+                break;
+            }
+        }
+
+        if (entry == null) {
+            throw new FileNotFoundException("Could not find document " + 
entryName);
+        }
+
+        return entry;
+    }
+
+    private static String encodeEntryName(String name) {
+        try {
+            return URLEncoder.encode(name, ENTRY_NAME_CHARSET);
+        } catch (UnsupportedEncodingException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private static String decodeEntryName(String name) {
+        try {
+            return URLDecoder.decode(name, ENTRY_NAME_CHARSET);
+        } catch (UnsupportedEncodingException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private static final class CompoundContentImpl
+            extends JackcessOleUtil.EmbeddedPackageContentImpl
+            implements OleBlob.CompoundContent {
+        private POIFSFileSystem _fs;
+
+        private CompoundContentImpl(
+                JackcessOleUtil.OleBlobImpl blob, String prettyName, String 
className,
+                String typeName, int position, int length) {
+            super(blob, prettyName, className, typeName, position, length);
+        }
+
+        public OleBlob.ContentType getType() {
+            return OleBlob.ContentType.COMPOUND_STORAGE;
+        }
+
+        private POIFSFileSystem getFileSystem() throws IOException {
+            if (_fs == null) {
+                _fs = new 
POIFSFileSystem(MemFileChannel.newChannel(getStream(), "r"));
+            }
+            return _fs;
+        }
+
+        public Iterator<Entry> iterator() {
+            try {
+                return getEntries(new ArrayList<Entry>(), 
getFileSystem().getRoot(),
+                        ENTRY_SEPARATOR).iterator();
+            } catch (IOException e) {
+                throw new RuntimeIOException(e);
+            }
+        }
+
+        public EntryImpl getEntry(String entryName) throws IOException {
+            return new EntryImpl(entryName,
+                    getDocumentEntry(entryName, getFileSystem().getRoot()));
+        }
+
+        public boolean hasContentsEntry() throws IOException {
+            return getFileSystem().getRoot().hasEntry(CONTENTS_ENTRY);
+        }
+
+        public EntryImpl getContentsEntry() throws IOException {
+            return getEntry(CONTENTS_ENTRY);
+        }
+
+        private List<Entry> getEntries(List<Entry> entries, DirectoryEntry dir,
+                                       String prefix) {
+            for (org.apache.poi.poifs.filesystem.Entry entry : dir) {
+                if (entry instanceof DirectoryEntry) {
+                    // .. recurse into this directory
+                    getEntries(entries, (DirectoryEntry) entry, prefix + 
ENTRY_SEPARATOR);
+                } else if (entry instanceof DocumentEntry) {
+                    // grab the entry name/details
+                    DocumentEntry de = (DocumentEntry) entry;
+                    String entryName = prefix + 
encodeEntryName(entry.getName());
+                    entries.add(new EntryImpl(entryName, de));
+                }
+            }
+            return entries;
+        }
+
+        @Override
+        public void close() {
+            ByteUtil.closeQuietly(_fs);
+            _fs = null;
+            super.close();
+        }
+
+        @Override
+        public String toString() {
+            ToStringBuilder sb = toString(CustomToStringStyle.builder(this));
+
+            try {
+                sb.append("hasContentsEntry", hasContentsEntry());
+                sb.append("entries", getEntries(new ArrayList<Entry>(),
+                        getFileSystem().getRoot(),
+                        ENTRY_SEPARATOR));
+            } catch (IOException e) {
+                sb.append("entries", "<" + e + ">");
+            }
+
+            return sb.toString();
+        }
+
+        private final class EntryImpl implements OleBlob.CompoundContent.Entry 
{
+            private final String _name;
+            private final DocumentEntry _docEntry;
+
+            private EntryImpl(String name, DocumentEntry docEntry) {
+                _name = name;
+                _docEntry = docEntry;
+            }
+
+            public OleBlob.ContentType getType() {
+                return OleBlob.ContentType.UNKNOWN;
+            }
+
+            public String getName() {
+                return _name;
+            }
+
+            public CompoundContentImpl getParent() {
+                return CompoundContentImpl.this;
+            }
+
+            public JackcessOleUtil.OleBlobImpl getBlob() {
+                return getParent().getBlob();
+            }
+
+            public long length() {
+                return _docEntry.getSize();
+            }
+
+            public InputStream getStream() throws IOException {
+                return new DocumentInputStream(_docEntry);
+            }
+
+            public void writeTo(OutputStream out) throws IOException {
+                InputStream in = null;
+                try {
+                    ByteUtil.copy(in = getStream(), out);
+                } finally {
+                    ByteUtil.closeQuietly(in);
+                }
+            }
+
+            @Override
+            public String toString() {
+                return CustomToStringStyle.valueBuilder(this)
+                        .append("name", _name)
+                        .append("length", length())
+                        .toString();
+            }
+        }
+    }
+}
+
+
+
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
index bf5c5d0..3a10346 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
@@ -41,7 +41,7 @@ import com.healthmarketscience.jackcess.Row;
 import com.healthmarketscience.jackcess.Table;
 import com.healthmarketscience.jackcess.query.Query;
 import com.healthmarketscience.jackcess.util.OleBlob;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.IOUtils;
@@ -302,8 +302,9 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
         }
     }
 
+
     private void handleOLE(Row row, String cName, XHTMLContentHandler xhtml) 
throws IOException, SAXException, TikaException {
-        OleBlob blob = row.getBlob(cName);
+        OleBlob blob = getBlob(row, cName);
         //lifted shamelessly from Jackcess's OleBlobTest
         if (blob == null)
             return;
@@ -367,9 +368,21 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
         }
     }
 
+    /*
+       Temporary workaround until POI 4.0.0 is released and jackcess upgrades.
+       This is copy/pasted from jackcess.
+    */
+    private OleBlob getBlob(Row row, String cName) {
+        byte[] bytes = row.getBytes(cName);
+        if (bytes == null) {
+            return null;
+        }
+        return JackcessOleUtil.parseBlob(bytes);
+    }
+
     private void handleCompoundContent(OleBlob.CompoundContent cc, 
XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
         InputStream is = null;
-        NPOIFSFileSystem nfs = null;
+        POIFSFileSystem fileSystem = null;
         try {
             try {
                 is = cc.getStream();
@@ -379,18 +392,18 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
             }
 
             try {
-                nfs = new NPOIFSFileSystem(is);
+                fileSystem = new POIFSFileSystem(is);
             } catch (Exception e) {
                 EmbeddedDocumentUtil.recordEmbeddedStreamException(e, 
parentMetadata);
                 return;
             }
 
-            handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml);
+            handleEmbeddedOfficeDoc(fileSystem.getRoot(), xhtml);
 
         } finally {
-            if (nfs != null) {
+            if (fileSystem != null) {
                 try {
-                    nfs.close();
+                    fileSystem.close();
                 } catch (IOException e) {
                     //swallow
                 }
@@ -414,5 +427,6 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
         }
         return shortDateTimeFormatter.format(d);
     }
+
 }
 
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessOleUtil.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessOleUtil.java
new file mode 100644
index 0000000..a1432d6
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessOleUtil.java
@@ -0,0 +1,813 @@
+/*
+Copyright (c) 2013 James Ahlborn
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package org.apache.tika.parser.microsoft;
+
+import java.io.ByteArrayInputStream;
+import java.io.Closeable;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.sql.Blob;
+import java.sql.SQLException;
+import java.sql.SQLFeatureNotSupportedException;
+import java.text.Normalizer;
+import java.util.EnumSet;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import com.healthmarketscience.jackcess.DataType;
+import com.healthmarketscience.jackcess.util.OleBlob;
+import static com.healthmarketscience.jackcess.util.OleBlob.*;
+import org.apache.commons.lang.builder.ToStringBuilder;
+
+import com.healthmarketscience.jackcess.impl.ByteUtil;
+import com.healthmarketscience.jackcess.impl.CustomToStringStyle;
+import com.healthmarketscience.jackcess.impl.PageChannel;
+
+/**
+ * Utility code for working with OLE data.
+ * Temporary workaround until POI 4.0.0 is released and Jackcess is updated
+ *
+ *
+ * @author James Ahlborn
+ * @usage _advanced_class_
+ * @deprecated this class will be removed in Tika >= 1.20
+ */
+@Deprecated
+class JackcessOleUtil {
+
+
+        /**
+         * Interface used to allow optional inclusion of the poi library for 
working
+         * with compound ole data.
+         */
+        interface CompoundPackageFactory
+        {
+            public ContentImpl createCompoundPackageContent(
+                    OleBlobImpl blob, String prettyName, String className, 
String typeName,
+                    ByteBuffer blobBb, int dataBlockLen);
+        }
+
+        private static final int PACKAGE_SIGNATURE = 0x1C15;
+        private static final Charset OLE_CHARSET = Charset.forName("US-ASCII");
+        private static final Charset OLE_UTF_CHARSET = 
Charset.forName("UTF-16LE");
+        private static final byte[] COMPOUND_STORAGE_SIGNATURE =
+                {(byte)0xd0,(byte)0xcf,(byte)0x11,(byte)0xe0,
+                        (byte)0xa1,(byte)0xb1,(byte)0x1a,(byte)0xe1};
+        private static final String SIMPLE_PACKAGE_TYPE = "Package";
+        private static final int PACKAGE_OBJECT_TYPE = 0x02;
+        private static final int OLE_VERSION = 0x0501;
+        private static final int OLE_FORMAT = 0x02;
+        private static final int PACKAGE_STREAM_SIGNATURE = 0x02;
+        private static final int PS_EMBEDDED_FILE = 0x030000;
+        private static final int PS_LINKED_FILE = 0x010000;
+        private static final Set<ContentType> WRITEABLE_TYPES = EnumSet.of(
+                ContentType.LINK, ContentType.SIMPLE_PACKAGE, 
ContentType.OTHER);
+        private static final byte[] NO_DATA = new byte[0];
+        private static final int LINK_HEADER = 0x01;
+        private static final byte[] PACKAGE_FOOTER = {
+                0x01, 0x05, 0x00, 0x00, 0x00, 0x00,
+                0x00, 0x00, 0x01, (byte)0xAD, 0x05, (byte)0xFE
+        };
+
+        // regex pattern which matches all the crazy extra stuff in unicode
+        private static final Pattern UNICODE_ACCENT_PATTERN =
+                
Pattern.compile("[\\p{InCombiningDiacriticalMarks}\\p{IsLm}\\p{IsSk}]+");
+
+        private static final CompoundPackageFactory COMPOUND_FACTORY;
+
+        static {
+            CompoundPackageFactory compoundFactory = null;
+            try {
+                compoundFactory = (CompoundPackageFactory)
+                        
Class.forName("org.apache.tika.parser.microsoft.JackcessCompoundOleUtil")
+                                .newInstance();
+            } catch(Throwable t) {
+                // must not have poi, will load compound ole data as "other"
+            }
+            COMPOUND_FACTORY = compoundFactory;
+        }
+
+        /**
+         * Parses an access database blob structure and returns an appropriate
+         * OleBlob instance.
+         */
+        public static OleBlob parseBlob(byte[] bytes) {
+            return new OleBlobImpl(bytes);
+        }
+
+        /**
+         * Creates a new OleBlob instance using the given information.
+         */
+        public static OleBlob createBlob(Builder oleBuilder)
+                throws IOException
+        {
+            try {
+
+                if(!WRITEABLE_TYPES.contains(oleBuilder.getType())) {
+                    throw new IllegalArgumentException(
+                            "Cannot currently create ole values of type " +
+                                    oleBuilder.getType());
+                }
+
+                long contentLen = oleBuilder.getContentLength();
+                byte[] contentBytes = oleBuilder.getBytes();
+                InputStream contentStream = oleBuilder.getStream();
+                byte[] packageStreamHeader = NO_DATA;
+                byte[] packageStreamFooter = NO_DATA;
+
+                switch(oleBuilder.getType()) {
+                    case LINK:
+                        packageStreamHeader = 
writePackageStreamHeader(oleBuilder);
+
+                        // link "content" is file path
+                        contentBytes = 
getZeroTermStrBytes(oleBuilder.getFilePath());
+                        contentLen = contentBytes.length;
+                        break;
+
+                    case SIMPLE_PACKAGE:
+                        packageStreamHeader = 
writePackageStreamHeader(oleBuilder);
+                        packageStreamFooter = 
writePackageStreamFooter(oleBuilder);
+                        break;
+
+                    case OTHER:
+                        // nothing more to do
+                        break;
+                    default:
+                        throw new RuntimeException("unexpected type " + 
oleBuilder.getType());
+                }
+
+                long payloadLen = packageStreamHeader.length + 
packageStreamFooter.length +
+                        contentLen;
+                byte[] packageHeader = writePackageHeader(oleBuilder, 
payloadLen);
+
+                long totalOleLen = packageHeader.length + 
PACKAGE_FOOTER.length +
+                        payloadLen;
+                if(totalOleLen > DataType.OLE.getMaxSize()) {
+                    throw new IllegalArgumentException("Content size of " + 
totalOleLen +
+                            " is too large for ole column");
+                }
+
+                byte[] oleBytes = new byte[(int)totalOleLen];
+                ByteBuffer bb = PageChannel.wrap(oleBytes);
+                bb.put(packageHeader);
+                bb.put(packageStreamHeader);
+
+                if(contentLen > 0L) {
+                    if(contentBytes != null) {
+                        bb.put(contentBytes);
+                    } else {
+                        byte[] buf = new byte[8192];
+                        int numBytes = 0;
+                        while((numBytes = contentStream.read(buf)) >= 0) {
+                            bb.put(buf, 0, numBytes);
+                        }
+                    }
+                }
+
+                bb.put(packageStreamFooter);
+                bb.put(PACKAGE_FOOTER);
+
+                return parseBlob(oleBytes);
+
+            } finally {
+                ByteUtil.closeQuietly(oleBuilder.getStream());
+            }
+        }
+
+        private static byte[] writePackageHeader(Builder oleBuilder,
+                                                 long contentLen) {
+
+            byte[] prettyNameBytes = 
getZeroTermStrBytes(oleBuilder.getPrettyName());
+            String className = oleBuilder.getClassName();
+            String typeName = oleBuilder.getTypeName();
+            if(className == null) {
+                className = typeName;
+            } else if(typeName == null) {
+                typeName = className;
+            }
+            byte[] classNameBytes = getZeroTermStrBytes(className);
+            byte[] typeNameBytes = getZeroTermStrBytes(typeName);
+
+            int packageHeaderLen = 20 + prettyNameBytes.length + 
classNameBytes.length;
+
+            int oleHeaderLen = 24 + typeNameBytes.length;
+
+            byte[] headerBytes = new byte[packageHeaderLen + oleHeaderLen];
+
+            ByteBuffer bb = PageChannel.wrap(headerBytes);
+
+            // write outer package header
+            bb.putShort((short)PACKAGE_SIGNATURE);
+            bb.putShort((short)packageHeaderLen);
+            bb.putInt(PACKAGE_OBJECT_TYPE);
+            bb.putShort((short)prettyNameBytes.length);
+            bb.putShort((short)classNameBytes.length);
+            int prettyNameOff = bb.position() + 8;
+            bb.putShort((short)prettyNameOff);
+            bb.putShort((short)(prettyNameOff + prettyNameBytes.length));
+            bb.putInt(-1);
+            bb.put(prettyNameBytes);
+            bb.put(classNameBytes);
+
+            // put ole header
+            bb.putInt(OLE_VERSION);
+            bb.putInt(OLE_FORMAT);
+            bb.putInt(typeNameBytes.length);
+            bb.put(typeNameBytes);
+            bb.putLong(0L);
+            bb.putInt((int)contentLen);
+
+            return headerBytes;
+        }
+
+        private static byte[] writePackageStreamHeader(Builder oleBuilder) {
+
+            byte[] fileNameBytes = 
getZeroTermStrBytes(oleBuilder.getFileName());
+            byte[] filePathBytes = 
getZeroTermStrBytes(oleBuilder.getFilePath());
+
+            int headerLen = 6 + fileNameBytes.length + filePathBytes.length;
+
+            if(oleBuilder.getType() == ContentType.SIMPLE_PACKAGE) {
+
+                headerLen += 8 + filePathBytes.length;
+
+            } else {
+
+                headerLen += 2;
+            }
+
+            byte[] headerBytes = new byte[headerLen];
+            ByteBuffer bb = PageChannel.wrap(headerBytes);
+            bb.putShort((short)PACKAGE_STREAM_SIGNATURE);
+            bb.put(fileNameBytes);
+            bb.put(filePathBytes);
+
+            if(oleBuilder.getType() == ContentType.SIMPLE_PACKAGE) {
+                bb.putInt(PS_EMBEDDED_FILE);
+                bb.putInt(filePathBytes.length);
+                bb.put(filePathBytes, 0, filePathBytes.length);
+                bb.putInt((int) oleBuilder.getContentLength());
+            } else {
+                bb.putInt(PS_LINKED_FILE);
+                bb.putShort((short)LINK_HEADER);
+            }
+
+            return headerBytes;
+        }
+
+        private static byte[] writePackageStreamFooter(Builder oleBuilder) {
+
+            // note, these are _not_ zero terminated
+            byte[] fileNameBytes = 
oleBuilder.getFileName().getBytes(OLE_UTF_CHARSET);
+            byte[] filePathBytes = 
oleBuilder.getFilePath().getBytes(OLE_UTF_CHARSET);
+
+            int footerLen = 12 + (filePathBytes.length * 2) + 
fileNameBytes.length;
+
+            byte[] footerBytes = new byte[footerLen];
+            ByteBuffer bb = PageChannel.wrap(footerBytes);
+
+            bb.putInt(filePathBytes.length/2);
+            bb.put(filePathBytes);
+            bb.putInt(fileNameBytes.length/2);
+            bb.put(fileNameBytes);
+            bb.putInt(filePathBytes.length/2);
+            bb.put(filePathBytes);
+
+            return footerBytes;
+        }
+
+        /**
+         * creates the appropriate ContentImpl for the given blob.
+         */
+        private static ContentImpl parseContent(OleBlobImpl blob)
+                throws IOException
+        {
+            ByteBuffer bb = PageChannel.wrap(blob.getBytes());
+
+            if((bb.remaining() < 2) || (bb.getShort() != PACKAGE_SIGNATURE)) {
+                return new UnknownContentImpl(blob);
+            }
+
+            // read outer package header
+            int headerSize = bb.getShort();
+            /* int objType = */ bb.getInt();
+            int prettyNameLen = bb.getShort();
+            int classNameLen = bb.getShort();
+            int prettyNameOff = bb.getShort();
+            int classNameOff = bb.getShort();
+            /* int objSize = */ bb.getInt();
+            String prettyName = readStr(bb, prettyNameOff, prettyNameLen);
+            String className = readStr(bb, classNameOff, classNameLen);
+            bb.position(headerSize);
+
+            // read ole header
+            int oleVer = bb.getInt();
+            /* int format = */ bb.getInt();
+
+            if(oleVer != OLE_VERSION) {
+                return new UnknownContentImpl(blob);
+            }
+
+            int typeNameLen = bb.getInt();
+            String typeName = readStr(bb, bb.position(), typeNameLen);
+            bb.getLong(); // unused
+            int dataBlockLen = bb.getInt();
+            int dataBlockPos = bb.position();
+
+
+            if(SIMPLE_PACKAGE_TYPE.equalsIgnoreCase(typeName)) {
+                return createSimplePackageContent(
+                        blob, prettyName, className, typeName, bb, 
dataBlockLen);
+            }
+
+            // if COMPOUND_FACTORY is null, the poi library isn't available, 
so just
+            // load compound data as "other"
+            if((COMPOUND_FACTORY != null) &&
+                    (bb.remaining() >= COMPOUND_STORAGE_SIGNATURE.length) &&
+                    ByteUtil.matchesRange(bb, bb.position(), 
COMPOUND_STORAGE_SIGNATURE)) {
+                return COMPOUND_FACTORY.createCompoundPackageContent(
+                        blob, prettyName, className, typeName, bb, 
dataBlockLen);
+            }
+
+            // this is either some other "special" (as yet unhandled) format, 
or it is
+            // simply an embedded file (or it is compound data and poi isn't 
available)
+            return new OtherContentImpl(blob, prettyName, className,
+                    typeName, dataBlockPos, dataBlockLen);
+        }
+
+        private static ContentImpl createSimplePackageContent(
+                OleBlobImpl blob, String prettyName, String className, String 
typeName,
+                ByteBuffer blobBb, int dataBlockLen) {
+
+            int dataBlockPos = blobBb.position();
+            ByteBuffer bb = PageChannel.narrowBuffer(blobBb, dataBlockPos,
+                    dataBlockPos + dataBlockLen);
+
+            int packageSig = bb.getShort();
+            if(packageSig != PACKAGE_STREAM_SIGNATURE) {
+                return new OtherContentImpl(blob, prettyName, className,
+                        typeName, dataBlockPos, dataBlockLen);
+            }
+
+            String fileName = readZeroTermStr(bb);
+            String filePath = readZeroTermStr(bb);
+            int packageType = bb.getInt();
+
+            if(packageType == PS_EMBEDDED_FILE) {
+
+                int localFilePathLen = bb.getInt();
+                String localFilePath = readStr(bb, bb.position(), 
localFilePathLen);
+                int dataLen = bb.getInt();
+                int dataPos = bb.position();
+                bb.position(dataLen + dataPos);
+
+                // remaining strings are in "reverse" order (local file path, 
file name,
+                // file path).  these string usee a real utf charset, and 
therefore can
+                // "fix" problems with ascii based names (so we prefer these 
strings to
+                // the original strings we found)
+                int strNum = 0;
+                while(true) {
+
+                    int rem = bb.remaining();
+                    if(rem < 4) {
+                        break;
+                    }
+
+                    int strLen = bb.getInt();
+                    String remStr = readStr(bb, bb.position(), strLen * 2, 
OLE_UTF_CHARSET);
+
+                    switch(strNum) {
+                        case 0:
+                            localFilePath = remStr;
+                            break;
+                        case 1:
+                            fileName = remStr;
+                            break;
+                        case 2:
+                            filePath = remStr;
+                            break;
+                        default:
+                            // ignore
+                    }
+
+                    ++strNum;
+                }
+
+                return new SimplePackageContentImpl(
+                        blob, prettyName, className, typeName, dataPos, 
dataLen,
+                        fileName, filePath, localFilePath);
+            }
+
+            if(packageType == PS_LINKED_FILE) {
+
+                bb.getShort(); //unknown
+                String linkStr = readZeroTermStr(bb);
+
+                return new LinkContentImpl(blob, prettyName, className, 
typeName,
+                        fileName, linkStr, filePath);
+            }
+
+            return new OtherContentImpl(blob, prettyName, className,
+                    typeName, dataBlockPos, dataBlockLen);
+        }
+
+        private static String readStr(ByteBuffer bb, int off, int len) {
+            return readStr(bb, off, len, OLE_CHARSET);
+        }
+
+        private static String readZeroTermStr(ByteBuffer bb) {
+            int off = bb.position();
+            while(bb.hasRemaining()) {
+                byte b = bb.get();
+                if(b == 0) {
+                    break;
+                }
+            }
+            int len = bb.position() - off;
+            return readStr(bb, off, len);
+        }
+
+        /**
+         * Decodes {@code len} bytes of the given (array backed) buffer,
+         * starting at buffer index {@code off}, and leaves the buffer
+         * positioned just past the decoded bytes.  A single trailing NUL
+         * character, if present, is removed from the result.
+         *
+         * @param bb      buffer to read from, must be backed by an array
+         * @param off     buffer index of the first byte to decode
+         * @param len     number of bytes to decode, may be 0
+         * @param charset charset used to decode the bytes
+         * @return the decoded string, possibly empty
+         */
+        private static String readStr(ByteBuffer bb, int off, int len,
+                                      Charset charset) {
+            // buffer indexes are relative to arrayOffset() in the backing
+            // array (which is 0 for plainly wrapped arrays)
+            String str = new String(bb.array(), bb.arrayOffset() + off, len,
+                    charset);
+            bb.position(off + len);
+            // a zero length string has no last char to inspect, so guard
+            // against it instead of failing in charAt()
+            if(!str.isEmpty() && (str.charAt(str.length() - 1) == '\0')) {
+                str = str.substring(0, str.length() - 1);
+            }
+            return str;
+        }
+
+        private static byte[] getZeroTermStrBytes(String str) {
+            // since we are converting to ascii, try to make "nicer" versions 
of crazy
+            // chars (e.g. convert "u with an umlaut" to just "u").  this may 
not
+            // ultimately help anything but it is what ms access does.
+
+            // decompose complex chars into combos of char and accent
+            str = Normalizer.normalize(str, Normalizer.Form.NFD);
+            // strip the accents
+            str = UNICODE_ACCENT_PATTERN.matcher(str).replaceAll("");
+            // (re)normalize what is left
+            str = Normalizer.normalize(str, Normalizer.Form.NFC);
+
+            return (str + '\0').getBytes(OLE_CHARSET);
+        }
+
+
+        /**
+         * Blob implementation backed by an in-memory byte array.  The content
+         * is parsed lazily on the first call to {@link #getContent}.
+         * <p>
+         * Once the blob has been closed (or freed) its bytes are released;
+         * from then on the data accessors fail with a "blob is closed"
+         * exception (SQLException for the Blob style accessors, IOException
+         * for the others).  The mutating Blob methods are not supported.
+         */
+        static final class OleBlobImpl implements OleBlob
+        {
+            private byte[] _bytes;
+            private ContentImpl _content;
+
+            private OleBlobImpl(byte[] bytes) {
+                _bytes = bytes;
+            }
+
+            /**
+             * Writes all bytes of this blob to the given stream.
+             *
+             * @throws IOException if the blob is closed or the write fails
+             */
+            public void writeTo(OutputStream out) throws IOException {
+                out.write(getBytes());
+            }
+
+            /**
+             * Returns the parsed content of this blob, parsing it on first
+             * use.
+             *
+             * @throws IOException if the blob is closed
+             */
+            public Content getContent() throws IOException {
+                if(_content == null) {
+                    _content = parseContent(this);
+                }
+                return _content;
+            }
+
+            public InputStream getBinaryStream() throws SQLException {
+                return new ByteArrayInputStream(getOpenBytes());
+            }
+
+            /**
+             * @param pos 1-based (jdbc style) offset of the first byte
+             * @param len number of bytes to expose
+             */
+            public InputStream getBinaryStream(long pos, long len)
+                    throws SQLException
+            {
+                return new ByteArrayInputStream(getOpenBytes(),
+                        fromJdbcOffset(pos), (int)len);
+            }
+
+            public long length() throws SQLException {
+                return getOpenBytes().length;
+            }
+
+            /**
+             * Returns the backing byte array itself (not a copy).
+             *
+             * @throws IOException if the blob is closed
+             */
+            public byte[] getBytes() throws IOException {
+                if(_bytes == null) {
+                    throw new IOException("blob is closed");
+                }
+                return _bytes;
+            }
+
+            /**
+             * @param pos 1-based (jdbc style) offset of the first byte
+             * @param len number of bytes to copy
+             */
+            public byte[] getBytes(long pos, int len) throws SQLException {
+                return ByteUtil.copyOf(getOpenBytes(), fromJdbcOffset(pos), len);
+            }
+
+            /**
+             * @param start 1-based (jdbc style) offset at which to start
+             *              searching
+             * @return the 1-based offset of the match, or the negative value
+             *         returned by the search if there is no match
+             */
+            public long position(byte[] pattern, long start)
+                    throws SQLException {
+                int pos = ByteUtil.findRange(PageChannel.wrap(getOpenBytes()),
+                        fromJdbcOffset(start), pattern);
+                return((pos >= 0) ? toJdbcOffset(pos) : pos);
+            }
+
+            public long position(Blob pattern, long start)
+                    throws SQLException {
+                return position(pattern.getBytes(1L, (int)pattern.length()),
+                        start);
+            }
+
+            public OutputStream setBinaryStream(long position)
+                    throws SQLException {
+                throw new SQLFeatureNotSupportedException();
+            }
+
+            public void truncate(long len) throws SQLException {
+                throw new SQLFeatureNotSupportedException();
+            }
+
+            public int setBytes(long pos, byte[] bytes) throws SQLException {
+                throw new SQLFeatureNotSupportedException();
+            }
+
+            public int setBytes(long pos, byte[] bytes, int offset, int len)
+                    throws SQLException {
+                throw new SQLFeatureNotSupportedException();
+            }
+
+            public void free() {
+                close();
+            }
+
+            /**
+             * Releases the bytes and closes any parsed content.  Calling
+             * this more than once is harmless as far as this class is
+             * concerned.
+             */
+            public void close() {
+                _bytes = null;
+                ByteUtil.closeQuietly(_content);
+                _content = null;
+            }
+
+            /**
+             * Returns the backing bytes for the Blob style accessors,
+             * reporting a closed blob as an SQLException instead of failing
+             * with a NullPointerException.
+             */
+            private byte[] getOpenBytes() throws SQLException {
+                if(_bytes == null) {
+                    throw new SQLException("blob is closed");
+                }
+                return _bytes;
+            }
+
+            /** Converts a 0-based array offset to a 1-based jdbc offset. */
+            private static int toJdbcOffset(int off) {
+                return off + 1;
+            }
+
+            /** Converts a 1-based jdbc offset to a 0-based array offset. */
+            private static int fromJdbcOffset(long off) {
+                return (int)off - 1;
+            }
+
+            @Override
+            public String toString() {
+                ToStringBuilder sb = CustomToStringStyle.builder(this);
+                if(_content != null) {
+                    sb.append("content", _content);
+                } else {
+                    sb.append("bytes", _bytes);
+                    sb.append("content", "(uninitialized)");
+                }
+                return sb.toString();
+            }
+        }
+
+        /**
+         * Base class of all content wrappers.  It only remembers the blob the
+         * content belongs to; closing it does nothing unless a subclass
+         * overrides {@link #close}.
+         */
+        static abstract class ContentImpl implements Content, Closeable
+        {
+            protected final OleBlobImpl _blob;
+
+            protected ContentImpl(OleBlobImpl blob) {
+                this._blob = blob;
+            }
+
+            /** Returns the blob this content was created for. */
+            public OleBlobImpl getBlob() {
+                return this._blob;
+            }
+
+            /** Returns the bytes of the owning blob. */
+            protected byte[] getBytes() throws IOException {
+                final OleBlobImpl owner = getBlob();
+                return owner.getBytes();
+            }
+
+            /** No resources are held at this level, so nothing is released. */
+            public void close() {
+            }
+
+            /**
+             * Adds the fields known at this level (the content type) to the
+             * given builder and returns that same builder.
+             */
+            protected ToStringBuilder toString(ToStringBuilder sb) {
+                final Object type = getType();
+                sb.append("type", type);
+                return sb;
+            }
+        }
+
+        /**
+         * Content which occupies a region (position and length) of the bytes
+         * of the owning blob.
+         */
+        static abstract class EmbeddedContentImpl extends ContentImpl
+                implements EmbeddedContent
+        {
+            private final int _position;
+            private final int _length;
+
+            protected EmbeddedContentImpl(OleBlobImpl blob, int position,
+                                          int length)
+            {
+                super(blob);
+                this._position = position;
+                this._length = length;
+            }
+
+            /** Returns the length of the embedded region. */
+            public long length() {
+                return this._length;
+            }
+
+            /** Returns a stream over the embedded region of the blob bytes. */
+            public InputStream getStream() throws IOException {
+                final byte[] blobBytes = getBytes();
+                return new ByteArrayInputStream(blobBytes, _position, _length);
+            }
+
+            /** Writes the embedded region of the blob bytes to the stream. */
+            public void writeTo(OutputStream out) throws IOException {
+                final byte[] blobBytes = getBytes();
+                out.write(blobBytes, _position, _length);
+            }
+
+            @Override
+            protected ToStringBuilder toString(ToStringBuilder sb) {
+                super.toString(sb);
+                // a negative position means there is no region to show
+                if(_position < 0) {
+                    return sb;
+                }
+                sb.append("content",
+                        ByteBuffer.wrap(_blob._bytes, _position, _length));
+                return sb;
+            }
+        }
+
+        /**
+         * Embedded content which additionally carries the pretty name, class
+         * name and type name of its package.
+         */
+        static abstract class EmbeddedPackageContentImpl
+                extends EmbeddedContentImpl
+                implements PackageContent
+        {
+            private final String _prettyName;
+            private final String _className;
+            private final String _typeName;
+
+            protected EmbeddedPackageContentImpl(
+                    OleBlobImpl blob, String prettyName, String className,
+                    String typeName, int position, int length)
+            {
+                super(blob, position, length);
+                this._prettyName = prettyName;
+                this._className = className;
+                this._typeName = typeName;
+            }
+
+            public String getPrettyName() {
+                return this._prettyName;
+            }
+
+            public String getClassName() {
+                return this._className;
+            }
+
+            public String getTypeName() {
+                return this._typeName;
+            }
+
+            /**
+             * Adds the package names first, then the fields of the
+             * superclasses.
+             */
+            @Override
+            protected ToStringBuilder toString(ToStringBuilder sb) {
+                sb.append("prettyName", _prettyName);
+                sb.append("className", _className);
+                sb.append("typeName", _typeName);
+                super.toString(sb);
+                return sb;
+            }
+        }
+
+        /**
+         * Package content which refers to a linked file instead of holding
+         * data itself (it is created with a position and length of -1).
+         */
+        private static final class LinkContentImpl
+                extends EmbeddedPackageContentImpl
+                implements LinkContent
+        {
+            private final String _fileName;
+            private final String _linkPath;
+            private final String _filePath;
+
+            private LinkContentImpl(OleBlobImpl blob, String prettyName,
+                                    String className, String typeName,
+                                    String fileName, String linkPath,
+                                    String filePath)
+            {
+                super(blob, prettyName, className, typeName, -1, -1);
+                this._fileName = fileName;
+                this._linkPath = linkPath;
+                this._filePath = filePath;
+            }
+
+            public ContentType getType() {
+                return ContentType.LINK;
+            }
+
+            public String getFileName() {
+                return this._fileName;
+            }
+
+            public String getLinkPath() {
+                return this._linkPath;
+            }
+
+            public String getFilePath() {
+                return this._filePath;
+            }
+
+            /**
+             * Opens the file named by the link path.
+             * <p>
+             * NOTE(review): the path comes straight from the parsed blob, so
+             * callers handling untrusted data presumably need to validate it
+             * before calling this -- confirm against the callers.
+             */
+            public InputStream getLinkStream() throws IOException {
+                final String target = getLinkPath();
+                return new FileInputStream(target);
+            }
+
+            @Override
+            public String toString() {
+                final ToStringBuilder sb =
+                        toString(CustomToStringStyle.builder(this));
+                return sb.append("fileName", _fileName)
+                        .append("linkPath", _linkPath)
+                        .append("filePath", _filePath)
+                        .toString();
+            }
+        }
+
+        /**
+         * Package content for a file embedded in the blob, together with the
+         * file name, file path and local file path recorded for it.
+         */
+        private static final class SimplePackageContentImpl
+                extends EmbeddedPackageContentImpl
+                implements SimplePackageContent
+        {
+            private final String _fileName;
+            private final String _filePath;
+            private final String _localFilePath;
+
+            private SimplePackageContentImpl(OleBlobImpl blob,
+                                             String prettyName,
+                                             String className, String typeName,
+                                             int position, int length,
+                                             String fileName, String filePath,
+                                             String localFilePath)
+            {
+                super(blob, prettyName, className, typeName, position, length);
+                this._fileName = fileName;
+                this._filePath = filePath;
+                this._localFilePath = localFilePath;
+            }
+
+            public ContentType getType() {
+                return ContentType.SIMPLE_PACKAGE;
+            }
+
+            public String getFileName() {
+                return this._fileName;
+            }
+
+            public String getFilePath() {
+                return this._filePath;
+            }
+
+            public String getLocalFilePath() {
+                return this._localFilePath;
+            }
+
+            @Override
+            public String toString() {
+                final ToStringBuilder sb =
+                        toString(CustomToStringStyle.builder(this));
+                return sb.append("fileName", _fileName)
+                        .append("filePath", _filePath)
+                        .append("localFilePath", _localFilePath)
+                        .toString();
+            }
+        }
+
+        /**
+         * Package content of a kind which gets no dedicated handling; it is
+         * exposed simply as a region of the blob bytes.
+         */
+        private static final class OtherContentImpl
+                extends EmbeddedPackageContentImpl
+                implements OtherContent
+        {
+            private OtherContentImpl(
+                    OleBlobImpl blob, String prettyName, String className,
+                    String typeName, int position, int length)
+            {
+                super(blob, prettyName, className, typeName, position, length);
+            }
+
+            public ContentType getType() {
+                return ContentType.OTHER;
+            }
+
+            @Override
+            public String toString() {
+                final ToStringBuilder sb =
+                        toString(CustomToStringStyle.builder(this));
+                return sb.toString();
+            }
+        }
+
+        /**
+         * Content of a blob which could not be recognized as a package; only
+         * the raw blob is available.
+         */
+        private static final class UnknownContentImpl extends ContentImpl
+        {
+            private UnknownContentImpl(OleBlobImpl blob) {
+                super(blob);
+            }
+
+            public ContentType getType() {
+                return ContentType.UNKNOWN;
+            }
+
+            @Override
+            public String toString() {
+                final ToStringBuilder sb =
+                        toString(CustomToStringStyle.builder(this));
+                return sb.append("content", _blob._bytes)
+                        .toString();
+            }
+        }
+
+    }
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index 133d5e4..779d5ee 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -36,7 +36,6 @@ import org.apache.poi.poifs.crypt.EncryptionInfo;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.poifs.macros.VBAMacroReader;
 import org.apache.poi.util.IOUtils;
@@ -105,23 +104,23 @@ public class OfficeParser extends AbstractOfficeParser {
 
         final DirectoryNode root;
         TikaInputStream tstream = TikaInputStream.cast(stream);
-        NPOIFSFileSystem mustCloseFs = null;
+        POIFSFileSystem mustCloseFs = null;
         try {
             if (tstream == null) {
-                mustCloseFs = new NPOIFSFileSystem(new 
CloseShieldInputStream(stream));
+                mustCloseFs = new POIFSFileSystem(new 
CloseShieldInputStream(stream));
                 root = mustCloseFs.getRoot();
             } else {
                 final Object container = tstream.getOpenContainer();
-                if (container instanceof NPOIFSFileSystem) {
-                    root = ((NPOIFSFileSystem) container).getRoot();
+                if (container instanceof POIFSFileSystem) {
+                    root = ((POIFSFileSystem) container).getRoot();
                 } else if (container instanceof DirectoryNode) {
                     root = (DirectoryNode) container;
                 } else {
-                    NPOIFSFileSystem fs = null;
+                    POIFSFileSystem fs = null;
                     if (tstream.hasFile()) {
-                        fs = new NPOIFSFileSystem(tstream.getFile(), true);
+                        fs = new POIFSFileSystem(tstream.getFile(), true);
                     } else {
-                        fs = new NPOIFSFileSystem(new 
CloseShieldInputStream(tstream));
+                        fs = new POIFSFileSystem(new 
CloseShieldInputStream(tstream));
                     }
                     //tstream will close the fs, no need to close this below
                     tstream.setOpenContainer(fs);
@@ -274,10 +273,6 @@ public class OfficeParser extends AbstractOfficeParser {
             return detectType(fs.getRoot());
         }
 
-        public static POIFSDocumentType detectType(NPOIFSFileSystem fs) {
-            return detectType(fs.getRoot());
-        }
-
         public static POIFSDocumentType detectType(DirectoryEntry node) {
             Set<String> names = new HashSet<String>();
             for (Entry entry : node) {
@@ -313,7 +308,7 @@ public class OfficeParser extends AbstractOfficeParser {
      * @throws IOException on IOException if it occurs during the extraction 
of the embedded doc
      * @throws SAXException on SAXException for writing to xhtml
      */
-    public static void extractMacros(NPOIFSFileSystem fs, ContentHandler xhtml,
+    public static void extractMacros(POIFSFileSystem fs, ContentHandler xhtml,
                                      EmbeddedDocumentExtractor 
embeddedDocumentExtractor)  throws IOException, SAXException {
 
         VBAMacroReader reader = null;
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 0aed803..5d13351 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -51,7 +51,7 @@ import org.apache.poi.hsmf.datatypes.StringChunk;
 import org.apache.poi.hsmf.datatypes.Types;
 import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.CodePageUtil;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -126,7 +126,7 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
 
     private final boolean extractAllAlternatives;
 
-    public OutlookExtractor(NPOIFSFileSystem filesystem, ParseContext context) 
throws TikaException {
+    public OutlookExtractor(POIFSFileSystem filesystem, ParseContext context) 
throws TikaException {
         this(filesystem.getRoot(), context);
     }
 
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
index 1c98690..1b5a0a9 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
@@ -33,7 +33,7 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.DocumentNode;
 import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -385,7 +385,7 @@ public class POIFSContainerDetector implements Detector {
         File file = stream.getFile();
 
         try {
-            NPOIFSFileSystem fs = new NPOIFSFileSystem(file, true);
+            POIFSFileSystem fs = new POIFSFileSystem(file, true);
 
             // Optimize a possible later parsing process by keeping
             // a reference to the already opened POI file system
@@ -423,8 +423,8 @@ public class POIFSContainerDetector implements Detector {
         Set<String> names = null;
         if (tis != null) {
             Object container = tis.getOpenContainer();
-            if (container instanceof NPOIFSFileSystem) {
-                names = getTopLevelNames(((NPOIFSFileSystem) 
container).getRoot());
+            if (container instanceof POIFSFileSystem) {
+                names = getTopLevelNames(((POIFSFileSystem) 
container).getRoot());
             } else if (container instanceof DirectoryNode) {
                 names = getTopLevelNames((DirectoryNode) container);
             }
@@ -454,8 +454,8 @@ public class POIFSContainerDetector implements Detector {
         // Detect based on the names (as available)
         if (tis != null &&
                 tis.getOpenContainer() != null &&
-                tis.getOpenContainer() instanceof NPOIFSFileSystem) {
-            return detect(names, ((NPOIFSFileSystem) 
tis.getOpenContainer()).getRoot());
+                tis.getOpenContainer() instanceof POIFSFileSystem) {
+            return detect(names, ((POIFSFileSystem) 
tis.getOpenContainer()).getRoot());
         } else {
             return detect(names, null);
         }
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
index 3e2ea26..8017184 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
@@ -32,7 +32,7 @@ import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.DocumentEntry;
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.MSOffice;
 import org.apache.tika.metadata.Metadata;
@@ -63,7 +63,7 @@ public class SummaryExtractor {
         this.metadata = metadata;
     }
 
-    public void parseSummaries(NPOIFSFileSystem filesystem)
+    public void parseSummaries(POIFSFileSystem filesystem)
             throws IOException, TikaException {
         parseSummaries(filesystem.getRoot());
     }
@@ -94,8 +94,6 @@ public class SummaryExtractor {
             // no property stream, just skip it
         } catch (UnexpectedPropertySetTypeException e) {
             throw new TikaException("Unexpected HPSF document", e);
-        } catch (MarkUnsupportedException e) {
-            throw new TikaException("Invalid DocumentInputStream", e);
         } catch (Exception e) {
             LOG.warn("Ignoring unexpected exception while parsing summary 
entry {}", entryName, e);
         }
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 4a80420..30bd4bb 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -49,7 +49,7 @@ import org.apache.poi.hwpf.usermodel.TableRow;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
@@ -145,7 +145,7 @@ public class WordExtractor extends AbstractPOIFSExtractor {
     }
 
     protected void parse(
-            NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+            POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
             throws IOException, SAXException, TikaException {
         parse(filesystem.getRoot(), xhtml);
     }
@@ -661,7 +661,7 @@ public class WordExtractor extends AbstractPOIFSExtractor {
     }
 
     protected void parseWord6(
-            NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+            POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
             throws IOException, SAXException, TikaException {
         parseWord6(filesystem.getRoot(), xhtml);
     }
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index ac5abc9..57c38a6 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -41,7 +41,6 @@ import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
 import org.apache.poi.openxml4j.opc.TargetMode;
 import org.apache.poi.openxml4j.opc.internal.FileHelper;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.Ole10Native;
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -418,9 +417,9 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
 
         if (officeParserConfig.getExtractMacros()) {
             try (InputStream is = macroPart.getInputStream()) {
-                try (NPOIFSFileSystem npoifs = new NPOIFSFileSystem(is)) {
+                try (POIFSFileSystem poifs = new POIFSFileSystem(is)) {
                     //Macro reading exceptions are already swallowed here
-                    OfficeParser.extractMacros(npoifs, handler, 
embeddedExtractor);
+                    OfficeParser.extractMacros(poifs, handler, 
embeddedExtractor);
                 }
             } catch (IOException e) {
                 throw new TikaException("Broken OOXML file", e);
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
index 4387ca4..90ea58b 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
@@ -33,7 +33,7 @@ import org.apache.poi.poifs.filesystem.DocumentEntry;
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.FileMagic;
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.poifs.filesystem.Ole10Native;
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
 import org.apache.poi.poifs.storage.HeaderBlock;
@@ -141,7 +141,7 @@ class RTFObjDataParser {
             throws IOException {
 
         byte[] ret = null;
-        try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) {
+        try (POIFSFileSystem fs = new POIFSFileSystem(is)) {
 
             DirectoryNode root = fs.getRoot();
 
diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index 57b91ca..ad12517 100644
--- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -28,7 +28,7 @@ import java.io.InputStream;
 import java.nio.file.Path;
 import java.util.Random;
 
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.MultiThreadedTikaTest;
 import org.apache.tika.Tika;
 import org.apache.tika.config.TikaConfig;
@@ -43,7 +43,7 @@ import org.junit.After;
 import org.junit.Test;
 
 /**
- * Junit test class for {@link ContainerAwareDetector}
+ * Junit test class for {@link org.apache.tika.parser.microsoft.POIFSContainerDetector}
  */
 public class TestContainerAwareDetector extends MultiThreadedTikaTest {
     private final TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
@@ -184,7 +184,7 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest {
             assertEquals(
                     MediaType.parse("application/vnd.ms-powerpoint"),
                     detector.detect(stream, new Metadata()));
-            assertTrue(stream.getOpenContainer() instanceof NPOIFSFileSystem);
+            assertTrue(stream.getOpenContainer() instanceof POIFSFileSystem);
         }
     }
 
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java
index f39b961..2ec2a56 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java
@@ -65,6 +65,7 @@ public class JackcessParserTest extends TikaTest {
                 IOUtils.closeQuietly(is);
             }
             List<Metadata> list = handler.getMetadataList();
+            debug(list);
             assertEquals(4, list.size());
             String mainContent = list.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);

[tika] 02/02: NPOIFS->POIFS and add jackcess shim

Reply via email to