[tika] branch branch_1x updated: TIKA-3316 -- improve processing of XPS files

tallison Fri, 12 Mar 2021 13:37:35 -0800

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git



The following commit(s) were added to refs/heads/branch_1x by this push:
     new afea29c  TIKA-3316 -- improve processing of XPS files
afea29c is described below

commit afea29c96b79766bc207367eaf38392185afcdfa
Author: tallison <[email protected]>
AuthorDate: Fri Mar 12 16:37:05 2021 -0500

    TIKA-3316 -- improve processing of XPS files
---
 CHANGES.txt                                        |   2 +
 .../microsoft/ooxml/AbstractOOXMLExtractor.java    |   2 +
 .../microsoft/ooxml/OOXMLExtractorFactory.java     |   5 +-
 .../microsoft/ooxml/xps/XPSExtractorDecorator.java |   5 +
 .../parser/pkg/StreamingZipContainerDetector.java  | 171 +++++++++++++--------
 .../tika/parser/pkg/ZipContainerDetector.java      |  18 ++-
 .../org/apache/tika/parser/utils/ZipSalvager.java  |  96 ++++++++----
 .../tika/detect/TestContainerAwareDetector.java    |   2 +-
 .../parser/microsoft/ooxml/xps/XPSParserTest.java  |  45 ++++++
 .../tika/parser/pkg/ZipContainerDetectorTest.java  |   2 +-
 .../test-documents/testXPSWithDataDescriptor.xps   | Bin 0 -> 44523 bytes
 .../test-documents/testXPSWithDataDescriptor2.xps  | Bin 0 -> 51175 bytes
 12 files changed, 237 insertions(+), 111 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index fe0e02d..57ca53c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.26 - 03/09/2021
 
+   * Improve detection and parsing of XPS files (TIKA-3316).
+
    * General dependency upgrades (TIKA-3244).
 
    * Great optimization in ForkParser (TIKA-3237).
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index af998b6..18a70f6 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -176,6 +176,8 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
                 thumbnailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, 
thumbName);
                 thumbnailMetadata.set(Metadata.CONTENT_TYPE, 
tPart.getContentType());
                 thumbnailMetadata.set(TikaCoreProperties.TITLE, 
tPart.getPartName().getName());
+                
thumbnailMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                        
TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString());
 
                 if (embeddedExtractor.shouldParseEmbedded(thumbnailMetadata)) {
                     
embeddedExtractor.parseEmbedded(TikaInputStream.get(tStream), new 
EmbeddedContentHandler(handler), thumbnailMetadata, false);
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 15f2c33..193be3b 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.util.Locale;
 
+import 
org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.poi.ooxml.POIXMLDocument;
 import org.apache.poi.ooxml.extractor.ExtractorFactory;
@@ -109,10 +110,10 @@ public class OOXMLExtractorFactory {
                                 true, false)) {
                     try {
                         pkg = OPCPackage.open(rereadableInputStream);
-                    } catch (EOFException e) {
+                    } catch (EOFException|UnsupportedZipFeatureException e) {
                         rereadableInputStream.rewind();
                         tmpRepairedCopy = 
File.createTempFile("tika-ooxml-repair-", "");
-                        ZipSalvager.salvageCopy(rereadableInputStream, 
tmpRepairedCopy);
+                        ZipSalvager.salvageCopy(rereadableInputStream, 
tmpRepairedCopy, false);
                         //if there isn't enough left to be opened as a package
                         //throw an exception -- we may want to fall back to 
streaming
                         //parsing
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
index 2643a3a..5cf7573 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
@@ -51,6 +51,8 @@ import java.util.Map;
 public class XPSExtractorDecorator extends AbstractOOXMLExtractor {
 
     private static String XPS_DOCUMENT = 
"http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
+    private static final String OPEN_XPS_DOCUMENT =
+            "http://schemas.openxps.org/oxps/v1.0/fixedrepresentation";
 
     private final ParseContext context;
     private final ZipPackage pkg;
@@ -76,6 +78,9 @@ public class XPSExtractorDecorator extends 
AbstractOOXMLExtractor {
     protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, 
IOException {
 
         PackageRelationshipCollection prc = 
pkg.getRelationshipsByType(XPS_DOCUMENT);
+        if (prc.size() == 0) {
+            prc = pkg.getRelationshipsByType(OPEN_XPS_DOCUMENT);
+        }
         for (int i = 0; i < prc.size(); i++) {
             PackageRelationship pr = prc.getRelationship(i);
 
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
index 67eaea8..0e927bf 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
@@ -20,12 +20,15 @@ import static java.nio.charset.StandardCharsets.UTF_8;
 
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
+import java.io.EOFException;
+import java.io.IOException;
 import java.io.InputStream;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 
+import 
org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
 import org.apache.commons.io.IOUtils;
@@ -92,90 +95,60 @@ public class StreamingZipContainerDetector extends 
ZipContainerDetectorBase impl
 
     }
 
+    private final int markLimit;
+
+    public StreamingZipContainerDetector(int markLimit) {
+        this.markLimit = markLimit;
+    }
     /**
      *
-     * @param is inputstream to read from. Callers must mark/reset the stream
-     *           before/after this call to detect.  This call does not close 
the stream!
-     *           Depending on the file type, this call to detect may read the 
entire stream.
-     *           Make sure to use a {@link 
org.apache.tika.io.BoundedInputStream} or similar
-     *           if you want to protect against reading the entire stream.
+     * @param is the inputstream is wrapped in a boundedInputStream to 
guarantee
+     *           this doesn't stream beyond {@link #markLimit}
      * @return
      */
     @Override
-    public MediaType detect(InputStream is, Metadata metadata) {
+    public MediaType detect(InputStream is, Metadata metadata) throws 
IOException {
+        BoundedInputStream boundedInputStream = new 
BoundedInputStream(markLimit, is);
+        boundedInputStream.mark(markLimit);
+        try {
+            return _detect(boundedInputStream, metadata, false);
+        } finally {
+            boundedInputStream.reset();
+        }
+    }
 
+    private MediaType _detect(InputStream is, Metadata metadata, boolean 
allowStoredEntries)
+            throws IOException {
         Set<String> fileNames = new HashSet<>();
         Set<String> directoryNames = new HashSet<>();
+        MediaType mt = MediaType.APPLICATION_ZIP;
+
         try (ZipArchiveInputStream zipArchiveInputStream =
-                     new ZipArchiveInputStream(new 
CloseShieldInputStream(is))) {
+                     new ZipArchiveInputStream(new CloseShieldInputStream(is),
+                             "UTF8", false, allowStoredEntries)) {
             ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
-            while (zae != null) {
-                String name = zae.getName();
-                if (zae.isDirectory()) {
-                    directoryNames.add(name);
-                    zae = zipArchiveInputStream.getNextZipEntry();
-                    continue;
-                }
-                fileNames.add(name);
-                //we could also parse _rel/.rels, but if
-                // there isn't a valid content_types, then POI
-                //will throw an exception...Better to backoff to PKG
-                //than correctly identify a truncated
-                if (name.equals("[Content_Types].xml")) {
-                    MediaType mt = 
parseOOXMLContentTypes(zipArchiveInputStream);
-                    if (mt != null) {
-                        return mt;
-                    }
-                    return TIKA_OOXML;
-                } else if 
(IWorkPackageParser.IWORK_CONTENT_ENTRIES.contains(name)) {
-                    IWorkPackageParser.IWORKDocumentType type = 
IWorkPackageParser.IWORKDocumentType.detectType(zipArchiveInputStream);
-                    if (type != null) {
-                        return type.getType();
-                    }
-                } else if (name.equals("mimetype")) {
-                    //can't rely on zae.getSize to determine if there is any
-                    //content here. :(
-                    ByteArrayOutputStream bos = new ByteArrayOutputStream();
-                    BoundedInputStream bis = new 
BoundedInputStream(MAX_MIME_TYPE, zipArchiveInputStream);
-                    IOUtils.copy(bis, bos);
-                    //do anything with an inputstream > MAX_MIME_TYPE?
-                    if (bos.toByteArray().length > 0)  {
-                        //odt -- TODO -- check that the results are valid
-                        return MediaType.parse(new String(bos.toByteArray(), 
UTF_8));
-                    }
-                } else if (name.equals("META-INF/manifest.xml")) {
-                    //for an unknown reason, passing in the 
zipArchiveInputStream
-                    //"as is" can cause the iteration of the entries to stop 
early
-                    //without exception or warning.  So, copy the full stream, 
then
-                    //process.  TIKA-3061
-                    ByteArrayOutputStream bos = new ByteArrayOutputStream();
-                    BoundedInputStream bis = new 
BoundedInputStream(MAX_MANIFEST, zipArchiveInputStream);
-                    IOUtils.copy(bis, bos);
-                    //TODO: do something if the full stream hasn't been read?
-                    MediaType mt = detectStarOfficeX(new 
ByteArrayInputStream(bos.toByteArray()));
-                    if (mt != null) {
-                        return mt;
-                    }
-                }
-                MediaType mt = 
IWork18PackageParser.IWork18DocumentType.detectIfPossible(zae);
-                if (mt != null) {
-                    return mt;
-                }
-                mt = 
IWork13PackageParser.IWork13DocumentType.detectIfPossible(zae);
-                if (mt != null) {
-                    return mt;
-                }
-                zae = zipArchiveInputStream.getNextZipEntry();
+            mt = processZAE(zae, zipArchiveInputStream, directoryNames, 
fileNames);
+        } catch (UnsupportedZipFeatureException zfe) {
+            if (allowStoredEntries == false &&
+                    zfe.getFeature() == 
UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
+                is.reset();
+                mt = _detect(is, metadata, true);
             }
         } catch (SecurityException e) {
             throw e;
-        } catch (Exception e) {
-            //swallow
+        } catch (EOFException e) {
+            //truncated zip -- swallow
+        } catch (IOException e) {
+            //another option for a truncated zip
+        }
+
+        if (mt != MediaType.APPLICATION_ZIP) {
+            return mt;
         }
         //entrynames is the union of directory names and file names
         Set<String> entryNames = new HashSet<>(fileNames);
         entryNames.addAll(directoryNames);
-        MediaType mt = detectKmz(fileNames);
+        mt = detectKmz(fileNames);
         if (mt != null) {
             return mt;
         }
@@ -200,6 +173,70 @@ public class StreamingZipContainerDetector extends 
ZipContainerDetectorBase impl
             }
         }
         return MediaType.APPLICATION_ZIP;
+
+    }
+
+    private MediaType processZAE(ZipArchiveEntry zae, ZipArchiveInputStream 
zipArchiveInputStream,
+                            Set<String> directoryNames, Set<String> fileNames) 
throws IOException {
+        while (zae != null) {
+            String name = zae.getName();
+            if (zae.isDirectory()) {
+                directoryNames.add(name);
+                zae = zipArchiveInputStream.getNextZipEntry();
+                continue;
+            }
+            fileNames.add(name);
+            //we could also parse _rel/.rels, but if
+            // there isn't a valid content_types, then POI
+            //will throw an exception...Better to backoff to PKG
+            //than correctly identify a truncated
+            if (name.equals("[Content_Types].xml")) {
+                MediaType mt = parseOOXMLContentTypes(zipArchiveInputStream);
+                if (mt != null) {
+                    return mt;
+                }
+                return TIKA_OOXML;
+            } else if 
(IWorkPackageParser.IWORK_CONTENT_ENTRIES.contains(name)) {
+                IWorkPackageParser.IWORKDocumentType type = 
IWorkPackageParser.IWORKDocumentType.detectType(zipArchiveInputStream);
+                if (type != null) {
+                    return type.getType();
+                }
+            } else if (name.equals("mimetype")) {
+                //can't rely on zae.getSize to determine if there is any
+                //content here. :(
+                ByteArrayOutputStream bos = new ByteArrayOutputStream();
+                BoundedInputStream bis = new BoundedInputStream(MAX_MIME_TYPE, 
zipArchiveInputStream);
+                IOUtils.copy(bis, bos);
+                //do anything with an inputstream > MAX_MIME_TYPE?
+                if (bos.toByteArray().length > 0)  {
+                    //odt -- TODO -- check that the results are valid
+                    return MediaType.parse(new String(bos.toByteArray(), 
UTF_8));
+                }
+            } else if (name.equals("META-INF/manifest.xml")) {
+                //for an unknown reason, passing in the zipArchiveInputStream
+                //"as is" can cause the iteration of the entries to stop early
+                //without exception or warning.  So, copy the full stream, then
+                //process.  TIKA-3061
+                ByteArrayOutputStream bos = new ByteArrayOutputStream();
+                BoundedInputStream bis = new BoundedInputStream(MAX_MANIFEST, 
zipArchiveInputStream);
+                IOUtils.copy(bis, bos);
+                //TODO: do something if the full stream hasn't been read?
+                MediaType mt = detectStarOfficeX(new 
ByteArrayInputStream(bos.toByteArray()));
+                if (mt != null) {
+                    return mt;
+                }
+            }
+            MediaType mt = 
IWork18PackageParser.IWork18DocumentType.detectIfPossible(zae);
+            if (mt != null) {
+                return mt;
+            }
+            mt = 
IWork13PackageParser.IWork13DocumentType.detectIfPossible(zae);
+            if (mt != null) {
+                return mt;
+            }
+            zae = zipArchiveInputStream.getNextZipEntry();
+        }
+        return MediaType.APPLICATION_ZIP;
     }
 
     private static MediaType detectIWorks(Set<String> entryNames) {
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 84ba64d..0688230 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -85,16 +85,19 @@ public class ZipContainerDetector implements Detector {
     private static final String XPS_DOCUMENT =
             "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
 
+    private static final String OPEN_XPS_DOCUMENT =
+            "http://schemas.openxps.org/oxps/v1.0/fixedrepresentation";
+
     private static final String STAR_OFFICE_6_WRITER = 
"application/vnd.sun.xml.writer";
     /** Serial version UID */
     private static final long serialVersionUID = 2891763938430295453L;
 
     //this has to be > 100,000 to handle some of the iworks files
     //in our unit tests
-    @Field
     int markLimit = 16 * 1024 * 1024;
 
-    private StreamingZipContainerDetector streamingZipContainerDetector = new 
StreamingZipContainerDetector();
+    private StreamingZipContainerDetector streamingZipContainerDetector
+            = new StreamingZipContainerDetector(markLimit);
 
     @Override
     public MediaType detect(InputStream input, Metadata metadata)
@@ -128,10 +131,7 @@ public class ZipContainerDetector implements Detector {
                     return detectZipFormatOnFile(tis);
                 }
             }
-
-            try (LookaheadInputStream lookahead = new 
LookaheadInputStream(input, markLimit)) {
-                return streamingZipContainerDetector.detect(lookahead, 
metadata);
-            }
+            return streamingZipContainerDetector.detect(input, metadata);
         } else if (!type.equals(MediaType.OCTET_STREAM)) {
             return type;
         } else {
@@ -147,8 +147,10 @@ public class ZipContainerDetector implements Detector {
      *
      * @param markLimit mark limit for streaming detection
      */
+    @Field
     public void setMarkLimit(int markLimit) {
         this.markLimit = markLimit;
+        this.streamingZipContainerDetector = new 
StreamingZipContainerDetector(markLimit);
     }
 
 
@@ -333,6 +335,10 @@ public class ZipContainerDetector implements Detector {
             if (core.size() == 1) {
                 return MediaType.application("vnd.ms-xpsdocument");
             }
+            core = pkg.getRelationshipsByType(OPEN_XPS_DOCUMENT);
+            if (core.size() == 1) {
+                return MediaType.application("vnd.ms-xpsdocument");
+            }
         }
 
         if (core.size() == 0) {
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java
index f7cf08a..7d45c04 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java
@@ -20,13 +20,16 @@ import java.io.EOFException;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.OutputStream;
 import java.nio.file.Files;
 import java.util.zip.ZipException;
 
+import 
org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
 import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
 import org.apache.tika.io.IOUtils;
+import org.apache.tika.utils.RereadableInputStream;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -44,51 +47,76 @@ public class ZipSalvager {
      * @param brokenZip
      * @param salvagedZip
      */
-    public static void salvageCopy(InputStream brokenZip, File salvagedZip) {
-        try (ZipArchiveOutputStream outputStream = new 
ZipArchiveOutputStream(salvagedZip)) {
-            ZipArchiveInputStream zipArchiveInputStream = new 
ZipArchiveInputStream(brokenZip);
-            ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
-            while (zae != null) {
+    public static void salvageCopy(InputStream brokenZip, File salvagedZip, 
boolean allowStoredEntries) throws IOException {
+        if (!(brokenZip instanceof RereadableInputStream)) {
+            brokenZip = new RereadableInputStream(brokenZip, 50000,
+                    true, false);
+        }
+        try {
+            try (ZipArchiveOutputStream outputStream = new 
ZipArchiveOutputStream(salvagedZip);
+                ZipArchiveInputStream zipArchiveInputStream = new 
ZipArchiveInputStream(brokenZip,
+                        "UTF8", false, allowStoredEntries)) {
+                ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
                 try {
-                    if (!zae.isDirectory() && 
zipArchiveInputStream.canReadEntryData(zae)) {
-                        //create a new ZAE and copy over only the name so that
-                        //if there is bad info (e.g. CRC) in brokenZip's zae, 
that
-                        //won't be propagated or cause an exception
-                        outputStream.putArchiveEntry(new 
ZipArchiveEntry(zae.getName()));
-                        //this will copy an incomplete stream...so there
-                        //could be truncation of the xml/contents, but the zip 
file
-                        //should be intact.
-                        boolean successfullyCopied = false;
-                        try {
-                            IOUtils.copy(zipArchiveInputStream, outputStream);
-                            successfullyCopied = true;
-                        } catch (IOException e) {
-                            //this can hit a "truncated ZipFile" IOException
-                        }
-                        outputStream.flush();
-                        outputStream.closeArchiveEntry();
-                        if (!successfullyCopied) {
-                            break;
-                        }
+                    processZAE(zae, zipArchiveInputStream, outputStream);
+                } catch (UnsupportedZipFeatureException uzfe) {
+                    if (uzfe.getFeature() ==
+                            
UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
+                        //percolate up to allow for retry
+                        throw uzfe;
                     }
-                    zae = zipArchiveInputStream.getNextZipEntry();
+                    //else swallow
                 } catch (ZipException | EOFException e) {
-                    break;
+                    //swallow
                 }
-
+                outputStream.flush();
+                outputStream.finish();
+            } catch (UnsupportedZipFeatureException e) {
+                //percolate up to allow for retry
+                throw e;
+            } catch (IOException e) {
+                LOG.warn("problem fixing zip", e);
             }
-            outputStream.flush();
-            outputStream.finish();
-
+        } catch (UnsupportedZipFeatureException e) {
+            //now retry
+            if (allowStoredEntries == false) {
+                ((RereadableInputStream) brokenZip).rewind();
+                salvageCopy(brokenZip, salvagedZip, true);
+            }
+        }
+    }
 
-        } catch (IOException e) {
-            LOG.warn("problem fixing zip", e);
+    private static void processZAE(ZipArchiveEntry zae, ZipArchiveInputStream 
zipArchiveInputStream,
+                                   ZipArchiveOutputStream outputStream) throws 
IOException {
+        while (zae != null) {
+            if (!zae.isDirectory() && 
zipArchiveInputStream.canReadEntryData(zae)) {
+                //create a new ZAE and copy over only the name so that
+                //if there is bad info (e.g. CRC) in brokenZip's zae, that
+                //won't be propagated or cause an exception
+                outputStream.putArchiveEntry(new 
ZipArchiveEntry(zae.getName()));
+                //this will copy an incomplete stream...so there
+                //could be truncation of the xml/contents, but the zip file
+                //should be intact.
+                boolean successfullyCopied = false;
+                try {
+                    IOUtils.copy(zipArchiveInputStream, outputStream);
+                    successfullyCopied = true;
+                } catch (IOException e) {
+                    //this can hit a "truncated ZipFile" IOException
+                }
+                outputStream.flush();
+                outputStream.closeArchiveEntry();
+                if (!successfullyCopied) {
+                    break;
+                }
+            }
+            zae = zipArchiveInputStream.getNextZipEntry();
         }
     }
 
     public static void salvageCopy(File brokenZip, File salvagedZip) throws 
IOException {
         try (InputStream is = Files.newInputStream(brokenZip.toPath())) {
-            salvageCopy(is, salvagedZip);
+            salvageCopy(is, salvagedZip, false);
         }
     }
 }
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
 
b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index 862088a..3646298 100644
--- 
a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ 
b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -53,7 +53,7 @@ public class TestContainerAwareDetector extends 
MultiThreadedTikaTest {
     private final MimeTypes mimeTypes = tikaConfig.getMimeRepository();
     private final MediaTypeRegistry mediaTypeRegistry = 
mimeTypes.getMediaTypeRegistry();
     private final Detector detector = new DefaultDetector(mimeTypes);
-    private final StreamingZipContainerDetector streamingZipDetector = new 
StreamingZipContainerDetector();
+    private final StreamingZipContainerDetector streamingZipDetector = new 
StreamingZipContainerDetector(1000000);
 
     @After
     public void tearDown() throws TikaException {
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java
index f9ed085..d57ad91 100644
--- 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java
@@ -16,12 +16,20 @@
  */
 package org.apache.tika.parser.microsoft.ooxml.xps;
 
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackageAccess;
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
 import org.junit.Test;
 
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.List;
 
 import static org.junit.Assert.assertEquals;
@@ -94,4 +102,41 @@ public class XPSParserTest extends TikaTest {
 
     }
 
+    @Test
+    public void testXPSWithDataDescriptor() throws Exception {
+        Path path = Paths.get(
+                
XPSParserTest.class.getResource("/test-documents/testXPSWithDataDescriptor.xps").toURI());
+        //test both path and stream based
+        List<Metadata> metadataList = getRecursiveMetadata(path, true);
+        assertEquals(2, metadataList.size());
+        assertContains("This is my XPS document test",
+                
metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        Files.copy(path, bos);
+        metadataList = getRecursiveMetadata(new 
ByteArrayInputStream(bos.toByteArray()), true);
+        assertEquals(2, metadataList.size());
+        assertContains("This is my XPS document test",
+                
metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+
+        
assertEquals(TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString(),
+                
metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+    }
+
+    @Test
+    public void testOpenXPSWithDataDescriptor() throws Exception {
+        Path path = Paths.get(
+                
XPSParserTest.class.getResource("/test-documents/testXPSWithDataDescriptor2.xps").toURI());
+        List<Metadata> metadataList = getRecursiveMetadata(path, true);
+        assertEquals(2, metadataList.size());
+        assertContains("How was I supposed to know",
+                
metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        Files.copy(path, bos);
+        metadataList = getRecursiveMetadata(new 
ByteArrayInputStream(bos.toByteArray()), true);
+        assertEquals(2, metadataList.size());
+        assertContains("How was I supposed to know",
+                
metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+    }
 }
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
index d05c41a..edd35f6 100644
--- 
a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
@@ -46,7 +46,7 @@ public class ZipContainerDetectorTest extends TikaTest {
     private static MediaType ODT_TEXT = 
MediaType.application("vnd.oasis.opendocument.text");
     private static MediaType TIFF = MediaType.image("tiff");
     ZipContainerDetector zipContainerDetector = new ZipContainerDetector();
-    StreamingZipContainerDetector streamingZipDetector = new 
StreamingZipContainerDetector();
+    StreamingZipContainerDetector streamingZipDetector = new 
StreamingZipContainerDetector(100000);
 
     @Test
     public void testTiffWorkaround() throws Exception {
diff --git 
a/tika-parsers/src/test/resources/test-documents/testXPSWithDataDescriptor.xps 
b/tika-parsers/src/test/resources/test-documents/testXPSWithDataDescriptor.xps
new file mode 100644
index 0000000..1569377
Binary files /dev/null and 
b/tika-parsers/src/test/resources/test-documents/testXPSWithDataDescriptor.xps 
differ
diff --git 
a/tika-parsers/src/test/resources/test-documents/testXPSWithDataDescriptor2.xps 
b/tika-parsers/src/test/resources/test-documents/testXPSWithDataDescriptor2.xps
new file mode 100644
index 0000000..efc4a0e
Binary files /dev/null and 
b/tika-parsers/src/test/resources/test-documents/testXPSWithDataDescriptor2.xps 
differ

[tika] branch branch_1x updated: TIKA-3316 -- improve processing of XPS files

Reply via email to