This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new afea29c TIKA-3316 -- improve processing of XPS files
afea29c is described below
commit afea29c96b79766bc207367eaf38392185afcdfa
Author: tallison <[email protected]>
AuthorDate: Fri Mar 12 16:37:05 2021 -0500
TIKA-3316 -- improve processing of XPS files
---
CHANGES.txt | 2 +
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 2 +
.../microsoft/ooxml/OOXMLExtractorFactory.java | 5 +-
.../microsoft/ooxml/xps/XPSExtractorDecorator.java | 5 +
.../parser/pkg/StreamingZipContainerDetector.java | 171 +++++++++++++--------
.../tika/parser/pkg/ZipContainerDetector.java | 18 ++-
.../org/apache/tika/parser/utils/ZipSalvager.java | 96 ++++++++----
.../tika/detect/TestContainerAwareDetector.java | 2 +-
.../parser/microsoft/ooxml/xps/XPSParserTest.java | 45 ++++++
.../tika/parser/pkg/ZipContainerDetectorTest.java | 2 +-
.../test-documents/testXPSWithDataDescriptor.xps | Bin 0 -> 44523 bytes
.../test-documents/testXPSWithDataDescriptor2.xps | Bin 0 -> 51175 bytes
12 files changed, 237 insertions(+), 111 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index fe0e02d..57ca53c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 1.26 - 03/09/2021
+ * Improve detection and parsing of XPS files (TIKA-3316).
+
* General dependency upgrades (TIKA-3244).
* Great optimization in ForkParser (TIKA-3237).
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index af998b6..18a70f6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -176,6 +176,8 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
thumbnailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID,
thumbName);
thumbnailMetadata.set(Metadata.CONTENT_TYPE,
tPart.getContentType());
thumbnailMetadata.set(TikaCoreProperties.TITLE,
tPart.getPartName().getName());
+
thumbnailMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+
TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString());
if (embeddedExtractor.shouldParseEmbedded(thumbnailMetadata)) {
embeddedExtractor.parseEmbedded(TikaInputStream.get(tStream), new
EmbeddedContentHandler(handler), thumbnailMetadata, false);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 15f2c33..193be3b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -22,6 +22,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
+import
org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.poi.ooxml.POIXMLDocument;
import org.apache.poi.ooxml.extractor.ExtractorFactory;
@@ -109,10 +110,10 @@ public class OOXMLExtractorFactory {
true, false)) {
try {
pkg = OPCPackage.open(rereadableInputStream);
- } catch (EOFException e) {
+ } catch (EOFException|UnsupportedZipFeatureException e) {
rereadableInputStream.rewind();
tmpRepairedCopy =
File.createTempFile("tika-ooxml-repair-", "");
- ZipSalvager.salvageCopy(rereadableInputStream,
tmpRepairedCopy);
+ ZipSalvager.salvageCopy(rereadableInputStream,
tmpRepairedCopy, false);
//if there isn't enough left to be opened as a package
//throw an exception -- we may want to fall back to
streaming
//parsing
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
index 2643a3a..5cf7573 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
@@ -51,6 +51,8 @@ import java.util.Map;
public class XPSExtractorDecorator extends AbstractOOXMLExtractor {
private static String XPS_DOCUMENT =
"http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
+ private static final String OPEN_XPS_DOCUMENT =
+ "http://schemas.openxps.org/oxps/v1.0/fixedrepresentation";
private final ParseContext context;
private final ZipPackage pkg;
@@ -76,6 +78,9 @@ public class XPSExtractorDecorator extends
AbstractOOXMLExtractor {
protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
IOException {
PackageRelationshipCollection prc =
pkg.getRelationshipsByType(XPS_DOCUMENT);
+ if (prc.size() == 0) {
+ prc = pkg.getRelationshipsByType(OPEN_XPS_DOCUMENT);
+ }
for (int i = 0; i < prc.size(); i++) {
PackageRelationship pr = prc.getRelationship(i);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
index 67eaea8..0e927bf 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
@@ -20,12 +20,15 @@ import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
+import java.io.EOFException;
+import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
+import
org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.io.IOUtils;
@@ -92,90 +95,60 @@ public class StreamingZipContainerDetector extends
ZipContainerDetectorBase impl
}
+ private final int markLimit;
+
+ public StreamingZipContainerDetector(int markLimit) {
+ this.markLimit = markLimit;
+ }
/**
*
- * @param is inputstream to read from. Callers must mark/reset the stream
- * before/after this call to detect. This call does not close
the stream!
- * Depending on the file type, this call to detect may read the
entire stream.
- * Make sure to use a {@link
org.apache.tika.io.BoundedInputStream} or similar
- * if you want to protect against reading the entire stream.
+ * @param is the inputstream is wrapped in a boundedInputStream to
guarantee
+ * this doesn't stream beyond {@link #markLimit}
* @return
*/
@Override
- public MediaType detect(InputStream is, Metadata metadata) {
+ public MediaType detect(InputStream is, Metadata metadata) throws
IOException {
+ BoundedInputStream boundedInputStream = new
BoundedInputStream(markLimit, is);
+ boundedInputStream.mark(markLimit);
+ try {
+ return _detect(boundedInputStream, metadata, false);
+ } finally {
+ boundedInputStream.reset();
+ }
+ }
+ private MediaType _detect(InputStream is, Metadata metadata, boolean
allowStoredEntries)
+ throws IOException {
Set<String> fileNames = new HashSet<>();
Set<String> directoryNames = new HashSet<>();
+ MediaType mt = MediaType.APPLICATION_ZIP;
+
try (ZipArchiveInputStream zipArchiveInputStream =
- new ZipArchiveInputStream(new
CloseShieldInputStream(is))) {
+ new ZipArchiveInputStream(new CloseShieldInputStream(is),
+ "UTF8", false, allowStoredEntries)) {
ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
- while (zae != null) {
- String name = zae.getName();
- if (zae.isDirectory()) {
- directoryNames.add(name);
- zae = zipArchiveInputStream.getNextZipEntry();
- continue;
- }
- fileNames.add(name);
- //we could also parse _rel/.rels, but if
- // there isn't a valid content_types, then POI
- //will throw an exception...Better to backoff to PKG
- //than correctly identify a truncated
- if (name.equals("[Content_Types].xml")) {
- MediaType mt =
parseOOXMLContentTypes(zipArchiveInputStream);
- if (mt != null) {
- return mt;
- }
- return TIKA_OOXML;
- } else if
(IWorkPackageParser.IWORK_CONTENT_ENTRIES.contains(name)) {
- IWorkPackageParser.IWORKDocumentType type =
IWorkPackageParser.IWORKDocumentType.detectType(zipArchiveInputStream);
- if (type != null) {
- return type.getType();
- }
- } else if (name.equals("mimetype")) {
- //can't rely on zae.getSize to determine if there is any
- //content here. :(
- ByteArrayOutputStream bos = new ByteArrayOutputStream();
- BoundedInputStream bis = new
BoundedInputStream(MAX_MIME_TYPE, zipArchiveInputStream);
- IOUtils.copy(bis, bos);
- //do anything with an inputstream > MAX_MIME_TYPE?
- if (bos.toByteArray().length > 0) {
- //odt -- TODO -- check that the results are valid
- return MediaType.parse(new String(bos.toByteArray(),
UTF_8));
- }
- } else if (name.equals("META-INF/manifest.xml")) {
- //for an unknown reason, passing in the
zipArchiveInputStream
- //"as is" can cause the iteration of the entries to stop
early
- //without exception or warning. So, copy the full stream,
then
- //process. TIKA-3061
- ByteArrayOutputStream bos = new ByteArrayOutputStream();
- BoundedInputStream bis = new
BoundedInputStream(MAX_MANIFEST, zipArchiveInputStream);
- IOUtils.copy(bis, bos);
- //TODO: do something if the full stream hasn't been read?
- MediaType mt = detectStarOfficeX(new
ByteArrayInputStream(bos.toByteArray()));
- if (mt != null) {
- return mt;
- }
- }
- MediaType mt =
IWork18PackageParser.IWork18DocumentType.detectIfPossible(zae);
- if (mt != null) {
- return mt;
- }
- mt =
IWork13PackageParser.IWork13DocumentType.detectIfPossible(zae);
- if (mt != null) {
- return mt;
- }
- zae = zipArchiveInputStream.getNextZipEntry();
+ mt = processZAE(zae, zipArchiveInputStream, directoryNames,
fileNames);
+ } catch (UnsupportedZipFeatureException zfe) {
+ if (allowStoredEntries == false &&
+ zfe.getFeature() ==
UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
+ is.reset();
+ mt = _detect(is, metadata, true);
}
} catch (SecurityException e) {
throw e;
- } catch (Exception e) {
- //swallow
+ } catch (EOFException e) {
+ //truncated zip -- swallow
+ } catch (IOException e) {
+ //another option for a truncated zip
+ }
+
+ if (mt != MediaType.APPLICATION_ZIP) {
+ return mt;
}
//entrynames is the union of directory names and file names
Set<String> entryNames = new HashSet<>(fileNames);
entryNames.addAll(directoryNames);
- MediaType mt = detectKmz(fileNames);
+ mt = detectKmz(fileNames);
if (mt != null) {
return mt;
}
@@ -200,6 +173,70 @@ public class StreamingZipContainerDetector extends
ZipContainerDetectorBase impl
}
}
return MediaType.APPLICATION_ZIP;
+
+ }
+
+ private MediaType processZAE(ZipArchiveEntry zae, ZipArchiveInputStream
zipArchiveInputStream,
+ Set<String> directoryNames, Set<String> fileNames)
throws IOException {
+ while (zae != null) {
+ String name = zae.getName();
+ if (zae.isDirectory()) {
+ directoryNames.add(name);
+ zae = zipArchiveInputStream.getNextZipEntry();
+ continue;
+ }
+ fileNames.add(name);
+ //we could also parse _rel/.rels, but if
+ // there isn't a valid content_types, then POI
+ //will throw an exception...Better to backoff to PKG
+ //than correctly identify a truncated
+ if (name.equals("[Content_Types].xml")) {
+ MediaType mt = parseOOXMLContentTypes(zipArchiveInputStream);
+ if (mt != null) {
+ return mt;
+ }
+ return TIKA_OOXML;
+ } else if
(IWorkPackageParser.IWORK_CONTENT_ENTRIES.contains(name)) {
+ IWorkPackageParser.IWORKDocumentType type =
IWorkPackageParser.IWORKDocumentType.detectType(zipArchiveInputStream);
+ if (type != null) {
+ return type.getType();
+ }
+ } else if (name.equals("mimetype")) {
+ //can't rely on zae.getSize to determine if there is any
+ //content here. :(
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ BoundedInputStream bis = new BoundedInputStream(MAX_MIME_TYPE,
zipArchiveInputStream);
+ IOUtils.copy(bis, bos);
+ //do anything with an inputstream > MAX_MIME_TYPE?
+ if (bos.toByteArray().length > 0) {
+ //odt -- TODO -- check that the results are valid
+ return MediaType.parse(new String(bos.toByteArray(),
UTF_8));
+ }
+ } else if (name.equals("META-INF/manifest.xml")) {
+ //for an unknown reason, passing in the zipArchiveInputStream
+ //"as is" can cause the iteration of the entries to stop early
+ //without exception or warning. So, copy the full stream, then
+ //process. TIKA-3061
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ BoundedInputStream bis = new BoundedInputStream(MAX_MANIFEST,
zipArchiveInputStream);
+ IOUtils.copy(bis, bos);
+ //TODO: do something if the full stream hasn't been read?
+ MediaType mt = detectStarOfficeX(new
ByteArrayInputStream(bos.toByteArray()));
+ if (mt != null) {
+ return mt;
+ }
+ }
+ MediaType mt =
IWork18PackageParser.IWork18DocumentType.detectIfPossible(zae);
+ if (mt != null) {
+ return mt;
+ }
+ mt =
IWork13PackageParser.IWork13DocumentType.detectIfPossible(zae);
+ if (mt != null) {
+ return mt;
+ }
+ zae = zipArchiveInputStream.getNextZipEntry();
+ }
+ return MediaType.APPLICATION_ZIP;
}
private static MediaType detectIWorks(Set<String> entryNames) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 84ba64d..0688230 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -85,16 +85,19 @@ public class ZipContainerDetector implements Detector {
private static final String XPS_DOCUMENT =
"http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
+ private static final String OPEN_XPS_DOCUMENT =
+ "http://schemas.openxps.org/oxps/v1.0/fixedrepresentation";
+
private static final String STAR_OFFICE_6_WRITER =
"application/vnd.sun.xml.writer";
/** Serial version UID */
private static final long serialVersionUID = 2891763938430295453L;
//this has to be > 100,000 to handle some of the iworks files
//in our unit tests
- @Field
int markLimit = 16 * 1024 * 1024;
- private StreamingZipContainerDetector streamingZipContainerDetector = new
StreamingZipContainerDetector();
+ private StreamingZipContainerDetector streamingZipContainerDetector
+ = new StreamingZipContainerDetector(markLimit);
@Override
public MediaType detect(InputStream input, Metadata metadata)
@@ -128,10 +131,7 @@ public class ZipContainerDetector implements Detector {
return detectZipFormatOnFile(tis);
}
}
-
- try (LookaheadInputStream lookahead = new
LookaheadInputStream(input, markLimit)) {
- return streamingZipContainerDetector.detect(lookahead,
metadata);
- }
+ return streamingZipContainerDetector.detect(input, metadata);
} else if (!type.equals(MediaType.OCTET_STREAM)) {
return type;
} else {
@@ -147,8 +147,10 @@ public class ZipContainerDetector implements Detector {
*
* @param markLimit mark limit for streaming detection
*/
+ @Field
public void setMarkLimit(int markLimit) {
this.markLimit = markLimit;
+ this.streamingZipContainerDetector = new
StreamingZipContainerDetector(markLimit);
}
@@ -333,6 +335,10 @@ public class ZipContainerDetector implements Detector {
if (core.size() == 1) {
return MediaType.application("vnd.ms-xpsdocument");
}
+ core = pkg.getRelationshipsByType(OPEN_XPS_DOCUMENT);
+ if (core.size() == 1) {
+ return MediaType.application("vnd.ms-xpsdocument");
+ }
}
if (core.size() == 0) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java b/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java
index f7cf08a..7d45c04 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java
@@ -20,13 +20,16 @@ import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
+import java.io.OutputStream;
import java.nio.file.Files;
import java.util.zip.ZipException;
+import
org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
import org.apache.tika.io.IOUtils;
+import org.apache.tika.utils.RereadableInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -44,51 +47,76 @@ public class ZipSalvager {
* @param brokenZip
* @param salvagedZip
*/
- public static void salvageCopy(InputStream brokenZip, File salvagedZip) {
- try (ZipArchiveOutputStream outputStream = new
ZipArchiveOutputStream(salvagedZip)) {
- ZipArchiveInputStream zipArchiveInputStream = new
ZipArchiveInputStream(brokenZip);
- ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
- while (zae != null) {
+ public static void salvageCopy(InputStream brokenZip, File salvagedZip,
boolean allowStoredEntries) throws IOException {
+ if (!(brokenZip instanceof RereadableInputStream)) {
+ brokenZip = new RereadableInputStream(brokenZip, 50000,
+ true, false);
+ }
+ try {
+ try (ZipArchiveOutputStream outputStream = new
ZipArchiveOutputStream(salvagedZip);
+ ZipArchiveInputStream zipArchiveInputStream = new
ZipArchiveInputStream(brokenZip,
+ "UTF8", false, allowStoredEntries)) {
+ ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
try {
- if (!zae.isDirectory() &&
zipArchiveInputStream.canReadEntryData(zae)) {
- //create a new ZAE and copy over only the name so that
- //if there is bad info (e.g. CRC) in brokenZip's zae,
that
- //won't be propagated or cause an exception
- outputStream.putArchiveEntry(new
ZipArchiveEntry(zae.getName()));
- //this will copy an incomplete stream...so there
- //could be truncation of the xml/contents, but the zip
file
- //should be intact.
- boolean successfullyCopied = false;
- try {
- IOUtils.copy(zipArchiveInputStream, outputStream);
- successfullyCopied = true;
- } catch (IOException e) {
- //this can hit a "truncated ZipFile" IOException
- }
- outputStream.flush();
- outputStream.closeArchiveEntry();
- if (!successfullyCopied) {
- break;
- }
+ processZAE(zae, zipArchiveInputStream, outputStream);
+ } catch (UnsupportedZipFeatureException uzfe) {
+ if (uzfe.getFeature() ==
+
UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
+ //percolate up to allow for retry
+ throw uzfe;
}
- zae = zipArchiveInputStream.getNextZipEntry();
+ //else swallow
} catch (ZipException | EOFException e) {
- break;
+ //swallow
}
-
+ outputStream.flush();
+ outputStream.finish();
+ } catch (UnsupportedZipFeatureException e) {
+ //percolate up to allow for retry
+ throw e;
+ } catch (IOException e) {
+ LOG.warn("problem fixing zip", e);
}
- outputStream.flush();
- outputStream.finish();
-
+ } catch (UnsupportedZipFeatureException e) {
+ //now retry
+ if (allowStoredEntries == false) {
+ ((RereadableInputStream) brokenZip).rewind();
+ salvageCopy(brokenZip, salvagedZip, true);
+ }
+ }
+ }
- } catch (IOException e) {
- LOG.warn("problem fixing zip", e);
+ private static void processZAE(ZipArchiveEntry zae, ZipArchiveInputStream
zipArchiveInputStream,
+ ZipArchiveOutputStream outputStream) throws
IOException {
+ while (zae != null) {
+ if (!zae.isDirectory() &&
zipArchiveInputStream.canReadEntryData(zae)) {
+ //create a new ZAE and copy over only the name so that
+ //if there is bad info (e.g. CRC) in brokenZip's zae, that
+ //won't be propagated or cause an exception
+ outputStream.putArchiveEntry(new
ZipArchiveEntry(zae.getName()));
+ //this will copy an incomplete stream...so there
+ //could be truncation of the xml/contents, but the zip file
+ //should be intact.
+ boolean successfullyCopied = false;
+ try {
+ IOUtils.copy(zipArchiveInputStream, outputStream);
+ successfullyCopied = true;
+ } catch (IOException e) {
+ //this can hit a "truncated ZipFile" IOException
+ }
+ outputStream.flush();
+ outputStream.closeArchiveEntry();
+ if (!successfullyCopied) {
+ break;
+ }
+ }
+ zae = zipArchiveInputStream.getNextZipEntry();
}
}
public static void salvageCopy(File brokenZip, File salvagedZip) throws
IOException {
try (InputStream is = Files.newInputStream(brokenZip.toPath())) {
- salvageCopy(is, salvagedZip);
+ salvageCopy(is, salvagedZip, false);
}
}
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index 862088a..3646298 100644
--- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -53,7 +53,7 @@ public class TestContainerAwareDetector extends
MultiThreadedTikaTest {
private final MimeTypes mimeTypes = tikaConfig.getMimeRepository();
private final MediaTypeRegistry mediaTypeRegistry =
mimeTypes.getMediaTypeRegistry();
private final Detector detector = new DefaultDetector(mimeTypes);
- private final StreamingZipContainerDetector streamingZipDetector = new
StreamingZipContainerDetector();
+ private final StreamingZipContainerDetector streamingZipDetector = new
StreamingZipContainerDetector(1000000);
@After
public void tearDown() throws TikaException {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java
index f9ed085..d57ad91 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java
@@ -16,12 +16,20 @@
*/
package org.apache.tika.parser.microsoft.ooxml.xps;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.junit.Test;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
import java.util.List;
import static org.junit.Assert.assertEquals;
@@ -94,4 +102,41 @@ public class XPSParserTest extends TikaTest {
}
+ @Test
+ public void testXPSWithDataDescriptor() throws Exception {
+ Path path = Paths.get(
+
XPSParserTest.class.getResource("/test-documents/testXPSWithDataDescriptor.xps").toURI());
+ //test both path and stream based
+ List<Metadata> metadataList = getRecursiveMetadata(path, true);
+ assertEquals(2, metadataList.size());
+ assertContains("This is my XPS document test",
+
metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ Files.copy(path, bos);
+ metadataList = getRecursiveMetadata(new
ByteArrayInputStream(bos.toByteArray()), true);
+ assertEquals(2, metadataList.size());
+ assertContains("This is my XPS document test",
+
metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+
+
assertEquals(TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString(),
+
metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ }
+
+ @Test
+ public void testOpenXPSWithDataDescriptor() throws Exception {
+ Path path = Paths.get(
+
XPSParserTest.class.getResource("/test-documents/testXPSWithDataDescriptor2.xps").toURI());
+ List<Metadata> metadataList = getRecursiveMetadata(path, true);
+ assertEquals(2, metadataList.size());
+ assertContains("How was I supposed to know",
+
metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ Files.copy(path, bos);
+ metadataList = getRecursiveMetadata(new
ByteArrayInputStream(bos.toByteArray()), true);
+ assertEquals(2, metadataList.size());
+ assertContains("How was I supposed to know",
+
metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+ }
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
index d05c41a..edd35f6 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
@@ -46,7 +46,7 @@ public class ZipContainerDetectorTest extends TikaTest {
private static MediaType ODT_TEXT =
MediaType.application("vnd.oasis.opendocument.text");
private static MediaType TIFF = MediaType.image("tiff");
ZipContainerDetector zipContainerDetector = new ZipContainerDetector();
- StreamingZipContainerDetector streamingZipDetector = new
StreamingZipContainerDetector();
+ StreamingZipContainerDetector streamingZipDetector = new
StreamingZipContainerDetector(100000);
@Test
public void testTiffWorkaround() throws Exception {
diff --git a/tika-parsers/src/test/resources/test-documents/testXPSWithDataDescriptor.xps b/tika-parsers/src/test/resources/test-documents/testXPSWithDataDescriptor.xps
new file mode 100644
index 0000000..1569377
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testXPSWithDataDescriptor.xps differ
diff --git a/tika-parsers/src/test/resources/test-documents/testXPSWithDataDescriptor2.xps b/tika-parsers/src/test/resources/test-documents/testXPSWithDataDescriptor2.xps
new file mode 100644
index 0000000..efc4a0e
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testXPSWithDataDescriptor2.xps differ