This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit cba0372821022833a9c976bd47bd67193f73f635 Author: tallison <[email protected]> AuthorDate: Sat Mar 13 20:54:16 2021 -0500 TIKA-3316 -- improve XPS parser to include open XPS and allow for streaming zips with data descriptors --- .../detect/microsoft/ooxml/OPCPackageDetector.java | 8 +- .../microsoft/ooxml/OOXMLExtractorFactory.java | 14 ++- .../microsoft/ooxml/xps/XPSExtractorDecorator.java | 5 ++ .../parser/microsoft/ooxml/xps/XPSParserTest.java | 45 +++++++++- .../test-documents/testXPSWithDataDescriptor.xps | Bin 0 -> 44523 bytes .../test-documents/testXPSWithDataDescriptor2.xps | Bin 0 -> 51175 bytes .../detect/zip/DefaultZipContainerDetector.java | 38 +++++++-- .../org/apache/tika/zip/utils/ZipSalvager.java | 95 +++++++++++++-------- 8 files changed, 159 insertions(+), 46 deletions(-) diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java index 03dbda5..2dd9cf4 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java @@ -97,7 +97,6 @@ public class OPCPackageDetector implements ZipContainerDetector { static final MediaType XPS = MediaType.application("vnd.ms-xpsdocument"); - static final Set<String> OOXML_HINTS = fillSet( "word/document.xml", "_rels/.rels", @@ -156,6 +155,9 @@ public class OPCPackageDetector implements ZipContainerDetector { private static final String XPS_DOCUMENT = "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation"; + private static final String OPEN_XPS_DOCUMENT = + "http://schemas.openxps.org/oxps/v1.0/fixedrepresentation"; + private static final String STAR_OFFICE_6_WRITER = "application/vnd.sun.xml.writer"; @@ -210,6 +212,10 @@ public class OPCPackageDetector implements ZipContainerDetector { if (core.size() == 1) { return MediaType.application("vnd.ms-xpsdocument"); } + core = pkg.getRelationshipsByType(OPEN_XPS_DOCUMENT); + if (core.size() == 1) { + return MediaType.application("vnd.ms-xpsdocument"); + } } if (core.size() == 0) { diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java index 8aff0e8..c135e8c 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.io.InputStream; import java.util.Locale; +import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException; import org.apache.poi.ooxml.POIXMLDocument; import org.apache.poi.ooxml.extractor.ExtractorFactory; import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; @@ -111,7 +112,18 @@ public class OOXMLExtractorFactory { } catch (EOFException e) { rereadableInputStream.rewind(); tmpRepairedCopy = File.createTempFile("tika-ooxml-repair-", ""); - ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy); + ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy, false); + //if there isn't enough left to be opened as a package + //throw an exception -- we may want to fall back to streaming + //parsing + pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ); + } catch (UnsupportedZipFeatureException e) { + if (e.getFeature() != UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) { + throw e; + } + rereadableInputStream.rewind(); + tmpRepairedCopy = File.createTempFile("tika-ooxml-repair-", ""); + ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy, false); //if there isn't enough left to be opened as a package //throw an exception -- we may want to fall back to streaming //parsing diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java index 2643a3a..5cf7573 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java @@ -51,6 +51,8 @@ import java.util.Map; public class XPSExtractorDecorator extends AbstractOOXMLExtractor { private static String XPS_DOCUMENT = "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation"; + private static final String OPEN_XPS_DOCUMENT = + "http://schemas.openxps.org/oxps/v1.0/fixedrepresentation"; private final ParseContext context; private final ZipPackage pkg; @@ -76,6 +78,9 @@ public class XPSExtractorDecorator extends AbstractOOXMLExtractor { protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException { PackageRelationshipCollection prc = pkg.getRelationshipsByType(XPS_DOCUMENT); + if (prc.size() == 0) { + prc = pkg.getRelationshipsByType(OPEN_XPS_DOCUMENT); + } for (int i = 0; i < prc.size(); i++) { PackageRelationship pr = prc.getRelationship(i); diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java index 6015f6a..9c3ae65 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java @@ -19,9 +19,14 @@ package org.apache.tika.parser.microsoft.ooxml.xps; import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.junit.Test; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.List; import static org.junit.Assert.assertEquals; @@ -94,4 +99,42 @@ public class XPSParserTest extends TikaTest { } + @Test + public void testXPSWithDataDescriptor() throws Exception { + Path path = Paths.get( + XPSParserTest.class.getResource("/test-documents/testXPSWithDataDescriptor.xps").toURI()); + //test both path and stream based + List<Metadata> metadataList = getRecursiveMetadata(path, true); + assertEquals(2, metadataList.size()); + assertContains("This is my XPS document test", + metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); + + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + Files.copy(path, bos); + metadataList = getRecursiveMetadata(new ByteArrayInputStream(bos.toByteArray()), true); + assertEquals(2, metadataList.size()); + assertContains("This is my XPS document test", + metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); + + assertEquals(TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString(), + metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + } + + @Test + public void testOpenXPSWithDataDescriptor() throws Exception { + Path path = Paths.get( + XPSParserTest.class.getResource("/test-documents/testXPSWithDataDescriptor2.xps").toURI()); + List<Metadata> metadataList = getRecursiveMetadata(path, true); + assertEquals(2, metadataList.size()); + assertContains("How was I supposed to know", + metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); + + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + Files.copy(path, bos); + metadataList = getRecursiveMetadata(new ByteArrayInputStream(bos.toByteArray()), true); + assertEquals(2, metadataList.size()); + assertContains("How was I supposed to know", + metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); + } + } diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPSWithDataDescriptor.xps b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPSWithDataDescriptor.xps new file mode 100644 index 0000000..1569377 Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPSWithDataDescriptor.xps differ diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPSWithDataDescriptor2.xps b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPSWithDataDescriptor2.xps new file mode 100644 index 0000000..efc4a0e Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPSWithDataDescriptor2.xps differ diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java index 8a3d1f3..d3ed8a6 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java @@ -18,6 +18,7 @@ package org.apache.tika.detect.zip; import org.apache.commons.compress.archivers.ArchiveException; import org.apache.commons.compress.archivers.ArchiveStreamFactory; +import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; import org.apache.commons.compress.archivers.zip.ZipFile; @@ -28,12 +29,14 @@ import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.tika.config.Field; import org.apache.tika.config.ServiceLoader; import org.apache.tika.detect.Detector; +import org.apache.tika.io.BoundedInputStream; import org.apache.tika.io.LookaheadInputStream; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import java.io.ByteArrayInputStream; +import java.io.EOFException; import java.io.IOException; import java.io.InputStream; import java.util.List; @@ -83,6 +86,7 @@ public class DefaultZipContainerDetector implements Detector { * * @param markLimit mark limit for streaming detection */ + @Field public void setMarkLimit(int markLimit) { this.markLimit = markLimit; } @@ -118,10 +122,7 @@ public class DefaultZipContainerDetector implements Detector { return detectZipFormatOnFile(tis); } } - - try (LookaheadInputStream lookahead = new LookaheadInputStream(input, markLimit)) { - return detectStreaming(lookahead, metadata); - } + return detectStreaming(input, metadata); } else if (!type.equals(MediaType.OCTET_STREAM)) { return type; } else { @@ -207,10 +208,21 @@ public class DefaultZipContainerDetector implements Detector { } MediaType detectStreaming(InputStream input, Metadata metadata) throws IOException { + BoundedInputStream boundedInputStream = new BoundedInputStream(markLimit, input); + boundedInputStream.mark(markLimit); + try { + return detectStreaming(boundedInputStream, metadata, false); + } finally { + boundedInputStream.reset(); + } + } + + MediaType detectStreaming (InputStream input, + Metadata metadata, boolean allowStoredEntries) throws IOException { StreamingDetectContext detectContext = new StreamingDetectContext(); - try ( - ZipArchiveInputStream zis = - new ZipArchiveInputStream(new CloseShieldInputStream(input))) { + try (ZipArchiveInputStream zis = + new ZipArchiveInputStream(new CloseShieldInputStream(input), + "UTF8", false, allowStoredEntries)) { ZipArchiveEntry zae = zis.getNextZipEntry(); while (zae != null) { MediaType mt = detect(zae, zis, detectContext); @@ -219,10 +231,18 @@ public class DefaultZipContainerDetector implements Detector { } zae = zis.getNextZipEntry(); } + } catch (UnsupportedZipFeatureException zfe) { + if (allowStoredEntries == false && + zfe.getFeature() == UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) { + input.reset(); + return detectStreaming(input, metadata, true); + } } catch (SecurityException e) { throw e; - } catch (Exception e) { - //swallow + } catch (EOFException e) { + //truncated zip -- swallow + } catch (IOException e) { + //another option for a truncated zip } return finalDetect(detectContext); diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/zip/utils/ZipSalvager.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/zip/utils/ZipSalvager.java index 00c6b5b..1000d5f 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/zip/utils/ZipSalvager.java +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/zip/utils/ZipSalvager.java @@ -23,10 +23,12 @@ import java.io.InputStream; import java.nio.file.Files; import java.util.zip.ZipException; +import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; import org.apache.commons.io.IOUtils; +import org.apache.tika.utils.RereadableInputStream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -44,51 +46,76 @@ public class ZipSalvager { * @param brokenZip * @param salvagedZip */ - public static void salvageCopy(InputStream brokenZip, File salvagedZip) { - try (ZipArchiveOutputStream outputStream = new ZipArchiveOutputStream(salvagedZip)) { - ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(brokenZip); - ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry(); - while (zae != null) { + public static void salvageCopy(InputStream brokenZip, File salvagedZip, boolean allowStoredEntries) throws IOException { + if (!(brokenZip instanceof RereadableInputStream)) { + brokenZip = new RereadableInputStream(brokenZip, 50000, + true, false); + } + try { + try (ZipArchiveOutputStream outputStream = new ZipArchiveOutputStream(salvagedZip); + ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(brokenZip, + "UTF8", false, allowStoredEntries)) { + ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry(); try { - if (!zae.isDirectory() && zipArchiveInputStream.canReadEntryData(zae)) { - //create a new ZAE and copy over only the name so that - //if there is bad info (e.g. CRC) in brokenZip's zae, that - //won't be propagated or cause an exception - outputStream.putArchiveEntry(new ZipArchiveEntry(zae.getName())); - //this will copy an incomplete stream...so there - //could be truncation of the xml/contents, but the zip file - //should be intact. - boolean successfullyCopied = false; - try { - IOUtils.copy(zipArchiveInputStream, outputStream); - successfullyCopied = true; - } catch (IOException e) { - //this can hit a "truncated ZipFile" IOException - } - outputStream.flush(); - outputStream.closeArchiveEntry(); - if (!successfullyCopied) { - break; - } + processZAE(zae, zipArchiveInputStream, outputStream); + } catch (UnsupportedZipFeatureException uzfe) { + if (uzfe.getFeature() == + UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) { + //percolate up to allow for retry + throw uzfe; } - zae = zipArchiveInputStream.getNextZipEntry(); + //else swallow } catch (ZipException | EOFException e) { - break; + //swallow } - + outputStream.flush(); + outputStream.finish(); + } catch (UnsupportedZipFeatureException e) { + //percolate up to allow for retry + throw e; + } catch (IOException e) { + LOG.warn("problem fixing zip", e); } - outputStream.flush(); - outputStream.finish(); - + } catch (UnsupportedZipFeatureException e) { + //now retry + if (allowStoredEntries == false) { + ((RereadableInputStream) brokenZip).rewind(); + salvageCopy(brokenZip, salvagedZip, true); + } + } + } - } catch (IOException e) { - LOG.warn("problem fixing zip", e); + private static void processZAE(ZipArchiveEntry zae, ZipArchiveInputStream zipArchiveInputStream, + ZipArchiveOutputStream outputStream) throws IOException { + while (zae != null) { + if (!zae.isDirectory() && zipArchiveInputStream.canReadEntryData(zae)) { + //create a new ZAE and copy over only the name so that + //if there is bad info (e.g. CRC) in brokenZip's zae, that + //won't be propagated or cause an exception + outputStream.putArchiveEntry(new ZipArchiveEntry(zae.getName())); + //this will copy an incomplete stream...so there + //could be truncation of the xml/contents, but the zip file + //should be intact. + boolean successfullyCopied = false; + try { + IOUtils.copy(zipArchiveInputStream, outputStream); + successfullyCopied = true; + } catch (IOException e) { + //this can hit a "truncated ZipFile" IOException + } + outputStream.flush(); + outputStream.closeArchiveEntry(); + if (!successfullyCopied) { + break; + } + } + zae = zipArchiveInputStream.getNextZipEntry(); } } public static void salvageCopy(File brokenZip, File salvagedZip) throws IOException { try (InputStream is = Files.newInputStream(brokenZip.toPath())) { - salvageCopy(is, salvagedZip); + salvageCopy(is, salvagedZip, false); } } }
