This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new caf6730e1 TIKA-4399 -- require TikaInputStream for embedded documents
(#2182)
caf6730e1 is described below
commit caf6730e10f051defb76894b8d88ea41378a3f6e
Author: Tim Allison <[email protected]>
AuthorDate: Tue Apr 8 14:27:08 2025 -0400
TIKA-4399 -- require TikaInputStream for embedded documents (#2182)
* TIKA-4399 -- require TikaInputStream for embedded documents and avoid
incorrect zip bomb detection in RUnpackExtractor
---
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 14 +++----
.../tika/extractor/EmbeddedDocumentExtractor.java | 4 +-
.../tika/extractor/EmbeddedDocumentUtil.java | 5 +--
.../ParsingEmbeddedDocumentExtractor.java | 20 +++-------
.../apache/tika/extractor/RUnpackExtractor.java | 20 ++++------
.../java/org/apache/tika/io/TikaInputStream.java | 45 ++++++++++++++++++++++
.../apache/tika/parser/RecursiveParserWrapper.java | 4 +-
.../org/apache/tika/renderer/RenderResult.java | 3 +-
.../org/apache/tika/parser/mock/MockParser.java | 4 +-
.../apache/tika/example/ExtractEmbeddedFiles.java | 3 +-
.../tika/parser/apple/AppleSingleFileParser.java | 18 +++++++--
.../parser/iwork/iwana/IWork13PackageParser.java | 30 ++++++++-------
.../org/apache/tika/parser/crypto/TSDParser.java | 4 +-
.../org/apache/tika/parser/html/HtmlHandler.java | 20 +++++-----
.../apache/tika/parser/jdbc/JDBCTableReader.java | 6 +--
.../tika/parser/mail/MailContentHandler.java | 8 ++--
.../org/apache/tika/parser/mbox/MboxParser.java | 5 ++-
.../apache/tika/parser/microsoft/EMFParser.java | 8 ++--
.../tika/parser/microsoft/JackcessExtractor.java | 6 +--
.../apache/tika/parser/microsoft/OfficeParser.java | 9 ++---
.../tika/parser/microsoft/OutlookExtractor.java | 11 +++---
.../tika/parser/microsoft/chm/ChmParser.java | 9 ++---
.../tika/parser/microsoft/libpst/EmailVisitor.java | 5 +--
.../microsoft/ooxml/xps/XPSExtractorDecorator.java | 13 ++++---
.../parser/microsoft/pst/PSTMailItemParser.java | 6 +--
.../parser/odf/FlatOpenDocumentMacroHandler.java | 5 +--
.../tika/parser/odf/OpenDocumentBodyHandler.java | 5 +--
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 38 ++++++++----------
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 5 +--
.../java/org/apache/tika/parser/pdf/PDFParser.java | 4 +-
.../apache/tika/parser/pdf/PDFRenderingTest.java | 11 +++---
.../apache/tika/parser/pkg/CompressorParser.java | 5 ++-
.../java/org/apache/tika/parser/pkg/RarParser.java | 4 +-
.../org/apache/tika/parser/pkg/UnrarParser.java | 4 +-
.../org/apache/tika/parser/http/HttpParser.java | 11 +++---
.../org/apache/tika/parser/wacz/WACZParser.java | 4 +-
.../org/apache/tika/parser/warc/WARCParser.java | 2 +-
.../apache/tika/parser/xml/FictionBookParser.java | 8 ++--
.../org/apache/tika/parser/pkg/ZipParserTest.java | 3 +-
.../server/core/resource/UnpackerResource.java | 8 ++--
40 files changed, 217 insertions(+), 180 deletions(-)
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 74c3cbf44..448feeff3 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -1086,12 +1086,10 @@ public class TikaCLI {
return true;
}
- public void parseEmbedded(InputStream inputStream, ContentHandler
contentHandler, Metadata metadata, boolean outputHtml) throws SAXException,
IOException {
+ @Override
+ public void parseEmbedded(TikaInputStream tis, ContentHandler
contentHandler, Metadata metadata, boolean outputHtml) throws SAXException,
IOException {
- if (!inputStream.markSupported()) {
- inputStream = TikaInputStream.get(inputStream);
- }
- MediaType contentType = detector.detect(inputStream, metadata);
+ MediaType contentType = detector.detect(tis, metadata);
String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
File outputFile = null;
@@ -1110,12 +1108,12 @@ public class TikaCLI {
System.out.println("Extracting '" + name + "' (" + contentType +
") to " + outputFile);
try (FileOutputStream os = new FileOutputStream(outputFile)) {
- if (embeddedStreamTranslator.shouldTranslate(inputStream,
metadata)) {
- try (InputStream translated =
embeddedStreamTranslator.translate(inputStream, metadata)) {
+ if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) {
+ try (InputStream translated =
embeddedStreamTranslator.translate(tis, metadata)) {
IOUtils.copy(translated, os);
}
} else {
- IOUtils.copy(inputStream, os);
+ IOUtils.copy(tis, os);
}
} catch (Exception e) {
//
diff --git
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java
index f80420868..3f977e3db 100644
---
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java
+++
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java
@@ -18,11 +18,11 @@
package org.apache.tika.extractor;
import java.io.IOException;
-import java.io.InputStream;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
public interface EmbeddedDocumentExtractor {
@@ -39,6 +39,6 @@ public interface EmbeddedDocumentExtractor {
* @throws java.io.IOException
*/
void parseEmbedded(
- InputStream stream, ContentHandler handler, Metadata metadata,
boolean outputHtml)
+ TikaInputStream stream, ContentHandler handler, Metadata metadata,
boolean outputHtml)
throws SAXException, IOException;
}
diff --git
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
index d6e2c28a8..4d73545c1 100644
---
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
+++
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
@@ -18,7 +18,6 @@ package org.apache.tika.extractor;
import java.io.IOException;
-import java.io.InputStream;
import java.io.Serializable;
import org.xml.sax.ContentHandler;
@@ -219,9 +218,9 @@ public class EmbeddedDocumentUtil implements Serializable {
return embeddedDocumentExtractor;
}
- public void parseEmbedded(InputStream inputStream, ContentHandler handler,
Metadata metadata,
+ public void parseEmbedded(TikaInputStream tis, ContentHandler handler,
Metadata metadata,
boolean outputHtml) throws IOException,
SAXException {
- embeddedDocumentExtractor.parseEmbedded(inputStream, handler,
metadata, outputHtml);
+ embeddedDocumentExtractor.parseEmbedded(tis, handler, metadata,
outputHtml);
}
/**
diff --git
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
index 4f2331359..21117b33b 100644
---
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
+++
b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
@@ -21,9 +21,7 @@ import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
-import java.io.InputStream;
-import org.apache.commons.io.input.CloseShieldInputStream;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
@@ -31,7 +29,6 @@ import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.exception.CorruptedFileException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -82,7 +79,7 @@ public class ParsingEmbeddedDocumentExtractor implements
EmbeddedDocumentExtract
@Override
public void parseEmbedded(
- InputStream stream, ContentHandler handler, Metadata metadata,
boolean outputHtml)
+ TikaInputStream tis, ContentHandler handler, Metadata metadata,
boolean outputHtml)
throws SAXException, IOException {
if (outputHtml) {
AttributesImpl attributes = new AttributesImpl();
@@ -99,16 +96,9 @@ public class ParsingEmbeddedDocumentExtractor implements
EmbeddedDocumentExtract
}
// Use the delegate parser to parse this entry
- try (TemporaryResources tmp = new TemporaryResources()) {
- final TikaInputStream newStream =
- TikaInputStream.get(CloseShieldInputStream.wrap(stream),
tmp, metadata);
- if (stream instanceof TikaInputStream) {
- final Object container = ((TikaInputStream)
stream).getOpenContainer();
- if (container != null) {
- newStream.setOpenContainer(container);
- }
- }
- DELEGATING_PARSER.parse(newStream, new EmbeddedContentHandler(new
BodyContentHandler(handler)),
+ try {
+ tis.setCloseShield();
+ DELEGATING_PARSER.parse(tis, new EmbeddedContentHandler(new
BodyContentHandler(handler)),
metadata, context);
} catch (EncryptedDocumentException ede) {
recordException(ede, context);
@@ -118,6 +108,8 @@ public class ParsingEmbeddedDocumentExtractor implements
EmbeddedDocumentExtract
throw new IOException(e);
} catch (TikaException e) {
recordException(e, context);
+ } finally {
+ tis.removeCloseShield();
}
if (outputHtml) {
diff --git
a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
index 0e5928845..cbd560c50 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
@@ -35,7 +35,6 @@ import org.apache.tika.exception.CorruptedFileException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.BoundedInputStream;
-import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -68,7 +67,7 @@ public class RUnpackExtractor extends
ParsingEmbeddedDocumentExtractor {
@Override
public void parseEmbedded(
- InputStream stream, ContentHandler handler, Metadata metadata,
boolean outputHtml)
+ TikaInputStream tis, ContentHandler handler, Metadata metadata,
boolean outputHtml)
throws SAXException, IOException {
if (outputHtml) {
AttributesImpl attributes = new AttributesImpl();
@@ -85,20 +84,13 @@ public class RUnpackExtractor extends
ParsingEmbeddedDocumentExtractor {
}
// Use the delegate parser to parse this entry
- try (TemporaryResources tmp = new TemporaryResources()) {
- final TikaInputStream newStream =
- TikaInputStream.get(CloseShieldInputStream.wrap(stream),
tmp, metadata);
- if (stream instanceof TikaInputStream) {
- final Object container = ((TikaInputStream)
stream).getOpenContainer();
- if (container != null) {
- newStream.setOpenContainer(container);
- }
- }
+ try {
EmbeddedDocumentBytesHandler bytesHandler =
context.get(EmbeddedDocumentBytesHandler.class);
+ tis.setCloseShield();
if (bytesHandler != null) {
- parseWithBytes(newStream, handler, metadata);
+ parseWithBytes(tis, handler, metadata);
} else {
- parse(newStream, handler, metadata);
+ parse(tis, handler, metadata);
}
} catch (EncryptedDocumentException ede) {
recordException(ede, context);
@@ -108,6 +100,8 @@ public class RUnpackExtractor extends
ParsingEmbeddedDocumentExtractor {
throw new IOException(e);
} catch (TikaException e) {
recordException(e, context);
+ } finally {
+ tis.removeCloseShield();
}
if (outputHtml) {
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index 77e09226a..ea48487a0 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -119,6 +119,11 @@ public class TikaInputStream extends TaggedInputStream {
private int consecutiveEOFs = 0;
private byte[] skipBuffer;
+ /**
+ * If the stream should be shielded from closing
+ */
+ private int closeShieldDepth = 0;
+
//suffix of the file if known. This is used to create temp files
//with the right suffixes. This should include the initial . as in ".doc"
private String suffix = null;
@@ -266,6 +271,32 @@ public class TikaInputStream extends TaggedInputStream {
return get(stream, new TemporaryResources(), null);
}
+ /**
+ * Casts or wraps the given stream to a TikaInputStream instance.
+ * This method can be used to access the functionality of this class
+ * even when given just a normal input stream instance.
+ * <p>
+ * Use this method instead of the
+ * {@link #get(InputStream, TemporaryResources, Metadata)} alternative
when you
+ * <em>do</em> explicitly close the returned stream. The recommended
+ * access pattern is:
+ * <pre>
+ * try (TikaInputStream stream = TikaInputStream.get(...)) {
+ * // process stream
+ * }
+ * </pre>
+ * <p>
+ * The given stream instance will be closed along with any other resources
+ * associated with the returned TikaInputStream instance when the
+ * {@link #close()} method is called by the try-with-resources statement.
+ *
+ * @param stream normal input stream
+ * @return a TikaInputStream instance
+ */
+ public static TikaInputStream get(InputStream stream, Metadata metadata) {
+ return get(stream, new TemporaryResources(), metadata);
+ }
+
/**
* Returns the given stream casts to a TikaInputStream, or
* <code>null</code> if the stream is not a TikaInputStream.
@@ -827,6 +858,9 @@ public class TikaInputStream extends TaggedInputStream {
@Override
public void close() throws IOException {
+ if (closeShieldDepth > 0) {
+ return;
+ }
path = null;
mark = -1;
@@ -853,6 +887,17 @@ public class TikaInputStream extends TaggedInputStream {
}
}
+ public void setCloseShield() {
+ this.closeShieldDepth++;
+ }
+
+ public void removeCloseShield() {
+ this.closeShieldDepth--;
+ }
+
+ public boolean isCloseShield() {
+ return closeShieldDepth > 0;
+ }
@Override
public String toString() {
String str = "TikaInputStream of ";
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 4e4f72dfa..07eee752b 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -272,7 +272,8 @@ public class RecursiveParserWrapper extends ParserDecorator
{
preParseHandler.throwOnWriteLimitReached, context);
try {
- super.parse(stream, secureContentHandler, metadata, context);
+ tis.setCloseShield();
+ super.parse(tis, secureContentHandler, metadata, context);
} catch (SAXException e) {
if (WriteLimitReachedException.isWriteLimitReached(e)) {
metadata.add(TikaCoreProperties.WRITE_LIMIT_REACHED,
"true");
@@ -299,6 +300,7 @@ public class RecursiveParserWrapper extends ParserDecorator
{
throw e;
}
} finally {
+ tis.removeCloseShield();
context.set(Parser.class, preContextParser);
context.set(RecursivelySecureContentHandler.class,
preParseHandler);
context.set(ParentContentHandler.class, preParseParentHandler);
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
b/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
index 3fd8d7d2c..25588c45b 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
@@ -18,7 +18,6 @@ package org.apache.tika.renderer;
import java.io.Closeable;
import java.io.IOException;
-import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -62,7 +61,7 @@ public class RenderResult implements Closeable {
}
}
- public InputStream getInputStream() throws IOException {
+ public TikaInputStream getInputStream() throws IOException {
if (result instanceof Path) {
return TikaInputStream.get((Path)result, metadata);
} else {
diff --git
a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java
b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java
index 16458c9e9..84c9b7ab1 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java
@@ -19,7 +19,6 @@ package org.apache.tika.parser.mock;
import static java.nio.charset.StandardCharsets.UTF_8;
-import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
@@ -56,6 +55,7 @@ import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
@@ -267,7 +267,7 @@ public class MockParser implements Parser {
if (!"".equals(contentType)) {
m.set(Metadata.CONTENT_TYPE, contentType);
}
- try (InputStream is = new
ByteArrayInputStream(embeddedText.getBytes(UTF_8))) {
+ try (TikaInputStream is =
TikaInputStream.get(embeddedText.getBytes(UTF_8))) {
extractor.parseEmbedded(is, new EmbeddedContentHandler(handler),
m, true);
}
}
diff --git
a/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
b/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
index 9f1425da8..68d136202 100644
---
a/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
+++
b/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java
@@ -32,6 +32,7 @@ import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
@@ -73,7 +74,7 @@ public class ExtractEmbeddedFiles {
}
@Override
- public void parseEmbedded(InputStream stream, ContentHandler handler,
Metadata metadata, boolean outputHtml) throws SAXException, IOException {
+ public void parseEmbedded(TikaInputStream stream, ContentHandler
handler, Metadata metadata, boolean outputHtml) throws SAXException,
IOException {
//try to get the name of the embedded file from the metadata
String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
index ac43b2985..d97ed1ba7 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
@@ -26,7 +26,6 @@ import java.util.List;
import java.util.Set;
import org.apache.commons.io.IOUtils;
-import org.apache.commons.io.input.CloseShieldInputStream;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -35,6 +34,8 @@ import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.EndianUtils;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
@@ -99,8 +100,19 @@ public class AppleSingleFileParser implements Parser {
// TODO: we should probably add a readlimiting wrapper around
this
// stream to ensure that not more than contentFieldInfo.length
bytes
// are read
- ex.parseEmbedded(CloseShieldInputStream.wrap(stream), xhtml,
embeddedMetadata,
- true);
+ TikaInputStream tis = TikaInputStream.cast(stream);
+ TemporaryResources tmp = null;
+ if (tis == null) {
+ tmp = new TemporaryResources();
+ tis = TikaInputStream.get(stream, tmp, embeddedMetadata);
+ }
+ try {
+ ex.parseEmbedded(tis, xhtml, embeddedMetadata, true);
+ } finally {
+ if (tmp != null) {
+ tmp.close();
+ }
+ }
}
}
xhtml.endDocument();
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
index 2816efac1..ac65ff587 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java
@@ -36,13 +36,13 @@ import com.dd.plist.PropertyListParser;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.commons.io.IOUtils;
-import org.apache.commons.io.input.CloseShieldInputStream;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
@@ -128,9 +128,11 @@ public class IWork13PackageParser implements Parser {
if (type == null) {
type = IWork13DocumentType.detectIfPossible(entry);
}
- processZipEntry(entry, CloseShieldInputStream.wrap(zipStream),
metadata, xhtml,
- parseContext,
- embeddedDocumentExtractor);
+
+ try (TemporaryResources tmp = new TemporaryResources()) {
+ TikaInputStream tis = TikaInputStream.get(zipStream, tmp, new
Metadata());
+ processZipEntry(entry, tis, metadata, xhtml, parseContext,
embeddedDocumentExtractor);
+ }
entry = zipStream.getNextEntry();
}
if (type == null) {
@@ -153,8 +155,8 @@ public class IWork13PackageParser implements Parser {
if (type == null) {
type = IWork13DocumentType.detectIfPossible(entry);
}
- try (InputStream is = zipFile.getInputStream(entry)) {
- processZipEntry(entry, is, metadata, xhtml, parseContext,
embeddedDocumentExtractor);
+ try (TikaInputStream tis =
TikaInputStream.get(zipFile.getInputStream(entry))) {
+ processZipEntry(entry, tis, metadata, xhtml, parseContext,
embeddedDocumentExtractor);
} catch (SecurityException e) {
throw e;
} catch (Exception e) {
@@ -171,7 +173,7 @@ public class IWork13PackageParser implements Parser {
}
private void processZipEntry(ZipEntry entry,
- InputStream inputStream,
+ TikaInputStream tis,
Metadata metadata, XHTMLContentHandler xhtml,
ParseContext parseContext,
EmbeddedDocumentExtractor
embeddedDocumentExtractor)
@@ -181,18 +183,18 @@ public class IWork13PackageParser implements Parser {
return;
}
if ("Metadata/Properties.plist".equals(streamName)) {
- extractProperties(inputStream, metadata);
+ extractProperties(tis, metadata);
} else if ("Metadata/BuildVersionHistory.plist".equals(streamName)) {
- extractVersionHistory(inputStream, metadata);
+ extractVersionHistory(tis, metadata);
} else if ("Metadata/DocumentIdentifier".equals(streamName)) {
- extractDocumentIdentifier(inputStream, metadata);
+ extractDocumentIdentifier(tis, metadata);
} else if ("preview.jpg".equals(streamName)) {
//process thumbnail
Metadata embeddedMetadata = new Metadata();
embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString());
embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
streamName);
- handleEmbedded(inputStream, embeddedMetadata, xhtml,
embeddedDocumentExtractor);
+ handleEmbedded(tis, embeddedMetadata, xhtml,
embeddedDocumentExtractor);
} else if (streamName.equals("preview-micro.jpg") ||
streamName.equals("preview-web.jpg")
|| streamName.endsWith(".iwa")) {
@@ -200,18 +202,18 @@ public class IWork13PackageParser implements Parser {
} else {
Metadata embeddedMetadata = new Metadata();
embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
streamName);
- handleEmbedded(inputStream, embeddedMetadata, xhtml,
embeddedDocumentExtractor);
+ handleEmbedded(tis, embeddedMetadata, xhtml,
embeddedDocumentExtractor);
}
}
- private void handleEmbedded(InputStream inputStream, Metadata
embeddedMetadata,
+ private void handleEmbedded(TikaInputStream tis, Metadata embeddedMetadata,
XHTMLContentHandler xhtml,
EmbeddedDocumentExtractor
embeddedDocumentExtractor)
throws IOException, SAXException {
if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
- embeddedDocumentExtractor.parseEmbedded(inputStream, xhtml,
embeddedMetadata, true);
+ embeddedDocumentExtractor.parseEmbedded(tis, xhtml,
embeddedMetadata, true);
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
index c5a362464..2a0e4a0f9 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
@@ -171,8 +171,8 @@ public class TSDParser implements Parser {
try {
cmsTimeStampedDataParser = new
CMSTimeStampedDataParser(stream);
- try (InputStream is =
TikaInputStream.get(cmsTimeStampedDataParser.getContent())) {
- edx.parseEmbedded(is, handler, metadata, true);
+ try (TikaInputStream tis =
TikaInputStream.get(cmsTimeStampedDataParser.getContent())) {
+ edx.parseEmbedded(tis, handler, metadata, true);
}
} catch (SecurityException e) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
index ea4d0195f..b613f3939 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
@@ -17,7 +17,6 @@
package org.apache.tika.parser.html;
import java.io.IOException;
-import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
@@ -31,7 +30,6 @@ import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -39,6 +37,7 @@ import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.HTML;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
@@ -341,8 +340,8 @@ class HtmlHandler extends TextContentHandler {
EmbeddedDocumentExtractor embeddedDocumentExtractor =
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
- try (InputStream stream =
UnsynchronizedByteArrayInputStream.builder().setByteArray(string.getBytes(StandardCharsets.UTF_8)).get())
{
- embeddedDocumentExtractor.parseEmbedded(stream, xhtml, m,
true);
+ try (TikaInputStream tis =
TikaInputStream.get(string.getBytes(StandardCharsets.UTF_8))) {
+ embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m, true);
} catch (IOException e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e,
metadata);
}
@@ -368,8 +367,8 @@ class HtmlHandler extends TextContentHandler {
EmbeddedDocumentExtractor embeddedDocumentExtractor =
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
- try (InputStream stream = dataURIScheme.getInputStream()) {
- embeddedDocumentExtractor.parseEmbedded(stream, xhtml, m,
true);
+ try (TikaInputStream tis =
TikaInputStream.get(dataURIScheme.getInputStream())) {
+ embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m, true);
} catch (IOException e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e,
metadata);
}
@@ -401,18 +400,17 @@ class HtmlHandler extends TextContentHandler {
TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
dataUriMetadata.set(Metadata.CONTENT_TYPE,
dataURIScheme.getMediaType().toString());
if
(embeddedDocumentExtractor.shouldParseEmbedded(dataUriMetadata)) {
- try (InputStream dataURISchemeInputStream =
dataURIScheme.getInputStream()) {
+ try (TikaInputStream tis =
TikaInputStream.get(dataURIScheme.getInputStream())) {
embeddedDocumentExtractor
- .parseEmbedded(dataURISchemeInputStream, xhtml,
dataUriMetadata, true);
+ .parseEmbedded(tis, xhtml, dataUriMetadata, true);
} catch (IOException e) {
//swallow
}
}
}
- try (InputStream stream =
UnsynchronizedByteArrayInputStream.builder().setByteArray(
- script.toString().getBytes(StandardCharsets.UTF_8)).get()) {
- embeddedDocumentExtractor.parseEmbedded(stream, xhtml, m, true);
+ try (TikaInputStream tis =
TikaInputStream.get(script.toString().getBytes(StandardCharsets.UTF_8))) {
+ embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m, true);
} catch (IOException e) {
//shouldn't ever happen
} finally {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java
index 9df4d197d..3ed87ae0b 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java
@@ -32,7 +32,6 @@ import java.util.List;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
-import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -200,8 +199,9 @@ public class JDBCTableReader {
//is there a more efficient way to go from a Reader to an InputStream?
String s = clob.getSubString(0, readSize);
if (embeddedDocumentUtil.shouldParseEmbedded(m)) {
- embeddedDocumentUtil
-
.parseEmbedded(UnsynchronizedByteArrayInputStream.builder().setByteArray(s.getBytes(UTF_8)).get(),
handler, m, true);
+ try (TikaInputStream tis = TikaInputStream.get(s.getBytes(UTF_8)))
{
+ embeddedDocumentUtil.parseEmbedded(tis, handler, m, true);
+ }
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 7c5d266ca..9af23d004 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -26,7 +26,6 @@ import java.util.Map.Entry;
import java.util.Stack;
import org.apache.commons.io.IOUtils;
-import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
import org.apache.james.mime4j.MimeException;
import org.apache.james.mime4j.codec.DecodeMonitor;
@@ -546,9 +545,10 @@ class MailContentHandler implements ContentHandler {
inlineMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
MediaType.TEXT_PLAIN.toString());
}
-
parser.parse(UnsynchronizedByteArrayInputStream.builder().setByteArray(part.bytes).get(),
- new EmbeddedContentHandler(new
BodyContentHandler(handler)), inlineMetadata,
- parseContext);
+ try (TikaInputStream tis = TikaInputStream.get(part.bytes)) {
+ parser.parse(tis,
+ new EmbeddedContentHandler(new
BodyContentHandler(handler)), inlineMetadata, parseContext);
+ }
} catch (SAXException | TikaException e) {
throw new MimeException(e);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
index 4c7bea74c..dddd9bd92 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
@@ -40,6 +40,7 @@ import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -130,11 +131,11 @@ public class MboxParser implements Parser {
saveHeaderInMetadata(mailMetadata, item);
}
- InputStream messageStream = message.toInputStream();
+ TikaInputStream msgStream =
TikaInputStream.get(message.toInputStream());
message = null;
if (extractor.shouldParseEmbedded(mailMetadata)) {
- extractor.parseEmbedded(messageStream, xhtml,
mailMetadata, true);
+ extractor.parseEmbedded(msgStream, xhtml,
mailMetadata, true);
}
if (tracking) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
index ae80cb62b..1ef4f1ee6 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
@@ -69,11 +69,11 @@ public class EMFParser implements Parser {
private static void handleEmbedded(byte[] data,
EmbeddedDocumentExtractor
embeddedDocumentExtractor,
ContentHandler handler) throws
TikaException, SAXException {
- try (InputStream is = TikaInputStream.get(data)) {
+ try (TikaInputStream tis = TikaInputStream.get(data)) {
Metadata embeddedMetadata = new Metadata();
if
(embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
embeddedDocumentExtractor
- .parseEmbedded(is, new
EmbeddedContentHandler(handler), embeddedMetadata,
+ .parseEmbedded(tis, new
EmbeddedContentHandler(handler), embeddedMetadata,
true);
}
} catch (IOException e) {
@@ -204,9 +204,9 @@ public class EMFParser implements Parser {
Metadata embeddedMetadata = new Metadata();
embeddedMetadata.set(Metadata.CONTENT_TYPE, WMF_MEDIA_TYPE.toString());
if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
- try (InputStream is = TikaInputStream.get(bytes)) {
+ try (TikaInputStream tis = TikaInputStream.get(bytes)) {
embeddedDocumentExtractor
- .parseEmbedded(is, new
EmbeddedContentHandler(contentHandler),
+ .parseEmbedded(tis, new
EmbeddedContentHandler(contentHandler),
embeddedMetadata, true);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
index 409c34ca0..619ba601c 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
@@ -41,7 +41,6 @@ import com.healthmarketscience.jackcess.Table;
import com.healthmarketscience.jackcess.query.Query;
import com.healthmarketscience.jackcess.util.OleBlob;
import org.apache.commons.io.IOUtils;
-import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.xml.sax.SAXException;
@@ -220,9 +219,8 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
BodyContentHandler h = new BodyContentHandler();
Metadata m = new Metadata();
m.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
- try {
- htmlParser
-
.parse(UnsynchronizedByteArrayInputStream.builder().setByteArray(v.getBytes(UTF_8)).get(),
h, m, parseContext);
+ try (TikaInputStream tis =
TikaInputStream.get(v.getBytes(UTF_8))) {
+ htmlParser.parse(tis, h, m, parseContext);
handler.characters(h.toString());
} catch (SAXException e) {
WriteLimitReachedException.throwIfWriteLimitReached(e);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index 8fe685686..dade2ca5f 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -32,7 +32,6 @@ import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
import org.apache.poi.poifs.crypt.Decryptor;
@@ -118,7 +117,7 @@ public class OfficeParser extends AbstractOfficeParser {
if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
embeddedDocumentExtractor.parseEmbedded(
//pass in space character so that we don't trigger a
zero-byte exception
-
UnsynchronizedByteArrayInputStream.builder().setByteArray(new
byte[]{'\u0020'}).get(), xhtml, m, true);
+ TikaInputStream.get(new byte[]{'\u0020'}), xhtml, m,
true);
}
return;
}
@@ -131,9 +130,9 @@ public class OfficeParser extends AbstractOfficeParser {
m.set(TikaCoreProperties.RESOURCE_NAME_KEY, e.getKey());
}
if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
- embeddedDocumentExtractor.parseEmbedded(
-
UnsynchronizedByteArrayInputStream.builder().setByteArray(e.getValue().getBytes(StandardCharsets.UTF_8)).get(),
- xhtml, m, true);
+ try (TikaInputStream tis =
TikaInputStream.get(e.getValue().getBytes(StandardCharsets.UTF_8))) {
+ embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m,
true);
+ }
}
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 687724566..30e1ca14a 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -468,9 +468,9 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
if (htmlParser == null) {
htmlParser = new JSoupParser();
}
-
htmlParser.parse(UnsynchronizedByteArrayInputStream.builder().setByteArray(data).get(),
- new EmbeddedContentHandler(new
BodyContentHandler(xhtml)), new Metadata(),
- parseContext);
+ try (TikaInputStream tis = TikaInputStream.get(data)) {
+ htmlParser.parse(tis, new EmbeddedContentHandler(new
BodyContentHandler(xhtml)), new Metadata(), parseContext);
+ }
doneBody = true;
}
}
@@ -488,8 +488,9 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
if (rtfParser == null) {
rtfParser = new RTFParser();
}
-
rtfParser.parseInline(UnsynchronizedByteArrayInputStream.builder().setByteArray(rtf.getData()).get(),
- xhtml, new Metadata(), parseContext);
+ try (TikaInputStream tis = TikaInputStream.get(rtf.getData()))
{
+ rtfParser.parseInline(tis, xhtml, new Metadata(),
parseContext);
+ }
doneBody = true;
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java
index 0255a9161..06b2dd518 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java
@@ -23,12 +23,12 @@ import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
-import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
@@ -94,12 +94,11 @@ public class ChmParser implements Parser {
private void parsePage(byte[] byteObject, Parser htmlParser,
ContentHandler xhtml,
ParseContext context) throws TikaException,
IOException, SAXException { // throws IOException
- InputStream stream = null;
Metadata metadata = new Metadata();
ContentHandler handler = new EmbeddedContentHandler(new
BodyContentHandler(xhtml));// -1
- stream =
UnsynchronizedByteArrayInputStream.builder().setByteArray(byteObject).get();
- htmlParser.parse(stream, handler, metadata, context);
-
+ try (TikaInputStream tis = TikaInputStream.get(byteObject)) {
+ htmlParser.parse(tis, handler, metadata, context);
+ }
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java
index a12806472..d9cb6b9d2 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java
@@ -17,7 +17,6 @@
package org.apache.tika.parser.microsoft.libpst;
import java.io.IOException;
-import java.io.InputStream;
import java.nio.file.FileVisitResult;
import java.nio.file.FileVisitor;
import java.nio.file.Path;
@@ -78,9 +77,9 @@ public class EmailVisitor implements FileVisitor<Path> {
.relativize(file.getParent())
.toString();
emailMetadata.set(PST.PST_FOLDER_PATH, pstPath);
- try (InputStream is = TikaInputStream.get(file)) {
+ try (TikaInputStream tis = TikaInputStream.get(file)) {
try {
- embeddedDocumentExtractor.parseEmbedded(is, xhtml,
emailMetadata, true);
+ embeddedDocumentExtractor.parseEmbedded(tis, xhtml,
emailMetadata, true);
} catch (SAXException e) {
throw new IOException(e);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
index 933f3e99e..2e3d2d914 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
@@ -41,6 +41,7 @@ import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor;
@@ -69,7 +70,7 @@ public class XPSExtractorDecorator extends
AbstractOOXMLExtractor {
}
}
- private static InputStream getZipStream(String zipPath, ZipPackage
zipPackage)
+ private static TikaInputStream getZipStream(String zipPath, ZipPackage
zipPackage)
throws IOException, TikaException {
String targPath =
(zipPath.length() > 1 && zipPath.startsWith("/") ?
zipPath.substring(1) : zipPath);
@@ -86,7 +87,7 @@ public class XPSExtractorDecorator extends
AbstractOOXMLExtractor {
if (zipEntry == null) {
throw new TikaException("Couldn't find required zip entry: " +
zipPath);
}
- return zipEntrySource.getInputStream(zipEntry);
+ return TikaInputStream.get(zipEntrySource.getInputStream(zipEntry));
}
@Override
@@ -130,9 +131,9 @@ public class XPSExtractorDecorator extends
AbstractOOXMLExtractor {
private void handleEmbeddedImage(String zipPath, Metadata metadata,
EmbeddedDocumentUtil embeddedDocumentUtil,
XHTMLContentHandler xhtml) throws
SAXException, IOException {
- InputStream stream = null;
+ TikaInputStream tis = null;
try {
- stream = getZipStream(zipPath, pkg);
+ tis = getZipStream(zipPath, pkg);
} catch (IOException | TikaException e) {
//store this exception in the parent's metadata
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
@@ -140,9 +141,9 @@ public class XPSExtractorDecorator extends
AbstractOOXMLExtractor {
}
try {
- embeddedDocumentUtil.parseEmbedded(stream, xhtml, metadata, true);
+ embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, true);
} finally {
- IOUtils.closeQuietly(stream);
+ IOUtils.closeQuietly(tis);
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
index 4b21e5141..13c23a690 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
@@ -27,7 +27,6 @@ import com.pff.PSTAttachment;
import com.pff.PSTException;
import com.pff.PSTMessage;
import com.pff.PSTRecipient;
-import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
@@ -115,8 +114,9 @@ public class PSTMailItemParser implements Parser {
metadata, context);
} else {
byte[] data = htmlChunk.getBytes(StandardCharsets.UTF_8);
-
htmlParser.parse(UnsynchronizedByteArrayInputStream.builder().setByteArray(data).get(),
- new EmbeddedContentHandler(new
BodyContentHandler(xhtml)), new Metadata(), context);
+ try (TikaInputStream tis = TikaInputStream.get(data)) {
+ htmlParser.parse(tis, new EmbeddedContentHandler(new
BodyContentHandler(xhtml)), new Metadata(), context);
+ }
}
return;
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java
index 3bd4b92de..4688cda1a 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java
@@ -17,7 +17,6 @@
package org.apache.tika.parser.odf;
import java.io.IOException;
-import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.commons.lang3.StringUtils;
@@ -108,9 +107,9 @@ class FlatOpenDocumentMacroHandler extends
ContentHandlerDecorator {
TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
- try (InputStream is = TikaInputStream.get(bytes)) {
+ try (TikaInputStream tis = TikaInputStream.get(bytes)) {
embeddedDocumentExtractor
- .parseEmbedded(is, contentHandler, embeddedMetadata,
true);
+ .parseEmbedded(tis, contentHandler, embeddedMetadata,
true);
}
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
index 94b1d86fb..0fbe29a74 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
@@ -19,7 +19,6 @@ package org.apache.tika.parser.odf;
import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
import java.io.IOException;
-import java.io.InputStream;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Map;
@@ -529,8 +528,8 @@ class OpenDocumentBodyHandler extends
ElementMappingContentHandler {
}
Metadata embeddedMetadata = new Metadata();
if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
- try (InputStream is = TikaInputStream.get(bytes)) {
- embeddedDocumentExtractor.parseEmbedded(is, handler,
embeddedMetadata, true);
+ try (TikaInputStream tis = TikaInputStream.get(bytes)) {
+ embeddedDocumentExtractor.parseEmbedded(tis, handler,
embeddedMetadata, true);
}
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index c3e6bb7e8..de47f2394 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -263,9 +263,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
if (supportedTypes.contains(XMP_MEDIA_TYPE)) {
//try the main metadata
if (pdfDocument.getDocumentCatalog().getMetadata() != null) {
- try (InputStream is =
pdfDocument.getDocumentCatalog().getMetadata()
- .exportXMPMetadata()) {
- extractXMPAsEmbeddedFile(is,
XMP_DOCUMENT_CATALOG_LOCATION);
+ try (TikaInputStream tis = TikaInputStream.get(
+
pdfDocument.getDocumentCatalog().getMetadata().exportXMPMetadata())) {
+ extractXMPAsEmbeddedFile(tis,
XMP_DOCUMENT_CATALOG_LOCATION);
} catch (IOException e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e,
parentMetadata);
}
@@ -274,8 +274,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
int pageNumber = 1;
for (PDPage page : pdfDocument.getPages()) {
if (page.getMetadata() != null) {
- try (InputStream is =
page.getMetadata().exportXMPMetadata()) {
- extractXMPAsEmbeddedFile(is, XMP_PAGE_LOCATION_PREFIX
+ pageNumber);
+ try (TikaInputStream tis =
TikaInputStream.get(page.getMetadata().exportXMPMetadata())) {
+ extractXMPAsEmbeddedFile(tis, XMP_PAGE_LOCATION_PREFIX
+ pageNumber);
} catch (IOException e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e,
parentMetadata);
}
@@ -301,17 +301,17 @@ class AbstractPDF2XHTML extends PDFTextStripper {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e,
parentMetadata);
}
if (bytes != null) {
- try (InputStream is =
UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get()) {
- parseMetadata(is, xfaMetadata);
+ try (TikaInputStream tis = TikaInputStream.get(bytes)) {
+ parseMetadata(tis, xfaMetadata);
}
}
}
}
}
- private void extractXMPAsEmbeddedFile(InputStream is, String location)
+ private void extractXMPAsEmbeddedFile(TikaInputStream tis, String location)
throws IOException, SAXException {
- if (is == null) {
+ if (tis == null) {
return;
}
Metadata xmpMetadata = new Metadata();
@@ -320,19 +320,15 @@ class AbstractPDF2XHTML extends PDFTextStripper {
TikaCoreProperties.EmbeddedResourceType.METADATA.toString());
xmpMetadata.set(PDF.XMP_LOCATION, location);
if (embeddedDocumentExtractor.shouldParseEmbedded(xmpMetadata)) {
- try {
- parseMetadata(is, xmpMetadata);
- } finally {
- IOUtils.closeQuietly(is);
- }
+ parseMetadata(tis, xmpMetadata);
}
}
- private void parseMetadata(InputStream stream, Metadata embeddedMetadata)
+ private void parseMetadata(TikaInputStream tis, Metadata embeddedMetadata)
throws IOException, SAXException {
try {
- embeddedDocumentExtractor.parseEmbedded(stream, new
EmbeddedContentHandler(xhtml),
+ embeddedDocumentExtractor.parseEmbedded(tis, new
EmbeddedContentHandler(xhtml),
embeddedMetadata, true);
} catch (IOException e) {
handleCatchableIOE(e);
@@ -557,10 +553,10 @@ class AbstractPDF2XHTML extends PDFTextStripper {
try (TemporaryResources tmp = new TemporaryResources()) {
try (RenderResult renderResult = renderCurrentPage(pdPage,
context, tmp)) {
Metadata renderMetadata = renderResult.getMetadata();
- try (InputStream is = renderResult.getInputStream()) {
+ try (TikaInputStream tis = renderResult.getInputStream()) {
renderMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
ocrImageMediaType.toString());
- ocrParser.parse(is, new EmbeddedContentHandler(new
BodyContentHandler(xhtml)),
+ ocrParser.parse(tis, new EmbeddedContentHandler(new
BodyContentHandler(xhtml)),
renderMetadata, context);
}
}
@@ -966,8 +962,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
String js = jsAction.getAction();
js = (js == null) ? "" : js;
if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
- try (InputStream is =
TikaInputStream.get(js.getBytes(StandardCharsets.UTF_8))) {
- embeddedDocumentExtractor.parseEmbedded(is, xhtml, m,
true);
+ try (TikaInputStream tis =
TikaInputStream.get(js.getBytes(StandardCharsets.UTF_8))) {
+ embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m,
true);
}
}
addNonNullAttribute("class", "javascript", attributes);
@@ -1105,7 +1101,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
updateMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.VERSION.toString());
if (embeddedDocumentExtractor.shouldParseEmbedded(updateMetadata))
{
- try (InputStream tis = TikaInputStream.get(update)) {
+ try (TikaInputStream tis = TikaInputStream.get(update)) {
context.set(IsIncrementalUpdate.class,
IsIncrementalUpdate.IS_INCREMENTAL_UPDATE);
embeddedDocumentExtractor.parseEmbedded(tis, xhtml,
updateMetadata, false);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 70d17a8b3..c483c4090 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -17,7 +17,6 @@
package org.apache.tika.parser.pdf;
import java.io.IOException;
-import java.io.InputStream;
import java.io.Writer;
import java.util.HashMap;
import java.util.HashSet;
@@ -175,9 +174,9 @@ class PDF2XHTML extends AbstractPDF2XHTML {
if (result.getStatus() == RenderResult.STATUS.SUCCESS) {
if
(embeddedDocumentExtractor.shouldParseEmbedded(result.getMetadata())) {
- try (InputStream is = result.getInputStream()) {
+ try (TikaInputStream resultInputStream =
result.getInputStream()) {
//TODO: add markup here?
- embeddedDocumentExtractor.parseEmbedded(is, xhtml,
+
embeddedDocumentExtractor.parseEmbedded(resultInputStream, xhtml,
result.getMetadata(), true);
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index ce2cb398f..0d92ee520 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -432,8 +432,8 @@ public class PDFParser implements Parser, RenderingParser,
Initializable {
for (RenderResult result : renderResults.getResults()) {
if (result.getStatus() == RenderResult.STATUS.SUCCESS) {
if
(embeddedDocumentExtractor.shouldParseEmbedded(result.getMetadata())) {
- try (InputStream is = result.getInputStream()) {
- embeddedDocumentExtractor.parseEmbedded(is, xhtml,
result.getMetadata(),
+ try (TikaInputStream tis = result.getInputStream()) {
+ embeddedDocumentExtractor.parseEmbedded(tis, xhtml,
result.getMetadata(),
false);
} catch (SecurityException e) {
throw e;
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
index 08d18b6c1..526f27731 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
@@ -115,14 +115,13 @@ public class PDFRenderingTest extends TikaTest {
super(context);
}
- public void parseEmbedded(InputStream stream, ContentHandler handler,
Metadata metadata,
+ @Override
+ public void parseEmbedded(TikaInputStream tis, ContentHandler handler,
Metadata metadata,
boolean outputHtml) throws SAXException,
IOException {
- TikaInputStream tstream = TikaInputStream.get(stream);
- byte[] bytes = Files.readAllBytes(tstream.getPath());
+
+ byte[] bytes = Files.readAllBytes(tis.getPath());
embedded.put(count++, bytes);
- try (InputStream is = Files.newInputStream(tstream.getPath())) {
- super.parseEmbedded(is, handler, metadata, outputHtml);
- }
+ super.parseEmbedded(tis, handler, metadata, outputHtml);
}
public Map<Integer, byte[]> getEmbedded() {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index 11ecf902e..48651cc7b 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -65,6 +65,7 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
@@ -228,7 +229,9 @@ public class CompressorParser implements Parser {
EmbeddedDocumentExtractor extractor =
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
if (extractor.shouldParseEmbedded(entrydata)) {
- extractor.parseEmbedded(cis, xhtml, entrydata, true);
+ try (TikaInputStream tis = TikaInputStream.get(cis)) {
+ extractor.parseEmbedded(tis, xhtml, entrydata, true);
+ }
}
} finally {
cis.close();
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java
index 871f29388..11a6e52a4 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java
@@ -86,9 +86,9 @@ public class RarParser implements Parser {
Metadata entrydata =
PackageParser.handleEntryMetadata(header.getFileName(),
header.getCTime(), header.getMTime(),
header.getFullUnpackSize(),
xhtml);
- try (InputStream subFile = rar.getInputStream(header)) {
+ try (TikaInputStream rarTis =
TikaInputStream.get(rar.getInputStream(header))) {
if (extractor.shouldParseEmbedded(entrydata)) {
- extractor.parseEmbedded(subFile, handler,
entrydata, true);
+ extractor.parseEmbedded(rarTis, handler,
entrydata, true);
}
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/UnrarParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/UnrarParser.java
index 20ee89dd7..f4005b526 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/UnrarParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/UnrarParser.java
@@ -132,8 +132,8 @@ public class UnrarParser implements Parser {
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fName);
metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, relPath);
if (extractor.shouldParseEmbedded(metadata)) {
- try (InputStream is = TikaInputStream.get(embeddedFile)) {
- extractor.parseEmbedded(is, xhtml, metadata, true);
+ try (TikaInputStream tis = TikaInputStream.get(embeddedFile)) {
+ extractor.parseEmbedded(tis, xhtml, metadata, true);
}
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/http/HttpParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/http/HttpParser.java
index 440291aee..3bfa09a8f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/http/HttpParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/http/HttpParser.java
@@ -35,6 +35,7 @@ import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
@@ -79,8 +80,8 @@ public class HttpParser implements Parser {
if (contentLength > 0) {
MessageBody messageBody = LengthedBody.create(channel, buffer,
contentLength);
Metadata payloadMetadata = new Metadata();
- try (InputStream messageStream = messageBody.stream()) {
- parsePayload(messageStream, xhtml, payloadMetadata,
context);
+ try (TikaInputStream tis =
TikaInputStream.get(messageBody.stream())) {
+ parsePayload(tis, xhtml, payloadMetadata, context);
}
}
} finally {
@@ -88,11 +89,11 @@ public class HttpParser implements Parser {
}
}
- private void parsePayload(InputStream stream, ContentHandler handler,
Metadata metadata,
- ParseContext context) throws IOException, SAXException
{
+ private void parsePayload(TikaInputStream tis, ContentHandler handler,
Metadata metadata,
+ ParseContext context) throws IOException,
SAXException {
EmbeddedDocumentExtractor ex =
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
if (ex.shouldParseEmbedded(metadata)) {
- ex.parseEmbedded(stream, handler, metadata, true);
+ ex.parseEmbedded(tis, handler, metadata, true);
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java
index e78511f2b..fcf4042df 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java
@@ -116,9 +116,9 @@ public class WACZParser implements Parser {
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
metadata.set(Metadata.CONTENT_LENGTH, Long.toString(zae.getSize()));
- try (InputStream inputStream =
getMaybeGzipInputStream(TikaInputStream.get(zais))) {
+ try (TikaInputStream tis =
TikaInputStream.get(getMaybeGzipInputStream(TikaInputStream.get(zais)))) {
if (ex.shouldParseEmbedded(metadata)) {
- ex.parseEmbedded(inputStream, xhtml, metadata, true);
+ ex.parseEmbedded(tis, xhtml, metadata, true);
}
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
index ad4894b54..9aa1e2f2f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
@@ -146,7 +146,7 @@ public class WARCParser implements Parser {
if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
//TODO check Content-Encoding on the warcResponse.http.headers and wrap the stream.
//May need to sniff first few bytes to confirm accuracy, e.g. gzip compression ?
- try (InputStream tis =
TikaInputStream.get(payload.body().stream())) {
+ try (TikaInputStream tis =
TikaInputStream.get(payload.body().stream())) {
embeddedDocumentExtractor.parseEmbedded(tis, xhtml, metadata,
true);
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
index 4e7f0dad7..094db7cc9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
@@ -17,12 +17,10 @@
package org.apache.tika.parser.xml;
import java.io.IOException;
-import java.io.InputStream;
import java.util.Collections;
import java.util.Set;
import org.apache.commons.codec.binary.Base64;
-import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -30,6 +28,7 @@ import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
@@ -86,10 +85,9 @@ public class FictionBookParser extends XMLParser {
@Override
public void endElement(String uri, String localName, String qName)
throws SAXException {
if (binaryMode) {
- try (InputStream stream =
-
UnsynchronizedByteArrayInputStream.builder().setByteArray(Base64.decodeBase64(binaryData.toString())).get())
{
+ try (TikaInputStream tis =
TikaInputStream.get(Base64.decodeBase64(binaryData.toString()))) {
partExtractor.parseEmbedded(
- stream, handler, metadata, true);
+ tis, handler, metadata, true);
} catch (IOException e) {
throw new SAXException("IOException in parseEmbedded", e);
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
index 9f9f71357..de112e5ac 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
@@ -29,6 +29,7 @@ import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
@@ -134,7 +135,7 @@ public class ZipParserTest extends AbstractPkgTest {
return false;
}
- public void parseEmbedded(InputStream inputStream, ContentHandler
contentHandler,
+ public void parseEmbedded(TikaInputStream inputStream, ContentHandler
contentHandler,
Metadata metadata, boolean outputHtml) {
throw new UnsupportedOperationException("should never be called");
}
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
index 7fb362300..a2e3064d6 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
@@ -58,6 +58,7 @@ import org.apache.tika.extractor.DefaultEmbeddedStreamTranslator;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedStreamTranslator;
import org.apache.tika.io.BoundedInputStream;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MimeTypeException;
@@ -186,12 +187,13 @@ public class UnpackerResource {
return true;
}
- public void parseEmbedded(InputStream inputStream, ContentHandler
contentHandler, Metadata metadata, boolean b) throws SAXException, IOException {
+ @Override
+ public void parseEmbedded(TikaInputStream tis, ContentHandler
contentHandler, Metadata metadata, boolean b) throws SAXException, IOException {
UnsynchronizedByteArrayOutputStream bos =
UnsynchronizedByteArrayOutputStream
.builder()
.get();
- BoundedInputStream bis = new BoundedInputStream(unpackMaxBytes,
inputStream);
+ BoundedInputStream bis = new BoundedInputStream(unpackMaxBytes,
tis);
IOUtils.copy(bis, bos);
if (bis.hasHitBound()) {
throw new IOException(new TikaMemoryLimitException(
@@ -222,7 +224,7 @@ public class UnpackerResource {
LOG.warn("Unexpected MimeTypeException", e);
}
}
- try (InputStream is =
UnsynchronizedByteArrayInputStream.builder().setByteArray(data).get()) {
+ try (TikaInputStream is = TikaInputStream.get(data)) {
if (embeddedStreamTranslator.shouldTranslate(is, metadata)) {
InputStream translated =
embeddedStreamTranslator.translate(UnsynchronizedByteArrayInputStream.builder().setByteArray(data).get(),
metadata);
UnsynchronizedByteArrayOutputStream bos2 =
UnsynchronizedByteArrayOutputStream