This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new e0c1d1df8 TIKA-4310 -- add CloseShieldInputStream to JsoupParser to
ensure that… (#1959)
e0c1d1df8 is described below
commit e0c1d1df81eff6175b50d32fdad1d69e9709bb74
Author: Tim Allison <[email protected]>
AuthorDate: Mon Sep 16 16:06:49 2024 -0400
TIKA-4310 -- add CloseShieldInputStream to JsoupParser to ensure that…
(#1959)
* TIKA-4310 -- add CloseShieldInputStream to JsoupParser to ensure that
underlying stream is not closed and add CloseShieldInputStream more generally
to RUnpackExtractor
---
.../java/org/apache/tika/extractor/RUnpackExtractor.java | 6 ++++--
.../java/org/apache/tika/parser/html/JSoupParser.java | 3 ++-
.../java/org/apache/tika/parser/html/HtmlParserTest.java | 16 ++++++++++++++++
3 files changed, 22 insertions(+), 3 deletions(-)
diff --git
a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
index 76b297dd7..0e5928845 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
@@ -121,13 +121,15 @@ public class RUnpackExtractor extends
ParsingEmbeddedDocumentExtractor {
//literally writing out a file per request
Path p = stream.getPath();
try {
- parse(stream, handler, metadata);
+ //wrap in CloseShieldInputStream to ensure that a misbehaving
parser isn't closing
+ //the stream and thereby deleting the temp file.
+ parse(CloseShieldInputStream.wrap(stream), handler, metadata);
} finally {
storeEmbeddedBytes(p, metadata);
}
}
- private void parse(TikaInputStream stream, ContentHandler handler,
Metadata metadata)
+ private void parse(InputStream stream, ContentHandler handler, Metadata
metadata)
throws TikaException, IOException, SAXException {
getDelegatingParser().parse(stream,
new EmbeddedContentHandler(new BodyContentHandler(handler)),
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
index 2f16f696b..c34a496db 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
@@ -27,6 +27,7 @@ import java.util.Iterator;
import java.util.Set;
import javax.xml.XMLConstants;
+import org.apache.commons.io.input.CloseShieldInputStream;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.DataNode;
@@ -127,7 +128,7 @@ public class JSoupParser extends
AbstractEncodingDetectorParser {
HtmlMapper mapper = context.get(HtmlMapper.class, new
DefaultHtmlMapper());
//do better with baseUri?
- Document document = Jsoup.parse(stream, charset.name(), "");
+ Document document = Jsoup.parse(CloseShieldInputStream.wrap(stream),
charset.name(), "");
document.quirksMode(Document.QuirksMode.quirks);
ContentHandler xhtml = new XHTMLDowngradeHandler(
new HtmlHandler(mapper, handler, metadata, context,
extractScripts));
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 502911fd3..2fcc4f6b0 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -32,6 +32,7 @@ import java.io.StringWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
@@ -78,6 +79,7 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.WriteOutContentHandler;
public class HtmlParserTest extends TikaTest {
@@ -1243,6 +1245,20 @@ public class HtmlParserTest extends TikaTest {
assertEquals("OldMetaTitle", m.get("title"));
}
+ @Test
+ public void testStreamNotClosed() throws Exception {
+ String path = "/test-documents/testHTML.html";
+ Metadata metadata = new Metadata();
+ Path tmp = null;
+ try (TikaInputStream stream =
TikaInputStream.get(getResourceAsStream(path))) {
+ //spool tika stream to disk
+ tmp = stream.getPath();
+ new JSoupParser().parse(stream, new WriteOutContentHandler(),
metadata, new ParseContext());
+ //make sure that the tmp file is still there
+ assertTrue(Files.isRegularFile(tmp));
+ }
+ }
+
private class EncodingDetectorRunner implements Callable<String> {
final static String DONE = "done";