This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new e805ceef1 add todo for when we upgrade jsoup:
https://github.com/jhy/jsoup/issues/2330#issuecomment-2899689838
e805ceef1 is described below
commit e805ceef1232a059d878fe56cf7a92fab84e2b9c
Author: tallison <[email protected]>
AuthorDate: Thu May 22 10:43:29 2025 -0400
add todo for when we upgrade jsoup:
https://github.com/jhy/jsoup/issues/2330#issuecomment-2899689838
---
.../main/java/org/apache/tika/parser/html/JSoupParser.java | 11 ++++++++++-
.../test/java/org/apache/tika/parser/html/HtmlParserTest.java | 9 ++++++++-
2 files changed, 18 insertions(+), 2 deletions(-)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
index bedbce788..36115a08f 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
@@ -152,8 +152,17 @@ public class JSoupParser extends AbstractEncodingDetectorParser {
// Get the HTML mapper from the parse context
HtmlMapper mapper = context.get(HtmlMapper.class, new
DefaultHtmlMapper());
+ TagSet tagSet = new TagSet(SELF_CLOSEABLE_TAGS);
+ /* TODO -- when we upgrade jsoup to 1.21.1
+ .onNewTag(tag -> {
+ if (!tag.isKnownTag())
+ tag.set(Tag.SelfClose);
+ });
+ */
+
//do better with baseUri?
- Document document = Jsoup.parse(CloseShieldInputStream.wrap(stream), charset.name(), "", Parser.htmlParser().tagSet(SELF_CLOSEABLE_TAGS));
+ Document document = Jsoup.parse(CloseShieldInputStream.wrap(stream), charset.name(), "",
+ Parser.htmlParser().tagSet(tagSet));
document.quirksMode(Document.QuirksMode.quirks);
ContentHandler xhtml = new XHTMLDowngradeHandler(
new HtmlHandler(mapper, handler, metadata, context,
extractScripts));
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 9850463ef..3fb542076 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -1289,11 +1289,18 @@ public class HtmlParserTest extends TikaTest {
}
@Test
- public void testJsoupScriptTagRegression() throws Exception {
+ public void testJsoupKnownSelfCloseableTags() throws Exception {
//https://github.com/jhy/jsoup/issues/2329
 String html = "<html><head><script src=\"blah\"/></head><body>this is content</body></html";
String xml =
getXML(UnsynchronizedByteArrayInputStream.builder().setByteArray(html.getBytes(UTF_8)).get(),
TikaTest.AUTO_DETECT_PARSER, new Metadata()).xml;
assertContains("this is content", xml);
}
+
+
+ @Test
+ @Disabled("until we upgrade jsoup to >= 1.21.1 ")
+ public void testJsoupUnknownSelfCloseableTags() throws Exception {
+ //TODO -- figure out how to test this.
+ }
}