This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new e805ceef1 add todo for when we upgrade jsoup: https://github.com/jhy/jsoup/issues/2330#issuecomment-2899689838
e805ceef1 is described below

commit e805ceef1232a059d878fe56cf7a92fab84e2b9c
Author: tallison <[email protected]>
AuthorDate: Thu May 22 10:43:29 2025 -0400

    add todo for when we upgrade jsoup: https://github.com/jhy/jsoup/issues/2330#issuecomment-2899689838
---
 .../main/java/org/apache/tika/parser/html/JSoupParser.java    | 11 ++++++++++-
 .../test/java/org/apache/tika/parser/html/HtmlParserTest.java |  9 ++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
index bedbce788..36115a08f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
@@ -152,8 +152,17 @@ public class JSoupParser extends AbstractEncodingDetectorParser {
         // Get the HTML mapper from the parse context
         HtmlMapper mapper = context.get(HtmlMapper.class, new 
DefaultHtmlMapper());
 
+        TagSet tagSet = new TagSet(SELF_CLOSEABLE_TAGS);
+        /* TODO -- when we upgrade jsoup to 1.21.1
+                .onNewTag(tag -> {
+            if (!tag.isKnownTag())
+                tag.set(Tag.SelfClose);
+        });
+        */
+
         //do better with baseUri?
-        Document document = Jsoup.parse(CloseShieldInputStream.wrap(stream), 
charset.name(), "", Parser.htmlParser().tagSet(SELF_CLOSEABLE_TAGS));
+        Document document = Jsoup.parse(CloseShieldInputStream.wrap(stream), 
charset.name(), "",
+                Parser.htmlParser().tagSet(tagSet));
         document.quirksMode(Document.QuirksMode.quirks);
         ContentHandler xhtml = new XHTMLDowngradeHandler(
                 new HtmlHandler(mapper, handler, metadata, context, 
extractScripts));
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 9850463ef..3fb542076 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -1289,11 +1289,18 @@ public class HtmlParserTest extends TikaTest {
     }
 
     @Test
-    public void testJsoupScriptTagRegression() throws Exception {
+    public void testJsoupKnownSelfCloseableTags() throws Exception {
         //https://github.com/jhy/jsoup/issues/2329
         String html = "<html><head><script src=\"blah\"/></head><body>this is 
content</body></html";
         String xml = 
getXML(UnsynchronizedByteArrayInputStream.builder().setByteArray(html.getBytes(UTF_8)).get(),
                 TikaTest.AUTO_DETECT_PARSER, new Metadata()).xml;
         assertContains("this is content", xml);
     }
+
+
+    @Test
+    @Disabled("until we upgrade jsoup to >= 1.21.1 ")
+    public void testJsoupUnknownSelfCloseableTags() throws Exception {
+        //TODO -- figure out how to test this.
+    }
 }

Reply via email to