This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4419
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 10c2dac4b6214aa99f3df54764f782f5839c2526
Author: tallison <[email protected]>
AuthorDate: Mon May 19 10:46:07 2025 -0400

    TIKA-4419 -- downgrade jsoup
---
 tika-parent/pom.xml                                            |  2 +-
 .../test/java/org/apache/tika/parser/html/HtmlParserTest.java  | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 408adaef4..7491961c4 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -388,7 +388,7 @@
     <jhighlight.version>1.1.0</jhighlight.version>
     <jna.version>5.17.0</jna.version>
     <json.simple.version>1.1.1</json.simple.version>
-    <jsoup.version>1.20.1</jsoup.version>
+    <jsoup.version>1.19.1</jsoup.version>
     <jsr305.version>3.0.2</jsr305.version>
     <junit4.version>4.13.2</junit4.version>
     <junit5.version>5.12.2</junit5.version>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 2fcc4f6b0..d354bfbc9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -53,6 +53,7 @@ import javax.xml.transform.sax.SAXTransformerFactory;
 import javax.xml.transform.sax.TransformerHandler;
 import javax.xml.transform.stream.StreamResult;
 
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
 import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 import org.xml.sax.Attributes;
@@ -1289,4 +1290,13 @@ public class HtmlParserTest extends TikaTest {
             return DONE;
         }
     }
+
+    @Test
+    public void testJsoupScriptTagRegression() throws Exception {
+        //https://github.com/jhy/jsoup/issues/2329
+        String html = "<html><head><script src=\"blah\"/></head><body>this is content</body></html";
+        String xml = getXML(UnsynchronizedByteArrayInputStream.builder().setByteArray(html.getBytes(UTF_8)).get(),
+                TikaTest.AUTO_DETECT_PARSER, new Metadata()).xml;
+        assertContains("this is content", xml);
+    }
 }

Reply via email to