This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new bd878d373 TIKA-4350 HTML snippet containing <iframe> as root element
erroneously recognized as application/xml (#2045)
bd878d373 is described below
commit bd878d3733853d0a9c6ffef32e18fba61f505760
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Thu Nov 14 15:04:33 2024 +0100
TIKA-4350 HTML snippet containing <iframe> as root element erroneously
recognized as application/xml (#2045)
---
tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml | 2 ++
tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java | 2 ++
.../test/resources/org/apache/tika/mime/test-html-snippet-iframe.jsp | 2 ++
3 files changed, 6 insertions(+)
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 87cc7aae8..9ea801f60 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -7680,6 +7680,8 @@
<root-XML localName="SCRIPT"/>
<root-XML localName="frameset"/>
<root-XML localName="FRAMESET"/>
+ <root-XML localName="iframe"/>
+ <root-XML localName="IFRAME"/>
<magic priority="60">
<match value="(?i)<(html|head|body|title|div)[ >]" type="regex" offset="0"/>
<match value="(?i)<h[123][ >]" type="regex" offset="0"/>
diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
index 6e844a0bf..690290cbd 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
@@ -74,6 +74,8 @@ public class MimeDetectionTest {
testFile("text/html", "testlargerbuffer.html");
// test fragment of HTML with <div> (TIKA-1102)
testFile("text/html", "htmlfragment");
+ // test fragment of HTML with <iframe> and potentially misleading file suffix
+ testFile("text/html", "test-html-snippet-iframe.jsp");
// test binary CGM detection (TIKA-1170)
testFile("image/cgm", "plotutils-bin-cgm-v3.cgm");
// test HTML detection of malformed file, previously identified as image/cgm (TIKA-1170)
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/test-html-snippet-iframe.jsp b/tika-core/src/test/resources/org/apache/tika/mime/test-html-snippet-iframe.jsp
new file mode 100644
index 000000000..2681fecdc
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/mime/test-html-snippet-iframe.jsp
@@ -0,0 +1,2 @@
+<!-- this is a comment: https://www.example.org/path/file.pdf -->
+ <iframe src='/path/file.pdf' width='100%' height='100%' target='_blank'>