This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_3x-TIKA-4419-v2
in repository https://gitbox.apache.org/repos/asf/tika.git

commit ea533aa000fe695c41504a67842cd4de5eacf776
Author: tallison <[email protected]>
AuthorDate: Tue May 20 08:17:38 2025 -0400

    TIKA-4419 -- experiment with custom list of self-closeable-tags.txt
---
 .../org/apache/tika/parser/html/JSoupParser.java   | 29 +++++++++++-
 .../tika/parser/html/self-closeable-tags.txt       | 52 ++++++++++++++++++++++
 2 files changed, 79 insertions(+), 2 deletions(-)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
index c34a496db..bedbce788 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
@@ -16,8 +16,10 @@
  */
 package org.apache.tika.parser.html;
 
+import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
@@ -34,6 +36,9 @@ import org.jsoup.nodes.DataNode;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Node;
 import org.jsoup.nodes.TextNode;
+import org.jsoup.parser.Parser;
+import org.jsoup.parser.Tag;
+import org.jsoup.parser.TagSet;
 import org.jsoup.select.NodeFilter;
 import org.jsoup.select.NodeTraversor;
 import org.xml.sax.ContentHandler;
@@ -70,6 +75,26 @@ public class JSoupParser extends 
AbstractEncodingDetectorParser {
     private static final Set<MediaType> SUPPORTED_TYPES = 
Collections.unmodifiableSet(
             new HashSet<MediaType>(Arrays.asList(MediaType.text("html"), 
XHTML, WAP_XHTML, X_ASP)));
 
+    private static final TagSet SELF_CLOSEABLE_TAGS = TagSet.Html();
+
+    /*
+     * Marks every tag named in self-closeable-tags.txt as self-closeable in
+     * SELF_CLOSEABLE_TAGS. The resource holds one tag name per line; blank lines
+     * and lines whose first non-whitespace character is '#' are skipped.
+     *
+     * NOTE(review): this one TagSet instance is passed to the parser on every
+     * parse call -- confirm jsoup does not mutate it while parsing if parses
+     * can run concurrently.
+     */
+    static {
+        String resource = "self-closeable-tags.txt";
+        InputStream is = JSoupParser.class.getResourceAsStream(resource);
+        if (is == null) {
+            // getResourceAsStream reports a missing resource by returning null, not by
+            // throwing; fail with a clear message instead of a bare NullPointerException.
+            throw new IllegalStateException("Can't find " + resource + " on the classpath");
+        }
+        try (BufferedReader reader =
+                     new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
+            String line;
+            while ((line = reader.readLine()) != null) {
+                // trim first so that indented comments are recognized as comments
+                String tagName = line.trim();
+                if (tagName.isEmpty() || tagName.startsWith("#")) {
+                    continue;
+                }
+                SELF_CLOSEABLE_TAGS.valueOf(tagName, Parser.NamespaceHtml).set(Tag.SelfClose);
+            }
+        } catch (IOException e) {
+            // keep the cause so the underlying read failure is not lost
+            throw new RuntimeException("Can't read " + resource, e);
+        }
+    }
+
     @Field
     private boolean extractScripts = false;
 
@@ -128,7 +153,7 @@ public class JSoupParser extends 
AbstractEncodingDetectorParser {
         HtmlMapper mapper = context.get(HtmlMapper.class, new 
DefaultHtmlMapper());
 
         //do better with baseUri?
-        Document document = Jsoup.parse(CloseShieldInputStream.wrap(stream), 
charset.name(), "");
+        Document document = Jsoup.parse(CloseShieldInputStream.wrap(stream), 
charset.name(), "", Parser.htmlParser().tagSet(SELF_CLOSEABLE_TAGS));
         document.quirksMode(Document.QuirksMode.quirks);
         ContentHandler xhtml = new XHTMLDowngradeHandler(
                 new HtmlHandler(mapper, handler, metadata, context, 
extractScripts));
@@ -147,7 +172,7 @@ public class JSoupParser extends 
AbstractEncodingDetectorParser {
         HtmlMapper mapper = context.get(HtmlMapper.class, new 
DefaultHtmlMapper());
 
         //do better with baseUri?
-        Document document = Jsoup.parse(html);
+        Document document = Jsoup.parse(html, 
Parser.htmlParser().tagSet(SELF_CLOSEABLE_TAGS));
         document.quirksMode(Document.QuirksMode.quirks);
         ContentHandler xhtml = new XHTMLDowngradeHandler(
                 new HtmlHandler(mapper, handler, metadata, context, 
extractScripts));
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/resources/org/apache/tika/parser/html/self-closeable-tags.txt
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/resources/org/apache/tika/parser/html/self-closeable-tags.txt
new file mode 100644
index 000000000..ff44a4385
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/resources/org/apache/tika/parser/html/self-closeable-tags.txt
@@ -0,0 +1,52 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+# See TIKA-4419
+br
+img
+meta
+input
+link
+path
+a
+hr
+image
+col
+area
+param
+base
+td
+div
+p
+source
+option
+embed
+span
+tr
+td
+script
+wbr
+li
+button
+iframe
+b
+i
+ul
+form
+textarea
+table
+select
+strong
+title
\ No newline at end of file

Reply via email to