This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new 369a94097 TIKA-4419 -- experiment with custom list of
self-closeable-tags.txt (#2216)
369a94097 is described below
commit 369a94097d66a7e8ae442e800f8581ac3af3dd7f
Author: Tim Allison <[email protected]>
AuthorDate: Wed May 21 15:30:35 2025 -0400
TIKA-4419 -- experiment with custom list of self-closeable-tags.txt (#2216)
---
.../org/apache/tika/parser/html/JSoupParser.java | 29 +++++++++++-
.../tika/parser/html/self-closeable-tags.txt | 52 ++++++++++++++++++++++
2 files changed, 79 insertions(+), 2 deletions(-)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
index c34a496db..bedbce788 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
@@ -16,8 +16,10 @@
*/
package org.apache.tika.parser.html;
+import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
+import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
@@ -34,6 +36,9 @@ import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
+import org.jsoup.parser.Parser;
+import org.jsoup.parser.Tag;
+import org.jsoup.parser.TagSet;
import org.jsoup.select.NodeFilter;
import org.jsoup.select.NodeTraversor;
import org.xml.sax.ContentHandler;
@@ -70,6 +75,26 @@ public class JSoupParser extends
AbstractEncodingDetectorParser {
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.unmodifiableSet(
new HashSet<MediaType>(Arrays.asList(MediaType.text("html"),
XHTML, WAP_XHTML, X_ASP)));
+ private static final TagSet SELF_CLOSEABLE_TAGS = TagSet.Html();
+
+    static {
+        // Flag every tag named in self-closeable-tags.txt as Tag.SelfClose on the shared tag set; blank lines and '#' comment lines are skipped.
+        try (InputStream is = JSoupParser.class.getResourceAsStream("self-closeable-tags.txt")) {
+            if (is == null) { // a missing resource is reported as null, not as an IOException
+                throw new IOException("resource self-closeable-tags.txt not found next to " + JSoupParser.class.getName());
+            }
+            BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
+            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
+                String tagName = line.trim(); // trim first so indented comments and padded names are handled
+                if (!tagName.isEmpty() && !tagName.startsWith("#")) {
+                    SELF_CLOSEABLE_TAGS.valueOf(tagName, Parser.NamespaceHtml).set(Tag.SelfClose);
+                }
+            }
+        } catch (IOException e) {
+            throw new RuntimeException("Can't find self-closeable-tags.txt", e); // keep the cause for diagnosis
+        }
+    }
+
@Field
private boolean extractScripts = false;
@@ -128,7 +153,7 @@ public class JSoupParser extends
AbstractEncodingDetectorParser {
HtmlMapper mapper = context.get(HtmlMapper.class, new
DefaultHtmlMapper());
//do better with baseUri?
- Document document = Jsoup.parse(CloseShieldInputStream.wrap(stream),
charset.name(), "");
+ Document document = Jsoup.parse(CloseShieldInputStream.wrap(stream),
charset.name(), "", Parser.htmlParser().tagSet(SELF_CLOSEABLE_TAGS));
document.quirksMode(Document.QuirksMode.quirks);
ContentHandler xhtml = new XHTMLDowngradeHandler(
new HtmlHandler(mapper, handler, metadata, context,
extractScripts));
@@ -147,7 +172,7 @@ public class JSoupParser extends
AbstractEncodingDetectorParser {
HtmlMapper mapper = context.get(HtmlMapper.class, new
DefaultHtmlMapper());
//do better with baseUri?
- Document document = Jsoup.parse(html);
+ Document document = Jsoup.parse(html,
Parser.htmlParser().tagSet(SELF_CLOSEABLE_TAGS));
document.quirksMode(Document.QuirksMode.quirks);
ContentHandler xhtml = new XHTMLDowngradeHandler(
new HtmlHandler(mapper, handler, metadata, context,
extractScripts));
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/resources/org/apache/tika/parser/html/self-closeable-tags.txt
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/resources/org/apache/tika/parser/html/self-closeable-tags.txt
new file mode 100644
index 000000000..ff44a4385
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/resources/org/apache/tika/parser/html/self-closeable-tags.txt
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See TIKA-4419
+br
+img
+meta
+input
+link
+path
+a
+hr
+image
+col
+area
+param
+base
+td
+div
+p
+source
+option
+embed
+span
+tr
+td
+script
+wbr
+li
+button
+iframe
+b
+i
+ul
+form
+textarea
+table
+select
+strong
+title
\ No newline at end of file